/* Filename: Mantel.sas Purpose: SAS code to compute two distance matrices and compare them. The assemblage distance matrix is regressed on the geographical distance matrix. We compute a standardized regression coefficient. Hypothesis testing is done via randomization. Finally the STANDARDIZED distances used in the regrssion are output and plotted. Last Update: FDN 3.19.07 */ PROC IMPORT OUT= WORK.ppdata DATAFILE= "H:\www\anth589b\PovertyPointData.xls" DBMS=EXCEL REPLACE; SHEET="Sheet1$"; GETNAMES=YES; MIXED=NO; SCANTEXT=YES; USEDATE=YES; SCANTIME=YES; RUN; PROC IMPORT OUT= WORK.ppcoords DATAFILE= "H:\www\anth589b\PovertyPointCoords.xls" DBMS=EXCEL REPLACE; SHEET="Sheet1$"; GETNAMES=YES; MIXED=NO; SCANTEXT=YES; USEDATE=YES; SCANTIME=YES; RUN; title1 'Poverty Point: Mantel Test'; goptions reset=global gunit=in ftitle="garamond" ftext="garamond" htitle=.5 htext=.3; proc sort data=ppdata; by site; proc sort data=ppcoords; by site; proc print data =ppdata; proc print data=ppcoords; run; proc print; run; proc iml; use ppdata ; ******* dataset with type frequencies; read all into xmat [rowname=site colname=label] ; ******* specify and ID variable for the rows: rowname = ? ); print xmat; ; use ppcoords ; *******read the spatial coordinate data; read all into coords [rowname=site colname=dims]; ******* specify and ID variable for the rows (rowname); ******* and a name for the variable names (colname); print coords; disttype =3; ******* DISTTYPE: 1 = Euclidean distances from proportions 2 = Euclidean distances from raw data values 3 = Euclidean distances standardized proportions 4 = manhattan distances from proportions 5 = manhattan distances from raw data values 6 = manhattan distances from standardized proportions 7 = Nei's (1972) genetic distance 8 = chisquared distance; /* first we define the functions that well need*/ ************************************************************************** Function: Distance Description: Compute a Euclidean distance matrix for the rows of an input dataset. Converts the rows to relative freqs first. Arguments: XMAT: the input data set ROWNAME: a character variable that names the rows DISTTYPE: 1 = Euclidean distances from proportions 2 = Euclidean distances from raw data values 3 = Euclidean distances standardized proportions 4 = manhattan distances from proportions 5 = manhattan distances from raw data values 6 = manhattan distances from standardized proportions 7 = Nei's (1972) genetic distance 8 = chisquared distance Returns: Dmat: the distance matrix FromMat: A matrix of row labels for the distances ToMat Another matrix of row labels; START distance(xmat,rowname,disttype,DMAT,FROMMAT,TOMAT); n=nrow(xmat); dmat=j(n,n,.); FromMat= cshape(' ',n,n,16); ToMat=FromMat; if (disttype=1) | (disttype=3) | (disttype = 4) | (disttype=6) then do; do i=1 to n; xmat[i,]= xmat[i,]/sum(xmat[i,]); end; if (disttype= 3 ) | (disttype = 6) then do; nonzeroindices = loc(xmat[+,]); *an error trap for types with no occurences; xmat = xmat[,nonzeroindices]; do j=1 to ncol(xmat); mean= xmat[+,j] /xmat[+,+]; devs= (xmat[,j]-mean); std= sqrt((ssq(devs)/(n-1))); xmat[,j]= (xmat[,j] - mean)/std; end; print xmat; end; end; if (disttype= 1) | (disttype= 2) | (disttype= 3) then do; do i=1 to n by 1; do j=1 to n by 1; dmat[i,j]= sqrt((xmat[i,]-xmat[j,])*(xmat[i,]-xmat[j,])`); FromMat[i,j]=rowname[i,]; ToMat[i,j]=rowname[j,]; end; end; end; if (disttype= 4 ) | (disttype= 5 ) | (disttype= 6) then do; do i=1 to n by 1; do j=1 to n by 1; dmat[i,j]= sum(abs(xmat[i,]-xmat[j,])); FromMat[i,j]=rowname[i,]; ToMat[i,j]=rowname[j,]; end; end; end; if (disttype= 7) then do; do i=1 to n by 1; do j=1 to n by 1; dmat[i,j]= -log((xmat[i,]*xmat[j,]`)/sqrt(xmat[i,]*xmat[i,]`)#(xmat[j,]*xmat[j,]`)); FromMat[i,j]=rowname[i,]; ToMat[i,j]=rowname[j,]; end; end; end; if (disttype= 8) then do; do i=1 to n by 1; do j=1 to n by 1; dmat[i,j]= sqrt (sum( (((xmat[i,]/xmat[i,+]) - (xmat[j,]/xmat[j,+]))##2) / (xmat[+,]/xmat[+,+]) )) ; FromMat[i,j]=rowname[i,]; ToMat[i,j]=rowname[j,]; end; end; end; FINISH distance ; ******************************************************************************************************; ****************************************************************************************************** Function: MATTOCOL Description:Puts the lower triangle of a symmetric matrix in a col vector Arguments: Dmat -- the symmetric matrix Returns: A column vector; START MATTOCOL (dmat); n=nrow(dmat); distvec= dmat[{1},{2}:n]`; do i={2} to (n-{1}); distvec=distvec//( dmat[i,(i+{1}):n])`; end; return (distvec); FINISH MATTOCOL; ******************************************************************************************************; ******************************************************************************************************* Function: PERMUTE Description:Randomly permutes the order of the rows and cols of a symmteric matrix. Arguments: Dmat: the symmetric matrix. Returns: the randomly permuted matrix.; START PERMUTE (dmat); n=nrow(dmat); ran= uniform(repeat(0,n,1)); order = rank(ran); newmat = dmat[order,order]; return (newmat); FINISH; *****************************************************************************************************; ****************************************************************************************************** Function: REG Description:Computes a standardized regression coefficient for two distance matrices. Arguments: X: independent variable matrix Y: dependent variable matrix Returns: a standardized Beta; START reg (y, x); y1=standard(mattocol(y)); x1=standard(mattocol(x)); n=nrow(x1); xmat= j(nrow(x1),1,1) || x1; xpx= xmat`*xmat; xpy = xmat`*y1; xpxi = inv(xpx); b= xpxi*xpy; return (b`[,2]); FINISH reg; *****************************************************************************************************; **************************************************************************** Function: GETPVALUE Description:finds the depth of an observed test statistic in a sampling distrbution. Arguments: HOSampDist: The sampling distribution of the test stat, under H0. teststat: The observed value of the test statistic. Returns: The p value associated with a one-tailed test of the statistic in question; start getpvalue(HOSampDist,teststat); nsamples=nrow(HOSampdist); rindex=rank(HOSampDist); sorted=j(nsamples,1,0); sorted[rindex,] = HOSampDist[1:Nsamples,]; fvalues=((1:nsamples)-.5)/Nsamples; do i=1 to nsamples; if i=1 then do ; if teststat <= sorted then pvalue=1; end; if (1 < i) & (i < nsamples) then do; if (sorted[i,] < teststat) & ( teststat <= sorted[i+1,]) then pvalue=1-fvalues[,i]; end; if i=nsamples then do; if sorted[i,] <= teststat then pvalue=0; end; end; return(pvalue); finish getpvalue; ********************************************************************; /* OK lets use the functions that we have defined */ call distance(xmat,site,disttype,ASSEMDIST,aFROMMAT,aTOMAT); *compute the assemblage distances; call distance(coords,site,2,Geodist,cFROMMAT,cTOMAT); *compute the geodistances; beta= reg(assemdist,geodist); *do the regression; print 'The observed betas (standardized)' beta; *print the results; /* now compute the randomization distribution of the beta estimates */ iter=1000; ranbetas = j(iter,1,0); do i=1 to iter; ranassemdist= permute(assemdist); ranbetas[i,] = reg(ranassemdist,geodist); end; pvalue=getpvalue(ranbetas,beta); *find the pvalue; print 'Probability of a greater value (one tailed)' pvalue; /*output the distances */ y1=(mattocol(assemdist)); x1=(mattocol(geodist)); distances =y1||x1; fromvec=mattocol(afrommat) ; tovec=mattocol(atomat); fromtovec= concat(fromvec,tovec); label= {'assemdist' 'geodist'}; create distances from distances [rowname=fromtovec colname=label]; append from distances [rowname=fromtovec]; close distances; /* output the randomization ditribution */ label1 = 'beta'; create ranbetas from ranbetas [colname=label1]; append from ranbetas; close ranbetas; data distances; set distances; fromtovec = compress(fromtovec); label assemdist = ' Assemblage Distance'; label geodist = ' Geographical Distance'; proc print; run; title2 ' '; symbol1 value=dot c= grey h=.25 POINTLABEL=("#fromtovec" j=r font='garamond'); proc gplot data=distances; plot assemdist*geodist =1; run; Title2 'The Randomization Distribution'; proc univariate data=ranbetas noprint; histogram beta / cfill=grey cbarline= black kernel ; run;