/* Filename: DeBoeretal.sas */ /* Purpose: PCA and MDS of Betty Megger's data from AM-MA-9on the Rio Negro */ /* Last Update: FDN 4.23.02 */ data amma9; input context $ code $ PajuraPlain VilaPlain MadadaPlain PajuraRed PajuraPainted PajuraGrooved; cards; C2:0-10 a 143 26 41 18 3 4 C1:0-10 b 556 180 102 38 11 10 C1:10-20 c 562 183 146 26 1 0 C2:10-20 d 536 152 179 18 2 5 C1:20-40 e 543 188 153 13 0 0 C3:20-30 f 101 26 30 22 9 2 C3:30-40 g 123 44 29 18 4 1 C3:40-60 h 56 20 25 5 2 2 C2:20-30 i 356 237 154 13 0 0 C2:30-40 j 133 93 57 30 1 0 ; /* convert to relative frequencies */ data amma9pct; set amma9; total= sum(of _numeric_); array pct {*} _numeric_; do i=1 to dim(pct); pct(i)= pct(i)/total; end; drop i total; proc print; run; /* do the pca -- note we use the COV matrix */ proc princomp data = amma9pct out=pca outstat=eigenstructure covariance; var _numeric_; /* plot the PC's */ proc plot data=pca; plot prin2*prin1 = '*' $ code; run; /* plot the eigenvectors */ data eigenvectors; set eigenstructure; where _type_='SCORE'; proc transpose data = eigenvectors out=eigenvectors name=type; proc print data=eigenvectors; proc plot data=eigenvectors; plot prin2*prin1= '*' $ type; run; /* compute a distance matrix for MDS */ proc iml; ************************************************************************** Function: EUCDIST Description:Compute Euclidean distance between two row vectors. Arguments: X1,X2: the two row vectors. pct: set to 1 if you want X1 and X2 coverted to relative frequencies first. Returns: Euclidean distance between X1 and X2. ; start eucdist(x1,x2,pct); if pct=1 then do; x1=x1/sum(x1); x2=x2/sum(x2); end; dist= sqrt((x1-x2)*(x1-x2)`); return(dist); finish eucdist; ****************************************************************************; ************************************************************************** Function: NEIDIST Description:Compute Nei's (1972) distance between two row vectorsof counts. Arguments: X1,X2: the two row vectors. Returns: Nei's (genetic) distance between X1 and X2. ; start neidist(x1,x2); *assumes input as counts; x1=x1/sum(x1); x2=x2/sum(x2); dist=1-((x1*x2`)/sqrt(x1*x1`)#(x2*x2`)); *you can change "-log" to "1-"; return(dist); finish neidist; **************************************************************************; ************************************************************************** Function: MANHATDIST Description:Computes the Manhattan distancee between two row vectors. Arguments: X1,X2: the two row vectors. pct: set to 1 if you want X1 and X2 converted to relative frequencies first. Returns: Manhattan distance between X1 and X2. ; start manhatdist(x1,x2,pct); if pct=1 then do; x1=x1/sum(x1); x2=x2/sum(x2); end; dist= sum(abs(x1-x2)); return(dist); finish manhatdist; **************************************************************************; ************************************************************************** Function: DISTMAT Description: Compute a distance matrix for the rows of an input dataset. Arguments: XMAT: the input data set ROWNAME: a character variable that names the rows DISTYPE: 1 get you euclidean, 2 for manhattan, 3 for nei Returns: Dmat: the distance matrix FromMat: A matrix of row labels for the distances ToMat Another matrix of row labels; START distmat(xmat,rowname,disttype,DMAT,FROMMAT,TOMAT); n=nrow(xmat); dmat=j(n,n,0); FromMat= cshape(' ',n,n,8); ToMat=FromMat; do i=1 to n by 1; do j=1 to n by 1; if disttype=1 then dmat[i,j]= eucdist(xmat[i,],xmat[j,],1) ; if disttype=2 then dmat[i,j]= manhatdist(xmat[i,],xmat[j,],1); if disttype=3 then dmat[i,j]= neidist(xmat[i,],xmat[j,]) ; FromMat[i,j]=rowname[i,]; ToMat[i,j]=rowname[j,]; end; end; FINISH distmat ; ******************************************************************************************************; /* ok -- let's do the business */ use amma9; read all into xmat [rowname=code] ; *read the data into IML; call distmat(xmat,code,2,DMat,FromMat,ToMat); *get the distance matrix; create dmat from dmat [colname=code rowname=code ] ; append from dmat [rowname=code rowname=code ] ; close dmat; proc print; run; proc mds data=dmat out = mds pineigval; proc print; proc plot; plot dim2*dim1 = '*' $ _name_; run;