/* Filename: CA2.SAS */ /* Purpose: Diagnostic output for CA: 1. a simple randomization test of the */ /* of the proportion on inertia accounted for by each CA axis. 2)Als o */ /* writes out two files containing the original data matrix, */ /* with rows and columns sorted on the CA Axis 1 and Axis 2 scores for */ /* seriation plotting. */ /* Last Update: FDN 4.2.07 */ proc iml ; use dataset; ******name of input dataset ; read all into x [rowname=midden colnane=vars]; ******name of ID variable in input dataset; ntrials=5000; ***************************************************************************************** Function: PERMUTEcols Description:Randomly permutes the entries of each row of a matrix Arguments: inmat: the matrix. Returns: the randomly permuted matrix.; START PERMUTEcols (inmat); n=nrow(inmat); p=ncol(inmat); newmat = j(n,p,.); do i=1 to n; ran= uniform(repeat(0,p,1)); order = rank(ran); newmat[i,] = inmat[i,order]; end; return (newmat); FINISH; *****************************************************************************************; **************************************************************************** Function: GETPVALUE Description:finds the depth of an observed test statistic in a sampling distrbution. Arguments: HOSampDist: The sampling distribution of the test stat, under H0. teststat: The observed value of the test statistic. Returns: The p value associated with a one-tailed test of the statistic in question; ; start getpvalue(HOSampDist,teststat,PVALUE); nsamples=nrow(HOSampdist); rindex=rank(HOSampDist); sorted=j(nsamples,1,0); sorted[rindex,] = HOSampDist[1:Nsamples,]; fvalues=((1:nsamples)-.5)/Nsamples; do i=1 to nsamples; if i=1 then do ; if teststat < sorted[i,] then pvalue=1; if teststat = sorted[i,] then pvalue=1-fvalues[,i]; end; if ((i >= 1) & (i < nsamples)) then do; if (sorted[i,] <= teststat) & ( teststat < sorted[i+1,]) then pvalue=1-fvalues[,i]; end; if i=nsamples then do; if teststat = sorted[i,] then pvalue=1-fvalues[,i]; if teststat > sorted[i,] then pvalue=0; end; end; *print sorted teststat pvalue; finish getpvalue; ********************************************************************; *get the pct inertias; p= x/sum(x); *get the matrix of proportions; colsums =p[+,] ; rowsums = p[,+] ; expected= rowsums*colsums; *compute the expected proportions -- as in contingency table; q= (p-expected)/(expected##.5); *deviations from the expecteds, divide by the square root of the expecteds ; cov=q`*q; *cov matrix --errr... sort of--- just like PCA; evalvec = (eigval(cov))`; *compute the eigenvalues; pctinvec = evalvec/ repeat(evalvec[,+],1,ncol(x)); cumpctinvec = cusum(pctinvec); *get the evals from randomization; ranevalmat = j(ntrials,ncol(x),.); ranpctinmat=ranevalmat; cumranpctinmat=ranevalmat; do i=1 to ntrials; xran=permutecols(x); p= xran/sum(xran); *get the matrix of proportions; colsums =p[+,] ; rowsums = p[,+] ; expected= rowsums*colsums; *compute the expected proportions -- as in contingency table; q= (p-expected)/(expected##.5); *deviations from the expecteds, divide by the square root of the expecteds ; cov=q`*q; *cov matrix --errr... sort of--- just like PCA; ranevalmat[i,] = (eigval(cov))`; *compute the eigenvalues; ranpctinmat[i,] = ranevalmat[i,]/ repeat(ranevalmat[i,+],1,ncol(x)); cumranpctinmat[i,] = cusum(ranpctinmat[i,]); end; * get the means for the randomizations; meanranpct = j(1,ncol(x),.); meancumranpct=meanranpct; do i= 1 to ncol(meanranpct); meanranpct[,i] =ranpctinmat[+,i]/nrow(ranpctinmat); meancumranpct[,i] =cumranpctinmat[+,i]/nrow(cumranpctinmat); end; *get the pvalues from the randomized CA inertias; PctInertia_Pvalue = j(1,ncol(x)-1,.); CumPctInertia_Pvalue =PctInertia_Pvalue; do i = 1 to (ncol(x)-1); ranpctin=ranpctinmat [,i] ; realpctin=pctinvec[,i] ; call getpvalue(ranpctin,realpctin,PVALUE); PctInertia_Pvalue[,i] = Pvalue; cumranpctin=cumranpctinmat[,i] ; realcumpctin=cumpctinvec[,i] ; call getpvalue(cumranpctin,realcumpctin,PVALUE); CumPctInertia_Pvalue[,i] =pvalue; end; evalvec=evalvec`; pctinvec = pctinvec`; meanranpct = meanranpct`; PctInertia_Pvalue= PctInertia_Pvalue` ; print evalvec pctinvec meanranpct PctInertia_Pvalue; *complete the CA on the data; p= x/sum(x); *get the matrix of proportions; colsums =p[+,] ; rowsums = p[,+] ; expected= rowsums*colsums; *compute the expected proportions -- as in contingency table; q= (p-expected)/(expected##.5); *deviations from the expecteds, divide by the square root of the expecteds ; cov=q`*q; *cov matrix --errr... sort of--- just like PCA; call eigen(l,u,cov); *compute the eigenvalues (l) and eigenvectors; *now compute the coordinates of the rows and columns in their joint spaces ; *do the columns first; dcols=diag(p[+,]##-.5); *Get a diagonal matrix of the inverse of the sq. roots of the col sums; *print dcols u; v=dcols*u; *rescale each eigenvector element by the inverse of sq root of its col sum; *note that this gives less influence to cols with big sums; *now get the row scores ; drows=diag(p[,+]##-1); rowprofiles=drows*p; *the rowprofiles are conditional probabilities ~ row perecnts; *print drows p rowprofiles; f= rowprofiles*v ; *compute the row scores; *sort the data matrix on the dim1 scores; colorder = rank(v[,1])`; *print colorder; sortedx= x; sortedx[,colorder]= x; *print x sortedx; sortedvars=vars; sortedvars[,colorder]= vars; *print sortedvars; roworder = rank(f[,1]); ssortedx = sortedx; ssortedx[roworder,] = sortedx; sortedcases = midden; sortedcases[roworder,] = midden; *output the sorted data for seriation; create seriationdata1 from ssortedx [rowname=sortedcases colname=sortedvars]; append from ssortedx [rowname=sortedcases]; *sort the data matrix on the dim2 scores; colorder = rank(v[,2])`; *print colorder; sortedx= x; sortedx[,colorder]= x; *print x sortedx; sortedvars=vars; sortedvars[,colorder]= vars; *print sortedvars; roworder = rank(f[,2]); ssortedx = sortedx; ssortedx[roworder,] = sortedx; sortedcases = midden; sortedcases[roworder,] = midden; *output the sorted data for seriation; create seriationdata2 from ssortedx [rowname=sortedcases colname=sortedvars]; append from ssortedx [rowname=sortedcases]; proc print data=seriationdata1; proc print data=seriationdata2; pROC EXPORT DATA= WORK.SERIATIONDATA1 OUTFILE= "C:\localdata\seriationdata1.xls" DBMS=EXCEL REPLACE; SHEET="data"; RUN; pROC EXPORT DATA= WORK.SERIATIONDATA2 OUTFILE= "C:\localdata\seriationdata2.xls" DBMS=EXCEL REPLACE; SHEET="data"; RUN;