/*Filename: AR5Loess.sas */ /*Purpose: Fits a loess surface to the AR5 House size data, maps the trend and */ /* the residuals, looks at autocorrelation in the residuals */ /*Last Update: FDN 2.15.07 */ options ls=80 ps=40 ; /*start by setting the graphics options --Note that names of windows fonts are given in quotes*/ goptions reset=global gunit=in ftitle="garamond" ftext="garamond" htitle=.3 htext=.2; symbol1 value=circle h=.2 ; *specify the symbol to be used in plots; title1 'AR5 Analysis'; ******* Titles for output here; /*import the data */ PROC IMPORT OUT= WORK.house DATAFILE= "H:\Courses\SpatialAnalysis\ArcGISExamples\LizData \house.dbf" DBMS=DBF REPLACE; GETDELETED=NO; RUN; /*fix the use of 0 for missing values, also the coordinates*/ data house; set house; if diam__m__e ne 0 then diam =diam__m__e; northing = point_y - 8321000; easting= point_x - 373000; if diam ne .; *this line drops records with missing values for diam from the data ; /* plot the point locations */ proc gplot data=house; plot northing * easting run; /* do a univariate summary of the diameter measurements*/ proc univariate data=house; var diam; histogram diam / cfill=grey; qqplot diam / normal (mu= est sigma=est); run; /*Compute some initial correlograms */ /*invoke Proc IML -- Interactive Matrix Language */ proc iml; use house ; ******** supply dataset name; read all var{northing easting} into c; ******** names of northing (y) and easting (x) vars; read all var{diam} into x; ******** name of z variable; nsteps= 10 ; ******** enter number of lags ; lag= 20 ; ******** enter lag size; n=nrow(x); distmat= j(n,n,0); gc=j(nsteps,1,0); lagd=gc; semivar=gc; *dimension vars; nedges=gc; moran=gc; diffmat=distmat; cpmat=distmat; meandist=gc; mean = (sum(x)/n); do j=1 to n by 1; *get a matrix of distances between data points; do k=1 to n by 1; *and matrices of squared differences and crossproducts; distmat[k,j]= ((c[k,]-c[j,])*(c[k,]-c[j,])`)##.5; diffmat[k,j]= (x[k,]-x[j,])##2; cpmat[k,j] = (x[k,]-mean)#(x[j,]-mean); end; end; z = x - ((sum(x))/n); *deviations from mean; ss=ssq(z); variance= ss/(n-1); print variance; do i=1 to nsteps; *compute for the i'th lag; lagd[i,]= i#lag; lh= (lagd[i,]-(lag/2)); hh= (lagd[i,]+(lag/2)); print lh hh; w= (distmat > lh) # (distmat <= hh); * binary weights matrix; sumw= sum(w); meandist[i,]= (sum(w#distmat))/sumw; *average diatnce between point pairs; semivar[i,]= ((sum(w#diffmat))/sumw)*.5; *semivariance; gc[i,]= semivar[i,]/variance; *gearys c; moran[i,] = ((sum(w#cpmat))/sumw)/ (ss/n); nedges[i,]= sumw/2; end; result= semivar||gc|| moran || lagd|| meandist || nedges; label= {semivar gearys_c morans_i lagdist meandist nedges}; create autocorr from result [colname=label]; append from result ; proc print; run; symbol1 value=circle h=.2 i=join ; *specify the symbol to be used in plots; proc gplot data=autocorr; plot gearys_c * meandist=1; plot semivar*meandist =1; plot morans_i*meandist =1; run; /* Proc Variogram with different angle classes*/ /* to check for anisotropy */ proc variogram data=house outvar = variogramdata; var diam; coordinates xcoord=easting ycoord=northing; compute lagdistance=10 maxlags=20 ndirections=4; proc print; run; proc gplot data=variogramdata; where lag >= 1; plot (variog covar) * distance =angle; run; /* do the loess fit-- note the use of the 'direct' statment to force SAS to estimate */ /* one local regression for each point. This is critial to getting results that match */ /* Clevelands orginal algorithm. */ proc loess data =house; ods Output OutputStatistics=Statout(rename= (depvar=diam pred=predicted)); model diam = northing easting / degree=1 select=gcv direct; data statout; set statout; residual= diam-predicted; /* this bit computes a pseudo-r-squared summary of the loess fit = */ /*1-(sum of squared residuals/sum of squared deveiation from the mean for the diam variable */ proc iml; use statout; read all var {diam predicted residual}; mean= sum(diam)/nrow(diam); SS_diam = sum((diam-mean)##2); SS_residual = sum((residual)##2); Pseudo_R_squared = 1-(SS_Residual/SS_diam); print SS_diam SS_residual Pseudo_R_squared; run; /* do a univarite summary of the residuals*/ proc univariate data=statout; var residual; histogram residual / cfill=grey; qqplot diam / normal (mu= est sigma=est); run; /*now we dreate a special dataset that can be used to plot the houses on the contour map */ /*note that we use the radius_m filed so that ALL houses get plotted. Houses with no */ /*measurable simater are plotted at the mean size fo the entire dataset */ data housesites; set house; y=northing; x=easting; FUNCTION='SYMBOL'; COLOR='BLACK'; size= radius_m*.5; text='CIRCLE'; XSYS='2'; YSYS='2'; OUTPUT; /* Take the predicted values from loess, interpolte interveing values on a regular grid, */ /* using a thin-plate spline. This will allow us to look at the surface in a contour map */ proc g3grid data=statout out=quickanddirtysurface; grid northing * easting =predicted / spline; /*make the contour map and annotate it with the house locations */ proc gcontour data = quickanddirtysurface; plot northing* easting= predicted / nlevels=30 anno=housesites; run; /* do the same thing for the residuals */ /* first we prep a dataset to map the residual values in space: negative residuals */ /* appear as open circles, positive ones as filled circles. Circle size scales with */ /* absolute value of the residual */ data houseresids; set statout; y=northing; x=easting; FUNCTION='SYMBOL'; COLOR='BLACK'; size= abs(residual)*2; if residual <= 0 then text='CIRCLE'; if resIdual > 0 then text= 'DOT'; XSYS='2'; YSYS='2'; OUTPUT; proc g3grid data=statout out=quickanddirtysurface; grid northing * easting =residual / spline; proc gcontour data = quickanddirtysurface; plot northing* easting= residual / nlevels=20 llevels = 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 anno=houseresids; run; /* look at the variogram of residuals */ /*invoke Proc IML -- Interactive Matrix Language */ proc iml; use statout ; ******** supply dataset name; read all var{northing easting} into c; ******** names of northing (y) and easting (x) vars; read all var{residual} into x; ******** name of z variable; nsteps= 10 ; ******** enter number of lags ; lag= 20 ; ******** enter lag size; n=nrow(x); distmat= j(n,n,0); gc=j(nsteps,1,0); lagd=gc; semivar=gc; *dimension vars; nedges=gc; moran=gc; diffmat=distmat; cpmat=distmat; meandist=gc; mean = (sum(x)/n); do j=1 to n by 1; *get a matrix of distances between data points; do k=1 to n by 1; *and matrices of squared differences and crossproducts; distmat[k,j]= ((c[k,]-c[j,])*(c[k,]-c[j,])`)##.5; diffmat[k,j]= (x[k,]-x[j,])##2; cpmat[k,j] = (x[k,]-mean)#(x[j,]-mean); end; end; z = x - ((sum(x))/n); *deviations from mean; ss=ssq(z); variance= ss/(n-1); print variance; do i=1 to nsteps; *compute for the i'th lag; lagd[i,]= i#lag; lh= (lagd[i,]-(lag/2)); hh= (lagd[i,]+(lag/2)); print lh hh; w= (distmat > lh) # (distmat <= hh); * binary weights matrix; sumw= sum(w); meandist[i,]= (sum(w#distmat))/sumw; *average diatnce between point pairs; semivar[i,]= ((sum(w#diffmat))/sumw)*.5; *semivariance; gc[i,]= semivar[i,]/variance; *gearys c; moran[i,] = ((sum(w#cpmat))/sumw)/ (ss/n); nedges[i,]= sumw/2; end; result= semivar||gc|| moran || lagd|| meandist || nedges; label= {semivar gearys_c morans_i lagdist meandist nedges}; create autocorr from result [colname=label]; append from result ; proc print; run; proc print; proc gplot data=autocorr; plot gearys_c * meandist=1; plot semivar*meandist =1; plot morans_i*meandist =1; run; /* Proc Variogram with different angle classes*/ /* to check for anisotropy */ proc variogram data=statout outvar = variogramdata; var residual; coordinates xcoord=easting ycoord=northing; compute lagdistance=20 maxlags=10 ndirections=4; proc print; run; proc gplot data=variogramdata; where lag >= 1; plot (variog covar) * distance =angle; run;