# ----------------------------------------------------- # Program: OutliersCIs1.S # Author: Steven M. Boker # Date: Tue Oct 26 10:31:01 EST 2004 # # Selection, Outliers and Influence # # ----------------------------------------------------- # ----------------------------------------------------- # Create a temporary dataframe Selecting by Sex isMale <- edaStudent$sex=="Male" & !is.na(edaStudent$sex) t1 <- data.frame(hscgpa=edaStudent$hscgpa[isMale], ztest=edaStudent$ztest[isMale], totunits=edaStudent$totunits[isMale], income=edaStudent$income[isMale]/10000) pdf("Out1MalePairs1.pdf", height=5 ,width=5) pairs(t1, labels=c("GPA", "Z-Test", "Units", "Income"), cex=.75) dev.off() isFemale <- edaStudent$sex=="Female" & !is.na(edaStudent$sex) t1 <- data.frame(hscgpa=edaStudent$hscgpa[isFemale], ztest=edaStudent$ztest[isFemale], totunits=edaStudent$totunits[isFemale], income=edaStudent$income[isFemale]/10000) pdf("Out1FemalePairs1.pdf", height=5 ,width=5) pairs(t1, labels=c("GPA", "Z-Test", "Units", "Income"), cex=.75) dev.off() # ----------------------------------------------------- # Create a temporary dataframe Selecting by income extremes highIncome <- edaStudent$income > 80000 & !is.na(edaStudent$income) t1 <- data.frame(hscgpa=edaStudent$hscgpa[highIncome], ztest=edaStudent$ztest[highIncome], totunits=edaStudent$totunits[highIncome], income=edaStudent$income[highIncome]/10000) pdf("Out1HighIncomePairs1.pdf", height=5 ,width=5) pairs(t1, labels=c("GPA", "Z-Test", "Units", "Income"), cex=.75) dev.off() lowIncome <- edaStudent$income < 40000 & !is.na(edaStudent$income) t1 <- data.frame(hscgpa=edaStudent$hscgpa[lowIncome], ztest=edaStudent$ztest[lowIncome], totunits=edaStudent$totunits[lowIncome], income=edaStudent$income[lowIncome]/10000) pdf("Out1LowIncomePairs1.pdf", height=5 ,width=5) pairs(t1, labels=c("GPA", "Z-Test", "Units", "Income"), cex=.75) dev.off() # ----------------------------------------------------- # Create a distribution with some outliers tX <- c(rnorm(197, mean=10, sd=2), 20, -3, 45) tY <- (tX * .3) + rnorm(200, mean=0, sd=.5) tData <- data.frame(x=tX,y=tY) # ----------------------------------------------------- # Calculate Median, Interquartile and Adjacent Values xLength <- length(tX) xSorted <- sort(tX) xMedian <- xSorted[floor((xLength+1)/2)] xLowerHinge <- xSorted[floor((xLength+1)/4)] xUpperHinge <- xSorted[floor((3*(xLength+1))/4)] xInterQuartile <- xUpperHinge - xLowerHinge xUpperAdjacent <- xUpperHinge + (1.5 * xInterQuartile) xLowerAdjacent <- xLowerHinge - (1.5 * xInterQuartile) yLength <- length(tY) ySorted <- sort(tY) yMedian <- ySorted[floor((yLength+1)/2)] yLowerHinge <- ySorted[floor((yLength+1)/4)] yUpperHinge <- ySorted[floor((3*(yLength+1))/4)] yInterQuartile <- yUpperHinge - yLowerHinge yUpperAdjacent <- yUpperHinge + (1.5 * yInterQuartile) yLowerAdjacent <- yLowerHinge - (1.5 * yInterQuartile) # ----------------------------------------------------- # Remove observations outside Upper and Lower Adjacent values tSelect <- tX > xUpperAdjacent | tX < xLowerAdjacent | tY > yUpperAdjacent | tY < yLowerAdjacent tData2 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # Scatterplot with and without the outliers pdf("Out1NoOutliers.pdf", height=5 ,width=5) plot(c(-10,50), c(-5,15), xlab="X", ylab="Y", type="n") lines(tData$x, tData$y, type="p", cex=.5) abline(lm(y~x, data=tData), col=5) abline(lm(y~x, data=tData2), col=6) dev.off() # ----------------------------------------------------- # Create a distribution with one INFLUENTIAL outlier tX <- c(rnorm(199, mean=10, sd=2), 45) tY <- (tX * .3) + c(rnorm(199, mean=0, sd=.5), -15) tData <- data.frame(x=tX,y=tY) # ----------------------------------------------------- # Calculate Median, Interquartile and Adjacent Values xLength <- length(tX) xSorted <- sort(tX) xMedian <- xSorted[floor((xLength+1)/2)] xLowerHinge <- xSorted[floor((xLength+1)/4)] xUpperHinge <- xSorted[floor((3*(xLength+1))/4)] xInterQuartile <- xUpperHinge - xLowerHinge xUpperAdjacent <- xUpperHinge + (1.5 * xInterQuartile) xLowerAdjacent <- xLowerHinge - (1.5 * xInterQuartile) yLength <- length(tY) ySorted <- sort(tY) yMedian <- ySorted[floor((yLength+1)/2)] yLowerHinge <- ySorted[floor((yLength+1)/4)] yUpperHinge <- ySorted[floor((3*(yLength+1))/4)] yInterQuartile <- yUpperHinge - yLowerHinge yUpperAdjacent <- yUpperHinge + (1.5 * yInterQuartile) yLowerAdjacent <- yLowerHinge - (1.5 * yInterQuartile) # ----------------------------------------------------- # Remove observations outside Upper and Lower Adjacent values tSelect <- tX > xUpperAdjacent | tX < xLowerAdjacent | tY > yUpperAdjacent | tY < yLowerAdjacent tData2 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # Scatterplot with and without the outliers pdf("Out1WithOutliers.pdf", height=5 ,width=5) plot(c(-10,50), c(-5,15), xlab="X", ylab="Y", type="n") lines(tData$x, tData$y, type="p", cex=.5) abline(lm(y~x, data=tData), col=5) abline(lm(y~x, data=tData2), col=6) dev.off() # ----------------------------------------------------- # Influence calculation from Linear Model tLM <- lm(y~x, data=tData) summary(tLM) # ----------------------------------------------------- # The hat diagonal measures influence tInfluence <- lm.influence(tLM) summary(tInfluence$hat) pdf("Out1hatvalues1.pdf", height=5 ,width=5) plot(hatvalues(tLM), residuals(tLM), xlab="Hat Matrix Diagonal", ylab="Residual Value") dev.off() # ----------------------------------------------------- # Removing observations with hat diagonal greater than .2 tSelect <- tInfluence$hat > .2 tData3 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # The coefficients from lm.influence are results of lm with one observation # removed for each observation. summary(data.frame(tInfluence$coefficients)) # ----------------------------------------------------- # Calculating Mahalanobis Distance (p<.001 is 13.8 for 2 df) mDistance <- (length(tX) - 1)*(tInfluence$hat - 1/length(tX)) pdf("Out1Mahalanobis1.pdf", height=5 ,width=5) plot(mDistance, residuals(tLM), xlab="Mahalanobis Distance", ylab="Residual Value") dev.off() qchisq(p=0.999, df=2, ncp=0) tSelect <- mDistance > 13.8 tData4 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # Quit the program