# ----------------------------------------------------- # Program: OutliersCIs1.S # Author: Steven M. Boker # Date: Tue Oct 26 10:31:01 EST 2004 # # Selection, Outliers and Influence # # ----------------------------------------------------- # ----------------------------------------------------- # Create a temporary dataframe Selecting by Sex tSelect <- edaStudent$sex=="Male" & !is.na(edaStudent$sex) t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect], ztest=edaStudent$ztest[tSelect], totunits=edaStudent$totunits[tSelect], income=edaStudent$income[tSelect]/10000) tSelect <- edaStudent$sex=="Female" & !is.na(edaStudent$sex) t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect], ztest=edaStudent$ztest[tSelect], totunits=edaStudent$totunits[tSelect], income=edaStudent$income[tSelect]/10000) # ----------------------------------------------------- # Create a temporary dataframe Selecting by income extremes tSelect <- edaStudent$income > 80000 & !is.na(edaStudent$income) t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect], ztest=edaStudent$ztest[tSelect], totunits=edaStudent$totunits[tSelect], income=edaStudent$income[tSelect]/10000) tSelect <- edaStudent$income < 40000 & !is.na(edaStudent$income) t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect], ztest=edaStudent$ztest[tSelect], totunits=edaStudent$totunits[tSelect], income=edaStudent$income[tSelect]/10000) # ----------------------------------------------------- # Create a distribution with some outliers tX <- c(rnorm(197, mean=10, sd=2), 20, -3, 45) tY <- (tX * .3) + rnorm(200, mean=0, sd=.5) tData <- data.frame(x=tX,y=tY) # ----------------------------------------------------- # Calculate Median, Interquartile and Adjacent Values xLength <- length(tX) xSorted <- sort(tX) xMedian <- xSorted[floor((xLength+1)/2)] xLowerHinge <- xSorted[floor((xLength+1)/4)] xUpperHinge <- xSorted[floor((3*(xLength+1))/4)] xInterQuartile <- xUpperHinge - xLowerHinge xUpperAdjacent <- xUpperHinge + (1.5 * xInterQuartile) xLowerAdjacent <- xLowerHinge - (1.5 * xInterQuartile) yLength <- length(tY) ySorted <- sort(tY) yMedian <- ySorted[floor((yLength+1)/2)] yLowerHinge <- ySorted[floor((yLength+1)/4)] yUpperHinge <- ySorted[floor((3*(yLength+1))/4)] yInterQuartile <- yUpperHinge - yLowerHinge yUpperAdjacent <- yUpperHinge + (1.5 * yInterQuartile) yLowerAdjacent <- yLowerHinge - (1.5 * yInterQuartile) # ----------------------------------------------------- # Remove observations outside Upper and Lower Adjacent values tSelect <- tX > xUpperAdjacent | tX < xLowerAdjacent | tY > yUpperAdjacent | tY < yLowerAdjacent tData2 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # Scatterplot with and without the outliers graphsheet(height=6.4,width=6.4) plot(c(-10,50), c(-5,15), xlab="X", ylab="Y", type="n") lines(tData$x, tData$y, type="p", cex=.5) abline(lm(y~x, data=tData), col=5) abline(lm(y~x, data=tData2), col=6) # ----------------------------------------------------- # Create a distribution with one INFLUENTIAL outlier tX <- c(rnorm(199, mean=10, sd=2), 45) tY <- (tX * .3) + c(rnorm(199, mean=0, sd=.5), -15) tData <- data.frame(x=tX,y=tY) # ----------------------------------------------------- # Calculate Median, Interquartile and Adjacent Values xLength <- length(tX) xSorted <- sort(tX) xMedian <- xSorted[floor((xLength+1)/2)] xLowerHinge <- xSorted[floor((xLength+1)/4)] xUpperHinge <- xSorted[floor((3*(xLength+1))/4)] xInterQuartile <- xUpperHinge - xLowerHinge xUpperAdjacent <- xUpperHinge + (1.5 * xInterQuartile) xLowerAdjacent <- xLowerHinge - (1.5 * xInterQuartile) yLength <- length(tY) ySorted <- sort(tY) yMedian <- ySorted[floor((yLength+1)/2)] yLowerHinge <- ySorted[floor((yLength+1)/4)] yUpperHinge <- ySorted[floor((3*(yLength+1))/4)] yInterQuartile <- yUpperHinge - yLowerHinge yUpperAdjacent <- yUpperHinge + (1.5 * yInterQuartile) yLowerAdjacent <- yLowerHinge - (1.5 * yInterQuartile) # ----------------------------------------------------- # Remove observations outside Upper and Lower Adjacent values tSelect <- tX > xUpperAdjacent | tX < xLowerAdjacent | tY > yUpperAdjacent | tY < yLowerAdjacent tData2 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # Scatterplot with and without the outliers graphsheet(height=6.4,width=6.4) plot(c(-10,50), c(-5,15), xlab="X", ylab="Y", type="n") lines(tData$x, tData$y, type="p", cex=.5) abline(lm(y~x, data=tData), col=5) abline(lm(y~x, data=tData2), col=6) # ----------------------------------------------------- # Influence calculation from Linear Model tLM <- lm(y~x, data=tData) summary(tLM) # ----------------------------------------------------- # The hat diagonal measures influence tInfluence <- lm.influence(tLM) summary(tInfluence$hat) # ----------------------------------------------------- # Removing observations with hat diagonal greater than .2 tSelect <- tInfluence$hat > .2 tData3 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # The coefficients from lm.influence are results of lm with one observation # removed for each observation. summary(data.frame(tInfluence$coefficients)) # ----------------------------------------------------- # Calculating Mahalanobis Distance (p<.001 is 13.8 with 2 df) mDistance <- (length(tX) - 1)*(tInfluence$hat - 1/length(tX)) tSelect <- mDistance > 13.8 tData4 <- data.frame(x=tX[!tSelect],y=tY[!tSelect]) # ----------------------------------------------------- # Quit the program