# -----------------------------------------------------
# Program: OutliersCIs1.S  
#  Author: Steven M. Boker
#    Date: Tue Oct 26 10:31:01 EST 2004
#
#  Selection, Outliers and Influence
#
# -----------------------------------------------------

# -----------------------------------------------------
#  Create a temporary dataframe Selecting by Sex


tSelect <- edaStudent$sex=="Male" & !is.na(edaStudent$sex)

t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect],
        ztest=edaStudent$ztest[tSelect],
        totunits=edaStudent$totunits[tSelect],
        income=edaStudent$income[tSelect]/10000)


tSelect <- edaStudent$sex=="Female" & !is.na(edaStudent$sex)

t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect],
        ztest=edaStudent$ztest[tSelect],
        totunits=edaStudent$totunits[tSelect],
        income=edaStudent$income[tSelect]/10000)

# -----------------------------------------------------
#  Create a temporary dataframe Selecting by income extremes


tSelect <- edaStudent$income > 80000 & !is.na(edaStudent$income)

t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect],
        ztest=edaStudent$ztest[tSelect],
        totunits=edaStudent$totunits[tSelect],
        income=edaStudent$income[tSelect]/10000)

tSelect <- edaStudent$income < 40000 & !is.na(edaStudent$income)

t1 <- data.frame(hscgpa=edaStudent$hscgpa[tSelect],
        ztest=edaStudent$ztest[tSelect],
        totunits=edaStudent$totunits[tSelect],
        income=edaStudent$income[tSelect]/10000)


# -----------------------------------------------------
#  Create a distribution with some outliers

tX <- c(rnorm(197, mean=10, sd=2), 20, -3, 45)
tY <- (tX * .3) + rnorm(200, mean=0, sd=.5)
tData <- data.frame(x=tX,y=tY)

# -----------------------------------------------------
#  Calculate Median, Interquartile and Adjacent Values

xLength <- length(tX)
xSorted <- sort(tX)
xMedian <- xSorted[floor((xLength+1)/2)]
xLowerHinge <- xSorted[floor((xLength+1)/4)]
xUpperHinge <- xSorted[floor((3*(xLength+1))/4)]
xInterQuartile <- xUpperHinge - xLowerHinge
xUpperAdjacent <- xUpperHinge + (1.5 * xInterQuartile)
xLowerAdjacent <- xLowerHinge - (1.5 * xInterQuartile)

yLength <- length(tY)
ySorted <- sort(tY)
yMedian <- ySorted[floor((yLength+1)/2)]
yLowerHinge <- ySorted[floor((yLength+1)/4)]
yUpperHinge <- ySorted[floor((3*(yLength+1))/4)]
yInterQuartile <- yUpperHinge - yLowerHinge
yUpperAdjacent <- yUpperHinge + (1.5 * yInterQuartile)
yLowerAdjacent <- yLowerHinge - (1.5 * yInterQuartile)

# -----------------------------------------------------
#  Remove observations outside Upper and Lower Adjacent values

tSelect <- tX > xUpperAdjacent | tX < xLowerAdjacent |
           tY > yUpperAdjacent | tY < yLowerAdjacent

tData2 <- data.frame(x=tX[!tSelect],y=tY[!tSelect])


# -----------------------------------------------------
#  Scatterplot with and without the outliers

graphsheet(height=6.4,width=6.4)
plot(c(-10,50), c(-5,15), xlab="X", ylab="Y", type="n")
lines(tData$x, tData$y, type="p", cex=.5)
abline(lm(y~x, data=tData), col=5)
abline(lm(y~x, data=tData2), col=6)

# -----------------------------------------------------
#  Create a distribution with one INFLUENTIAL outlier

tX <- c(rnorm(199, mean=10, sd=2), 45)
tY <- (tX * .3) + c(rnorm(199, mean=0, sd=.5), -15)
tData <- data.frame(x=tX,y=tY)

# -----------------------------------------------------
#  Calculate Median, Interquartile and Adjacent Values

xLength <- length(tX)
xSorted <- sort(tX)
xMedian <- xSorted[floor((xLength+1)/2)]
xLowerHinge <- xSorted[floor((xLength+1)/4)]
xUpperHinge <- xSorted[floor((3*(xLength+1))/4)]
xInterQuartile <- xUpperHinge - xLowerHinge
xUpperAdjacent <- xUpperHinge + (1.5 * xInterQuartile)
xLowerAdjacent <- xLowerHinge - (1.5 * xInterQuartile)

yLength <- length(tY)
ySorted <- sort(tY)
yMedian <- ySorted[floor((yLength+1)/2)]
yLowerHinge <- ySorted[floor((yLength+1)/4)]
yUpperHinge <- ySorted[floor((3*(yLength+1))/4)]
yInterQuartile <- yUpperHinge - yLowerHinge
yUpperAdjacent <- yUpperHinge + (1.5 * yInterQuartile)
yLowerAdjacent <- yLowerHinge - (1.5 * yInterQuartile)

# -----------------------------------------------------
#  Remove observations outside Upper and Lower Adjacent values

tSelect <- tX > xUpperAdjacent | tX < xLowerAdjacent |
           tY > yUpperAdjacent | tY < yLowerAdjacent

tData2 <- data.frame(x=tX[!tSelect],y=tY[!tSelect])

# -----------------------------------------------------
#  Scatterplot with and without the outliers


graphsheet(height=6.4,width=6.4)
plot(c(-10,50), c(-5,15), xlab="X", ylab="Y", type="n")
lines(tData$x, tData$y, type="p", cex=.5)
abline(lm(y~x, data=tData), col=5)
abline(lm(y~x, data=tData2), col=6)

# -----------------------------------------------------
#  Influence calculation from Linear Model

tLM <- lm(y~x, data=tData)
summary(tLM)

# -----------------------------------------------------
#  The hat diagonal measures influence

tInfluence <- lm.influence(tLM)
summary(tInfluence$hat)

# -----------------------------------------------------
#  Removing observations with hat diagonal greater than .2

tSelect <- tInfluence$hat > .2
tData3 <- data.frame(x=tX[!tSelect],y=tY[!tSelect])

# -----------------------------------------------------
#  The coefficients from lm.influence are results of lm with one observation
#  removed for each observation.

summary(data.frame(tInfluence$coefficients))

# -----------------------------------------------------
#  Calculating Mahalanobis Distance (p<.001 is 13.8 with 2 df)

mDistance <- (length(tX) - 1)*(tInfluence$hat - 1/length(tX))
tSelect <- mDistance > 13.8
tData4 <- data.frame(x=tX[!tSelect],y=tY[!tSelect])

# -----------------------------------------------------
# Quit the program




