# --------------------------------------------------------------------- # Program: Transformations2.S # Author: Steven M. Boker # Date: Tue Oct 5 13:11:53 EST 2004 # # More transformations and dealing with Missing Values # # --------------------------------------------------------------------- # --------------------------------------------------------------------- # Missing values (NA) # x5 is "right censored" x5 <- rnorm(300, mean=0, sd=1) x5[x5 > .5] <- NA summary(x5) # x6 is "left censored" x6 <- rnorm(300, mean=0, sd=1) x6[x6 < -.5] <- NA summary(x6) # x7 is "missing completely at random" x7 <- rnorm(300, mean=0, sd=1) tIndex <- floor(runif(100, min=1, max=300)) x7[tIndex] <- NA summary(x7) # --------------------------------------------------------------------- # Boxplots of the three distributions graphsheet(height=6.4,width=6.4) boxplot(x5, x6, x7, names=c("x5", "x6", "x7")) # --------------------------------------------------------------------- # Quantile-Normal plots for the distributions graphsheet(height=6.4,width=6.4) qqmath(~ x7, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="x7") graphsheet(height=6.4,width=6.4) qqmath(~ x6, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="x6") graphsheet(height=6.4,width=6.4) qqmath(~ x5, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="x5") # --------------------------------------------------------------------- # Mean and variance of x7 mean(x7) var(x7) # --------------------------------------------------------------------- # Oh-Oh. What do we do now? Use the is.na() function to deselect NAs tSelect <- !is.na(x7) mean(x7[tSelect]) var(x7[tSelect]) # --------------------------------------------------------------------- # What do is.na(x7) and !is.na(x7) do? Select missing and nonmissing. tSelect <- is.na(x7) tSelect tSelect <- !is.na(x7) tSelect # --------------------------------------------------------------------- # What is the percentage of missing values? round(100 * (length(x7[is.na(x7)]) / length(x7)), 1) # --------------------------------------------------------------------- # Right censoring is different than a ceiling effect # x5 is "right censored" x5b <- rnorm(300, mean=0, sd=1) x5b[x5b > .5] <- .5 summary(x5b) graphsheet(height=6.4,width=6.4) qqmath(~ x5b, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="x5b") # --------------------------------------------------------------------- # Create Platykurtic and Leptokurtotic Data platyData <- rnorm(300, mean=0, sd=.25) + runif(300, min=-2, max=2) summary(platyData) leptoData <- rnorm(300, mean=0, sd=2)^3 / 100 summary(leptoData) # --------------------------------------------------------------------- # Create Histograms and Boxplots of Platykurtic and Leptokurtotic Data graphsheet(height=6.4,width=6.4) par(mfrow=c(2,1)) theBreaks <- seq(-2.5, 2.5, by=.25) hist(platyData, breaks=theBreaks, xlab="Platykurtosis") hist(leptoData, breaks=theBreaks, xlab="Leptokurtosis") graphsheet(height=6.4,width=6.4) boxplot(platyData, leptoData, names=c("Platykurtosis","Leptokurtosis")) # --------------------------------------------------------------------- # Create Quantile Normal Plots of Platykurtic and Leptokurtotic Data graphsheet(height=6.4,width=6.4) qqmath(~ platyData, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="Platykurtotic Data") graphsheet(height=6.4,width=6.4) qqmath(~ leptoData, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="Leptokurtotic Data") # --------------------------------------------------------------------- # A function for a power transformation for data centered at zero centeredPower <- function(dataVector, exponent) { tData <- dataVector tData[dataVector < 0] <- tData[dataVector < 0] * -1 tData <- tData^(exponent) tData[dataVector < 0] <- tData[dataVector < 0] * -1 return(tData) } # --------------------------------------------------------------------- # Create two variables x and x^2 and look at their histograms x <- rnorm(300, mean=0, sd=1) xSquared <- x^2 graphsheet(height=6.4,width=6.4) par(mfrow=c(2,1)) hist(x) hist(xSquared) graphsheet(height=6.4,width=6.4) qqmath(~ xSquared, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="xSquared") # --------------------------------------------------------------------- # Use the centeredPower transformation with a power of 2 x <- rnorm(300, mean=0, sd=1) xSquared <- centeredPower(x, 2) graphsheet(height=6.4,width=6.4) par(mfrow=c(2,1)) hist(x) hist(xSquared) graphsheet(height=6.4,width=6.4) qqmath(~ xSquared, distribution=qnorm, prepanel = prepanel.qqmathline, panel = function(x, y) { panel.qqmathline(y, distribution = qnorm) panel.qqmath(x, y) }, aspect=1, xlab = "Normal Distribution", ylab="xSquared")