###############################################################################
# Title: Data Mining Functions
# From the Introduction to R with Applications in Data Mining Workshop
# Author(s): Julian Karch
# Date created: Thu Apr 14
###############################################################################

### Matrix basics -------------------------------------------------------------

# Create a 4 x 3 matrix by stacking rows.
A <- rbind(c(1, 2, 3), c(4, 5, 6), c(7, 8, 9), c(10, 11, 12))

# Transpose.
t(A)

# Matrix multiplication: t(A) %*% A yields a 3 x 3 crossproduct matrix.
res <- t(A) %*% A

# Select parts of a matrix.
A[1, 1]      # row 1, column 1
A[1, ]       # row 1, all columns
A[1:3, ]     # rows 1 to 3, all columns
A[2:3, 1:2]  # rows 2 to 3, columns 1 to 2
A[, 1]       # column 1, all rows

# Eigenvectors and eigenvalues of the crossproduct matrix.
eigs <- eigen(res)

### Load data ------------------------------------------------------------------

# library() errors immediately if a package is missing, unlike require(),
# which only returns FALSE and lets the script fail later.
library(psych)

# Artificial example data; keep columns 2:12 (drop the file's first column).
# NOTE(review): assumes column 1 of example.dat is a row index -- confirm.
artificial <- read.csv("example.dat")
artificial <- artificial[, 2:12]

data(iris)

### Clustering -----------------------------------------------------------------

## k-means clustering
# The stats package contains kmeans() (attached by default in a standard
# R session; loaded explicitly here for clarity in the workshop).
library(stats)

# Signature for reference:
# kmeans(x, centers, iter.max = 10, nstart = 1,
#        algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"))

# Artificial data: columns 2:11 hold the features (column 1 is the label).
resultKMeans1 <- kmeans(artificial[, 2:11], 2, iter.max = 100, nstart = 10)

# Real data: the four iris measurements.
resultKMeans2 <- kmeans(iris[, 1:4], 4, iter.max = 100, nstart = 10)

## See the mclust package for clustering that models the covariance matrix
## of the data.

# The cluster package contains agglomerative and divisive clustering.
library(cluster)

## Agglomerative hierarchical clustering
# Signature for reference:
# agnes(x, diss = inherits(x, "dist"), metric = "euclidean", stand = FALSE,
#       method = "average", par.method, keep.diss = n < 100, keep.data = !diss)
resultAgnes <- agnes(artificial[, 2:11])
plot(resultAgnes)

## Divisive hierarchical clustering (DIvisive ANAlysis)
resultDiana <- diana(artificial[, 2:11])
plot(resultDiana)

### Classifier -----------------------------------------------------------------

# The class package contains k-nearest-neighbour classification (kNN).
library(class)

# Load helper functions; currently only bacP (balanced accuracy).
source("helperfunctions.R")

# First column of `artificial` contains the label, the rest are features.
train <- artificial[, 2:11]
label <- factor(artificial[, 1])
# Leave-one-out cross-validated kNN on the artificial data.
# Signature for reference:
# knn.cv(train, cl, k = 1, l = 0, prob = FALSE, use.all = TRUE)
resultKNN <- knn.cv(train, label, k = 3)

# Compute BAC (balanced accuracy) from the real and predicted labels.
bacperf <- bacP(label, resultKNN)

# Iris: the last (5th) column contains the label, columns 1:4 the features.
train <- iris[, 1:4]
label <- factor(iris[, 5])

# Leave-one-out cross-validated kNN on the iris data.
resultKNN <- knn.cv(train, label, k = 3)
bacperf <- bacP(label, resultKNN)