rm(list = ls()) # clear the workspace (NOTE(review): discouraged in scripts; kept from original)

###############################################################################
###############################################################################

## Build a J x J confusion matrix and the apparent error rate (APER).
##
## class: vector of true class labels.
## pred:  vector of predicted class labels, same length as `class`.
##
## Returns list(mat = confusion matrix, rate = apparent error rate), where
## mat[j, k] counts observations whose true label is the j-th (sorted) label
## and whose predicted label is the k-th.  Generalizes the original, which
## assumed the labels were coded exactly 1..J; results are identical for
## 1..J-coded data (all calls in this script).
confusion.matrix <- function(class, pred) {
  labels <- sort(unique(class))
  J <- length(labels)
  mat <- matrix(0L, nrow = J, ncol = J)
  for (j in seq_len(J)) {
    for (k in seq_len(J)) {
      mat[j, k] <- sum(class == labels[j] & pred == labels[k])
    }
  }
  ## Error rate relative to the total count in the table (as in the original).
  error.rate <- 1 - sum(diag(mat)) / sum(mat)
  list(mat = mat, rate = error.rate)
}

###############################################################################
## Example - Iris data; J = 3.  Compare with the glm function.
library(MASS)
source("http://www.stat.ualberta.ca/~wiens/stat575/R%20scripts/multilogistic.R")

data <- read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T11-5.DAT")
class <- data[, 5]
X <- as.matrix(data[, 1:4])
colnames(X) <- c("sep.length", "sep.width", "pet.length", "pet.width")

## Eliminate the first class from the training sample; recode classes 2/3 as 1/2.
X23 <- X[51:150, c(1, 3)]
class23 <- class[51:150] - 1

logitfit <- multilogistic(X23, class23, scoring = FALSE, newdata = NULL, tol = 1e-4)
logitfit$coef
pred.logitfit <- logitfit$pred.class
confuse.logit <- confusion.matrix(class23, pred.logitfit)
cat(paste("Logistic discrimination: APER =", confuse.logit$rate,
          "Confusion matrix is"), "\n")
print(confuse.logit$mat)

## glm fit of the same two-class logistic model (response recoded to 0/1).
fit <- glm(class23 - 1 ~ X23, family = binomial())
fit$coef
## NOTE(review): the formula references the matrix X23 from the environment,
## so predict() effectively re-uses the training data regardless of newdata.
## Harmless here because newdata IS the training data — verify before reusing.
pred.glmfit <- predict(fit, as.data.frame(X23))
pred.glmfit <- (pred.glmfit > 0)  # classify by the sign of the linear predictor
confuse.glm <- confusion.matrix(class23, pred.glmfit + 1)
cat(paste("GLM: APER =", confuse.glm$rate, "Confusion matrix is"), "\n")
print(confuse.glm$mat)

###############################################################################
###############################################################################
## Compare with linear and quadratic discrimination, using the entire data set.
## First classify only the training set (apparent error rates).
lin <- lda(X, class)
pred.lin <- as.numeric(predict(lin, X)$class)
quad <- qda(X, class)
pred.quad <- as.numeric(predict(quad, X)$class)
logitfit <- multilogistic(X, class, scoring = FALSE, newdata = NULL, tol = 1e-4)
pred.logitfit <- logitfit$pred.class

confuse.lin <- confusion.matrix(class, pred.lin)
confuse.quad <- confusion.matrix(class, pred.quad)
confuse.logit <- confusion.matrix(class, pred.logitfit)

cat(paste("Linear discrimination: APER =", round(confuse.lin$rate, 3),
          "Confusion matrix is"), "\n")
print(confuse.lin$mat)
cat(paste("Quadratic discrimination: APER =", round(confuse.quad$rate, 3),
          "Confusion matrix is"), "\n")
print(confuse.quad$mat)
cat(paste("Logistic discrimination: APER =", round(confuse.logit$rate, 3),
          "Confusion matrix is"), "\n")
print(confuse.logit$mat)

#############
## Now use the leave-one-out (holdout) procedure:
n <- nrow(X)
## Preallocate numeric vectors (the original's vector(length = n) creates a
## logical vector that is then coerced on first assignment).
pred.holdout.lin <- numeric(n)
pred.holdout.quad <- numeric(n)
for (i in seq_len(n)) {
  lin <- lda(X[-i, ], class[-i])
  quad <- qda(X[-i, ], class[-i])
  ## drop = FALSE keeps the held-out case as a 1-row matrix for predict().
  pred.holdout.lin[i] <- as.numeric(predict(lin, X[i, , drop = FALSE])$class)
  pred.holdout.quad[i] <- as.numeric(predict(quad, X[i, , drop = FALSE])$class)
}
confuse.holdout.lin <- confusion.matrix(class, pred.holdout.lin)
confuse.holdout.quad <- confusion.matrix(class, pred.holdout.quad)

## Holdout procedure using multilogistic(X, class, newdata, scoring, tol):
pred.holdout.logit <- numeric(n)
for (i in seq_len(n)) {
  new <- multilogistic(X[-i, ], class[-i], scoring = FALSE,
                       newdata = X[i, , drop = FALSE], tol = 1e-4)$pred.new
  cat("i = ", i, "pred.new = ", new, "\n")  # progress trace per held-out case
  pred.holdout.logit[i] <- new
}
confuse.holdout.logit <- confusion.matrix(class, pred.holdout.logit)

cat(paste("Logistic discrimination: estimated error rate =",
          round(confuse.holdout.logit$rate, 3), "Confusion matrix is"), "\n")
print(confuse.holdout.logit$mat)
cat(paste("Linear discrimination: estimated error rate =",
          round(confuse.holdout.lin$rate, 3), "Confusion matrix is"), "\n")
print(confuse.holdout.lin$mat)
## Report the leave-one-out results for quadratic discrimination.
## cat()'s default sep = " " reproduces the original paste()-based output.
cat("Quadratic discrimination: estimated error rate =",
    round(confuse.holdout.quad$rate, 3), "Confusion matrix is", "\n")
print(confuse.holdout.quad$mat)