rm(list = ls()) # clear the memory ########################### # Triangular data in T12-3.DAT L = matrix(0, 11, 11) L[outer(1:11, 1:11, '<=')] <- scan("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T12-3.DAT") X = L + t(L) - diag(diag(L)) rownames(X) = c("E", "N", "Da", "Du", "G", "Fr", "Sp", "I", "P", "H", "Fi") colnames(X) = rownames(X) X #Similarities; e.g. English/French = 4: {3,6,7,9} d = as.dist(10-X) fit1 = hclust(d, method = "single") fit2 = hclust(d, method = "complete") fit3 = hclust(d, method = "ward") par(mfrow=c(3,1)) # cut each tree into 3 clusters and # draw dendogram with borders around the 3 clusters plot(fit1, sub = "single linkage") groups1 <- cutree(fit1, k=3) rect.hclust(fit1, k=3, border="red") groups1 plot(fit2, sub = "complete linkage") groups2 <- cutree(fit2, k=3) rect.hclust(fit2, k=3, border="blue") groups2 plot(fit3, sub = "Ward's linkage") groups3 <- cutree(fit3, k=3) rect.hclust(fit3, k=3, border="green") groups3 #################################### # Example 12.12 rm(list = ls()) # clear the memory X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T12-4.DAT")[,1:8]) rownames(X) = read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T12-4.DAT")[,9] ssratio = vector(length = 10) for(k in 1:length(ssratio)) { fit = kmeans(X, k, nstart = 25) # Tries numerous random starts ssratio[k] = fit$betweenss/fit$totss } par(mfrow=c(3,1)) plot(ssratio) # Levels out after k = 4 or 5 fit1 = kmeans(X, 5, nstart = 25) fit2 = hclust(dist(X), method = "centroid") # compare with agglomerative clustering #plot(fit2) #rect.hclust(fit2, k = 5, border="red") groups1 = fit1$cluster groups2 = cutree(fit2, k = 5) groups1 groups2 plot(groups1, sub = "k-means") plot(groups2, sub = "hierarchical, centroid linkage") #################################### # Example 12.13 - Iris data using Mclust() X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T11-5.DAT")) knowngroups = X[,5] X = X[,1:4] dev.new() library(mclust) fit3 <- Mclust(X) plot(fit3, X) # plot results fit3$parameters # display the best model dev.new() fit4 <- Mclust(X, G=3) plot(fit4, X) # plot results fit4$parameters # display the best model #################################### rm(list = ls()) # clear the memory X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T11-5.DAT")) knowngroups = X[,5] X = X[,1:4] # Metric MDS d = dist(X, p=1) # distances between the rows; p = power in Minkowski distance fit1 = cmdscale(d,eig=TRUE, k = 2) # k is the number of dimensions of the result fit1 # view results fit1$GOF # Two different versions of Stress (or something like it) # plot solution par(mfrow = c(2,1)) x1 = fit1$points[(knowngroups==1),1] y1 = fit1$points[(knowngroups==1),2] x2 = fit1$points[(knowngroups==2),1] y2 = fit1$points[(knowngroups==2),2] x3 = fit1$points[(knowngroups==3),1] y3 = fit1$points[(knowngroups==3),2] plot(x1, y1, xlim = c(min(x1,x2,x3), max(x1,x2,x3)), ylim = c(min(y1,y2,y3), max(y1,y2,y3)), pch=21, col = "black", xlab="Coordinate 1", ylab="Coordinate 2", main="Metric MDS in two dimensions") points(x2, y2, pch=23, col = "red") points(x3, y3, pch=25, col = "blue") legend(-2,1.5, legend = c("pi1: setosa", "pi2: versicolor", "pi3: virginica"), col = c("black", "red", "blue"), pch = c(21, 23, 25)) gof1 = vector(length = 4) for(k in 1:length(gof1)) { fit1 = cmdscale(d, eig=TRUE, k) gof1[k] = fit1$GOF[1] } plot(gof1, main = "GOF vs. k") # k = 3 is best? k = 2 is good enough? fit1 = cmdscale(d, eig=TRUE, k = 2) fit1 # Nonmetric MDS library(MASS) d = dist(X[-143,], p=1) # all points must be distinct knowngroups = knowngroups[-143] fit2 = isoMDS(d, y = cmdscale(d, 2), k = 2, p = 1) # k is the number of dim; y is initial config fit2 # view results # plot solution dev.new() par(mfrow = c(2,1)) x1 = -fit2$points[(knowngroups==1),1] y1 = fit2$points[(knowngroups==1),2] x2 = -fit2$points[(knowngroups==2),1] y2 = fit2$points[(knowngroups==2),2] x3 = -fit2$points[(knowngroups==3),1] y3 = fit2$points[(knowngroups==3),2] plot(x1, y1, xlim = c(min(x1,x2,x3), max(x1,x2,x3)), ylim = c(min(y1,y2,y3), max(y1,y2,y3)), pch=21, col = "black", xlab="Coordinate 1", ylab="Coordinate 2", main="Nonmetric MDS in two dimensions") points(x2, y2, pch=23, col = "red") points(x3, y3, pch=25, col = "blue") legend(-2,.8, legend = c("pi1: setosa", "pi2: versicolor", "pi3: virginica"), col = c("black", "red", "blue"), pch = c(21, 23, 25)) stress2 = vector(length = 4) for(k in 1:length(stress2)) { fit2 = isoMDS(d, y = cmdscale(d, k), k) stress2[k] = fit2$stress } plot(stress2, main = "Stress vs. k") # k = 2 is good enough? fit2 = isoMDS(d, k = 2) fit2 #################################### rm(list = ls()) # clear the memory X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T12-8.DAT")) rownames(X) = c("P0", "P1", "P2", "P3", "P4", "P5", "P6") colnames(X) = c("A", "B", "C", "D") x = as.table(X) x par(mfrow=c(2,2)) plot(x, main = "type by site") plot(t(x), main = "site by type") library(ca) prop.table(x, 1) # row percentages prop.table(x, 2) # column percentages fit <- ca(x) print(fit) # basic results summary(fit) # extended results plot(fit) # symmetric map; see help(plot.ca) plot(fit, mass = TRUE, contrib = "absolute", arrows = c(FALSE, TRUE)) #Check: n = sum(x) P = as.matrix(x/n) I = nrow(P) J = ncol(P) r = as.vector(P%*%rep(1,J)) c = as.vector(t(P)%*%rep(1,I)) P0 = P - r%*%t(c) Dr = diag(r) Dc = diag(c) rootDr = diag(sqrt(r)) rootDc = diag(sqrt(c)) B0 = solve(rootDr,P0)%*%solve(rootDc) s = svd(B0, nu=I, nv=J) U0 = s$u V0 = s$v lambda0 = s$d inertia = lambda0^2 inertia = inertia[1:3] #check: cat("Inertias are the first 3 of the lambda0^2:", inertia, "\n") #to plot: I-vectors f2,f3; J-vectors g2,g3 f1 = lambda0[1]*solve(rootDr,U0[,1]) f2 = lambda0[2]*solve(rootDr,U0[,2]) g1 = lambda0[1]*solve(rootDc,V0[,1]) g2 = lambda0[2]*solve(rootDc,V0[,2]) F = cbind(f1,f2) G = cbind(g1,g2) F1 = cbind(f1, rep(0, length(f1))) G1 = cbind(g1, rep(0, length(g1))) dev.new() plot(F1, pch=21, xlab="", ylab="", ylim=c(0,0), xlim = c(min(f1,g1)-.1,max(f1,g1)+.1), bg = "blue", yaxt = "n") text(F1, labels = rownames(X), adj= c(-.25,1.5)) points(G1, pch=24, bg = "red") text(G1, labels = colnames(X), adj= c(-.25,-1.5)) title(main = "one-dimensional representation") dev.new() plot(F, pch=21, xlab="", ylab="", ylim=c(-1, .7), xlim=c(-.8,1.1), bg = "blue") text(F, labels = rownames(X), adj= c(-.25,-1.5), cex=.75) points(G, pch=24, bg = "red") text(G, labels = colnames(X), adj= c(-.25,-1.5), cex=.75) abline(h=0, lty = 3) abline(v = 0, lty = 3) title(main = "two-dimensional representation")