rm(list = ls()) # clear the memory
#############################################################
## PCA of the stock data in T8-4.DAT, based on the sample correlation matrix R
X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T8-4.DAT"))
colnames(X) = c("JPM", "Citi", "WellsF", "Shell", "Exxon")
out = prcomp(X, scale = T)
# out$center = colMeans(X)
out$center
# out$scale = vector of standard deviations of the columns of X
out$scale
# out$sdev = square roots of the eigenvalues of R (assuming that scale = T);
# thus sum(out$sdev^2) = p
out$sdev
Gamma = out$rotation  # matrix whose columns are the eigenvectors of R
scores = out$x        # columns are the sample principal components
# cov(scores) = diag(out$sdev^2) = diagonal matrix of the eigenvalues of R
# check: compare round(cov(scores), 4) with eigen(cor(X))
plot(out, type = 'l', main = "Scree plot")
# A 'scree' plot indicating the contributions of the sample pc's.
# The first two account for about 77% of the variation; the first three for about 87%.
cumsum(out$sdev^2)/5  # cumulative proportions of the total variance (p = 5)
# [1] 0.4874546 0.7688572 0.8689597 0.9489660 1.0000000

# Approximate 95% confidence intervals for the first two eigenvalues, using the
# large-sample result that log(lambda_hat) is approximately N(log(lambda), 2/n),
# then exponentiating the endpoints.
lam1 = out$sdev[1]^2
ci = c(log(lam1) - sqrt(2)*qnorm(.975)/sqrt(nrow(X)), log(lam1) + sqrt(2)*qnorm(.975)/sqrt(nrow(X)))
ci1 = exp(ci)
ci1
lam2 = out$sdev[2]^2
ci = c(log(lam2) - sqrt(2)*qnorm(.975)/sqrt(nrow(X)), log(lam2) + sqrt(2)*qnorm(.975)/sqrt(nrow(X)))
ci2 = exp(ci)
ci2

#############################################################
rm(list = ls()) # clear the memory
## Second example: PCA of T5-8.DAT (five variables observed over 16 pay periods)
X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T5-8.DAT"))
colnames(X) = c("legal", "extraordinary", "holdover", "COA", "meeting")
# X0 = X - as.vector(rep(1,16)) %*% t(as.vector(colMeans(X)))
# prcomp always centers the data, so the result using X0 is the same as using X;
# thus the sample scores always have an average of 0.

## Without scaling:
par(mfrow = c(3,1))
out = prcomp(X, scale = F)
plot(out, type = 'l', main = "")
title(main = "pca; without scaling", outer = T, line = -2)
round(cumsum(out$sdev^2)/sum(out$sdev^2), 3)
out

## Plot the first two pc scores and the 95% confidence ellipse
scores = out$x
p1 = scores[,1]
p2 = scores[,2]
r = sqrt(qchisq(.95, 2))
phi = 2*pi*seq(0, 1, length = 200)
z = rbind(r*cos(phi), r*sin(phi))
plot(out$sdev[1]*z[1,], out$sdev[2]*z[2,], type = 'l', ylim = c(-4000, 4000),
     xlab = "first pc", ylab = "second pc")
points(p1, p2)
text(p1[11], p2[11], "#11", adj = 1.5)

## Residual analysis based on the last three pcs ('tstat' avoids masking base::t)
y345 = scores[,3:5]
lam345 = diag(out$sdev^2)[3:5,3:5]
tstat = diag(y345 %*% solve(lam345, t(y345)))
plot(tstat, type = 'l', ylim = c(0, 8.2), xlab = "pay period")
abline(h = qchisq(.95, 3))
text(12, tstat[12], "#12", adj = 1.5)
text(13, tstat[13], "#13", adj = -1)
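## Optional check (a sketch, not part of the original analysis): the full squared
## Mahalanobis distance decomposes as sum_j y_j^2 / lambda_j over all five pcs, so the
## residual statistic 'tstat' should equal that distance minus the contributions of the
## first two pcs. 'd2' is introduced here only for this check.
d2 = mahalanobis(X, colMeans(X), cov(X))
round(tstat - (d2 - p1^2/out$sdev[1]^2 - p2^2/out$sdev[2]^2), 8)  # all (essentially) zero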
## How much of the variation of the individual variables is explained by the
## correlations with the first two pcs?
RHO = diag(1/sqrt(diag(cov(X)))) %*% out$rotation %*% diag(out$sdev)
RHO^2
# check: rowSums(RHO^2)

####################################
## With scaling (but then does the asymptotic chi-square approximation remain valid?):
out = prcomp(X, scale = T)
dev.new()
par(mfrow = c(3,1))
plot(out, type = 'l', main = "")
title(main = "pca; with scaling", outer = T, line = -2)
round(cumsum(out$sdev^2)/sum(out$sdev^2), 3)
out

## Plot the first two pc scores and the 95% confidence ellipse
scores = out$x
p1 = scores[,1]
p2 = scores[,2]
r = sqrt(qchisq(.95, 2))
phi = 2*pi*seq(0, 1, length = 200)
z = rbind(r*cos(phi), r*sin(phi))
plot(out$sdev[1]*z[1,], out$sdev[2]*z[2,], xlab = "first pc", ylab = "second pc",
     ylim = c(-3, 3), type = 'l')
points(p1, p2)
text(p1[11], p2[11], "#11", adj = 1.5)

## Residual analysis based on the last three pcs
y345 = scores[,3:5]
lam345 = diag(out$sdev^2)[3:5,3:5]
tstat = diag(y345 %*% solve(lam345, t(y345)))
plot(tstat, type = 'l', ylim = c(0, 8.2), xlab = "pay period")
abline(h = qchisq(.95, 3))
text(12, tstat[12], "#12", adj = 1.5)
text(13, tstat[13], "#13", adj = -1)

## How much of the variation of the individual variables is explained by the
## correlations with the first two pcs?
RHO = out$rotation %*% diag(out$sdev)  # sigma[i,i] = 1 when prcomp is based on R
RHO^2
# check: rowSums(RHO^2)
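## Optional check (a sketch, not part of the original script): the entries of RHO are the
## sample correlations between the original variables and the pc scores, so RHO can be
## verified directly with cor(); the same check applies to the unscaled RHO computed above.
round(RHO - cor(X, scores), 8)  # should be (essentially) zero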