rm(list = ls()) # clear the memory ########################### ## FACTOR ANALYSIS; STOCK PRICE DATA ## X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T8-4.DAT")) colnames(X) = c("JPM", "Citi", "WellsF", "Shell", "Exxon") # Principal component solution fit1 <- princomp(X, cor=TRUE) summary(fit1) # print variance accounted for plot(fit1, type = 'l') # First two factors account for 77% of the total variance. # Do this from scratch: S = cov(X) R = cor(X) eig = eigen(R) lambda = eig$values Gamma = eig$vectors # Gamma = fit1$loadings will do this too. sqrt(lambda) # same as 'standard deviations of the components', i.e. fit1$sdev L = Gamma%*%diag(sqrt(lambda)) L # Solution with m=2: m = 2 L2 = L[,1:m] psi = diag(R) - diag(L2%*%t(L2)) # agrees with Table 9.2 in text resids1 = R - L2%*%t(L2) - diag(psi) sum(L2[,1]^2)/5 # = lambda1/5, prop of variance explained by first factor cat("The matrix L of factor loadings, with m = 2, is","\n") print(L2) cat("\n", "Diagonal elements of Psi matrix are","\n", round(psi,3), "\n") cat("\n", "Residual matrix is", "\n") print(round(resids1,3)) # To be discussed later: #loadings(fit1) # pc loadings dev.new() plot(fit1,type="lines") # scree plot #fit1$scores # the principal components #biplot(fit1) # Maximum likelihood solution fit2 = factanal(X, m, rotation = "none") # agrees with text (almost); "varimax" is the default fit2 # check: colSums(fit2$loadings^2) colSums(fit2$loadings^2) L = fit2$loadings L Psi = diag(fit2$uniquenesses) round(t(L)%*%solve(Psi,L),4) # diagonal resids2 = R - (L%*%t(L) + Psi) # much smaller than resids1 resids2 ##### Rotations ############### fit2 # mle; no rotation fit3 = factanal(X, m, rotation = "varimax") fit3 # Bank stocks (variables 1,2,3) load heavily on factor 1, much less so on factor 2; # oil stocks load heavily on factor 2, much less so on factor 1. rotate1 = varimax(L2) # rotation of loadings matrix for pc solution rotate2 = varimax(loadings(fit2)) # identical to fit3, but labelling of factors is reversed rotate2 rotate1 ### WLS scores ###### fit.ml.1 = factanal(X, m, rotation = "none", scores = "Bartlett") fit.ml.2 = factanal(X, m, rotation = "varimax", scores = "Bartlett") # pc scores: xbar = colMeans(X) Xcentred = X - rep(1,nrow(X))%*%t(xbar) fit.pc.scores = solve(t(L2)%*%L2, t(L2)%*%t(Xcentred)) fit.pc.scores.rot = t(varimax(L2)$rotmat)%*%fit.pc.scores fit.pc.scores[, 1:4] # the factor scores f_i -- there are n = 103 of these fit.pc.scores.rot[, 1:4] ### Regression scores ###### fit4 = factanal(X, m, rotation = "varimax", scores = "regression") out = cbind(fit.ml.2$scores, t(fit.pc.scores.rot),fit4$scores) # All use varimax colnames(out) = c("wls.mle.f1", "wls.mle.f2", "wls.pc.f1", "wls.pc.f2", "regr.f1", "regr.f2") out[1:10,] # There are n = 103 rows dev.new() plot(fit4$scores, main = "Factor scores: first of 2 regression methods, \n mle estimates, varimax rotation") abline(h=0) abline(v=0) ############################################################## ############################################################## ## FACTOR ANALYSIS; OLYMPIC DECATHLON DATA ## rm(list = ls()) # clear the memory X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T1-9.dat")[,2:8]) rownames(X) = read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T1-9.dat")[,1] colnames(X) = c("100m", "200m", "400m", "800m", "1500m", "3000m", "mrthn") X # First 3 columns are times in seconds; others are in minutes X[7,] # Canada R = cor(X) R eig = eigen(R) lambda = eig$values round(lambda,3) # [1] 5.808 0.629 0.279 0.125 0.091 0.055 0.014 contrib = cumsum(eigen(R)$values)/7 contrib # [1] 0.8296606 0.9194740 0.9593789 0.9771725 0.9901684 0.9979568 1.0000000 plot(1-contrib, type = 'l', ylab = "remaining proportion of variance") ## Suggests m = 2 or 3 ?? m = 2 # yields interpretable factors (??) fit0 = factanal(X, m, rotation = "none", scores = "regression") # mle; no rotation L0 = fit0$loadings psi0 = fit0$uniquenesses commnly0 = diag(L0%*%t(L0)) fsc0 = solve(R, L0) # factor score coefficients out0 = cbind(L0, commnly0, psi0, fsc0) colnames(out0)[5:6] = c("fsc1", "fsc2") out0 fit1 = factanal(X, m, rotation = "varimax", scores = "regression") # mle; rotation L1 = fit1$loadings psi1 = fit1$uniquenesses commnly1 = diag(L1%*%t(L1)) fsc1 = solve(R, L1) # factor score coefficients fhat1 = scale(X)%*%fsc1 out1 = cbind(L1, commnly1, psi1, fsc1) colnames(out1)[5:6] = c("fsc1", "fsc2") out1 resids.mle = R - L1%*%t(L1) - diag(psi1) round(resids.mle,3) # pc solution V = eig$vectors L2 = V%*%diag(sqrt(lambda)) L2 = L2[,1:m] psi2 = diag(R) - diag(L2%*%t(L2)) commnly2 = diag(L2%*%t(L2)) fsc2 = t(solve(t(L2)%*%L2, t(L2))) out2 = cbind(L2, commnly2, psi2, fsc2) colnames(out2)[1:2] = c("Factor1", "Factor2") colnames(out2)[5:6] = c("fsc1", "fsc2") out2 pc.rot = varimax(L2) L3 = pc.rot$loadings psi3 = diag(R) - diag(L3%*%t(L3)) commnly3 = diag(L3%*%t(L3)) fsc3 = t(solve(t(L3)%*%L3, t(L3))) fhat3 = scale(X)%*%fsc3 out3 = cbind(L3, commnly3, psi3, fsc3) colnames(out3)[1:2] = c("Factor1", "Factor2") colnames(out3)[5:6] = c("fsc1", "fsc2") out3 resids.pc = R - L2%*%t(L2) - diag(psi2) round(resids.pc,3) round(resids.mle, 3) # Much smaller residuals out0 # mle; no rotation out1 # mle; rotation out2 # pc; no rotation out3 # pc; rotation dev.new() par(mfrow=c(2,2)) plot(L1, main = "rotated ml loadings") text(L1[,1], L1[,2], pos=4) plot(L3, main = "rotated pc loadings", xlab = "Factor1", ylab = "Factor2") text(L3[,1], L3[,2], pos=4) plot(fhat1, main = "ml factor scores") plot(-fhat3, main = "pc factor scores", xlab = "Factor1", ylab = "Factor 2")