rm(list = ls()) # clear the memory
#############################################################
## PCA of the stock data in T8-4.DAT, based on the sample correlation matrix R
X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T8-4.DAT"))
colnames(X) = c("JPM", "Citi", "WellsF", "Shell", "Exxon")
out = prcomp(X, scale = T)
# out$center = colMeans(X)
out$center
# out$scale = vector of standard deviations of the columns of X
out$scale
# out$sdev = square roots of the eigenvalues of R (assuming that scale = T);
# thus sum(out$sdev^2) = p
out$sdev
Gamma = out$rotation  # matrix whose columns are the eigenvectors of R
scores = out$x        # columns are the sample principal components
# cov(scores) = diag(out$sdev^2) = diagonal matrix of the eigenvalues of R
# check: compare round(cov(scores), 4) with eigen(cor(X))
plot(out, type = 'l', main = "Scree plot")
# A 'scree' plot indicating the contributions of the sample pc's.
# The first two account for about 77% of the variation; the first three for about 87%.
cumsum(out$sdev^2)/5  # cumulative proportions of the total variance (p = 5)
# [1] 0.4874546 0.7688572 0.8689597 0.9489660 1.0000000

# Approximate 95% confidence intervals for the first two eigenvalues, using the
# large-sample result that log(lambda_hat) is approximately N(log(lambda), 2/n),
# then exponentiating the endpoints.
lam1 = out$sdev[1]^2
ci = c(log(lam1) - sqrt(2)*qnorm(.975)/sqrt(nrow(X)), log(lam1) + sqrt(2)*qnorm(.975)/sqrt(nrow(X)))
ci1 = exp(ci)
ci1
lam2 = out$sdev[2]^2
ci = c(log(lam2) - sqrt(2)*qnorm(.975)/sqrt(nrow(X)), log(lam2) + sqrt(2)*qnorm(.975)/sqrt(nrow(X)))
ci2 = exp(ci)
ci2

#############################################################
rm(list = ls()) # clear the memory
## Second example: PCA of T5-8.DAT (five variables observed over 16 pay periods)
X = as.matrix(read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T5-8.DAT"))
colnames(X) = c("legal", "extraordinary", "holdover", "COA", "meeting")
# X0 = X - as.vector(rep(1,16)) %*% t(as.vector(colMeans(X)))
# prcomp always centers the data, so the result using X0 is the same as using X;
# thus the sample scores always have an average of 0.

## Without scaling:
par(mfrow = c(3,1))
out = prcomp(X, scale = F)
plot(out, type = 'l', main = "")
title(main = "pca; without scaling", outer = T, line = -2)
round(cumsum(out$sdev^2)/sum(out$sdev^2), 3)
out

## Plot the first two pc scores and the 95% confidence ellipse
scores = out$x
p1 = scores[,1]
p2 = scores[,2]
r = sqrt(qchisq(.95, 2))
phi = 2*pi*seq(0, 1, length = 200)
z = rbind(r*cos(phi), r*sin(phi))
plot(out$sdev[1]*z[1,], out$sdev[2]*z[2,], type = 'l', ylim = c(-4000, 4000),
     xlab = "first pc", ylab = "second pc")
points(p1, p2)
text(p1[11], p2[11], "#11", adj = 1.5)

## Residual analysis based on the last three pcs ('tstat' avoids masking base::t)
y345 = scores[,3:5]
lam345 = diag(out$sdev^2)[3:5,3:5]
tstat = diag(y345 %*% solve(lam345, t(y345)))
plot(tstat, type = 'l', ylim = c(0, 8.2), xlab = "pay period")
abline(h = qchisq(.95, 3))
text(12, tstat[12], "#12", adj = 1.5)
text(13, tstat[13], "#13", adj = -1)
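## Optional check (a sketch, not part of the original analysis): the full squared
## Mahalanobis distance decomposes as sum_j y_j^2 / lambda_j over all five pcs, so the
## residual statistic 'tstat' should equal that distance minus the contributions of the
## first two pcs. 'd2' is introduced here only for this check.
d2 = mahalanobis(X, colMeans(X), cov(X))
round(tstat - (d2 - p1^2/out$sdev[1]^2 - p2^2/out$sdev[2]^2), 8)  # all (essentially) zero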
## How much of the variation of the individual variables is explained by the
## correlations with the first two pcs?
RHO = diag(1/sqrt(diag(cov(X)))) %*% out$rotation %*% diag(out$sdev)
RHO^2
# check: rowSums(RHO^2)

####################################
## With scaling (but then does the asymptotic chi-square approximation remain valid?):
out = prcomp(X, scale = T)
dev.new()
par(mfrow = c(3,1))
plot(out, type = 'l', main = "")
title(main = "pca; with scaling", outer = T, line = -2)
round(cumsum(out$sdev^2)/sum(out$sdev^2), 3)
out

## Plot the first two pc scores and the 95% confidence ellipse
scores = out$x
p1 = scores[,1]
p2 = scores[,2]
r = sqrt(qchisq(.95, 2))
phi = 2*pi*seq(0, 1, length = 200)
z = rbind(r*cos(phi), r*sin(phi))
plot(out$sdev[1]*z[1,], out$sdev[2]*z[2,], xlab = "first pc", ylab = "second pc",
     ylim = c(-3, 3), type = 'l')
points(p1, p2)
text(p1[11], p2[11], "#11", adj = 1.5)

## Residual analysis based on the last three pcs
y345 = scores[,3:5]
lam345 = diag(out$sdev^2)[3:5,3:5]
tstat = diag(y345 %*% solve(lam345, t(y345)))
plot(tstat, type = 'l', ylim = c(0, 8.2), xlab = "pay period")
abline(h = qchisq(.95, 3))
text(12, tstat[12], "#12", adj = 1.5)
text(13, tstat[13], "#13", adj = -1)

## How much of the variation of the individual variables is explained by the
## correlations with the first two pcs?
RHO = out$rotation %*% diag(out$sdev)  # sigma[i,i] = 1 when prcomp is based on R
RHO^2
# check: rowSums(RHO^2)
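## Optional check (a sketch, not part of the original script): the entries of RHO are the
## sample correlations between the original variables and the pc scores, so RHO can be
## verified directly with cor(); the same check applies to the unscaled RHO computed above.
round(RHO - cor(X, scores), 8)  # should be (essentially) zero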