# Correlation transformation and ridge regression; acetylene data
# Enter the data:
conv = c(49, 50.2, 50.5, 48.5, 47.5, 44.5, 28, 31.5, 34.5, 35, 38, 38.5, 15, 17, 20.5, 29.5)
temp = c(rep(1300, 6), rep(1200, 6), rep(1100, 4))
mole = c(7.5, 9, 11, 13.5, 17, 23, 5.3, 7.5, 11, 13.5, 17, 23, 5.3, 7.5, 11, 17)
cont = c(120, 120, 115, 130, 135, 120, 400, 380, 320, 260, 340, 410, 840, 980, 920, 860)/10000
x = cbind(temp, mole, cont, temp*mole, temp*cont, mole*cont, temp^2, mole^2, cont^2)
dimnames(x) = list(NULL, c("T", "M", "C", "T*M", "T*C", "M*C", "T2", "M2", "C2"))
n = length(conv)

# Form z = the standardized version of x, with r = z'z:
m = apply(x, 2, mean)                 # vector of 9 column means
z = x - outer(rep(1, n), m)           # subtract the means from the columns of x
s = sqrt((n-1)*apply(x, 2, var))      # square roots of the corrected sums of squares
z = z/outer(rep(1, n), s)             # z is x in standardized form
r = crossprod(z)                      # check that this is the correlation matrix of the predictors
y = (conv - mean(conv))/sqrt((n-1)*var(conv))   # standardize y as well

# Fit the regression in terms of the standardized variables:
fit1 = lsfit(z, y, intercept = FALSE)
ls.print(fit1)
d = ls.diag(fit1)
fits = z %*% fit1$coef
plot(fits, d$std.res, ylab = "std.res")

# The transformation hasn't made much difference; the condition number is still huge:
v = eigen(r)$values
min(v)          # smallest eigenvalue of r
max(v)/min(v)   # condition number of r

## Ridge regression:
# THETA will be a matrix whose j-th row holds K[j] followed by the ridge
# estimates computed with K[j] as the biasing constant.
# This is applied to the standardized variables.
# Write a function to do this for a fixed 'k':
p = ncol(z)
theta = function(k) { solve(r + k*diag(p), t(z) %*% y) }
K = seq(from = 0.001, to = 0.3, by = 0.001)
THETA = matrix(nrow = p + 1, ncol = length(K))
for (j in 1:length(K)) THETA[, j] = c(K[j], theta(K[j]))
THETA = t(THETA)

# Plot the ridge traces for the linear terms, interactions, and quadratic terms separately.
# matplot plots the columns of one matrix against those of another.
par(mfrow = c(3, 2))
matplot(THETA[, c(1, 1, 1)], THETA[, 2:4], type = "l", lty = c(1, 2, 5), xlab = "k", ylab = "coef", col = 1)
title(sub = "(a) linear terms")
legend(x = "topright", legend = c("T", "M", "C"), lty = c(1, 2, 5))
matplot(THETA[, c(1, 1, 1)], THETA[, 5:7], type = "l", lty = c(1, 2, 5), xlab = "k", ylab = "coef", col = 1)
title(sub = "(b) interaction terms")
legend(x = "topright", legend = c("TM", "TC", "MC"), lty = c(1, 2, 5))
matplot(THETA[, c(1, 1, 1)], THETA[, 8:10], type = "l", lty = c(1, 2, 5), xlab = "k", ylab = "coef", col = 1)
title(sub = "(c) quadratic terms")
legend(x = "topright", legend = c("T2", "M2", "C2"), lty = c(1, 2, 5))

# The traces seem to have stabilized by about k = .03.
# Do a regression with this value of k, using 'pseudovalues' to get the full
# regression output from R: appending sqrt(k)*I to the design matrix and zeros
# to the response makes least squares minimize ||y - Xb||^2 + k||b||^2,
# which is exactly the ridge criterion.
# Here the original data will be used.
k = .03
xnew = cbind(1, x)
px = ncol(xnew)
fit = lsfit(rbind(xnew, sqrt(k)*diag(px)), c(conv, rep(0, px)), intercept = FALSE)
ls.print(fit)
thetas = coef(fit)
fits = xnew %*% thetas
resids = conv - fits
plot(fits, resids, ylab = "residuals")
title(sub = "(d) residuals from ridge regression with k = .03")

# The output suggests that none of the terms involving 'C' is significant.
# Fit a model without these terms:
fit2 = lm(conv ~ (temp + mole)^2 + I(temp^2) + I(mole^2))
summary(fit2)
stdres = rstandard(fit2)   # standardized residuals (ls.diag is for lsfit output, not lm)
fits = predict(fit2)
plot(fits, stdres, ylab = "std.res")
title(sub = "(e) residuals from model without C")
plot(cont, stdres, ylab = "std.res")
title(sub = "(f) residuals versus C")
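
## Optional cross-checks (not part of the original analysis) -----------------

# Since r = V diag(v) V', the whole ridge trace can be obtained from a single
# eigendecomposition instead of solving a p x p system for every k:
# theta(k) = V diag(1/(v + k)) V' z'y. A sketch, reusing r, z, y, K and THETA
# from above; the last line should print a number near machine zero.
e = eigen(r, symmetric = TRUE)
b = crossprod(e$vectors, crossprod(z, y))                  # V' z'y
THETA2 = sapply(K, function(k) e$vectors %*% (b/(e$values + k)))
max(abs(t(THETA2) - THETA[, -1]))                          # agreement with the loop above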
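
# The pseudovalue trick can also be verified directly against the ridge normal
# equations (X'X + kI) theta = X'y; a quick numerical check, reusing xnew,
# conv, k, px and thetas from above:
theta.direct = solve(crossprod(xnew) + k*diag(px), crossprod(xnew, conv))
all.equal(as.vector(theta.direct), as.vector(thetas))      # should be TRUE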
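
# MASS::lm.ridge gives an independent look at the ridge traces together with
# some automatic choices of the biasing constant. It centers and scales
# internally, so its 'lambda' is not on the same scale as the 'k' above; the
# grid below is deliberately wider, and the traces should be compared by
# shape rather than by value.
library(MASS)
fit.mass = lm.ridge(conv ~ x, lambda = seq(0, 5, by = 0.05))
par(mfrow = c(1, 1))
plot(fit.mass)     # ridge traces on lm.ridge's internal scaling
select(fit.mass)   # HKB, L-W, and GCV choices of lambda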