# Correlation transformation and ridge regression; acetylene data
# Enter the data:
conv = c(49, 50.2, 50.5, 48.5, 47.5, 44.5, 28, 31.5, 34.5, 35, 38, 38.5, 15, 17, 20.5, 29.5)
temp = c(rep(1300, 6), rep(1200, 6), rep(1100, 4))
mole = c(7.5, 9, 11, 13.5, 17, 23, 5.3, 7.5, 11, 13.5, 17, 23, 5.3, 7.5, 11, 17)
cont = c(120, 120, 115, 130, 135, 120, 400, 380, 320, 260, 340, 410, 840, 980, 920, 860)/10000
x = cbind(temp, mole, cont, temp*mole, temp*cont, mole*cont, temp^2, mole^2, cont^2)
dimnames(x) = list(NULL, c("T", "M", "C", "T*M", "T*C", "M*C", "T2", "M2", "C2"))
n = length(conv)

# Form z = the standardized version of x, with r = z'z:
m = apply(x, 2, mean)                 # vector of 9 column means
z = x - outer(rep(1, n), m)           # subtract the means from the columns of x
s = sqrt((n-1)*apply(x, 2, var))      # square roots of the corrected sums of squares
z = z/outer(rep(1, n), s)             # z is x in standardized form
r = crossprod(z)                      # check that this is the correlation matrix of the predictors
y = (conv - mean(conv))/sqrt((n-1)*var(conv))   # standardize y as well

# Fit the regression in terms of the standardized variables:
fit1 = lsfit(z, y, intercept = FALSE)
ls.print(fit1)
d = ls.diag(fit1)
fits = z %*% fit1$coef
plot(fits, d$std.res, ylab = "std.res")

# The transformation hasn't made much difference; the condition number is still huge:
v = eigen(r)$values
min(v)          # smallest eigenvalue of r
max(v)/min(v)   # condition number of r

## Ridge regression:
# THETA will be a matrix whose j-th row holds K[j] followed by the ridge
# estimates computed with K[j] as the biasing constant.
# This is applied to the standardized variables.
# Write a function to do this for a fixed 'k':
p = ncol(z)
theta = function(k) { solve(r + k*diag(p), t(z) %*% y) }
K = seq(from = 0.001, to = 0.3, by = 0.001)
THETA = matrix(nrow = p + 1, ncol = length(K))
for (j in 1:length(K)) THETA[, j] = c(K[j], theta(K[j]))
THETA = t(THETA)

# Plot the ridge traces for the linear terms, interactions, and quadratic terms separately.
# matplot plots the columns of one matrix against those of another.
par(mfrow = c(3, 2))
matplot(THETA[, c(1, 1, 1)], THETA[, 2:4], type = "l", lty = c(1, 2, 5), xlab = "k", ylab = "coef", col = 1)
title(sub = "(a) linear terms")
legend(x = "topright", legend = c("T", "M", "C"), lty = c(1, 2, 5))
matplot(THETA[, c(1, 1, 1)], THETA[, 5:7], type = "l", lty = c(1, 2, 5), xlab = "k", ylab = "coef", col = 1)
title(sub = "(b) interaction terms")
legend(x = "topright", legend = c("TM", "TC", "MC"), lty = c(1, 2, 5))
matplot(THETA[, c(1, 1, 1)], THETA[, 8:10], type = "l", lty = c(1, 2, 5), xlab = "k", ylab = "coef", col = 1)
title(sub = "(c) quadratic terms")
legend(x = "topright", legend = c("T2", "M2", "C2"), lty = c(1, 2, 5))

# The traces seem to have stabilized by about k = .03.
# Do a regression with this value of k, using 'pseudovalues' to get the full
# regression output from R: appending sqrt(k)*I to the design matrix and zeros
# to the response makes least squares minimize ||y - Xb||^2 + k||b||^2,
# which is exactly the ridge criterion.
# Here the original data will be used.
k = .03
xnew = cbind(1, x)
px = ncol(xnew)
fit = lsfit(rbind(xnew, sqrt(k)*diag(px)), c(conv, rep(0, px)), intercept = FALSE)
ls.print(fit)
thetas = coef(fit)
fits = xnew %*% thetas
resids = conv - fits
plot(fits, resids, ylab = "residuals")
title(sub = "(d) residuals from ridge regression with k = .03")

# The output suggests that none of the terms involving 'C' is significant.
# Fit a model without these terms:
fit2 = lm(conv ~ (temp + mole)^2 + I(temp^2) + I(mole^2))
summary(fit2)
stdres = rstandard(fit2)   # standardized residuals (ls.diag is for lsfit output, not lm)
fits = predict(fit2)
plot(fits, stdres, ylab = "std.res")
title(sub = "(e) residuals from model without C")
plot(cont, stdres, ylab = "std.res")
title(sub = "(f) residuals versus C")
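
## Optional cross-checks (not part of the original analysis) -----------------

# Since r = V diag(v) V', the whole ridge trace can be obtained from a single
# eigendecomposition instead of solving a p x p system for every k:
# theta(k) = V diag(1/(v + k)) V' z'y. A sketch, reusing r, z, y, K and THETA
# from above; the last line should print a number near machine zero.
e = eigen(r, symmetric = TRUE)
b = crossprod(e$vectors, crossprod(z, y))                  # V' z'y
THETA2 = sapply(K, function(k) e$vectors %*% (b/(e$values + k)))
max(abs(t(THETA2) - THETA[, -1]))                          # agreement with the loop above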
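
# The pseudovalue trick can also be verified directly against the ridge normal
# equations (X'X + kI) theta = X'y; a quick numerical check, reusing xnew,
# conv, k, px and thetas from above:
theta.direct = solve(crossprod(xnew) + k*diag(px), crossprod(xnew, conv))
all.equal(as.vector(theta.direct), as.vector(thetas))      # should be TRUE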
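
# MASS::lm.ridge gives an independent look at the ridge traces together with
# some automatic choices of the biasing constant. It centers and scales
# internally, so its 'lambda' is not on the same scale as the 'k' above; the
# grid below is deliberately wider, and the traces should be compared by
# shape rather than by value.
library(MASS)
fit.mass = lm.ridge(conv ~ x, lambda = seq(0, 5, by = 0.05))
par(mfrow = c(1, 1))
plot(fit.mass)     # ridge traces on lm.ridge's internal scaling
select(fit.mass)   # HKB, L-W, and GCV choices of lambda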