## R function to fit a multivariate logistic model by maximum likelihood ## Does Newton-Raphson; stops when the L1-norm of the coefficient vector drops below 'tol' ## ## Data: ## 1. an n by p matrix X, rows are observations arising from ## population 1 (the first n1 rows) ... population J (the final nJ rows). ## 2. A vector 'class' of population assignments, i.e. c(rep(1,n1),..,rep(J,nJ)). ## 3. newdata: an m by p matrix - like X - of new data to classify ## (default = NULL). Use rbind() if there is only one row. ## ## Output: ## coef = a p+1 by J-1 matrix; j-th columms contains coefficient estimates for class j ## fits = an n by J matrix, i-th row is the vector (p(1|x), ... p(J-1|x), p(J|x))' with x = X[i,] ## pred.class = an n by 1 vector of predicted class memberships (i-th element = arg max of i-th row of fits) ## ## Problems: Hessian is often nearly singular, so an option (set "scoring = TRUE") is to estimate it by ## sample proportions. But then many more iterations are needed. ## ## open and run "multilogistic example.R" to run some examples ############################################################################### ############################################################################### multilogistic = function(X, class, scoring = FALSE, newdata = NULL, tol = 1e-5) { X = as.matrix(X) n = nrow(X) p = ncol(X) J = length(unique(class)) d = (p+1)*(J-1) if(scoring == T) { w = tabulate(class)/n w = w[-J] if(length(w)>1 ) { W.constant = diag(w) - w%*%t(w)} else W.constant = w-w^2 } ############# Some subfunctions ############# pJ = function(Theta,x){ z = c(1,x) s = sum(exp(t(Theta)%*%z)) 1/(1+s) } pvec = function(Theta,x) { # Vector (p(1|x), ... p(J-1|x))' ppJ = pJ(Theta,x) z = c(1,x) ppJ*exp(t(Theta)%*%z) } qvec = function(Theta,x) { # Kronecker product of pvec with z = c(1,x); a long column z = c(1,x) pp = as.vector(pvec(Theta,x)) kronecker(pp,z) } W = function(Theta,x) { pp = as.vector(pvec(Theta,x)) if(length(pp) > 1) w = diag(pp) - pp%*%t(pp) else w = pp-pp^2 w } H = function(Theta,x) { z = c(1,x) Z = z%*%t(z) kronecker(W(Theta,x),Z) } neg.hess = function(Theta, X, scoring) { if(scoring == F) { n = nrow(X) p = ncol(X) d = (p+1)*(J-1) hh = array(dim = c(d,d,n)) for(i in 1:n) hh[,,i] = H(Theta, X[i,]) mat = apply(hh, c(1,2), sum) } else { Z = cbind(1,X) mat = kronecker(W.constant, t(Z)%*%Z) } mat } ############# end of subfunctions ############# z.totals = rowsum(cbind(1,X),class) tot = as.vector(t(z.totals[-J,])) # The vector t of all but the last group totals Theta = matrix(nrow = p+1, ncol = J-1, data = 0) # Start with a p+1 by J-1 matrix of zeros # theta[j] is j-th column of Theta theta = as.vector(Theta) rel.change = 1 iterate = 0 if(scoring == T) { Z = cbind(1,X) negHess.inv = kronecker(solve(W.constant), solve(t(Z)%*%Z)) } ##### Iterative step ##### while(rel.change > tol) { # Compute the gradient qq = matrix(nrow = d, ncol = n) for(i in 1:n) qq[,i] = qvec(Theta, X[i,]) qsum = apply(qq,1,sum) ldot = tot - qsum # The gradient of the log-likelihood if(scoring == F) negHess = neg.hess(Theta, X, scoring) if(scoring == F && any(is.na(negHess))) theta.new = theta else { if((scoring == F && det(negHess) > 1e-20) || (scoring == T)) { if(scoring == F) { theta.new = theta + solve(negHess,ldot) } else { theta.new = theta + negHess.inv%*%ldot } } else { cat("det = ", det(negHess), "L1(ldot) = ", max(abs(ldot)), "\n") theta.new = theta } } rel.change = max(abs((theta.new - theta)/theta)) theta = theta.new Theta = matrix(nrow = p+1, ncol = J-1, data = theta.new) iterate = iterate + 1 # cat("iteration ", iterate, "relative change = ", rel.change, "\n") } ########## pp = matrix(nrow = n, ncol = J) pred = vector(length = n) for(i in 1:n) pp[i,] = c(pvec(Theta, X[i,]), pJ(Theta,X[i,])) for(i in 1:n) pred[i] = which.max(pp[i,]) pred.new = NULL if(is.matrix(newdata)) { newdata = as.matrix(newdata) m = nrow(newdata) pp.new = matrix(nrow = m, ncol = J) pred.new = vector(length = m) for(i in 1:m) pp.new[i,] = c(pvec(Theta, newdata[i,]), pJ(Theta,newdata[i,])) for(i in 1:m) pred.new[i] = which.max(pp.new[i,]) } list(coef = Theta, fits = pp, pred.class = pred, pred.new = pred.new) }