## R function to fit a multivariate logistic model by maximum likelihood
## Does Newton-Raphson; stops when the L1-norm of the coefficient vector drops below 'tol'
##
## Data: 
##	1. an n by p matrix X, rows are observations arising from
## 	population 1 (the first n1 rows) ... population J (the final nJ rows).
## 	2. A vector 'class' of population assignments, i.e. c(rep(1,n1),..,rep(J,nJ)).
##	3. newdata: an m by p matrix - like X - of new data to classify
##   (default = NULL). Use rbind() if there is only one row.
##
## Output: 
## coef = a p+1 by J-1 matrix; j-th columms contains coefficient estimates for class j
## fits = an n by J matrix, i-th row is the vector (p(1|x), ... p(J-1|x), p(J|x))' with x = X[i,]
## pred.class = an n by 1 vector of predicted class memberships (i-th element = arg max of i-th row of fits)
##
## Problems: Hessian is often nearly singular, so an option (set "scoring = TRUE") is to estimate it by
## sample proportions. But then many more iterations are needed.
##
## open and run "multilogistic example.R" to run some examples
###############################################################################
###############################################################################


multilogistic = function(X, class, scoring = FALSE, newdata = NULL, tol = 1e-5) {

X = as.matrix(X)
n = nrow(X)
p = ncol(X)
J = length(unique(class))
d = (p+1)*(J-1)

if(scoring == T) {
w = tabulate(class)/n
w = w[-J]
if(length(w)>1 ) {
	W.constant = diag(w) - w%*%t(w)} else
	W.constant = w-w^2
}

############# Some subfunctions #############

pJ = function(Theta,x){
	z = c(1,x)
	s = sum(exp(t(Theta)%*%z))
	1/(1+s)
}

pvec = function(Theta,x) {  # Vector (p(1|x), ... p(J-1|x))'
	ppJ = pJ(Theta,x)
	z = c(1,x)
	ppJ*exp(t(Theta)%*%z)
}

qvec = function(Theta,x) { # Kronecker product of pvec with z = c(1,x); a long column
	z = c(1,x)
	pp = as.vector(pvec(Theta,x))
	kronecker(pp,z)
}

W = function(Theta,x) {
	pp = as.vector(pvec(Theta,x))
	if(length(pp) > 1) w = diag(pp) - pp%*%t(pp) else w = pp-pp^2
	w
}

H = function(Theta,x) {
	z = c(1,x)
	Z = z%*%t(z)
	kronecker(W(Theta,x),Z)
}

neg.hess = function(Theta, X, scoring) {
	if(scoring == F) {
		n = nrow(X)
		p = ncol(X)
		d = (p+1)*(J-1)
		hh = array(dim = c(d,d,n))
		for(i in 1:n) hh[,,i] = H(Theta, X[i,])
		mat = apply(hh, c(1,2), sum)
	} else {		
		Z = cbind(1,X)
		mat = kronecker(W.constant, t(Z)%*%Z) 
	}
	mat
}

############# end of subfunctions #############

z.totals = rowsum(cbind(1,X),class)

tot = as.vector(t(z.totals[-J,])) # The vector t of all but the last group totals

Theta = matrix(nrow = p+1, ncol = J-1, data = 0)  
	# Start with a p+1 by J-1 matrix of zeros
	# theta[j] is j-th column of Theta
theta = as.vector(Theta)

rel.change = 1
iterate = 0
if(scoring == T) {
	Z = cbind(1,X)
	negHess.inv = kronecker(solve(W.constant), solve(t(Z)%*%Z))
	}

##### Iterative step #####

while(rel.change > tol) {
	# Compute the gradient
	qq = matrix(nrow = d, ncol = n)
	for(i in 1:n) qq[,i] = qvec(Theta, X[i,])
	qsum = apply(qq,1,sum)
	ldot = tot - qsum # The gradient of the log-likelihood
	if(scoring == F) negHess = neg.hess(Theta, X, scoring)
	if(scoring == F && any(is.na(negHess))) theta.new = theta
	else {
		if((scoring == F && det(negHess) > 1e-20) || (scoring == T)) {
			if(scoring == F) {
				theta.new = theta + solve(negHess,ldot)
				} else {
					theta.new = theta + negHess.inv%*%ldot
					}  		
			} else { 
				cat("det = ", det(negHess), "L1(ldot) = ", max(abs(ldot)), "\n")
				theta.new = theta
			}
		}
	rel.change = max(abs((theta.new - theta)/theta))
	theta = theta.new
	Theta = matrix(nrow = p+1, ncol = J-1, data = theta.new)
	iterate = iterate + 1
# 	cat("iteration ", iterate, "relative change = ", rel.change, "\n")
}

##########

pp = matrix(nrow = n, ncol = J)
pred = vector(length = n)
for(i in 1:n) pp[i,] = c(pvec(Theta, X[i,]), pJ(Theta,X[i,]))
for(i in 1:n) pred[i] = which.max(pp[i,])

pred.new = NULL
if(is.matrix(newdata)) {
	newdata = as.matrix(newdata)
	m = nrow(newdata)
	pp.new = matrix(nrow = m, ncol = J)
	pred.new = vector(length = m)
	for(i in 1:m) pp.new[i,] = c(pvec(Theta, newdata[i,]), pJ(Theta,newdata[i,]))
	for(i in 1:m) pred.new[i] = which.max(pp.new[i,])
	}


list(coef = Theta, fits = pp, pred.class = pred, pred.new = pred.new)

}