rm(list = ls())   # clear the memory
###########################
data = read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T11-5.DAT")
class = data[,5]
X = data[,1:4]
X = as.matrix(X)
colnames(X) = c("sep.length","sep.width", "pet.length","pet.width")

#################################################
# Plot petal and sepal length; all three classes
plot(X[(class==1),1], X[(class==1),3], pch=21, col = "black",
     xlab = "sepal length", ylab = "petal length",
     xlim = c(.9*min(X[,1]), 1.1*max(X[,1])),
     ylim = c(.9*min(X[,3]), 1.1*max(X[,3])))
points(X[(class==2),1], X[(class==2),3], pch=23, col = "red")
points(X[(class==3),1], X[(class==3),3], pch=25, col = "blue")
legend("topleft", legend = c("pi1: setosa", "pi2: versicolor", "pi3: virginica"),
       col = c("black", "red", "blue"), pch = c(21, 23, 25))

#################################################
# Reduce to only two groups (2 and 3)
dev.new()
X23 = X[51:150, c(1,3)]   # Eliminate the first class from the training sample
class23 = class[51:150]
plot(X23[(class23==2),1], X23[(class23==2),2], pch=23, col = "red",
     xlab = "sepal length", ylab = "petal length",
     xlim = c(.9*min(X23[,1]), 1.1*max(X23[,1])),
     ylim = c(.9*min(X23[,2]), 1.1*max(X23[,2])))
points(X23[(class23==3),1], X23[(class23==3),2], pch=25, col = "blue")
legend("topleft", legend = c("pi2: versicolor", "pi3: virginica"),
       col = c("red", "blue"), pch = c(23, 25))

sample2 = X23[(class23==2),]
sample3 = X23[(class23==3),]
x2bar = colMeans(sample2)
x3bar = colMeans(sample3)
S2 = cov(sample2)
S3 = cov(sample3)
Spool = (S2+S3)/2   # equal sample sizes, so the pooled covariance is the simple average
beta2 = solve(Spool, x2bar)
beta3 = solve(Spool, x3bar)
alpha2 = log(1/2) - .5*sum(beta2*x2bar)
alpha3 = log(1/2) - .5*sum(beta3*x3bar)
L2 = function(x) alpha2 + sum(beta2*x)
L3 = function(x) alpha3 + sum(beta3*x)
alpha32 = alpha3 - alpha2
beta32 = beta3 - beta2
a32 = -alpha32/beta32[2]
b32 = -beta32[1]/beta32[2]
abline(a32, b32, lty = 1)   # The line L3-L2=0 as a function of x1;
                            # this is the boundary between classes 3,2;
                            # x2 above this line -> class 3
a32 + b32*5   # = 4.24; x2 = 4 is below this, so (6,4) -> class 2
title(main = "Linear (solid line) and logistic (broken line) \n discriminators between groups 2 and 3")

# Classifier:
belong = function(x) {
  vec = c(L2(x), L3(x))
  which.max(vec) + 1
}
belong(c(6,4))

#################################################
# Use the built-in R function; see that it produces identical results
# install and load the MASS library
library(MASS)
lin = lda(X23, class23, prior = c(1,1)/2)   # Use equal priors for comparison with the above

## Compare classifications of original data
pred1 = as.numeric(predict(lin, X23)$class) + 1
pred2 = vector(length=100)
for(i in 1:100) pred2[i] = belong(X23[i,])

# Quadratic discrimination
quad = qda(X23, class23, prior = c(1,1)/2)
pred3 = as.numeric(predict(quad, X23)$class) + 1
pred1-pred2
pred2-pred3

#################################################
# Logistic classification
X23 = as.matrix(X23)
class.01 = class23 - 2
fit = glm(class.01 ~ X23, family = binomial())
a = fit$coef[1]
b1 = fit$coef[2]
b2 = fit$coef[3]
A = -a/b2
B = -b1/b2
abline(A, B, lty = 2)   # add the discriminating line to the current plot
pred4 = predict(fit)    # linear predictor, evaluated on the training data
pred4 = (pred4 > 0) + 2

#################################################
confusion.matrix = function(class, pred) {
  J = length(unique(class))
  mat = matrix(nrow = J, ncol = J)
  for(j in 1:J) {
    for(k in 1:J) {
      mat[j,k] = sum((class == j)*(pred == k))
    }}
  error.rate = 1 - sum(diag(mat))/sum(mat)
  list(mat = mat, rate = error.rate)
}
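## Aside: for labels coded 1,...,J the same counts can be obtained with base
## R's table(); confusion.matrix() is used below because it also returns the
## apparent error rate (APER).
table(class23 - 1, pred1 - 1)   # should match confuse.lin$mat computed below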
confuse.lin = confusion.matrix(class23-1, pred1-1)
confuse.quad = confusion.matrix(class23-1, pred3-1)
confuse.logistic = confusion.matrix(class23-1, pred4-1)
confuse.lin
confuse.quad
confuse.logistic

#################################################
#################################################
# Use the complete data set
# Requires my own multilogistic() function when there are more than 2 classes
source("http://www.stat.ualberta.ca/~wiens/stat575/R%20scripts/multilogistic.R")

# Test on training set
# Here the priors are estimated (the default with lda and qda)
lin = lda(X, class)
pred.lin = as.numeric(predict(lin, X)$class)
confuse.lin = confusion.matrix(class, pred.lin)
cat(paste("Linear discrimination: APER =", round(confuse.lin$rate,3),
          "Confusion matrix is"), "\n")
print(confuse.lin$mat)

quad = qda(X, class)
pred.quad = as.numeric(predict(quad, X)$class)
confuse.quad = confusion.matrix(class, pred.quad)
cat(paste("Quadratic discrimination: APER =", round(confuse.quad$rate,3),
          "Confusion matrix is"), "\n")
print(confuse.quad$mat)

logitfit = multilogistic(X, class, scoring = F, newdata = NULL, tol = 1e-4)
pred.logitfit = logitfit$pred.class
confuse.logit = confusion.matrix(class, pred.logitfit)
cat(paste("Logistic discrimination: APER =", round(confuse.logit$rate,3),
          "Confusion matrix is"), "\n")
print(confuse.logit$mat)

##################################################
# Holdout procedure:
pred.holdout.lin = vector(length = nrow(X))
pred.holdout.quad = vector(length = nrow(X))
for (i in 1:nrow(X)) {
  lin2 = lda(X[-i,], class[-i])
  quad2 = qda(X[-i,], class[-i])
  pred.holdout.lin[i] = as.numeric(predict(lin2, X[i,])$class)
  pred.holdout.quad[i] = as.numeric(predict(quad2, X[i,])$class)
}
confuse.holdout.lin = confusion.matrix(class, pred.holdout.lin)
confuse.holdout.quad = confusion.matrix(class, pred.holdout.quad)
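## Aside: MASS can produce the leave-one-out ("holdout") classifications
## directly through the CV argument of lda() and qda().  The labels should
## agree with the loop above, possibly up to small differences because the
## loop re-estimates the prior proportions within each fold.
lin.cv  = lda(X, class, CV = TRUE)    # lin.cv, quad.cv are names used only for this check
quad.cv = qda(X, class, CV = TRUE)
table(pred.holdout.lin,  as.numeric(lin.cv$class))
table(pred.holdout.quad, as.numeric(quad.cv$class))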
## Holdout procedure using multilogistic(X, class, newdata, tol):
## Uncomment these next lines to do logistic holdout; takes a long time.
#pred.holdout.logit = vector(length = nrow(X))
#for (i in 1:nrow(X)) {
#  new = multilogistic(X[-i,], class[-i], scoring = F, newdata = rbind(X[i,]), tol = 1e-4)$pred.new
#  cat("i = ", i, "pred.new = ", new, "\n")
#  pred.holdout.logit[i] = new
#  }
#confuse.holdout.logit = confusion.matrix(class, pred.holdout.logit)

cat(paste("Linear discrimination: estimated error rate =", round(confuse.holdout.lin$rate,3),
          "Confusion matrix is"), "\n")
print(confuse.holdout.lin$mat)
cat(paste("Quadratic discrimination: estimated error rate =", round(confuse.holdout.quad$rate,3),
          "Confusion matrix is"), "\n")
print(confuse.holdout.quad$mat)
#cat(paste("Logistic discrimination: estimated error rate =", round(confuse.holdout.logit$rate,3),
#          "Confusion matrix is"), "\n")
#print(confuse.holdout.logit$mat)

##################################################
##################################################
rm(list = ls())   # clear the memory
data = read.table("http://www.stat.ualberta.ca/~wiens/stat575/datasets/T11-5.DAT")
class = data[,5]
X = data[,1:4]
X = as.matrix(X)
colnames(X) = c("sep.length","sep.width", "pet.length","pet.width")

# Fisher classification; go through the calculations step-by-step:
n = nrow(X)
J = length(unique(class))
sample1 = X[(class==1),]
sample2 = X[(class==2),]
sample3 = X[(class==3),]
x1bar = colMeans(sample1)
x2bar = colMeans(sample2)
x3bar = colMeans(sample3)
B = (J-1)*cov(rbind(x1bar, x2bar, x3bar))   # between-groups matrix of the group means
S1 = cov(sample1)
S2 = cov(sample2)
S3 = cov(sample3)
W = (n-J)*(S1+S2+S3)/3   # within-groups SSP matrix; equals 49*(S1+S2+S3) with 50 observations per group

# Compute W^(-1/2):
eig = eigen(W)
Q = eig$vectors
D = diag(eig$values)
W.neg.root = Q%*%diag(1/sqrt(eig$values))%*%t(Q)
# check: W.neg.root%*%W%*%W.neg.root

M = W.neg.root%*%B%*%W.neg.root
eig2 = eigen(M)
E = eig2$vectors
lambda = eig2$values
Lambda = diag(lambda)
# 2 nonzero eigenvalues: s = 2 = min(J-1, p) = min(2, 4).
A0 = W.neg.root%*%E[,c(1,2)]   # First two sample discriminants
lambda[1]/(lambda[1] + lambda[2])   # .99; no need to go beyond the first sample discriminant.
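## Quick check on the construction: since the columns of E are orthonormal and
## W.neg.root %*% W %*% W.neg.root = I, the discriminants in A0 should satisfy
## t(A0) %*% W %*% A0 = I and t(A0) %*% B %*% A0 = diag(lambda[1:2]),
## up to rounding error.
round(t(A0)%*%W%*%A0, 3)
round(t(A0)%*%B%*%A0, 3)
round(lambda[1:2], 3)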
dev.new()
par(mfrow=c(2, 2))

#############################################################
## First use only one sample discriminant, computed above
# Compute L1 on the training set
L1 = X%*%A0[,1]
L1.1 = x1bar%*%A0[,1]
L1.2 = x2bar%*%A0[,1]
L1.3 = x3bar%*%A0[,1]
plot(x = L1[(class==1)], y = rep(0,50), pch=21, col = "black",
     xlab = "first discriminant", yaxt = "n",
     xlim = c(1.1*min(L1), 1.1*max(L1)), ylab = "", ylim = c(-.3,.3),
     main = "First sample discriminant; \n lines at discriminant means")
legend(-.25, -.2, legend = c("pi1: setosa", "pi2: versicolor", "pi3: virginica"),
       yjust = .5, col = c("black", "red", "blue"), pch = c(21, 23, 25))
points(x = L1[(class==2)], y = rep(0,50), pch=23, col = "red")
points(x = L1[(class==3)], y = rep(0,50), pch=25, col = "blue")
lines(rbind(c(L1.1,-.1), c(L1.1, .1)), col = "black")
lines(rbind(c(L1.2,-.1), c(L1.2, .1)), col = "red")
lines(rbind(c(L1.3,-.1), c(L1.3, .1)), col = "blue")

## Use two discriminants
L2 = X%*%A0
L2.1 = x1bar%*%A0
L2.2 = x2bar%*%A0
L2.3 = x3bar%*%A0
plot(L2[(class==1),], pch=21, col = "black",
     xlim = c(1.1*min(L2[,1]), 1.1*max(L2[,1])),
     ylim = c(1.1*min(L2[,2]), 1.1*max(L2[,2])),
     ylab = "second discriminant", xlab = "first discriminant",
     main = "Both sample discriminants; \n discriminant means indicated")
points(L2[(class==2),], pch=23, col = "red")
points(L2[(class==3),], pch=25, col = "blue")
points(L2.1[1], L2.1[2], pch='X', col = "black")
points(L2.2[1], L2.2[2], pch='Y', col = "black")
points(L2.3[1], L2.3[2], pch='Z', col = "black")

#############################################################
# Repeat, using
lin1 = lda(X, class)
A = -lin1$scaling   # columns are the coefficients of the two sample discriminants

#############################################################
## First use only one sample discriminant, computed above
# Compute L1 on the training set
L1 = X%*%A[,1]
plot(x = L1[(class==1)], y = rep(0,50), pch=21, col = "black",
     xlab = "first discriminant", yaxt = "n",
     xlim = c(1.1*min(L1), 1.1*max(L1)), ylab = "", ylim = c(-.3,.3),
     main = "First sample discriminant, \n using lda output (times -1)")
legend(-.25, -.2, legend = c("pi1: setosa", "pi2: versicolor", "pi3: virginica"),
       yjust = .5, col = c("black", "red", "blue"), pch = c(21, 23, 25))
points(x = L1[(class==2)], y = rep(0,50), pch=23, col = "red")
points(x = L1[(class==3)], y = rep(0,50), pch=25, col = "blue")

## Use two discriminants
L2 = X%*%A
plot(L2[(class==1),], pch=21, col = "black",
     xlim = c(1.1*min(L2[,1]), 1.1*max(L2[,1])),
     ylim = c(1.1*min(L2[,2]), 1.1*max(L2[,2])),
     ylab = "second discriminant", xlab = "first discriminant",
     main = "Both sample discriminants, \n using lda output (times -1)")
points(L2[(class==2),], pch=23, col = "red")
points(L2[(class==3),], pch=25, col = "blue")

#############################################################
cat("Sample discriminants calculated above are", "\n")
round(A0, 2)
cat("Sample discriminants returned by lda are", "\n")
round(lin1$scaling, 2)
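## A final check: lda() normalizes its discriminants so that the within-group
## covariance of the scores is spherical, so its coefficients and those in A0
## should agree only up to a sign flip and a rescaling of each column; the
## element-wise ratios below should therefore be (nearly) constant within each
## column.
round(A[,1]/A0[,1], 3)
round(A[,2]/A0[,2], 3)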