######################################################
############  Handout #9 for ST476/576  ##############
##############  Discriminant Analysis  ###############
######################################################


##################################################
################# Example 1 ######################
##################################################

#### Read in data ####
cancer.data <- read.table(file="W:/teaching/stat4765762012/data/breast_cancer_data.txt", header=T,sep=",")
nrow(cancer.data)
ncol(cancer.data)
names(cancer.data)
########################################################################
######################## Description of data ###########################
########################################################################
# Title: Wisconsin Diagnostic Breast Cancer (WDBC)
#
# 1) ID number
# 2) Diagnosis (M = malignant, B = benign)
# 3-32)
# 
# Ten real-valued features are computed for each cell nucleus:
# 
# 	a) radius (mean of distances from center to points on the perimeter)
#	b) texture (standard deviation of gray-scale values)
#	c) perimeter
#	d) area
#	e) smoothness (local variation in radius lengths)
#	f) compactness (perimeter^2 / area - 1.0)
#	g) concavity (severity of concave portions of the contour)
#	h) concave points (number of concave portions of the contour)
#	i) symmetry 
#	j) fractal dimension ("coastline approximation" - 1)
#
# The mean, standard error, and "worst" or largest (mean of the three
# largest values) of these features were computed for each image,
# resulting in 30 features.  For instance, field 3 is Mean Radius, field
# 13 is Radius SE, field 23 is Worst Radius.
# 
# All feature values are recoded with four significant digits.
##########################################################################
#http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
#for more information
##########################################################################


pairs(cancer.data[,2:12])

## Perform Linear Discriminat Analysis (LDA) on the data
library(MASS)

ans.lda <- lda(diagnosis ~ radius.mean + texture.mean + perimeter.mean + area.mean + smoothness.mean + compactness.mean + concavity.mean + con_pts.mean + symmetry.mean + fractal.mean, data=cancer.data, CV=F)

## Alternatively
X <- cancer.data[,3:12]
group <- cancer.data[,2]
ans.lda <- lda(x=X, grouping=group, CV=F)

## Calculate APER
fit <- predict(ans.lda, X)$class
APER <- mean(fit != cancer.data$diagnosis)
APER


## Calculate AER.cv
cv.fit <- lda(x=X, grouping=group, CV=T)$class
AER.cv <- mean(cv.fit != cancer.data$diagnosis)
AER.cv


## Try including the sd and max variables as well
X.all <- cancer.data[,3:32]
ans.lda <- lda(x=X.all, grouping=group, CV=F)
APER.all <- mean(predict(ans.lda, X.all)$class != cancer.data$diagnosis)
APER.all

cv.fit <- lda(x=X.all, grouping=group, CV=T)$class
AER.cv.all <- mean(cv.fit != cancer.data$diagnosis)
AER.cv.all

ans.lda


## Try to standardize X
X.stand <- X.all
for(i in 1:ncol(X.all))
  X.stand[,i] <- (X.stand[,i]-mean(X.stand[,i]))/sd(X.stand[,i])

ans.lda <- lda(x=X.stand, grouping=group, CV=F)

## Notice APER is unchanged
APER.stand <- mean(predict(ans.lda, X.stand)$class != cancer.data$diagnosis)
APER.stand

ans.lda


##################################################
################# Example 2 CART ######################
##################################################
dat <-read.csv(file="W:/teaching/stat4765762012/data/treedata.csv",head=T)  
dat2 <- subset(dat,dat$species=="Quercus rubra") #get a subset: a red oak subset
ro.plots <- unique(as.character(dat2$plotID))    #plots with red oak
u.plots <- unique(as.character(dat$plotID)) #all plots
nob.plots <- u.plots[is.element(u.plots,ro.plots)==F] #plots without red oak
dat3 <- subset(dat,is.element(as.character(dat$plotID),nob.plots)) #dataset of no red oak
dat4 <- subset(dat3,duplicated(dat3$plotID)==F)  #one row per plot
dat4$cover <- 0     #cover of red oak is zero in these plots
rodat = rbind(dat2,dat4)     #new dataframe of presences and absences
nrow(rodat)
head(rodat)

library(rpart)
rp1 <- rpart(factor(cover>0)~utme+utmn+elev+tci+streamdist+beers,dat=rodat,method="class")
rp1
plot(rp1); text(rp1)
summary(rp1)
rp2 = rpart(factor(cover>0)~utme+utmn+elev+tci+streamdist+beers,dat=rodat,method="class", 
control=rpart.control(cp=0.005))
plot(rp2); text(rp2)
plotcp(rp2)
printcp(rp2)
rp2.pruned = prune(rp2,cp=.02)    #cp=.02 is the desired CP threshold in the above table
rp2.pruned
exp = predict(rp2.pruned,type="class")     #predicted values from our pruned rp2 tree
obs = factor(rodat$cover>0)           #actual presences of red oak
sum(as.numeric(exp!=obs))             #total misclassifications