# ======================================= CART =======================================
# ----------------------------- Using package 'tree' ---------------------------------
# install.packages("tree")  # install once interactively; do not reinstall on every run
library(tree)

# Build a new dataset `rodat` (presences AND absences of red oak) from treedata.csv.
dat <- read.csv(file = "W:/teaching/stat579/data/treedata.csv", header = TRUE)
dat2 <- subset(dat, dat$species == "Quercus rubra")   # red oak records only
ro.plots <- unique(as.character(dat2$plotID))         # plots with red oak
u.plots <- unique(as.character(dat$plotID))           # all plots
nob.plots <- u.plots[!is.element(u.plots, ro.plots)]  # plots without red oak
dat3 <- subset(dat, is.element(as.character(dat$plotID), nob.plots))  # rows from no-red-oak plots
dat4 <- subset(dat3, !duplicated(dat3$plotID))        # one row per absence plot
dat4$cover <- 0                                       # red oak cover is zero in these plots
rodat <- rbind(dat2, dat4)                            # presences + absences combined
nrow(rodat)
head(rodat)

# Model red oak cover class (including zeros) from a single predictor, elevation.
# `cover` is numeric, so tree() fits a regression tree.
rt1 <- tree(cover ~ elev, data = rodat)
rt1
summary(rt1)

# Null (total) sum of squares and residual sum of squares.
# Computed with vectorized arithmetic rather than sapply() element-by-element.
sum((rodat$cover - mean(rodat$cover))^2)
sum((resid(rt1) - mean(resid(rt1)))^2)

# Within-node sums of squares at the first two split points.
newdata <- rodat[rodat$elev < 1509.5, ]
sum((newdata$cover - mean(newdata$cover))^2)
newdata2 <- rodat[rodat$elev < 677.95, ]
sum((newdata2$cover - mean(newdata2$cover))^2)

plot(rt1); text(rt1)
with(rodat, plot(elev, cover))
x <- seq(0, 2000)
lines(x, predict(rt1, newdata = list(elev = x)), col = "lightblue", lwd = 3)

# The highest-elevation node has virtually no variance because these are (almost)
# all absences: its residual deviance is low (87.51). The other two terminal nodes
# do a poor job of predicting red oak abundance, with residual deviances of 5379
# (middle elevations) and 1055 (lowest elevations). We should expect additional
# variables to add the most to our model in these regions of elevation.
# There are default values in tree() that set the stopping rules for nodes with few
# samples or splits that add little explained deviance. Here each node had plenty of
# remaining samples, but the deviance explained by another split did not exceed the
# built-in threshold of 1% of the null deviance. Examine this more closely by lowering
# that threshold, then plotting residual deviance against tree size (# of nodes)
# with prune.tree().
rt2 <- tree(cover ~ elev, data = rodat,
            control = tree.control(1039, mindev = 0.003))  # lower mindev threshold
plot(prune.tree(rt2))  # residual deviance vs. tree size (# nodes)
abline(v = 3, col = "red")
prune.tree(rt2)
plot(cv.tree(rt2))

## Tree on a factor predictor (disturbance class).
## NOTE(review): `cover` is numeric, so this is still a REGRESSION tree, not a
## classification tree as the original comment claimed.
rt3 <- tree(cover ~ disturb, data = rodat)
rt3
summary(rt3)
plot(rt3); text(rt3)

## Classification tree using more variables (factor response -> classification).
ct1 <- tree(factor(cover > 0) ~ utme + utmn + elev + tci + streamdist + beers,
            data = rodat)
ct1
plot(ct1); text(ct1)
plot(cv.tree(ct1))
btree <- prune.tree(ct1, best = 7)
btree
summary(btree)

# ================================ Using package 'rpart' ================================
library(rpart)
rp1 <- rpart(factor(cover > 0) ~ utme + utmn + elev + tci + streamdist + beers,
             data = rodat, method = "class")  # `data=` spelled out (was `dat=`, partial match)
rp1
plot(rp1); text(rp1)
summary(rp1)

rp2 <- rpart(factor(cover > 0) ~ utme + utmn + elev + tci + streamdist + beers,
             data = rodat, method = "class",
             control = rpart.control(cp = 0.005))
plot(rp2); text(rp2)
plotcp(rp2)
printcp(rp2)

rp2.pruned <- prune(rp2, cp = 0.02)  # cp = .02 is the desired CP threshold from the table above
rp2.pruned
pred <- predict(rp2.pruned, type = "class")  # predicted classes from the pruned rp2 tree
                                             # (renamed from `exp`, which masked base::exp)
obs <- factor(rodat$cover > 0)               # actual presence/absence of red oak
sum(as.numeric(pred != obs))                 # total misclassifications

# ================================== Random Forest ==================================
### Classification (iris)
library(randomForest)
data(iris)
help(iris)
nrow(iris)
head(iris)
set.seed(71)
iris.rf <- randomForest(Species ~ ., data = iris,
                        importance = TRUE, proximity = TRUE)
print(iris.rf)
iris.rf
predict(iris.rf)
importance(iris.rf)            # variable importance scores
round(importance(iris.rf), 2)
# iris.rf$proximity
max(iris.rf$proximity)         # proximity matrix extremes
min(iris.rf$proximity)
iris.rf$proximity[1:10, 1:10]
iris.rf$proximity[1:10, 51:80]
getTree(randomForest(iris[, -5], iris[, 5]), 3, labelVar = TRUE)

### Regression (airquality)
data(airquality)
nrow(airquality)
head(airquality)
newdata <- na.omit(airquality)  # new data frame without the missing values

# Regression tree. NOTE(review): this overwrites the `rt1` fitted on `rodat` above.
library(tree)
rt1 <- tree(Ozone ~ ., data = airquality)
rt1
plot(rt1); text(rt1)

# Random forest
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data = airquality, mtry = 3,
                         importance = TRUE, na.action = na.omit)
# mtry: number of variables randomly sampled as candidates at each split.
# Defaults: sqrt(p) for classification, p/3 for regression (p = # predictors).
print(ozone.rf)

# MSE / prediction error from the out-of-bag predictions.
yhat <- ozone.rf$predicted                       # OOB predictions
yhatnoob <- predict(ozone.rf, airquality[, -1])  # in-bag (non-OOB) predictions, for comparison
res <- newdata$Ozone - yhat
mse <- sum(res^2) / nrow(newdata)
mse  # print it (original computed mse but never displayed it)

# Pseudo R^2
ssto <- sum((newdata$Ozone - mean(newdata$Ozone))^2)
sse <- sum(res^2)
ratio <- (ssto - sse) / ssto
ratio

## Variable importance: higher value means more important.
round(importance(ozone.rf), 2)