# ======================================= CART =======================================
# ----------------------------- Using package 'tree' ---------------------------------
# install.packages("tree")  # install once interactively; do not reinstall on every run
library(tree)

# Build a new dataset `rodat` (presences AND absences of red oak) from treedata.csv.
dat <- read.csv(file = "W:/teaching/stat579/data/treedata.csv", header = TRUE)
dat2 <- subset(dat, dat$species == "Quercus rubra")   # red oak records only
ro.plots <- unique(as.character(dat2$plotID))         # plots with red oak
u.plots <- unique(as.character(dat$plotID))           # all plots
nob.plots <- u.plots[!is.element(u.plots, ro.plots)]  # plots without red oak
dat3 <- subset(dat, is.element(as.character(dat$plotID), nob.plots))  # rows from no-red-oak plots
dat4 <- subset(dat3, !duplicated(dat3$plotID))        # one row per absence plot
dat4$cover <- 0                                       # red oak cover is zero in these plots
rodat <- rbind(dat2, dat4)                            # presences + absences combined
nrow(rodat)
head(rodat)

# Model red oak cover class (including zeros) from a single predictor, elevation.
# `cover` is numeric, so tree() fits a regression tree.
rt1 <- tree(cover ~ elev, data = rodat)
rt1
summary(rt1)

# Null (total) sum of squares and residual sum of squares.
# Computed with vectorized arithmetic rather than sapply() element-by-element.
sum((rodat$cover - mean(rodat$cover))^2)
sum((resid(rt1) - mean(resid(rt1)))^2)

# Within-node sums of squares at the first two split points.
newdata <- rodat[rodat$elev < 1509.5, ]
sum((newdata$cover - mean(newdata$cover))^2)
newdata2 <- rodat[rodat$elev < 677.95, ]
sum((newdata2$cover - mean(newdata2$cover))^2)

plot(rt1); text(rt1)
with(rodat, plot(elev, cover))
x <- seq(0, 2000)
lines(x, predict(rt1, newdata = list(elev = x)), col = "lightblue", lwd = 3)

# The highest-elevation node has virtually no variance because these are (almost)
# all absences: its residual deviance is low (87.51). The other two terminal nodes
# do a poor job of predicting red oak abundance, with residual deviances of 5379
# (middle elevations) and 1055 (lowest elevations). We should expect additional
# variables to add the most to our model in these regions of elevation.
# There are default values in tree() that set the stopping rules for nodes with few
# samples or splits that add little explained deviance. Here each node had plenty of
# remaining samples, but the deviance explained by another split did not exceed the
# built-in threshold of 1% of the null deviance. Examine this more closely by lowering
# that threshold, then plotting residual deviance against tree size (# of nodes)
# with prune.tree().
rt2 <- tree(cover ~ elev, data = rodat,
            control = tree.control(1039, mindev = 0.003))  # lower mindev threshold
plot(prune.tree(rt2))  # residual deviance vs. tree size (# nodes)
abline(v = 3, col = "red")
prune.tree(rt2)
plot(cv.tree(rt2))

## Tree on a factor predictor (disturbance class).
## NOTE(review): `cover` is numeric, so this is still a REGRESSION tree, not a
## classification tree as the original comment claimed.
rt3 <- tree(cover ~ disturb, data = rodat)
rt3
summary(rt3)
plot(rt3); text(rt3)

## Classification tree using more variables (factor response -> classification).
ct1 <- tree(factor(cover > 0) ~ utme + utmn + elev + tci + streamdist + beers,
            data = rodat)
ct1
plot(ct1); text(ct1)
plot(cv.tree(ct1))
btree <- prune.tree(ct1, best = 7)
btree
summary(btree)

# ================================ Using package 'rpart' ================================
library(rpart)
rp1 <- rpart(factor(cover > 0) ~ utme + utmn + elev + tci + streamdist + beers,
             data = rodat, method = "class")  # `data=` spelled out (was `dat=`, partial match)
rp1
plot(rp1); text(rp1)
summary(rp1)

rp2 <- rpart(factor(cover > 0) ~ utme + utmn + elev + tci + streamdist + beers,
             data = rodat, method = "class",
             control = rpart.control(cp = 0.005))
plot(rp2); text(rp2)
plotcp(rp2)
printcp(rp2)

rp2.pruned <- prune(rp2, cp = 0.02)  # cp = .02 is the desired CP threshold from the table above
rp2.pruned
pred <- predict(rp2.pruned, type = "class")  # predicted classes from the pruned rp2 tree
                                             # (renamed from `exp`, which masked base::exp)
obs <- factor(rodat$cover > 0)               # actual presence/absence of red oak
sum(as.numeric(pred != obs))                 # total misclassifications

# ================================== Random Forest ==================================
### Classification (iris)
library(randomForest)
data(iris)
help(iris)
nrow(iris)
head(iris)
set.seed(71)
iris.rf <- randomForest(Species ~ ., data = iris,
                        importance = TRUE, proximity = TRUE)
print(iris.rf)
iris.rf
predict(iris.rf)
importance(iris.rf)            # variable importance scores
round(importance(iris.rf), 2)
# iris.rf$proximity
max(iris.rf$proximity)         # proximity matrix extremes
min(iris.rf$proximity)
iris.rf$proximity[1:10, 1:10]
iris.rf$proximity[1:10, 51:80]
getTree(randomForest(iris[, -5], iris[, 5]), 3, labelVar = TRUE)

### Regression (airquality)
data(airquality)
nrow(airquality)
head(airquality)
newdata <- na.omit(airquality)  # new data frame without the missing values

# Regression tree. NOTE(review): this overwrites the `rt1` fitted on `rodat` above.
library(tree)
rt1 <- tree(Ozone ~ ., data = airquality)
rt1
plot(rt1); text(rt1)

# Random forest
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data = airquality, mtry = 3,
                         importance = TRUE, na.action = na.omit)
# mtry: number of variables randomly sampled as candidates at each split.
# Defaults: sqrt(p) for classification, p/3 for regression (p = # predictors).
print(ozone.rf)

# MSE / prediction error from the out-of-bag predictions.
yhat <- ozone.rf$predicted                       # OOB predictions
yhatnoob <- predict(ozone.rf, airquality[, -1])  # in-bag (non-OOB) predictions, for comparison
res <- newdata$Ozone - yhat
mse <- sum(res^2) / nrow(newdata)
mse  # print it (original computed mse but never displayed it)

# Pseudo R^2
ssto <- sum((newdata$Ozone - mean(newdata$Ozone))^2)
sse <- sum(res^2)
ratio <- (ssto - sse) / ssto
ratio

## Variable importance: higher value means more important.
round(importance(ozone.rf), 2)