###################################################### ############ Handout #9 for ST476/576 ############## ############## Discriminant Analysis ############### ###################################################### ################################################## ################# Example 1 ###################### ################################################## #### Read in data #### cancer.data <- read.table(file="W:/teaching/stat4765762012/data/breast_cancer_data.txt", header=T,sep=",") nrow(cancer.data) ncol(cancer.data) names(cancer.data) ######################################################################## ######################## Description of data ########################### ######################################################################## # Title: Wisconsin Diagnostic Breast Cancer (WDBC) # # 1) ID number # 2) Diagnosis (M = malignant, B = benign) # 3-32) # # Ten real-valued features are computed for each cell nucleus: # # a) radius (mean of distances from center to points on the perimeter) # b) texture (standard deviation of gray-scale values) # c) perimeter # d) area # e) smoothness (local variation in radius lengths) # f) compactness (perimeter^2 / area - 1.0) # g) concavity (severity of concave portions of the contour) # h) concave points (number of concave portions of the contour) # i) symmetry # j) fractal dimension ("coastline approximation" - 1) # # The mean, standard error, and "worst" or largest (mean of the three # largest values) of these features were computed for each image, # resulting in 30 features. For instance, field 3 is Mean Radius, field # 13 is Radius SE, field 23 is Worst Radius. # # All feature values are recoded with four significant digits. ########################################################################## #http://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29 #for more information ########################################################################## pairs(cancer.data[,2:12]) ## Perform Linear Discriminat Analysis (LDA) on the data library(MASS) ans.lda <- lda(diagnosis ~ radius.mean + texture.mean + perimeter.mean + area.mean + smoothness.mean + compactness.mean + concavity.mean + con_pts.mean + symmetry.mean + fractal.mean, data=cancer.data, CV=F) ## Alternatively X <- cancer.data[,3:12] group <- cancer.data[,2] ans.lda <- lda(x=X, grouping=group, CV=F) ## Calculate APER fit <- predict(ans.lda, X)$class APER <- mean(fit != cancer.data$diagnosis) APER ## Calculate AER.cv cv.fit <- lda(x=X, grouping=group, CV=T)$class AER.cv <- mean(cv.fit != cancer.data$diagnosis) AER.cv ## Try including the sd and max variables as well X.all <- cancer.data[,3:32] ans.lda <- lda(x=X.all, grouping=group, CV=F) APER.all <- mean(predict(ans.lda, X.all)$class != cancer.data$diagnosis) APER.all cv.fit <- lda(x=X.all, grouping=group, CV=T)$class AER.cv.all <- mean(cv.fit != cancer.data$diagnosis) AER.cv.all ans.lda ## Try to standardize X X.stand <- X.all for(i in 1:ncol(X.all)) X.stand[,i] <- (X.stand[,i]-mean(X.stand[,i]))/sd(X.stand[,i]) ans.lda <- lda(x=X.stand, grouping=group, CV=F) ## Notice APER is unchanged APER.stand <- mean(predict(ans.lda, X.stand)$class != cancer.data$diagnosis) APER.stand ans.lda ################################################## ################# Example 2 CART ###################### ################################################## dat <-read.csv(file="W:/teaching/stat4765762012/data/treedata.csv",head=T) dat2 <- subset(dat,dat$species=="Quercus rubra") #get a subset: a red oak subset ro.plots <- unique(as.character(dat2$plotID)) #plots with red oak u.plots <- unique(as.character(dat$plotID)) #all plots nob.plots <- u.plots[is.element(u.plots,ro.plots)==F] #plots without red oak dat3 <- subset(dat,is.element(as.character(dat$plotID),nob.plots)) #dataset of no red oak dat4 <- subset(dat3,duplicated(dat3$plotID)==F) #one row per plot dat4$cover <- 0 #cover of red oak is zero in these plots rodat = rbind(dat2,dat4) #new dataframe of presences and absences nrow(rodat) head(rodat) library(rpart) rp1 <- rpart(factor(cover>0)~utme+utmn+elev+tci+streamdist+beers,dat=rodat,method="class") rp1 plot(rp1); text(rp1) summary(rp1) rp2 = rpart(factor(cover>0)~utme+utmn+elev+tci+streamdist+beers,dat=rodat,method="class", control=rpart.control(cp=0.005)) plot(rp2); text(rp2) plotcp(rp2) printcp(rp2) rp2.pruned = prune(rp2,cp=.02) #cp=.02 is the desired CP threshold in the above table rp2.pruned exp = predict(rp2.pruned,type="class") #predicted values from our pruned rp2 tree obs = factor(rodat$cover>0) #actual presences of red oak sum(as.numeric(exp!=obs)) #total misclassifications