##########################################################
############### Handout #8 for ST440/540 ###############
########### Variable Selection in Regression #############
##########################################################

####################################################
################### Example 1 ######################
##### Variable Selection on simulated data #########
####################################################

set.seed(22)
n <- 100
X <- as.data.frame(matrix(runif(10*n), nrow=n))
X[,4] <- X[,1] + rnorm(n, 0, .1)        ## x4 is highly correlated with x1
names(X) <- paste("x", 1:10, sep="")
true <- 10 + 2*X[,1] + X[,2] + 3*X[,3]  ## the "true" mean of y
y <- true + rnorm(n, 0, 1.1)
ex.data <- as.data.frame(cbind(X, y))

## Notice that the x5-x10 predictors are completely uninformative
## (i.e., they are not included in the "true" model for y)

## pairs plot of y against x1-x4
pairs(ex.data[, c(11, 1:4)])

## pairs plot of the true line against the informative x's
pairs(cbind(true, ex.data[, 1:3]))

## Plotting the "true" regression plane in scatterplots still gives the
## appearance of error because of the effect of the other x's

### Fit full model ###
ans.all <- lm(y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10, data=ex.data)
summary(ans.all)

## calculate MSE from the true line
mse.all <- sum((ans.all$fitted - true)^2)/n
mse.all

### Perform Best Subsets search ###
## Install package "leaps" (only need to do this the first time)
#install.packages("leaps")
## Load library "leaps"
library(leaps)

## can use method = "Cp", "adjr2", or "r2"
ans.bs <- leaps(X, y, method="Cp", nbest=5)

## Get the 5 "best" models according to Cp
ans.bs$which[order(ans.bs$Cp)[1:5], ]
ans.bs$Cp[order(ans.bs$Cp)[1:5]]

## Get the 5 "best" models according to adjusted R^2
ans.r2 <- leaps(X, y, method="adjr2", nbest=5)
ans.r2$which[order(-ans.r2$adjr2)[1:5], ]
ans.r2$adjr2[order(-ans.r2$adjr2)[1:5]]

## Fit the "best" model according to Cp
ans.sub <- lm(y ~ x1+x2+x3, data=ex.data)
summary(ans.sub)

## calculate MSE from the true line
mse.sub <- sum((ans.sub$fitted - true)^2)/n
mse.sub

### Perform Stepwise search ###
## Must also specify the "full" model
upper <- formula(~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10)
lower <- formula(~ 1)

## Must first create a "base" model for the step function
## This one starts from the "null" (intercept-only) model, i.e., forward/stepwise selection
ans0 <- lm(y ~ 1, data=ex.data)

## Now conduct the stepwise search
## Can use direction = "both", "backward", or "forward"
ans.sw <- step(ans0, scope=list(lower=lower, upper=upper), direction="both")
summary(ans.sw)
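### Supplementary sketch (not part of the original handout) ###
## step() penalizes with AIC (k = 2) by default; setting k = log(n) gives the
## BIC penalty, which tends to favor smaller models. A minimal sketch, assuming
## ans0, lower, upper, ex.data, true, and n exist as defined above.
ans.bic <- step(ans0, scope=list(lower=lower, upper=upper),
                direction="both", k=log(n), trace=0)
summary(ans.bic)

## MSE from the true line for the BIC-selected model
mse.bic <- sum((ans.bic$fitted - true)^2)/n
mse.bic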
## Now start from the "full" model, i.e., backward elimination
ans1 <- lm(y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10, data=ex.data)
ans.sw1 <- step(ans1, scope=list(lower=lower, upper=upper), direction="backward")
summary(ans.sw1)

## calculate MSE from the true line for each fitted model
mse.all <- sum((ans.all$fitted - true)^2)/n
mse.sub <- sum((ans.sub$fitted - true)^2)/n
mse.sw  <- sum((ans.sw$fitted  - true)^2)/n
mse.sw1 <- sum((ans.sw1$fitted - true)^2)/n
mse.all
mse.sub
mse.sw
mse.sw1

####################################################
################### Example 2 ######################
##### Variable Selection on the coleman data #######
####################################################

## Install packages "car" and "leaps" (only need to do this the first time)
#install.packages("car")
#install.packages("leaps")
library(car)
library(leaps)

coleman <- read.table(file="W:/teaching/stat440540/Fall2013/data/TAB6-4.DAT", sep="",
                      col.names=c("School","x1","x2","x3","x4","x5","y"))
attach(coleman)
coleman
summary(coleman)

##################### best subset selection #####################
X <- data.frame(cbind(coleman$x1, coleman$x2, coleman$x3, coleman$x4, coleman$x5))
X
y <- coleman$y
y

ans.cp <- leaps(X, y, method="Cp", nbest=5)

## Get the 5 "best" models according to Cp
ans.cp$which[order(ans.cp$Cp)[1:5], ]
ans.cp$Cp[order(ans.cp$Cp)[1:5]]

##################### stepwise selection #####################
upper <- formula(~ x1+x2+x3+x4+x5)
lower <- formula(~ 1)

## start from the "full" model, i.e., backward elimination
ans0 <- lm(y ~ 1)
ans1 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman)
ans.back <- step(ans1, direction="backward")
summary(ans.back)

## compare candidate submodels with partial F tests
fit1 <- lm(y ~ x1+x3+x4)
fit2 <- lm(y ~ x3+x4)
fit3 <- lm(y ~ x1+x3)
anova(fit2, fit1)
anova(fit3, fit1)
## choose the model y ~ x3+x4

####################### delete observation #18 #####################
coleman2 <- coleman[-18, ]

##################### best subset selection #####################
X2 <- data.frame(cbind(coleman2$x1, coleman2$x2, coleman2$x3, coleman2$x4, coleman2$x5))
X2
y2 <- coleman2$y
y2

ans.cp2 <- leaps(X2, y2, method="Cp", nbest=5)

## Get the 5 "best" models according to Cp
ans.cp2$which[order(ans.cp2$Cp)[1:5], ]
ans.cp2$Cp[order(ans.cp2$Cp)[1:5]]

##################### stepwise selection #####################
upper <- formula(~ x1+x2+x3+x4+x5)
lower <- formula(~ 1)

## start from the "full" model, i.e., backward elimination
ans12 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman2)
ans.back2 <- step(ans12, direction="backward")
summary(ans.back2)

####################### outliers #####################
co2 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman2)
cop2 <- summary(co2)

rstudent(co2)       ## gives rstudent (studentized deleted residual) values
outlierTest(co2)    ## reports the Bonferroni p-value for the most extreme observation
par(mfrow=c(1,1))
plot(rstudent(co2))

## leverage (x outliers)
lev2 <- hatvalues(co2)
lev2
## flag cases with leverage above the usual 2p/n cutoff
## (cop2$df[1] = p, cop2$df[1] + cop2$df[2] = n)
xoutliers <- which(lev2 > 2*cop2$df[1]/(cop2$df[1]+cop2$df[2]))
xoutliers
plot(lev2)

#########################################################################
############### New outliers after deleting case #18 ###################
#########################################################################

## First re-examine the full coleman data to see why case #18 is flagged
co <- lm(y ~ x1+x2+x3+x4+x5, data=coleman)
cop <- summary(co)

rstudent(co)        ## gives rstudent (studentized deleted residual) values
par(mfrow=c(1,1))
plot(rstudent(co))
outlierTest(co)     ## reports the Bonferroni p-value for the most extreme observation
## this identifies case #18 as a y outlier
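### Supplementary sketch (not part of the original handout) ###
## Base R's influence.measures() collects DFBETAS, DFFITS, covariance ratios,
## Cook's distances, and hat values in one table and flags cases exceeding the
## usual cutoffs, so it is a quick cross-check of the case-by-case diagnostics
## below. A minimal sketch, assuming the full-data fit co from above.
infl <- influence.measures(co)
summary(infl)                        ## prints only the flagged cases
which(apply(infl$is.inf, 1, any))    ## row numbers of flagged cases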
## Cook's distance
cooks.distance(co)
max(cooks.distance(co))
order(cooks.distance(co))[20]       ## index of the largest Cook's distance (n = 20)
plot(cooks.distance(co))

## flag cases whose Cook's distance exceeds the median of the F(p, n-p) distribution
highcook <- which(cooks.distance(co) > qf(0.5, cop$df[1], cop$df[2]))
cooks.distance(co)[highcook]
## case #18 is not influential

####################### delete observation #18 #####################
coleman2 <- coleman[-18, ]

####################### outliers #####################
co2 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman2)
cop2 <- summary(co2)

rstudent(co2)       ## gives rstudent (studentized deleted residual) values
par(mfrow=c(1,1))
plot(rstudent(co2))
outlierTest(co2)    ## reports the Bonferroni p-value for the most extreme observation
## now, after deleting #18, case #3 becomes a new outlier

## Cook's distance
cooks.distance(co2)
max(cooks.distance(co2))
order(cooks.distance(co2))[19]      ## index of the largest Cook's distance (n = 19)
plot(cooks.distance(co2))

## flag cases whose Cook's distance exceeds the median of the F(p, n-p) distribution
highcook <- which(cooks.distance(co2) > qf(0.5, cop2$df[1], cop2$df[2]))
cooks.distance(co2)[highcook]
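### Supplementary sketch (not part of the original handout) ###
## A simple sensitivity check: refit the model chosen on the full data
## (y ~ x3 + x4) with and without case #18 and compare the coefficients, to see
## how much the deleted case moves the selected fit. Assumes coleman and
## coleman2 as defined above.
fit.full  <- lm(y ~ x3 + x4, data=coleman)
fit.del18 <- lm(y ~ x3 + x4, data=coleman2)
cbind(full=coef(fit.full), without.18=coef(fit.del18))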