########################################################## ###################Handout 10####################################### ########################################################## bodyfat<-read.table(file="C:/jenn/teaching/stat440540/Fall2013/data/bodyfat.txt", col.names=c("x1","x2","x3","y")) attach(bodyfat) cor(cbind(x1,x2,x3)) pairs(x1~x2+x3) ## Fit the Estimated Regression Line by all predictor variables ans.all <- lm(y ~ x1+x2+x3, data=bodyfat) summary(ans.all) anova(ans.all) ##check VIF library(car) vif(ans.all) ##vif indicates strong collinearity problem ### Backward Stepwise Regression ### lower <- formula(~ 1) upper <- formula(~ x1+x2+x3) ans.sw <- step(ans.all, scope=list(lower=lower, upper=upper), direction = "back") summary(ans.sw) ##########################ridge regression##################################### #direct calculation for b^R when c =0.02 rxx<-cor(cbind(x1,x2,x3)) cor(x1,y) cor(x2,y) cor(x3,y) rxy<-cbind(cor(x1,y),cor(x2,y),cor(x3,y)) cmatrix<-matrix(0,3,3) cmatrix[1,1]<-0.02 cmatrix[2,2]<-0.02 cmatrix[3,3]<-0.02 br<-solve(rxx+cmatrix)%*%t(rxy) br ############## LASSO ################ ##Lasso install.packages("lasso2") install.packages("lars") library(lasso2) library(lars) X <- as.matrix(bodyfat[,c(1,2,3)]) y<-bodyfat$y ans.cv <- cv.lars(X, y,type="lasso",index=seq(from = 0, to = 1, length =20)) #type=lasso is the defaut, index=seq(from = 0, to = 1, length =100) is a default #can set K=10 10-fold cross validation lambdamin0 <- ans.cv$index[order(ans.cv$cv)[1]] #find \lambda that is associated with minimum CV lambdamin0 lasso11 <- l1ce(y~x1+x2+x3, data=bodyfat, bound=lambdamin0) coef(lasso11) ##Lasso for coleman data coleman<- read.table(file="C:/jenn/teaching/stat440540/data/TAB6-4.DAT",sep="",col.names=c("School","x1","x2","x3","x4","x5","y")) X <- as.matrix(coleman[,c(2,3,4,5,6)]) y<-coleman$y ans.cv2 <- cv.lars(X, y,index=seq(from = 0, to = 1, length =20)) seid<-order(ans.cv2$cv)[1] oneselower<-min(ans.cv2$cv)+ans.cv2$cv.error[seid] oneselower2<-min(ans.cv2$cv)-ans.cv2$cv.error[seid] abline(oneselower,0,lty=2) abline(oneselower2,0,lty=2) ##values within 1 SE from the minimum CV value are reasonable choises lasso21 <- l1ce(y~x1+x2+x3+x4+x5, data=coleman, bound=0.5) #try \lambda = 0.5 coef(lasso21) ##with lamda=0.5, lasso includes x3 and x4 in the model as we did in Chapter 9 lasso22 <- l1ce(y~x1+x2+x3+x4+x5, data=coleman, bound=0.55) #try \lambda =0.55 coef(lasso22) ##with lamda=0.55, lasso includes x3 and x4 in the model as we did in Chapter 9 ########################################################## ##############SENIC data################ ########################################################## ########################################################## ## Read in data senic.data <- read.table(file="C://jenn/teaching/stat440540/data/senic_data.txt", header=T) senic.data[1:10,] attach(senic.data) n<-nrow(senic.data) n #################################################### cor(senic.data) names(senic.data) X <- senic.data[,c(2,3,5:7,10:12)] y <- senic.data$risk ### LS Estimator with all X's in the model ### ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data) summary(ans.all) ### Backward Stepwise Regression ### lower <- formula(~ 1) upper <- formula(~ stay + age + culture + xray + beds + census + nurses + services) ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data) ans.sw <- step(ans.all, scope=list(lower=lower, upper=upper), direction = "backward") summary(ans.sw) ### LASSO ### install.packages("lars") library(lasso2) library(lars) ans.cv <- cv.lars(X, y, index=seq(0,1,by=.02)) best.bound <- ans.cv$index[order(ans.cv$cv)[1]] best.lasso <- l1ce(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data, bound=best.bound) coef(best.lasso) summary(best.lasso) coef(ans.all) coef(ans.sw) coef(best.lasso) ############################################################################################ ### Calculate PRESS: predicted residual sum of squares statistic for the various methods ### ############################################################################################ #PRESS = \sum_{i=1}^n (y_i -\hat y_{i, -i})^2 ### Calculate PRESS for the all variables in LS estimator ### ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data) ## get leverage values lev <- hat(ans.all$qr) del.res <- ans.all$resid/(1-lev) #get deleted residual PRESS.all <- sum(del.res^2) PRESS.all ### Calculate PRESS for the all variables in by refitting n regressions ### ## Notice we redo the variable selection each time we remove a point to get ## a true measure of the entire variability involed in the estimation process. n <- nrow(senic.data) yhat <- rep(0,n) for(i in 1:n){ data.i <- senic.data[-i,] ## estimation part goes here ## ans.i <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=data.i) ## Now predict at the held out point ## yhat[i] <- predict(ans.i, senic.data[i,]) } PRESS.all <- sum((senic.data$risk - yhat)^2) PRESS.all ## Same as using deleted residuals via leverages ### Calculate PRESS for the stepwise estimation method ### ## Notice we redo the variable selection each time we remove a point to get ## a true measure of the entire variability involed in the estimation process. n <- nrow(senic.data) yhat <- rep(0,n) for(i in 1:n){ data.i <- senic.data[-i,] ## estimation part goes here ## lower <- formula(~ 1) upper <- formula(~ stay + age + culture + xray + beds + census + nurses + services) ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=data.i) ans.i <- step(ans.all, scope=list(lower=lower, upper=upper), direction = "backward") ## Now predict at the held out point ## yhat[i] <- predict(ans.i, senic.data[i,]) } PRESS.sw <- sum((senic.data$risk - yhat)^2) PRESS.sw ### Calculate PRESS for LASSO ### ## Notice we redo the lambda selection each time we remove a point to get ## a true measure of the entire variability involed in the estimation process. library(lasso2) library(lars) n <- nrow(senic.data) yhat <- rep(0,n) for(i in 1:n){ data.i <- senic.data[-i,] ## estimation part goes here ## X.i <- as.matrix(data.i[,c(2,3,5:7,10:12)]) y.i <- data.i$risk ans.cv <- cv.lars(X.i, y.i, index=seq(0,1,by=.05)) best.bound <- ans.cv$index[order(ans.cv$cv)[1]] print(best.bound) ans.i <- l1ce(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=data.i, bound=best.bound) ## Now predict at the held out point ## yhat[i] <- predict(ans.i, senic.data[i,]) } PRESS.lasso <- sum((senic.data$risk - yhat)^2) PRESS.lasso