##########################################################
###################Handout 10#######################################
##########################################################

bodyfat<-read.table(file="C:/jenn/teaching/stat440540/Fall2013/data/bodyfat.txt",
col.names=c("x1","x2","x3","y"))
attach(bodyfat)

cor(cbind(x1,x2,x3))
pairs(x1~x2+x3)


## Fit the Estimated Regression Line by all predictor variables
ans.all <- lm(y ~ x1+x2+x3, data=bodyfat)
summary(ans.all)
anova(ans.all)

##check VIF
library(car)
vif(ans.all) 
##vif indicates strong collinearity problem

### Backward Stepwise Regression ###

lower <- formula(~ 1)
upper <- formula(~ x1+x2+x3)
ans.sw <- step(ans.all, scope=list(lower=lower, upper=upper), direction = "back")
summary(ans.sw)


##########################ridge regression#####################################
#direct calculation for b^R when c =0.02
rxx<-cor(cbind(x1,x2,x3))
cor(x1,y)
cor(x2,y)
cor(x3,y)
rxy<-cbind(cor(x1,y),cor(x2,y),cor(x3,y))
cmatrix<-matrix(0,3,3)
cmatrix[1,1]<-0.02
cmatrix[2,2]<-0.02
cmatrix[3,3]<-0.02
br<-solve(rxx+cmatrix)%*%t(rxy)
br


############## LASSO ################

##Lasso
install.packages("lasso2")
install.packages("lars")
library(lasso2)
library(lars)

X <- as.matrix(bodyfat[,c(1,2,3)])
y<-bodyfat$y
ans.cv <- cv.lars(X, y,type="lasso",index=seq(from = 0, to = 1, length =20)) 
#type=lasso is the defaut, index=seq(from = 0, to = 1, length =100) is a default
#can set K=10 10-fold cross validation

lambdamin0 <- ans.cv$index[order(ans.cv$cv)[1]] #find \lambda that is associated with minimum CV 
lambdamin0 
lasso11 <- l1ce(y~x1+x2+x3, data=bodyfat, bound=lambdamin0)
coef(lasso11)


##Lasso for coleman data
coleman<- read.table(file="C:/jenn/teaching/stat440540/data/TAB6-4.DAT",sep="",col.names=c("School","x1","x2","x3","x4","x5","y"))
X <- as.matrix(coleman[,c(2,3,4,5,6)])
y<-coleman$y
ans.cv2 <- cv.lars(X, y,index=seq(from = 0, to = 1, length =20)) 
seid<-order(ans.cv2$cv)[1]
oneselower<-min(ans.cv2$cv)+ans.cv2$cv.error[seid]
oneselower2<-min(ans.cv2$cv)-ans.cv2$cv.error[seid]
abline(oneselower,0,lty=2)
abline(oneselower2,0,lty=2)
##values within 1 SE from the minimum CV value are reasonable choises

lasso21 <- l1ce(y~x1+x2+x3+x4+x5, data=coleman, bound=0.5) #try  \lambda = 0.5
coef(lasso21)
##with lamda=0.5, lasso includes x3 and x4 in the model as we did in Chapter 9
lasso22 <- l1ce(y~x1+x2+x3+x4+x5, data=coleman, bound=0.55) #try \lambda =0.55
coef(lasso22)
##with lamda=0.55, lasso includes x3 and x4 in the model as we did in Chapter 9


##########################################################
##############SENIC data################
##########################################################
##########################################################

## Read in data
senic.data <- read.table(file="C://jenn/teaching/stat440540/data/senic_data.txt", header=T)
senic.data[1:10,]
attach(senic.data)
n<-nrow(senic.data)
n


####################################################

cor(senic.data)
names(senic.data)
X <- senic.data[,c(2,3,5:7,10:12)]
y <- senic.data$risk

### LS Estimator with all X's in the model ###

ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data)
summary(ans.all)


### Backward Stepwise Regression ###

lower <- formula(~ 1)
upper <- formula(~ stay + age + culture + xray + beds + census + nurses + services)
ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data)
ans.sw <- step(ans.all, scope=list(lower=lower, upper=upper), direction = "backward")
summary(ans.sw)


### LASSO ###

install.packages("lars")
library(lasso2)
library(lars)

ans.cv <- cv.lars(X, y, index=seq(0,1,by=.02))

best.bound <- ans.cv$index[order(ans.cv$cv)[1]]
best.lasso <- l1ce(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data, bound=best.bound)

coef(best.lasso)
summary(best.lasso)


coef(ans.all)

coef(ans.sw)

coef(best.lasso)


############################################################################################
### Calculate PRESS: predicted residual sum of squares statistic for the various methods ###
############################################################################################
#PRESS = \sum_{i=1}^n (y_i -\hat y_{i, -i})^2

### Calculate PRESS for the all variables in LS estimator ###

ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=senic.data)

## get leverage values
lev <- hat(ans.all$qr)
del.res <- ans.all$resid/(1-lev)  #get deleted residual
PRESS.all <- sum(del.res^2)
PRESS.all


### Calculate PRESS for the all variables in by refitting n regressions ###

## Notice we redo the variable selection each time we remove a point to get
##  a true measure of the entire variability involed in the estimation process.

n <- nrow(senic.data)
yhat <- rep(0,n)

for(i in 1:n){
  data.i <- senic.data[-i,]

 ## estimation part goes here ##
  ans.i <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=data.i)

 ## Now predict at the held out point ##
  yhat[i] <- predict(ans.i, senic.data[i,])
}

PRESS.all <- sum((senic.data$risk - yhat)^2)
PRESS.all ## Same as using deleted residuals via leverages


### Calculate PRESS for the stepwise estimation method ###

## Notice we redo the variable selection each time we remove a point to get
##  a true measure of the entire variability involed in the estimation process.

n <- nrow(senic.data)
yhat <- rep(0,n)

for(i in 1:n){
  data.i <- senic.data[-i,]

 ## estimation part goes here ##
  lower <- formula(~ 1)
  upper <- formula(~ stay + age + culture + xray + beds + census + nurses + services)
  ans.all <- lm(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=data.i)
  ans.i <- step(ans.all, scope=list(lower=lower, upper=upper), direction = "backward")

 ## Now predict at the held out point ##
  yhat[i] <- predict(ans.i, senic.data[i,])
}

PRESS.sw <- sum((senic.data$risk - yhat)^2)
PRESS.sw


### Calculate PRESS for LASSO ###


## Notice we redo the lambda selection each time we remove a point to get
##  a true measure of the entire variability involed in the estimation process.

library(lasso2)
library(lars)

n <- nrow(senic.data)
yhat <- rep(0,n)

for(i in 1:n){
  data.i <- senic.data[-i,]

 ## estimation part goes here ##
 
  X.i <- as.matrix(data.i[,c(2,3,5:7,10:12)])
  y.i <- data.i$risk
  ans.cv <- cv.lars(X.i, y.i, index=seq(0,1,by=.05))
  best.bound <- ans.cv$index[order(ans.cv$cv)[1]]
print(best.bound)
  ans.i <- l1ce(risk ~ stay + age + culture + xray + beds + census + nurses + services, data=data.i, bound=best.bound)

 ## Now predict at the held out point ##
  yhat[i] <- predict(ans.i, senic.data[i,])
}

PRESS.lasso <- sum((senic.data$risk - yhat)^2)
PRESS.lasso