##########################################################
############### Handout #8 for ST440/540 ###############
########### Variable Selection in Regression #############
##########################################################

####################################################
################### Example 1 ######################
##### Variable Selection on simulated data #########
####################################################

set.seed(22)
n <- 100
X <- as.data.frame(matrix(runif(10*n), nrow=n))
X[,4] <- X[,1] + rnorm(n, 0, .1)        ## x4 is highly correlated with x1
names(X) <- paste("x", 1:10, sep="")
true <- 10 + 2*X[,1] + X[,2] + 3*X[,3]  ## the "true" mean of y
y <- true + rnorm(n, 0, 1.1)
ex.data <- as.data.frame(cbind(X, y))

## Notice that the x5-x10 predictors are completely uninformative
## (i.e., they are not included in the "true" model for y)

## pairs plot of y against x1-x4
pairs(ex.data[, c(11, 1:4)])

## pairs plot of the true line against the informative x's
pairs(cbind(true, ex.data[, 1:3]))

## Plotting the "true" regression plane in scatterplots still gives the
## appearance of error because of the effect of the other x's

### Fit full model ###
ans.all <- lm(y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10, data=ex.data)
summary(ans.all)

## calculate MSE from the true line
mse.all <- sum((ans.all$fitted - true)^2)/n
mse.all

### Perform Best Subsets search ###
## Install package "leaps" (only need to do this the first time)
#install.packages("leaps")
## Load library "leaps"
library(leaps)

## can use method = "Cp", "adjr2", or "r2"
ans.bs <- leaps(X, y, method="Cp", nbest=5)

## Get the 5 "best" models according to Cp
ans.bs$which[order(ans.bs$Cp)[1:5], ]
ans.bs$Cp[order(ans.bs$Cp)[1:5]]

## Get the 5 "best" models according to adjusted R^2
ans.r2 <- leaps(X, y, method="adjr2", nbest=5)
ans.r2$which[order(-ans.r2$adjr2)[1:5], ]
ans.r2$adjr2[order(-ans.r2$adjr2)[1:5]]

## Fit the "best" model according to Cp
ans.sub <- lm(y ~ x1+x2+x3, data=ex.data)
summary(ans.sub)

## calculate MSE from the true line
mse.sub <- sum((ans.sub$fitted - true)^2)/n
mse.sub

### Perform Stepwise search ###
## Must also specify the "full" model
upper <- formula(~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10)
lower <- formula(~ 1)

## Must first create a "base" model for the step function
## This one starts from the "null" (intercept-only) model, i.e., forward/stepwise selection
ans0 <- lm(y ~ 1, data=ex.data)

## Now conduct the stepwise search
## Can use direction = "both", "backward", or "forward"
ans.sw <- step(ans0, scope=list(lower=lower, upper=upper), direction="both")
summary(ans.sw)
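### Supplementary sketch (not part of the original handout) ###
## step() penalizes with AIC (k = 2) by default; setting k = log(n) gives the
## BIC penalty, which tends to favor smaller models. A minimal sketch, assuming
## ans0, lower, upper, ex.data, true, and n exist as defined above.
ans.bic <- step(ans0, scope=list(lower=lower, upper=upper),
                direction="both", k=log(n), trace=0)
summary(ans.bic)

## MSE from the true line for the BIC-selected model
mse.bic <- sum((ans.bic$fitted - true)^2)/n
mse.bic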
## Now start from the "full" model, i.e., backward elimination
ans1 <- lm(y ~ x1+x2+x3+x4+x5+x6+x7+x8+x9+x10, data=ex.data)
ans.sw1 <- step(ans1, scope=list(lower=lower, upper=upper), direction="backward")
summary(ans.sw1)

## calculate MSE from the true line for each fitted model
mse.all <- sum((ans.all$fitted - true)^2)/n
mse.sub <- sum((ans.sub$fitted - true)^2)/n
mse.sw  <- sum((ans.sw$fitted  - true)^2)/n
mse.sw1 <- sum((ans.sw1$fitted - true)^2)/n
mse.all
mse.sub
mse.sw
mse.sw1

####################################################
################### Example 2 ######################
##### Variable Selection on the coleman data #######
####################################################

## Install packages "car" and "leaps" (only need to do this the first time)
#install.packages("car")
#install.packages("leaps")
library(car)
library(leaps)

coleman <- read.table(file="W:/teaching/stat440540/Fall2013/data/TAB6-4.DAT", sep="",
                      col.names=c("School","x1","x2","x3","x4","x5","y"))
attach(coleman)
coleman
summary(coleman)

##################### best subset selection #####################
X <- data.frame(cbind(coleman$x1, coleman$x2, coleman$x3, coleman$x4, coleman$x5))
X
y <- coleman$y
y

ans.cp <- leaps(X, y, method="Cp", nbest=5)

## Get the 5 "best" models according to Cp
ans.cp$which[order(ans.cp$Cp)[1:5], ]
ans.cp$Cp[order(ans.cp$Cp)[1:5]]

##################### stepwise selection #####################
upper <- formula(~ x1+x2+x3+x4+x5)
lower <- formula(~ 1)

## start from the "full" model, i.e., backward elimination
ans0 <- lm(y ~ 1)
ans1 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman)
ans.back <- step(ans1, direction="backward")
summary(ans.back)

## compare candidate submodels with partial F tests
fit1 <- lm(y ~ x1+x3+x4)
fit2 <- lm(y ~ x3+x4)
fit3 <- lm(y ~ x1+x3)
anova(fit2, fit1)
anova(fit3, fit1)
## choose the model y ~ x3+x4

####################### delete observation #18 #####################
coleman2 <- coleman[-18, ]

##################### best subset selection #####################
X2 <- data.frame(cbind(coleman2$x1, coleman2$x2, coleman2$x3, coleman2$x4, coleman2$x5))
X2
y2 <- coleman2$y
y2

ans.cp2 <- leaps(X2, y2, method="Cp", nbest=5)

## Get the 5 "best" models according to Cp
ans.cp2$which[order(ans.cp2$Cp)[1:5], ]
ans.cp2$Cp[order(ans.cp2$Cp)[1:5]]

##################### stepwise selection #####################
upper <- formula(~ x1+x2+x3+x4+x5)
lower <- formula(~ 1)

## start from the "full" model, i.e., backward elimination
ans12 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman2)
ans.back2 <- step(ans12, direction="backward")
summary(ans.back2)

####################### outliers #####################
co2 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman2)
cop2 <- summary(co2)

rstudent(co2)       ## gives rstudent (studentized deleted residual) values
outlierTest(co2)    ## reports the Bonferroni p-value for the most extreme observation
par(mfrow=c(1,1))
plot(rstudent(co2))

## leverage (x outliers)
lev2 <- hatvalues(co2)
lev2
## flag cases with leverage above the usual 2p/n cutoff
## (cop2$df[1] = p, cop2$df[1] + cop2$df[2] = n)
xoutliers <- which(lev2 > 2*cop2$df[1]/(cop2$df[1]+cop2$df[2]))
xoutliers
plot(lev2)

#########################################################################
############### New outliers after deleting case #18 ###################
#########################################################################

## First re-examine the full coleman data to see why case #18 is flagged
co <- lm(y ~ x1+x2+x3+x4+x5, data=coleman)
cop <- summary(co)

rstudent(co)        ## gives rstudent (studentized deleted residual) values
par(mfrow=c(1,1))
plot(rstudent(co))
outlierTest(co)     ## reports the Bonferroni p-value for the most extreme observation
## this identifies case #18 as a y outlier
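### Supplementary sketch (not part of the original handout) ###
## Base R's influence.measures() collects DFBETAS, DFFITS, covariance ratios,
## Cook's distances, and hat values in one table and flags cases exceeding the
## usual cutoffs, so it is a quick cross-check of the case-by-case diagnostics
## below. A minimal sketch, assuming the full-data fit co from above.
infl <- influence.measures(co)
summary(infl)                        ## prints only the flagged cases
which(apply(infl$is.inf, 1, any))    ## row numbers of flagged cases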
## Cook's distance
cooks.distance(co)
max(cooks.distance(co))
order(cooks.distance(co))[20]       ## index of the largest Cook's distance (n = 20)
plot(cooks.distance(co))

## flag cases whose Cook's distance exceeds the median of the F(p, n-p) distribution
highcook <- which(cooks.distance(co) > qf(0.5, cop$df[1], cop$df[2]))
cooks.distance(co)[highcook]
## case #18 is not influential

####################### delete observation #18 #####################
coleman2 <- coleman[-18, ]

####################### outliers #####################
co2 <- lm(y ~ x1+x2+x3+x4+x5, data=coleman2)
cop2 <- summary(co2)

rstudent(co2)       ## gives rstudent (studentized deleted residual) values
par(mfrow=c(1,1))
plot(rstudent(co2))
outlierTest(co2)    ## reports the Bonferroni p-value for the most extreme observation
## now, after deleting #18, case #3 becomes a new outlier

## Cook's distance
cooks.distance(co2)
max(cooks.distance(co2))
order(cooks.distance(co2))[19]      ## index of the largest Cook's distance (n = 19)
plot(cooks.distance(co2))

## flag cases whose Cook's distance exceeds the median of the F(p, n-p) distribution
highcook <- which(cooks.distance(co2) > qf(0.5, cop2$df[1], cop2$df[2]))
cooks.distance(co2)[highcook]
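### Supplementary sketch (not part of the original handout) ###
## A simple sensitivity check: refit the model chosen on the full data
## (y ~ x3 + x4) with and without case #18 and compare the coefficients, to see
## how much the deleted case moves the selected fit. Assumes coleman and
## coleman2 as defined above.
fit.full  <- lm(y ~ x3 + x4, data=coleman)
fit.del18 <- lm(y ~ x3 + x4, data=coleman2)
cbind(full=coef(fit.full), without.18=coef(fit.del18))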