##########################################################
############### Handout #5 for ST440/540 ###############
################# Multiple Regression ##################
##########################################################

########################################
############# Example 1 ################
#### A Randomly Generated data set #####
########################################

## Generate the Data
## Simulate n observations from the true model
##   y = beta_0 + beta_1*x_1 + beta_2*x_2 + eps,  eps ~ N(0, 5^2)
set.seed(440)  # FIX: seed the RNG so the handout's output is reproducible
n <- 25
x_1 <- runif(n, 0, 10)
x_2 <- runif(n, 0, 10)
beta_0 <- 10
beta_1 <- 2
beta_2 <- -1
eps <- rnorm(n, 0, 5)
y <- beta_0 + beta_1*x_1 + beta_2*x_2 + eps

####################################################
## Fit the Estimated Regression Line
## y ~ x_1 + x_2 means use the model y = b0 + b1*x1 + b2*x2
myfit1 <- lm(y ~ x_1 + x_2)

####################################################
## Hypothesis tests for
## H_0: beta_0=0 vs. H_a: beta_0 != 0 and
## H_0: beta_1=0 vs. H_a: beta_1 != 0
## H_0: beta_2=0 vs. H_a: beta_2 != 0
summary(myfit1)

####################################################
## Make an ANOVA table and F-test
## These p-values correspond to the sequential testing approach
## H_0: beta_1=0 vs. H_a: beta_1 != 0 for the model y = b0 + b1*x1 + e
## H_0: beta_2=0 vs. H_a: beta_2 != 0 for the model y = b0 + b1*x1 + b2*x2 + e
## This will become an important concept for model building, but we can sort of
## ignore these for now
anova(myfit1)

####################################################
## 95% confidence intervals for beta_0 and beta_1
confint(myfit1, level=.95)

####################################################
## Calculate estimates using only matrix algebra
## This is just for illustration.
## Not useful for actual data analysis

## Design Matrix: first column of 1's for the intercept
X <- cbind(rep(1,n), x_1, x_2)

## estimated coef
## - %*% means matrix multiplication
## - t(X) means transpose X
## - solve(A) means A inverse
b <- solve(t(X) %*% X) %*% t(X) %*% y

## Compare to lm fit
## FIX: coef(myfit1) instead of myfit1$coef — `$coef` only works via
## partial matching of `coefficients`, which is fragile
coef(myfit1)

## Calculate t-tests
y_hat <- X %*% b
MSE <- sum((y-y_hat)^2)/(n-3)   # n-3 df: three estimated coefficients
s2_b <- MSE*solve(t(X)%*%X)     # estimated covariance matrix of b

## t stat for beta_0, beta_1, and beta_2
t_0 <- b[1]/sqrt(s2_b[1,1])
t_1 <- b[2]/sqrt(s2_b[2,2])
t_2 <- b[3]/sqrt(s2_b[3,3])
t_0
t_1
t_2

## p-val for beta_0, beta_1, and beta_2 (two-sided, t with n-3 df)
pval_0 <- 2*(1-pt(abs(t_0), n-3))
pval_1 <- 2*(1-pt(abs(t_1), n-3))
pval_2 <- 2*(1-pt(abs(t_2), n-3))
pval_0
pval_1
pval_2

## CI's for beta_0, beta_1, and beta_2
CI_beta_0 <- c(b[1]-qt(.975, n-3)*sqrt(s2_b[1,1]),
               b[1]+qt(.975, n-3)*sqrt(s2_b[1,1]))
CI_beta_1 <- c(b[2]-qt(.975, n-3)*sqrt(s2_b[2,2]),
               b[2]+qt(.975, n-3)*sqrt(s2_b[2,2]))
CI_beta_2 <- c(b[3]-qt(.975, n-3)*sqrt(s2_b[3,3]),
               b[3]+qt(.975, n-3)*sqrt(s2_b[3,3]))
CI_beta_0
CI_beta_1
CI_beta_2

####################################################
## Plot Data

## Scatterplot matrix
pairs(y ~ x_1 + x_2)

## Correlation matrix
cor(cbind(y, x_1, x_2))

## 3-D scatterplot
## FIX: only install lattice if it is not already available, so the script
## does not hit the network on every run
if (!requireNamespace("lattice", quietly = TRUE)) install.packages("lattice")
library(lattice)
cloud(y ~ x_1 + x_2,
      scales = list(arrows = FALSE), drape = TRUE, colorkey = TRUE)

## 3-D perspective plot of regression surface
plot.data <- expand.grid(int = 1,
                         x_1 = seq(1,10,by=.5),
                         x_2 = seq(1,10,by=.5))
plot.data$y <- as.matrix(plot.data) %*% b
wireframe(y ~ x_1 + x_2, data=plot.data,
          scales = list(arrows = FALSE), drape = TRUE, colorkey = TRUE)

####################################################
## 95% CI's for E(Y) when (x1,x2)=(1,2), (x1,x2)=(2.5,6), (x1,x2)=(4,8)
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="confidence", level=.95)

## 95% CI's for E(Y) using Bonferroni Correction (3 simultaneous intervals)
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="confidence", level=1-.05/3)
####################################################
## 95% Prediction Intervals for Y_new when (x1,x2)=(1,2), (x1,x2)=(2.5,6),
## (x1,x2)=(4,8)
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="prediction", level=.95)

## 95% PI's for Y_new using Bonferroni Correction
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="prediction", level=1-.05/3)

##################################################
##################################################
############# Example 2 ##########################
####### Multiple Regression on SENIC data ########
##################################################

## Read in data
## NOTE(review): hard-coded network path — update to wherever senic_data.txt
## lives on your machine
## FIX: header=TRUE instead of header=T (T is a reassignable variable)
senic.data <- read.table(file="W://teaching/stat440540/data/senic_data.txt",
                         header=TRUE)
senic.data[1:10,]
n <- nrow(senic.data)
n
# refer to appendix C.1 page 1348 for description of the data
# determine whether infection surveillance and control programs have reduced
# the rates of nosocomial (hospital-acquired) infection in United States
# hospitals

####################################################
## Fit the Estimated Regression Line
## Fit the model risk = b0 + b1*stay + b2*age + b3*xray
myfit2 <- lm(risk ~ stay + age + xray, data=senic.data)
# risk: average estimated probability of acquiring infection in hospital
# stay: Average length of stay of all patients in the hospital (days)
# age: Average age of patients (in years)
# xray: Ratio of number of X-rays performed to number of
#       patients without signs or symptoms of pneumonia
summary(myfit2)
anova(myfit2)
confint(myfit2, level=.95)

####################################################
## Check Adequacy of Model Assumptions

## Define the variables globally
risk <- senic.data$risk
stay <- senic.data$stay
age <- senic.data$age
xray <- senic.data$xray
## NOTE(review): `resid` shadows the base function stats::resid() for the
## rest of the session — consider renaming (e.g. res) in a future revision
resid <- myfit2$residuals
## FIX: full component name; $fitted only works via partial matching of
## `fitted.values`
yhat <- myfit2$fitted.values

## Check for possible non-linear relationships
## Scatterplot matrix
pairs(risk ~ stay + age + xray, data=senic.data)

## correlation matrix
## correlation matrix (columns 2:7 are the numeric predictors/response)
cor(senic.data[,2:7])

## Plot residuals by predictors and fitted value
par(mfrow=c(2,2))
plot(stay, resid)
abline(0,0,lty=2)
plot(age, resid)
abline(0,0,lty=2)
plot(xray, resid)
abline(0,0,lty=2)
plot(yhat, resid)
abline(0,0,lty=2)
par(mfrow=c(1,1))

## Check for possible interactions
par(mfrow=c(2,2))
plot(stay*age, resid)
abline(0,0,lty=2)
plot(stay*xray, resid)
abline(0,0,lty=2)
plot(age*xray, resid)
abline(0,0,lty=2)
par(mfrow=c(1,1))

## Check for outliers with boxplot
boxplot(resid, main="Boxplot of Residuals")

## outliers
## FIX: only install car if it is missing, rather than reinstalling on
## every run of the script
if (!requireNamespace("car", quietly = TRUE)) install.packages("car")
library(car)
rstudent(myfit2)     # gives rstudent (studentized deleted residual) values
outlierTest(myfit2)  # Reports the Bonferroni p-value for the most extreme
                     # observation.
## Bonferroni critical value for the most extreme obs using rstudent
## (113 obs, 4 estimated coefficients)
qt(1-.05/(2*113), 113-4-1)
par(mfrow=c(1,1))
plot(rstudent(myfit2))

## leverage, x outliers
lev <- hatvalues(myfit2)
lev
## FIX: removed stray extra ')' that made this line a syntax error;
## rule-of-thumb cutoff is 2p/n with p=4 coefficients, n=113 hospitals
xoutliers <- which(lev > 2*4/113)
xoutliers
lev[xoutliers]
plot(lev)

## Check for non-constant error variance
plot(yhat, resid)
abline(0,0,lty=2)

## Check for non-normality
hist(resid)
qqnorm(resid)
qqline(resid)

## Check for independence across order
plot(seq_along(resid), resid, xlab='order')
abline(0,0,lty=2)

####################################################
## What is 95% CI for E(Y) of a hospital with stay=9, age=50, xray=85
newdata <- data.frame(stay=9, age=50, xray=85)
predict(myfit2, newdata=newdata, interval="confidence", level=.95)

## A hospital not included in the study has stay=9, age=50, xray=85.
## Predict the infection risk for this hospital
newdata <- data.frame(stay=9, age=50, xray=85)
predict(myfit2, newdata=newdata, interval="prediction", level=.95)