##########################################################
############### Handout #5 for ST440/540 ###############
################# Multiple Regression ##################
##########################################################

########################################
############# Example 1 ################
#### A Randomly Generated data set #####
########################################

## Generate the Data
## Simulate n observations from the true model
##   y = beta_0 + beta_1*x_1 + beta_2*x_2 + eps,  eps ~ N(0, 5^2)
set.seed(440)  # FIX: seed the RNG so the handout's output is reproducible
n <- 25
x_1 <- runif(n, 0, 10)
x_2 <- runif(n, 0, 10)
beta_0 <- 10
beta_1 <- 2
beta_2 <- -1
eps <- rnorm(n, 0, 5)
y <- beta_0 + beta_1*x_1 + beta_2*x_2 + eps

####################################################
## Fit the Estimated Regression Line
## y ~ x_1 + x_2 means use the model y = b0 + b1*x1 + b2*x2
myfit1 <- lm(y ~ x_1 + x_2)

####################################################
## Hypothesis tests for
## H_0: beta_0=0 vs. H_a: beta_0 != 0 and
## H_0: beta_1=0 vs. H_a: beta_1 != 0
## H_0: beta_2=0 vs. H_a: beta_2 != 0
summary(myfit1)

####################################################
## Make an ANOVA table and F-test
## These p-values correspond to the sequential testing approach
## H_0: beta_1=0 vs. H_a: beta_1 != 0 for the model y = b0 + b1*x1 + e
## H_0: beta_2=0 vs. H_a: beta_2 != 0 for the model y = b0 + b1*x1 + b2*x2 + e
## This will become an important concept for model building, but we can sort of
## ignore these for now
anova(myfit1)

####################################################
## 95% confidence intervals for beta_0 and beta_1
confint(myfit1, level=.95)

####################################################
## Calculate estimates using only matrix algebra
## This is just for illustration.
## Not useful for actual data analysis

## Design Matrix: first column of 1's for the intercept
X <- cbind(rep(1,n), x_1, x_2)

## estimated coef
## - %*% means matrix multiplication
## - t(X) means transpose X
## - solve(A) means A inverse
b <- solve(t(X) %*% X) %*% t(X) %*% y

## Compare to lm fit
## FIX: coef(myfit1) instead of myfit1$coef — `$coef` only works via
## partial matching of `coefficients`, which is fragile
coef(myfit1)

## Calculate t-tests
y_hat <- X %*% b
MSE <- sum((y-y_hat)^2)/(n-3)   # n-3 df: three estimated coefficients
s2_b <- MSE*solve(t(X)%*%X)     # estimated covariance matrix of b

## t stat for beta_0, beta_1, and beta_2
t_0 <- b[1]/sqrt(s2_b[1,1])
t_1 <- b[2]/sqrt(s2_b[2,2])
t_2 <- b[3]/sqrt(s2_b[3,3])
t_0
t_1
t_2

## p-val for beta_0, beta_1, and beta_2 (two-sided, t with n-3 df)
pval_0 <- 2*(1-pt(abs(t_0), n-3))
pval_1 <- 2*(1-pt(abs(t_1), n-3))
pval_2 <- 2*(1-pt(abs(t_2), n-3))
pval_0
pval_1
pval_2

## CI's for beta_0, beta_1, and beta_2
CI_beta_0 <- c(b[1]-qt(.975, n-3)*sqrt(s2_b[1,1]),
               b[1]+qt(.975, n-3)*sqrt(s2_b[1,1]))
CI_beta_1 <- c(b[2]-qt(.975, n-3)*sqrt(s2_b[2,2]),
               b[2]+qt(.975, n-3)*sqrt(s2_b[2,2]))
CI_beta_2 <- c(b[3]-qt(.975, n-3)*sqrt(s2_b[3,3]),
               b[3]+qt(.975, n-3)*sqrt(s2_b[3,3]))
CI_beta_0
CI_beta_1
CI_beta_2

####################################################
## Plot Data

## Scatterplot matrix
pairs(y ~ x_1 + x_2)

## Correlation matrix
cor(cbind(y, x_1, x_2))

## 3-D scatterplot
## FIX: only install lattice if it is not already available, so the script
## does not hit the network on every run
if (!requireNamespace("lattice", quietly = TRUE)) install.packages("lattice")
library(lattice)
cloud(y ~ x_1 + x_2,
      scales = list(arrows = FALSE), drape = TRUE, colorkey = TRUE)

## 3-D perspective plot of regression surface
plot.data <- expand.grid(int = 1,
                         x_1 = seq(1,10,by=.5),
                         x_2 = seq(1,10,by=.5))
plot.data$y <- as.matrix(plot.data) %*% b
wireframe(y ~ x_1 + x_2, data=plot.data,
          scales = list(arrows = FALSE), drape = TRUE, colorkey = TRUE)

####################################################
## 95% CI's for E(Y) when (x1,x2)=(1,2), (x1,x2)=(2.5,6), (x1,x2)=(4,8)
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="confidence", level=.95)

## 95% CI's for E(Y) using Bonferroni Correction (3 simultaneous intervals)
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="confidence", level=1-.05/3)
####################################################
## 95% Prediction Intervals for Y_new when (x1,x2)=(1,2), (x1,x2)=(2.5,6),
## (x1,x2)=(4,8)
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="prediction", level=.95)

## 95% PI's for Y_new using Bonferroni Correction
newdata <- data.frame(x_1=c(1, 2.5, 4), x_2=c(2, 6, 8))
predict(myfit1, newdata=newdata, interval="prediction", level=1-.05/3)

##################################################
##################################################
############# Example 2 ##########################
####### Multiple Regression on SENIC data ########
##################################################

## Read in data
## NOTE(review): hard-coded network path — update to wherever senic_data.txt
## lives on your machine
## FIX: header=TRUE instead of header=T (T is a reassignable variable)
senic.data <- read.table(file="W://teaching/stat440540/data/senic_data.txt",
                         header=TRUE)
senic.data[1:10,]
n <- nrow(senic.data)
n
# refer to appendix C.1 page 1348 for description of the data
# determine whether infection surveillance and control programs have reduced
# the rates of nosocomial (hospital-acquired) infection in United States
# hospitals

####################################################
## Fit the Estimated Regression Line
## Fit the model risk = b0 + b1*stay + b2*age + b3*xray
myfit2 <- lm(risk ~ stay + age + xray, data=senic.data)
# risk: average estimated probability of acquiring infection in hospital
# stay: Average length of stay of all patients in the hospital (days)
# age: Average age of patients (in years)
# xray: Ratio of number of X-rays performed to number of
#       patients without signs or symptoms of pneumonia
summary(myfit2)
anova(myfit2)
confint(myfit2, level=.95)

####################################################
## Check Adequacy of Model Assumptions

## Define the variables globally
risk <- senic.data$risk
stay <- senic.data$stay
age <- senic.data$age
xray <- senic.data$xray
## NOTE(review): `resid` shadows the base function stats::resid() for the
## rest of the session — consider renaming (e.g. res) in a future revision
resid <- myfit2$residuals
## FIX: full component name; $fitted only works via partial matching of
## `fitted.values`
yhat <- myfit2$fitted.values

## Check for possible non-linear relationships
## Scatterplot matrix
pairs(risk ~ stay + age + xray, data=senic.data)

## correlation matrix
## correlation matrix (columns 2:7 are the numeric predictors/response)
cor(senic.data[,2:7])

## Plot residuals by predictors and fitted value
par(mfrow=c(2,2))
plot(stay, resid)
abline(0,0,lty=2)
plot(age, resid)
abline(0,0,lty=2)
plot(xray, resid)
abline(0,0,lty=2)
plot(yhat, resid)
abline(0,0,lty=2)
par(mfrow=c(1,1))

## Check for possible interactions
par(mfrow=c(2,2))
plot(stay*age, resid)
abline(0,0,lty=2)
plot(stay*xray, resid)
abline(0,0,lty=2)
plot(age*xray, resid)
abline(0,0,lty=2)
par(mfrow=c(1,1))

## Check for outliers with boxplot
boxplot(resid, main="Boxplot of Residuals")

## outliers
## FIX: only install car if it is missing, rather than reinstalling on
## every run of the script
if (!requireNamespace("car", quietly = TRUE)) install.packages("car")
library(car)
rstudent(myfit2)     # gives rstudent (studentized deleted residual) values
outlierTest(myfit2)  # Reports the Bonferroni p-value for the most extreme
                     # observation.
## Bonferroni critical value for the most extreme obs using rstudent
## (113 obs, 4 estimated coefficients)
qt(1-.05/(2*113), 113-4-1)
par(mfrow=c(1,1))
plot(rstudent(myfit2))

## leverage, x outliers
lev <- hatvalues(myfit2)
lev
## FIX: removed stray extra ')' that made this line a syntax error;
## rule-of-thumb cutoff is 2p/n with p=4 coefficients, n=113 hospitals
xoutliers <- which(lev > 2*4/113)
xoutliers
lev[xoutliers]
plot(lev)

## Check for non-constant error variance
plot(yhat, resid)
abline(0,0,lty=2)

## Check for non-normality
hist(resid)
qqnorm(resid)
qqline(resid)

## Check for independence across order
plot(seq_along(resid), resid, xlab='order')
abline(0,0,lty=2)

####################################################
## What is 95% CI for E(Y) of a hospital with stay=9, age=50, xray=85
newdata <- data.frame(stay=9, age=50, xray=85)
predict(myfit2, newdata=newdata, interval="confidence", level=.95)

## A hospital not included in the study has stay=9, age=50, xray=85.
## Predict the infection risk for this hospital
newdata <- data.frame(stay=9, age=50, xray=85)
predict(myfit2, newdata=newdata, interval="prediction", level=.95)