########################################################## ############### Handout #4 for ST440/540 ############### ####### Checking the Regression Model Assumptions ######## ########################################################## ############################################### ################ Example 1 #################### ######## simulated data ######## ############################################### ## set random seed so the same data is generated each time set.seed(16) ## Generate Data x <- seq(0,1,length=100) beta.0 <- 10 beta.1 <- 3 eps <- rnorm(100, mean=0, sd=1) y <- beta.0 + beta.1*x + eps ################################################### ################## Example 2 (a)###################### ################# p115, example, p119 example ##### ################################################### ##Toluca Company example. To discover the relationship ##between lot size and labor hours required to produce the lot. Data ##on lot size and work hours for 25 recent production runs were utilized install.packages("lmtest") install.packages("nortest") library(lmtest) library(nortest) tuluca<-read.table(file="W:/teaching/stat440540/data/CH1/CH01TA01.txt") tuluca size<-tuluca$V1 hours<-tuluca$V2 plot(size,hours, main="scatter plot") myfit<-lm(hours~size) b_0 <- myfit$coef[1] b_1 <- myfit$coef[2] ## Plot the data with the fitted LS line plot(size,hours, main="Fitted Line Plot") abline(b_0, b_1) resid<-myfit$residuals ##perform B-P test residsquare<-resid^2 myfit2<-lm(residsquare~size) anova(myfit) anova(myfit2) bptest(hours~size,studentize=FALSE) ##perform normality tests #Shapiro-Wilk test is based approximately also on the coefficient of correlation between the ordered #residuals and their expected values under normality shapiro.test(resid) ################################################### ################## Example 2 (b)###################### ######### p120, example, Fisher's lack of fit test ##### ################################################### bank<-read.table(file="C:/jenn/teaching/stat440540/data/CH3/CH03TA04.txt") bank deposit<-bank$V1 accounts<-bank$V2 plot(deposit,accounts, main="scatter plot") #full model myfit = aov(accounts~factor(deposit), data=bank) summary(myfit) #reduced model myfit2<-lm(accounts~deposit) summary(myfit2) anova(myfit2) anova(myfit, myfit2) ################################################### ################## Example 3 ###################### ## Transformation Plasma Example in textbook ## ################################################### ##Data on age and plasma lelev of a polyamine for a portion of the 25 healthy children in a study## ex.data<-read.table(file="W:/teaching/stat440540/data/CH3/CH03TA08.txt") age<-ex.data$V1 plasma<-ex.data$V2 head(ex.data) n<-nrow(ex.data) ## Plot the data plot(age,plasma, main="Scatterplot of the Data") ## Fit the regression model myfit <- lm(plasma~age) ## Obtain residuals and fitted values resid <- myfit$residuals yhat <- myfit$fitted ## Check for non-linear relationship plot(age, plasma) abline(myfit$coef[1], myfit$coef[2]) plot(age, resid) abline(0,0,lty=2) #studentized residual sr=rstudent(myfit) sr plot(age,Leverage,main="Age-Rstudent plot") #leverage plot Leverage=hatvalues(myfit) plot(age,Leverage,main="Age-Leverage plot") ## Check for non-constant error variance plot(yhat, resid) abline(0,0,lty=2) bptest(plasma~age,studentize=FALSE) ## Check for non-normality par(mfrow=c(1,2)) hist(resid) qqnorm(resid) qqline(resid) shapiro.test(resid) ##check for outliers boxplot(rstudent(myfit)) youtliers <- which(abs(rstudent(myfit)) > qt(1-0.05/(2*n),n-2)) youtliers plasma[youtliers] age[youtliers] #leverage plot for x outliers Case=seq(1:n) Leverage=hatvalues(myfit) plot(Case,Leverage,main="Case-Leverage plot") xoutliers<-which(Leverage>2/n*2) xoutliers2<-which(Leverage>2/n*3) xoutliers xoutliers2 ##use boxcox to select transformation library(MASS) boxcox(plasma ~ age, lambda = seq(-2, 2, length = 10)) ##find sses in TABLE 3.9 gmean <- exp(mean(log(plasma))) #geometric mean K_2 in page 135 sse <- c() lambda<-c() i <- 1 for (lam in seq(-1,1,0.1)){ if (lam != 0){ tY <- (plasma^lam - 1) / (lam*gmean^(lam-1)) } else { tY <- log(plasma)*gmean } test <- anova(lm(tY~age)) sse[i] <- test['Residuals','Sum Sq'] lambda[i] <- lam i <- i+1 } cbind(lambda,sse) ########transformation y2=log10(y) plasma2<-log10(plasma) ## Plot the data plot(age,plasma2, main="Scatterplot of the Data") ## Fit the regression model myfit2 <- lm(plasma2~age) ## Obtain residuals and fitted values resid2 <- myfit2$residuals yhat2 <- myfit2$fitted ## Check for non-linear relationship par(mfrow=c(2,3)) plot(age, plasma2) abline(myfit2$coef[1], myfit2$coef[2]) plot(age, resid2) abline(0,0,lty=2) ## Check for outliers with boxplot boxplot(resid2) ## Check for non-constant error variance plot(yhat2, resid2) abline(0,0,lty=2) bptest(plasma2~age,studentize=FALSE) ## Check for non-normality hist(resid2) qqnorm(resid2) qqline(resid2) shapiro.test(resid2) #leverage plot for x outliers Case=seq(1:n) tLeverage=hatvalues(myfit2) plot(Case,tLeverage,main="Case-tLeverage plot") txoutliers<-which(tLeverage>2/n*2) txoutliers2<-which(tLeverage>2/n*3) txoutliers txoutliers2