########################################################## ############### Handout #9 for ST440/540 ############### ############## outliers and influential data################ ########################################################## ##Life Insurance Example: a portion of the data on average annual income of managers during the past ##two years (x1), a score measuring each manger's risk aversion (x2), and the ##amount of life insurance carried (y) for a sample of 18 managers in the 30-39 ##age group. Risk aversion was measured by a standard questionnaire administered ##to each manager: the higher the score, the greater the degree of risk aversion. ##Income and risk aversion are mildly correlated, the coefficient of correlation ##is r12 =.254. ex.data <- read.table(file="C:/jenn/teachingrelated/stat440540/data/CH10/CH10TA01.txt") names(ex.data)[1]<-"income" names(ex.data)[2]<-"risk" names(ex.data)[3]<-"insurance" x1 <- ex.data$income x2 <- ex.data$risk y <- ex.data$insurance ##fit regression model myfit<-lm(y~x1+x2, data=ex.data) summary(myfit) anova(myfit) resid<-myfit$residual pred<-myfit$fitted ##residual plots pairs(ex.data) par(mfrow=c(3,1)) plot(x1, resid,pch=23, bg='red', cex=2) abline(0,0,lty=2) plot(x2, resid,pch=23, bg='red', cex=2) abline(0,0,lty=2) plot(pred, resid, pch=23, bg='red', cex=2) abline(0,0,lty=2) ##outliers rstudent(myfit) ##gives rstudent values ##install package car install.packages("car") library(car) outlier.test(myfit) ##Reports the Bonferroni p-value for the most extreme observation. qt(1-.05/36,14) ## compare to t_14 (.9986)=3.621442 # Bonferonni p-value for most extreme obs using rstudent--Studentized deleted residual ##leverage, x outliers aa<-lm.influence(myfit) ##hat: a vector containing the diagonal of the "hat" matrix. xoutliers <- which(aa$hat > .333) xoutliers x1[xoutliers] x2[xoutliers] y[xoutliers] ex.data[6:7,] ##Influential Observations #dffits dffits(myfit) plot(dffits(myfit), pch=23, bg='orange', cex=2, ylab="DFFITS") ex.data[which(dffits(myfit) > 1),] #cooks distance cooks.distance(myfit) max(cooks.distance(myfit)) order(cooks.distance(myfit))[18] plot(cooks.distance(myfit),pch=23, bg='orange', cex=2, ylab="COOK'S D") smyfit<-summary(myfit) highcook <- which((cooks.distance(co)) > qf(0.5,smyfit$df[1],smyfit$df[2])) cooks.distance(myfit)[highcook] #dfbetas dfbetas(myfit) #all influential measures together influence.measures(myfit) ##cook.d, dffits qf(.5,3,15) # Evaluate Collinearity vif(myfit) # variance inflation factors ##partial regression plots myfit1<-lm(y~x2, data=ex.data) resid1<-myfit1$residual myfit2<-lm(x1~x2, data=ex.data) resid2<-myfit2$residual plot(resid2,resid1) myfit3<-lm(y~x1, data=ex.data) resid3<-myfit3$residual myfit4<-lm(x2~x1, data=ex.data) resid4<-myfit4$residual plot(resid4,resid3) ########################################################## ############Added variable plots####################################### ########################################################## bodyfat<-read.table(file="C:/jenn/teaching/stat440540/data/bodyfat.txt") x1 <- bodyfat$V1 x2 <- bodyfat$V2 x3 <- bodyfat$V3 y <- bodyfat$V4 cor(cbind(x1,x2,x3)) pairs(x1~x2+x3) myfit<-lm(y~x1+x2) summary(myfit) resid<-myfit$residuals myfit1<-lm(y~x2) resid1<-myfit1$residuals myfit2<-lm(x1~x2) resid2<-myfit2$residuals myfit3<-lm(y~x1) resid3<-myfit3$residuals myfit4<-lm(x2~x1) resid4<-myfit4$residuals par(mfrow=c(2,2)) plot(x1,resid, main="(a) residual plot against x1") plot(resid2,resid1,xlab="e(x1|x2)",ylab="e(y|x2)",main="(b) added variable plot for x1") plot(x2,resid, main="(c) residual plot against x2") plot(resid4,resid3,xlab="e(x2|x1)",ylab="e(y|x1)",main="(d) added variable plot for x2") myfit5<-lm(y~x2+x3) resid5<-myfit5$residuals myfit6<-lm(x1~x2+x3) resid6<-myfit6$residuals par(mfrow=c(1,1)) plot(resid6,resid5,xlab="e(x1|x2,x3)",ylab="e(y|x2,x3)",main="added variable plot for x1")