##################Senic.data Analysis################################## senic.data <- read.table(file="C://jenn/teaching/stat440540/data/senic_data.txt", header=T) senic.data[1:10,] attach(senic.data) n<-nrow(senic.data) n # risk: average estimated probability of acquiring infection in hospital # stay: Average length of stay of all patients in the hospital (days) # age: Average age of patients (in years) # xray: Ratio of number of X-rays performed to number of #patients without signs or symptoms of pneumonia #################################################### ## Scattterplot matrix pairs(risk~stay+ age+culture+xray+beds+ region+ census+nurses +services) #notice that "region" is qualitative #fit full model myfit<-lm(risk~ stay+ age+culture+xray+beds+ factor(region)+ census+nurses +services) myfit summary(myfit) anova(myfit) #################################################### ## Check Adequacy of Model Assumptions resid <- myfit$residuals yhat <- myfit$fitted ## Check for possible non-linear relationships ## Plot residuals by fitted value plot(yhat, resid) abline(0,0,lty=2) library(lmtest) bptest(risk~ stay+ age+culture+xray+beds+ factor(region)+ census+nurses +services,studentize=FALSE) ## Check for non-normality par(mfrow=c(1,2)) hist(resid) qqnorm(resid) qqline(resid) shapiro.test(rstandard(myfit)) ## Check for possible interactions we have interest par(mfrow=c(2,2)) plot(stay*age, resid) abline(0,0,lty=2) plot(stay*xray, resid) abline(0,0,lty=2) plot(age*xray, resid) abline(0,0,lty=2) par(mfrow=c(1,1)) #suggest no interaction term should be included ## Check for outliers with boxplot boxplot(resid, main="Boxplot of Residuals") ## myfits<-summary(myfit) rstudent(myfit) ##gives rstudent values library(car) outlierTest(myfit) ##Reports the Bonferroni p-value for the most extreme observation. qt(1-.05/(2*113),99) ## compare to t_16 (.99875)=3.635 # Bonferonni p-value for most extreme obs using rstudent--Studentized deleted residual par(mfrow=c(1,1)) plot(rstudent(myfit)) ##leverage, x outliers lev<-hatvalues(myfit) lev xoutliers <- which(lev > 3*myfits$df[1]/(myfits$df[1]+myfits$df[2])) xoutliers lev[xoutliers] plot(lev) #cooks distance cooks.distance(myfit) plot(cooks.distance(myfit)) order(cooks.distance(myfit))[111:113] highcook <- which((cooks.distance(myfit)) > qf(0.5,myfits$df[1],myfits$df[2])) cooks.distance(myfit)[highcook] senic.data[8,] senic.data[53,] senic.data[112,] #boxcox transformation selection boxcox(risk~ stay+ age+culture+xray+beds+ factor(region)+ census+nurses +services,plotit=T,lambda=seq(-0.5,3,by=0.1)) #no transformaiton is suggested, if suggested, do transformation #collinearity vif(myfit) #use GVIF, large vif value of "beds" and "census" #can go with advanced method principal component regression, ridge regression