##########################################################
################ Logistic Regression#####################
##########################################################
#################Example Admission Data######################
#install.packages("aod")
#install.packages("ggplot2")
library(aod)
library(ggplot2)
ex.data <-read.csv("~/teaching/stat579/data/binary.csv")
nrow(ex.data)
## [1] 400
#This dataset has a binary response (outcome, dependent) variable called admit, 1 admit, 0 no admission.
#There are three predictor variables: gre, gpa and rank.
#variables gre and gpa are continuous. The variable rank takes on the values 1 through 4. Institutions
#with a rank of 1 have the highest prestige,
#while those with a rank of 4 have the lowest.
## view the first few rows of the data
head(ex.data)
## admit gre gpa rank
## 1 0 380 3.61 3
## 2 1 660 3.67 3
## 3 1 800 4.00 1
## 4 1 640 3.19 4
## 5 0 520 2.93 4
## 6 1 760 3.00 2
#Descriptive Statistics
summary(ex.data)
## admit gre gpa rank
## Min. :0.0000 Min. :220.0 Min. :2.260 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:520.0 1st Qu.:3.130 1st Qu.:2.000
## Median :0.0000 Median :580.0 Median :3.395 Median :2.000
## Mean :0.3175 Mean :587.7 Mean :3.390 Mean :2.485
## 3rd Qu.:1.0000 3rd Qu.:660.0 3rd Qu.:3.670 3rd Qu.:3.000
## Max. :1.0000 Max. :800.0 Max. :4.000 Max. :4.000
sapply(ex.data, sd) #use sapply to apply the sd function to each variable in the dataset.
## admit gre gpa rank
## 0.4660867 115.5165364 0.3805668 0.9444602
tapply(ex.data$gpa,ex.data$rank,mean)
## 1 2 3 4
## 3.453115 3.361656 3.432893 3.318358
tapply(ex.data$gre,ex.data$rank,mean)
## 1 2 3 4
## 611.8033 596.0265 574.8760 570.1493
## two-way contingency table of categorical outcome and predictors
xtabs(~admit + rank, data = ex.data)
## rank
## admit 1 2 3 4
## 0 28 97 93 55
## 1 33 54 28 12
####Logistic regression modeling
ex.data$rank <- factor(ex.data$rank) #convert rank to a factor to indicate that rank should be treated as a
#categorical variable.
###fit data with all variables
myfit <- glm(admit ~ gre + gpa + rank, data = ex.data, family = "binomial")
summary(myfit)
##
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = "binomial",
## data = ex.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6268 -0.8662 -0.6388 1.1490 2.0790
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.989979 1.139951 -3.500 0.000465 ***
## gre 0.002264 0.001094 2.070 0.038465 *
## gpa 0.804038 0.331819 2.423 0.015388 *
## rank2 -0.675443 0.316490 -2.134 0.032829 *
## rank3 -1.340204 0.345306 -3.881 0.000104 ***
## rank4 -1.551464 0.417832 -3.713 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 458.52 on 394 degrees of freedom
## AIC: 470.52
##
## Number of Fisher Scoring iterations: 4
anova(myfit)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: admit
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 399 499.98
## gre 1 13.9204 398 486.06
## gpa 1 5.7122 397 480.34
## rank 3 21.8265 394 458.52
confint(myfit) ## CIs using profiled log-likelihood
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -6.2716202334 -1.792547080
## gre 0.0001375921 0.004435874
## gpa 0.1602959439 1.464142727
## rank2 -1.3008888002 -0.056745722
## rank3 -2.0276713127 -0.670372346
## rank4 -2.4000265384 -0.753542605
confint.default(myfit) ## CIs using standard errors
## 2.5 % 97.5 %
## (Intercept) -6.2242418514 -1.755716295
## gre 0.0001202298 0.004408622
## gpa 0.1536836760 1.454391423
## rank2 -1.2957512650 -0.055134591
## rank3 -2.0169920597 -0.663415773
## rank4 -2.3703986294 -0.732528724
myfit$deviance #G^2
## [1] 458.5175
logLik(myfit) #log likelihood ratio
## 'log Lik.' -229.2587 (df=6)
# -2*(-229.2587) =458.52
##GOF test
pearsonchi<-sum(residuals(myfit, type = "pearson")^2)
1 - pchisq(deviance(myfit), df.residual(myfit))
## [1] 0.01365347
#The p-value is large indicating no evidence of lack of fit.
###prediction
myfit$fit[1:20] #fitted probabilities
## 1 2 3 4 5 6
## 0.17262654 0.29217496 0.73840825 0.17838461 0.11835391 0.36996994
## 7 8 9 10 11 12
## 0.41924616 0.21700328 0.20073518 0.51786820 0.37431440 0.40020025
## 13 14 15 16 17 18
## 0.72053858 0.35345462 0.69237989 0.18582508 0.33993917 0.07895335
## 19 20
## 0.54022772 0.57351182
plot(myfit$fit ~ gre, data=ex.data)
plot(myfit$fit ~ gpa, data=ex.data)
ggplot(ex.data,aes(x=gpa,y=admit))+geom_point()+geom_smooth(method="glm",se=FALSE,method.args = list(family="binomial"))+xlab("GPA") + ylab("Probability of Admission")
#default se=TRUE will give confidence bands
mgre<-tapply(ex.data$gre, ex.data$rank, mean) # mean of gre by rank
mgpa<-tapply(ex.data$gpa, ex.data$rank, mean) # mean of gpa by rank
newdata1 <- with(ex.data, data.frame(gre = mgre, gpa = mgpa, rank = factor(1:4)))
newdata1
## gre gpa rank
## 1 611.8033 3.453115 1
## 2 596.0265 3.361656 2
## 3 574.8760 3.432893 3
## 4 570.1493 3.318358 4
newdata1$rankP <- predict(myfit, newdata = newdata1, type = "response")
newdata1
## gre gpa rank rankP
## 1 611.8033 3.453115 1 0.5428541
## 2 596.0265 3.361656 2 0.3514055
## 3 574.8760 3.432893 3 0.2195579
## 4 570.1493 3.318358 4 0.1704703
fitted1<-predict(myfit)
fitted1[1:20]
## 1 2 3 4 5 6
## -1.56712564 -0.88484417 1.03771175 -1.52733046 -2.00811132 -0.53234576
## 7 8 9 10 11 12
## -0.32586874 -1.28321604 -1.38170577 0.07150324 -0.51375192 -0.40463082
## 13 14 15 16 17 18
## 0.94713472 -0.60388830 0.81126917 -1.47736944 -0.66356532 -2.45665358
## 19 20
## 0.16125944 0.29619391
fitted2<-predict(myfit,type="response")
fitted2[1:20]
## 1 2 3 4 5 6
## 0.17262654 0.29217496 0.73840825 0.17838461 0.11835391 0.36996994
## 7 8 9 10 11 12
## 0.41924616 0.21700328 0.20073518 0.51786820 0.37431440 0.40020025
## 13 14 15 16 17 18
## 0.72053858 0.35345462 0.69237989 0.18582508 0.33993917 0.07895335
## 19 20
## 0.54022772 0.57351182
exp(fitted1[1:20])/(1+exp(fitted1[1:20]))
## 1 2 3 4 5 6
## 0.17262654 0.29217496 0.73840825 0.17838461 0.11835391 0.36996994
## 7 8 9 10 11 12
## 0.41924616 0.21700328 0.20073518 0.51786820 0.37431440 0.40020025
## 13 14 15 16 17 18
## 0.72053858 0.35345462 0.69237989 0.18582508 0.33993917 0.07895335
## 19 20
## 0.54022772 0.57351182
## odds ratios
exp(coef(myfit))
## (Intercept) gre gpa rank2 rank3 rank4
## 0.0185001 1.0022670 2.2345448 0.5089310 0.2617923 0.2119375
## odds ratios and 95% CI
exp(cbind(OR = coef(myfit), confint(myfit)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.0185001 0.001889165 0.1665354
## gre 1.0022670 1.000137602 1.0044457
## gpa 2.2345448 1.173858216 4.3238349
## rank2 0.5089310 0.272289674 0.9448343
## rank3 0.2617923 0.131641717 0.5115181
## rank4 0.2119375 0.090715546 0.4706961
#test for an overall effect of rank using the wald.test function
wald.test(b = coef(myfit), Sigma = vcov(myfit), Terms = 4:6)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 20.9, df = 3, P(> X2) = 0.00011
#test that the coefficient for rank=2 is equal to the coefficient for rank=3
l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(myfit), Sigma = vcov(myfit), L = l)
## Wald test:
## ----------
##
## Chi-squared test:
## X2 = 5.5, df = 1, P(> X2) = 0.019
##comparing models
myfit0<-glm(admit ~ 1, data = ex.data, family = "binomial")
summary(myfit0)
##
## Call:
## glm(formula = admit ~ 1, family = "binomial", data = ex.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8741 -0.8741 -0.8741 1.5148 1.5148
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7653 0.1074 -7.125 1.04e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 499.98 on 399 degrees of freedom
## AIC: 501.98
##
## Number of Fisher Scoring iterations: 4
myfit2<-glm(admit ~ gre + gpa, data = ex.data, family = "binomial")
summary(myfit2)
##
## Call:
## glm(formula = admit ~ gre + gpa, family = "binomial", data = ex.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2730 -0.8988 -0.7206 1.3013 2.0620
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.949378 1.075093 -4.604 4.15e-06 ***
## gre 0.002691 0.001057 2.544 0.0109 *
## gpa 0.754687 0.319586 2.361 0.0182 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 480.34 on 397 degrees of freedom
## AIC: 486.34
##
## Number of Fisher Scoring iterations: 4
myfit3<-glm(admit ~ gpa+rank, data = ex.data, family = "binomial")
summary(myfit3)
##
## Call:
## glm(formula = admit ~ gpa + rank, family = "binomial", data = ex.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5055 -0.8663 -0.6590 1.1505 2.0913
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.4636 1.1003 -3.148 0.001645 **
## gpa 1.0521 0.3102 3.392 0.000694 ***
## rank2 -0.6810 0.3141 -2.168 0.030181 *
## rank3 -1.3919 0.3419 -4.071 4.68e-05 ***
## rank4 -1.5943 0.4152 -3.840 0.000123 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 499.98 on 399 degrees of freedom
## Residual deviance: 462.88 on 395 degrees of freedom
## AIC: 472.88
##
## Number of Fisher Scoring iterations: 4
anova(myfit0,myfit,test="Chisq")
## Analysis of Deviance Table
##
## Model 1: admit ~ 1
## Model 2: admit ~ gre + gpa + rank
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 399 499.98
## 2 394 458.52 5 41.459 7.578e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qchisq(0.95,5)
## [1] 11.0705
pchisq(41.459,5,lower.tail = FALSE)
## [1] 7.578283e-08
anova(myfit, myfit2)
## Analysis of Deviance Table
##
## Model 1: admit ~ gre + gpa + rank
## Model 2: admit ~ gre + gpa
## Resid. Df Resid. Dev Df Deviance
## 1 394 458.52
## 2 397 480.34 -3 -21.826
qchisq(0.95,3)
## [1] 7.814728
pchisq(21.826,3,lower.tail = FALSE)
## [1] 7.090117e-05
anova(myfit3,myfit)
## Analysis of Deviance Table
##
## Model 1: admit ~ gpa + rank
## Model 2: admit ~ gre + gpa + rank
## Resid. Df Resid. Dev Df Deviance
## 1 395 462.88
## 2 394 458.52 1 4.3578
qchisq(0.95,1)
## [1] 3.841459
pchisq(4.3578,1,lower.tail = FALSE)
## [1] 0.03683985
#model selection
upper<-formula(~gre+gpa+rank,data=ex.data)
model.aic = step(myfit0, scope=list(lower= ~., upper= upper))
## Start: AIC=501.98
## admit ~ 1
##
## Df Deviance AIC
## + rank 3 474.97 482.97
## + gre 1 486.06 490.06
## + gpa 1 486.97 490.97
## <none> 499.98 501.98
##
## Step: AIC=482.97
## admit ~ rank
##
## Df Deviance AIC
## + gpa 1 462.88 472.88
## + gre 1 464.53 474.53
## <none> 474.97 482.97
## - rank 3 499.98 501.98
##
## Step: AIC=472.88
## admit ~ rank + gpa
##
## Df Deviance AIC
## + gre 1 458.52 470.52
## <none> 462.88 472.88
## - gpa 1 474.97 482.97
## - rank 3 486.97 490.97
##
## Step: AIC=470.52
## admit ~ rank + gpa + gre
##
## Df Deviance AIC
## <none> 458.52 470.52
## - gre 1 462.88 472.88
## - gpa 1 464.53 474.53
## - rank 3 480.34 486.34
##diagnostics
residuals(myfit)[1:20] # deviance residuals
## 1 2 3 4 5 6
## -0.6156283 1.5686953 0.7787919 1.8567786 -0.5019254 1.4102011
## 7 8 9 10 11 12
## 1.3185576 -0.6994666 1.7920763 -1.2079220 -0.9684083 -1.0110978
## 13 14 15 16 17 18
## 0.8096373 -0.9339292 0.8574619 -0.6412177 -0.9115078 -0.4055727
## 19 20
## -1.2466146 1.0544920
residuals(myfit, "pearson")[1:20] # pearson residuals
## 1 2 3 4 5 6
## -0.4567757 1.5564726 0.5952011 2.1461279 -0.3663905 1.3049606
## 7 8 9 10 11 12
## 1.1769594 -0.5264452 1.9954167 -1.0363984 -0.7734641 -0.8168372
## 13 14 15 16 17 18
## 0.6227766 -0.7393794 0.6665537 -0.4777419 -0.7176433 -0.2927821
## 19 20
## -1.0839694 0.8623475
smyfit<-summary(myfit)
infv <- c(ex.data$admit,myfit$fit,hatvalues(myfit),rstandard(myfit),cooks.distance(myfit))
inf<-matrix(infv,I(smyfit$df[1]+smyfit$df[2]),5,dimnames = list(NULL,
c("y", "yhat", "lev","r","C")))
inf[1:20,]
## y yhat lev r C
## [1,] 0 0.17262654 0.016121436 -0.6206515 0.0005791292
## [2,] 1 0.29217496 0.010913543 1.5773260 0.0045043171
## [3,] 1 0.73840825 0.023401648 0.7880675 0.0014487409
## [4,] 1 0.17838461 0.016681487 1.8724620 0.0132436067
## [5,] 0 0.11835391 0.013165325 -0.5052624 0.0003024683
## [6,] 1 0.36996994 0.021309398 1.4254708 0.0063142813
## [7,] 1 0.41924616 0.022142758 1.3334028 0.0053462903
## [8,] 0 0.21700328 0.012860089 -0.7040081 0.0006095954
## [9,] 1 0.20073518 0.008484274 1.7997272 0.0057270558
## [10,] 0 0.51786820 0.014541377 -1.2168014 0.0026805935
## [11,] 0 0.37431440 0.038492481 -0.9876024 0.0041514485
## [12,] 0 0.40020025 0.024071336 -1.0234913 0.0028105018
## [13,] 1 0.72053858 0.022210930 0.8187814 0.0015017227
## [14,] 0 0.35345462 0.013577873 -0.9403349 0.0012714214
## [15,] 1 0.69237989 0.021557285 0.8668564 0.0016674095
## [16,] 0 0.18582508 0.009742092 -0.6443641 0.0003779122
## [17,] 0 0.33993917 0.033329207 -0.9270883 0.0030614966
## [18,] 0 0.07895335 0.011819616 -0.4079910 0.0001729293
## [19,] 0 0.54022772 0.017820576 -1.2578730 0.0036176197
## [20,] 1 0.57351182 0.023625212 1.0671735 0.0030715385
#leverages
plot(hatvalues(myfit))
highleverage <- which(hatvalues(myfit) > .045)
highleverage
## 373
## 373
hatvalues(myfit)[highleverage]
## 373
## 0.04921401
ex.data[373,]
## admit gre gpa rank
## 373 1 680 2.42 1
#cooks distance
plot(cooks.distance(myfit))
highcook <- which((cooks.distance(myfit)) > .05)
cooks.distance(myfit)[highcook]
## named numeric(0)