#############dental data, Fitting population-averaged models uing the gls() function in nlme#############
library(reshape2) #to change long format to wide format or change from wide format to long format
library(car)
## Loading required package: carData
library(heplots) #test equivalence of covariance matrix
# Function to compute correct and robust standard errors from a gls()
# model object
library(nlme)
library(Matrix)
library(magic)
## Loading required package: abind
library(lmerTest)
## Loading required package: lme4
##
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
##
## lmList
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
ex.data <- read.table("~/Desktop/jenn/teaching/stat579/data/dental.dat",
col.names=c("obs","child","age","distance","gender"))
nrow(ex.data)
## [1] 108
head(ex.data) #1: boys, 0: girls"
## obs child age distance gender
## 1 1 1 8 21.0 0
## 2 2 1 10 20.0 0
## 3 3 1 12 21.5 0
## 4 4 1 14 23.0 0
## 5 5 2 8 21.0 0
## 6 6 2 10 21.5 0
ex.data2<-ex.data[ex.data$gender==0,] #dataset with only girls
ex.data3<-ex.data[ex.data$gender==1,] #dataset with only boys
aa<-aggregate(distance~age+gender, data=ex.data, mean) #look at the cell means
aa
## age gender distance
## 1 8 0 21.18182
## 2 10 0 22.22727
## 3 12 0 23.09091
## 4 14 0 24.09091
## 5 8 1 22.87500
## 6 10 1 23.81250
## 7 12 1 25.71875
## 8 14 1 27.46875
#checking correlation matrices
#### From long to wide format
d2 <- dcast(ex.data, child+gender ~ age, value.var = "distance")
d2
## child gender 8 10 12 14
## 1 1 0 21.0 20.0 21.5 23.0
## 2 2 0 21.0 21.5 24.0 25.5
## 3 3 0 20.5 24.0 24.5 26.0
## 4 4 0 23.5 24.5 25.0 26.5
## 5 5 0 21.5 23.0 22.5 23.5
## 6 6 0 20.0 21.0 21.0 22.5
## 7 7 0 21.5 22.5 23.0 25.0
## 8 8 0 23.0 23.0 23.5 24.0
## 9 9 0 20.0 21.0 22.0 21.5
## 10 10 0 16.5 19.0 19.0 19.5
## 11 11 0 24.5 25.0 28.0 28.0
## 12 12 1 26.0 25.0 29.0 31.0
## 13 13 1 21.5 22.5 23.0 26.5
## 14 14 1 23.0 22.5 24.0 27.5
## 15 15 1 25.5 27.5 26.5 27.0
## 16 16 1 20.0 23.5 22.5 26.0
## 17 17 1 24.5 25.5 27.0 28.5
## 18 18 1 22.0 22.0 24.5 26.5
## 19 19 1 24.0 21.5 24.5 25.5
## 20 20 1 23.0 20.5 31.0 26.0
## 21 21 1 27.5 28.0 31.0 31.5
## 22 22 1 23.0 23.0 23.5 25.0
## 23 23 1 21.5 23.5 24.0 28.0
## 24 24 1 17.0 24.5 26.0 29.5
## 25 25 1 22.5 25.5 25.5 26.0
## 26 26 1 23.0 24.5 26.0 30.0
## 27 27 1 22.0 21.5 23.5 25.0
nrow(d2)
## [1] 27
d2g<-d2[1:11,3:6] #girls with 4 columns, ages 8,10, 12, 14
d2b<-d2[12:27,3:6] #boys with 4 columns, ages 8,10, 12, 14
nrow(d2g) #11 girls
## [1] 11
nrow(d2b) #16 boys
## [1] 16
girls.cov<-cov(d2g)
girls.corr<-cor(d2g)
boys.cov<-cov(d2b)
boys.corr<-cor(d2b)
round(girls.cov,3)
## 8 10 12 14
## 8 4.514 3.355 4.332 4.357
## 10 3.355 3.618 4.027 4.077
## 12 4.332 4.027 5.591 5.466
## 14 4.357 4.077 5.466 5.941
round(boys.cov,3)
## 8 10 12 14
## 8 6.017 2.292 3.629 1.612
## 10 2.292 4.562 2.194 2.810
## 12 3.629 2.194 7.032 3.241
## 14 1.612 2.810 3.241 4.349
##test if the three covariance matrices are equivalent across the groups
tres <- boxM(d2[, 3:6], d2[, "gender"])
tres
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: d2[, 3:6]
## Chi-Sq (approx.) = 17.335, df = 10, p-value = 0.06727
summary(tres)
## Summary for Box's M-test of Equality of Covariance Matrices
##
## Chi-Sq: 17.33534
## df: 10
## p-value: 0.06727
##
## log of Covariance determinants:
## 0 1 pooled
## 0.8947425 5.2957931 4.3826063
##
## Eigenvalues:
## 0 1 pooled
## 1 17.9526585 13.668410 15.2972768
## 2 0.8825295 3.914267 2.6815584
## 3 0.5451995 3.219589 2.1590642
## 4 0.2832490 1.158150 0.9038051
##
## Statistics based on eigenvalues:
## 0 1 pooled
## product 2.4467057 199.4957821 80.0463895
## sum 19.6636364 21.9604167 21.0417045
## precision 0.1525911 0.6654769 0.4980364
## max 17.9526585 13.6684105 15.2972768
# Center and scale to create scatterplot matrices using the scale and
# pairs functions
girls.mean <- apply(d2g,2,mean)
boys.mean <- apply(d2b,2,mean)
girls.sd <- sqrt(diag(girls.cov))
boys.sd <- sqrt(diag(boys.cov))
girls.centscale <- scale(d2g,center=girls.mean,scale=girls.sd)
boys.centscale <- scale(d2b,center=boys.mean,scale=boys.sd)
# Now create the scatterplots
agelabs <- c("Age 8", "Age 10", "Age 12", "Age 14")
pdf("girlsscatter.pdf",width=8)
pairs(girls.centscale,label=agelabs,oma=c(5,5,5,5),main="Girls")
graphics.off()
pdf("boysscatter.pdf",width=8)
pairs(boys.centscale,label=agelabs,oma=c(5,5,5,5),main="Boys")
graphics.off()
#interaction plots
par(mfrow=c(1,1),oma=c(0,0,3,0),pch=42,font.sub=3,adj=0.5)
interaction.plot(ex.data$age,ex.data$gender,ex.data$distance,
trace.label="gender",xlab="age",ylab="distance",col=c("black","red"))
mtext(side=3,outer=T,line=0,cex=1.3,"Age Distance Study")
par(adj=1)
title(sub="interact.ps")
# savePlot(filename="dentalinteraction",
# type=c("png"),
# device=dev.cur())
par(mfrow=c(1,1),oma=c(0,0,3,0),pch=42,font.sub=3,adj=0.5)
interaction.plot(ex.data2$age,ex.data2$child,ex.data2$distance,
trace.label="girls",xlab="age",ylab="distance",col=1:11)
mtext(side=3,outer=T,line=0,cex=1.3,"Age Distance Study Female")
par(adj=1)
title(sub="interact2.ps")
# savePlot(filename="dentalinteraction2",
# type=c("png"),
# device=dev.cur())
par(mfrow=c(1,1),oma=c(0,0,3,0),pch=42,font.sub=3,adj=0.5)
interaction.plot(ex.data3$age,ex.data3$child,ex.data3$distance,
trace.label="boys",xlab="age",ylab="distance",col=1:16)
mtext(side=3,outer=T,line=0,cex=1.3,"Age Distance Study Male")
par(adj=1)
title(sub="interact3.ps")
# savePlot(filename="dentalinteraction3",
# type=c("png"),
# device=dev.cur())
#model fitting
# Create factor variables for use in gls()
ex.data$child <- factor(ex.data$child)
ex.data$gender <- factor(ex.data$gender)
# First do OLS fit ignoring correlation
# Assume group difference paramatrization
fit.ls.gdp <- lm(distance ~ gender + age+ age:gender,data=ex.data)
summary(fit.ls.gdp)
##
## Call:
## lm(formula = distance ~ gender + age + age:gender, data = ex.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6156 -1.3219 -0.1682 1.3299 5.2469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.3727 1.7080 10.171 < 2e-16 ***
## gender1 -1.0321 2.2188 -0.465 0.64279
## age 0.4795 0.1522 3.152 0.00212 **
## gender1:age 0.3048 0.1977 1.542 0.12608
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.257 on 104 degrees of freedom
## Multiple R-squared: 0.4227, Adjusted R-squared: 0.4061
## F-statistic: 25.39 on 3 and 104 DF, p-value: 2.108e-12
anova(fit.ls.gdp)
## Analysis of Variance Table
##
## Response: distance
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 140.46 140.465 27.5756 8.054e-07 ***
## age 1 235.36 235.356 46.2042 6.884e-10 ***
## gender:age 1 12.11 12.114 2.3782 0.1261
## Residuals 104 529.76 5.094
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assume separate intercept and slope by gender
fit.ls.sgp <- lm(distance ~ -1 + gender+ age:gender,data=ex.data)
summary(fit.ls.sgp)
##
## Call:
## lm(formula = distance ~ -1 + gender + age:gender, data = ex.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6156 -1.3219 -0.1682 1.3299 5.2469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## gender0 17.3727 1.7080 10.171 < 2e-16 ***
## gender1 16.3406 1.4162 11.538 < 2e-16 ***
## gender0:age 0.4795 0.1522 3.152 0.00212 **
## gender1:age 0.7844 0.1262 6.217 1.07e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.257 on 104 degrees of freedom
## Multiple R-squared: 0.9916, Adjusted R-squared: 0.9913
## F-statistic: 3078 on 4 and 104 DF, p-value: < 2.2e-16
anova(fit.ls.sgp)
## Analysis of Variance Table
##
## Response: distance
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 2 62469 31234.3 6131.797 < 2.2e-16 ***
## gender:age 2 247 123.7 24.291 2.205e-09 ***
## Residuals 104 530 5.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# in the following,
# Assume separate intercept and slope by gender and fit different
# covariance structures using ML; REML is the default, so we have to
# add option method="ML"
#gls: generalized least squares
# u <- model object from fit, m <- total # individuals
#robust.cov is to calculate the correct ses
robust.cov <- function(u,m){
form <- formula(u)
mf <- model.frame(form,getData(u))
Xmat <- model.matrix(form,mf)
Vlist <- as.list(1:m)
for (i in 1:m){
Vlist[[i]] <- getVarCov(u,individual=i,type="marginal")
}
V <- Reduce(adiag,Vlist)
Vinv <- solve(V)
Sig.model <- solve(t(Xmat)%*%Vinv%*%Xmat)
resid <- diag(residuals(u,type="response"))
ones.list <- lapply(Vlist,FUN=function(u){matrix(1,nrow(u),ncol(u))})
Ones <- Reduce(adiag,ones.list)
meat <- t(Xmat)%*%Vinv%*%resid%*%Ones%*%resid%*%Vinv%*%Xmat
Sig.robust <- Sig.model%*%meat%*%Sig.model
se.robust <- sqrt(diag(Sig.robust))
se.model <- sqrt(diag(Sig.model))
return(list(Sig.model=Sig.model,se.model=se.model,Sig.robust=Sig.robust,se.robust=se.robust))
}
# (a) Common unstructured correlation with variances changing over time
# for both genders -- we extract components of the fit. Note that
# gls() defines BIC dfferently from SAS (it uses the total number of
# observations N while SAS MIXED uses the total number of individuals m
# The weights statement makes the variances on the diagonal differ over
# time - the default with no weight statement is that they are the same
# for all times
fit.un <- gls(distance ~ -1 + gender + age:gender,data=ex.data,
correlation=corSymm(form=~1|child),
weights=varIdent(form=~1|age),method="ML")
##varIdent: constant variance(s), generally used to allow different variances according
#to the levels of a classification factor.
beta.un <- coef(fit.un)
sebeta.un <- summary(fit.un)$tTable[,"Std.Error"]
V.un <- getVarCov(fit.un)
Gamma.un <- cov2cor(V.un)
vars.un <- diag(V.un)
# Call robust.cov to get corrected model-based SEs and compare to
# those from gls(). Compare the results to those from SAS proc
# mixed, which DOES compute the correct standard errors
u<-fit.un
m<-27
robust.un <- robust.cov(u,m)
sebeta.un.corrected <- robust.un$se.model
#compare
rbind(beta.un,sebeta.un)
## gender0 gender1 gender0:age gender1:age
## beta.un 17.425368 15.8422796 0.47636477 0.82680439
## sebeta.un 1.149879 0.9534291 0.09723354 0.08062179
sebeta.un.corrected
## gender0 gender1 gender0:age gender1:age
## 1.12838378 0.93560641 0.09541593 0.07911471
#marginal covariance and correlation matrices
V.un
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 5.1192 2.4409 3.6105 2.5222
## [2,] 2.4409 3.9279 2.7175 3.0624
## [3,] 3.6105 2.7175 5.9798 3.8235
## [4,] 2.5222 3.0624 3.8235 4.6180
## Standard Deviations: 2.2626 1.9819 2.4454 2.149
Gamma.un
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 1.00000 0.54434 0.65256 0.51875
## [2,] 0.54434 1.00000 0.56072 0.71903
## [3,] 0.65256 0.56072 1.00000 0.72760
## [4,] 0.51875 0.71903 0.72760 1.00000
## Standard Deviations: 1 1 1 1
## In the rest of these fits, we do not calculate the corrected
## standard errors. We can calculate them when we settle on a final model
# (b1) Compound symmetry
fit.cs <- gls(distance ~ -1 + gender + age:gender,data=ex.data,
correlation=corCompSymm(form = ~ 1 | child),method="ML")
beta.cs <- coef(fit.cs)
sebeta.cs <- summary(fit.cs)$tTable[,"Std.Error"]
V.cs <- getVarCov(fit.cs)
Gamma.cs <- cov2cor(V.cs)
vars.cs <- diag(V.cs)
rbind(beta.cs,sebeta.cs)
## gender0 gender1 gender0:age gender1:age
## beta.cs 17.37273 16.340625 0.47954545 0.78437500
## sebeta.cs 1.18365 0.981431 0.09406714 0.07799635
V.cs
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 4.9052 3.0306 3.0306 3.0306
## [2,] 3.0306 4.9052 3.0306 3.0306
## [3,] 3.0306 3.0306 4.9052 3.0306
## [4,] 3.0306 3.0306 3.0306 4.9052
## Standard Deviations: 2.2148 2.2148 2.2148 2.2148
Gamma.cs
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 1.00000 0.61783 0.61783 0.61783
## [2,] 0.61783 1.00000 0.61783 0.61783
## [3,] 0.61783 0.61783 1.00000 0.61783
## [4,] 0.61783 0.61783 0.61783 1.00000
## Standard Deviations: 1 1 1 1
# (b2) Compound symmetry with variance different by gender
fit.cs2 <- gls(distance ~ -1 + gender + gender:age,data=ex.data,correlation=corCompSymm(form = ~ 1 | child),
weights = varIdent(form = ~ 1 | gender),method="ML")
beta.cs2 <- coef(fit.cs2)
sebeta.cs2 <- summary(fit.cs2)$tTable[,"Std.Error"]
V.cs2.girl <- getVarCov(fit.cs2, individual=1)
V.cs2.boy <- getVarCov(fit.cs2, individual=12)
Gamma.cs2 <- cov2cor(V.cs2.girl)
rbind(beta.cs2,sebeta.cs2)
## gender0 gender1 gender0:age gender1:age
## beta.cs2 17.3727273 16.340625 0.47954545 0.78437500
## sebeta.cs2 0.8175862 1.163776 0.06125437 0.08719126
V.cs2.girl
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 2.8677 2.0728 2.0728 2.0728
## [2,] 2.0728 2.8677 2.0728 2.0728
## [3,] 2.0728 2.0728 2.8677 2.0728
## [4,] 2.0728 2.0728 2.0728 2.8677
## Standard Deviations: 1.6934 1.6934 1.6934 1.6934
V.cs2.boy
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 8.4514 6.1088 6.1088 6.1088
## [2,] 6.1088 8.4514 6.1088 6.1088
## [3,] 6.1088 6.1088 8.4514 6.1088
## [4,] 6.1088 6.1088 6.1088 8.4514
## Standard Deviations: 2.9071 2.9071 2.9071 2.9071
Gamma.cs2
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 1.00000 0.72281 0.72281 0.72281
## [2,] 0.72281 1.00000 0.72281 0.72281
## [3,] 0.72281 0.72281 1.00000 0.72281
## [4,] 0.72281 0.72281 0.72281 1.00000
## Standard Deviations: 1 1 1 1
# (c) AR(1)
fit.ar1 <- gls(distance ~ -1 + gender + age:gender,data=ex.data,
correlation=corAR1(form = ~ 1 | child),method="ML")
beta.ar1 <- coef(fit.ar1)
sebeta.ar1 <- summary(fit.ar1)$tTable[,"Std.Error"]
V.ar1 <- getVarCov(fit.ar1) # or corMatrix(dental.un$modelStruct$corStruct)[[1]]
Gamma.ar1 <- cov2cor(V.ar1)
vars.ar1 <- diag(V.ar1)
rbind(beta.ar1,sebeta.ar1)
## gender0 gender1 gender0:age gender1:age
## beta.ar1 17.321720 16.591996 0.4837322 0.7695718
## sebeta.ar1 1.634509 1.355263 0.1409898 0.1169026
V.ar1
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 4.8908 2.9693 1.8027 1.0944
## [2,] 2.9693 4.8908 2.9693 1.8027
## [3,] 1.8027 2.9693 4.8908 2.9693
## [4,] 1.0944 1.8027 2.9693 4.8908
## Standard Deviations: 2.2115 2.2115 2.2115 2.2115
Gamma.ar1
## Marginal variance covariance matrix
## [,1] [,2] [,3] [,4]
## [1,] 1.00000 0.60712 0.36859 0.22378
## [2,] 0.60712 1.00000 0.60712 0.36859
## [3,] 0.36859 0.60712 1.00000 0.60712
## [4,] 0.22378 0.36859 0.60712 1.00000
## Standard Deviations: 1 1 1 1
#########################################################################
#
# SAS and R use different conventions to calculate AIC and BIC. We
# have used ML here, in which case AIC is the same but BIC differs as
# noted above. If we'd used REML, both AIC and BIC values are calculated
# differently by SAS and R using different conventions regarding the
# number of observations and number of parameters, so are not comparable,
# but can be compared within a single implementation (but not across
# SAS and R).
#
# We use the anova() function to print out these values for the above
# models. Both AIC and BIC support the common compound symmetric correlation
# model with different variance for each gender (constant across time),
# This is as close as the models we can fit here with
# the generic gls() function can get to the model with different
# compound correlation and variance for each gender we fit in SAS,
# which was preferred among those fit in that program.
#
# It seems that the data are suggesting that the correlation pattern
# is close to compound symmetric, with heterogeneity of some sort
# between genders.
#
#########################################################################
anova(fit.un,fit.cs,fit.cs2,fit.ar1)
## Model df AIC BIC logLik Test L.Ratio p-value
## fit.un 1 14 447.4770 485.0269 -209.7385
## fit.cs 2 6 440.6391 456.7318 -214.3195 1 vs 2 9.16201 0.3288
## fit.cs2 3 7 430.6521 449.4270 -208.3261 2 vs 3 11.98695 0.0005
## fit.ar1 4 6 452.6810 468.7738 -220.3405 3 vs 4 24.02890 <.0001
#seems model fit.cs2, compound symmetry structure with differenct covariance
#matrix for different genders is the choice
#consider reduced model under this assumption but with same slope for
#both gender
reduced <- gls(distance ~ -1 + gender + age,data=ex.data,correlation=corCompSymm(form = ~ 1 | child),
weights = varIdent(form = ~ 1 | gender),method="ML")
anova(fit.cs2,reduced) #stay with fit.cs2 with differenct covariance
## Model df AIC BIC logLik Test L.Ratio p-value
## fit.cs2 1 7 430.6521 449.4270 -208.3261
## reduced 2 6 436.7324 452.8252 -212.3662 1 vs 2 8.080332 0.0045
#matrix for different genders
summary(fit.cs2)
## Generalized least squares fit by maximum likelihood
## Model: distance ~ -1 + gender + gender:age
## Data: ex.data
## AIC BIC logLik
## 430.6521 449.427 -208.3261
##
## Correlation Structure: Compound symmetry
## Formula: ~1 | child
## Parameter estimate(s):
## Rho
## 0.7228109
## Variance function:
## Structure: Different standard deviations per stratum
## Formula: ~1 | gender
## Parameter estimates:
## 0 1
## 1.00000 1.71672
##
## Coefficients:
## Value Std.Error t-value p-value
## gender0 17.372727 0.8175862 21.248802 0
## gender1 16.340625 1.1637761 14.041038 0
## gender0:age 0.479545 0.0612544 7.828755 0
## gender1:age 0.784375 0.0871913 8.996028 0
##
## Correlation:
## gendr0 gendr1 gndr0:
## gender1 0.000
## gender0:age -0.824 0.000
## gender1:age 0.000 -0.824 0.000
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.78081373 -0.63039864 -0.08111374 0.52140257 2.87744432
##
## Residual standard error: 1.693422
## Degrees of freedom: 108 total; 104 residual
anova(fit.cs2)
## Denom. DF: 104
## numDF F-value p-value
## gender 2 1913.362 <.0001
## gender:age 2 71.109 <.0001
# Get the corrected standard errors
u <- fit.cs2
robust.fit.cs2<- robust.cov(u,m)
se.fit.cs2.corrected <- robust.fit.cs2$se.model
se.fit.cs2.corrected
## gender0 gender1 gender0:age gender1:age
## 0.80230287 1.14202137 0.06010933 0.08556137