############Example on three-way ANOVA##########################
################################################################
# This dataset adapt.txt is from an experiment on how plants adapt to
# cold climates. The investigators decided to study this problem after
# observing that plants that have been conditioned to cold previously
# appear to suffer less damage from the cold. Two species of potato
# were studied (species 1 and 2). Each plant was exposed to one of two
# acclimatization regimes (1= plant was kept in cold room; 0= plant
#                          was kept at room temperature) for several days. Later, plants were
# subjected to one of two cold temperatures (-4 degrees C is coded as
#                                            1; -8 degrees C is coded as 2). Two responses were measured: damage
# score for photosynthesis (photo), and damage score for ion leakage
# (leak).

# Use score for photosynthesis (photo) to be the response variable. Some of the 80 plants
# originally assigned to the treatment combinations were lost during
# the experiment period. Analyze the data from the plants that made it
# through, and assess the effects of the three experimental factors
# species, regime, and temperature on the response "photo".

potato<-read.table("~/Desktop/jenn/teaching/stat445545/data/adapt.txt",header=TRUE)
n<-nrow(potato)
n
## [1] 75
potato[1:10,]
##    variety regime temp   photo  leak
## 1        1      0    1  4.6386  2.25
## 2        1      0    1  5.7914  4.34
## 3        1      0    1 29.3515  4.25
## 4        1      0    1 18.4173  6.14
## 5        1      0    1  2.2556  2.38
## 6        1      0    2  8.2358 16.30
## 7        1      0    2  8.1972  5.24
## 8        1      0    2 -2.4482  3.25
## 9        1      0    2  5.2820  0.88
## 10       1      0    2 -5.1283  3.98
potato$regime<-factor(potato$regime,label=c("R","C"))
potato$variety<-factor(potato$variety,label=c("s1","s2"))
potato$temp<-factor(potato$temp,label=c("-4","-8"))
attach(potato)
potato[1:10,]
##    variety regime temp   photo  leak
## 1       s1      R   -4  4.6386  2.25
## 2       s1      R   -4  5.7914  4.34
## 3       s1      R   -4 29.3515  4.25
## 4       s1      R   -4 18.4173  6.14
## 5       s1      R   -4  2.2556  2.38
## 6       s1      R   -8  8.2358 16.30
## 7       s1      R   -8  8.1972  5.24
## 8       s1      R   -8 -2.4482  3.25
## 9       s1      R   -8  5.2820  0.88
## 10      s1      R   -8 -5.1283  3.98
#summary statistics
tapply(photo,regime,mean) #plants previously placed in cold adapt to cold weather better
##        R        C 
## 23.25246 12.47348
tapply(photo,variety, mean) #variety means, variety 1 seems adapt to cold better
##       s1       s2 
## 12.06579 22.53129
tapply(photo,temp, mean) #plants later put in -4 or -8 C doesn't make much difference in mean damage score
##       -4       -8 
## 18.32070 16.99179
aggregate(photo~variety+regime+temp, data=potato, mean) #cell means
##   variety regime temp     photo
## 1      s1      R   -4 12.090880
## 2      s2      R   -4 33.887738
## 3      s1      C   -4 10.121942
## 4      s2      C   -4  7.915386
## 5      s1      R   -8  2.827700
## 6      s2      R   -8 24.765762
## 7      s1      C   -8 17.403577
## 8      s2      C   -8 11.906886
aggregate(photo~variety+regime+temp, data=potato, length) #cell size, unequal
##   variety regime temp photo
## 1      s1      R   -4     5
## 2      s2      R   -4    13
## 3      s1      C   -4    12
## 4      s2      C   -4     7
## 5      s1      R   -8     5
## 6      s2      R   -8    13
## 7      s1      C   -8    13
## 8      s2      C   -8     7
#boxplot
par(mfrow=c(1,3))
boxplot(photo ~ variety,
        data = potato,
        xlab = "Variety",
        ylab = "Mean damage score for photosynthesis")
boxplot(photo ~ regime,
        data = potato,
        xlab = "Regime",
        ylab = "Mean damage score for photosynthesis")

boxplot(photo ~ temp,
        data = potato,
        xlab = "Temp",
        ylab = "Mean damage score for photosynthesis")

par(mfrow=c(1,1))
boxplot(photo ~ variety:regime:temp,
        data = potato,
        xlab = "V:R:T",
        ylab = "Mean score of photo")

##different cell means seem to differ in mean damage score for photosynthesis 

##View interaction plots, also called profile plots
interaction.plot(regime,variety:temp,photo,type='b',
                 col=1:4, pch=1:4) 
#variety 1 later exposed in -8c seems working better when previously put in room temp than put in cold temp, which is
#different from other combinations
#you can also plot other interaction plots using different x axis.

##fit full model
myfit0<-lm(photo~variety*regime*temp)
anova(myfit0) #type 1
## Analysis of Variance Table
## 
## Response: photo
##                     Df Sum Sq Mean Sq F value    Pr(>F)    
## variety              1 2044.5 2044.50 25.1340 4.163e-06 ***
## regime               1 1050.4 1050.36 12.9126 0.0006178 ***
## temp                 1   23.5   23.47  0.2885 0.5929428    
## variety:regime       1 2682.0 2681.98 32.9709 2.477e-07 ***
## variety:temp         1  233.9  233.87  2.8751 0.0946011 .  
## regime:temp          1  868.0  868.02 10.6710 0.0017176 ** 
## variety:regime:temp  1   11.8   11.78  0.1448 0.7047866    
## Residuals           67 5450.0   81.34                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(car) #type 2 and 3 SS
## Loading required package: carData
Anova(myfit0,type=3)  #wrong way, we need to add restrictions in the model
## Anova Table (Type III tests)
## 
## Response: photo
##                     Sum Sq Df F value    Pr(>F)    
## (Intercept)          730.9  1  8.9859 0.0038141 ** 
## variety             1715.6  1 21.0914 1.985e-05 ***
## regime                13.7  1  0.1682 0.6830193    
## temp                 214.5  1  2.6372 0.1090860    
## variety:regime      1145.2  1 14.0785 0.0003686 ***
## variety:temp           0.0  1  0.0004 0.9832785    
## regime:temp          488.6  1  6.0064 0.0168707 *  
## variety:regime:temp   11.8  1  0.1448 0.7047866    
## Residuals           5450.0 67                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##CORRECT WAY TO GET THE TYPE III SS
myfit<-lm(photo~variety*regime*temp, contrasts = c(variety=contr.sum, 
                                                   regime=contr.sum,temp=contr.sum))
## CRITICAL!!!  Unbalanced design warning.
##   The contrast statement above must be included identifying
##   each main effect with "contr.sum" in order for the correct
##   Type III SS to be computed.
Anova(myfit,type=3)  #use Type 3 for tests
## Anova Table (Type III tests)
## 
## Response: photo
##                      Sum Sq Df  F value    Pr(>F)    
## (Intercept)         14624.3  1 179.7839 < 2.2e-16 ***
## variety              1298.5  1  15.9634 0.0001632 ***
## regime                687.8  1   8.4560 0.0049305 ** 
## temp                   50.6  1   0.6219 0.4331125    
## variety:regime       2646.4  1  32.5332 2.881e-07 ***
## variety:temp            9.9  1   0.1219 0.7280542    
## regime:temp           879.8  1  10.8155 0.0016060 ** 
## variety:regime:temp    11.8  1   0.1448 0.7047866    
## Residuals            5450.0 67                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#residual df = 67 = 75-8

###### lsmeans (least square means)

# unbalanced, means don't match with lsmeans
aggregate(photo~variety+temp, data=potato, mean) #cell means
##   variety temp    photo
## 1      s1   -4 10.70104
## 2      s2   -4 24.79742
## 3      s1   -8 13.35472
## 4      s2   -8 20.26516
library(lsmeans)
## The 'lsmeans' package is being deprecated.
## Users are encouraged to switch to 'emmeans'.
## See help('transition') for more information, including how
## to convert 'lsmeans' objects and scripts to work with 'emmeans'.

lsmeans(myfit, list(pairwise ~ variety*temp), adjust = "bonferroni") 
## NOTE: Results may be misleading due to involvement in interactions
## $`lsmeans of variety, temp`
##  variety temp   lsmean       SE df  lower.CL upper.CL
##  s1      -4   11.10641 2.400385 67  6.315223 15.89760
##  s2      -4   20.90156 2.114104 67 16.681793 25.12133
##  s1      -8   10.11564 2.373076 67  5.378959 14.85232
##  s2      -8   18.33632 2.114104 67 14.116555 22.55609
## 
## Results are averaged over the levels of: regime 
## Confidence level used: 0.95 
## 
## $`pairwise differences of contrast`
##  contrast        estimate       SE df t.ratio p.value
##  s1,-4 - s2,-4 -9.7951513 3.198638 67  -3.062  0.0190
##  s1,-4 - s1,-8  0.9907724 3.375402 67   0.294  1.0000
##  s1,-4 - s2,-8 -7.2299128 3.198638 67  -2.260  0.1623
##  s2,-4 - s1,-8 10.7859236 3.178195 67   3.394  0.0070
##  s2,-4 - s2,-8  2.5652385 2.989794 67   0.858  1.0000
##  s1,-8 - s2,-8 -8.2206852 3.178195 67  -2.587  0.0712
## 
## Results are averaged over the levels of: regime 
## P value adjustment: bonferroni method for 6 tests
##lsmeans are calculated as the average of the means
#s1+temp -4, mean =11.10641 (mean of all the obsns in this cell), lsmean = 11.10641 = (12.090880 [mean of s1+room+ temp -4]
#+ 10.121942 [mean of s1 + cold + temp-4] )/2
lsmeans(myfit, list(pairwise ~ variety*temp*regime), adjust = "bonferroni") 
## $`lsmeans of variety, temp, regime`
##  variety temp regime    lsmean       SE df  lower.CL upper.CL
##  s1      -4   R      12.090880 4.033453 67  4.040074 20.14169
##  s2      -4   R      33.887738 2.501441 67 28.894840 38.88064
##  s1      -8   R       2.827700 4.033453 67 -5.223106 10.87851
##  s2      -8   R      24.765762 2.501441 67 19.772864 29.75866
##  s1      -4   C      10.121942 2.603583 67  4.925169 15.31871
##  s2      -4   C       7.915386 3.408890 67  1.111213 14.71956
##  s1      -8   C      17.403577 2.501441 67 12.410679 22.39647
##  s2      -8   C      11.906886 3.408890 67  5.102713 18.71106
## 
## Confidence level used: 0.95 
## 
## $`pairwise differences of contrast`
##  contrast             estimate       SE df t.ratio p.value
##  s1,-4,R - s2,-4,R -21.7968585 4.746151 67  -4.593  0.0006
##  s1,-4,R - s1,-8,R   9.2631800 5.704164 67   1.624  1.0000
##  s1,-4,R - s2,-8,R -12.6748815 4.746151 67  -2.671  0.2659
##  s1,-4,R - s1,-4,C   1.9689383 4.800769 67   0.410  1.0000
##  s1,-4,R - s2,-4,C   4.1754943 5.281030 67   0.791  1.0000
##  s1,-4,R - s1,-8,C  -5.3126969 4.746151 67  -1.119  1.0000
##  s1,-4,R - s2,-8,C   0.1839943 5.281030 67   0.035  1.0000
##  s2,-4,R - s1,-8,R  31.0600385 4.746151 67   6.544  <.0001
##  s2,-4,R - s2,-8,R   9.1219769 3.537572 67   2.579  0.3394
##  s2,-4,R - s1,-4,C  23.7657968 3.610520 67   6.582  <.0001
##  s2,-4,R - s2,-4,C  25.9723527 4.228208 67   6.143  <.0001
##  s2,-4,R - s1,-8,C  16.4841615 3.537572 67   4.660  0.0004
##  s2,-4,R - s2,-8,C  21.9808527 4.228208 67   5.199  0.0001
##  s1,-8,R - s2,-8,R -21.9380615 4.746151 67  -4.622  0.0005
##  s1,-8,R - s1,-4,C  -7.2942417 4.800769 67  -1.519  1.0000
##  s1,-8,R - s2,-4,C  -5.0876857 5.281030 67  -0.963  1.0000
##  s1,-8,R - s1,-8,C -14.5758769 4.746151 67  -3.071  0.0862
##  s1,-8,R - s2,-8,C  -9.0791857 5.281030 67  -1.719  1.0000
##  s2,-8,R - s1,-4,C  14.6438199 3.610520 67   4.056  0.0037
##  s2,-8,R - s2,-4,C  16.8503758 4.228208 67   3.985  0.0047
##  s2,-8,R - s1,-8,C   7.3621846 3.537572 67   2.081  1.0000
##  s2,-8,R - s2,-8,C  12.8588758 4.228208 67   3.041  0.0941
##  s1,-4,C - s2,-4,C   2.2065560 4.289426 67   0.514  1.0000
##  s1,-4,C - s1,-8,C  -7.2816353 3.610520 67  -2.017  1.0000
##  s1,-4,C - s2,-8,C  -1.7849440 4.289426 67  -0.416  1.0000
##  s2,-4,C - s1,-8,C  -9.4881912 4.228208 67  -2.244  0.7879
##  s2,-4,C - s2,-8,C  -3.9915000 4.820899 67  -0.828  1.0000
##  s1,-8,C - s2,-8,C   5.4966912 4.228208 67   1.300  1.0000
## 
## P value adjustment: bonferroni method for 28 tests
##residual analysis
qqPlot(myfit$residuals, las = 1, main="QQ Plot")

## [1] 58 57
shapiro.test(myfit$resid)
## 
##  Shapiro-Wilk normality test
## 
## data:  myfit$resid
## W = 0.96858, p-value = 0.05911
plot(myfit$fitted,myfit$res,xlab="Fitted",ylab="Residuals") 

#no serious departure detected

#supplemental tests for normality and constant variance
shapiro.test(myfit$resid)
## 
##  Shapiro-Wilk normality test
## 
## data:  myfit$resid
## W = 0.96858, p-value = 0.05911
##Brown-Forsythe Test (modified levene test) for constancy of the error variance, reject constant variance assumption
#p-value of the test is 0.0149, less than 0.05
leveneTest(photo~variety*temp*regime, data=potato)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)  
## group  7  2.7559 0.01409 *
##       67                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##check outliers
rstudent<-rstudent(myfit)
bcritical<- qt(1-0.05/(2*n), 75-8-1)
outliers <- which(abs(rstudent) > bcritical)
outliers #58 obsn is detected as outlier
## 58 
## 58
#or use outlier test
outlierTest(myfit)
##     rstudent unadjusted p-value Bonferonni p
## 58 -4.325043         5.2729e-05    0.0039546
#check what happened to the outlier
potato[58,]
##    variety regime temp   photo  leak
## 58      s2      R   -8 -8.5655 10.17
#variety 2 previously conidtioned in room temp and later exposed to low temp as -8c has a 
#very low score for for photosynthesis 

#try transformation
#box-cox transformation
min(photo) #-8.5655
## [1] -8.5655
photo<-photo+9 #make response positive
myfit2<-lm(photo~variety*regime*temp,contrasts = c(variety=contr.sum, 
                                                   regime=contr.sum,temp=contr.sum))
library(MASS)
boxcox(myfit2, lambda = seq(-5, 5, length = 10))

#\lambda =1 is within 95% CI, also plots don't show serious departure, 
#continue without transformation

##reduced model, 
#by type III test
Anova(myfit,type=3)
## Anova Table (Type III tests)
## 
## Response: photo
##                      Sum Sq Df  F value    Pr(>F)    
## (Intercept)         14624.3  1 179.7839 < 2.2e-16 ***
## variety              1298.5  1  15.9634 0.0001632 ***
## regime                687.8  1   8.4560 0.0049305 ** 
## temp                   50.6  1   0.6219 0.4331125    
## variety:regime       2646.4  1  32.5332 2.881e-07 ***
## variety:temp            9.9  1   0.1219 0.7280542    
## regime:temp           879.8  1  10.8155 0.0016060 ** 
## variety:regime:temp    11.8  1   0.1448 0.7047866    
## Residuals            5450.0 67                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##three way interaction is not significant, remove it 
myfit3<-lm(photo~variety+regime+temp+variety:temp+variety:regime +temp:regime, 
           contrasts = c(variety=contr.sum, regime=contr.sum,temp=contr.sum))

##compare full model and reduced model, p-value =0.7048, do not reject reduced model
anova(myfit,myfit3)
## Analysis of Variance Table
## 
## Model 1: photo ~ variety * regime * temp
## Model 2: photo ~ variety + regime + temp + variety:temp + variety:regime + 
##     temp:regime
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     67 5450.0                           
## 2     68 5461.8 -1   -11.776 0.1448 0.7048
Anova(myfit3,type=3)
## Anova Table (Type III tests)
## 
## Response: photo
##                Sum Sq Df  F value    Pr(>F)    
## (Intercept)     37235  1 463.5782 < 2.2e-16 ***
## variety          1297  1  16.1476 0.0001491 ***
## regime            687  1   8.5497 0.0046908 ** 
## temp               40  1   0.4922 0.4853388    
## variety:temp       13  1   0.1560 0.6940723    
## variety:regime   2649  1  32.9771 2.381e-07 ***
## regime:temp       868  1  10.8070 0.0016024 ** 
## Residuals        5462 68                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#notice variety:temp is not significant, remove it
myfit4<-lm(photo~variety+regime+temp+variety:regime +temp:regime, 
           contrasts = c(variety=contr.sum, regime=contr.sum,temp=contr.sum))
Anova(myfit4,type=3)
## Anova Table (Type III tests)
## 
## Response: photo
##                Sum Sq Df  F value    Pr(>F)    
## (Intercept)     37247  1 469.4657 < 2.2e-16 ***
## variety          1295  1  16.3254 0.0001365 ***
## regime            685  1   8.6393 0.0044719 ** 
## temp               44  1   0.5527 0.4597261    
## variety:regime   2651  1  33.4200 1.969e-07 ***
## regime:temp      1089  1  13.7306 0.0004217 ***
## Residuals        5474 69                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#notice temp is not significant, can we remove it? No.
#because regime:temp is significant, we will keep temp in the model

##residual analysis
qqPlot(myfit4$residuals, las = 1, main="QQ Plot")

## [1] 58 57
plot(myfit4$fitted,myfit4$res,xlab="Fitted",ylab="Residuals") 

#no serious departure detected

##check outliers
rstudent<-rstudent(myfit4)
bcritical2<- qt(1-0.05/(2*n), 75-6-1)
outliers <- which(abs(rstudent) > bcritical2)
outliers #58 obsn is detected as outlier
## 58 
## 58
#or use outlier test
outlierTest(myfit4)
##     rstudent unadjusted p-value Bonferonni p
## 58 -4.342192         4.8086e-05    0.0036065
#fnal model is photo~variety+regime+temp+variety:regime +temp:regime

#Tukey's multiple comparison

comp1<-lsmeans(myfit4, pairwise ~ temp:regime,adjust="tukey")
cld(comp1, alpha=.05,Letters=letters)
##  temp regime   lsmean       SE df lower.CL upper.CL .group
##  -4   C      17.79910 2.080579 69 13.64845 21.94974  a    
##  -8   R      22.81242 2.224896 69 18.37387 27.25097  a    
##  -8   C      23.89844 2.041053 69 19.82666 27.97023  a    
##  -4   R      31.97362 2.224896 69 27.53507 36.41217   b   
## 
## Results are averaged over the levels of: variety 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05
comp2<-lsmeans(myfit4, pairwise ~ variety:regime,
               adjust="tukey")
cld(comp2, 
    alpha=.05,  
    Letters=letters)
##  variety regime   lsmean       SE df lower.CL upper.CL .group
##  s1      R      16.45929 2.816704 69 10.84012 22.07846  a    
##  s2      C      18.91114 2.380550 69 14.16207 23.66020  a    
##  s1      C      22.78641 1.782355 69 19.23071 26.34211  a    
##  s2      R      38.32675 1.746846 69 34.84189 41.81161   b   
## 
## Results are averaged over the levels of: temp 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05
#Interpret comparison results, for example, the lsmean damage score of photosynthesis 
#for species 2 conditioned at room temperature 
#previously is significantly higher (worse adaptability) than other combinations of factor levels of of
#variety and regime. 
#also explain where is the interaction coming from, species 1 has better adaptability to the cold weather 
#when previously conditioned at room temperature than those conditioned at cold temperature; while species 2 
#has better adaptability to the cold weather 
#when previously conditioned at cold temperature than those conditioned at room temperature