############################################################### ################## Handout #5 for ST476/576 ################# #### Comparing Means from several MV populations (MANOVA) ##### ############################################################### ################################################# ################## Example 1 #################### #### Test of hyp, and CI's on Generated Data #### ################################################# #### Generate random samples of n1=50, n2=50, n3=50 from 3 populations #### N(mu_1, Sigma), N(mu_2, Sigma), N(mu_3, Sigma) library("MASS") library(mvnormtest) set.seed(11) mu1 <- c(0,0,0) mu2 <- c(0,1,3) mu3 <- c(0,0,0) Sigma <- rbind(c(5, 4, 5), c(4, 10, 3), c(5, 3, 15)) n1 <- 50 n2 <- 50 n3 <- 50 X1 <- mvrnorm(n1, mu1, Sigma) X2 <- mvrnorm(n1, mu2, Sigma) X3 <- mvrnorm(n1, mu3, Sigma) ## Column stack the data and add an indicator variable to denote the group X <- rbind(X1, X2, X3) group <- factor(c(rep(1,n1), rep(2,n2), rep(3,n3))) #### MANOVA procedure to test H_0: mu1=m2=mu3 #### manova.obj <- manova(X ~ group) summary(manova.obj, test="Wilks") ## Give SS matrices B <- summary(manova.obj)$SS[[1]] W <- summary(manova.obj)$SS[[2]] ## Perform approximate Chi Sqr Test by direct calculation ## (just to verify what R is doing) N <- nrow(X) p <- ncol(X) g <- length(unique(group)) Lambda <- det(W)/det(B+W) a_star <- -(N-1-(p+g)/2)*log(Lambda) p_val <- 1-pchisq(a_star, p*(g-1)) p_val ## Other possiblilities for a test statistic summary(manova.obj, test="Pillai") summary(manova.obj, test="Hotelling-Lawley") summary(manova.obj, test="Roy") #### Obtain a univariate ANOVA summary for each component #### summary.aov(manova.obj) #### 95% Simulataneous CI's for all pairwise diffs between component means #### ## To make simulataneous CI's just need to make sure the treatment variable ## is called factor, the multivariate response matrix is called X, the ## SS_res matrix is called W, and alpha is set to desired value. factor <- group X <- X W <- summary(manova.obj)$SS[[2]] alpha <- .05 ## Now the following commands will calculate the CI's for you N <- nrow(X) p <- ncol(X) factor <- as.numeric(factor) levels <- unique(factor) g <- length(levels) conf <- 1-alpha/(p*g*(g-1)) x_bar_mat <- matrix(nrow=g,ncol=p) n <- numeric(g) CIs <- data.frame(matrix(nrow=p*choose(g,2), ncol=2)) names(CIs) <- c(paste(" ", 100*(1-alpha), "% Lower", sep=""), paste(" ", 100*(1-alpha), "% Upper", sep="")) for(l in 1:g){ x_bar_mat[l,] <- colMeans(X[factor==l,]) n[l] <- sum(factor==l) } next.row <- 1 for(i in 1:p){ for(l in 1:(g-1)){ for(m in (l+1):g){ lower <- (x_bar_mat[l,i]-x_bar_mat[m,i]) - qt(conf, N-g)*sqrt(W[i,i]/(N-g)*(1/n[l]+1/n[m])) upper <- (x_bar_mat[l,i]-x_bar_mat[m,i]) + qt(conf, N-g)*sqrt(W[i,i]/(N-g)*(1/n[l]+1/n[m])) descr <- paste("tau_",l, ",", i, " - ", "tau_",m, ",", i, sep="") row.names(CIs)[next.row] <- descr CIs[next.row,] <- c(format(round(lower,4),nsmall=4), format(round(upper,4),nsmall=4)) next.row <- next.row+1 } } } ## Print the CI's CIs #### Create Main Effects Plots #### ## technically only the plot for the third component should be examined ## based on the CI's par(mfrow=c(3,1)) interaction.plot(group, rep(1,length(group)), X[,1], legend=F, main="Main Effects Plot for Group", ylab="1st Component Means") interaction.plot(group, rep(1,length(group)), X[,2], legend=F, main="Main Effects Plot for Group", ylab="2nd Component Means") interaction.plot(group, rep(1,length(group)), X[,3], legend=F, main="Main Effects Plot for Group", ylab="3rd Component Means") par(mfrow=c(1,1)) ## Can also use boxplots par(mfrow=c(3,1)) plot(group, X[,1]) plot(group, X[,2]) plot(group, X[,3]) par(mfrow=c(1,1)) ########################################################### ####################### Example 2 ######################### ### Test of MANOVA hyp, and pairwise CI's on Skull Data ### ########################################################### #### Read in data #### ex.data <- read.table(file="skull_data.txt", header=T) #### MANOVA for Size = Period #### Size <- as.matrix(ex.data[,1:4]) Period <- factor(ex.data[,5]) manova.obj <- manova(Size ~ Period) summary(manova.obj,test="Wilks") #### Check MANOVA assumptions #### ## (1) Assess marginal Normality of each components residuals ## via histograms and QQ-plots resid <- manova.obj$residuals par(mfrow=c(4,2)) hist(Size[,1], main=names(ex.data)[1], xlab='X_1') qqnorm(resid[,1]) qqline(resid[,1]) shapiro.test(resid[,1]) hist(Size[,2], main=names(ex.data)[2], xlab='X_2') qqnorm(resid[,2]) qqline(resid[,2]) shapiro.test(resid[,2]) hist(Size[,3], main=names(ex.data)[3], xlab='X_3') qqnorm(resid[,3]) qqline(resid[,3]) shapiro.test(resid[,2]) hist(Size[,4], main=names(ex.data)[4], xlab='X_4') qqnorm(resid[,4]) qqline(resid[,4]) shapiro.test(resid[,2]) par(mfrow=c(1,1)) ## (2) Assess bivariate Normaility of each pair pairs(resid) pairs12<-Size[,1:2] mshapiro.test(t(pairs12)) ##similarly you can do the mshapiro test for other pairs ## Compare Covariance matrices for each group S1 <- cov(Size[Period==1,]) S2 <- cov(Size[Period==2,]) S3 <- cov(Size[Period==3,]) S1 S2 S3 #testing equality of covariance matrices BoxMTest <- function(X, cl, alpha=0.05) { ## Multivariate Statistical Testing for the Homogeneity of Covariance ## Matrices by the Box's M. #### Inputs: ## X - data matrix (Size of matrix must be n-by-p). ## alpha - significance level (default = 0.05). ## cl - factor, nlevel(cl) = g ## Output: ## MBox - the Box's M statistic. ## Chi-sqr. or F - the approximation statistic test. ## df's - degrees' of freedom of the approximation statistic test. ## P - observed significance level. ## ## If the groups sample-size is at least 20 (sufficiently large), ## Box's M test takes a Chi-square approximation; otherwise it takes ## an F approximation. ## ## Created by A. Trujillo-Ortiz and R. Hernandez-Walls ## Facultad de Ciencias Marinas ## Universidad Autonoma de Baja California ## Apdo. Postal 453 ## Ensenada, Baja California ## Mexico. if (alpha <= 0 || alpha >= 1) stop('significance level must be between 0 and 1') g = nlevels(cl) ## Number of groups. n = table(cl) ## Vector of groups-size. N = nrow(X) p = ncol(X) bandera = 2 if (any(n >= 20)) bandera = 1 ## Partition of the group covariance matrices. covList <- tapply(X, rep(cl, ncol(X)), function(x, nc) cov(matrix(x, nc=nc)), ncol(X)) deno = sum(n) - g suma = array(0, dim=dim(covList[[1]])) for (k in 1:g) suma = suma + (n[k] - 1) * covList[[k]] Sp = suma / deno ## Pooled covariance matrix. Falta=0 for (k in 1:g) Falta = Falta + ((n[k] - 1) * log(det(covList[[k]]))) MB = (sum(n) - g) * log(det(Sp)) - Falta ## Box's M statistic. suma1 = sum(1 / (n[1:g] - 1)) suma2 = sum(1 / ((n[1:g] - 1)^2)) C = (((2 * p^2) + (3 * p) - 1) / (6 * (p + 1) * (g - 1))) * (suma1 - (1 / deno)) ## Computing of correction factor. if (bandera == 1) { X2 = MB * (1 - C) ## Chi-square approximation. v = as.integer((p * (p + 1) * (g - 1)) / 2) ## Degrees of freedom. ## Significance value associated to the observed Chi-square statistic. P = pchisq(X2, v, lower=TRUE) cat('------------------------------------------------\n'); cat(' MBox Chi-sqr. df P\n') cat('------------------------------------------------\n') cat(sprintf("%10.4f%11.4f%12.i%13.4f\n", MB, X2, v, P)) cat('------------------------------------------------\n') if (P >= alpha) { cat('Covariance matrices are not significantly different.\n') } else { cat('Covariance matrices are significantly different.\n') } return(list(MBox=MB, ChiSq=X2, df=v, pValue=P)) } else { ## To obtain the F approximation we first define Co, which combined to ## the before C value are used to estimate the denominator degrees of ## freedom (v2); resulting two possible cases. Co = (((p-1) * (p+2)) / (6 * (g-1))) * (suma2 - (1 / (deno^2))) if (Co - (C^2) >= 0) { v1 = as.integer((p * (p + 1) * (g - 1)) / 2) ## Numerator DF. v21 = as.integer(trunc((v1 + 2) / (Co - (C^2)))) ## Denominator DF. F1 = MB * ((1 - C - (v1 / v21)) / v1) ## F approximation. ## Significance value associated to the observed F statistic. P1 = pf(F1, v1, v21, lower=FALSE) cat('\n------------------------------------------------------------\n') cat(' MBox F df1 df2 P\n') cat('------------------------------------------------------------\n') cat(sprintf("%10.4f%11.4f%11.i%14.i%13.4f\n", MB, F1, v1, v21, P1)) cat('------------------------------------------------------------\n') if (P1 >= alpha) { cat('Covariance matrices are not significantly different.\n') } else { cat('Covariance matrices are significantly different.\n') } return(list(MBox=MB, F=F1, df1=v1, df2=v21, pValue=P1)) } else { v1 = as.integer((p * (p + 1) * (g - 1)) / 2) ## Numerator df. v22 = as.integer(trunc((v1 + 2) / ((C^2) - Co))) ## Denominator df. b = v22 / (1 - C - (2 / v22)) F2 = (v22 * MB) / (v1 * (b - MB)) ## F approximation. ## Significance value associated to the observed F statistic. P2 = pf(F2, v1, v22, lower=FALSE) cat('\n------------------------------------------------------------\n') cat(' MBox F df1 df2 P\n') cat('------------------------------------------------------------\n') cat(sprintf('%10.4f%11.4f%11.i%14.i%13.4f\n', MB, F2, v1, v22, P2)) cat('------------------------------------------------------------\n') if (P2 >= alpha) { cat('Covariance matrices are not significantly different.\n') } else { cat('Covariance matrices are significantly different.\n') } return(list(MBox=MB, F=F2, df1=v1, df2=v22, pValue=P2)) } } } BoxMTest(Size,Period) #### 95% Simulataneous CI's for all pairwise diffs between component means #### ## To make simulataneous CI's just need to make sure the treatment variable ## is called factor, the multivariate response matrix is called X, the ## SS_res matrix is called W, and alpha is set to desired value. factor <- Period X <- Size W <- summary(manova.obj)$SS[[2]] alpha <- .05 ## Now the following commands will calculate the CI's for you N <- nrow(X) p <- ncol(X) factor <- as.numeric(factor) levels <- unique(factor) g <- length(levels) conf <- 1-alpha/(p*g*(g-1)) x_bar_mat <- matrix(nrow=g,ncol=p) n <- numeric(g) CIs <- data.frame(matrix(nrow=p*choose(g,2), ncol=2)) names(CIs) <- c(paste(" ", 100*(1-alpha), "% Lower", sep=""), paste(" ", 100*(1-alpha), "% Upper", sep="")) for(l in 1:g){ x_bar_mat[l,] <- colMeans(X[factor==l,]) n[l] <- sum(factor==l) } next.row <- 1 for(i in 1:p){ for(l in 1:(g-1)){ for(m in (l+1):g){ lower <- (x_bar_mat[l,i]-x_bar_mat[m,i]) - qt(conf, N-g)*sqrt(W[i,i]/(N-g)*(1/n[l]+1/n[m])) upper <- (x_bar_mat[l,i]-x_bar_mat[m,i]) + qt(conf, N-g)*sqrt(W[i,i]/(N-g)*(1/n[l]+1/n[m])) descr <- paste("tau_",l, ",", i, " - ", "tau_",m, ",", i, sep="") row.names(CIs)[next.row] <- descr CIs[next.row,] <- c(format(round(lower,4),nsmall=4), format(round(upper,4),nsmall=4)) next.row <- next.row+1 } } } ## Print the CI's CIs ################################################# ################## Example 3 #################### ### Test of MANOVA hyp on Milk Transport Data ### ################################################# #### Read in data #### ex.data <- read.table(file="handout4_data.txt", header=T) #### MANOVA for Costs = Fuel type #### Cost <- as.matrix(ex.data[,1:3]) Fuel <- ex.data$type manova.obj <- manova(Cost ~ Fuel) summary(manova.obj, test="Wilks") #### Obtain a univariate ANOVA summary for each component #### summary.aov(manova.obj) #### 95% Simulataneous CI's for all pairwise diffs between component means #### ## To make simulataneous CI's just need to make sure the treatment variable ## is called factor, the multivariate response matrix is called X, the ## SS_res matrix is called W, and alpha is set to desired value. factor <- Fuel X <- Cost W <- summary(manova.obj)$SS[[2]] alpha <- .05 ## Now the following commands will calculate the CI's for you N <- nrow(X) p <- ncol(X) factor <- as.numeric(factor) levels <- unique(factor) g <- length(levels) conf <- 1-alpha/(p*g*(g-1)) x_bar_mat <- matrix(nrow=g,ncol=p) n <- numeric(g) CIs <- data.frame(matrix(nrow=p*choose(g,2), ncol=2)) names(CIs) <- c(paste(" ", 100*(1-alpha), "% Lower", sep=""), paste(" ", 100*(1-alpha), "% Upper", sep="")) for(l in 1:g){ x_bar_mat[l,] <- colMeans(X[factor==l,]) n[l] <- sum(factor==l) } next.row <- 1 for(i in 1:p){ for(l in 1:(g-1)){ for(m in (l+1):g){ lower <- (x_bar_mat[l,i]-x_bar_mat[m,i]) - qt(conf, N-g)*sqrt(W[i,i]/(N-g)*(1/n[l]+1/n[m])) upper <- (x_bar_mat[l,i]-x_bar_mat[m,i]) + qt(conf, N-g)*sqrt(W[i,i]/(N-g)*(1/n[l]+1/n[m])) descr <- paste("tau_",l, ",", i, " - ", "tau_",m, ",", i, sep="") row.names(CIs)[next.row] <- descr CIs[next.row,] <- c(format(round(lower,4),nsmall=4), format(round(upper,4),nsmall=4)) next.row <- next.row+1 } } } ## Print the CI's CIs #### Create Main Effects Plots #### ## only the plot for the third component should be examined based on the CI's interaction.plot(Fuel, rep(1,length(Fuel)), X[,3], legend=F, main="Main Effects Plot for Fuel", ylab="Capital Means") ## Can also use boxplots plot(Fuel, X[,3], main="Boxplots for Capital Cost")