########################################################## ############### Handout #4 for ST476/576 ############### ######## Comparing Means from two MV populations ######### ########################################################## ################################################# ################## Example 1 #################### #### Test of hyp, and CI's on Generated Data #### ################################################# #### Generate a random sample of n1=100 from N(mu1, Sigma1) #### set.seed(24) mu1 <- c(0,.5,-1) Sigma1 <- rbind(c(5, 4, 5), c(4, 10, 3), c(5, 3, 15)) n1 <- 100 Z <- matrix(rnorm(300), nrow=n, ncol=3) decomp <- eigen(Sigma1) Lambda <- diag(decomp$values) P <- decomp$vectors Sigma.sqrt <- P%*%sqrt(Lambda)%*%t(P) M <- matrix(rep(mu1, times=n), nrow=3, ncol=n) X1 <- Z%*%Sigma.sqrt + t(M) #### Generate a random sample of n2=100 from N(mu2, Sigma2) #### mu2 <- c(2,1,1) Sigma2 <- 2*Sigma1 n2 <- 100 Z <- matrix(rnorm(300), nrow=n, ncol=3) decomp <- eigen(Sigma2) Lambda <- diag(decomp$values) P <- decomp$vectors Sigma.sqrt <- P%*%sqrt(Lambda)%*%t(P) M <- matrix(rep(mu2, times=n), nrow=3, ncol=n) X2 <- Z%*%Sigma.sqrt + t(M) ##################################################### ## T2 test for mu1 = mu2 (assuming Sigma1=Sigma2) ## ##################################################### p <- ncol(X1) x1_bar <- colMeans(X1) S1 <- cov(X1) n1 <- nrow(X1) x2_bar <- colMeans(X2) S2 <- cov(X2) n2 <- nrow(X2) S_pooled <- ((n1-1)*S1 + (n2-1)*S2)/(n1+n2-2) T2 <- t(x1_bar-x2_bar)%*%solve((1/n1+1/n2)*S_pooled)%*%(x1_bar-x2_bar) a_star <- (n1+n2-p-1)/((n1+n2-2)*p)*T2 pval <- 1-pf(a_star,p,n1+n2-p-1) pval ## 95% T2 intervals for diff between component means ## mu_{1,1} - mu_{2,1} a <- c(1,0,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) diff_1_CI <- c(lower,upper) diff_1_CI ## mu_{1,2} - mu_{2,2} a <- c(0,1,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) diff_2_CI <- c(lower,upper) diff_2_CI ## mu_{1,3} - mu_{2,3} a <- c(0,0,1) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) diff_3_CI <- c(lower,upper) diff_3_CI ## largest linear combo a_hat <- solve(S_pooled)%*%(x1_bar-x2_bar) lower <- t(a_hat)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a_hat)%*%((1/n1+1/n2)*S_pooled)%*%a_hat) upper <- t(a_hat)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a_hat)%*%((1/n1+1/n2)*S_pooled)%*%a_hat) largest_diff_CI <- c(lower,upper) largest_diff_CI ############################################################################# ## Test for mu1 = mu2 (using large n1, n2 with possibly Sigma1 != Sigma2) ## ############################################################################# p <- ncol(X1) x1_bar <- colMeans(X1) S1 <- cov(X1) n1 <- nrow(X1) x2_bar <- colMeans(X2) S2 <- cov(X2) n2 <- nrow(X2) T2 <- t(x1_bar-x2_bar)%*%solve(1/n1*S1+1/n2*S2)%*%(x1_bar-x2_bar) large_n_pval <- 1-pchisq(T2,p) large_n_pval ## Can also do 95% T2 intervals for diff between component means ## mu_{1,1} - mu_{2,1} a <- c(1,0,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) large_n_diff_1_CI <- c(lower,upper) large_n_diff_1_CI ## mu_{1,2} - mu_{2,2} a <- c(0,1,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) large_n_diff_2_CI <- c(lower,upper) large_n_diff_2_CI ## mu_{1,3} - mu_{2,3} a <- c(0,0,1) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) large_n_diff_3_CI <- c(lower,upper) large_n_diff_3_CI ############################################################################ ## Test for mu1 = mu2 (with small n1, n2 with possibly Sigma1 != Sigma2) ## ############################################################################ ## Just for illustration as n1=100 and n2=100 are both large relative to p=3 p <- ncol(X1) x1_bar <- colMeans(X1) S1 <- cov(X1) n1 <- nrow(X1) x2_bar <- colMeans(X2) S2 <- cov(X2) n2 <- nrow(X2) T2 <- t(x1_bar-x2_bar)%*%solve(1/n1*S1+1/n2*S2)%*%(x1_bar-x2_bar) mat1 <- 1/n1*S1%*%solve(1/n1*S1+1/n2*S2) mat2 <- 1/n2*S2%*%solve(1/n1*S1+1/n2*S2) val1 <- 1/n1*sum(diag(mat1%*%mat1)) val2 <- 1/n1*sum(diag(mat1))^2 val3 <- 1/n2*sum(diag(mat2%*%mat2)) val4 <- 1/n2*sum(diag(mat2))^2 nu <- (p+p^2)/(val1+val2+val3+val4) a_star <- (nu-p+1)/(nu*p)*T2 small_n_pval <- 1-pf(a_star,p,nu-p+1) small_n_pval ## Can also do 95% T2 intervals for diff between component means ## mu_{1,1} - mu_{2,1} a <- c(1,0,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(nu*p/(nu-p+1)*qf(.95,p,nu-p+1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(nu*p/(nu-p+1)*qf(.95,p,nu-p+1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) small_n_diff_1_CI <- c(lower,upper) small_n_diff_1_CI ## Others similarly with a <- c(0,1,0) and a <- c(0,0,1) ######################################### ############## Example 2 ################ ### Test of hyp, CR and T2 intervals #### ######################################### ## Read in data ex.data <- read.table(file="handout4_data.txt", header=T) X <- as.matrix(ex.data[,1:3]) X1 <- X[ex.data$type=='gasoline',] X2 <- X[ex.data$type=='diesel',] ## T2 test for mu1 = mu2 or not p <- ncol(X1) x1_bar <- colMeans(X1) S1 <- cov(X1) n1 <- nrow(X1) x2_bar <- colMeans(X2) S2 <- cov(X2) n2 <- nrow(X2) S_pooled <- ((n1-1)*S1 + (n2-1)*S2)/(n1+n2-2) T2 <- t(x1_bar-x2_bar)%*%solve((1/n1+1/n2)*S_pooled)%*%(x1_bar-x2_bar) a_star <- (n1+n2-p-1)/((n1+n2-2)*p)*T2 pval <- 1-pf(a_star,p,n1+n2-p-1) pval ## 95% T2 intervals for diff between component means ## mu_{1,1} - mu_{2,1} a <- c(1,0,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) diff_1_CI <- c(lower,upper) diff_1_CI ## mu_{1,2} - mu_{2,2} a <- c(0,1,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) diff_2_CI <- c(lower,upper) diff_2_CI ## mu_{1,3} - mu_{2,3} a <- c(0,0,1) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) diff_3_CI <- c(lower,upper) diff_3_CI ## largest linear combo a_hat <- solve(S_pooled)%*%(x1_bar-x2_bar) lower <- t(a_hat)%*%(x1_bar-x2_bar) - sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a_hat)%*%((1/n1+1/n2)*S_pooled)%*%a_hat) upper <- t(a_hat)%*%(x1_bar-x2_bar) + sqrt((n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1)*t(a_hat)%*%((1/n1+1/n2)*S_pooled)%*%a_hat) largest_diff_CI <- c(lower,upper) largest_diff_CI ## Neither CI for (mu_{1,1} - mu_{2,1}) or (mu_{1,2} - mu_{2,2}) contains zero. ## Plot T^2 Confidence Ellipse for (mu_{1,1}-mu_{1,2}, mu_{2,1}-mu_{2,2}) library('ellipse') crit <- (n1+n2-2)*p/(n1+n2-p-1)*qf(.95,p,n1+n2-p-1) plot(ellipse((1/n1+1/n2)*S_pooled[1:2,1:2], t=sqrt(crit), centre=x1_bar[1:2]-x2_bar[1:2]), type='l', main='95% T^2 Confidence Ellipse for (mu_diff_1, mu_diff_2)') ## Add T^2 CI lines lines(c(diff_1_CI[1], diff_1_CI[1]), c(-100,100), lty=2) lines(c(diff_1_CI[2], diff_1_CI[2]), c(-100,100), lty=2) lines(c(-100,100), c(diff_2_CI[1], diff_2_CI[1]), lty=2) lines(c(-100,100), c(diff_2_CI[2], diff_2_CI[2]), lty=2) points(0,0) ############################################################################# ## Test for mu1 = mu2 (using large n1, n2 with possibly Sigma1 != Sigma2) ## ############################################################################# p <- ncol(X1) x1_bar <- colMeans(X1) S1 <- cov(X1) n1 <- nrow(X1) x2_bar <- colMeans(X2) S2 <- cov(X2) n2 <- nrow(X2) T2 <- t(x1_bar-x2_bar)%*%solve(1/n1*S1+1/n2*S2)%*%(x1_bar-x2_bar) large_n_pval <- 1-pchisq(T2,p) large_n_pval ## Can also do 95% T2 intervals for diff between component means ## mu_{1,1} - mu_{2,1} a <- c(1,0,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) large_n_diff_1_CI <- c(lower,upper) large_n_diff_1_CI ## mu_{1,2} - mu_{2,2} a <- c(0,1,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) large_n_diff_2_CI <- c(lower,upper) large_n_diff_2_CI ## mu_{1,3} - mu_{2,3} a <- c(0,0,1) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(qchisq(.95,p)*t(a)%*%(1/n1*S1+1/n2*S2)%*%a) large_n_diff_3_CI <- c(lower,upper) large_n_diff_3_CI ############################################################################ ## Test for mu1 = mu2 (with small n1, n2 with possibly Sigma1 != Sigma2) ## ############################################################################ p <- ncol(X1) x1_bar <- colMeans(X1) S1 <- cov(X1) n1 <- nrow(X1) x2_bar <- colMeans(X2) S2 <- cov(X2) n2 <- nrow(X2) T2 <- t(x1_bar-x2_bar)%*%solve(1/n1*S1+1/n2*S2)%*%(x1_bar-x2_bar) mat1 <- 1/n1*S1%*%solve(1/n1*S1+1/n2*S2) mat2 <- 1/n2*S2%*%solve(1/n1*S1+1/n2*S2) val1 <- 1/n1*sum(diag(mat1%*%mat1)) val2 <- 1/n1*sum(diag(mat1))^2 val3 <- 1/n2*sum(diag(mat2%*%mat2)) val4 <- 1/n2*sum(diag(mat2))^2 nu <- (p+p^2)/(val1+val2+val3+val4) a_star <- (nu-p+1)/(nu*p)*T2 small_n_pval <- 1-pf(a_star,p,nu-p+1) small_n_pval ## Can also do 95% T2 intervals for diff between component means ## mu_{1,1} - mu_{2,1} a <- c(1,0,0) lower <- t(a)%*%(x1_bar-x2_bar) - sqrt(nu*p/(nu-p+1)*qf(.95,p,nu-p+1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) upper <- t(a)%*%(x1_bar-x2_bar) + sqrt(nu*p/(nu-p+1)*qf(.95,p,nu-p+1)*t(a)%*%((1/n1+1/n2)*S_pooled)%*%a) small_n_diff_1_CI <- c(lower,upper) small_n_diff_1_CI ## Others similarly with a <- c(0,1,0) and a <- c(0,0,1)