########################################################## ############### Handout #2 for ST476/576 ############### ########################################################## ######################################### ############## Example 1 ################ ####### Generating Multivariate Normal Data ####### ######################################### mu <- c(1,2,3) Sigma <- rbind(c(5, 4, 5), c(4, 10, 3), c(5, 3, 15)) #### Generate 1 random draw from N(mu, Sigma) ## need Sigma^{1/2} decomp <- eigen(Sigma) Lambda <- diag(decomp$values) P <- decomp$vectors Sigma.sqrt <- P%*%sqrt(Lambda)%*%t(P) z <- rnorm(3,0,1) x <- Sigma.sqrt%*%z + mu x #### Generate a random sample of n=100 from N(mu, Sigma) #### n <- 100 ## First Generate iid N(0,1)'s Z <- matrix(rnorm(3*n), nrow=n, ncol=3) # 100x3 matrix needs 300 total N(0,1)'s ## Now need M=(mu, mu, ... , mu) M <- matrix(rep(mu, times=n), nrow=3, ncol=n) ## Now create sample X <- Z%*%Sigma.sqrt + t(M) ## Compute x_bar and S x_bar <- colMeans(X) x_bar S <- cov(X) S #### easier way to Generate a random sample of n=100 from N(mu, Sigma), use MASS library library(MASS) X <- mvrnorm(n=100, mu, Sigma) ## Now Compute x_bar and S x_bar <- colMeans(X) x_bar S <- cov(X) S #### Generate a random sample of n=1000 from N(mu, Sigma) #### library(MASS) #from book Modern Applied Statistics with S (MASS) X <- mvrnorm(n=1000, mu, Sigma) ## Now Compute x_bar and S x_bar <- colMeans(X) x_bar S <- cov(X) S #### Generate a random sample of n=100000 from N(mu, Sigma) #### library(MASS) X <- mvrnorm(n=100000, mu, Sigma) ## Now Compute x_bar and S x_bar <- colMeans(X) x_bar S <- cov(X) S ######################################### ############## Example 2 ################ ###### Assessing Normality of Data ###### ######################################### ## We can use X from example 1 to see what actually normally distrbuted ## data is supposed to look like #### Generate a random sample of n=100 from N(mu, Sigma) #### ##used to set the random number generator seed (useful to reproduce results) set.seed(11) library(MASS) X <- mvrnorm(n=100, mu, Sigma) ## Now Compute x_bar and S x_bar <- colMeans(X) S <- cov(X) #### (1) Assess marginal Normality of each variable #### ## Histogram of X_1 hist(X[,1]) ## creates histogram for first column of X hist(X[,1], main='Histogram of X_1', xlab='X_1') ## Make a plot with a different x label and title. ## QQ plot of X_1 qqnorm(X[,1]) ## creates QQ-plot for first column of X qqline(X[,1]) ## adds a line to the QQ-plot ## Histogram and QQ for X_2 hist(X[,2], main='Histogram of X_2', xlab='X_2') qqnorm(X[,2]) qqline(X[,2]) ## Histogram and QQ for X_3 hist(X[,3], main='Histogram of X_3', xlab='X_3') qqnorm(X[,3]) qqline(X[,3]) ## Make all six plots on one graphic par(mfrow=c(3,2)) ## says I want a 3x2 matrix of plots hist(X[,1], main='Histogram of X_1', xlab='X_1') qqnorm(X[,1]) qqline(X[,1]) hist(X[,2], main='Histogram of X_2', xlab='X_2') qqnorm(X[,2]) qqline(X[,2]) hist(X[,3], main='Histogram of X_3', xlab='X_3') qqnorm(X[,3]) qqline(X[,3]) par(mfrow=c(1,1)) #### (2) Assess bivariate Normaility of each pair #### pairs(X) ##let's give an example of a typital bivariate normal elliptical appearance n1<-1000 mu1 <- c(10,20) sigma1 <- rbind(c(4,6), c(6,25)) Xb<-mvrnorm(n1,mu1,sigma1) pairs(Xb) plot(Xb) #### (3) Look for outliers #### ## use squared statstical distance function 'mahalanobis' to calculate distance ## from 1st observation to x_bar dist1 <- mahalanobis(X[1,], x_bar, S) dist1 ## just to show you 'mahalanobis' is calculating the right thing dist1 <- t(X[1,]-x_bar)%*%solve(S)%*%(X[1,]-x_bar) dist1 ## use 'mahalanobis' function to calculate distance to x_bar for all n obs dist <- mahalanobis(X, x_bar, S) dist ## What is the threshold value we should use to declare outliers? ## Let's use 99th% of chisquare 3 df dist'n cutoff <- qchisq(.99, 3) ## Now see which obs are greater distance to x_bar than cutoff which(dist > cutoff) ######################################### ############## Example 3 ################ ### Assessing Normality of Real Data #### ######################################### ## Read in data ex.data <- read.table(file="W:/teaching/stat4765762012/data/handout2_data.txt", header=T) nrow(ex.data) #how many rows of the data ex.data[1:10,] ## Take a look at the first 10 rows of data ## Intrested in the first 3 columns contain the X's of interest X <- as.matrix(ex.data[,1:3]) ## (1) Assess marginal Normality of each variable histograms and QQ-plots par(mfrow=c(3,2)) hist(X[,1], main='Histogram of Fuel', xlab='X_1') qqnorm(X[,1]) qqline(X[,1]) hist(X[,2], main='Histogram of Repair', xlab='X_2') qqnorm(X[,2]) qqline(X[,2]) hist(X[,3], main='Histogram of Capital', xlab='X_3') qqnorm(X[,3]) qqline(X[,3]) par(mfrow=c(1,1)) ## (2) Assess bivariate Normaility of each pair pairs(X) ## (3) Look for outliers x_bar <- colMeans(X) S <- cov(X) dist <- mahalanobis(X, x_bar, S) ## Let's use 99th% of chisquare 3 df dist'n cutoff <- qchisq(.99, 3) outliers <- which(dist > cutoff) outliers X[outliers,] ## Lets transform Fuel (X_1) to be more normal ## Try log transform logX1 <- log(X[,1]) X.trans <- X X.trans[,1] <- logX1 ## (1) Assess marginal Normality of each variable after transform par(mfrow=c(3,2)) hist(X.trans[,1], main='Histogram of log(Fuel)', xlab='X_1') qqnorm(X.trans[,1]) qqline(X.trans[,1]) hist(X.trans[,2], main='Histogram of Repair', xlab='X_2') qqnorm(X.trans[,2]) qqline(X.trans[,2]) hist(X.trans[,3], main='Histogram of Capital', xlab='X_3') qqnorm(X.trans[,3]) qqline(X.trans[,3]) par(mfrow=c(1,1)) par(mfrow=c(1,2)) hist(logX1, main='Histogram of Fuel', xlab='X_1') qqnorm(logX1) qqline(logX1) par(mfrow=c(1,1)) ## (2) re-assess bivariate Normaility of each pair pairs(X.trans) ## (3) Look for outliers x_bar <- colMeans(X.trans) S <- cov(X.trans) dist <- mahalanobis(X.trans, x_bar, S) cutoff <- qchisq(.99, 3) outliers <- which(dist > cutoff) X.trans[outliers,] ##data looks much better for ## normal dist'n assumption