##########################################################
###############  Handout #2 for ST476/576  ###############
##########################################################


#########################################
############## Example 1 ################
####### Generating Multivariate Normal Data #######
#########################################

mu <- c(1,2,3)

Sigma <- rbind(c(5,  4,  5),
               c(4, 10,  3),
               c(5,  3, 15))


#### Generate 1 random draw from N(mu, Sigma)

## need Sigma^{1/2}
decomp <- eigen(Sigma)
Lambda <- diag(decomp$values)
P <- decomp$vectors
Sigma.sqrt <- P%*%sqrt(Lambda)%*%t(P)

z <- rnorm(3,0,1)
x <- Sigma.sqrt%*%z + mu
x

#### Generate a random sample of n=100 from N(mu, Sigma) ####
n <- 100

## First Generate iid N(0,1)'s
Z <- matrix(rnorm(3*n), nrow=n, ncol=3) # 100x3 matrix needs 300 total N(0,1)'s

## Now need M=(mu, mu, ... , mu)
M <- matrix(rep(mu, times=n), nrow=3, ncol=n)


## Now create sample
X <- Z%*%Sigma.sqrt + t(M)

## Compute x_bar and S 
x_bar <- colMeans(X)
x_bar
S <- cov(X)
S


#### easier way to Generate a random sample of n=100 from N(mu, Sigma), use MASS library
library(MASS)
X <- mvrnorm(n=100, mu, Sigma)

## Now Compute x_bar and S 
x_bar <- colMeans(X)
x_bar
S <- cov(X)
S


#### Generate a random sample of n=1000 from N(mu, Sigma) ####
library(MASS)   #from book Modern Applied Statistics with S (MASS)
X <- mvrnorm(n=1000, mu, Sigma)


## Now Compute x_bar and S 
x_bar <- colMeans(X)
x_bar
S <- cov(X)
S


#### Generate a random sample of n=100000 from N(mu, Sigma) ####
library(MASS)
X <- mvrnorm(n=100000, mu, Sigma)


## Now Compute x_bar and S 
x_bar <- colMeans(X)
x_bar
S <- cov(X)
S


#########################################
############## Example 2 ################
###### Assessing Normality of Data ######
#########################################

## We can use X from example 1 to see what actually normally distrbuted
##  data is supposed to look like

#### Generate a random sample of n=100 from N(mu, Sigma) ####

##used to set the random number generator seed (useful to reproduce results)
set.seed(11)

library(MASS)
X <- mvrnorm(n=100, mu, Sigma)

## Now Compute x_bar and S 
x_bar <- colMeans(X)
S <- cov(X)


#### (1) Assess marginal Normality of each variable ####

## Histogram of X_1
hist(X[,1])  ## creates histogram for first column of X

hist(X[,1], main='Histogram of X_1', xlab='X_1')  
             ## Make a plot with a different x label and title.


## QQ plot of X_1
qqnorm(X[,1])  ## creates QQ-plot for first column of X
qqline(X[,1])  ## adds a line to the QQ-plot


## Histogram and QQ for X_2
hist(X[,2], main='Histogram of X_2', xlab='X_2')  

qqnorm(X[,2])  
qqline(X[,2])  

## Histogram and QQ for X_3
hist(X[,3], main='Histogram of X_3', xlab='X_3')  

qqnorm(X[,3])  
qqline(X[,3])  


## Make all six plots on one graphic
par(mfrow=c(3,2))  ## says I want a 3x2 matrix of plots

hist(X[,1], main='Histogram of X_1', xlab='X_1')  
qqnorm(X[,1])  
qqline(X[,1])  

hist(X[,2], main='Histogram of X_2', xlab='X_2')  
qqnorm(X[,2])  
qqline(X[,2])  

hist(X[,3], main='Histogram of X_3', xlab='X_3')  
qqnorm(X[,3])  
qqline(X[,3])  

par(mfrow=c(1,1))


#### (2) Assess bivariate Normaility of each pair ####

pairs(X)

##let's give an example of a typital bivariate normal elliptical appearance
n1<-1000
mu1 <- c(10,20)
sigma1 <- rbind(c(4,6),
               c(6,25))
Xb<-mvrnorm(n1,mu1,sigma1)
pairs(Xb)
plot(Xb)


#### (3) Look for outliers ####

## use squared statstical distance function 'mahalanobis' to calculate distance
##  from 1st observation to x_bar
dist1 <- mahalanobis(X[1,], x_bar, S)
dist1

## just to show you 'mahalanobis' is calculating the right thing
dist1 <- t(X[1,]-x_bar)%*%solve(S)%*%(X[1,]-x_bar)
dist1

## use 'mahalanobis' function to calculate distance to x_bar for all n obs
dist <- mahalanobis(X, x_bar, S)
dist

## What is the threshold value we should use to declare outliers?
##  Let's use 99th%  of chisquare 3 df dist'n
cutoff <- qchisq(.99, 3)

## Now see which obs are greater distance to x_bar than cutoff
which(dist > cutoff) 


#########################################
############## Example 3 ################
### Assessing Normality of Real Data ####
#########################################

## Read in data
ex.data <- read.table(file="W:/teaching/stat4765762012/data/handout2_data.txt", header=T)
nrow(ex.data) #how many rows of the data
ex.data[1:10,] ## Take a look at the first 10 rows of data

## Intrested in the first 3 columns contain the X's of interest
X <- as.matrix(ex.data[,1:3])

## (1) Assess marginal Normality of each variable  histograms and QQ-plots 

par(mfrow=c(3,2))  

hist(X[,1], main='Histogram of Fuel', xlab='X_1')  
qqnorm(X[,1])  
qqline(X[,1])  

hist(X[,2], main='Histogram of Repair', xlab='X_2')  
qqnorm(X[,2])  
qqline(X[,2])  

hist(X[,3], main='Histogram of Capital', xlab='X_3')  
qqnorm(X[,3])  
qqline(X[,3])  

par(mfrow=c(1,1))


## (2) Assess bivariate Normaility of each pair
pairs(X)


## (3) Look for outliers
x_bar <- colMeans(X)
S <- cov(X)
dist <- mahalanobis(X, x_bar, S)

##  Let's use 99th%  of chisquare 3 df dist'n
cutoff <- qchisq(.99, 3)

outliers <- which(dist > cutoff) 
outliers

X[outliers,]


## Lets transform Fuel (X_1) to be more normal
## Try log transform

logX1 <- log(X[,1])
X.trans <- X
X.trans[,1] <- logX1


## (1) Assess marginal Normality of each variable after transform 
par(mfrow=c(3,2))

hist(X.trans[,1], main='Histogram of log(Fuel)', xlab='X_1')  
qqnorm(X.trans[,1])  
qqline(X.trans[,1])  

hist(X.trans[,2], main='Histogram of Repair', xlab='X_2')  
qqnorm(X.trans[,2])  
qqline(X.trans[,2])  

hist(X.trans[,3], main='Histogram of Capital', xlab='X_3')  
qqnorm(X.trans[,3])  
qqline(X.trans[,3])  

par(mfrow=c(1,1))

par(mfrow=c(1,2))
hist(logX1, main='Histogram of Fuel', xlab='X_1')  
qqnorm(logX1)
qqline(logX1)
par(mfrow=c(1,1))


## (2) re-assess bivariate Normaility of each pair
pairs(X.trans)


## (3) Look for outliers
x_bar <- colMeans(X.trans)
S <- cov(X.trans)
dist <- mahalanobis(X.trans, x_bar, S)

cutoff <- qchisq(.99, 3)
outliers <- which(dist > cutoff) 

X.trans[outliers,]

##data looks much better for 
##  normal dist'n assumption