#########################################################
############  Handout #7 for ST476/576  #################
##############  Principle Components  ###################
#########################################################


###################################
############ Example 1 ############
###### PC's for X versus Z ########
###################################


mu <- c(10,5)
Sigma <- rbind(c(100, 15),
               c(15, 10))
V <- diag(diag(1/sqrt(Sigma)))
Rho <- V %*% Sigma %*% V
Rho

eig.Sigma <- eigen(Sigma) #gives eigenvalue and eigenvector
eig.Sigma$values[1]/(eig.Sigma$values[1]+eig.Sigma$values[2]) #gives proportion of first eigenvalue
eig.Sigma

eig.Rho <- eigen(Rho)
eig.Rho$values[1]/(eig.Rho$values[1]+eig.Rho$values[2])
eig.Rho

summary(princomp(covmat=Sigma))
summary(princomp(covmat=Rho))


################################################
################## Example 2 ###################
######## PC's for Stock-Price data #############
################################################


#### Read in data ####
ex.data <- read.table(file="W:/teaching/stat4765762012/data/stocks_data.txt", header=T)

pairs(ex.data)

#### PCA on Sigma ####
stock_pca <- princomp(~ jpmorgan + citibank + wellsfargo + royaldutch + exxon, data=ex.data, cor=FALSE)##using covariance matrix

summary(stock_pca)
screeplot(stock_pca, type='lines')
loadings(stock_pca) #Small loadings are conventionally not printed
stock_pca$loadings[1:5,1:5]
## Plot data in PC coordinates
plot(stock_pca$scores, main='PC Scores for Stocks')
library('ellipse')
lines(ellipse(diag(stock_pca$sdev^2), t=sqrt(qchisq(.95,2)), centre=c(0,0)))

##stock_pca$scores are \hat y_1 = \hat e_1 *(x-\bar x)

#### PCA on Correlation Matrix ####
stock_pca <- princomp(~ jpmorgan + citibank + wellsfargo + royaldutch + exxon, data=ex.data, cor=TRUE)

summary(stock_pca)
screeplot(stock_pca, type='lines')
loadings(stock_pca)


################ Example 3 #################
######## PC's for NFL Games Data ###########
############################################


#### Read in data ####
nfl.data <- read.table(file="W:/teaching/stat4765762012/data/nfl_clean_data.txt", header=T)

nfl.data[1:3,]

##First 35 variables are "predictor" variables"

nfl_pca <- princomp(nfl.data[,1:35], cor=TRUE)

summary(nfl_pca)
screeplot(nfl_pca, type='lines')
nfl_pca$loadings[,1:4]


## Plot data in first 2 PC coordinates
plot(nfl_pca$scores[,1:2], main='PC Scores for NFL Data')
library('ellipse')
lines(ellipse(diag(nfl_pca$sdev[1:2]^2), t=sqrt(qchisq(.95,2)), centre=c(0,0)))


###  Use PC regression to predict point spread ###

## Use first 10 PC's in regression to predict point spread of game
nfl_lm <- lm(act_spread ~ nfl_pca$scores[,1:10], data=nfl.data)
summary(nfl_lm)

## Use first 15 PC's in regression to predict point spread of game
nfl_lm <- lm(act_spread ~ nfl_pca$scores[,1:15], data=nfl.data)
summary(nfl_lm)

## Use only Vegas Line to predict point spread
nfl_lm <- lm(act_spread ~ Spread_1, data=nfl.data)
summary(nfl_lm)


## Hold out several games for validation.  That is fit the regression to 
##  remaining data and try to predict the held out games' point spread

## hold out first 30 games for prediction
hold <- 1:30  
reg.data <- as.data.frame(nfl_pca$scores[-hold,1:35])
reg.data$act_spread <- nfl.data$act_spread[-hold] 
nfl_lm <- lm(act_spread ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10, data=reg.data)

pred_sprd <- predict(nfl_lm, as.data.frame(nfl_pca$scores[hold,1:35]))
act_sprd <- nfl.data$act_spread[hold]
vegas_sprd <- nfl.data$Spread_1[hold]

result <- data.frame(pred_sprd, act_sprd, vegas_sprd)
win <- ((pred_sprd > vegas_sprd)&(act_sprd > vegas_sprd)) |
       ((pred_sprd < vegas_sprd)&(act_sprd < vegas_sprd))
tie <- ((pred_sprd > vegas_sprd)&(act_sprd == vegas_sprd)) |
       ((pred_sprd < vegas_sprd)&(act_sprd == vegas_sprd))
result$win <- win + .5*tie

## Calculate winning pct
mean(result$win)


## What is winning pct for spreads that differ by 5 or more from our predicted?
mean(result$win[abs(pred_sprd-vegas_sprd) >= 5])


## Next piece of code holds out random subsets of the data to use
## for validation.  Repeat several times, then take average of wins.

## Hold out random subsets of the data to use for validation.  Repeat several 
## times, then take calculate pct of wins.

set.seed(110)
nit <- 500
wins <- rep(0,nit)
totals <- rep(0,nit)
bet.diff <- 2.5

for(i in 1:nit){
hold <- sample(1:168, size=20, replace=F)

reg.data <- as.data.frame(nfl_pca$scores[-hold,1:35])
reg.data$act_spread <- nfl.data$act_spread[-hold] 
nfl_lm <- lm(act_spread ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8, data=reg.data)

pred_sprd <- predict(nfl_lm, as.data.frame(nfl_pca$scores[hold,1:35]))
act_sprd <- nfl.data$act_spread[hold]
vegas_sprd <- nfl.data$Spread_1[hold]

result <- data.frame(pred_sprd, act_sprd, vegas_sprd)
win <- ((pred_sprd > vegas_sprd)&(act_sprd > vegas_sprd)) |
       ((pred_sprd < vegas_sprd)&(act_sprd < vegas_sprd))
tie <- ((pred_sprd > vegas_sprd)&(act_sprd == vegas_sprd)) |
       ((pred_sprd < vegas_sprd)&(act_sprd == vegas_sprd))
result$win <- win + .5*tie

wins[i] <- sum(result$win[abs(pred_sprd-vegas_sprd) >= bet.diff])
totals[i] <- sum(abs(pred_sprd-vegas_sprd) >= bet.diff)
}

prop.win <- sum(wins, na.rm=T)/sum(totals, na.rm=T)
std.err <- sqrt(prop.win*(1-prop.win)/sum(totals, na.rm=T))
prop.win
std.err


## What about total points?

## Use first 10 PC's in regression to predict total points of game
reg.data <- as.data.frame(nfl_pca$scores[,1:35])
reg.data$act_total <- nfl.data$act_total 
nfl_lm <- lm(act_total ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10, data=reg.data)
summary(nfl_lm)

## Use first 15 PC's in regression to predict total points of game
nfl_lm <- lm(act_total ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10+Comp.11+Comp.12+Comp.13+Comp.14+Comp.15, data=reg.data)
summary(nfl_lm)

## Use the Vegas over/under line to predict total points
nfl_lm <- lm(act_total ~ tot_pts, data=nfl.data)
summary(nfl_lm)


## Hold out random subsets of the data to use for validation.  Repeat 
## several times, then calculate pct of wins.

set.seed(110)
nit <- 500
wins <- rep(0,nit)
totals <- rep(0,nit)
bet.diff <- 5

for(i in 1:nit){
hold <- sample(1:168, size=20, replace=F)

reg.data <- as.data.frame(nfl_pca$scores[-hold,1:35])
reg.data$act_total <- nfl.data$act_total[-hold] 

nfl_lm <- lm(act_total ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10, data=reg.data)

pred_tot <- predict(nfl_lm, as.data.frame(nfl_pca$scores[hold,1:35]))
act_tot <- nfl.data$act_total[hold]
vegas_tot <- nfl.data$tot_pts[hold]

result <- data.frame(pred_tot, act_tot, vegas_tot)
win <- ((pred_tot > vegas_tot)&(act_tot > vegas_tot)) |
       ((pred_tot < vegas_tot)&(act_tot < vegas_tot))
tie <- ((pred_tot > vegas_tot)&(act_tot == vegas_tot)) |
       ((pred_tot < vegas_tot)&(act_tot == vegas_tot))
result$win <- win + .5*tie

wins[i] <- sum(result$win[abs(pred_tot-vegas_tot) > bet.diff])
totals[i] <- sum(abs(pred_tot-vegas_tot) > bet.diff)
}

prop.win <- sum(wins, na.rm=T)/sum(totals, na.rm=T)
std.err <- sqrt(prop.win*(1-prop.win)/sum(totals, na.rm=T))
prop.win
std.err