######################################################### ############ Handout #7 for ST476/576 ################# ############## Principle Components ################### ######################################################### ################################### ############ Example 1 ############ ###### PC's for X versus Z ######## ################################### mu <- c(10,5) Sigma <- rbind(c(100, 15), c(15, 10)) V <- diag(diag(1/sqrt(Sigma))) Rho <- V %*% Sigma %*% V Rho eig.Sigma <- eigen(Sigma) #gives eigenvalue and eigenvector eig.Sigma$values[1]/(eig.Sigma$values[1]+eig.Sigma$values[2]) #gives proportion of first eigenvalue eig.Sigma eig.Rho <- eigen(Rho) eig.Rho$values[1]/(eig.Rho$values[1]+eig.Rho$values[2]) eig.Rho summary(princomp(covmat=Sigma)) summary(princomp(covmat=Rho)) ################################################ ################## Example 2 ################### ######## PC's for Stock-Price data ############# ################################################ #### Read in data #### ex.data <- read.table(file="W:/teaching/stat4765762012/data/stocks_data.txt", header=T) pairs(ex.data) #### PCA on Sigma #### stock_pca <- princomp(~ jpmorgan + citibank + wellsfargo + royaldutch + exxon, data=ex.data, cor=FALSE)##using covariance matrix summary(stock_pca) screeplot(stock_pca, type='lines') loadings(stock_pca) #Small loadings are conventionally not printed stock_pca$loadings[1:5,1:5] ## Plot data in PC coordinates plot(stock_pca$scores, main='PC Scores for Stocks') library('ellipse') lines(ellipse(diag(stock_pca$sdev^2), t=sqrt(qchisq(.95,2)), centre=c(0,0))) ##stock_pca$scores are \hat y_1 = \hat e_1 *(x-\bar x) #### PCA on Correlation Matrix #### stock_pca <- princomp(~ jpmorgan + citibank + wellsfargo + royaldutch + exxon, data=ex.data, cor=TRUE) summary(stock_pca) screeplot(stock_pca, type='lines') loadings(stock_pca) ################ Example 3 ################# ######## PC's for NFL Games Data ########### ############################################ #### Read in data #### nfl.data <- read.table(file="W:/teaching/stat4765762012/data/nfl_clean_data.txt", header=T) nfl.data[1:3,] ##First 35 variables are "predictor" variables" nfl_pca <- princomp(nfl.data[,1:35], cor=TRUE) summary(nfl_pca) screeplot(nfl_pca, type='lines') nfl_pca$loadings[,1:4] ## Plot data in first 2 PC coordinates plot(nfl_pca$scores[,1:2], main='PC Scores for NFL Data') library('ellipse') lines(ellipse(diag(nfl_pca$sdev[1:2]^2), t=sqrt(qchisq(.95,2)), centre=c(0,0))) ### Use PC regression to predict point spread ### ## Use first 10 PC's in regression to predict point spread of game nfl_lm <- lm(act_spread ~ nfl_pca$scores[,1:10], data=nfl.data) summary(nfl_lm) ## Use first 15 PC's in regression to predict point spread of game nfl_lm <- lm(act_spread ~ nfl_pca$scores[,1:15], data=nfl.data) summary(nfl_lm) ## Use only Vegas Line to predict point spread nfl_lm <- lm(act_spread ~ Spread_1, data=nfl.data) summary(nfl_lm) ## Hold out several games for validation. That is fit the regression to ## remaining data and try to predict the held out games' point spread ## hold out first 30 games for prediction hold <- 1:30 reg.data <- as.data.frame(nfl_pca$scores[-hold,1:35]) reg.data$act_spread <- nfl.data$act_spread[-hold] nfl_lm <- lm(act_spread ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10, data=reg.data) pred_sprd <- predict(nfl_lm, as.data.frame(nfl_pca$scores[hold,1:35])) act_sprd <- nfl.data$act_spread[hold] vegas_sprd <- nfl.data$Spread_1[hold] result <- data.frame(pred_sprd, act_sprd, vegas_sprd) win <- ((pred_sprd > vegas_sprd)&(act_sprd > vegas_sprd)) | ((pred_sprd < vegas_sprd)&(act_sprd < vegas_sprd)) tie <- ((pred_sprd > vegas_sprd)&(act_sprd == vegas_sprd)) | ((pred_sprd < vegas_sprd)&(act_sprd == vegas_sprd)) result$win <- win + .5*tie ## Calculate winning pct mean(result$win) ## What is winning pct for spreads that differ by 5 or more from our predicted? mean(result$win[abs(pred_sprd-vegas_sprd) >= 5]) ## Next piece of code holds out random subsets of the data to use ## for validation. Repeat several times, then take average of wins. ## Hold out random subsets of the data to use for validation. Repeat several ## times, then take calculate pct of wins. set.seed(110) nit <- 500 wins <- rep(0,nit) totals <- rep(0,nit) bet.diff <- 2.5 for(i in 1:nit){ hold <- sample(1:168, size=20, replace=F) reg.data <- as.data.frame(nfl_pca$scores[-hold,1:35]) reg.data$act_spread <- nfl.data$act_spread[-hold] nfl_lm <- lm(act_spread ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8, data=reg.data) pred_sprd <- predict(nfl_lm, as.data.frame(nfl_pca$scores[hold,1:35])) act_sprd <- nfl.data$act_spread[hold] vegas_sprd <- nfl.data$Spread_1[hold] result <- data.frame(pred_sprd, act_sprd, vegas_sprd) win <- ((pred_sprd > vegas_sprd)&(act_sprd > vegas_sprd)) | ((pred_sprd < vegas_sprd)&(act_sprd < vegas_sprd)) tie <- ((pred_sprd > vegas_sprd)&(act_sprd == vegas_sprd)) | ((pred_sprd < vegas_sprd)&(act_sprd == vegas_sprd)) result$win <- win + .5*tie wins[i] <- sum(result$win[abs(pred_sprd-vegas_sprd) >= bet.diff]) totals[i] <- sum(abs(pred_sprd-vegas_sprd) >= bet.diff) } prop.win <- sum(wins, na.rm=T)/sum(totals, na.rm=T) std.err <- sqrt(prop.win*(1-prop.win)/sum(totals, na.rm=T)) prop.win std.err ## What about total points? ## Use first 10 PC's in regression to predict total points of game reg.data <- as.data.frame(nfl_pca$scores[,1:35]) reg.data$act_total <- nfl.data$act_total nfl_lm <- lm(act_total ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10, data=reg.data) summary(nfl_lm) ## Use first 15 PC's in regression to predict total points of game nfl_lm <- lm(act_total ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10+Comp.11+Comp.12+Comp.13+Comp.14+Comp.15, data=reg.data) summary(nfl_lm) ## Use the Vegas over/under line to predict total points nfl_lm <- lm(act_total ~ tot_pts, data=nfl.data) summary(nfl_lm) ## Hold out random subsets of the data to use for validation. Repeat ## several times, then calculate pct of wins. set.seed(110) nit <- 500 wins <- rep(0,nit) totals <- rep(0,nit) bet.diff <- 5 for(i in 1:nit){ hold <- sample(1:168, size=20, replace=F) reg.data <- as.data.frame(nfl_pca$scores[-hold,1:35]) reg.data$act_total <- nfl.data$act_total[-hold] nfl_lm <- lm(act_total ~ Comp.1+Comp.2+Comp.3+Comp.4+Comp.5+Comp.6+Comp.7+Comp.8+Comp.9+Comp.10, data=reg.data) pred_tot <- predict(nfl_lm, as.data.frame(nfl_pca$scores[hold,1:35])) act_tot <- nfl.data$act_total[hold] vegas_tot <- nfl.data$tot_pts[hold] result <- data.frame(pred_tot, act_tot, vegas_tot) win <- ((pred_tot > vegas_tot)&(act_tot > vegas_tot)) | ((pred_tot < vegas_tot)&(act_tot < vegas_tot)) tie <- ((pred_tot > vegas_tot)&(act_tot == vegas_tot)) | ((pred_tot < vegas_tot)&(act_tot == vegas_tot)) result$win <- win + .5*tie wins[i] <- sum(result$win[abs(pred_tot-vegas_tot) > bet.diff]) totals[i] <- sum(abs(pred_tot-vegas_tot) > bet.diff) } prop.win <- sum(wins, na.rm=T)/sum(totals, na.rm=T) std.err <- sqrt(prop.win*(1-prop.win)/sum(totals, na.rm=T)) prop.win std.err