# Preliminaries ----
# Load some basic dependencies
library(caret)
library(DAMisc)
library(splines)
library(caretEnsemble)
library(foreign)
library(sna)
library(network)
library(psych)
library(doSNOW)
library(parallel)

# Optional ----
# Install kaggleNCAA (used at the end of the script to parse the submission
# file and print a tournament bracket).
require('devtools')
install_github('zachmayer/kaggleNCAA')
require(kaggleNCAA)

# To get a phone notification when this run finishes (via Boxcar), install tmisc and
# Insert the appropriate .boxcar_token for your Boxcar account below.
require(devtools)
install_github('trcook/tmisc',subdir='tmisc')
.boxcar_token<-c("TOKEN_HERE")
library(tmisc)
# N threads: set to 0 to turn off parallel processing. Otherwise, set to the
# appropriate number of cores for your machine.
ncluster<-4
# Set your working directory here (adjust this path for your machine; all
# data files below are read relative to it):
setwd("C:/Users/Thomas/Desktop/NCAA/")

# START TRAINING ----

# This is the training process.
# First we boot up in parallel: load the first training split, note the start
# time, and register a SNOW cluster so caret trains models across cores
# (skipped entirely when ncluster == 0).
ivalidate<-read.csv("./ivalidate_pt1.csv")
start<-Sys.time()
if(ncluster>0){
cl<-makeCluster(ncluster)
registerDoSNOW(cl)
}
|
# Set some parameters for the training process: repeated cross-validation
# (5 repeats) with class probabilities and out-of-fold predictions saved —
# caretStack/caretEnsemble below need savePredictions + classProbs.
# twoClassSummary makes caret optimize ROC for the two-class outcome.
my_control <- trainControl(
method='repeatedcv',
repeats=5,
savePredictions=TRUE,
classProbs=TRUE,
summaryFunction=twoClassSummary
)

# Run models – this part takes a while. Specific models we use are listed in the
# methodList parameter. Fits nine base learners on the columns whose names
# match the feature regex (win outcome plus rating/rank/strength features).
models <- caretList(
win~.,
data=ivalidate[
,grepl("win|survscores|powscore|rank|winper|rpi|sos|ncf|orank",
names(ivalidate))
],
trControl=my_control,
methodList=c(
'bagFDA', 'nnet', 'ada',
'bayesglm', 'svmPoly','rf',
'knn', 'svmLinear', 'gbm')
)
|
# This will stack the models into an ensemble: caretStack fits a glm
# meta-model on the base learners' out-of-fold predictions; caretEnsemble
# builds a greedy stepwise weighted ensemble (up to 1000 iterations).
stack <- caretStack(models, method='glm')
greedy <- caretEnsemble(models, iter=1000L)
|
# Turn off parallel processing (only if a cluster was started above).
if(ncluster>0){
stopCluster(cl)
}

# Send a handy notification to your phone; the body reports the time it took to
# process. Record the finish time and push a Boxcar notification with the
# elapsed training time.
# FIX: elapsed time is end - start; the original computed start - end, which
# reported a negative duration.
end<-Sys.time()
boxcar_notify(
token = .boxcar_token,
body = paste("time taken:",
c(end-start)),title = "Training Done"
)

# Validation Process ----
Load in our validation dataset | ivalidate<-read.csv("./ivalidate_pt2.csv")
|
# Make the predictions for the validation set using the stacked model, on the
# same feature columns used in training.
# NOTE(review): column 1 of the probability matrix is taken here, while the
# stage-2 prediction later takes column 2 and then flips with 1 - preds —
# presumably both select the same class; confirm against the factor levels.
preds <- predict(stack,
type="prob",
newdata = ivalidate[ ,
grepl(
"win|survscores|powscore|rank|winper|rpi|sos|ncf|orank",
names(ivalidate)
)
]
)[,1]
# Keep only tournament games (daynum > 135) and collect prediction, real
# score margin, and season into one data frame.
df <- data.frame(
preds=preds[which(ivalidate$daynum>135)],
realscore=ivalidate$scorediff[which(ivalidate$daynum>135)],
season=ivalidate$season[which(ivalidate$daynum>135)]
)
# Plot predicted win probability against the real score margin with a loess
# smooth (qplot/geom_smooth come from ggplot2, attached via caret).
qplot(
preds, realscore,
data=df,
xlab="Prediction",
ylab="Real Margin") +
geom_smooth(method="loess")
# Convert margins and probabilities to binary outcomes, then compute log loss
# and accuracy on the validation games.
df$win <- 1*(df$realscore>0)
df$pwin <- 1*(df$preds>=.5)
# FIX: log loss is the NEGATIVE mean log-likelihood — the original omitted the
# leading minus sign and so reported a negative value, inconsistent with the
# CappedBinomialDeviance function below. NOTE(review): this uncapped version
# still returns -Inf/NaN if any prediction is exactly 0 or 1; the capped
# metric below is the one the contest scores.
logloss <- -sum(
(df$win*log(df$preds) +
(1-df$win)*log(1-df$preds)) * (1/nrow(df)) )
logloss
accuracy <- sum(df$win==df$pwin)/nrow(df) #Make 65% accuracy
|
# Calculate Log Loss – this is the metric by which the contest is judged.
# Predictions are clamped to [cap, 1 - cap] before scoring so that a single
# confidently-wrong prediction cannot produce an infinite loss.
#
# a   – vector of actual outcomes (0/1)
# p   – vector of predicted win probabilities, same length as a
# cap – clamping bound; default 0.01 reproduces the original hard-coded
#       0.01/0.99 caps, so existing two-argument calls are unchanged
CappedBinomialDeviance <- function(a, p, cap = 0.01) {
if (length(a) != length(p)){
stop("Actual and Predicted need to be equal lengths!")
}
p_capped <- pmin(1 - cap, p)
p_capped <- pmax(cap, p_capped)
-sum(a * log(p_capped) + (1 - a) * log(1 - p_capped)) / length(a)
}
CappedBinomialDeviance(df$win, df$preds)

# Second Round Training ----
get final data | ivalidate<-read.csv("./ivalidate_pt3.csv")
|
# Start up parallel processing again for the second training round.
start<-Sys.time()
if(ncluster>0){
cl<-makeCluster(ncluster)
registerDoSNOW(cl)
}
|
# Set parameters for the models — identical settings to the first round:
# 5-repeat repeated CV, saved out-of-fold predictions, class probabilities,
# and ROC-based model selection via twoClassSummary.
my_control <- trainControl(
method='repeatedcv',
repeats=5,
savePredictions=TRUE,
classProbs=TRUE,
summaryFunction=twoClassSummary
)
|
# Run the same nine base learners on the final training data.
# NOTE(review): this feature regex omits "orank", which round 1 included —
# presumably deliberate for the final data, but confirm it is not a typo.
models <- caretList(
win~., data=ivalidate[,
grepl(
"win|survscores|powscore|rank|winper|rpi|sos|ncf",
names(ivalidate)
)
],
trControl=my_control,
methodList=c(
'bagFDA', 'nnet','ada', 'bayesglm',
'svmPoly', 'rf','knn', 'svmLinear','gbm'
)#
)
|
# This will stack the models into an ensemble: glm meta-model via caretStack,
# plus a greedy stepwise weighted ensemble via caretEnsemble.
stack <- caretStack(models, method='glm')
greedy <- caretEnsemble(models, iter=1000L)
|
# Turn off parallel processing.
if(ncluster>0){
stopCluster(cl)
}
|
# Send a notification to your phone when the final training round completes.
# FIX: elapsed time is end - start; the original computed start - end, which
# reported a negative duration.
end<-Sys.time()
boxcar_notify(
token = .boxcar_token,
body = paste("time taken:",c(end-start)),
title = "Final Training Done")

# Final Round Predictions ----
# Get final round data: the stage-2 feature matrix (df2) and the matchup id
# table (df) used to label the submission rows.
ivalidate<-read.csv("./ivalidate_pt4.csv")
df2<-read.csv("./df2.csv")
df<-read.csv("./df.csv")
|
# Create predictions for every match-up for stage 2 with the stacked model.
# NOTE(review): column 2 of the probability matrix is taken here (validation
# above used column 1), and the submission below stores 1 - preds — presumably
# these two flips are consistent; verify against the outcome factor levels.
preds <- predict(stack, type="prob",
newdata = df2[,
grepl(
"win|survscores|powscore|rank|winper|rpi|sos|ncf|orank",
names(df2)
)
]
)[,2]
|
# Put the matchup ids and win probabilities (flipped to 1 - preds) into a
# data.frame and write the stage-2 submission file.
finaldf <- data.frame(id=df$id, pred=1-preds)
write.csv(finaldf, "./stage2_n2.csv", row.names=F)
|
# Generate a comparison of our predictions against the public seed-based
# benchmark predictions, merged by matchup id and plotted against each other.
d1 <- read.csv("./kaggle_submission_public.csv")
d2 <- read.csv("./stage2_n2.csv"); names(d2)[2] <- "mypred"
dat <- merge(d1, d2, by="id")
qplot(mypred, pred, data=dat)

# Generate bracket ----
# From the kaggleNCAA package (install with install_github('zachmayer/kaggleNCAA'))
# Parse the submission file into kaggleNCAA's bracket format, rename the
# columns to what walkTourney expects, simulate the 2015 tournament, and
# print the resulting bracket.
# NOTE(review): setnames() is a data.table function — presumably attached via
# kaggleNCAA's dependencies; confirm it is on the search path.
dat<-parseBracket("./stage2_n2.csv")
setnames(dat,names(dat),c("season", "team_2", "team_1", "pred"))
bracket <- walkTourney(dat, year=2015)
printableBracket(bracket)
|
| |