West-Nile-Virus | Kaggle

#install.packages("ISOweek")
#install.packages("stringr")
#install.packages("caTools")
#install.packages("TTR")
#install.packages("gam")
#install.packages("data.table")
#install.packages("Metrics")
#install.packages("randomForest")
#install.packages("MASS")
#install.packages("e1071")
#install.packages("fpc", dependencies = TRUE)
#install.packages("geosphere")
#install.packages("scatterplot3d")
#install.packages("party")
#install.packages("adabag")
#install.packages("DeducerExtras")
#install.packages("h2o")
library(ISOweek)
library(stringr)
library(caTools)
library(TTR)
require(gam)
library(data.table)
library(Metrics)
require(randomForest)
require(MASS)
require(e1071)
library(fpc)
library(geosphere)
library(scatterplot3d)
require(party)
require(adabag)
require(DeducerExtras)
require(h2o)
#Read Raw
Data------------------------------------------------------------------
------------------------------------------------
train <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/train.csv")
test <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working
Folder/test.csv")
weather <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/

Working Folder/weather.csv")
spray <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/spray.csv")
#Get Week Number and ReCategorize
Species---------------------------------------------------------------
------------------------
vSpecies<-c(as.character(train$Species),as.character(test$Species))
vSpecies[vSpecies=="UNSPECIFIED CULEX"]<-"CULEX ERRATICUS"
vSpecies[-which(vSpecies == "CULEX PIPIENS" |
vSpecies == "CULEX PIPIENS/RESTUANS" |
vSpecies == "CULEX RESTUANS")] = "CULEX OTHER"
vSpecies<-factor(vSpecies,levels=unique(vSpecies))
## data.table syntax for adding a column; could overwrite the existing
column as well
train[,Species2:=factor(vSpecies[1:nrow(train)],levels=unique(vSpecies
))]
test[,Species2:=factor(vSpecies[(nrow(train)
+1):length(vSpecies)],levels=unique(vSpecies))]
## also add some fields for components of the date using simple
substrings
train[,dMonth:=as.factor(paste(substr(train$Date,6,7)))]
train[,dYear:=as.factor(paste(substr(train$Date,1,4)))]
train$Date = as.Date(train$Date, format="%Y-%m-%d")
xsDate = as.Date(paste0(train$dYear, "0101"), format="%Y%m%d")
train$dWeek = as.numeric(paste(floor((train$Date - xsDate + 1)/7)))
test[,dMonth:=as.factor(paste(substr(test$Date,6,7)))]
test[,dYear:=as.factor(paste(substr(test$Date,1,4)))]
test$Date = as.Date(test$Date, format="%Y-%m-%d")
tsDate = as.Date(paste0(test$dYear, "0101"), format="%Y%m%d")
test$dWeek = as.numeric(paste(floor((test$Date - tsDate + 1)/7)))
weather[,dMonth:=as.factor(paste(substr(weather$Date,6,7)))]
weather[,dYear:=as.factor(paste(substr(weather$Date,1,4)))]
weather$Date = as.Date(weather$Date, format="%Y-%m-%d")
tsDate = as.Date(paste0(weather$dYear, "0101"), format="%Y%m%d")
weather$dWeek = as.numeric(paste(floor((weather$Date - tsDate + 1)/
7)))
#Train Data Pre-

processing------------------------------------------------------------
------------------------------------------
wnv.train <- data.frame(train[,list(Date, dWeek, Species, Trap,
NumMosquitos, Latitude, Longitude, Species2, WnvPresent)])
#Cast Date Type
wnv.train$Date <- as.Date(wnv.train$Date)
oh.lat <- 41.995
oh.long <- -87.933
mw.lat <- 41.786
mw.long <- -87.752
for (i in 1:nrow(wnv.train)) {
#Euclidean Distances
wnv.train$dist.oh[i] <- sqrt((wnv.train$Latitude[i]-oh.lat)**2+
(wnv.train$Longitude[i]-oh.long)**2)*111
wnv.train$dist.mw[i] <- sqrt((wnv.train$Latitude[i]-mw.lat)**2+
(wnv.train$Longitude[i]-mw.long)**2)*111
#Haversine Distance
wnv.train$disth.oh[i] <- distHaversine(c(wnv.train$Longitude[i],
wnv.train$Latitude[i]), c(oh.long, oh.lat))/1000
wnv.train$disth.mw[i] <- distHaversine(c(wnv.train$Longitude[i],
wnv.train$Latitude[i]), c(mw.long, mw.lat))/1000
if (wnv.train$dist.oh[i] >= wnv.train$dist.mw[i]) {
wnv.train$Station.Euc[i] <- 2 #2 is Midway
}
else {
wnv.train$Station.Euc[i] <- 1 #1 is O'Hare
}
if (wnv.train$disth.oh[i] >= wnv.train$disth.mw[i]) {
wnv.train$Station[i] <- 2 #2 is Midway
}
else {
wnv.train$Station[i] <- 1 #1 is O'Hare
}
#Creating ISO Week Number
#wnv.train$week.num[i] <- substring(ISOweek(wnv.train$Date[i]), 7,
8)

}
#Weather Data
Cleaning--------------------------------------------------------------
----------------------------------------------
#Choose Variables
wnv.weather <- data.frame(weather[,list(Station, Date, Tmax, Tmin,
Tavg, Depart, DewPoint, WetBulb, Heat, Cool, PrecipTotal, StnPressure,
SeaLevel, ResultSpeed,
ResultDir, AvgSpeed)])
#Cast Date Type
wnv.weather$Date <- as.Date(wnv.weather$Date)
#Create Week Number Variable
for (i in 1:nrow(wnv.weather)) {
wnv.weather$week.num[i] <- as.numeric(substring(ISOweek(wnv.weather
$Date[i]), 7, 8)) #Create Week Markers
wnv.weather$date.year[i] <- as.numeric(format(wnv.weather$Date[i],
"%Y"))
#wnv.weather$date.month[i] <- as.numeric(format(wnv.weather$Date[i],
"%m"))
#wnv.weather$date.day[i] <- as.numeric(format(wnv.weather$Date[i],
"%d"))
}
#Clean Temperature
wnv.weather$Tmax <- as.numeric(as.character(wnv.weather$Tmax))
wnv.weather$Tmin <- as.numeric(as.character(wnv.weather$Tmin))
wnv.weather$Tavg <- as.numeric(as.character(wnv.weather$Tavg))
wnv.weather$Tavg[is.na(wnv.weather$Tavg)] <- (wnv.weather
$Tmax[is.na(wnv.weather$Tavg)] + wnv.weather$Tmin[is.na(wnv.weather
$Tavg)])/2
#Clean DewPoint
wnv.weather$DewPoint <- as.numeric(as.character(wnv.weather$DewPoint))

#Clean Depart
wnv.weather$Depart[wnv.weather$Depart=="M"] <- NA
wnv.weather$Depart <- as.numeric(as.character(wnv.weather$Depart))
#Clean WetBulb
wnv.weather$WetBulb <- as.numeric(as.character(wnv.weather$WetBulb))
wnv.weather$WetBulb[849]<- wnv.weather$WetBulb[850]
#Clean Heat
wnv.weather$Heat <- as.numeric(as.character(wnv.weather$Heat))
wnv.weather$Heat[8]<- wnv.weather$Heat[7]
#Clean Cool
wnv.weather$Cool <- as.numeric(as.character(wnv.weather$Cool))
wnv.weather$Cool[8]<- wnv.weather$Cool[7]

#Clean PrecipTotal
wnv.weather$PrecipTotal <- as.numeric(as.character(wnv.weather
$PrecipTotal))
wnv.weather$PrecipTotal[is.na(wnv.weather$PrecipTotal)] <- 0
#Clean StnPressure
wnv.weather$StnPressure <- as.numeric(as.character(wnv.weather
$StnPressure))
wnv.weather$StnPressure[88]<- wnv.weather$StnPressure[87]
#Clean SeaLevel
wnv.weather$SeaLevel <- as.numeric(as.character(wnv.weather$SeaLevel))
wnv.weather$SeaLevel[88]<- wnv.weather$SeaLevel[87]
#Clean ResultSpeed
wnv.weather$ResultSpeed <- as.numeric(as.character(wnv.weather
$ResultSpeed))
#Clean ResultDir
wnv.weather$ResultDir <- as.numeric(as.character(wnv.weather
$ResultDir))
#Clean AvgSpeed
wnv.weather$AvgSpeed <- as.numeric(as.character(wnv.weather$AvgSpeed))
wnv.weather$AvgSpeed[is.na(wnv.weather$AvgSpeed)] <- mean(wnv.weather
$AvgSpeed, na.rm =TRUE)

#Split weather data into O'Hare and Midway
wnv.weather.oh <- subset(wnv.weather, wnv.weather$Station == 1)
wnv.weather.mw <- subset(wnv.weather, wnv.weather$Station == 2)
#Create New Weather Features: temp.mov.avg, prec.mov.avg, deg.day,
accumulated degree day for each year
Tbase <- 71.6 #22deg C = 71.6deg F
wnv.weather.oh$deg.day <- 0
wnv.weather.oh$deg.day.c <- 0
wnv.weather.oh$acc.deg.day <- 0
wnv.weather.oh$acc.deg.day.c <- 0
newyear <- TRUE
#For O'Hare
start <- 1
for (year in 2007:2014) {
iterations <- nrow(subset(wnv.weather.oh, wnv.weather.oh$date.year
== year ))
for (i in start:(start+iterations-1)) {
Tmean <- (wnv.weather.oh$Tmax[i] + wnv.weather.oh$Tmin[i])/2
if ( Tmean > Tbase) {
wnv.weather.oh$deg.day[i] <- Tmean - Tbase
wnv.weather.oh$deg.day.c[i] <- 0
} else {
wnv.weather.oh$deg.day[i] <- 0
wnv.weather.oh$deg.day.c[i] <- Tbase - Tmean
}
if (newyear == TRUE) {
newyear <- FALSE
wnv.weather.oh$acc.deg.day[i] <- 0
} else {
wnv.weather.oh$acc.deg.day[i] <- wnv.weather.oh$deg.day[i] +
wnv.weather.oh$acc.deg.day[i-1]
wnv.weather.oh$acc.deg.day.c[i] <- wnv.weather.oh$deg.day.c[i] +
wnv.weather.oh$acc.deg.day.c[i-1]
}
}
start <- start + iterations
newyear <- TRUE
}

#For Midway
Tbase <- 71.6 #22deg C = 71.6deg F
wnv.weather.mw$deg.day <- 0
wnv.weather.mw$deg.day.c <- 0
wnv.weather.mw$acc.deg.day <- 0
wnv.weather.mw$acc.deg.day.c <- 0
newyear <- TRUE
start <- 1
for (year in 2007:2014) {
iterations <- nrow(subset(wnv.weather.mw, wnv.weather.mw$date.year
== year ))
for (i in start:(start+iterations-1)) {
Tmean <- (wnv.weather.mw$Tmax[i] + wnv.weather.mw$Tmin[i])/2
if ( Tmean > Tbase) {
wnv.weather.mw$deg.day[i] <- Tmean - Tbase
wnv.weather.mw$deg.day.c[i] <- 0
} else {
wnv.weather.mw$deg.day[i] <- 0
wnv.weather.mw$deg.day.c[i] <- Tbase - Tmean
}
if (newyear == TRUE) {
newyear <- FALSE
wnv.weather.mw$acc.deg.day[i] <- 0
} else {
wnv.weather.mw$acc.deg.day[i] <- wnv.weather.mw$deg.day[i] +
wnv.weather.mw$acc.deg.day[i-1]
wnv.weather.mw$acc.deg.day.c[i] <- wnv.weather.mw$deg.day.c[i] +
wnv.weather.mw$acc.deg.day.c[i-1]
}
}
start <- start + iterations
newyear <- TRUE
}
#Creating Moving Averages and Moving Sums
oh.temp.mov.avg.wk <- vector()
oh.temp.mov.avg.2wk <- vector()
oh.prec.mov.avg.wk <- vector()

oh.prec.mov.avg.2wk <- vector()
oh.prec.mov.sum.wk <- vector()
oh.prec.mov.sum.2wk <- vector()
oh.dd.mov.sum.wk <- vector()
oh.dd.mov.sum.2wk <- vector()
mw.temp.mov.avg.wk <- vector()
mw.temp.mov.avg.2wk <- vector()
mw.prec.mov.avg.wk <- vector()
mw.prec.mov.avg.2wk <- vector()
mw.prec.mov.sum.wk <- vector()
mw.prec.mov.sum.2wk <- vector()
mw.dd.mov.sum.wk <- vector()
mw.dd.mov.sum.2wk <- vector()
for (year in 2007:2014) {
#O'Hare
assign(paste("oh.temp.mov.avg.wk", year, sep=""),
runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year ==
year], 7, alg="C", endrule="mean"))
assign(paste("oh.temp.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year ==
assign(paste("oh.prec.mov.avg.wk", year, sep=""),
runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year
== year], 7, alg="C", endrule="mean"))
assign(paste("oh.prec.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year
assign(paste("oh.prec.mov.sum.wk", year, sep=""),
runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("oh.prec.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year ==
assign(paste("oh.dd.mov.sum.wk", year, sep=""),
runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year ==
assign(paste("oh.dd.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year ==

oh.temp.mov.avg.wk <- c(oh.temp.mov.avg.wk,
get(paste("oh.temp.mov.avg.wk", year, sep="")))
oh.temp.mov.avg.2wk <- c(oh.temp.mov.avg.2wk,
get(paste("oh.temp.mov.avg.2wk", year, sep="")))
oh.prec.mov.avg.wk <- c(oh.prec.mov.avg.wk,
get(paste("oh.prec.mov.avg.wk", year, sep="")))
oh.prec.mov.avg.2wk <- c(oh.prec.mov.avg.2wk,
get(paste("oh.prec.mov.avg.2wk", year, sep="")))
oh.prec.mov.sum.wk <- c(oh.prec.mov.sum.wk,
get(paste("oh.prec.mov.sum.wk", year, sep="")))
oh.prec.mov.sum.2wk <- c(oh.prec.mov.sum.2wk,
get(paste("oh.prec.mov.sum.2wk", year, sep="")))
oh.dd.mov.sum.wk <- c(oh.dd.mov.sum.wk,
get(paste("oh.dd.mov.sum.wk", year, sep="")))
oh.dd.mov.sum.2wk <- c(oh.dd.mov.sum.2wk, get(paste("oh.dd.mov.sum.
2wk", year, sep="")))
#Midway
assign(paste("mw.temp.mov.avg.wk", year, sep=""),
runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year ==
assign(paste("mw.temp.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year ==
assign(paste("mw.prec.mov.avg.wk", year, sep=""),
runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year
assign(paste("mw.prec.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year
assign(paste("mw.prec.mov.sum.wk", year, sep=""),
runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year ==
assign(paste("mw.prec.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year ==
assign(paste("mw.dd.mov.sum.wk", year, sep=""),
runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year ==
assign(paste("mw.dd.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year ==

mw.temp.mov.avg.wk <- c(mw.temp.mov.avg.wk,
get(paste("mw.temp.mov.avg.wk", year, sep="")))
mw.temp.mov.avg.2wk <- c(mw.temp.mov.avg.2wk,
get(paste("mw.temp.mov.avg.2wk", year, sep="")))
mw.prec.mov.avg.wk <- c(mw.prec.mov.avg.wk,
get(paste("mw.prec.mov.avg.wk", year, sep="")))
mw.prec.mov.avg.2wk <- c(mw.prec.mov.avg.2wk,
get(paste("mw.prec.mov.avg.2wk", year, sep="")))
mw.prec.mov.sum.wk <- c(mw.prec.mov.sum.wk,
get(paste("mw.prec.mov.sum.wk", year, sep="")))
mw.prec.mov.sum.2wk <- c(mw.prec.mov.sum.2wk,
get(paste("mw.prec.mov.sum.2wk", year, sep="")))
mw.dd.mov.sum.wk <- c(mw.dd.mov.sum.wk,
get(paste("mw.dd.mov.sum.wk", year, sep="")))
mw.dd.mov.sum.2wk <- c(mw.dd.mov.sum.2wk, get(paste("mw.dd.mov.sum.
2wk", year, sep="")))
}
#Add Variable Matrices to Weather Data Frame
wnv.weather.oh$temp.mov.avg.wk <- oh.temp.mov.avg.wk
wnv.weather.oh$temp.mov.avg.2wk <- oh.temp.mov.avg.2wk
wnv.weather.oh$prec.mov.avg.wk <- oh.prec.mov.avg.wk
wnv.weather.oh$prec.mov.avg.2wk <- oh.prec.mov.avg.2wk
wnv.weather.oh$prec.mov.sum.wk <- oh.prec.mov.sum.wk
wnv.weather.oh$prec.mov.sum.2wk <- oh.prec.mov.sum.2wk
wnv.weather.oh$dd.mov.sum.wk <- oh.dd.mov.sum.wk
wnv.weather.oh$dd.mov.sum.2wk <- oh.dd.mov.sum.2wk
wnv.weather.oh$prec.mov.sum.wk[is.na(wnv.weather.oh$prec.mov.sum.wk)]
<- 0
wnv.weather.oh$prec.mov.sum.2wk[is.na(wnv.weather.oh$prec.mov.sum.
2wk)] <- 0
wnv.weather.oh$dd.mov.sum.wk[is.na(wnv.weather.oh$dd.mov.sum.wk)] <- 0
wnv.weather.oh$dd.mov.sum.2wk[is.na(wnv.weather.oh$dd.mov.sum.2wk)] <-
0
wnv.weather.mw$temp.mov.avg.wk <- mw.temp.mov.avg.wk

wnv.weather.mw$temp.mov.avg.2wk <- mw.temp.mov.avg.2wk
wnv.weather.mw$prec.mov.avg.wk <- mw.prec.mov.avg.wk
wnv.weather.mw$prec.mov.avg.2wk <- mw.prec.mov.avg.2wk
wnv.weather.mw$prec.mov.sum.wk <- mw.prec.mov.sum.wk
wnv.weather.mw$prec.mov.sum.2wk <- mw.prec.mov.sum.2wk
wnv.weather.mw$dd.mov.sum.wk <- mw.dd.mov.sum.wk
wnv.weather.mw$dd.mov.sum.2wk <- mw.dd.mov.sum.2wk
wnv.weather.mw$prec.mov.sum.wk[is.na(wnv.weather.mw$prec.mov.sum.wk)]
<- 0
wnv.weather.mw$prec.mov.sum.2wk[is.na(wnv.weather.mw$prec.mov.sum.
2wk)] <- 0
wnv.weather.mw$dd.mov.sum.wk[is.na(wnv.weather.mw$dd.mov.sum.wk)] <- 0
wnv.weather.mw$dd.mov.sum.2wk[is.na(wnv.weather.mw$dd.mov.sum.2wk)] <-
0
options(scipen=100)
#Test Data Pre-
processing------------------------------------------------------------
------------------------------------------
wnv.test <- data.frame(test[,list(Date, Id, dWeek, Species, Trap,
Latitude, Longitude, Species2)])
#Cast Date Type
wnv.test$Date <- as.Date(wnv.test$Date)
oh.lat <- 41.995
oh.long <- -87.933
mw.lat <- 41.786
mw.long <- -87.752
for (i in 1:nrow(wnv.test)) {
#Euclidean Distances
wnv.test$dist.oh[i] <- sqrt((wnv.test$Latitude[i]-oh.lat)**2+
(wnv.test$Longitude[i]-oh.long)**2)*111
wnv.test$dist.mw[i] <- sqrt((wnv.test$Latitude[i]-mw.lat)**2+
(wnv.test$Longitude[i]-mw.long)**2)*111
#Haversine Distance

wnv.test$disth.oh[i] <- distHaversine(c(wnv.test$Longitude[i],
wnv.test$Latitude[i]), c(oh.long, oh.lat))/1000
wnv.test$disth.mw[i] <- distHaversine(c(wnv.test$Longitude[i],
wnv.test$Latitude[i]), c(mw.long, mw.lat))/1000
if (wnv.test$dist.oh[i] >= wnv.test$dist.mw[i]) {
wnv.test$Station.Euc[i] <- 2 #2 is Midway
}
else {
wnv.test$Station.Euc[i] <- 1 #1 is O'Hare
}
if (wnv.test$disth.oh[i] >= wnv.test$disth.mw[i]) {
wnv.test$Station[i] <- 2 #2 is Midway
}
else {
wnv.test$Station[i] <- 1 #1 is O'Hare
}
#Creating ISO Week Number
#wnv.test$week.num[i] <- substring(ISOweek(wnv.test$Date[i]), 7, 8)
}
#Concatenating Clean Weather Datasets and Merging with Training/Test
Data----------------------------------------------------------
#wnv.train$Species2 <- train$Species2
#wnv.test$Species2 <- test$Species2
wnv.weather.clean <- rbind(wnv.weather.oh, wnv.weather.mw)
wnv.train.weather <- merge(wnv.train, wnv.weather.clean, by=c("Date",
"Station"))
wnv.test.weather <- merge(wnv.test, wnv.weather.clean, by=c("Date",
"Station"))
#Just use one station...
#Sorting wnv.test.weather
wnv.test.weather.sorted <- wnv.test.weather[order(wnv.test.weather
$Id),]

#Training/Testing
Split-----------------------------------------------------------------
------------------------------------
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2007,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2007,]
#-----------------------------------------------------
= 2009,]
== 2009,]
#-----------------------------------------------------
#Set aside 2011 data as test, and train on the remaining
= 2011,]
== 2011,]
#-----------------------------------------------------
= 2013,]
== 2013,]
#Clustering Locations -
DBSCAN----------------------------------------------------------------
---------------------------
#Potential epsilon values:
#1/4 mile = 0.003623
#1/2 mile = 0.007246
#1 mile = 0.014493
#2 mile = 0.028986
#3 mile = 0.043479

#Based on 1deg Latitude/Longitude = 69 miles
x.train <- cbind(wnv.train.weather$Longitude, wnv.train.weather
$Latitude)
dbs.train <- dbscan(x.train, 0.003623)
plot(dbs.train, x.train)
print(dbs.train)
prediction.train <- predict(dbs.train, x.train)
#length(prediction.train)
#unique(prediction.train)
wnv.train.weather$loc.cluster.qmile <- as.factor(prediction.train)
x.test <- cbind(wnv.test.weather.sorted$Longitude,
wnv.test.weather.sorted$Latitude)
prediction.test <- predict(dbs.train, x.train, x.test)
#length(prediction.test)
#unique(prediction.test)
wnv.test.weather.sorted$loc.cluster.2mile <-
as.factor(prediction.test)
#Clustering Locations - K-
Means-----------------------------------------------------------------
-----------------------
#With normalized Longitude & Latitude
train.means <- apply(wnv.train.weather[c("Longitude", "Latitude")], 2,
mean)
train.sds <-apply(wnv.train.weather[c("Longitude", "Latitude")], 2,
sd)
train.loc.norm <- scale(wnv.train.weather[c("Longitude",
"Latitude")],center=means,scale=sds)
test.means <- apply(wnv.test.weather.sorted[c("Longitude",
"Latitude")], 2, mean)
test.sds <-apply(wnv.test.weather.sorted[c("Longitude", "Latitude")],
2, sd)
test.loc.norm <- scale(wnv.test.weather.sorted[c("Longitude",
"Latitude")],center=means,scale=sds)
set.seed(123)
clusters.kmeans.norm <- kmeans(train.loc.norm, 20)

clusters.kmeans.norm$centers
plot(clusters.kmeans.norm$centers)
pred.cluster.norm.train <- predict(clusters.kmeans.norm,
data=train.loc.norm)
pred.cluster.norm.test <- predict(clusters.kmeans.norm,
data=test.loc.norm)
#With un-normalized Longitude & Latitude
train.loc <- wnv.train.weather[c("Longitude", "Latitude")]
test.loc <- wnv.test.weather.sorted[c("Longitude", "Latitude")]
set.seed(123)
clusters.kmeans <- kmeans(train.loc, 25)
clusters.kmeans$centers
plot(clusters.kmeans$centers)
pred.cluster.train <- predict(clusters.kmeans.norm,
data=train.loc.norm)
pred.cluster.test <- predict(clusters.kmeans.norm, data=test.loc.norm)
#Assign to training/testing set
wnv.train.weather$kmeans.cluster.n <-
as.factor(pred.cluster.norm.train)
wnv.test.weather.sorted$kmeans.cluster.n <-
as.factor(pred.cluster.norm.test)
#PCA------------------------------------------------------------------
-------------------------------------------------
keeps1 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb",
"StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed",
"acc.deg.day", "temp.mov.avg.wk", "prec.mov.avg.wk",
"prec.mov.sum.wk", "dd.mov.sum.wk" )
keeps2 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb",
"StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed")
#model.pca <- princomp(wnv.train.weather[keeps1], cor=TRUE,

scores=TRUE)
#model.pca
model.pca.train1 <- princomp(wnv.train.weather[keeps1], cor=TRUE,
scores=TRUE)
model.pca.test1 <- princomp(wnv.test.weather.sorted[keeps1], cor=TRUE,
scores=TRUE)
model.pca.train <- princomp(wnv.train.weather[keeps2], cor=TRUE,
scores=TRUE)
model.pca.test <- princomp(wnv.test.weather.sorted[keeps2], cor=TRUE,
scores=TRUE)
plot(model.pca.train)
summary(model.pca.train)
plot(model.pca.test)
summary(model.pca.test)
plot(model.pca.train1)
summary(model.pca.train1)
model.pca.train$scores
model.pca.test$scores
model.pca.train$loadings
model.pca.test$loadings
model.pca.train1$loadings
pca.out.train <- data.frame(model.pca.train$scores)
pca.out.test <- data.frame(model.pca.test$scores)
wnv.train.weather <- cbind(wnv.train.weather, pca.out.train)
wnv.test.weather.sorted <- cbind(wnv.test.weather.sorted,
pca.out.test)
#wnv.train.weather.pca <- cbind(wnv.train.weather, pca.out)
#Dealing with unbalanced
classes---------------------------------------------------------------

------------------------
wnv.train.no <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent ==
0,]
wnv.train.yes <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent ==
1,]
set.seed(1234)
wnv.train.no.75 <- wnv.train.no[sample(1:nrow(wnv.train.no), 5852,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.75, wnv.train.yes)
set.seed(1234)
replace=FALSE),]
set.seed(1234)
replace=FALSE),]
set.seed(1234)
replace=FALSE),]
set.seed(1234)
replace=FALSE),]
#70/30 Split--------------------------------------------
set.seed(1234)
#set.seed(4321)
ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7,
0.3))
trainData <- wnv.train.weather[ind==1,]
testData <- wnv.train.weather[ind==2,]
#GAM
Modeling--------------------------------------------------------------

--------------------------------------------------
#Experiment with PCA and Kmeans-------------------------------
train.model <- gam(WnvPresent ~ s(dWeek) + Species2 + kmeans.cluster.n
+ s(Comp.1)
+ s(Comp.2)
+ s(Comp.3)
+ s(Comp.4)
+ s(Comp.5)
+ s(acc.deg.day)
+ s(temp.mov.avg.wk)
+ s(prec.mov.avg.2wk)
#+ s(prec.mov.sum.wk)
, data = trainData, family="binomial", bf.maxit =
1000)
prediction.gam <-predict(train.model, newdata = testData, type =
"response")
auc(testData$WnvPresent, prediction.gam)
#-------------------------------------------------------------
fitSubmit <- update(train.model, data=wnv.train.weather)
pSubmit.gam<-predict(fitSubmit, newdata = wnv.test.weather.sorted,
type = "response")
submissionFile<-cbind(test$Id,pSubmit.gam)
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi134prec.csv",row.names=FALSE
,quote=FALSE)
#GLM------------------------------------------------------------------
--------------------------------------------------------
#Create Squared Term deg.day, dWeek, dd.mov.sum.2wk
wnv.train.weather$acc.deg.day2 <- (wnv.train.weather$acc.deg.day)**2
wnv.train.weather$dWeek2 <- (wnv.train.weather$dWeek)**2
wnv.train.weather$dd.mov.sum.2wk2 <- (wnv.train.weather$dd.mov.sum.
2wk)**2
wnv.test.weather.sorted$acc.deg.day2 <- (wnv.test.weather.sorted

$acc.deg.day)**2
wnv.test.weather.sorted$dWeek2 <- (wnv.test.weather.sorted$dWeek)**2
wnv.test.weather.sorted$dd.mov.sum.2wk2 <- (wnv.test.weather.sorted
$dd.mov.sum.2wk)**2
#Experiment with PCA and Kmeans--------------------------------
log.model <- glm(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n
+ dWeek2
+ Comp.1
#+ Comp.2
#+ Comp.3
#+ Comp.4
#+ Comp.5
#+ acc.deg.day
#+ acc.deg.day2
+ temp.mov.avg.2wk
+ prec.mov.avg.2wk
, data = trainData, family=binomial())
summary(log.model)
prediction.glm <-predict(log.model, newdata = testData, type =
"response")
auc(testData$WnvPresent, prediction.glm)
#------------------------------------------------------------
fitSubmit <- update(log.model, data=wnv.train.weather)
pSubmit.glm <-predict(fitSubmit, newdata = wnv.test.weather.sorted,
type = "response")
#summary(pSubmit.glm)
submissionFile<-cbind(test$Id,pSubmit.glm)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#Base Model Submission AUC: 0.64569
#Base Train AUC: 0.78259055
#LDA------------------------------------------------------------------
--------------------------------------------------------

lda.model <- lda(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
+ acc.deg.day2
#+ temp.mov.avg.2wk
+ prec.mov.avg.wk
+ prec.mov.sum.2wk
,data = trainData)
prediction.lda <- predict(lda.model, testData)
lda.df <- data.frame(prediction.lda$posterior)
auc(testData$WnvPresent, lda.df[,2])
#-------------------------------------------------------------------
fitSubmit <- update(lda.model, data=wnv.train.weather)
pSubmit<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type =
"probability")
lda.test <- data.frame(pSubmit$posterior)
submissionFile<-cbind(test$Id,lda.df.test[,2])
FALSE)
#Random
Forest----------------------------------------------------------------
------------------------------------------------
set.seed(1234)
rf.model <- randomForest(WnvPresent.C ~ dWeek + Species2 +
kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
#+ temp.mov.avg.2wk

#+ prec.mov.avg.2wk
#+ prec.mov.sum.2wk
, data = trainData, nTree = 1000)
prediction.rf <- predict(rf.model, testData, type = "prob")
rf.df <- data.frame(prediction.rf)
auc(testData$WnvPresent.C, rf.df[,2])
fitSubmit <- update(rf.model, data=wnv.train.weather)
pSubmit.rf<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type
= "prob")
rf.df.test <- data.frame(pSubmit.rf)
submissionFile<-cbind(test$Id, rf.df.test[,2])
FALSE)
#PCA and Kmeans-------------------
set.seed(1234)
rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
#+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
#+ prec.mov.sum.wk
,data = trainData,
controls=cforest_unbiased(ntree=200, mtry=2))
prediction.rf <- predict(rf.model, newdata=testData, type = "prob")
x <- vector()
for (i in seq(2, 6232, 2)) {
x <- c(x, rf.df[1, i])
}
auc(testData$WnvPresent.C, x)
set.seed(1234)

rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
#+ Comp.3
#+ Comp.4
#+ Comp.5
#+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
,data = wnv.train.weather,
controls=cforest_unbiased(ntree=10, mtry=2, maxdepth = 10))
prediction.rf <- predict(rf.model, newdata=wnv.test.weather.sorted,
type = "prob")
#H20 Random Forest--------------
library(h2o)
localH2O <- h2o.init(nthreads = -1,max_mem_size = '7g')
rf.model <- h2o.randomForest(x=c(3, 9, 43, 44:48),y = 53,data =
trainData,
mtries = 18,
sample.rate = 0.5,
classification = T,ntree = 200,verbose =
T)
prediction.rf <- h2o.predict(rf.model,testData)
#Support Vector
Machines--------------------------------------------------------------
----------------------------------------
#PCA and kmeans----------------------------------------------------
set.seed(1234)
svm.model <- svm(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
+ temp.mov.avg.2wk
#+ prec.mov.avg.wk

#+ prec.mov.sum.2wk
, data = trainData, probability = TRUE)
#summary(svm.model)
prediction.svm <- predict(svm.model, testData, probability = TRUE)
svm.df <- data.frame(attr(prediction.svm, "probabilities"))
auc(testData$WnvPresent.C, svm.df[,2])
#------------------------------------------------------------------
set.seed(1234)
fitSubmit <- update(svm.model, data=wnv.train.weather)
pSubmit.svm<-predict(fitSubmit, newdata = wnv.test.weather.sorted,
probability = TRUE)
svm.df.test <- data.frame(attr(pSubmit.svm, "probabilities"))
submissionFile<-cbind(test$Id, svm.df.test[,2])
FALSE)
#Ensemble-------------------------------------------------------------
--------------------------------------------------------
gam.train <- as.matrix(prediction.gam)
glm.train <- as.matrix(prediction.glm)
lda.train <- as.matrix(lda.df[,2])
rf.train <- as.matrix(rf.df[,2])
svm.train <- as.matrix(svm.df[,2])
ens.train <- cbind(gam.train, lda.train, rf.train, svm.train)
gam.test <- as.matrix(pSubmit.gam)
glm.test <- as.matrix(pSubmit.glm)
lda.test <- as.matrix(lda.df.test[,2])
rf.test <- as.matrix(rf.df.test[,2])
svm.test <- as.matrix(svm.df.test[,2])
ens.test <- cbind(lda.test, rf.test, svm.test)
w <- matrix(c(0.0, 0.3, 0.1, 0.6), nrow = 4, ncol = 1)
ens.model <- ens.train %*% w
auc(testData$WnvPresent, ens.model)
w <- matrix(c(0.3, 0.1, 0.6), nrow = 3, ncol = 1)
ens.model.test <- ens.test %*% w

submissionFile<-cbind(test$Id,ens.model.test)
write.csv(submissionFile,"Submission_MaxLiBest3.csv",row.names=FALSE,q
uote=FALSE)
#Bagging--------------------------------------------------------------
------------------------------------------------------
set.seed(1234)
ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7,
0.3))
trainData <- wnv.train.weather[ind==1,]
testData <- wnv.train.weather[ind==2,]
set.seed(4321)
all_data <- wnv.train.weather
positions <- sample(nrow(all_data),size=floor((nrow(all_data)/10)*7))
trainBag<- all_data[positions,]
testBag<- all_data[-positions,]
library(foreach)
length_divisor<-6
iterations<-50
predictions<-foreach(m=1:iterations,.combine=cbind) %do% {
training_positions <- sample(nrow(trainData),
size=floor((nrow(trainData)/length_divisor)))
train_pos<-1:nrow(trainData) %in% training_positions
gam.bag <- gam(WnvPresent ~ lo(dWeek) + Species2 + lo(Longitude,
Latitude) + s(acc.deg.day), data = trainData[train_pos,]
, family="binomial", bf.maxit = 45)
predict(gam.bag, newdata = testData, type = "response")
}
predictions<-data.frame(rowMeans(predictions))
auc(testData$WnvPresent.C, predictions)
#--------------------------------------------------------------------
library(foreach)
length_divisor<-6
iterations<-1000
predictions<-foreach(m=1:iterations,.combine=cbind) %do% {
training_positions <- sample(nrow(trainData),
size=floor((nrow(trainData)/length_divisor)))

train_pos<-1:nrow(trainData) %in% training_positions
glm.bag <- glm(WnvPresent ~ dWeek + dWeek2 + Species2 + Longitude +
acc.deg.day + acc.deg.day2
, data = trainData[train_pos,], family=binomial())
#summary(log.model)
predict(glm.bag, newdata = testData, type = "response")
}
predictions<-data.frame(rowMeans(predictions))
auc(testData$WnvPresent.C, predictions)
#Vs.
gam.test <- gam(WnvPresent ~ lo(dWeek) + Species2 + loc.cluster.qmile,
data = trainBag, family="binomial", bf.maxit = 200)
prediction.gam <-predict(gam.test, newdata = testBag, type =
"response")
auc(testBag$WnvPresent, prediction.gam)
#Adabag
Exploration-----------------------------------------------------------
---------------------------------------------
wnv.adaboost <- boosting(WnvPresent.C ~ dWeek, data = trainData,
boos=TRUE, mfinal = 5)

West-Nile-Virus | Kaggle

More Related Content

What's hot (20)

Similar to West-Nile-Virus | Kaggle (20)

West-Nile-Virus | Kaggle