#install.packages("ISOweek")
#install.packages("stringr")
#install.packages("caTools")
#install.packages("TTR")
#install.packages("gam")
#install.packages("data.table")
#install.packages("Metrics")
#install.packages("randomForest")
#install.packages("MASS")
#install.packages("e1071")
#install.packages("fpc", dependencies = TRUE)
#install.packages("geosphere")
#install.packages("scatterplot3d")
#install.packages("party")
#install.packages("adabag")
#install.packages("DeducerExtras")
#install.packages("h2o")
library(ISOweek)
library(stringr)
library(caTools)
library(TTR)
require(gam)
library(data.table)
library(Metrics)
require(randomForest)
require(MASS)
require(e1071)
library(fpc)
library(geosphere)
library(scatterplot3d)
require(party)
require(adabag)
require(DeducerExtras)
require(h2o)
#Read Raw
Data------------------------------------------------------------------
------------------------------------------------
train <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/train.csv")
test <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working
Folder/test.csv")
weather <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/weather.csv")
spray <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
Working Folder/spray.csv")
#Get Week Number and ReCategorize
Species---------------------------------------------------------------
------------------------
vSpecies<-c(as.character(train$Species),as.character(test$Species))
vSpecies[vSpecies=="UNSPECIFIED CULEX"]<-"CULEX ERRATICUS"
vSpecies[-which(vSpecies == "CULEX PIPIENS" |
vSpecies == "CULEX PIPIENS/RESTUANS" |
vSpecies == "CULEX RESTUANS")] = "CULEX OTHER"
vSpecies<-factor(vSpecies,levels=unique(vSpecies))
## data.table syntax for adding a column; could overwrite the existing
column as well
train[,Species2:=factor(vSpecies[1:nrow(train)],levels=unique(vSpecies
))]
test[,Species2:=factor(vSpecies[(nrow(train)
+1):length(vSpecies)],levels=unique(vSpecies))]
## also add some fields for components of the date using simple
substrings
train[,dMonth:=as.factor(paste(substr(train$Date,6,7)))]
train[,dYear:=as.factor(paste(substr(train$Date,1,4)))]
train$Date = as.Date(train$Date, format="%Y-%m-%d")
xsDate = as.Date(paste0(train$dYear, "0101"), format="%Y%m%d")
train$dWeek = as.numeric(paste(floor((train$Date - xsDate + 1)/7)))
test[,dMonth:=as.factor(paste(substr(test$Date,6,7)))]
test[,dYear:=as.factor(paste(substr(test$Date,1,4)))]
test$Date = as.Date(test$Date, format="%Y-%m-%d")
tsDate = as.Date(paste0(test$dYear, "0101"), format="%Y%m%d")
test$dWeek = as.numeric(paste(floor((test$Date - tsDate + 1)/7)))
weather[,dMonth:=as.factor(paste(substr(weather$Date,6,7)))]
weather[,dYear:=as.factor(paste(substr(weather$Date,1,4)))]
weather$Date = as.Date(weather$Date, format="%Y-%m-%d")
tsDate = as.Date(paste0(weather$dYear, "0101"), format="%Y%m%d")
weather$dWeek = as.numeric(paste(floor((weather$Date - tsDate + 1)/
7)))
#Train Data Pre-
processing------------------------------------------------------------
------------------------------------------
wnv.train <- data.frame(train[,list(Date, dWeek, Species, Trap,
NumMosquitos, Latitude, Longitude, Species2, WnvPresent)])
#Cast Date Type
wnv.train$Date <- as.Date(wnv.train$Date)
oh.lat <- 41.995
oh.long <- -87.933
mw.lat <- 41.786
mw.long <- -87.752
for (i in 1:nrow(wnv.train)) {
#Euclidean Distances
wnv.train$dist.oh[i] <- sqrt((wnv.train$Latitude[i]-oh.lat)**2+
(wnv.train$Longitude[i]-oh.long)**2)*111
wnv.train$dist.mw[i] <- sqrt((wnv.train$Latitude[i]-mw.lat)**2+
(wnv.train$Longitude[i]-mw.long)**2)*111
#Haversine Distance
wnv.train$disth.oh[i] <- distHaversine(c(wnv.train$Longitude[i],
wnv.train$Latitude[i]), c(oh.long, oh.lat))/1000
wnv.train$disth.mw[i] <- distHaversine(c(wnv.train$Longitude[i],
wnv.train$Latitude[i]), c(mw.long, mw.lat))/1000
if (wnv.train$dist.oh[i] >= wnv.train$dist.mw[i]) {
wnv.train$Station.Euc[i] <- 2 #2 is Midway
}
else {
wnv.train$Station.Euc[i] <- 1 #1 is O'Hare
}
if (wnv.train$disth.oh[i] >= wnv.train$disth.mw[i]) {
wnv.train$Station[i] <- 2 #2 is Midway
}
else {
wnv.train$Station[i] <- 1 #1 is O'Hare
}
#Creating ISO Week Number
#wnv.train$week.num[i] <- substring(ISOweek(wnv.train$Date[i]), 7,
8)
}
#Weather Data
Cleaning--------------------------------------------------------------
----------------------------------------------
#Choose Variables
wnv.weather <- data.frame(weather[,list(Station, Date, Tmax, Tmin,
Tavg, Depart, DewPoint, WetBulb, Heat, Cool, PrecipTotal, StnPressure,
SeaLevel, ResultSpeed,
ResultDir, AvgSpeed)])
#Cast Date Type
wnv.weather$Date <- as.Date(wnv.weather$Date)
#Create Week Number Variable
for (i in 1:nrow(wnv.weather)) {
wnv.weather$week.num[i] <- as.numeric(substring(ISOweek(wnv.weather
$Date[i]), 7, 8)) #Create Week Markers
wnv.weather$date.year[i] <- as.numeric(format(wnv.weather$Date[i],
"%Y"))
#wnv.weather$date.month[i] <- as.numeric(format(wnv.weather$Date[i],
"%m"))
#wnv.weather$date.day[i] <- as.numeric(format(wnv.weather$Date[i],
"%d"))
}
#Clean Temperature
wnv.weather$Tmax <- as.numeric(as.character(wnv.weather$Tmax))
wnv.weather$Tmin <- as.numeric(as.character(wnv.weather$Tmin))
wnv.weather$Tavg <- as.numeric(as.character(wnv.weather$Tavg))
wnv.weather$Tavg[is.na(wnv.weather$Tavg)] <- (wnv.weather
$Tmax[is.na(wnv.weather$Tavg)] + wnv.weather$Tmin[is.na(wnv.weather
$Tavg)])/2
#Clean DewPoint
wnv.weather$DewPoint <- as.numeric(as.character(wnv.weather$DewPoint))
#Clean Depart
wnv.weather$Depart[wnv.weather$Depart=="M"] <- NA
wnv.weather$Depart <- as.numeric(as.character(wnv.weather$Depart))
#Clean WetBulb
wnv.weather$WetBulb <- as.numeric(as.character(wnv.weather$WetBulb))
wnv.weather$WetBulb[849]<- wnv.weather$WetBulb[850]
wnv.weather$WetBulb[2411]<- wnv.weather$WetBulb[2412]
wnv.weather$WetBulb[2413]<- wnv.weather$WetBulb[2414]
wnv.weather$WetBulb[2416]<- wnv.weather$WetBulb[2415]
#Clean Heat
wnv.weather$Heat <- as.numeric(as.character(wnv.weather$Heat))
wnv.weather$Heat[8]<- wnv.weather$Heat[7]
wnv.weather$Heat[506]<- wnv.weather$Heat[505]
wnv.weather$Heat[676]<- wnv.weather$Heat[675]
wnv.weather$Heat[1638]<- wnv.weather$Heat[1637]
wnv.weather$Heat[2068]<- wnv.weather$Heat[2067]
wnv.weather$Heat[2212]<- wnv.weather$Heat[2411]
wnv.weather$Heat[2502]<- wnv.weather$Heat[2501]
wnv.weather$Heat[2512]<- wnv.weather$Heat[2511]
wnv.weather$Heat[2526]<- wnv.weather$Heat[2525]
wnv.weather$Heat[2580]<- wnv.weather$Heat[2579]
wnv.weather$Heat[2812]<- wnv.weather$Heat[2811]
#Clean Cool
wnv.weather$Cool <- as.numeric(as.character(wnv.weather$Cool))
wnv.weather$Cool[8]<- wnv.weather$Cool[7]
wnv.weather$Cool[506]<- wnv.weather$Cool[505]
wnv.weather$Cool[676]<- wnv.weather$Cool[675]
wnv.weather$Cool[1638]<- wnv.weather$Cool[1637]
wnv.weather$Cool[2068]<- wnv.weather$Cool[2067]
wnv.weather$Cool[2212]<- wnv.weather$Cool[2411]
wnv.weather$Cool[2502]<- wnv.weather$Cool[2501]
wnv.weather$Cool[2512]<- wnv.weather$Cool[2511]
wnv.weather$Cool[2526]<- wnv.weather$Cool[2525]
wnv.weather$Cool[2580]<- wnv.weather$Cool[2579]
wnv.weather$Cool[2812]<- wnv.weather$Cool[2811]
#Clean PrecipTotal
wnv.weather$PrecipTotal <- as.numeric(as.character(wnv.weather
$PrecipTotal))
wnv.weather$PrecipTotal[is.na(wnv.weather$PrecipTotal)] <- 0
#Clean StnPressure
wnv.weather$StnPressure <- as.numeric(as.character(wnv.weather
$StnPressure))
wnv.weather$StnPressure[88]<- wnv.weather$StnPressure[87]
wnv.weather$StnPressure[849]<- wnv.weather$StnPressure[850]
wnv.weather$StnPressure[2411]<- wnv.weather$StnPressure[2410]
wnv.weather$StnPressure[2412]<- wnv.weather$StnPressure[2410]
#Clean SeaLevel
wnv.weather$SeaLevel <- as.numeric(as.character(wnv.weather$SeaLevel))
wnv.weather$SeaLevel[88]<- wnv.weather$SeaLevel[87]
wnv.weather$SeaLevel[833]<- wnv.weather$SeaLevel[834]
wnv.weather$SeaLevel[995]<- wnv.weather$SeaLevel[996]
wnv.weather$SeaLevel[1733]<- wnv.weather$SeaLevel[1734]
wnv.weather$SeaLevel[1746]<- wnv.weather$SeaLevel[1745]
wnv.weather$SeaLevel[1757]<- wnv.weather$SeaLevel[1758]
wnv.weather$SeaLevel[2068]<- wnv.weather$SeaLevel[2067]
wnv.weather$SeaLevel[2091]<- wnv.weather$SeaLevel[2092]
wnv.weather$SeaLevel[2744]<- wnv.weather$SeaLevel[2743]
#Clean ResultSpeed
wnv.weather$ResultSpeed <- as.numeric(as.character(wnv.weather
$ResultSpeed))
#Clean ResultDir
wnv.weather$ResultDir <- as.numeric(as.character(wnv.weather
$ResultDir))
#Clean AvgSpeed
wnv.weather$AvgSpeed <- as.numeric(as.character(wnv.weather$AvgSpeed))
wnv.weather$AvgSpeed[is.na(wnv.weather$AvgSpeed)] <- mean(wnv.weather
$AvgSpeed, na.rm =TRUE)
#Split weather data into O'Hare and Midway
wnv.weather.oh <- subset(wnv.weather, wnv.weather$Station == 1)
wnv.weather.mw <- subset(wnv.weather, wnv.weather$Station == 2)
#Create New Weather Features: temp.mov.avg, prec.mov.avg, deg.day,
accumulated degree day for each year
Tbase <- 71.6 #22deg C = 71.6deg F
wnv.weather.oh$deg.day <- 0
wnv.weather.oh$deg.day.c <- 0
wnv.weather.oh$acc.deg.day <- 0
wnv.weather.oh$acc.deg.day.c <- 0
newyear <- TRUE
#For O'Hare
start <- 1
for (year in 2007:2014) {
iterations <- nrow(subset(wnv.weather.oh, wnv.weather.oh$date.year
== year ))
for (i in start:(start+iterations-1)) {
Tmean <- (wnv.weather.oh$Tmax[i] + wnv.weather.oh$Tmin[i])/2
if ( Tmean > Tbase) {
wnv.weather.oh$deg.day[i] <- Tmean - Tbase
wnv.weather.oh$deg.day.c[i] <- 0
} else {
wnv.weather.oh$deg.day[i] <- 0
wnv.weather.oh$deg.day.c[i] <- Tbase - Tmean
}
if (newyear == TRUE) {
newyear <- FALSE
wnv.weather.oh$acc.deg.day[i] <- 0
} else {
wnv.weather.oh$acc.deg.day[i] <- wnv.weather.oh$deg.day[i] +
wnv.weather.oh$acc.deg.day[i-1]
wnv.weather.oh$acc.deg.day.c[i] <- wnv.weather.oh$deg.day.c[i] +
wnv.weather.oh$acc.deg.day.c[i-1]
}
}
start <- start + iterations
newyear <- TRUE
}
#For Midway
Tbase <- 71.6 #22deg C = 71.6deg F
wnv.weather.mw$deg.day <- 0
wnv.weather.mw$deg.day.c <- 0
wnv.weather.mw$acc.deg.day <- 0
wnv.weather.mw$acc.deg.day.c <- 0
newyear <- TRUE
start <- 1
for (year in 2007:2014) {
iterations <- nrow(subset(wnv.weather.mw, wnv.weather.mw$date.year
== year ))
for (i in start:(start+iterations-1)) {
Tmean <- (wnv.weather.mw$Tmax[i] + wnv.weather.mw$Tmin[i])/2
if ( Tmean > Tbase) {
wnv.weather.mw$deg.day[i] <- Tmean - Tbase
wnv.weather.mw$deg.day.c[i] <- 0
} else {
wnv.weather.mw$deg.day[i] <- 0
wnv.weather.mw$deg.day.c[i] <- Tbase - Tmean
}
if (newyear == TRUE) {
newyear <- FALSE
wnv.weather.mw$acc.deg.day[i] <- 0
} else {
wnv.weather.mw$acc.deg.day[i] <- wnv.weather.mw$deg.day[i] +
wnv.weather.mw$acc.deg.day[i-1]
wnv.weather.mw$acc.deg.day.c[i] <- wnv.weather.mw$deg.day.c[i] +
wnv.weather.mw$acc.deg.day.c[i-1]
}
}
start <- start + iterations
newyear <- TRUE
}
#Creating Moving Averages and Moving Sums
oh.temp.mov.avg.wk <- vector()
oh.temp.mov.avg.2wk <- vector()
oh.prec.mov.avg.wk <- vector()
oh.prec.mov.avg.2wk <- vector()
oh.prec.mov.sum.wk <- vector()
oh.prec.mov.sum.2wk <- vector()
oh.dd.mov.sum.wk <- vector()
oh.dd.mov.sum.2wk <- vector()
mw.temp.mov.avg.wk <- vector()
mw.temp.mov.avg.2wk <- vector()
mw.prec.mov.avg.wk <- vector()
mw.prec.mov.avg.2wk <- vector()
mw.prec.mov.sum.wk <- vector()
mw.prec.mov.sum.2wk <- vector()
mw.dd.mov.sum.wk <- vector()
mw.dd.mov.sum.2wk <- vector()
for (year in 2007:2014) {
#O'Hare
assign(paste("oh.temp.mov.avg.wk", year, sep=""),
runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year ==
year], 7, alg="C", endrule="mean"))
assign(paste("oh.temp.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year ==
year], 14, alg="C", endrule="mean"))
assign(paste("oh.prec.mov.avg.wk", year, sep=""),
runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year
== year], 7, alg="C", endrule="mean"))
assign(paste("oh.prec.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year
== year], 14, alg="C", endrule="mean"))
assign(paste("oh.prec.mov.sum.wk", year, sep=""),
runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("oh.prec.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year ==
year], n = 14, cumulative = FALSE))
assign(paste("oh.dd.mov.sum.wk", year, sep=""),
runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("oh.dd.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year ==
year], n = 14, cumulative = FALSE))
oh.temp.mov.avg.wk <- c(oh.temp.mov.avg.wk,
get(paste("oh.temp.mov.avg.wk", year, sep="")))
oh.temp.mov.avg.2wk <- c(oh.temp.mov.avg.2wk,
get(paste("oh.temp.mov.avg.2wk", year, sep="")))
oh.prec.mov.avg.wk <- c(oh.prec.mov.avg.wk,
get(paste("oh.prec.mov.avg.wk", year, sep="")))
oh.prec.mov.avg.2wk <- c(oh.prec.mov.avg.2wk,
get(paste("oh.prec.mov.avg.2wk", year, sep="")))
oh.prec.mov.sum.wk <- c(oh.prec.mov.sum.wk,
get(paste("oh.prec.mov.sum.wk", year, sep="")))
oh.prec.mov.sum.2wk <- c(oh.prec.mov.sum.2wk,
get(paste("oh.prec.mov.sum.2wk", year, sep="")))
oh.dd.mov.sum.wk <- c(oh.dd.mov.sum.wk,
get(paste("oh.dd.mov.sum.wk", year, sep="")))
oh.dd.mov.sum.2wk <- c(oh.dd.mov.sum.2wk, get(paste("oh.dd.mov.sum.
2wk", year, sep="")))
#Midway
assign(paste("mw.temp.mov.avg.wk", year, sep=""),
runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year ==
year], 7, alg="C", endrule="mean"))
assign(paste("mw.temp.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year ==
year], 14, alg="C", endrule="mean"))
assign(paste("mw.prec.mov.avg.wk", year, sep=""),
runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year
== year], 7, alg="C", endrule="mean"))
assign(paste("mw.prec.mov.avg.2wk", year, sep=""),
runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year
== year], 14, alg="C", endrule="mean"))
assign(paste("mw.prec.mov.sum.wk", year, sep=""),
runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("mw.prec.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year ==
year], n = 14, cumulative = FALSE))
assign(paste("mw.dd.mov.sum.wk", year, sep=""),
runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year ==
year], n = 7, cumulative = FALSE))
assign(paste("mw.dd.mov.sum.2wk", year, sep=""),
runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year ==
year], n = 14, cumulative = FALSE))
mw.temp.mov.avg.wk <- c(mw.temp.mov.avg.wk,
get(paste("mw.temp.mov.avg.wk", year, sep="")))
mw.temp.mov.avg.2wk <- c(mw.temp.mov.avg.2wk,
get(paste("mw.temp.mov.avg.2wk", year, sep="")))
mw.prec.mov.avg.wk <- c(mw.prec.mov.avg.wk,
get(paste("mw.prec.mov.avg.wk", year, sep="")))
mw.prec.mov.avg.2wk <- c(mw.prec.mov.avg.2wk,
get(paste("mw.prec.mov.avg.2wk", year, sep="")))
mw.prec.mov.sum.wk <- c(mw.prec.mov.sum.wk,
get(paste("mw.prec.mov.sum.wk", year, sep="")))
mw.prec.mov.sum.2wk <- c(mw.prec.mov.sum.2wk,
get(paste("mw.prec.mov.sum.2wk", year, sep="")))
mw.dd.mov.sum.wk <- c(mw.dd.mov.sum.wk,
get(paste("mw.dd.mov.sum.wk", year, sep="")))
mw.dd.mov.sum.2wk <- c(mw.dd.mov.sum.2wk, get(paste("mw.dd.mov.sum.
2wk", year, sep="")))
}
#Add Variable Matrices to Weather Data Frame
wnv.weather.oh$temp.mov.avg.wk <- oh.temp.mov.avg.wk
wnv.weather.oh$temp.mov.avg.2wk <- oh.temp.mov.avg.2wk
wnv.weather.oh$prec.mov.avg.wk <- oh.prec.mov.avg.wk
wnv.weather.oh$prec.mov.avg.2wk <- oh.prec.mov.avg.2wk
wnv.weather.oh$prec.mov.sum.wk <- oh.prec.mov.sum.wk
wnv.weather.oh$prec.mov.sum.2wk <- oh.prec.mov.sum.2wk
wnv.weather.oh$dd.mov.sum.wk <- oh.dd.mov.sum.wk
wnv.weather.oh$dd.mov.sum.2wk <- oh.dd.mov.sum.2wk
wnv.weather.oh$prec.mov.sum.wk[is.na(wnv.weather.oh$prec.mov.sum.wk)]
<- 0
wnv.weather.oh$prec.mov.sum.2wk[is.na(wnv.weather.oh$prec.mov.sum.
2wk)] <- 0
wnv.weather.oh$dd.mov.sum.wk[is.na(wnv.weather.oh$dd.mov.sum.wk)] <- 0
wnv.weather.oh$dd.mov.sum.2wk[is.na(wnv.weather.oh$dd.mov.sum.2wk)] <-
0
wnv.weather.mw$temp.mov.avg.wk <- mw.temp.mov.avg.wk
wnv.weather.mw$temp.mov.avg.2wk <- mw.temp.mov.avg.2wk
wnv.weather.mw$prec.mov.avg.wk <- mw.prec.mov.avg.wk
wnv.weather.mw$prec.mov.avg.2wk <- mw.prec.mov.avg.2wk
wnv.weather.mw$prec.mov.sum.wk <- mw.prec.mov.sum.wk
wnv.weather.mw$prec.mov.sum.2wk <- mw.prec.mov.sum.2wk
wnv.weather.mw$dd.mov.sum.wk <- mw.dd.mov.sum.wk
wnv.weather.mw$dd.mov.sum.2wk <- mw.dd.mov.sum.2wk
wnv.weather.mw$prec.mov.sum.wk[is.na(wnv.weather.mw$prec.mov.sum.wk)]
<- 0
wnv.weather.mw$prec.mov.sum.2wk[is.na(wnv.weather.mw$prec.mov.sum.
2wk)] <- 0
wnv.weather.mw$dd.mov.sum.wk[is.na(wnv.weather.mw$dd.mov.sum.wk)] <- 0
wnv.weather.mw$dd.mov.sum.2wk[is.na(wnv.weather.mw$dd.mov.sum.2wk)] <-
0
options(scipen=100)
#Test Data Pre-
processing------------------------------------------------------------
------------------------------------------
wnv.test <- data.frame(test[,list(Date, Id, dWeek, Species, Trap,
Latitude, Longitude, Species2)])
#Cast Date Type
wnv.test$Date <- as.Date(wnv.test$Date)
oh.lat <- 41.995
oh.long <- -87.933
mw.lat <- 41.786
mw.long <- -87.752
for (i in 1:nrow(wnv.test)) {
#Euclidean Distances
wnv.test$dist.oh[i] <- sqrt((wnv.test$Latitude[i]-oh.lat)**2+
(wnv.test$Longitude[i]-oh.long)**2)*111
wnv.test$dist.mw[i] <- sqrt((wnv.test$Latitude[i]-mw.lat)**2+
(wnv.test$Longitude[i]-mw.long)**2)*111
#Haversine Distance
wnv.test$disth.oh[i] <- distHaversine(c(wnv.test$Longitude[i],
wnv.test$Latitude[i]), c(oh.long, oh.lat))/1000
wnv.test$disth.mw[i] <- distHaversine(c(wnv.test$Longitude[i],
wnv.test$Latitude[i]), c(mw.long, mw.lat))/1000
if (wnv.test$dist.oh[i] >= wnv.test$dist.mw[i]) {
wnv.test$Station.Euc[i] <- 2 #2 is Midway
}
else {
wnv.test$Station.Euc[i] <- 1 #1 is O'Hare
}
if (wnv.test$disth.oh[i] >= wnv.test$disth.mw[i]) {
wnv.test$Station[i] <- 2 #2 is Midway
}
else {
wnv.test$Station[i] <- 1 #1 is O'Hare
}
#Creating ISO Week Number
#wnv.test$week.num[i] <- substring(ISOweek(wnv.test$Date[i]), 7, 8)
}
#Concatenating Clean Weather Datasets and Merging with Training/Test
Data----------------------------------------------------------
#wnv.train$Species2 <- train$Species2
#wnv.test$Species2 <- test$Species2
wnv.weather.clean <- rbind(wnv.weather.oh, wnv.weather.mw)
wnv.train.weather <- merge(wnv.train, wnv.weather.clean, by=c("Date",
"Station"))
wnv.test.weather <- merge(wnv.test, wnv.weather.clean, by=c("Date",
"Station"))
#Just use one station...
#Sorting wnv.test.weather
wnv.test.weather.sorted <- wnv.test.weather[order(wnv.test.weather
$Id),]
#Training/Testing
Split-----------------------------------------------------------------
------------------------------------
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2007,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2007,]
#-----------------------------------------------------
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2009,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2009,]
#-----------------------------------------------------
#Set aside 2011 data as test, and train on the remaining
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2011,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2011,]
#-----------------------------------------------------
wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year !
= 2013,]
wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year
== 2013,]
#Clustering Locations -
DBSCAN----------------------------------------------------------------
---------------------------
#Potential epsilon values:
#1/4 mile = 0.003623
#1/2 mile = 0.007246
#1 mile = 0.014493
#2 mile = 0.028986
#3 mile = 0.043479
#Based on 1deg Latitude/Longitude = 69 miles
x.train <- cbind(wnv.train.weather$Longitude, wnv.train.weather
$Latitude)
dbs.train <- dbscan(x.train, 0.003623)
plot(dbs.train, x.train)
print(dbs.train)
prediction.train <- predict(dbs.train, x.train)
#length(prediction.train)
#unique(prediction.train)
wnv.train.weather$loc.cluster.qmile <- as.factor(prediction.train)
x.test <- cbind(wnv.test.weather.sorted$Longitude,
wnv.test.weather.sorted$Latitude)
prediction.test <- predict(dbs.train, x.train, x.test)
#length(prediction.test)
#unique(prediction.test)
wnv.test.weather.sorted$loc.cluster.2mile <-
as.factor(prediction.test)
#Clustering Locations - K-
Means-----------------------------------------------------------------
-----------------------
#With normalized Longitude & Latitude
train.means <- apply(wnv.train.weather[c("Longitude", "Latitude")], 2,
mean)
train.sds <-apply(wnv.train.weather[c("Longitude", "Latitude")], 2,
sd)
train.loc.norm <- scale(wnv.train.weather[c("Longitude",
"Latitude")],center=means,scale=sds)
test.means <- apply(wnv.test.weather.sorted[c("Longitude",
"Latitude")], 2, mean)
test.sds <-apply(wnv.test.weather.sorted[c("Longitude", "Latitude")],
2, sd)
test.loc.norm <- scale(wnv.test.weather.sorted[c("Longitude",
"Latitude")],center=means,scale=sds)
set.seed(123)
clusters.kmeans.norm <- kmeans(train.loc.norm, 20)
clusters.kmeans.norm$centers
plot(clusters.kmeans.norm$centers)
pred.cluster.norm.train <- predict(clusters.kmeans.norm,
data=train.loc.norm)
pred.cluster.norm.test <- predict(clusters.kmeans.norm,
data=test.loc.norm)
#With un-normalized Longitude & Latitude
train.loc <- wnv.train.weather[c("Longitude", "Latitude")]
test.loc <- wnv.test.weather.sorted[c("Longitude", "Latitude")]
set.seed(123)
clusters.kmeans <- kmeans(train.loc, 25)
clusters.kmeans$centers
plot(clusters.kmeans$centers)
pred.cluster.train <- predict(clusters.kmeans.norm,
data=train.loc.norm)
pred.cluster.test <- predict(clusters.kmeans.norm, data=test.loc.norm)
#Assign to training/testing set
wnv.train.weather$kmeans.cluster.n <-
as.factor(pred.cluster.norm.train)
wnv.test.weather.sorted$kmeans.cluster.n <-
as.factor(pred.cluster.norm.test)
#PCA------------------------------------------------------------------
-------------------------------------------------
keeps1 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb",
"StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed",
"acc.deg.day", "temp.mov.avg.wk", "prec.mov.avg.wk",
"prec.mov.sum.wk", "dd.mov.sum.wk" )
keeps2 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb",
"StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed")
#model.pca <- princomp(wnv.train.weather[keeps1], cor=TRUE,
scores=TRUE)
#model.pca
model.pca.train1 <- princomp(wnv.train.weather[keeps1], cor=TRUE,
scores=TRUE)
model.pca.test1 <- princomp(wnv.test.weather.sorted[keeps1], cor=TRUE,
scores=TRUE)
model.pca.train <- princomp(wnv.train.weather[keeps2], cor=TRUE,
scores=TRUE)
model.pca.test <- princomp(wnv.test.weather.sorted[keeps2], cor=TRUE,
scores=TRUE)
plot(model.pca.train)
summary(model.pca.train)
plot(model.pca.test)
summary(model.pca.test)
plot(model.pca.train1)
summary(model.pca.train1)
model.pca.train$scores
model.pca.test$scores
model.pca.train$loadings
model.pca.test$loadings
model.pca.train1$loadings
pca.out.train <- data.frame(model.pca.train$scores)
pca.out.test <- data.frame(model.pca.test$scores)
wnv.train.weather <- cbind(wnv.train.weather, pca.out.train)
wnv.test.weather.sorted <- cbind(wnv.test.weather.sorted,
pca.out.test)
#wnv.train.weather.pca <- cbind(wnv.train.weather, pca.out)
#Dealing with unbalanced
classes---------------------------------------------------------------
------------------------
wnv.train.no <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent ==
0,]
wnv.train.yes <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent ==
1,]
set.seed(1234)
wnv.train.no.75 <- wnv.train.no[sample(1:nrow(wnv.train.no), 5852,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.75, wnv.train.yes)
set.seed(1234)
wnv.train.no.50 <- wnv.train.no[sample(1:nrow(wnv.train.no), 3901,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.50, wnv.train.yes)
set.seed(1234)
wnv.train.no.25 <- wnv.train.no[sample(1:nrow(wnv.train.no), 1951,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.25, wnv.train.yes)
set.seed(1234)
wnv.train.no.10 <- wnv.train.no[sample(1:nrow(wnv.train.no), 780,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.10, wnv.train.yes)
set.seed(1234)
wnv.train.no.5 <- wnv.train.no[sample(1:nrow(wnv.train.no), 312,
replace=FALSE),]
wnv.train.weather.1 <- rbind(wnv.train.no.5, wnv.train.yes)
#70/30 Split--------------------------------------------
set.seed(1234)
#set.seed(4321)
ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7,
0.3))
trainData <- wnv.train.weather[ind==1,]
testData <- wnv.train.weather[ind==2,]
#GAM
Modeling--------------------------------------------------------------
--------------------------------------------------
#Experiment with PCA and Kmeans-------------------------------
train.model <- gam(WnvPresent ~ s(dWeek) + Species2 + kmeans.cluster.n
+ s(Comp.1)
+ s(Comp.2)
+ s(Comp.3)
+ s(Comp.4)
+ s(Comp.5)
+ s(acc.deg.day)
+ s(temp.mov.avg.wk)
+ s(prec.mov.avg.2wk)
#+ s(prec.mov.sum.wk)
, data = trainData, family="binomial", bf.maxit =
1000)
prediction.gam <-predict(train.model, newdata = testData, type =
"response")
auc(testData$WnvPresent, prediction.gam)
#-------------------------------------------------------------
fitSubmit <- update(train.model, data=wnv.train.weather)
pSubmit.gam<-predict(fitSubmit, newdata = wnv.test.weather.sorted,
type = "response")
submissionFile<-cbind(test$Id,pSubmit.gam)
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi134prec.csv",row.names=FALSE
,quote=FALSE)
#GLM------------------------------------------------------------------
--------------------------------------------------------
#Create Squared Term deg.day, dWeek, dd.mov.sum.2wk
wnv.train.weather$acc.deg.day2 <- (wnv.train.weather$acc.deg.day)**2
wnv.train.weather$dWeek2 <- (wnv.train.weather$dWeek)**2
wnv.train.weather$dd.mov.sum.2wk2 <- (wnv.train.weather$dd.mov.sum.
2wk)**2
wnv.test.weather.sorted$acc.deg.day2 <- (wnv.test.weather.sorted
$acc.deg.day)**2
wnv.test.weather.sorted$dWeek2 <- (wnv.test.weather.sorted$dWeek)**2
wnv.test.weather.sorted$dd.mov.sum.2wk2 <- (wnv.test.weather.sorted
$dd.mov.sum.2wk)**2
#Experiment with PCA and Kmeans--------------------------------
log.model <- glm(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n
+ dWeek2
+ Comp.1
#+ Comp.2
#+ Comp.3
#+ Comp.4
#+ Comp.5
#+ acc.deg.day
#+ acc.deg.day2
+ temp.mov.avg.2wk
+ prec.mov.avg.2wk
, data = trainData, family=binomial())
summary(log.model)
prediction.glm <-predict(log.model, newdata = testData, type =
"response")
auc(testData$WnvPresent, prediction.glm)
#------------------------------------------------------------
fitSubmit <- update(log.model, data=wnv.train.weather)
pSubmit.glm <-predict(fitSubmit, newdata = wnv.test.weather.sorted,
type = "response")
#summary(pSubmit.glm)
submissionFile<-cbind(test$Id,pSubmit.glm)
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#Base Model Submission AUC: 0.64569
#Base Train AUC: 0.78259055
#LDA------------------------------------------------------------------
--------------------------------------------------------
lda.model <- lda(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
+ acc.deg.day2
#+ temp.mov.avg.2wk
+ prec.mov.avg.wk
+ prec.mov.sum.2wk
,data = trainData)
prediction.lda <- predict(lda.model, testData)
lda.df <- data.frame(prediction.lda$posterior)
auc(testData$WnvPresent, lda.df[,2])
#-------------------------------------------------------------------
fitSubmit <- update(lda.model, data=wnv.train.weather)
pSubmit<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type =
"probability")
lda.test <- data.frame(pSubmit$posterior)
submissionFile<-cbind(test$Id,lda.df.test[,2])
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#Random
Forest----------------------------------------------------------------
------------------------------------------------
set.seed(1234)
rf.model <- randomForest(WnvPresent.C ~ dWeek + Species2 +
kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
#+ prec.mov.sum.2wk
, data = trainData, nTree = 1000)
prediction.rf <- predict(rf.model, testData, type = "prob")
rf.df <- data.frame(prediction.rf)
auc(testData$WnvPresent.C, rf.df[,2])
fitSubmit <- update(rf.model, data=wnv.train.weather)
pSubmit.rf<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type
= "prob")
rf.df.test <- data.frame(pSubmit.rf)
submissionFile<-cbind(test$Id, rf.df.test[,2])
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#PCA and Kmeans-------------------
set.seed(1234)
rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
#+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
#+ prec.mov.sum.wk
,data = trainData,
controls=cforest_unbiased(ntree=200, mtry=2))
prediction.rf <- predict(rf.model, newdata=testData, type = "prob")
rf.df <- data.frame(prediction.rf)
x <- vector()
for (i in seq(2, 6232, 2)) {
x <- c(x, rf.df[1, i])
}
auc(testData$WnvPresent.C, x)
set.seed(1234)
rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
#+ Comp.3
#+ Comp.4
#+ Comp.5
#+ acc.deg.day
#+ temp.mov.avg.2wk
#+ prec.mov.avg.2wk
,data = wnv.train.weather,
controls=cforest_unbiased(ntree=10, mtry=2, maxdepth = 10))
prediction.rf <- predict(rf.model, newdata=wnv.test.weather.sorted,
type = "prob")
rf.df <- data.frame(prediction.rf)
#H20 Random Forest--------------
library(h2o)
localH2O <- h2o.init(nthreads = -1,max_mem_size = '7g')
rf.model <- h2o.randomForest(x=c(3, 9, 43, 44:48),y = 53,data =
trainData,
mtries = 18,
sample.rate = 0.5,
classification = T,ntree = 200,verbose =
T)
prediction.rf <- h2o.predict(rf.model,testData)
#Support Vector
Machines--------------------------------------------------------------
----------------------------------------
#PCA and kmeans----------------------------------------------------
set.seed(1234)
svm.model <- svm(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n
+ Comp.1
+ Comp.2
+ Comp.3
+ Comp.4
+ Comp.5
+ acc.deg.day
+ temp.mov.avg.2wk
#+ prec.mov.avg.wk
#+ prec.mov.sum.2wk
, data = trainData, probability = TRUE)
#summary(svm.model)
prediction.svm <- predict(svm.model, testData, probability = TRUE)
svm.df <- data.frame(attr(prediction.svm, "probabilities"))
auc(testData$WnvPresent.C, svm.df[,2])
#------------------------------------------------------------------
set.seed(1234)
fitSubmit <- update(svm.model, data=wnv.train.weather)
pSubmit.svm<-predict(fitSubmit, newdata = wnv.test.weather.sorted,
probability = TRUE)
svm.df.test <- data.frame(attr(pSubmit.svm, "probabilities"))
submissionFile<-cbind(test$Id, svm.df.test[,2])
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=
FALSE)
#Ensemble-------------------------------------------------------------
--------------------------------------------------------
gam.train <- as.matrix(prediction.gam)
glm.train <- as.matrix(prediction.glm)
lda.train <- as.matrix(lda.df[,2])
rf.train <- as.matrix(rf.df[,2])
svm.train <- as.matrix(svm.df[,2])
ens.train <- cbind(gam.train, lda.train, rf.train, svm.train)
gam.test <- as.matrix(pSubmit.gam)
glm.test <- as.matrix(pSubmit.glm)
lda.test <- as.matrix(lda.df.test[,2])
rf.test <- as.matrix(rf.df.test[,2])
svm.test <- as.matrix(svm.df.test[,2])
ens.test <- cbind(lda.test, rf.test, svm.test)
w <- matrix(c(0.0, 0.3, 0.1, 0.6), nrow = 4, ncol = 1)
ens.model <- ens.train %*% w
auc(testData$WnvPresent, ens.model)
w <- matrix(c(0.3, 0.1, 0.6), nrow = 3, ncol = 1)
ens.model.test <- ens.test %*% w
submissionFile<-cbind(test$Id,ens.model.test)
colnames(submissionFile)<-c("Id","WnvPresent")
options("scipen"=100, "digits"=8)
write.csv(submissionFile,"Submission_MaxLiBest3.csv",row.names=FALSE,q
uote=FALSE)
#Bagging--------------------------------------------------------------
------------------------------------------------------
set.seed(1234)
ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7,
0.3))
trainData <- wnv.train.weather[ind==1,]
testData <- wnv.train.weather[ind==2,]
set.seed(4321)
all_data <- wnv.train.weather
positions <- sample(nrow(all_data),size=floor((nrow(all_data)/10)*7))
trainBag<- all_data[positions,]
testBag<- all_data[-positions,]
library(foreach)
length_divisor<-6
iterations<-50
predictions<-foreach(m=1:iterations,.combine=cbind) %do% {
training_positions <- sample(nrow(trainData),
size=floor((nrow(trainData)/length_divisor)))
train_pos<-1:nrow(trainData) %in% training_positions
gam.bag <- gam(WnvPresent ~ lo(dWeek) + Species2 + lo(Longitude,
Latitude) + s(acc.deg.day), data = trainData[train_pos,]
, family="binomial", bf.maxit = 45)
predict(gam.bag, newdata = testData, type = "response")
}
predictions<-data.frame(rowMeans(predictions))
auc(testData$WnvPresent.C, predictions)
#--------------------------------------------------------------------
library(foreach)
length_divisor<-6
iterations<-1000
predictions<-foreach(m=1:iterations,.combine=cbind) %do% {
training_positions <- sample(nrow(trainData),
size=floor((nrow(trainData)/length_divisor)))
train_pos<-1:nrow(trainData) %in% training_positions
glm.bag <- glm(WnvPresent ~ dWeek + dWeek2 + Species2 + Longitude +
acc.deg.day + acc.deg.day2
, data = trainData[train_pos,], family=binomial())
#summary(log.model)
predict(glm.bag, newdata = testData, type = "response")
}
predictions<-data.frame(rowMeans(predictions))
auc(testData$WnvPresent.C, predictions)
#Vs.
gam.test <- gam(WnvPresent ~ lo(dWeek) + Species2 + loc.cluster.qmile,
data = trainBag, family="binomial", bf.maxit = 200)
prediction.gam <-predict(gam.test, newdata = testBag, type =
"response")
auc(testBag$WnvPresent, prediction.gam)
#Adabag
Exploration-----------------------------------------------------------
---------------------------------------------
wnv.adaboost <- boosting(WnvPresent.C ~ dWeek, data = trainData,
boos=TRUE, mfinal = 5)

More Related Content

PDF
The Ring programming language version 1.10 book - Part 80 of 212
DOCX
Spark_Documentation_Template1
PPTX
PPT
Dynamically Evolving Systems: Cluster Analysis Using Time
PDF
The Ring programming language version 1.7 book - Part 16 of 196
PDF
The Ring programming language version 1.7 book - Part 73 of 196
PDF
The Ring programming language version 1.6 book - Part 15 of 189
PDF
The Ring programming language version 1.8 book - Part 75 of 202
The Ring programming language version 1.10 book - Part 80 of 212
Spark_Documentation_Template1
Dynamically Evolving Systems: Cluster Analysis Using Time
The Ring programming language version 1.7 book - Part 16 of 196
The Ring programming language version 1.7 book - Part 73 of 196
The Ring programming language version 1.6 book - Part 15 of 189
The Ring programming language version 1.8 book - Part 75 of 202

What's hot (20)

KEY
Unittesting JavaScript with Evidence
PDF
The elements of a functional mindset
PDF
The Ring programming language version 1.3 book - Part 63 of 88
PDF
Kotlin Coroutines in Practice @ KotlinConf 2018
DOCX
Ejemplo radio
PDF
EMFPath
PDF
The Ring programming language version 1.7 book - Part 12 of 196
PDF
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
PDF
State of the CFEngine 2018
PDF
JJUG CCC 2011 Spring
DOCX
Registro de venta
TXT
Procesos
PDF
Message-based communication patterns in distributed Akka applications
DOCX
XTW_Import
DOCX
Dmxedit
PDF
Benchy, python framework for performance benchmarking of Python Scripts
PDF
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
KEY
RHadoop, R meets Hadoop
PDF
mobl - model-driven engineering lecture
PPTX
Writing Hadoop Jobs in Scala using Scalding
Unittesting JavaScript with Evidence
The elements of a functional mindset
The Ring programming language version 1.3 book - Part 63 of 88
Kotlin Coroutines in Practice @ KotlinConf 2018
Ejemplo radio
EMFPath
The Ring programming language version 1.7 book - Part 12 of 196
Knowledge is Power: Getting out of trouble by understanding Git - Steve Smith...
State of the CFEngine 2018
JJUG CCC 2011 Spring
Registro de venta
Procesos
Message-based communication patterns in distributed Akka applications
XTW_Import
Dmxedit
Benchy, python framework for performance benchmarking of Python Scripts
3rd Proj. Update: Integrating SWI-Prolog for Semantic Reasoning in Bioclipse
RHadoop, R meets Hadoop
mobl - model-driven engineering lecture
Writing Hadoop Jobs in Scala using Scalding
Ad

Similar to West-Nile-Virus | Kaggle (20)

PDF
R programming & Machine Learning
PDF
Datamining R 4th
PPTX
Air Quality in Taiwan 2013
PDF
Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
PDF
Practical data science_public
PDF
R data-import, data-export
 
PDF
Data Munging in R - Chicago R User Group
PDF
Study covid19
PPTX
Exploratory Analysis Part1 Coursera DataScience Specialisation
ODP
Geospatial Data in R
PDF
Project Bird species The OrdwayBirds data frame is a histor.pdf
PDF
phylosmith
PDF
R and data mining
PDF
9. R data-import data-export
PPTX
R language introduction
PDF
Data transformation-cheatsheet
PDF
R gráfico
PDF
第3回 データフレームの基本操作 その1(解答付き)
PDF
purrr.pdf
DOCX
R (Shiny Package) - UI Side Script for Decision Support System
R programming & Machine Learning
Datamining R 4th
Air Quality in Taiwan 2013
Chapter 2: R tutorial Handbook for Data Science and Machine Learning Practiti...
Practical data science_public
R data-import, data-export
 
Data Munging in R - Chicago R User Group
Study covid19
Exploratory Analysis Part1 Coursera DataScience Specialisation
Geospatial Data in R
Project Bird species The OrdwayBirds data frame is a histor.pdf
phylosmith
R and data mining
9. R data-import data-export
R language introduction
Data transformation-cheatsheet
R gráfico
第3回 データフレームの基本操作 その1(解答付き)
purrr.pdf
R (Shiny Package) - UI Side Script for Decision Support System
Ad

West-Nile-Virus | Kaggle

  • 1. #install.packages("ISOweek") #install.packages("stringr") #install.packages("caTools") #install.packages("TTR") #install.packages("gam") #install.packages("data.table") #install.packages("Metrics") #install.packages("randomForest") #install.packages("MASS") #install.packages("e1071") #install.packages("fpc", dependencies = TRUE) #install.packages("geosphere") #install.packages("scatterplot3d") #install.packages("party") #install.packages("adabag") #install.packages("DeducerExtras") #install.packages("h2o") library(ISOweek) library(stringr) library(caTools) library(TTR) require(gam) library(data.table) library(Metrics) require(randomForest) require(MASS) require(e1071) library(fpc) library(geosphere) library(scatterplot3d) require(party) require(adabag) require(DeducerExtras) require(h2o) #Read Raw Data------------------------------------------------------------------ ------------------------------------------------ train <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/ Working Folder/train.csv") test <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working Folder/test.csv") weather <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/
  • 2. Working Folder/weather.csv") spray <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/ Working Folder/spray.csv") #Get Week Number and ReCategorize Species--------------------------------------------------------------- ------------------------ vSpecies<-c(as.character(train$Species),as.character(test$Species)) vSpecies[vSpecies=="UNSPECIFIED CULEX"]<-"CULEX ERRATICUS" vSpecies[-which(vSpecies == "CULEX PIPIENS" | vSpecies == "CULEX PIPIENS/RESTUANS" | vSpecies == "CULEX RESTUANS")] = "CULEX OTHER" vSpecies<-factor(vSpecies,levels=unique(vSpecies)) ## data.table syntax for adding a column; could overwrite the existing column as well train[,Species2:=factor(vSpecies[1:nrow(train)],levels=unique(vSpecies ))] test[,Species2:=factor(vSpecies[(nrow(train) +1):length(vSpecies)],levels=unique(vSpecies))] ## also add some fields for components of the date using simple substrings train[,dMonth:=as.factor(paste(substr(train$Date,6,7)))] train[,dYear:=as.factor(paste(substr(train$Date,1,4)))] train$Date = as.Date(train$Date, format="%Y-%m-%d") xsDate = as.Date(paste0(train$dYear, "0101"), format="%Y%m%d") train$dWeek = as.numeric(paste(floor((train$Date - xsDate + 1)/7))) test[,dMonth:=as.factor(paste(substr(test$Date,6,7)))] test[,dYear:=as.factor(paste(substr(test$Date,1,4)))] test$Date = as.Date(test$Date, format="%Y-%m-%d") tsDate = as.Date(paste0(test$dYear, "0101"), format="%Y%m%d") test$dWeek = as.numeric(paste(floor((test$Date - tsDate + 1)/7))) weather[,dMonth:=as.factor(paste(substr(weather$Date,6,7)))] weather[,dYear:=as.factor(paste(substr(weather$Date,1,4)))] weather$Date = as.Date(weather$Date, format="%Y-%m-%d") tsDate = as.Date(paste0(weather$dYear, "0101"), format="%Y%m%d") weather$dWeek = as.numeric(paste(floor((weather$Date - tsDate + 1)/ 7))) #Train Data Pre-
  • 3. processing------------------------------------------------------------ ------------------------------------------ wnv.train <- data.frame(train[,list(Date, dWeek, Species, Trap, NumMosquitos, Latitude, Longitude, Species2, WnvPresent)]) #Cast Date Type wnv.train$Date <- as.Date(wnv.train$Date) oh.lat <- 41.995 oh.long <- -87.933 mw.lat <- 41.786 mw.long <- -87.752 for (i in 1:nrow(wnv.train)) { #Euclidean Distances wnv.train$dist.oh[i] <- sqrt((wnv.train$Latitude[i]-oh.lat)**2+ (wnv.train$Longitude[i]-oh.long)**2)*111 wnv.train$dist.mw[i] <- sqrt((wnv.train$Latitude[i]-mw.lat)**2+ (wnv.train$Longitude[i]-mw.long)**2)*111 #Haversine Distance wnv.train$disth.oh[i] <- distHaversine(c(wnv.train$Longitude[i], wnv.train$Latitude[i]), c(oh.long, oh.lat))/1000 wnv.train$disth.mw[i] <- distHaversine(c(wnv.train$Longitude[i], wnv.train$Latitude[i]), c(mw.long, mw.lat))/1000 if (wnv.train$dist.oh[i] >= wnv.train$dist.mw[i]) { wnv.train$Station.Euc[i] <- 2 #2 is Midway } else { wnv.train$Station.Euc[i] <- 1 #1 is O'Hare } if (wnv.train$disth.oh[i] >= wnv.train$disth.mw[i]) { wnv.train$Station[i] <- 2 #2 is Midway } else { wnv.train$Station[i] <- 1 #1 is O'Hare } #Creating ISO Week Number #wnv.train$week.num[i] <- substring(ISOweek(wnv.train$Date[i]), 7, 8)
  • 4. } #Weather Data Cleaning-------------------------------------------------------------- ---------------------------------------------- #Choose Variables wnv.weather <- data.frame(weather[,list(Station, Date, Tmax, Tmin, Tavg, Depart, DewPoint, WetBulb, Heat, Cool, PrecipTotal, StnPressure, SeaLevel, ResultSpeed, ResultDir, AvgSpeed)]) #Cast Date Type wnv.weather$Date <- as.Date(wnv.weather$Date) #Create Week Number Variable for (i in 1:nrow(wnv.weather)) { wnv.weather$week.num[i] <- as.numeric(substring(ISOweek(wnv.weather $Date[i]), 7, 8)) #Create Week Markers wnv.weather$date.year[i] <- as.numeric(format(wnv.weather$Date[i], "%Y")) #wnv.weather$date.month[i] <- as.numeric(format(wnv.weather$Date[i], "%m")) #wnv.weather$date.day[i] <- as.numeric(format(wnv.weather$Date[i], "%d")) } #Clean Temperature wnv.weather$Tmax <- as.numeric(as.character(wnv.weather$Tmax)) wnv.weather$Tmin <- as.numeric(as.character(wnv.weather$Tmin)) wnv.weather$Tavg <- as.numeric(as.character(wnv.weather$Tavg)) wnv.weather$Tavg[is.na(wnv.weather$Tavg)] <- (wnv.weather $Tmax[is.na(wnv.weather$Tavg)] + wnv.weather$Tmin[is.na(wnv.weather $Tavg)])/2 #Clean DewPoint wnv.weather$DewPoint <- as.numeric(as.character(wnv.weather$DewPoint))
  • 5. #Clean Depart wnv.weather$Depart[wnv.weather$Depart=="M"] <- NA wnv.weather$Depart <- as.numeric(as.character(wnv.weather$Depart)) #Clean WetBulb wnv.weather$WetBulb <- as.numeric(as.character(wnv.weather$WetBulb)) wnv.weather$WetBulb[849]<- wnv.weather$WetBulb[850] wnv.weather$WetBulb[2411]<- wnv.weather$WetBulb[2412] wnv.weather$WetBulb[2413]<- wnv.weather$WetBulb[2414] wnv.weather$WetBulb[2416]<- wnv.weather$WetBulb[2415] #Clean Heat wnv.weather$Heat <- as.numeric(as.character(wnv.weather$Heat)) wnv.weather$Heat[8]<- wnv.weather$Heat[7] wnv.weather$Heat[506]<- wnv.weather$Heat[505] wnv.weather$Heat[676]<- wnv.weather$Heat[675] wnv.weather$Heat[1638]<- wnv.weather$Heat[1637] wnv.weather$Heat[2068]<- wnv.weather$Heat[2067] wnv.weather$Heat[2212]<- wnv.weather$Heat[2411] wnv.weather$Heat[2502]<- wnv.weather$Heat[2501] wnv.weather$Heat[2512]<- wnv.weather$Heat[2511] wnv.weather$Heat[2526]<- wnv.weather$Heat[2525] wnv.weather$Heat[2580]<- wnv.weather$Heat[2579] wnv.weather$Heat[2812]<- wnv.weather$Heat[2811] #Clean Cool wnv.weather$Cool <- as.numeric(as.character(wnv.weather$Cool)) wnv.weather$Cool[8]<- wnv.weather$Cool[7] wnv.weather$Cool[506]<- wnv.weather$Cool[505] wnv.weather$Cool[676]<- wnv.weather$Cool[675] wnv.weather$Cool[1638]<- wnv.weather$Cool[1637] wnv.weather$Cool[2068]<- wnv.weather$Cool[2067] wnv.weather$Cool[2212]<- wnv.weather$Cool[2411] wnv.weather$Cool[2502]<- wnv.weather$Cool[2501] wnv.weather$Cool[2512]<- wnv.weather$Cool[2511] wnv.weather$Cool[2526]<- wnv.weather$Cool[2525] wnv.weather$Cool[2580]<- wnv.weather$Cool[2579] wnv.weather$Cool[2812]<- wnv.weather$Cool[2811]
  • 6. #Clean PrecipTotal wnv.weather$PrecipTotal <- as.numeric(as.character(wnv.weather $PrecipTotal)) wnv.weather$PrecipTotal[is.na(wnv.weather$PrecipTotal)] <- 0 #Clean StnPressure wnv.weather$StnPressure <- as.numeric(as.character(wnv.weather $StnPressure)) wnv.weather$StnPressure[88]<- wnv.weather$StnPressure[87] wnv.weather$StnPressure[849]<- wnv.weather$StnPressure[850] wnv.weather$StnPressure[2411]<- wnv.weather$StnPressure[2410] wnv.weather$StnPressure[2412]<- wnv.weather$StnPressure[2410] #Clean SeaLevel wnv.weather$SeaLevel <- as.numeric(as.character(wnv.weather$SeaLevel)) wnv.weather$SeaLevel[88]<- wnv.weather$SeaLevel[87] wnv.weather$SeaLevel[833]<- wnv.weather$SeaLevel[834] wnv.weather$SeaLevel[995]<- wnv.weather$SeaLevel[996] wnv.weather$SeaLevel[1733]<- wnv.weather$SeaLevel[1734] wnv.weather$SeaLevel[1746]<- wnv.weather$SeaLevel[1745] wnv.weather$SeaLevel[1757]<- wnv.weather$SeaLevel[1758] wnv.weather$SeaLevel[2068]<- wnv.weather$SeaLevel[2067] wnv.weather$SeaLevel[2091]<- wnv.weather$SeaLevel[2092] wnv.weather$SeaLevel[2744]<- wnv.weather$SeaLevel[2743] #Clean ResultSpeed wnv.weather$ResultSpeed <- as.numeric(as.character(wnv.weather $ResultSpeed)) #Clean ResultDir wnv.weather$ResultDir <- as.numeric(as.character(wnv.weather $ResultDir)) #Clean AvgSpeed wnv.weather$AvgSpeed <- as.numeric(as.character(wnv.weather$AvgSpeed)) wnv.weather$AvgSpeed[is.na(wnv.weather$AvgSpeed)] <- mean(wnv.weather $AvgSpeed, na.rm =TRUE)
  • 7. #Split weather data into O'Hare and Midway wnv.weather.oh <- subset(wnv.weather, wnv.weather$Station == 1) wnv.weather.mw <- subset(wnv.weather, wnv.weather$Station == 2) #Create New Weather Features: temp.mov.avg, prec.mov.avg, deg.day, accumulated degree day for each year Tbase <- 71.6 #22deg C = 71.6deg F wnv.weather.oh$deg.day <- 0 wnv.weather.oh$deg.day.c <- 0 wnv.weather.oh$acc.deg.day <- 0 wnv.weather.oh$acc.deg.day.c <- 0 newyear <- TRUE #For O'Hare start <- 1 for (year in 2007:2014) { iterations <- nrow(subset(wnv.weather.oh, wnv.weather.oh$date.year == year )) for (i in start:(start+iterations-1)) { Tmean <- (wnv.weather.oh$Tmax[i] + wnv.weather.oh$Tmin[i])/2 if ( Tmean > Tbase) { wnv.weather.oh$deg.day[i] <- Tmean - Tbase wnv.weather.oh$deg.day.c[i] <- 0 } else { wnv.weather.oh$deg.day[i] <- 0 wnv.weather.oh$deg.day.c[i] <- Tbase - Tmean } if (newyear == TRUE) { newyear <- FALSE wnv.weather.oh$acc.deg.day[i] <- 0 } else { wnv.weather.oh$acc.deg.day[i] <- wnv.weather.oh$deg.day[i] + wnv.weather.oh$acc.deg.day[i-1] wnv.weather.oh$acc.deg.day.c[i] <- wnv.weather.oh$deg.day.c[i] + wnv.weather.oh$acc.deg.day.c[i-1] } } start <- start + iterations newyear <- TRUE }
  • 8. #For Midway Tbase <- 71.6 #22deg C = 71.6deg F wnv.weather.mw$deg.day <- 0 wnv.weather.mw$deg.day.c <- 0 wnv.weather.mw$acc.deg.day <- 0 wnv.weather.mw$acc.deg.day.c <- 0 newyear <- TRUE start <- 1 for (year in 2007:2014) { iterations <- nrow(subset(wnv.weather.mw, wnv.weather.mw$date.year == year )) for (i in start:(start+iterations-1)) { Tmean <- (wnv.weather.mw$Tmax[i] + wnv.weather.mw$Tmin[i])/2 if ( Tmean > Tbase) { wnv.weather.mw$deg.day[i] <- Tmean - Tbase wnv.weather.mw$deg.day.c[i] <- 0 } else { wnv.weather.mw$deg.day[i] <- 0 wnv.weather.mw$deg.day.c[i] <- Tbase - Tmean } if (newyear == TRUE) { newyear <- FALSE wnv.weather.mw$acc.deg.day[i] <- 0 } else { wnv.weather.mw$acc.deg.day[i] <- wnv.weather.mw$deg.day[i] + wnv.weather.mw$acc.deg.day[i-1] wnv.weather.mw$acc.deg.day.c[i] <- wnv.weather.mw$deg.day.c[i] + wnv.weather.mw$acc.deg.day.c[i-1] } } start <- start + iterations newyear <- TRUE } #Creating Moving Averages and Moving Sums oh.temp.mov.avg.wk <- vector() oh.temp.mov.avg.2wk <- vector() oh.prec.mov.avg.wk <- vector()
  • 9. oh.prec.mov.avg.2wk <- vector() oh.prec.mov.sum.wk <- vector() oh.prec.mov.sum.2wk <- vector() oh.dd.mov.sum.wk <- vector() oh.dd.mov.sum.2wk <- vector() mw.temp.mov.avg.wk <- vector() mw.temp.mov.avg.2wk <- vector() mw.prec.mov.avg.wk <- vector() mw.prec.mov.avg.2wk <- vector() mw.prec.mov.sum.wk <- vector() mw.prec.mov.sum.2wk <- vector() mw.dd.mov.sum.wk <- vector() mw.dd.mov.sum.2wk <- vector() for (year in 2007:2014) { #O'Hare assign(paste("oh.temp.mov.avg.wk", year, sep=""), runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year == year], 7, alg="C", endrule="mean")) assign(paste("oh.temp.mov.avg.2wk", year, sep=""), runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year == year], 14, alg="C", endrule="mean")) assign(paste("oh.prec.mov.avg.wk", year, sep=""), runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], 7, alg="C", endrule="mean")) assign(paste("oh.prec.mov.avg.2wk", year, sep=""), runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], 14, alg="C", endrule="mean")) assign(paste("oh.prec.mov.sum.wk", year, sep=""), runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], n = 7, cumulative = FALSE)) assign(paste("oh.prec.mov.sum.2wk", year, sep=""), runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], n = 14, cumulative = FALSE)) assign(paste("oh.dd.mov.sum.wk", year, sep=""), runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year == year], n = 7, cumulative = FALSE)) assign(paste("oh.dd.mov.sum.2wk", year, sep=""), runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year == year], n = 14, cumulative = FALSE))
  • 10. oh.temp.mov.avg.wk <- c(oh.temp.mov.avg.wk, get(paste("oh.temp.mov.avg.wk", year, sep=""))) oh.temp.mov.avg.2wk <- c(oh.temp.mov.avg.2wk, get(paste("oh.temp.mov.avg.2wk", year, sep=""))) oh.prec.mov.avg.wk <- c(oh.prec.mov.avg.wk, get(paste("oh.prec.mov.avg.wk", year, sep=""))) oh.prec.mov.avg.2wk <- c(oh.prec.mov.avg.2wk, get(paste("oh.prec.mov.avg.2wk", year, sep=""))) oh.prec.mov.sum.wk <- c(oh.prec.mov.sum.wk, get(paste("oh.prec.mov.sum.wk", year, sep=""))) oh.prec.mov.sum.2wk <- c(oh.prec.mov.sum.2wk, get(paste("oh.prec.mov.sum.2wk", year, sep=""))) oh.dd.mov.sum.wk <- c(oh.dd.mov.sum.wk, get(paste("oh.dd.mov.sum.wk", year, sep=""))) oh.dd.mov.sum.2wk <- c(oh.dd.mov.sum.2wk, get(paste("oh.dd.mov.sum. 2wk", year, sep=""))) #Midway assign(paste("mw.temp.mov.avg.wk", year, sep=""), runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year == year], 7, alg="C", endrule="mean")) assign(paste("mw.temp.mov.avg.2wk", year, sep=""), runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year == year], 14, alg="C", endrule="mean")) assign(paste("mw.prec.mov.avg.wk", year, sep=""), runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], 7, alg="C", endrule="mean")) assign(paste("mw.prec.mov.avg.2wk", year, sep=""), runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], 14, alg="C", endrule="mean")) assign(paste("mw.prec.mov.sum.wk", year, sep=""), runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], n = 7, cumulative = FALSE)) assign(paste("mw.prec.mov.sum.2wk", year, sep=""), runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], n = 14, cumulative = FALSE)) assign(paste("mw.dd.mov.sum.wk", year, sep=""), runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year == year], n = 7, cumulative = FALSE)) assign(paste("mw.dd.mov.sum.2wk", year, sep=""), runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year == year], n = 14, cumulative = FALSE))
  • 11. mw.temp.mov.avg.wk <- c(mw.temp.mov.avg.wk, get(paste("mw.temp.mov.avg.wk", year, sep=""))) mw.temp.mov.avg.2wk <- c(mw.temp.mov.avg.2wk, get(paste("mw.temp.mov.avg.2wk", year, sep=""))) mw.prec.mov.avg.wk <- c(mw.prec.mov.avg.wk, get(paste("mw.prec.mov.avg.wk", year, sep=""))) mw.prec.mov.avg.2wk <- c(mw.prec.mov.avg.2wk, get(paste("mw.prec.mov.avg.2wk", year, sep=""))) mw.prec.mov.sum.wk <- c(mw.prec.mov.sum.wk, get(paste("mw.prec.mov.sum.wk", year, sep=""))) mw.prec.mov.sum.2wk <- c(mw.prec.mov.sum.2wk, get(paste("mw.prec.mov.sum.2wk", year, sep=""))) mw.dd.mov.sum.wk <- c(mw.dd.mov.sum.wk, get(paste("mw.dd.mov.sum.wk", year, sep=""))) mw.dd.mov.sum.2wk <- c(mw.dd.mov.sum.2wk, get(paste("mw.dd.mov.sum. 2wk", year, sep=""))) } #Add Variable Matrices to Weather Data Frame wnv.weather.oh$temp.mov.avg.wk <- oh.temp.mov.avg.wk wnv.weather.oh$temp.mov.avg.2wk <- oh.temp.mov.avg.2wk wnv.weather.oh$prec.mov.avg.wk <- oh.prec.mov.avg.wk wnv.weather.oh$prec.mov.avg.2wk <- oh.prec.mov.avg.2wk wnv.weather.oh$prec.mov.sum.wk <- oh.prec.mov.sum.wk wnv.weather.oh$prec.mov.sum.2wk <- oh.prec.mov.sum.2wk wnv.weather.oh$dd.mov.sum.wk <- oh.dd.mov.sum.wk wnv.weather.oh$dd.mov.sum.2wk <- oh.dd.mov.sum.2wk wnv.weather.oh$prec.mov.sum.wk[is.na(wnv.weather.oh$prec.mov.sum.wk)] <- 0 wnv.weather.oh$prec.mov.sum.2wk[is.na(wnv.weather.oh$prec.mov.sum. 2wk)] <- 0 wnv.weather.oh$dd.mov.sum.wk[is.na(wnv.weather.oh$dd.mov.sum.wk)] <- 0 wnv.weather.oh$dd.mov.sum.2wk[is.na(wnv.weather.oh$dd.mov.sum.2wk)] <- 0 wnv.weather.mw$temp.mov.avg.wk <- mw.temp.mov.avg.wk
  • 12. wnv.weather.mw$temp.mov.avg.2wk <- mw.temp.mov.avg.2wk wnv.weather.mw$prec.mov.avg.wk <- mw.prec.mov.avg.wk wnv.weather.mw$prec.mov.avg.2wk <- mw.prec.mov.avg.2wk wnv.weather.mw$prec.mov.sum.wk <- mw.prec.mov.sum.wk wnv.weather.mw$prec.mov.sum.2wk <- mw.prec.mov.sum.2wk wnv.weather.mw$dd.mov.sum.wk <- mw.dd.mov.sum.wk wnv.weather.mw$dd.mov.sum.2wk <- mw.dd.mov.sum.2wk wnv.weather.mw$prec.mov.sum.wk[is.na(wnv.weather.mw$prec.mov.sum.wk)] <- 0 wnv.weather.mw$prec.mov.sum.2wk[is.na(wnv.weather.mw$prec.mov.sum. 2wk)] <- 0 wnv.weather.mw$dd.mov.sum.wk[is.na(wnv.weather.mw$dd.mov.sum.wk)] <- 0 wnv.weather.mw$dd.mov.sum.2wk[is.na(wnv.weather.mw$dd.mov.sum.2wk)] <- 0 options(scipen=100) #Test Data Pre- processing------------------------------------------------------------ ------------------------------------------ wnv.test <- data.frame(test[,list(Date, Id, dWeek, Species, Trap, Latitude, Longitude, Species2)]) #Cast Date Type wnv.test$Date <- as.Date(wnv.test$Date) oh.lat <- 41.995 oh.long <- -87.933 mw.lat <- 41.786 mw.long <- -87.752 for (i in 1:nrow(wnv.test)) { #Euclidean Distances wnv.test$dist.oh[i] <- sqrt((wnv.test$Latitude[i]-oh.lat)**2+ (wnv.test$Longitude[i]-oh.long)**2)*111 wnv.test$dist.mw[i] <- sqrt((wnv.test$Latitude[i]-mw.lat)**2+ (wnv.test$Longitude[i]-mw.long)**2)*111 #Haversine Distance
  • 13. wnv.test$disth.oh[i] <- distHaversine(c(wnv.test$Longitude[i], wnv.test$Latitude[i]), c(oh.long, oh.lat))/1000 wnv.test$disth.mw[i] <- distHaversine(c(wnv.test$Longitude[i], wnv.test$Latitude[i]), c(mw.long, mw.lat))/1000 if (wnv.test$dist.oh[i] >= wnv.test$dist.mw[i]) { wnv.test$Station.Euc[i] <- 2 #2 is Midway } else { wnv.test$Station.Euc[i] <- 1 #1 is O'Hare } if (wnv.test$disth.oh[i] >= wnv.test$disth.mw[i]) { wnv.test$Station[i] <- 2 #2 is Midway } else { wnv.test$Station[i] <- 1 #1 is O'Hare } #Creating ISO Week Number #wnv.test$week.num[i] <- substring(ISOweek(wnv.test$Date[i]), 7, 8) } #Concatenating Clean Weather Datasets and Merging with Training/Test Data---------------------------------------------------------- #wnv.train$Species2 <- train$Species2 #wnv.test$Species2 <- test$Species2 wnv.weather.clean <- rbind(wnv.weather.oh, wnv.weather.mw) wnv.train.weather <- merge(wnv.train, wnv.weather.clean, by=c("Date", "Station")) wnv.test.weather <- merge(wnv.test, wnv.weather.clean, by=c("Date", "Station")) #Just use one station... #Sorting wnv.test.weather wnv.test.weather.sorted <- wnv.test.weather[order(wnv.test.weather $Id),]
  • 14. #Training/Testing Split----------------------------------------------------------------- ------------------------------------ wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2007,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2007,] #----------------------------------------------------- wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2009,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2009,] #----------------------------------------------------- #Set aside 2011 data as test, and train on the remaining wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2011,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2011,] #----------------------------------------------------- wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year ! = 2013,] wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2013,] #Clustering Locations - DBSCAN---------------------------------------------------------------- --------------------------- #Potential epsilon values: #1/4 mile = 0.003623 #1/2 mile = 0.007246 #1 mile = 0.014493 #2 mile = 0.028986 #3 mile = 0.043479
  • 15. #Based on 1deg Latitude/Longitude = 69 miles x.train <- cbind(wnv.train.weather$Longitude, wnv.train.weather $Latitude) dbs.train <- dbscan(x.train, 0.003623) plot(dbs.train, x.train) print(dbs.train) prediction.train <- predict(dbs.train, x.train) #length(prediction.train) #unique(prediction.train) wnv.train.weather$loc.cluster.qmile <- as.factor(prediction.train) x.test <- cbind(wnv.test.weather.sorted$Longitude, wnv.test.weather.sorted$Latitude) prediction.test <- predict(dbs.train, x.train, x.test) #length(prediction.test) #unique(prediction.test) wnv.test.weather.sorted$loc.cluster.2mile <- as.factor(prediction.test) #Clustering Locations - K- Means----------------------------------------------------------------- ----------------------- #With normalized Longitude & Latitude train.means <- apply(wnv.train.weather[c("Longitude", "Latitude")], 2, mean) train.sds <-apply(wnv.train.weather[c("Longitude", "Latitude")], 2, sd) train.loc.norm <- scale(wnv.train.weather[c("Longitude", "Latitude")],center=means,scale=sds) test.means <- apply(wnv.test.weather.sorted[c("Longitude", "Latitude")], 2, mean) test.sds <-apply(wnv.test.weather.sorted[c("Longitude", "Latitude")], 2, sd) test.loc.norm <- scale(wnv.test.weather.sorted[c("Longitude", "Latitude")],center=means,scale=sds) set.seed(123) clusters.kmeans.norm <- kmeans(train.loc.norm, 20)
  • 16. clusters.kmeans.norm$centers plot(clusters.kmeans.norm$centers) pred.cluster.norm.train <- predict(clusters.kmeans.norm, data=train.loc.norm) pred.cluster.norm.test <- predict(clusters.kmeans.norm, data=test.loc.norm) #With un-normalized Longitude & Latitude train.loc <- wnv.train.weather[c("Longitude", "Latitude")] test.loc <- wnv.test.weather.sorted[c("Longitude", "Latitude")] set.seed(123) clusters.kmeans <- kmeans(train.loc, 25) clusters.kmeans$centers plot(clusters.kmeans$centers) pred.cluster.train <- predict(clusters.kmeans.norm, data=train.loc.norm) pred.cluster.test <- predict(clusters.kmeans.norm, data=test.loc.norm) #Assign to training/testing set wnv.train.weather$kmeans.cluster.n <- as.factor(pred.cluster.norm.train) wnv.test.weather.sorted$kmeans.cluster.n <- as.factor(pred.cluster.norm.test) #PCA------------------------------------------------------------------ ------------------------------------------------- keeps1 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb", "StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed", "acc.deg.day", "temp.mov.avg.wk", "prec.mov.avg.wk", "prec.mov.sum.wk", "dd.mov.sum.wk" ) keeps2 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb", "StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed") #model.pca <- princomp(wnv.train.weather[keeps1], cor=TRUE,
  • 17. scores=TRUE) #model.pca model.pca.train1 <- princomp(wnv.train.weather[keeps1], cor=TRUE, scores=TRUE) model.pca.test1 <- princomp(wnv.test.weather.sorted[keeps1], cor=TRUE, scores=TRUE) model.pca.train <- princomp(wnv.train.weather[keeps2], cor=TRUE, scores=TRUE) model.pca.test <- princomp(wnv.test.weather.sorted[keeps2], cor=TRUE, scores=TRUE) plot(model.pca.train) summary(model.pca.train) plot(model.pca.test) summary(model.pca.test) plot(model.pca.train1) summary(model.pca.train1) model.pca.train$scores model.pca.test$scores model.pca.train$loadings model.pca.test$loadings model.pca.train1$loadings pca.out.train <- data.frame(model.pca.train$scores) pca.out.test <- data.frame(model.pca.test$scores) wnv.train.weather <- cbind(wnv.train.weather, pca.out.train) wnv.test.weather.sorted <- cbind(wnv.test.weather.sorted, pca.out.test) #wnv.train.weather.pca <- cbind(wnv.train.weather, pca.out) #Dealing with unbalanced classes---------------------------------------------------------------
  • 18. ------------------------ wnv.train.no <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent == 0,] wnv.train.yes <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent == 1,] set.seed(1234) wnv.train.no.75 <- wnv.train.no[sample(1:nrow(wnv.train.no), 5852, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.75, wnv.train.yes) set.seed(1234) wnv.train.no.50 <- wnv.train.no[sample(1:nrow(wnv.train.no), 3901, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.50, wnv.train.yes) set.seed(1234) wnv.train.no.25 <- wnv.train.no[sample(1:nrow(wnv.train.no), 1951, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.25, wnv.train.yes) set.seed(1234) wnv.train.no.10 <- wnv.train.no[sample(1:nrow(wnv.train.no), 780, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.10, wnv.train.yes) set.seed(1234) wnv.train.no.5 <- wnv.train.no[sample(1:nrow(wnv.train.no), 312, replace=FALSE),] wnv.train.weather.1 <- rbind(wnv.train.no.5, wnv.train.yes) #70/30 Split-------------------------------------------- set.seed(1234) #set.seed(4321) ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7, 0.3)) trainData <- wnv.train.weather[ind==1,] testData <- wnv.train.weather[ind==2,] #GAM Modeling--------------------------------------------------------------
  • 19. -------------------------------------------------- #Experiment with PCA and Kmeans------------------------------- train.model <- gam(WnvPresent ~ s(dWeek) + Species2 + kmeans.cluster.n + s(Comp.1) + s(Comp.2) + s(Comp.3) + s(Comp.4) + s(Comp.5) + s(acc.deg.day) + s(temp.mov.avg.wk) + s(prec.mov.avg.2wk) #+ s(prec.mov.sum.wk) , data = trainData, family="binomial", bf.maxit = 1000) prediction.gam <-predict(train.model, newdata = testData, type = "response") auc(testData$WnvPresent, prediction.gam) #------------------------------------------------------------- fitSubmit <- update(train.model, data=wnv.train.weather) pSubmit.gam<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "response") submissionFile<-cbind(test$Id,pSubmit.gam) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi134prec.csv",row.names=FALSE ,quote=FALSE) #GLM------------------------------------------------------------------ -------------------------------------------------------- #Create Squared Term deg.day, dWeek, dd.mov.sum.2wk wnv.train.weather$acc.deg.day2 <- (wnv.train.weather$acc.deg.day)**2 wnv.train.weather$dWeek2 <- (wnv.train.weather$dWeek)**2 wnv.train.weather$dd.mov.sum.2wk2 <- (wnv.train.weather$dd.mov.sum. 2wk)**2 wnv.test.weather.sorted$acc.deg.day2 <- (wnv.test.weather.sorted
  • 20. $acc.deg.day)**2 wnv.test.weather.sorted$dWeek2 <- (wnv.test.weather.sorted$dWeek)**2 wnv.test.weather.sorted$dd.mov.sum.2wk2 <- (wnv.test.weather.sorted $dd.mov.sum.2wk)**2 #Experiment with PCA and Kmeans-------------------------------- log.model <- glm(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n + dWeek2 + Comp.1 #+ Comp.2 #+ Comp.3 #+ Comp.4 #+ Comp.5 #+ acc.deg.day #+ acc.deg.day2 + temp.mov.avg.2wk + prec.mov.avg.2wk , data = trainData, family=binomial()) summary(log.model) prediction.glm <-predict(log.model, newdata = testData, type = "response") auc(testData$WnvPresent, prediction.glm) #------------------------------------------------------------ fitSubmit <- update(log.model, data=wnv.train.weather) pSubmit.glm <-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "response") #summary(pSubmit.glm) submissionFile<-cbind(test$Id,pSubmit.glm) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #Base Model Submission AUC: 0.64569 #Base Train AUC: 0.78259055 #LDA------------------------------------------------------------------ --------------------------------------------------------
  • 21. lda.model <- lda(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day + acc.deg.day2 #+ temp.mov.avg.2wk + prec.mov.avg.wk + prec.mov.sum.2wk ,data = trainData) prediction.lda <- predict(lda.model, testData) lda.df <- data.frame(prediction.lda$posterior) auc(testData$WnvPresent, lda.df[,2]) #------------------------------------------------------------------- fitSubmit <- update(lda.model, data=wnv.train.weather) pSubmit<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "probability") lda.test <- data.frame(pSubmit$posterior) submissionFile<-cbind(test$Id,lda.df.test[,2]) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #Random Forest---------------------------------------------------------------- ------------------------------------------------ set.seed(1234) rf.model <- randomForest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day #+ temp.mov.avg.2wk
  • 22. #+ prec.mov.avg.2wk #+ prec.mov.sum.2wk , data = trainData, nTree = 1000) prediction.rf <- predict(rf.model, testData, type = "prob") rf.df <- data.frame(prediction.rf) auc(testData$WnvPresent.C, rf.df[,2]) fitSubmit <- update(rf.model, data=wnv.train.weather) pSubmit.rf<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "prob") rf.df.test <- data.frame(pSubmit.rf) submissionFile<-cbind(test$Id, rf.df.test[,2]) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #PCA and Kmeans------------------- set.seed(1234) rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 #+ acc.deg.day #+ temp.mov.avg.2wk #+ prec.mov.avg.2wk #+ prec.mov.sum.wk ,data = trainData, controls=cforest_unbiased(ntree=200, mtry=2)) prediction.rf <- predict(rf.model, newdata=testData, type = "prob") rf.df <- data.frame(prediction.rf) x <- vector() for (i in seq(2, 6232, 2)) { x <- c(x, rf.df[1, i]) } auc(testData$WnvPresent.C, x) set.seed(1234)
  • 23. rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 #+ Comp.3 #+ Comp.4 #+ Comp.5 #+ acc.deg.day #+ temp.mov.avg.2wk #+ prec.mov.avg.2wk ,data = wnv.train.weather, controls=cforest_unbiased(ntree=10, mtry=2, maxdepth = 10)) prediction.rf <- predict(rf.model, newdata=wnv.test.weather.sorted, type = "prob") rf.df <- data.frame(prediction.rf) #H20 Random Forest-------------- library(h2o) localH2O <- h2o.init(nthreads = -1,max_mem_size = '7g') rf.model <- h2o.randomForest(x=c(3, 9, 43, 44:48),y = 53,data = trainData, mtries = 18, sample.rate = 0.5, classification = T,ntree = 200,verbose = T) prediction.rf <- h2o.predict(rf.model,testData) #Support Vector Machines-------------------------------------------------------------- ---------------------------------------- #PCA and kmeans---------------------------------------------------- set.seed(1234) svm.model <- svm(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day + temp.mov.avg.2wk #+ prec.mov.avg.wk
  • 24. #+ prec.mov.sum.2wk , data = trainData, probability = TRUE) #summary(svm.model) prediction.svm <- predict(svm.model, testData, probability = TRUE) svm.df <- data.frame(attr(prediction.svm, "probabilities")) auc(testData$WnvPresent.C, svm.df[,2]) #------------------------------------------------------------------ set.seed(1234) fitSubmit <- update(svm.model, data=wnv.train.weather) pSubmit.svm<-predict(fitSubmit, newdata = wnv.test.weather.sorted, probability = TRUE) svm.df.test <- data.frame(attr(pSubmit.svm, "probabilities")) submissionFile<-cbind(test$Id, svm.df.test[,2]) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote= FALSE) #Ensemble------------------------------------------------------------- -------------------------------------------------------- gam.train <- as.matrix(prediction.gam) glm.train <- as.matrix(prediction.glm) lda.train <- as.matrix(lda.df[,2]) rf.train <- as.matrix(rf.df[,2]) svm.train <- as.matrix(svm.df[,2]) ens.train <- cbind(gam.train, lda.train, rf.train, svm.train) gam.test <- as.matrix(pSubmit.gam) glm.test <- as.matrix(pSubmit.glm) lda.test <- as.matrix(lda.df.test[,2]) rf.test <- as.matrix(rf.df.test[,2]) svm.test <- as.matrix(svm.df.test[,2]) ens.test <- cbind(lda.test, rf.test, svm.test) w <- matrix(c(0.0, 0.3, 0.1, 0.6), nrow = 4, ncol = 1) ens.model <- ens.train %*% w auc(testData$WnvPresent, ens.model) w <- matrix(c(0.3, 0.1, 0.6), nrow = 3, ncol = 1) ens.model.test <- ens.test %*% w
  • 25. submissionFile<-cbind(test$Id,ens.model.test) colnames(submissionFile)<-c("Id","WnvPresent") options("scipen"=100, "digits"=8) write.csv(submissionFile,"Submission_MaxLiBest3.csv",row.names=FALSE,q uote=FALSE) #Bagging-------------------------------------------------------------- ------------------------------------------------------ set.seed(1234) ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7, 0.3)) trainData <- wnv.train.weather[ind==1,] testData <- wnv.train.weather[ind==2,] set.seed(4321) all_data <- wnv.train.weather positions <- sample(nrow(all_data),size=floor((nrow(all_data)/10)*7)) trainBag<- all_data[positions,] testBag<- all_data[-positions,] library(foreach) length_divisor<-6 iterations<-50 predictions<-foreach(m=1:iterations,.combine=cbind) %do% { training_positions <- sample(nrow(trainData), size=floor((nrow(trainData)/length_divisor))) train_pos<-1:nrow(trainData) %in% training_positions gam.bag <- gam(WnvPresent ~ lo(dWeek) + Species2 + lo(Longitude, Latitude) + s(acc.deg.day), data = trainData[train_pos,] , family="binomial", bf.maxit = 45) predict(gam.bag, newdata = testData, type = "response") } predictions<-data.frame(rowMeans(predictions)) auc(testData$WnvPresent.C, predictions) #-------------------------------------------------------------------- library(foreach) length_divisor<-6 iterations<-1000 predictions<-foreach(m=1:iterations,.combine=cbind) %do% { training_positions <- sample(nrow(trainData), size=floor((nrow(trainData)/length_divisor)))
  • 26. train_pos<-1:nrow(trainData) %in% training_positions glm.bag <- glm(WnvPresent ~ dWeek + dWeek2 + Species2 + Longitude + acc.deg.day + acc.deg.day2 , data = trainData[train_pos,], family=binomial()) #summary(log.model) predict(glm.bag, newdata = testData, type = "response") } predictions<-data.frame(rowMeans(predictions)) auc(testData$WnvPresent.C, predictions) #Vs. gam.test <- gam(WnvPresent ~ lo(dWeek) + Species2 + loc.cluster.qmile, data = trainBag, family="binomial", bf.maxit = 200) prediction.gam <-predict(gam.test, newdata = testBag, type = "response") auc(testBag$WnvPresent, prediction.gam) #Adabag Exploration----------------------------------------------------------- --------------------------------------------- wnv.adaboost <- boosting(WnvPresent.C ~ dWeek, data = trainData, boos=TRUE, mfinal = 5)