West-Nile-Virus | Kaggle

26
#install.packages("ISOweek") #install.packages("stringr") #install.packages("caTools") #install.packages("TTR") #install.packages("gam") #install.packages("data.table") #install.packages("Metrics") #install.packages("randomForest") #install.packages("MASS") #install.packages("e1071") #install.packages("fpc", dependencies = TRUE) #install.packages("geosphere") #install.packages("scatterplot3d") #install.packages("party") #install.packages("adabag") #install.packages("DeducerExtras") #install.packages("h2o") library(ISOweek) library(stringr) library(caTools) library(TTR) require(gam) library(data.table) library(Metrics) require(randomForest) require(MASS) require(e1071) library(fpc) library(geosphere) library(scatterplot3d) require(party) require(adabag) require(DeducerExtras) require(h2o) #Read Raw Data------------------------------------------------------------------ ------------------------------------------------ train <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/ Working Folder/train.csv") test <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working Folder/test.csv") weather <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/

Transcript of West-Nile-Virus | Kaggle

Page 1: West-Nile-Virus | Kaggle

#install.packages("ISOweek")#install.packages("stringr")#install.packages("caTools")#install.packages("TTR")#install.packages("gam")#install.packages("data.table")#install.packages("Metrics")#install.packages("randomForest")#install.packages("MASS")#install.packages("e1071")#install.packages("fpc", dependencies = TRUE)#install.packages("geosphere")#install.packages("scatterplot3d")#install.packages("party")#install.packages("adabag")#install.packages("DeducerExtras")#install.packages("h2o")library(ISOweek)library(stringr)library(caTools)library(TTR)require(gam)library(data.table)library(Metrics)require(randomForest)require(MASS)require(e1071)library(fpc)library(geosphere)library(scatterplot3d)require(party)require(adabag)require(DeducerExtras)require(h2o)

#Read Raw Data------------------------------------------------------------------------------------------------------------------

train <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working Folder/train.csv")test <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working Folder/test.csv")weather <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/

Page 2: West-Nile-Virus | Kaggle

Working Folder/weather.csv")spray <- fread("C:/Users/Maxie/Documents/Kaggle/West Nile Virus/Working Folder/spray.csv")

#Get Week Number and ReCategorize Species---------------------------------------------------------------------------------------

vSpecies<-c(as.character(train$Species),as.character(test$Species))vSpecies[vSpecies=="UNSPECIFIED CULEX"]<-"CULEX ERRATICUS"vSpecies[-which(vSpecies == "CULEX PIPIENS" | vSpecies == "CULEX PIPIENS/RESTUANS" | vSpecies == "CULEX RESTUANS")] = "CULEX OTHER"vSpecies<-factor(vSpecies,levels=unique(vSpecies))

## data.table syntax for adding a column; could overwrite the existing column as welltrain[,Species2:=factor(vSpecies[1:nrow(train)],levels=unique(vSpecies))]test[,Species2:=factor(vSpecies[(nrow(train)+1):length(vSpecies)],levels=unique(vSpecies))]

## also add some fields for components of the date using simple substringstrain[,dMonth:=as.factor(paste(substr(train$Date,6,7)))]train[,dYear:=as.factor(paste(substr(train$Date,1,4)))]train$Date = as.Date(train$Date, format="%Y-%m-%d")xsDate = as.Date(paste0(train$dYear, "0101"), format="%Y%m%d")train$dWeek = as.numeric(paste(floor((train$Date - xsDate + 1)/7)))

test[,dMonth:=as.factor(paste(substr(test$Date,6,7)))]test[,dYear:=as.factor(paste(substr(test$Date,1,4)))]test$Date = as.Date(test$Date, format="%Y-%m-%d")tsDate = as.Date(paste0(test$dYear, "0101"), format="%Y%m%d")test$dWeek = as.numeric(paste(floor((test$Date - tsDate + 1)/7)))

weather[,dMonth:=as.factor(paste(substr(weather$Date,6,7)))]weather[,dYear:=as.factor(paste(substr(weather$Date,1,4)))]weather$Date = as.Date(weather$Date, format="%Y-%m-%d")tsDate = as.Date(paste0(weather$dYear, "0101"), format="%Y%m%d")weather$dWeek = as.numeric(paste(floor((weather$Date - tsDate + 1)/7)))

#Train Data Pre-

Page 3: West-Nile-Virus | Kaggle

processing------------------------------------------------------------------------------------------------------

wnv.train <- data.frame(train[,list(Date, dWeek, Species, Trap, NumMosquitos, Latitude, Longitude, Species2, WnvPresent)])

#Cast Date Typewnv.train$Date <- as.Date(wnv.train$Date)

oh.lat <- 41.995oh.long <- -87.933mw.lat <- 41.786mw.long <- -87.752

for (i in 1:nrow(wnv.train)) { #Euclidean Distances wnv.train$dist.oh[i] <- sqrt((wnv.train$Latitude[i]-oh.lat)**2+(wnv.train$Longitude[i]-oh.long)**2)*111 wnv.train$dist.mw[i] <- sqrt((wnv.train$Latitude[i]-mw.lat)**2+(wnv.train$Longitude[i]-mw.long)**2)*111 #Haversine Distance wnv.train$disth.oh[i] <- distHaversine(c(wnv.train$Longitude[i], wnv.train$Latitude[i]), c(oh.long, oh.lat))/1000 wnv.train$disth.mw[i] <- distHaversine(c(wnv.train$Longitude[i], wnv.train$Latitude[i]), c(mw.long, mw.lat))/1000 if (wnv.train$dist.oh[i] >= wnv.train$dist.mw[i]) { wnv.train$Station.Euc[i] <- 2 #2 is Midway } else { wnv.train$Station.Euc[i] <- 1 #1 is O'Hare } if (wnv.train$disth.oh[i] >= wnv.train$disth.mw[i]) { wnv.train$Station[i] <- 2 #2 is Midway } else { wnv.train$Station[i] <- 1 #1 is O'Hare } #Creating ISO Week Number #wnv.train$week.num[i] <- substring(ISOweek(wnv.train$Date[i]), 7, 8)

Page 4: West-Nile-Virus | Kaggle

}

#Weather Data Cleaning------------------------------------------------------------------------------------------------------------

#Choose Variables

wnv.weather <- data.frame(weather[,list(Station, Date, Tmax, Tmin, Tavg, Depart, DewPoint, WetBulb, Heat, Cool, PrecipTotal, StnPressure, SeaLevel, ResultSpeed, ResultDir, AvgSpeed)])#Cast Date Type

wnv.weather$Date <- as.Date(wnv.weather$Date)

#Create Week Number Variable

for (i in 1:nrow(wnv.weather)) { wnv.weather$week.num[i] <- as.numeric(substring(ISOweek(wnv.weather$Date[i]), 7, 8)) #Create Week Markers wnv.weather$date.year[i] <- as.numeric(format(wnv.weather$Date[i], "%Y")) #wnv.weather$date.month[i] <- as.numeric(format(wnv.weather$Date[i], "%m")) #wnv.weather$date.day[i] <- as.numeric(format(wnv.weather$Date[i], "%d"))}

#Clean Temperature

wnv.weather$Tmax <- as.numeric(as.character(wnv.weather$Tmax)) wnv.weather$Tmin <- as.numeric(as.character(wnv.weather$Tmin)) wnv.weather$Tavg <- as.numeric(as.character(wnv.weather$Tavg)) wnv.weather$Tavg[is.na(wnv.weather$Tavg)] <- (wnv.weather$Tmax[is.na(wnv.weather$Tavg)] + wnv.weather$Tmin[is.na(wnv.weather$Tavg)])/2

#Clean DewPoint

wnv.weather$DewPoint <- as.numeric(as.character(wnv.weather$DewPoint))

Page 5: West-Nile-Virus | Kaggle

#Clean Depart

wnv.weather$Depart[wnv.weather$Depart=="M"] <- NAwnv.weather$Depart <- as.numeric(as.character(wnv.weather$Depart))

#Clean WetBulb

wnv.weather$WetBulb <- as.numeric(as.character(wnv.weather$WetBulb))wnv.weather$WetBulb[849]<- wnv.weather$WetBulb[850]wnv.weather$WetBulb[2411]<- wnv.weather$WetBulb[2412]wnv.weather$WetBulb[2413]<- wnv.weather$WetBulb[2414]wnv.weather$WetBulb[2416]<- wnv.weather$WetBulb[2415]

#Clean Heat

wnv.weather$Heat <- as.numeric(as.character(wnv.weather$Heat))wnv.weather$Heat[8]<- wnv.weather$Heat[7]wnv.weather$Heat[506]<- wnv.weather$Heat[505]wnv.weather$Heat[676]<- wnv.weather$Heat[675]wnv.weather$Heat[1638]<- wnv.weather$Heat[1637]wnv.weather$Heat[2068]<- wnv.weather$Heat[2067]wnv.weather$Heat[2212]<- wnv.weather$Heat[2411]wnv.weather$Heat[2502]<- wnv.weather$Heat[2501]wnv.weather$Heat[2512]<- wnv.weather$Heat[2511]wnv.weather$Heat[2526]<- wnv.weather$Heat[2525]wnv.weather$Heat[2580]<- wnv.weather$Heat[2579]wnv.weather$Heat[2812]<- wnv.weather$Heat[2811]

#Clean Cool

wnv.weather$Cool <- as.numeric(as.character(wnv.weather$Cool))wnv.weather$Cool[8]<- wnv.weather$Cool[7]wnv.weather$Cool[506]<- wnv.weather$Cool[505]wnv.weather$Cool[676]<- wnv.weather$Cool[675]wnv.weather$Cool[1638]<- wnv.weather$Cool[1637]wnv.weather$Cool[2068]<- wnv.weather$Cool[2067]wnv.weather$Cool[2212]<- wnv.weather$Cool[2411]wnv.weather$Cool[2502]<- wnv.weather$Cool[2501]wnv.weather$Cool[2512]<- wnv.weather$Cool[2511]wnv.weather$Cool[2526]<- wnv.weather$Cool[2525]wnv.weather$Cool[2580]<- wnv.weather$Cool[2579]wnv.weather$Cool[2812]<- wnv.weather$Cool[2811]

Page 6: West-Nile-Virus | Kaggle

#Clean PrecipTotal

wnv.weather$PrecipTotal <- as.numeric(as.character(wnv.weather$PrecipTotal))wnv.weather$PrecipTotal[is.na(wnv.weather$PrecipTotal)] <- 0

#Clean StnPressure

wnv.weather$StnPressure <- as.numeric(as.character(wnv.weather$StnPressure))wnv.weather$StnPressure[88]<- wnv.weather$StnPressure[87]wnv.weather$StnPressure[849]<- wnv.weather$StnPressure[850]wnv.weather$StnPressure[2411]<- wnv.weather$StnPressure[2410]wnv.weather$StnPressure[2412]<- wnv.weather$StnPressure[2410]

#Clean SeaLevel

wnv.weather$SeaLevel <- as.numeric(as.character(wnv.weather$SeaLevel))wnv.weather$SeaLevel[88]<- wnv.weather$SeaLevel[87]wnv.weather$SeaLevel[833]<- wnv.weather$SeaLevel[834]wnv.weather$SeaLevel[995]<- wnv.weather$SeaLevel[996]wnv.weather$SeaLevel[1733]<- wnv.weather$SeaLevel[1734]wnv.weather$SeaLevel[1746]<- wnv.weather$SeaLevel[1745]wnv.weather$SeaLevel[1757]<- wnv.weather$SeaLevel[1758]wnv.weather$SeaLevel[2068]<- wnv.weather$SeaLevel[2067]wnv.weather$SeaLevel[2091]<- wnv.weather$SeaLevel[2092]wnv.weather$SeaLevel[2744]<- wnv.weather$SeaLevel[2743]

#Clean ResultSpeed

wnv.weather$ResultSpeed <- as.numeric(as.character(wnv.weather$ResultSpeed))

#Clean ResultDir

wnv.weather$ResultDir <- as.numeric(as.character(wnv.weather$ResultDir))

#Clean AvgSpeed

wnv.weather$AvgSpeed <- as.numeric(as.character(wnv.weather$AvgSpeed))wnv.weather$AvgSpeed[is.na(wnv.weather$AvgSpeed)] <- mean(wnv.weather$AvgSpeed, na.rm =TRUE)

Page 7: West-Nile-Virus | Kaggle

#Split weather data into O'Hare and Midway

wnv.weather.oh <- subset(wnv.weather, wnv.weather$Station == 1)wnv.weather.mw <- subset(wnv.weather, wnv.weather$Station == 2)

#Create New Weather Features: temp.mov.avg, prec.mov.avg, deg.day, accumulated degree day for each year

Tbase <- 71.6 #22deg C = 71.6deg Fwnv.weather.oh$deg.day <- 0wnv.weather.oh$deg.day.c <- 0wnv.weather.oh$acc.deg.day <- 0wnv.weather.oh$acc.deg.day.c <- 0newyear <- TRUE

#For O'Hare

start <- 1for (year in 2007:2014) { iterations <- nrow(subset(wnv.weather.oh, wnv.weather.oh$date.year == year )) for (i in start:(start+iterations-1)) { Tmean <- (wnv.weather.oh$Tmax[i] + wnv.weather.oh$Tmin[i])/2 if ( Tmean > Tbase) { wnv.weather.oh$deg.day[i] <- Tmean - Tbase wnv.weather.oh$deg.day.c[i] <- 0 } else { wnv.weather.oh$deg.day[i] <- 0 wnv.weather.oh$deg.day.c[i] <- Tbase - Tmean } if (newyear == TRUE) { newyear <- FALSE wnv.weather.oh$acc.deg.day[i] <- 0 } else { wnv.weather.oh$acc.deg.day[i] <- wnv.weather.oh$deg.day[i] + wnv.weather.oh$acc.deg.day[i-1] wnv.weather.oh$acc.deg.day.c[i] <- wnv.weather.oh$deg.day.c[i] + wnv.weather.oh$acc.deg.day.c[i-1] } } start <- start + iterations newyear <- TRUE}

Page 8: West-Nile-Virus | Kaggle

#For Midway

Tbase <- 71.6 #22deg C = 71.6deg Fwnv.weather.mw$deg.day <- 0wnv.weather.mw$deg.day.c <- 0wnv.weather.mw$acc.deg.day <- 0wnv.weather.mw$acc.deg.day.c <- 0newyear <- TRUE

start <- 1for (year in 2007:2014) { iterations <- nrow(subset(wnv.weather.mw, wnv.weather.mw$date.year == year )) for (i in start:(start+iterations-1)) { Tmean <- (wnv.weather.mw$Tmax[i] + wnv.weather.mw$Tmin[i])/2 if ( Tmean > Tbase) { wnv.weather.mw$deg.day[i] <- Tmean - Tbase wnv.weather.mw$deg.day.c[i] <- 0 } else { wnv.weather.mw$deg.day[i] <- 0 wnv.weather.mw$deg.day.c[i] <- Tbase - Tmean } if (newyear == TRUE) { newyear <- FALSE wnv.weather.mw$acc.deg.day[i] <- 0 } else { wnv.weather.mw$acc.deg.day[i] <- wnv.weather.mw$deg.day[i] + wnv.weather.mw$acc.deg.day[i-1] wnv.weather.mw$acc.deg.day.c[i] <- wnv.weather.mw$deg.day.c[i] + wnv.weather.mw$acc.deg.day.c[i-1] } } start <- start + iterations newyear <- TRUE}

#Creating Moving Averages and Moving Sums

oh.temp.mov.avg.wk <- vector()oh.temp.mov.avg.2wk <- vector()oh.prec.mov.avg.wk <- vector()

Page 9: West-Nile-Virus | Kaggle

oh.prec.mov.avg.2wk <- vector()oh.prec.mov.sum.wk <- vector()oh.prec.mov.sum.2wk <- vector()oh.dd.mov.sum.wk <- vector()oh.dd.mov.sum.2wk <- vector()

mw.temp.mov.avg.wk <- vector()mw.temp.mov.avg.2wk <- vector()mw.prec.mov.avg.wk <- vector()mw.prec.mov.avg.2wk <- vector()mw.prec.mov.sum.wk <- vector()mw.prec.mov.sum.2wk <- vector()mw.dd.mov.sum.wk <- vector()mw.dd.mov.sum.2wk <- vector()

for (year in 2007:2014) { #O'Hare assign(paste("oh.temp.mov.avg.wk", year, sep=""), runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year == year], 7, alg="C", endrule="mean")) assign(paste("oh.temp.mov.avg.2wk", year, sep=""), runmean(wnv.weather.oh$Tavg[wnv.weather.oh$date.year == year], 14, alg="C", endrule="mean")) assign(paste("oh.prec.mov.avg.wk", year, sep=""), runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], 7, alg="C", endrule="mean")) assign(paste("oh.prec.mov.avg.2wk", year, sep=""), runmean(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], 14, alg="C", endrule="mean")) assign(paste("oh.prec.mov.sum.wk", year, sep=""), runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], n = 7, cumulative = FALSE)) assign(paste("oh.prec.mov.sum.2wk", year, sep=""), runSum(wnv.weather.oh$PrecipTotal[wnv.weather.oh$date.year == year], n = 14, cumulative = FALSE)) assign(paste("oh.dd.mov.sum.wk", year, sep=""), runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year == year], n = 7, cumulative = FALSE)) assign(paste("oh.dd.mov.sum.2wk", year, sep=""), runSum(wnv.weather.oh$deg.day[wnv.weather.oh$date.year == year], n = 14, cumulative = FALSE))

Page 10: West-Nile-Virus | Kaggle

oh.temp.mov.avg.wk <- c(oh.temp.mov.avg.wk, get(paste("oh.temp.mov.avg.wk", year, sep=""))) oh.temp.mov.avg.2wk <- c(oh.temp.mov.avg.2wk, get(paste("oh.temp.mov.avg.2wk", year, sep=""))) oh.prec.mov.avg.wk <- c(oh.prec.mov.avg.wk, get(paste("oh.prec.mov.avg.wk", year, sep=""))) oh.prec.mov.avg.2wk <- c(oh.prec.mov.avg.2wk, get(paste("oh.prec.mov.avg.2wk", year, sep=""))) oh.prec.mov.sum.wk <- c(oh.prec.mov.sum.wk, get(paste("oh.prec.mov.sum.wk", year, sep=""))) oh.prec.mov.sum.2wk <- c(oh.prec.mov.sum.2wk, get(paste("oh.prec.mov.sum.2wk", year, sep=""))) oh.dd.mov.sum.wk <- c(oh.dd.mov.sum.wk, get(paste("oh.dd.mov.sum.wk", year, sep=""))) oh.dd.mov.sum.2wk <- c(oh.dd.mov.sum.2wk, get(paste("oh.dd.mov.sum.2wk", year, sep=""))) #Midway assign(paste("mw.temp.mov.avg.wk", year, sep=""), runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year == year], 7, alg="C", endrule="mean")) assign(paste("mw.temp.mov.avg.2wk", year, sep=""), runmean(wnv.weather.mw$Tavg[wnv.weather.mw$date.year == year], 14, alg="C", endrule="mean")) assign(paste("mw.prec.mov.avg.wk", year, sep=""), runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], 7, alg="C", endrule="mean")) assign(paste("mw.prec.mov.avg.2wk", year, sep=""), runmean(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], 14, alg="C", endrule="mean")) assign(paste("mw.prec.mov.sum.wk", year, sep=""), runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], n = 7, cumulative = FALSE)) assign(paste("mw.prec.mov.sum.2wk", year, sep=""), runSum(wnv.weather.mw$PrecipTotal[wnv.weather.mw$date.year == year], n = 14, cumulative = FALSE)) assign(paste("mw.dd.mov.sum.wk", year, sep=""), runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year == year], n = 7, cumulative = FALSE)) assign(paste("mw.dd.mov.sum.2wk", year, sep=""), runSum(wnv.weather.mw$deg.day[wnv.weather.mw$date.year == year], n = 14, cumulative = FALSE))

Page 11: West-Nile-Virus | Kaggle

mw.temp.mov.avg.wk <- c(mw.temp.mov.avg.wk, get(paste("mw.temp.mov.avg.wk", year, sep=""))) mw.temp.mov.avg.2wk <- c(mw.temp.mov.avg.2wk, get(paste("mw.temp.mov.avg.2wk", year, sep=""))) mw.prec.mov.avg.wk <- c(mw.prec.mov.avg.wk, get(paste("mw.prec.mov.avg.wk", year, sep=""))) mw.prec.mov.avg.2wk <- c(mw.prec.mov.avg.2wk, get(paste("mw.prec.mov.avg.2wk", year, sep=""))) mw.prec.mov.sum.wk <- c(mw.prec.mov.sum.wk, get(paste("mw.prec.mov.sum.wk", year, sep=""))) mw.prec.mov.sum.2wk <- c(mw.prec.mov.sum.2wk, get(paste("mw.prec.mov.sum.2wk", year, sep=""))) mw.dd.mov.sum.wk <- c(mw.dd.mov.sum.wk, get(paste("mw.dd.mov.sum.wk", year, sep=""))) mw.dd.mov.sum.2wk <- c(mw.dd.mov.sum.2wk, get(paste("mw.dd.mov.sum.2wk", year, sep=""))) }

#Add Variable Matrices to Weather Data Frame

wnv.weather.oh$temp.mov.avg.wk <- oh.temp.mov.avg.wk wnv.weather.oh$temp.mov.avg.2wk <- oh.temp.mov.avg.2wkwnv.weather.oh$prec.mov.avg.wk <- oh.prec.mov.avg.wkwnv.weather.oh$prec.mov.avg.2wk <- oh.prec.mov.avg.2wk

wnv.weather.oh$prec.mov.sum.wk <- oh.prec.mov.sum.wkwnv.weather.oh$prec.mov.sum.2wk <- oh.prec.mov.sum.2wkwnv.weather.oh$dd.mov.sum.wk <- oh.dd.mov.sum.wkwnv.weather.oh$dd.mov.sum.2wk <- oh.dd.mov.sum.2wk

wnv.weather.oh$prec.mov.sum.wk[is.na(wnv.weather.oh$prec.mov.sum.wk)] <- 0wnv.weather.oh$prec.mov.sum.2wk[is.na(wnv.weather.oh$prec.mov.sum.2wk)] <- 0wnv.weather.oh$dd.mov.sum.wk[is.na(wnv.weather.oh$dd.mov.sum.wk)] <- 0wnv.weather.oh$dd.mov.sum.2wk[is.na(wnv.weather.oh$dd.mov.sum.2wk)] <- 0

wnv.weather.mw$temp.mov.avg.wk <- mw.temp.mov.avg.wk

Page 12: West-Nile-Virus | Kaggle

wnv.weather.mw$temp.mov.avg.2wk <- mw.temp.mov.avg.2wkwnv.weather.mw$prec.mov.avg.wk <- mw.prec.mov.avg.wkwnv.weather.mw$prec.mov.avg.2wk <- mw.prec.mov.avg.2wk

wnv.weather.mw$prec.mov.sum.wk <- mw.prec.mov.sum.wkwnv.weather.mw$prec.mov.sum.2wk <- mw.prec.mov.sum.2wkwnv.weather.mw$dd.mov.sum.wk <- mw.dd.mov.sum.wkwnv.weather.mw$dd.mov.sum.2wk <- mw.dd.mov.sum.2wk

wnv.weather.mw$prec.mov.sum.wk[is.na(wnv.weather.mw$prec.mov.sum.wk)] <- 0wnv.weather.mw$prec.mov.sum.2wk[is.na(wnv.weather.mw$prec.mov.sum.2wk)] <- 0wnv.weather.mw$dd.mov.sum.wk[is.na(wnv.weather.mw$dd.mov.sum.wk)] <- 0wnv.weather.mw$dd.mov.sum.2wk[is.na(wnv.weather.mw$dd.mov.sum.2wk)] <- 0

options(scipen=100)

#Test Data Pre-processing------------------------------------------------------------------------------------------------------

wnv.test <- data.frame(test[,list(Date, Id, dWeek, Species, Trap, Latitude, Longitude, Species2)])

#Cast Date Typewnv.test$Date <- as.Date(wnv.test$Date)

oh.lat <- 41.995oh.long <- -87.933mw.lat <- 41.786mw.long <- -87.752

for (i in 1:nrow(wnv.test)) { #Euclidean Distances wnv.test$dist.oh[i] <- sqrt((wnv.test$Latitude[i]-oh.lat)**2+(wnv.test$Longitude[i]-oh.long)**2)*111 wnv.test$dist.mw[i] <- sqrt((wnv.test$Latitude[i]-mw.lat)**2+(wnv.test$Longitude[i]-mw.long)**2)*111 #Haversine Distance

Page 13: West-Nile-Virus | Kaggle

wnv.test$disth.oh[i] <- distHaversine(c(wnv.test$Longitude[i], wnv.test$Latitude[i]), c(oh.long, oh.lat))/1000 wnv.test$disth.mw[i] <- distHaversine(c(wnv.test$Longitude[i], wnv.test$Latitude[i]), c(mw.long, mw.lat))/1000 if (wnv.test$dist.oh[i] >= wnv.test$dist.mw[i]) { wnv.test$Station.Euc[i] <- 2 #2 is Midway } else { wnv.test$Station.Euc[i] <- 1 #1 is O'Hare } if (wnv.test$disth.oh[i] >= wnv.test$disth.mw[i]) { wnv.test$Station[i] <- 2 #2 is Midway } else { wnv.test$Station[i] <- 1 #1 is O'Hare } #Creating ISO Week Number #wnv.test$week.num[i] <- substring(ISOweek(wnv.test$Date[i]), 7, 8) }

#Concatenating Clean Weather Datasets and Merging with Training/Test Data----------------------------------------------------------

#wnv.train$Species2 <- train$Species2#wnv.test$Species2 <- test$Species2

wnv.weather.clean <- rbind(wnv.weather.oh, wnv.weather.mw)wnv.train.weather <- merge(wnv.train, wnv.weather.clean, by=c("Date", "Station"))wnv.test.weather <- merge(wnv.test, wnv.weather.clean, by=c("Date", "Station"))

#Just use one station...

#Sorting wnv.test.weather

wnv.test.weather.sorted <- wnv.test.weather[order(wnv.test.weather$Id),]

Page 14: West-Nile-Virus | Kaggle

#Training/Testing Split-----------------------------------------------------------------------------------------------------

wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year != 2007,]wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2007,]#-----------------------------------------------------

wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year != 2009,]wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2009,]#-----------------------------------------------------

#Set aside 2011 data as test, and train on the remainingwnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year != 2011,]wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2011,]

#-----------------------------------------------------

wnv.train.weather.1 <- wnv.train.weather[wnv.train.weather$date.year != 2013,]wnv.train.weather.2 <- wnv.train.weather[wnv.train.weather$date.year == 2013,]

#Clustering Locations - DBSCAN-------------------------------------------------------------------------------------------

#Potential epsilon values:#1/4 mile = 0.003623#1/2 mile = 0.007246#1 mile = 0.014493#2 mile = 0.028986#3 mile = 0.043479

Page 15: West-Nile-Virus | Kaggle

#Based on 1deg Latitude/Longitude = 69 miles

x.train <- cbind(wnv.train.weather$Longitude, wnv.train.weather$Latitude)dbs.train <- dbscan(x.train, 0.003623)plot(dbs.train, x.train)print(dbs.train)prediction.train <- predict(dbs.train, x.train)#length(prediction.train)#unique(prediction.train)

wnv.train.weather$loc.cluster.qmile <- as.factor(prediction.train)

x.test <- cbind(wnv.test.weather.sorted$Longitude, wnv.test.weather.sorted$Latitude)prediction.test <- predict(dbs.train, x.train, x.test)#length(prediction.test)#unique(prediction.test)wnv.test.weather.sorted$loc.cluster.2mile <- as.factor(prediction.test)

#Clustering Locations - K-Means----------------------------------------------------------------------------------------

#With normalized Longitude & Latitude

train.means <- apply(wnv.train.weather[c("Longitude", "Latitude")], 2, mean)train.sds <-apply(wnv.train.weather[c("Longitude", "Latitude")], 2, sd)train.loc.norm <- scale(wnv.train.weather[c("Longitude", "Latitude")],center=means,scale=sds)

test.means <- apply(wnv.test.weather.sorted[c("Longitude", "Latitude")], 2, mean)test.sds <-apply(wnv.test.weather.sorted[c("Longitude", "Latitude")], 2, sd)test.loc.norm <- scale(wnv.test.weather.sorted[c("Longitude", "Latitude")],center=means,scale=sds)

set.seed(123)clusters.kmeans.norm <- kmeans(train.loc.norm, 20)

Page 16: West-Nile-Virus | Kaggle

clusters.kmeans.norm$centersplot(clusters.kmeans.norm$centers)

pred.cluster.norm.train <- predict(clusters.kmeans.norm, data=train.loc.norm)pred.cluster.norm.test <- predict(clusters.kmeans.norm, data=test.loc.norm)

#With un-normalized Longitude & Latitude

train.loc <- wnv.train.weather[c("Longitude", "Latitude")]test.loc <- wnv.test.weather.sorted[c("Longitude", "Latitude")]

set.seed(123)clusters.kmeans <- kmeans(train.loc, 25)

clusters.kmeans$centersplot(clusters.kmeans$centers)

pred.cluster.train <- predict(clusters.kmeans.norm, data=train.loc.norm)pred.cluster.test <- predict(clusters.kmeans.norm, data=test.loc.norm)

#Assign to training/testing set

wnv.train.weather$kmeans.cluster.n <- as.factor(pred.cluster.norm.train)wnv.test.weather.sorted$kmeans.cluster.n <- as.factor(pred.cluster.norm.test)

#PCA-------------------------------------------------------------------------------------------------------------------

keeps1 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb", "StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed", "acc.deg.day", "temp.mov.avg.wk", "prec.mov.avg.wk", "prec.mov.sum.wk", "dd.mov.sum.wk" )

keeps2 <- c("Tavg", "DewPoint", "PrecipTotal", "WetBulb", "StnPressure", "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed")

#model.pca <- princomp(wnv.train.weather[keeps1], cor=TRUE,

Page 17: West-Nile-Virus | Kaggle

scores=TRUE)#model.pca

model.pca.train1 <- princomp(wnv.train.weather[keeps1], cor=TRUE, scores=TRUE)model.pca.test1 <- princomp(wnv.test.weather.sorted[keeps1], cor=TRUE, scores=TRUE)

model.pca.train <- princomp(wnv.train.weather[keeps2], cor=TRUE, scores=TRUE)model.pca.test <- princomp(wnv.test.weather.sorted[keeps2], cor=TRUE, scores=TRUE)

plot(model.pca.train)summary(model.pca.train)plot(model.pca.test)summary(model.pca.test)

plot(model.pca.train1)summary(model.pca.train1)

model.pca.train$scoresmodel.pca.test$scoresmodel.pca.train$loadingsmodel.pca.test$loadings

model.pca.train1$loadings

pca.out.train <- data.frame(model.pca.train$scores)pca.out.test <- data.frame(model.pca.test$scores)

wnv.train.weather <- cbind(wnv.train.weather, pca.out.train) wnv.test.weather.sorted <- cbind(wnv.test.weather.sorted, pca.out.test) #wnv.train.weather.pca <- cbind(wnv.train.weather, pca.out)

#Dealing with unbalanced classes---------------------------------------------------------------

Page 18: West-Nile-Virus | Kaggle

------------------------

wnv.train.no <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent == 0,]wnv.train.yes <- wnv.train.weather.1[wnv.train.weather.1$WnvPresent == 1,]

set.seed(1234)wnv.train.no.75 <- wnv.train.no[sample(1:nrow(wnv.train.no), 5852, replace=FALSE),]wnv.train.weather.1 <- rbind(wnv.train.no.75, wnv.train.yes)

set.seed(1234)wnv.train.no.50 <- wnv.train.no[sample(1:nrow(wnv.train.no), 3901, replace=FALSE),]wnv.train.weather.1 <- rbind(wnv.train.no.50, wnv.train.yes)

set.seed(1234)wnv.train.no.25 <- wnv.train.no[sample(1:nrow(wnv.train.no), 1951, replace=FALSE),]wnv.train.weather.1 <- rbind(wnv.train.no.25, wnv.train.yes)

set.seed(1234)wnv.train.no.10 <- wnv.train.no[sample(1:nrow(wnv.train.no), 780, replace=FALSE),]wnv.train.weather.1 <- rbind(wnv.train.no.10, wnv.train.yes)

set.seed(1234)wnv.train.no.5 <- wnv.train.no[sample(1:nrow(wnv.train.no), 312, replace=FALSE),]wnv.train.weather.1 <- rbind(wnv.train.no.5, wnv.train.yes)

#70/30 Split--------------------------------------------

set.seed(1234) #set.seed(4321)ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7, 0.3)) trainData <- wnv.train.weather[ind==1,]testData <- wnv.train.weather[ind==2,]

#GAM Modeling--------------------------------------------------------------

Page 19: West-Nile-Virus | Kaggle

--------------------------------------------------

#Experiment with PCA and Kmeans-------------------------------train.model <- gam(WnvPresent ~ s(dWeek) + Species2 + kmeans.cluster.n + s(Comp.1) + s(Comp.2) + s(Comp.3) + s(Comp.4) + s(Comp.5) + s(acc.deg.day) + s(temp.mov.avg.wk) + s(prec.mov.avg.2wk) #+ s(prec.mov.sum.wk) , data = trainData, family="binomial", bf.maxit = 1000)

prediction.gam <-predict(train.model, newdata = testData, type = "response")auc(testData$WnvPresent, prediction.gam)

#-------------------------------------------------------------

fitSubmit <- update(train.model, data=wnv.train.weather)pSubmit.gam<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "response")

submissionFile<-cbind(test$Id,pSubmit.gam)colnames(submissionFile)<-c("Id","WnvPresent")options("scipen"=100, "digits"=8)write.csv(submissionFile,"Submission_MaxLi134prec.csv",row.names=FALSE,quote=FALSE)

#GLM--------------------------------------------------------------------------------------------------------------------------

#Create Squared Term deg.day, dWeek, dd.mov.sum.2wk

wnv.train.weather$acc.deg.day2 <- (wnv.train.weather$acc.deg.day)**2wnv.train.weather$dWeek2 <- (wnv.train.weather$dWeek)**2wnv.train.weather$dd.mov.sum.2wk2 <- (wnv.train.weather$dd.mov.sum.2wk)**2

wnv.test.weather.sorted$acc.deg.day2 <- (wnv.test.weather.sorted

Page 20: West-Nile-Virus | Kaggle

$acc.deg.day)**2wnv.test.weather.sorted$dWeek2 <- (wnv.test.weather.sorted$dWeek)**2wnv.test.weather.sorted$dd.mov.sum.2wk2 <- (wnv.test.weather.sorted$dd.mov.sum.2wk)**2

#Experiment with PCA and Kmeans--------------------------------log.model <- glm(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n + dWeek2 + Comp.1 #+ Comp.2 #+ Comp.3 #+ Comp.4 #+ Comp.5 #+ acc.deg.day #+ acc.deg.day2 + temp.mov.avg.2wk + prec.mov.avg.2wk , data = trainData, family=binomial())summary(log.model)prediction.glm <-predict(log.model, newdata = testData, type = "response")auc(testData$WnvPresent, prediction.glm)

#------------------------------------------------------------fitSubmit <- update(log.model, data=wnv.train.weather)pSubmit.glm <-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "response")#summary(pSubmit.glm)

submissionFile<-cbind(test$Id,pSubmit.glm)colnames(submissionFile)<-c("Id","WnvPresent")options("scipen"=100, "digits"=8)write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=FALSE)

#Base Model Submission AUC: 0.64569

#Base Train AUC: 0.78259055

#LDA--------------------------------------------------------------------------------------------------------------------------

Page 21: West-Nile-Virus | Kaggle

lda.model <- lda(WnvPresent ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day + acc.deg.day2 #+ temp.mov.avg.2wk + prec.mov.avg.wk + prec.mov.sum.2wk ,data = trainData)prediction.lda <- predict(lda.model, testData)lda.df <- data.frame(prediction.lda$posterior)auc(testData$WnvPresent, lda.df[,2])

#-------------------------------------------------------------------

fitSubmit <- update(lda.model, data=wnv.train.weather)pSubmit<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "probability")lda.test <- data.frame(pSubmit$posterior)

submissionFile<-cbind(test$Id,lda.df.test[,2])colnames(submissionFile)<-c("Id","WnvPresent")options("scipen"=100, "digits"=8)write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=FALSE)

#Random Forest----------------------------------------------------------------------------------------------------------------

set.seed(1234)rf.model <- randomForest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day #+ temp.mov.avg.2wk

Page 22: West-Nile-Virus | Kaggle

#+ prec.mov.avg.2wk #+ prec.mov.sum.2wk , data = trainData, nTree = 1000)prediction.rf <- predict(rf.model, testData, type = "prob")rf.df <- data.frame(prediction.rf)auc(testData$WnvPresent.C, rf.df[,2])

fitSubmit <- update(rf.model, data=wnv.train.weather)pSubmit.rf<-predict(fitSubmit, newdata = wnv.test.weather.sorted, type = "prob")rf.df.test <- data.frame(pSubmit.rf)

submissionFile<-cbind(test$Id, rf.df.test[,2])colnames(submissionFile)<-c("Id","WnvPresent")options("scipen"=100, "digits"=8)write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=FALSE)

#PCA and Kmeans-------------------

set.seed(1234)rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 #+ acc.deg.day #+ temp.mov.avg.2wk #+ prec.mov.avg.2wk #+ prec.mov.sum.wk ,data = trainData, controls=cforest_unbiased(ntree=200, mtry=2))prediction.rf <- predict(rf.model, newdata=testData, type = "prob")rf.df <- data.frame(prediction.rf)

x <- vector()for (i in seq(2, 6232, 2)) { x <- c(x, rf.df[1, i])}

auc(testData$WnvPresent.C, x)

set.seed(1234)

Page 23: West-Nile-Virus | Kaggle

rf.model <- cforest(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 #+ Comp.3 #+ Comp.4 #+ Comp.5 #+ acc.deg.day #+ temp.mov.avg.2wk #+ prec.mov.avg.2wk ,data = wnv.train.weather, controls=cforest_unbiased(ntree=10, mtry=2, maxdepth = 10))prediction.rf <- predict(rf.model, newdata=wnv.test.weather.sorted, type = "prob")rf.df <- data.frame(prediction.rf)

#H20 Random Forest--------------library(h2o)localH2O <- h2o.init(nthreads = -1,max_mem_size = '7g')

rf.model <- h2o.randomForest(x=c(3, 9, 43, 44:48),y = 53,data = trainData, mtries = 18, sample.rate = 0.5, classification = T,ntree = 200,verbose = T)prediction.rf <- h2o.predict(rf.model,testData)

#Support Vector Machines------------------------------------------------------------------------------------------------------

#PCA and kmeans----------------------------------------------------set.seed(1234)svm.model <- svm(WnvPresent.C ~ dWeek + Species2 + kmeans.cluster.n + Comp.1 + Comp.2 + Comp.3 + Comp.4 + Comp.5 + acc.deg.day + temp.mov.avg.2wk #+ prec.mov.avg.wk

Page 24: West-Nile-Virus | Kaggle

#+ prec.mov.sum.2wk , data = trainData, probability = TRUE)#summary(svm.model)prediction.svm <- predict(svm.model, testData, probability = TRUE)svm.df <- data.frame(attr(prediction.svm, "probabilities"))auc(testData$WnvPresent.C, svm.df[,2])

#------------------------------------------------------------------set.seed(1234)fitSubmit <- update(svm.model, data=wnv.train.weather)pSubmit.svm<-predict(fitSubmit, newdata = wnv.test.weather.sorted, probability = TRUE)svm.df.test <- data.frame(attr(pSubmit.svm, "probabilities"))

submissionFile<-cbind(test$Id, svm.df.test[,2])colnames(submissionFile)<-c("Id","WnvPresent")options("scipen"=100, "digits"=8)write.csv(submissionFile,"Submission_MaxLi.csv",row.names=FALSE,quote=FALSE)

#Ensemble---------------------------------------------------------------------------------------------------------------------

gam.train <- as.matrix(prediction.gam)glm.train <- as.matrix(prediction.glm)lda.train <- as.matrix(lda.df[,2])rf.train <- as.matrix(rf.df[,2])svm.train <- as.matrix(svm.df[,2])ens.train <- cbind(gam.train, lda.train, rf.train, svm.train)

gam.test <- as.matrix(pSubmit.gam)glm.test <- as.matrix(pSubmit.glm)lda.test <- as.matrix(lda.df.test[,2])rf.test <- as.matrix(rf.df.test[,2])svm.test <- as.matrix(svm.df.test[,2])ens.test <- cbind(lda.test, rf.test, svm.test)

w <- matrix(c(0.0, 0.3, 0.1, 0.6), nrow = 4, ncol = 1)ens.model <- ens.train %*% wauc(testData$WnvPresent, ens.model)

w <- matrix(c(0.3, 0.1, 0.6), nrow = 3, ncol = 1)ens.model.test <- ens.test %*% w

Page 25: West-Nile-Virus | Kaggle

submissionFile<-cbind(test$Id,ens.model.test)colnames(submissionFile)<-c("Id","WnvPresent")options("scipen"=100, "digits"=8)write.csv(submissionFile,"Submission_MaxLiBest3.csv",row.names=FALSE,quote=FALSE)

#Bagging--------------------------------------------------------------------------------------------------------------------

set.seed(1234) ind <- sample(2, nrow(wnv.train.weather), replace=TRUE, prob=c(0.7, 0.3))trainData <- wnv.train.weather[ind==1,]testData <- wnv.train.weather[ind==2,]

set.seed(4321)all_data <- wnv.train.weatherpositions <- sample(nrow(all_data),size=floor((nrow(all_data)/10)*7))trainBag<- all_data[positions,]testBag<- all_data[-positions,]

library(foreach)length_divisor<-6iterations<-50predictions<-foreach(m=1:iterations,.combine=cbind) %do% { training_positions <- sample(nrow(trainData), size=floor((nrow(trainData)/length_divisor))) train_pos<-1:nrow(trainData) %in% training_positions gam.bag <- gam(WnvPresent ~ lo(dWeek) + Species2 + lo(Longitude, Latitude) + s(acc.deg.day), data = trainData[train_pos,] , family="binomial", bf.maxit = 45) predict(gam.bag, newdata = testData, type = "response") }predictions<-data.frame(rowMeans(predictions))auc(testData$WnvPresent.C, predictions)#--------------------------------------------------------------------library(foreach)length_divisor<-6iterations<-1000predictions<-foreach(m=1:iterations,.combine=cbind) %do% { training_positions <- sample(nrow(trainData), size=floor((nrow(trainData)/length_divisor)))

Page 26: West-Nile-Virus | Kaggle

train_pos<-1:nrow(trainData) %in% training_positions glm.bag <- glm(WnvPresent ~ dWeek + dWeek2 + Species2 + Longitude + acc.deg.day + acc.deg.day2 , data = trainData[train_pos,], family=binomial()) #summary(log.model) predict(glm.bag, newdata = testData, type = "response")}predictions<-data.frame(rowMeans(predictions))auc(testData$WnvPresent.C, predictions)

#Vs.

gam.test <- gam(WnvPresent ~ lo(dWeek) + Species2 + loc.cluster.qmile, data = trainBag, family="binomial", bf.maxit = 200)prediction.gam <-predict(gam.test, newdata = testBag, type = "response")auc(testBag$WnvPresent, prediction.gam)

#Adabag Exploration--------------------------------------------------------------------------------------------------------

wnv.adaboost <- boosting(WnvPresent.C ~ dWeek, data = trainData, boos=TRUE, mfinal = 5)