diff --git a/CODE/00_Startup.R b/CODE/00_Startup.R
index ece0e83..4312130 100644
--- a/CODE/00_Startup.R
+++ b/CODE/00_Startup.R
@@ -1,15 +1,55 @@
-## INSTALL THESE DEPENDENCIES
-install.packages("devtools",
-                 dependencies = TRUE,
-                 repos='http://cran.us.r-project.org')
-install.packages("Rcpp",
-                 dependencies = TRUE,
-                 repos='http://cran.us.r-project.org')
-
-## Update two packages not on CRAN using the devtools package.
-devtools::install_github(repo = 'geneorama/geneorama')
-devtools::install_github(repo = 'yihui/printr')
+##------------------------------------------------------------------------------
+## INSTALL DEPENDENCIES IF MISSING
+##------------------------------------------------------------------------------
+
+if(!"devtools" %in% rownames(installed.packages())){
+    install.packages("devtools",
+                     dependencies = TRUE,
+                     repos = "https://cloud.r-project.org/")
+}
+
+if(!"Rcpp" %in% rownames(installed.packages())){
+    install.packages("Rcpp",
+                     dependencies = TRUE,
+                     repos = "https://cloud.r-project.org/")
+}
+
+if(!"RSocrata" %in% rownames(installed.packages())){
+    install.packages("RSocrata",
+                     dependencies = TRUE,
+                     repos = "https://cloud.r-project.org/")
+}
+
+if(!"data.table" %in% rownames(installed.packages())){
+    install.packages("data.table",
+                     dependencies = TRUE,
+                     repos = "https://cloud.r-project.org/")
+}
+
+if(!"geneorama" %in% rownames(installed.packages())){
+    devtools::install_github('geneorama/geneorama')
+}
+
+if(!"printr" %in% rownames(installed.packages())){
+    devtools::install_github(repo = 'yihui/printr')
+}
+
+##------------------------------------------------------------------------------
+## UPDATE DEPENDENCIES IF MISSING
+##------------------------------------------------------------------------------
 
 ## Update to RSocrata 1.7.2-2 (or later) 
-## which is only on github as of March 8, 2016
-devtools::install_github(repo = 'chicago/RSocrata')
+if(installed.packages()["RSocrata","Version"] < "1.7.2-2"){
+    install.packages("RSocrata", 
+                     repos = "https://cloud.r-project.org/")
+}
+
+## Needs recent version for foverlaps
+if(installed.packages()["data.table","Version"] < "1.10.0"){
+    install.packages("data.table", 
+                     repos = "https://cloud.r-project.org/")
+}
+
+if(installed.packages()["geneorama","Version"] < "1.5.0"){
+    devtools::install_github('geneorama/geneorama')
+}
diff --git a/CODE/11_business_download.R b/CODE/11_business_download.R
index 446003f..aebcebc 100644
--- a/CODE/11_business_download.R
+++ b/CODE/11_business_download.R
@@ -1,13 +1,10 @@
-if(interactive()){
-    ##==========================================================================
-    ## INITIALIZE
-    ##==========================================================================
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach any non-standard libraries
-    geneorama::detach_nonstandard_packages()
-}
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+    
 ## Load libraries & project functions
 geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
 geneorama::sourceDir("CODE/functions/")
@@ -38,4 +35,4 @@ business[ , LICENSE_TERM_START_DATE := as.IDate(LICENSE_TERM_START_DATE, "%m/%d/
 business[ , LICENSE_TERM_EXPIRATION_DATE := as.IDate(LICENSE_TERM_EXPIRATION_DATE, "%m/%d/%Y")]
 
 ## SAVE RESULT
-saveRDS(business, "DATA/bus_license.Rds")
+saveRDS(business, "DATA/11_bus_license.Rds")
diff --git a/CODE/12_crime_download.R b/CODE/12_crime_download.R
index 9f2f489..d751481 100644
--- a/CODE/12_crime_download.R
+++ b/CODE/12_crime_download.R
@@ -1,13 +1,10 @@
-if(interactive()){
-    ##==========================================================================
-    ## INITIALIZE
-    ##==========================================================================
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach any non-standard libraries
-    geneorama::detach_nonstandard_packages()
-}
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
 ## Load libraries & project functions
 geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
 geneorama::sourceDir("CODE/functions/")
@@ -38,4 +35,4 @@ crime[ , Arrest := as.logical(Arrest)]
 crime[ , Domestic := as.logical(Domestic)]
 
 ## SAVE RESULT
-saveRDS(crime , "DATA/crime.Rds")
+saveRDS(crime , "DATA/12_crime.Rds")
diff --git a/CODE/13_food_inspection_download.R b/CODE/13_food_inspection_download.R
index 67c6d8f..2fd665c 100644
--- a/CODE/13_food_inspection_download.R
+++ b/CODE/13_food_inspection_download.R
@@ -1,13 +1,10 @@
-if(interactive()){
-    ##==========================================================================
-    ## INITIALIZE
-    ##==========================================================================
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach any non-standard libraries
-    geneorama::detach_nonstandard_packages()
-}
+##==========================================================================
+## INITIALIZE
+##==========================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
 ## Load libraries & project functions
 geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
 geneorama::sourceDir("CODE/functions/")
@@ -34,5 +31,5 @@ setnames(foodInspect, gsub("_+$","",colnames(foodInspect)))
 geneorama::convert_datatable_IntNum(foodInspect)
 geneorama::convert_datatable_DateIDate(foodInspect)
 
-## SAVE ANSWER
-saveRDS(foodInspect , "DATA/food_inspections.Rds")
+## SAVE RESULT
+saveRDS(foodInspect , "DATA/13_food_inspections.Rds")
diff --git a/CODE/14_garbage_download.R b/CODE/14_garbage_download.R
index b688e63..12685b6 100644
--- a/CODE/14_garbage_download.R
+++ b/CODE/14_garbage_download.R
@@ -1,13 +1,10 @@
-if(interactive()){
-    ##==========================================================================
-    ## INITIALIZE
-    ##==========================================================================
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach any non-standard libraries
-    geneorama::detach_nonstandard_packages()
-}
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
 ## Load libraries & project functions
 geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
 geneorama::sourceDir("CODE/functions/")
@@ -34,4 +31,5 @@ geneorama::convert_datatable_IntNum(garbageCarts)
 geneorama::convert_datatable_DateIDate(garbageCarts)
 
 ## SAVE RESULT
-saveRDS(garbageCarts , "DATA/garbage_carts.Rds")
+saveRDS(garbageCarts , "DATA/14_garbage_carts.Rds")
+
diff --git a/CODE/15_sanitation_download.R b/CODE/15_sanitation_download.R
index 8f38253..8f3099d 100644
--- a/CODE/15_sanitation_download.R
+++ b/CODE/15_sanitation_download.R
@@ -1,13 +1,10 @@
-if(interactive()){
-    ##==========================================================================
-    ## INITIALIZE
-    ##==========================================================================
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach any non-standard libraries
-    geneorama::detach_nonstandard_packages()
-}
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
 ## Load libraries & project functions
 geneorama::loadinstall_libraries(c("data.table", "RSocrata"))
 geneorama::sourceDir("CODE/functions/")
@@ -37,29 +34,5 @@ geneorama::convert_datatable_IntNum(sanitationComplaints)
 geneorama::convert_datatable_DateIDate(sanitationComplaints)
 
 ## SAVE RESULT
-saveRDS(sanitationComplaints , "DATA/sanitation_code.Rds")
-
-# ## Quick fix to download creation date, which is needed for the heat map calc
-# ## The following block can be removed after issue 68 is resolved in RSocrata
-# ## https://github.com/Chicago/RSocrata/issues/68
-# crdate <- list()
-# i <- 0
-# while(length(crdate)==0 || length(crdate[[length(crdate)]]) == 50000 ){
-#     i <- i + 1
-#     url <- paste0("https://data.cityofchicago.org/resource/me59-5fac.csv",
-#                   "?$select=creation_date&$LIMIT=50000",
-#                   "&$OFFSET=", (i - 1) * 50000)
-#     crdate[[i]] <- httr::content(httr::GET(url), as = "text")
-#     crdate[[i]] <- strsplit(crdate[[i]], "\n")[[1]][-2]
-#     print(i)
-#     print(length(crdate[[i]]))
-# }
-# crdate <- do.call(c, crdate)
-# crdate <- crdate[-1]
-# 
-# length(crdate) == nrow(sanitationComplaints)
-# 
-# crdate <- as.IDate(crdate, "%m/%d/%Y")
-# sanitationComplaints$Creation_Date <- crdate
-
+saveRDS(sanitationComplaints , "DATA/15_sanitation_code.Rds")
 
diff --git a/CODE/21_calculate_violation_matrix.R b/CODE/21_calculate_violation_matrix.R
index 69f7acd..d4eece3 100644
--- a/CODE/21_calculate_violation_matrix.R
+++ b/CODE/21_calculate_violation_matrix.R
@@ -8,25 +8,33 @@
 ## Remove all objects; perform garbage collection
 rm(list=ls())
 gc(reset=TRUE)
-## Detach libraries that are not used
-geneorama::detach_nonstandard_packages()
-## Load libraries that are used
+
+## Load libraries & project functions
 geneorama::loadinstall_libraries(c("data.table", "MASS"))
-## Load custom functions
 geneorama::sourceDir("CODE/functions/")
 
 ##==============================================================================
 ## LOAD CACHED RDS FILES
 ##==============================================================================
-foodInspect <- readRDS("DATA/food_inspections.Rds")
+foodInspect <- readRDS("DATA/13_food_inspections.Rds")
+foodInspect <- filter_foodInspect(foodInspect)
 
 ##==============================================================================
 ## CALCULATE FEATURES BASED ON FOOD INSPECTION DATA
 ##==============================================================================
 
 ## Calculate violation matrix and put into data.table with inspection id as key
+vio_mat <- calculate_violation_matrix(foodInspect[ , Violations])
+
+## Add key column to vio_mat
+vio_mat <- data.table(vio_mat, 
+                      Inspection_ID = foodInspect[ , Inspection_ID], 
+                      key = "Inspection_ID")
+
 ## calculate_violation_types calculates violations by categories:
 ##       Critical, serious, and minor violations
-violation_dat <- calculate_violation_types(foodInspect$Violations,
-                                           Inspection_ID = foodInspect$Inspection_ID)
-saveRDS(violation_dat, "DATA/violation_dat.Rds")
+violation_dat <- calculate_violation_types(violation_mat =vio_mat)
+
+## Save results
+saveRDS(vio_mat, "DATA/21_food_inspection_violation_matrix_nums.Rds")
+saveRDS(violation_dat, "DATA/21_food_inspection_violation_matrix.Rds")
diff --git a/CODE/22_calculate_heat_map_values.R b/CODE/22_calculate_heat_map_values.R
index 673ef9b..99b9def 100644
--- a/CODE/22_calculate_heat_map_values.R
+++ b/CODE/22_calculate_heat_map_values.R
@@ -1,15 +1,12 @@
-
 ##==============================================================================
 ## INITIALIZE
 ##==============================================================================
 ## Remove all objects; perform garbage collection
 rm(list=ls())
 gc(reset=TRUE)
-## Detach libraries that are not used
-geneorama::detach_nonstandard_packages()
-## Load libraries that are used
+
+## Load libraries & project functions
 geneorama::loadinstall_libraries(c("data.table", "MASS"))
-## Load custom functions
 geneorama::sourceDir("CODE/functions/")
 
 ##==============================================================================
@@ -17,10 +14,10 @@ geneorama::sourceDir("CODE/functions/")
 ##==============================================================================
 
 ## Import the key data sets used for prediction
-foodInspect <- readRDS("DATA/food_inspections.Rds")
-crime <-  readRDS("DATA/crime.Rds")
-garbageCarts <- readRDS("DATA/garbage_carts.Rds")
-sanitationComplaints <- readRDS("DATA/sanitation_code.Rds")
+foodInspect <- readRDS("DATA/13_food_inspections.Rds")
+crime <-  readRDS("DATA/12_crime.Rds")
+garbageCarts <- readRDS("DATA/14_garbage_carts.Rds")
+sanitationComplaints <- readRDS("DATA/15_sanitation_code.Rds")
 
 ## Apply filters by omitting rows that are not used in the model
 foodInspect <- filter_foodInspect(foodInspect)
@@ -58,8 +55,8 @@ sanitationComplaints_heat <-
 ##==============================================================================
 ## SAVE HEAT MAP VALUES
 ##==============================================================================
-saveRDS(burglary_heat, "DATA/burglary_heat.Rds")
-saveRDS(garbageCarts_heat, "DATA/garbageCarts_heat.Rds")
-saveRDS(sanitationComplaints_heat, "DATA/sanitationComplaints_heat.Rds")
+saveRDS(burglary_heat, "DATA/22_burglary_heat.Rds")
+saveRDS(garbageCarts_heat, "DATA/22_garbageCarts_heat.Rds")
+saveRDS(sanitationComplaints_heat, "DATA/22_sanitationComplaints_heat.Rds")
 
 
diff --git a/CODE/23_food_insp_features.R b/CODE/23_food_insp_features.R
new file mode 100644
index 0000000..fa32807
--- /dev/null
+++ b/CODE/23_food_insp_features.R
@@ -0,0 +1,74 @@
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
+## Load libraries & project functions
+geneorama::loadinstall_libraries(c("data.table", "MASS"))
+geneorama::sourceDir("CODE/functions/")
+## Import shift function
+shift <- geneorama::shift
+
+##==============================================================================
+## LOAD CACHED RDS FILES
+##==============================================================================
+foodInspect <- readRDS("DATA/13_food_inspections.Rds")
+
+## Apply row filter to remove invalid data
+foodInspect <- filter_foodInspect(foodInspect)
+
+## Remove violations from food inspection, violations are caputured in the 
+## violation matrix data
+foodInspect$Violations <- NULL
+
+## Import violation matrix which lists violations by categories:
+##       Critical, serious, and minor violations
+violation_dat <- readRDS("DATA/21_food_inspection_violation_matrix.Rds")
+
+##==============================================================================
+## CALCULATE FEATURES
+##==============================================================================
+
+## Facility_Type_Clean: Anything that is not "restaurant" or "grocery" is "other"
+foodInspect[ , Facility_Type_Clean := 
+                 categorize(x = Facility_Type,
+                            primary = list(Restaurant = "restaurant",
+                                           Grocery_Store = "grocery"),
+                            ignore.case = TRUE)]
+## Join in the violation matrix
+foodInspect <- merge(x = foodInspect, 
+                     y = violation_dat, 
+                     by = "Inspection_ID")
+## Create pass / fail flags
+foodInspect[ , pass_flag := ifelse(Results=="Pass",1, 0)]
+foodInspect[ , fail_flag := ifelse(Results=="Fail",1, 0)]
+## Set key to ensure that records are treated CHRONOLOGICALLY...
+setkey(foodInspect, License, Inspection_Date)
+## Then find previous info by "shifting" the columns (grouped by License)
+foodInspect[ , pastFail := shift(fail_flag, -1, 0), by = License]
+foodInspect[ , pastCritical := shift(criticalCount, -1, 0), by = License]
+foodInspect[ , pastSerious := shift(seriousCount, -1, 0), by = License]
+foodInspect[ , pastMinor := shift(minorCount, -1, 0), by = License]
+
+## Calcualte time since last inspection.
+## If the time is NA, this means it's the first inspection; add an inicator 
+## variable to indicate that it's the first inspection.
+foodInspect[i = TRUE , 
+            j = timeSinceLast := as.numeric(
+                Inspection_Date - shift(Inspection_Date, -1, NA)) / 365, 
+            by = License]
+foodInspect[ , firstRecord := 0]
+foodInspect[is.na(timeSinceLast), firstRecord := 1]
+foodInspect[is.na(timeSinceLast), timeSinceLast := 2]
+foodInspect[ , timeSinceLast := pmin(timeSinceLast, 2)]
+
+##==============================================================================
+## SAVE RDS
+##==============================================================================
+setkey(foodInspect, Inspection_ID)
+saveRDS(foodInspect, file.path("DATA/23_food_insp_features.Rds"))
+
+
+
diff --git a/CODE/23_generate_model_dat.R b/CODE/23_generate_model_dat.R
deleted file mode 100644
index 6ebcf0a..0000000
--- a/CODE/23_generate_model_dat.R
+++ /dev/null
@@ -1,227 +0,0 @@
-
-##==============================================================================
-## INITIALIZE
-##==============================================================================
-## Remove all objects; perform garbage collection
-rm(list=ls())
-gc(reset=TRUE)
-## Detach libraries that are not used
-geneorama::detach_nonstandard_packages()
-## Load libraries that are used
-geneorama::loadinstall_libraries(c("data.table", "MASS"))
-## Load custom functions
-geneorama::sourceDir("CODE/functions/")
-
-## Import shift function
-shift <- geneorama::shift
-
-##==============================================================================
-## LOAD CACHED RDS FILES
-##==============================================================================
-
-## Import the key data sets used for prediction
-foodInspect <- readRDS("DATA/food_inspections.Rds")
-business <- readRDS("DATA/bus_license.Rds")
-
-## For clarity, remove violations from food inspection
-## This information is caputured in the violation matrix (below)
-foodInspect$Violations <- NULL
-
-## Import violation matrix which lists violations by categories:
-##       Critical, serious, and minor violations
-violation_dat <- readRDS("DATA/violation_dat.Rds")
-
-## Import the inspectors
-inspectors <- readRDS("DATA/inspectors.Rds")
-
-## Import weather data
-weather <- readRDS("DATA/weather_20110401_20141031.Rds")
-weather_3day <- weather_3day_calc(weather)
-rm(weather)
-
-## Import the heat density values, previously calculated
-burglary_heat <- readRDS("DATA/burglary_heat.Rds")
-garbage_heat <- readRDS("DATA/garbageCarts_heat.Rds")
-sanitation_heat <- readRDS("DATA/sanitationComplaints_heat.Rds")
-
-##==============================================================================
-## Filter the primary data sets to remove unnecessary rows
-##==============================================================================
-foodInspect <- filter_foodInspect(foodInspect)
-business <- filter_business(business)
-
-foodInspect[ , Facility_Type_Clean := 
-                categorize(x = Facility_Type,
-                           primary = list(Restaurant = "restaurant",
-                                          Grocery_Store = "grocery"),
-                           ignore.case = TRUE)]
-
-##==============================================================================
-## Create basis for dat_model, which is the data that will be used in the model
-##==============================================================================
-dat_model <- foodInspect[i = TRUE , 
-                         j = list(Inspection_Date, 
-                                  License,
-                                  Inspection_Type,
-                                  Results), 
-                         keyby = Inspection_ID]
-
-##==============================================================================
-## CALCULATE FEATURES BASED ON FOOD INSPECTION DATA
-##==============================================================================
-
-## Join in the violation matrix
-dat_model <- merge(x = dat_model, 
-                   y = violation_dat, 
-                   by = "Inspection_ID")
-
-## Join in the clean facility type
-dat_model <- merge(
-    x = dat_model, 
-    y = foodInspect[ , list(Inspection_ID, 
-                            Facility_Type = Facility_Type_Clean)],
-    by = "Inspection_ID")
-
-## Create pass / fail flags
-dat_model[ , pass_flag := ifelse(Results=="Pass",1, 0)]
-dat_model[ , fail_flag := ifelse(Results=="Fail",1, 0)]
-## Set key to ensure that records are treated CHRONOLOGICALLY...
-setkey(dat_model, License, Inspection_Date)
-## Then find previous info by "shifting" the columns (grouped by License)
-dat_model[ , pastFail := shift(fail_flag, -1, 0), by = License]
-dat_model[ , pastCritical := shift(criticalCount, -1, 0), by = License]
-dat_model[ , pastSerious := shift(seriousCount, -1, 0), by = License]
-dat_model[ , pastMinor := shift(minorCount, -1, 0), by = License]
-
-## Calcualte time since last inspection.
-## If the time is NA, this means it's the first inspection; add an inicator 
-## variable to indicate that it's the first inspection.
-dat_model[i = TRUE , 
-          j = timeSinceLast := as.numeric(
-              Inspection_Date - shift(Inspection_Date, -1, NA)) / 365, 
-          by = License]
-dat_model[ , firstRecord := 0]
-dat_model[is.na(timeSinceLast), firstRecord := 1]
-dat_model[is.na(timeSinceLast), timeSinceLast := 2]
-dat_model[ , timeSinceLast := pmin(timeSinceLast, 2)]
-
-##==============================================================================
-## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA
-##==============================================================================
-
-## Create a table of matches between the food inspection and business license
-## data, based on the where the Inspection_Date falls within the business
-## license renewal
-id_table_food2business <- find_bus_id_matches(business, foodInspect)
-
-## Join business ID to dat_model
-dat_model <- merge(x = dat_model, 
-                   y = id_table_food2business, 
-                   by = "Inspection_ID")
-
-## Add LICENSE_DESCRIPTION to dat_model
-license_dec <- business[i = TRUE, 
-                        j = LICENSE_DESCRIPTION, 
-                        keyby = list(Business_ID = ID)]
-dat_model <- merge(dat_model, 
-                   license_dec, 
-                   by = "Business_ID",
-                   all.x = TRUE)
-
-## Calculate min date (by license)
-business[ , minDate := min(LICENSE_TERM_START_DATE), LICENSE_NUMBER]
-business[ , maxDate := max(LICENSE_TERM_EXPIRATION_DATE), LICENSE_NUMBER]
-
-## Calculate age at inspection:
-## Add minDate to dat_model
-dat_model <- merge(x = dat_model, 
-                   y = business[ , list(Business_ID = ID,
-                                        minDate,
-                                        maxDate)], # maxDate's just nice to have
-                   by = "Business_ID",
-                   all.x = TRUE)
-## Use minDate to calculate age
-dat_model[ , ageAtInspection := as.numeric(Inspection_Date - minDate) / 365]
-
-## CALCULATE AND MERGE IN OTHER CATEGORIES
-OtherCategories <- GenerateOtherLicenseInfo(dat_model, 
-                                            business, 
-                                            max_cat = 12)
-## Merge in results
-dat_model <- merge(x = dat_model, 
-                   y = OtherCategories, 
-                   by = "Inspection_ID",
-                   all.x = T)
-## Remove NAs in category columns and set max value to 1
-for (j in match(colnames(OtherCategories)[-1], colnames(dat_model))) {
-    set(x = dat_model, i = which(is.na(dat_model[[j]])), j = j, value = 0)
-    set(x = dat_model, j = j, value = pmin(dat_model[[j]], 1))
-}
-
-##==============================================================================
-## ATTACH "HEAT MAP" DATA
-##==============================================================================
-
-## Merge in results of heat calculations:
-setnames(burglary_heat, "heat_values", "heat_burglary")
-setnames(garbage_heat, "heat_values", "heat_garbage")
-setnames(sanitation_heat, "heat_values", "heat_sanitation")
-
-setkey(dat_model, Inspection_ID)
-setkey(burglary_heat, Inspection_ID)
-setkey(garbage_heat, Inspection_ID)
-setkey(sanitation_heat, Inspection_ID)
-dat_model <- dat_model[burglary_heat][garbage_heat][sanitation_heat]
-
-##==============================================================================
-## ATTACH INSPECTOR DATA
-##==============================================================================
-## Removing letters out front, and numbers trailing hyphen are not needed 
-## (e.g. -1006 I believe is the code for retail food license)
-inspectors[ , License := gsub('[A-z]+|-.+$', "", License)]
-## cleaning any leading zeros
-inspectors[ , License := gsub('^[0]+', "", License)]
-## removing possibly invalid license numbers
-inspectors <- inspectors[nchar(License) > 3 & Inspector_Assigned != " "]
-## if multiple inspections for same license number, then seeking the inspector 
-## on the first inspection
-inspectors <- inspectors[ , .N, by=list(License, Inspection_Date, Inspector_Assigned)]
-inspectors$N <- NULL
-## Convert to integer to match 
-inspectors[ , License := as.integer(License)]
-setkey(inspectors, License, Inspection_Date)
-setkey(dat_model, License, Inspection_Date)
-
-inspectors_deduped <- inspectors[
-    i = TRUE , 
-    j = list(Inspector_Assigned = Inspector_Assigned[1]), 
-    keyby = list(License, Inspection_Date)]
-dat_model <- merge(
-    x = dat_model,
-    y = inspectors_deduped,
-    by = c("License", "Inspection_Date"),
-    all.x = TRUE)
-
-##==============================================================================
-## MERGE IN WEATHER
-##==============================================================================
-dat_model <- merge(dat_model, 
-                   weather_3day, 
-                   by = "Inspection_Date",
-                   all.x = TRUE)
-
-# geneorama::NAsummary(dat_model)
-# str(na.omit(dat_model))
-# dat_model[ , .N, LICENSE_DESCRIPTION]
-# str(na.omit(dat_model[LICENSE_DESCRIPTION=="Retail Food Establishment"]))
-# geneorama::NAsummary(dat_model[LICENSE_DESCRIPTION=="Retail Food Establishment"])
-
-##==============================================================================
-## SAVE RDS
-##==============================================================================
-## Set the key for dat_model
-setkey(dat_model, Inspection_ID)
-saveRDS(dat_model, file.path("DATA/dat_model.Rds"))
-
-
-
diff --git a/CODE/24_bus_features.R b/CODE/24_bus_features.R
new file mode 100644
index 0000000..cdfc2a2
--- /dev/null
+++ b/CODE/24_bus_features.R
@@ -0,0 +1,87 @@
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
+## Load libraries & project functions
+geneorama::loadinstall_libraries(c("data.table", "MASS"))
+geneorama::sourceDir("CODE/functions/")
+## Import shift function
+shift <- geneorama::shift
+
+##==============================================================================
+## LOAD CACHED RDS FILES
+##==============================================================================
+business <- readRDS("DATA/11_bus_license.Rds")
+
+## Apply filter to remove invalid / unused data
+business <- filter_business(business)
+
+## Food inspection data needed for some feature calculations inspection date
+foodInspect <- readRDS("DATA/23_food_insp_features.Rds")
+
+##==============================================================================
+## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA
+##==============================================================================
+
+## Calculate min date (by license)
+business[ , minDate := min(LICENSE_TERM_START_DATE), LICENSE_NUMBER]
+business[ , maxDate := max(LICENSE_TERM_EXPIRATION_DATE), LICENSE_NUMBER]
+
+##==============================================================================
+## Use only the business data that pertains to food inspections
+##==============================================================================
+## Create a table of matches between the food inspection and business license
+## data, based on the where the Inspection_Date falls within the business
+## license renewal
+id_table_food2business <- find_bus_id_matches(business, foodInspect)
+geneorama::NAsummary(id_table_food2business)
+
+## Add food key to matched business data
+bus_matched <- merge(x = id_table_food2business, 
+                     y = business, 
+                     by = "ID", 
+                     all.y = FALSE, 
+                     all.x = TRUE)
+setkey(bus_matched, Inspection_ID)
+
+## Add business key to food data
+foodInspect <- merge(x = id_table_food2business, 
+                     y = foodInspect, 
+                     by = "Inspection_ID")
+setkey(foodInspect, Inspection_ID)
+
+## Use minDate and Inspection date to calculate age at 
+bus_matched <- bus_matched[foodInspect[,Inspection_Date,keyby=Inspection_ID]]
+bus_matched[ , ageAtInspection := as.numeric(Inspection_Date - minDate) / 365]
+
+## Remove Inspection Date to avoid conflict names when merging later
+bus_matched[ , Inspection_Date := NULL]
+
+
+## CALCULATE AND MERGE IN OTHER CATEGORIES
+OtherCategories <- GenerateOtherLicenseInfo(foodInspect, business,  max_cat = 12)
+geneorama::NAsummary(OtherCategories)
+
+## Merge in results
+bus_matched <- merge(x = bus_matched, 
+                     y = OtherCategories, 
+                     by = "Inspection_ID",
+                     all.x = T)
+## Remove NAs in category columns and set max value to 1
+for (j in match(colnames(OtherCategories)[-1], colnames(bus_matched))) {
+    set(x = bus_matched, i = which(is.na(bus_matched[[j]])), j = j, value = 0)
+    set(x = bus_matched, j = j, value = pmin(bus_matched[[j]], 1))
+}
+
+bus_matched
+
+##==============================================================================
+## SAVE RDS
+##==============================================================================
+## Set the key for dat_model
+setkey(bus_matched, Inspection_ID)
+saveRDS(bus_matched, file.path("DATA/24_bus_features.Rds"))
+
diff --git a/CODE/30_glmnet_model.R b/CODE/30_glmnet_model.R
deleted file mode 100644
index c18691d..0000000
--- a/CODE/30_glmnet_model.R
+++ /dev/null
@@ -1,134 +0,0 @@
-
-##==============================================================================
-## INITIALIZE
-##==============================================================================
-if(interactive()){
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach libraries that are not used
-    geneorama::detach_nonstandard_packages()
-}
-## Load libraries that are used
-geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2"))
-## Load custom functions
-geneorama::sourceDir("CODE/functions/")
-
-##==============================================================================
-## LOAD CACHED RDS FILES
-##==============================================================================
-dat <- readRDS("DATA/dat_model.Rds")
-
-## Only keep "Retail Food Establishment"
-dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"]
-## Remove License Description
-dat[ , LICENSE_DESCRIPTION := NULL]
-dat <- na.omit(dat)
-
-## Add criticalFound variable to dat:
-dat[ , criticalFound := pmin(1, criticalCount)]
-
-## Set the key for dat
-setkey(dat, Inspection_ID)
-
-## Match time period of original results
-# dat <- dat[Inspection_Date < "2013-09-01" | Inspection_Date > "2014-07-01"]
-
-##==============================================================================
-## CREATE MODEL DATA
-##==============================================================================
-# sort(colnames(dat))
-xmat <- dat[ , list(Inspector = Inspector_Assigned,
-                    pastSerious = pmin(pastSerious, 1),
-                    pastCritical = pmin(pastCritical, 1),
-                    timeSinceLast,
-                    ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L),
-                    consumption_on_premises_incidental_activity,
-                    tobacco_retail_over_counter,
-                    temperatureMax,
-                    heat_burglary = pmin(heat_burglary, 70),
-                    heat_sanitation = pmin(heat_sanitation, 70),
-                    heat_garbage = pmin(heat_garbage, 50),
-                    # Facility_Type,
-                    criticalFound),
-             keyby = Inspection_ID]
-mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F])
-mm <- as.data.table(mm)
-str(mm)
-colnames(mm)
-
-##==============================================================================
-## CREATE TEST / TRAIN PARTITIONS
-##==============================================================================
-## 2014-07-01 is an easy separator
-dat[Inspection_Date < "2014-07-01", range(Inspection_Date)]
-dat[Inspection_Date > "2014-07-01", range(Inspection_Date)]
-
-iiTrain <- dat[ , which(Inspection_Date < "2014-07-01")]
-iiTest <- dat[ , which(Inspection_Date > "2014-07-01")]
-
-## Check to see if any rows didn't make it through the model.matrix formula
-nrow(dat)
-nrow(xmat)
-nrow(mm)
-
-##==============================================================================
-## GLMNET MODEL
-## FOR MORE INFO SEE:
-## http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
-##==============================================================================
-
-# fit ridge regression, alpha = 0, only inspector coefficients penalized
-penalty <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0)
-
-## Find best lambda based on CV results
-## Note, The cvfit object includes the top model
-cvfit <- cv.glmnet(x = as.matrix(mm[iiTrain]),
-                   y = xmat[iiTrain,  criticalFound],
-                   family = "binomial", 
-                   alpha = 0,
-                   penalty.factor = penalty,
-                   type.measure = "deviance")
-
-## View of results
-plot(cvfit)
-cvfit$lambda
-cvfit$lambda.min
-
-##==============================================================================
-## ATTACH PREDICTIONS TO DAT
-##==============================================================================
-
-## Attach predictions for top lambda choice to the data
-dat$score <- predict(cvfit$glmnet.fit, 
-                     newx = as.matrix(mm), 
-                     s = cvfit$lambda.min,
-                     type = "response")[,1]
-
-## Identify each row as test / train
-dat$Test <- 1:nrow(dat) %in% iiTest
-dat$Train <- 1:nrow(dat) %in% iiTrain
-
-## Calculate scores for all lambda values
-allscores <- predict(cvfit$glmnet.fit, 
-                     newx = as.matrix(mm), 
-                     s = cvfit$glmnet.fit$lambda,
-                     type = "response")
-
-allscores <- as.data.table(allscores)
-setnames(allscores, 
-         cvfit$glmnet.fit$beta@Dimnames[[2]])
-
-## Identify each row as test / train
-allscores$Test <- 1:nrow(allscores) %in% iiTest
-allscores$Train <- 1:nrow(allscores) %in% iiTrain
-
-##==============================================================================
-## SAVE RESULTS
-##==============================================================================
-
-saveRDS(dat, "DATA/30_glmnet_data.Rds")
-saveRDS(cvfit, "DATA/30_glmnet_cvfit.Rds")
-saveRDS(allscores, "DATA/30_glmnet_allscores.Rds")
-
-
diff --git a/CODE/30_random_forest_model.R b/CODE/30_random_forest_model.R
deleted file mode 100644
index bc2568e..0000000
--- a/CODE/30_random_forest_model.R
+++ /dev/null
@@ -1,98 +0,0 @@
-
-##==============================================================================
-## INITIALIZE
-##==============================================================================
-if(interactive()){
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach libraries that are not used
-    geneorama::detach_nonstandard_packages()
-}
-## Load libraries that are used
-geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2"))
-## Load custom functions
-geneorama::sourceDir("CODE/functions/")
-
-##==============================================================================
-## LOAD CACHED RDS FILES
-##==============================================================================
-dat <- readRDS("DATA/dat_model.Rds")
-
-## Only keep "Retail Food Establishment"
-dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"]
-## Remove License Description
-dat[ , LICENSE_DESCRIPTION := NULL]
-dat <- na.omit(dat)
-
-## Add criticalFound variable to dat:
-dat[ , criticalFound := pmin(1, criticalCount)]
-
-## Set the key for dat
-setkey(dat, Inspection_ID)
-
-## Match time period of original results
-# dat <- dat[Inspection_Date < "2013-09-01" | Inspection_Date > "2014-07-01"]
-
-##==============================================================================
-## CREATE MODEL DATA
-##==============================================================================
-# sort(colnames(dat))
-xmat <- dat[ , list(Inspector = Inspector_Assigned,
-                    pastSerious = pmin(pastSerious, 1),
-                    pastCritical = pmin(pastCritical, 1),
-                    timeSinceLast,
-                    ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L),
-                    consumption_on_premises_incidental_activity,
-                    tobacco_retail_over_counter,
-                    temperatureMax,
-                    heat_burglary = pmin(heat_burglary, 70),
-                    heat_sanitation = pmin(heat_sanitation, 70),
-                    heat_garbage = pmin(heat_garbage, 50),
-                    # Facility_Type,
-                    criticalFound),
-             keyby = Inspection_ID]
-mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F])
-mm <- as.data.table(mm)
-str(mm)
-colnames(mm)
-
-##==============================================================================
-## CREATE TEST / TRAIN PARTITIONS
-##==============================================================================
-## 2014-07-01 is an easy separator
-dat[Inspection_Date < "2014-07-01", range(Inspection_Date)]
-dat[Inspection_Date > "2014-07-01", range(Inspection_Date)]
-
-iiTrain <- dat[ , which(Inspection_Date < "2014-07-01")]
-iiTest <- dat[ , which(Inspection_Date > "2014-07-01")]
-
-## Check to see if any rows didn't make it through the model.matrix formula
-nrow(dat)
-nrow(xmat)
-nrow(mm)
-
-##==============================================================================
-## RANDOM FOREST MODEL
-##==============================================================================
-model <- randomForest(x = as.matrix(mm[iiTrain]),
-                      y = as.factor(xmat[iiTrain, criticalFound]),
-                      importance=TRUE)
-
-## ATTACH PREDICTIONS TO DAT
-dat$score <- predict(model, as.matrix(mm),
-                    type="prob")[ , 2]
-
-## Identify each row as test / train
-dat$Test <- 1:nrow(dat) %in% iiTest
-dat$Train <- 1:nrow(dat) %in% iiTrain
-
-##==============================================================================
-## SAVE RESULTS
-##==============================================================================
-
-saveRDS(dat, "DATA/30_random_forest_data.Rds")
-saveRDS(model, "DATA/30_random_forest_model.Rds")
-
-
-
diff --git a/CODE/30a_glmnet_model.R b/CODE/30a_glmnet_model.R
new file mode 100644
index 0000000..4b5039f
--- /dev/null
+++ b/CODE/30a_glmnet_model.R
@@ -0,0 +1,142 @@
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
+## Load libraries & project functions
+geneorama::loadinstall_libraries(c("data.table", "glmnet"))
+geneorama::sourceDir("CODE/functions/")
+
+##==============================================================================
+## LOAD CACHED RDS FILES
+##==============================================================================
+food <- readRDS("DATA/23_food_insp_features.Rds")
+bus <- readRDS("DATA/24_bus_features.Rds")
+sanitarians <- readRDS("DATA/19_inspector_assignments.Rds")
+weather <- readRDS("DATA/17_mongo_weather_update.Rds")
+heat_burglary <- readRDS("DATA/22_burglary_heat.Rds")
+heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds")
+heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds")
+
+##==============================================================================
+## MERGE IN FEATURES
+##==============================================================================
+sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian]
+setnames(heat_burglary, "heat_values", "heat_burglary")
+setnames(heat_garbage, "heat_values", "heat_garbage")
+setnames(heat_sanitation, "heat_values", "heat_sanitation")
+
+dat <- copy(food)
+dat <- dat[bus]
+dat <- merge(x = dat,  y = sanitarians,  by = "Inspection_ID")
+dat <- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date")
+dat <- merge(dat, na.omit(heat_burglary),  by = "Inspection_ID")
+dat <- merge(dat, na.omit(heat_garbage),  by = "Inspection_ID")
+dat <- merge(dat, na.omit(heat_sanitation),  by = "Inspection_ID")
+
+## Set the key for dat
+setkey(dat, Inspection_ID)
+
+## Remove unnecessary data
+rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation)
+
+## Only the model data should be present
+geneorama::lll()
+
+##==============================================================================
+## FILTER ROWS
+##==============================================================================
+dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"]
+dat
+
+##==============================================================================
+## DISPLAY AVAILABLE VARIABLES
+##==============================================================================
+geneorama::NAsummary(dat)
+
+##==============================================================================
+## Add criticalFound variable to dat:
+##==============================================================================
+dat[ , criticalFound := pmin(1, criticalCount)]
+
+##==============================================================================
+## Calculate index for training data (last three months)
+##==============================================================================
+dat[ , Test := Inspection_Date >= (max(Inspection_Date) - 90)]
+
+##==============================================================================
+## CREATE MODEL DATA
+##==============================================================================
+# sort(colnames(dat))
+xmat <- dat[ , list(Inspector = as.character(sanitarian),
+                    pastSerious = pmin(pastSerious, 1),
+                    pastCritical = pmin(pastCritical, 1),
+                    timeSinceLast,
+                    ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L),
+                    consumption_on_premises_incidental_activity,
+                    tobacco,
+                    temperatureMax,
+                    heat_burglary = pmin(heat_burglary, 70),
+                    heat_sanitation = pmin(heat_sanitation, 70),
+                    heat_garbage = pmin(heat_garbage, 50),
+                    criticalFound),
+             keyby = list(Inspection_ID, Test)]
+
+## View the structure of the final xmat
+str(xmat)
+
+##==============================================================================
+## GLMNET MODEL
+##==============================================================================
+## Construct model matrix without the key values in xmat
+mm <- model.matrix(criticalFound ~ . -1, 
+                   data = xmat[ , .SD, .SDcol=-key(xmat)])
+str(mm)
+colnames(mm)
+
+# fit ridge regression, alpha = 0, only inspector coefficients penalized
+penalty <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0)
+
+## PRODUCTION version of the model which is not split for test / train
+model <- cv.glmnet(x = mm,
+                   y = xmat[ ,  criticalFound],
+                   family = "binomial", 
+                   alpha = 0,
+                   penalty.factor = penalty)
+
+## EVALUATION version of the model which is only fit on the training data
+model_eval <- cv.glmnet(x = mm[xmat$Test==FALSE, ],
+                        y = xmat[Test == FALSE ,  criticalFound],
+                        family = "binomial", 
+                        alpha = 0,
+                        penalty.factor = penalty)
+
+## Lambda
+model$lambda
+model$lambda.min
+
+## Attach predictions for top lambda choice to the data
+dat$glm_pred <- predict(model$glmnet.fit, 
+                        newx = mm, 
+                        s = model$lambda.min,
+                        type = "response")[,1]
+dat$glm_pred_test <- predict(model_eval$glmnet.fit, 
+                             newx = mm, 
+                             s = model_eval$lambda.min,
+                             type = "response")[,1]
+
+## Coefficients
+coef <- coef(model)[,1]
+inspCoef <- coef[grepl("^Inspector",names(coef))]
+inspCoef <- inspCoef[order(-inspCoef)]
+
+## Save Results
+saveRDS(dat, "DATA/30_dat.Rds")
+saveRDS(xmat, "DATA/30_xmat.Rds")
+saveRDS(mm, "DATA/30_modelmatrix.Rds")
+saveRDS(coef, "DATA/30_coef.Rds")
+saveRDS(inspCoef, "DATA/30_inspCoef.Rds")
+saveRDS(model, "DATA/30_model.Rds")
+saveRDS(model_eval, "DATA/30_model_eval.Rds")
diff --git a/CODE/31_glmnet_model_evaluation.R b/CODE/30b_glmnet_model_evaluation.R
similarity index 80%
rename from CODE/31_glmnet_model_evaluation.R
rename to CODE/30b_glmnet_model_evaluation.R
index c69f719..5b04aa4 100644
--- a/CODE/31_glmnet_model_evaluation.R
+++ b/CODE/30b_glmnet_model_evaluation.R
@@ -1,14 +1,10 @@
-
 ##==============================================================================
 ## INITIALIZE
 ##==============================================================================
-if(interactive()){
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach libraries that are not used
-    geneorama::detach_nonstandard_packages()
-}
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
 ## Load libraries that are used
 geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "ROCR"))
 ## Load custom functions
@@ -17,13 +13,25 @@ geneorama::sourceDir("CODE/functions/")
 ##==============================================================================
 ## LOAD CACHED RDS FILES
 ##==============================================================================
-dat <- readRDS("DATA/30_glmnet_data.Rds")
-cvfit <- readRDS("DATA/30_glmnet_cvfit.Rds")
-allscores <- readRDS("DATA/30_glmnet_allscores.Rds")
+dat <- readRDS("DATA/30_dat.Rds")
+cvfit <- readRDS("DATA/30_model_eval.Rds")
+mm <- readRDS("DATA/30_modelmatrix.Rds")
 
 str(cvfit)
 str(cvfit$glmnet.fit$beta)
 
+## Calculate scores for all lambda values
+allscores <- predict(cvfit$glmnet.fit, 
+                     newx = as.matrix(mm), 
+                     s = cvfit$glmnet.fit$lambda,
+                     type = "response")
+allscores <- as.data.table(allscores)
+setnames(allscores, cvfit$glmnet.fit$beta@Dimnames[[2]])
+
+## Identify each row as test / train
+allscores$Test <- dat$Test
+allscores$Train <- !dat$Test
+
 ##==============================================================================
 ## GLMNET specific diagnostics
 ##==============================================================================
@@ -48,7 +56,7 @@ plot(cvfit$cvm ~ log(cvfit$lambda))
 loglik_errors <- sapply(1:100, 
                         function(i) {
                             logLik(p = allscores[Train==TRUE][[i]], 
-                            y = dat[Train==TRUE, criticalFound])
+                            y = dat[Test==FALSE, criticalFound])
                             })
 plot(loglik_errors ~ log(cvfit$glmnet.fit$lambda))
 lines(x = log(cvfit$glmnet.fit$lambda),
@@ -65,19 +73,19 @@ plot(cvfit$glmnet.fit, xvar = "dev", label = TRUE)
 ##==============================================================================
 
 # Show gini performance of inspector model on tune data set
-dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)]
-dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)]
+dat[Test==FALSE, gini(glm_pred, criticalFound, plot=TRUE)]
+dat[Test==TRUE, gini(glm_pred, criticalFound, plot=TRUE)]
 
 ## Calculate confusion matrix values for evaluation
 calculate_confusion_values(actual = dat[Test==TRUE, criticalFound],
-                           expected = dat[Test==TRUE, score], 
+                           expected = dat[Test==TRUE, glm_pred_test], 
                            r = .25)
 
 ## Calculate matrix of confusion matrix values for evaluation
 confusion_values_test <- t(sapply(seq(0, 1 ,.01), 
                                   calculate_confusion_values,
                                   actual = dat[Test==TRUE, criticalFound],
-                                  expected = dat[Test==TRUE, score]))
+                                  expected = dat[Test==TRUE, glm_pred_test]))
 confusion_values_test <- as.data.table(confusion_values_test)
 confusion_values_test
 
@@ -85,7 +93,6 @@ confusion_values_test
 ggplot(melt(confusion_values_test, id.vars="r")) + 
     aes(x=r, y=value, colour=variable) + geom_line() + 
     geom_hline(yintercept = c(0,1))
-
 ##==============================================================================
 ## MODEL EVALUATION
 ##    - TIME SAVINGS
@@ -95,10 +102,10 @@ ggplot(melt(confusion_values_test, id.vars="r")) +
 datTest <- dat[Test == TRUE]
 
 ## Mean time savings:
-datTest[ , simulated_date_diff_mean(Inspection_Date, score, criticalFound)]
+datTest[ , simulated_date_diff_mean(Inspection_Date, glm_pred_test, criticalFound)]
 
 ## Detailed time savings:
-bins <- datTest[ , simulated_bin_summary(Inspection_Date, score, criticalFound)]
+bins <- datTest[ , simulated_bin_summary(Inspection_Date, glm_pred_test, criticalFound)]
 bins
 
 ## This calculation is the weighted average date difference, which should match
@@ -126,10 +133,10 @@ binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705
 ##==============================================================================
 
 ## Example with random values:
-# predTest <- prediction(datTest[ ,list(score, runif(.N) )], 
+# predTest <- prediction(datTest[ ,list(glm_pred_test, runif(.N) )], 
 #                        datTest[ ,list(criticalFound, criticalFound)])
 
-predTest <- prediction(datTest$score, datTest$criticalFound)
+predTest <- prediction(datTest$glm_pred_test, datTest$criticalFound)
 
 ## precision / recall
 plot(performance(predTest, "prec", "rec"), main="precision recall")
@@ -162,3 +169,4 @@ plot(performance(predTest, measure = "acc"))
 
 ## AUC
 performance(predTest, measure = "auc")@y.values[[1]]
+
diff --git a/CODE/31a_random_forest_model.R b/CODE/31a_random_forest_model.R
new file mode 100644
index 0000000..75e7040
--- /dev/null
+++ b/CODE/31a_random_forest_model.R
@@ -0,0 +1,40 @@
+##==============================================================================
+## INITIALIZE
+##==============================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
+## Load libraries that are used
+geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2"))
+## Load custom functions
+geneorama::sourceDir("CODE/functions/")
+
+##==============================================================================
+## LOAD CACHED RDS FILES
+##==============================================================================
+dat <- readRDS("DATA/30_dat.Rds")
+mm <- readRDS("DATA/30_modelmatrix.Rds")
+xmat <- readRDS("DATA/30_xmat.Rds")
+
+##==============================================================================
+## RANDOM FOREST MODEL
+##==============================================================================
+model <- randomForest(x = mm[xmat$Test==FALSE, ],
+                      y = as.factor(xmat[xmat$Test==FALSE, criticalFound]),
+                      importance=TRUE)
+
+## ATTACH PREDICTIONS TO DAT
+dat$rf_pred_test <- predict(model, 
+                            as.matrix(mm), 
+                            type="prob")[ , 2]
+
+##==============================================================================
+## SAVE RESULTS
+##==============================================================================
+
+saveRDS(dat, "DATA/31_random_forest_data.Rds")
+saveRDS(model, "DATA/31_random_forest_model.Rds")
+
+
+
diff --git a/CODE/31_random_forest_evaluation.R b/CODE/31b_random_forest_evaluation.R
similarity index 83%
rename from CODE/31_random_forest_evaluation.R
rename to CODE/31b_random_forest_evaluation.R
index a368f02..889c59e 100644
--- a/CODE/31_random_forest_evaluation.R
+++ b/CODE/31b_random_forest_evaluation.R
@@ -1,14 +1,10 @@
-
 ##==============================================================================
 ## INITIALIZE
 ##==============================================================================
-if(interactive()){
-    ## Remove all objects; perform garbage collection
-    rm(list=ls())
-    gc(reset=TRUE)
-    ## Detach libraries that are not used
-    geneorama::detach_nonstandard_packages()
-}
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
 ## Load libraries that are used
 geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2", "ROCR"))
 ## Load custom functions
@@ -17,13 +13,14 @@ geneorama::sourceDir("CODE/functions/")
 ##==============================================================================
 ## LOAD CACHED RDS FILES
 ##==============================================================================
-dat <- readRDS("DATA/30_random_forest_data.Rds")
-model <- readRDS("DATA/30_random_forest_model.Rds")
+dat <- readRDS("DATA/31_random_forest_data.Rds")
+model <- readRDS("DATA/31_random_forest_model.Rds")
 
 ##==============================================================================
 ## Random Forest specific diagnostics
 ##==============================================================================
-
+varImpPlot(model)
+importance(model)
 
 ##==============================================================================
 ## Gini and confusion matrix calculations
@@ -31,19 +28,19 @@ model <- readRDS("DATA/30_random_forest_model.Rds")
 
 ## Plot of the errors by Lambda for the out of sample Test data
 # Show gini performance of inspector model on tune data set
-dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)]
-dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)]
+dat[Test==FALSE, gini(rf_pred_test, criticalFound, plot=TRUE)]
+dat[Test==TRUE, gini(rf_pred_test, criticalFound, plot=TRUE)]
 
 ## Calculate confusion matrix values for evaluation
 calculate_confusion_values(actual = dat[Test==TRUE, criticalFound],
-                           expected = dat[Test==TRUE, score], 
+                           expected = dat[Test==TRUE, rf_pred_test], 
                            r = .25)
 
 ## Calculate matrix of confusion matrix values for evaluation
 confusion_values_test <- t(sapply(seq(0, 1 ,.01), 
                                   calculate_confusion_values,
                                   actual = dat[Test==TRUE, criticalFound],
-                                  expected = dat[Test==TRUE, score]))
+                                  expected = dat[Test==TRUE, rf_pred_test]))
 confusion_values_test <- as.data.table(confusion_values_test)
 confusion_values_test
 
@@ -61,10 +58,10 @@ ggplot(melt(confusion_values_test, id.vars="r")) +
 datTest <- dat[Test == TRUE]
 
 ## Mean time savings:
-datTest[ , simulated_date_diff_mean(Inspection_Date, score, criticalFound)]
+datTest[ , simulated_date_diff_mean(Inspection_Date, rf_pred_test, criticalFound)]
 
 ## Detailed time savings:
-bins <- datTest[ , simulated_bin_summary(Inspection_Date, score, criticalFound)]
+bins <- datTest[ , simulated_bin_summary(Inspection_Date, rf_pred_test, criticalFound)]
 bins
 
 ## This calculation is the weighted average date difference, which should match
@@ -94,10 +91,10 @@ binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705
 ##==============================================================================
 
 ## Example with random values:
-# predTest <- prediction(datTest[ ,list(score, runif(.N) )], 
+# predTest <- prediction(datTest[ ,list(rf_pred_test, runif(.N) )], 
 #                        datTest[ ,list(criticalFound, criticalFound)])
 
-predTest <- prediction(datTest$score, datTest$criticalFound)
+predTest <- prediction(datTest$rf_pred_test, datTest$criticalFound)
 
 ## precision / recall
 plot(performance(predTest, "prec", "rec"), main="precision recall")
diff --git a/CODE/functions/GenerateOtherLicenseInfo.R b/CODE/functions/GenerateOtherLicenseInfo.R
index 6c501bf..04ffd21 100644
--- a/CODE/functions/GenerateOtherLicenseInfo.R
+++ b/CODE/functions/GenerateOtherLicenseInfo.R
@@ -49,7 +49,7 @@ GenerateOtherLicenseInfo <- function(inspection_data,
                                      keep.rownames = TRUE)[order(-V2)]
     setnames(category_totals, c("cat", "N"))
     ## LIMIT CATEGORY COLUMNS
-    categories_keep <- category_totals[1:min(max_cat, nrow(category_totals)-3), 
+    categories_keep <- category_totals[1:max(min(max_cat, nrow(category_totals)-3), 1), 
                                        cat]
     tab_final <- tab[,c("dba", "addr", "d_insp", categories_keep), with=F]
     
diff --git a/CODE/functions/calculate_violation_matrix.R b/CODE/functions/calculate_violation_matrix.R
new file mode 100644
index 0000000..44d645e
--- /dev/null
+++ b/CODE/functions/calculate_violation_matrix.R
@@ -0,0 +1,26 @@
+
+## Requires a vector of violations in text format, 
+## where each element is a collapsed list of violations
+## separated by |
+
+calculate_violation_matrix <- function(violation_text){
+    
+    require(data.table)
+    
+    ## Tabluate voilation types
+    ## 1) Split violoation description by "|"
+    ## 2) use regex to extract leading digits of code number
+    ## 3) create indicator matrix of code violations
+    vio <- strsplit(violation_text,"| ",fixed=T)
+    vio_nums <- lapply(vio, 
+                       function(item) regmatches(x = item, 
+                                                 m = gregexpr(pattern = "^[0-9]+", 
+                                                              text = item)))
+    vio_mat <- geneorama::list2matrix(vio_nums, count = T)
+    vio_mat <- vio_mat[ , order(as.numeric(colnames(vio_mat)))]
+    # colnames(vio_mat)
+    # range(vio_mat)
+    
+    return(vio_mat)
+}
+
diff --git a/CODE/functions/calculate_violation_types.R b/CODE/functions/calculate_violation_types.R
index aa34e5b..45cb5f5 100644
--- a/CODE/functions/calculate_violation_types.R
+++ b/CODE/functions/calculate_violation_types.R
@@ -1,54 +1,43 @@
 
-## Requires a vector of violations in text format, 
-## where each element is a collapsed list of violations
-## separated by |
-## Requires a vector of key values to be assigned to the 
-## result for merging after the function is run.
+## Requirements for calculate_violation_types:
+##       1 Requires a matrix (within a keyed data.table) of violation 
+##         indicators where each row contains 0 or 1 to indicate the presence
+##         of a violation.  There should be columns 1:44, named "1" to "44" 
+##         (plus a key column) corresponding to the 44 violation types found in 
+##         the data.
+##
+## Uses the function calculate_violation_matrix
+## to calculate intermediate result (indicator matrix)
 ##
 
-calculate_violation_types <- function(violation_text, ...){
+calculate_violation_types <- function(violation_mat){
     
     require(data.table)
     
-    ## Tabluate voilation types
-    ## 1) Split violoation description by "|"
-    ## 2) use regex to extract leading digits of code number
-    ## 3) create indicator matrix of code violations
-    ## 4) use apply to total up each group of code violations
-    vio <- strsplit(violation_text,"| ",fixed=T)
-    vio_nums <- lapply(vio, 
-                       function(item) regmatches(x = item, 
-                                                 m = gregexpr(pattern = "^[0-9]+", 
-                                                              text = item)))
-    vio_mat <- geneorama::list2matrix(vio_nums, count = T)
-    vio_mat <- vio_mat[ , order(as.numeric(colnames(vio_mat)))]
-    # colnames(vio_mat)
-    # range(vio_mat)
-    
-    criticalCount <- apply(vio_mat[ , colnames(vio_mat) %in% 1:14], 1, sum)
-    seriousCount <- apply(vio_mat[ , colnames(vio_mat) %in% 15:29], 1, sum)
-    minorCount <- apply(vio_mat[ , colnames(vio_mat) %in% 30:44], 1, sum)
-    
-    ## Extract the key from the ...'s
-    key_vec <- list(...)
-    
-    ## Check if the key is in the ...'s
-    if(length(key_vec) != 1) {
-        stop("A key vector is required as the second argument")
+    ## Check that violation_mat is a data.table
+    if(!inherits(x = violation_mat, what = "data.table")){
+        stop("violation_mat should be a data.table, and have a defined key")
     }
     
-    ## Check key length
-    if(length(key_vec[[1]]) != length(violation_text)){
-        stop("The length of the key must match the length of the first argument")
+    ## Check for the key
+    if(length(key(violation_mat)) == 0) {
+        stop("The violation matrix should have a defined key")
     }
     
+    vio_mat <- as.matrix(nokey(violation_mat))
+    vio_key <- violation_mat[ , key(violation_mat), with = FALSE]
+    
+    ## Tabluate voilation types
+    ## use apply to total up each group of code violations
+    criticalCount <- apply(vio_mat[ , colnames(vio_mat) %in% 1:14], 1, sum)
+    seriousCount <- apply(vio_mat[ , colnames(vio_mat) %in% 15:29], 1, sum)
+    minorCount <- apply(vio_mat[ , colnames(vio_mat) %in% 30:44], 1, sum)
+
     ## Construct return values
     ret <- data.table(criticalCount,
                       seriousCount,
-                      minorCount)
-    data.table::set(x = ret, 
-                    j = names(key_vec), 
-                    value = key_vec[[1]])
-    setkeyv(ret, names(key_vec))
+                      minorCount,
+                      vio_key,
+                      key = key(violation_mat))
     return(ret)
 }
diff --git a/CODE/functions/categorize.R b/CODE/functions/categorize.R
index 482ad7d..6c1924b 100644
--- a/CODE/functions/categorize.R
+++ b/CODE/functions/categorize.R
@@ -22,11 +22,9 @@
 #' package).
 #' 
 #' @example
-#' dat <- c("restaurant", "Restaurant and bar", "grocery", "Grocery Store",
-#'          "stadium", "school", "church", "high school", "school restaurant")
-#' other = "Other"
-#' 
-#' categorize(x = dat, 
+#' categorize(x = c("restaurant", "Restaurant and bar", "grocery", 
+#'                  "Grocery Store", "stadium", "school", "church", 
+#'                  "high school", "school restaurant"), 
 #'            primary = list(Restaurant = "restaurant",
 #'                           Grocery_Store = "grocery",
 #'                           "School"), 
diff --git a/CODE/functions/filter_business.R b/CODE/functions/filter_business.R
index 61e7329..106ca2d 100644
--- a/CODE/functions/filter_business.R
+++ b/CODE/functions/filter_business.R
@@ -1,4 +1,8 @@
 filter_business <- function(business){
+    
+    ##==============================================================================
+    ## Filter rows
+    ##==============================================================================
     ## Remove rows with na values
     business <- business[!is.na(LICENSE_TERM_START_DATE)]
     business <- business[!is.na(LICENSE_TERM_EXPIRATION_DATE)]
@@ -6,6 +10,21 @@ filter_business <- function(business){
     business <- business[!(APPLICATION_TYPE %in% c("C_CAPA","C_SBA"))]
     ## Remove duplicate ids
     business <- business[!duplicated(ID)]
+    
+    ##==============================================================================
+    ## Remove columns that are never used
+    ##==============================================================================
+    business$SITE_NUMBER <- NULL
+    business$APPLICATION_CREATED_DATE <- NULL
+    business$APPLICATION_REQUIREMENTS_COMPLETE <- NULL
+    business$PAYMENT_DATE <- NULL
+    business$CONDITIONAL_APPROVAL <- NULL
+    business$LICENSE_APPROVED_FOR_ISSUANCE <- NULL
+    business$DATE_ISSUED <- NULL
+    business$LICENSE_STATUS_CHANGE_DATE <- NULL
+    business$SSA <- NULL
+    business$LOCATION <- NULL
+    
     ## Return results
     return(business)
 }
diff --git a/CODE/functions/filter_foodInspect.R b/CODE/functions/filter_foodInspect.R
index aed27d2..d8946ce 100644
--- a/CODE/functions/filter_foodInspect.R
+++ b/CODE/functions/filter_foodInspect.R
@@ -1,6 +1,13 @@
 filter_foodInspect <- function(foodInspect){
     foodInspect <- foodInspect[!is.na(Inspection_Date) & !is.na(License)]
-    foodInspect <- foodInspect[!duplicated(Inspection_ID)]
+    if(foodInspect[,any(duplicated(Inspection_ID))]){
+        warning(paste0("Removing ", nrow(foodInspect[duplicated(Inspection_ID)]), 
+                       " duplicated records from foodInspect ",
+                       "of ", nrow(foodInspect), " total records.\n",
+                       "Duplication is based on the Inspection_ID"))
+        ## Remove any duplicated Inspection_ID values
+        foodInspect <- foodInspect[!duplicated(Inspection_ID)]
+    }
     foodInspect <- foodInspect[License != 0]
     foodInspect <- foodInspect[Inspection_Date > as.IDate("2011-09-01")]
     foodInspect <- foodInspect[Inspection_Type == "Canvass"]
diff --git a/CODE/functions/find_bus_id_matches.R b/CODE/functions/find_bus_id_matches.R
index e2ebbb6..2131238 100644
--- a/CODE/functions/find_bus_id_matches.R
+++ b/CODE/functions/find_bus_id_matches.R
@@ -6,6 +6,16 @@
 ##
 
 find_bus_id_matches <- function(business, foodInspect) {
+    # browser()
+    ## Since many food businesses are inspected before they are issued a 
+    ## license,  so we need to move back the "License term start date" for 
+    ## newly issued licenses.  An adjustment of 365 days seems pretty 
+    ## reasonable, and creates lots of matches.
+    ## Note also, the `as.integer` call.  This is needed to avoid a warning.
+    business_copy <- copy(business)
+    business_copy <- business_copy[APPLICATION_TYPE=="ISSUE",
+                                   LICENSE_TERM_START_DATE := 
+                                       as.integer(LICENSE_TERM_START_DATE - 365)]
     
     ## Merge over time periods
     dat <- foverlaps(foodInspect[i = TRUE,
@@ -13,13 +23,13 @@ find_bus_id_matches <- function(business, foodInspect) {
                                  keyby = list(License, 
                                               Inspection_Date = Inspection_Date, 
                                               Inspection_Date_end = Inspection_Date)], 
-                     business[i = LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE, 
-                              j = list(Business_ID = ID),
-                              keyby = list(LICENSE_NUMBER, 
-                                           LICENSE_TERM_START_DATE, 
-                                           LICENSE_TERM_EXPIRATION_DATE)], 
+                     business_copy[i = LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE, 
+                                   j = list(ID),
+                                   keyby = list(LICENSE_NUMBER, 
+                                                LICENSE_TERM_START_DATE, 
+                                                LICENSE_TERM_EXPIRATION_DATE)], 
                      mult = "first", 
                      type = "any", 
                      nomatch = NA)
-    return(dat[ , list(Inspection_ID, Business_ID)])
+    return(dat[ , list(Inspection_ID, ID)])
 }
diff --git a/CODE/functions/nokey.R b/CODE/functions/nokey.R
new file mode 100644
index 0000000..0f02660
--- /dev/null
+++ b/CODE/functions/nokey.R
@@ -0,0 +1,16 @@
+
+nokey <- function(dt, usestrings){
+  require(data.table)
+  
+  if(!inherits(x = dt, what = "data.table")){
+    stop("dt should be a data.table")
+  }
+  
+  if(haskey(dt)){
+    ret <- dt[, .SD, .SDcol = -key(dt)]
+  } else {
+    ret <- dt
+  }
+  return(ret)
+}
+
diff --git a/CODE/prep_inspectors_for_eval.R b/CODE/prep_inspectors_for_eval.R
new file mode 100644
index 0000000..7984b46
--- /dev/null
+++ b/CODE/prep_inspectors_for_eval.R
@@ -0,0 +1,80 @@
+
+##
+## The sanitarian identies are not publically available, so measures are taken 
+## disguise their identities in this analysis.
+## This script takes the original disguised data used in the evaluation and 
+## modifies it to match the format that became available in the city in early 
+## 2015 and is still available as of Dec 2017. 
+##
+
+##==========================================================================
+## INITIALIZE
+##==========================================================================
+## Remove all objects; perform garbage collection
+rm(list=ls())
+gc(reset=TRUE)
+
+## Load libraries & project functions
+geneorama::loadinstall_libraries(c("data.table"))
+geneorama::sourceDir("CODE/functions/")
+
+##==============================================================================
+## LOAD CACHED RDS FILES
+##==============================================================================
+
+inspectors <- readRDS("DATA/inspectors.Rds")
+food <- readRDS("DATA/13_food_inspections.Rds")
+
+##==============================================================================
+## PROCESS INSPECTOR DATA
+##==============================================================================
+
+## Removing letters out front, and numbers trailing hyphen are not needed 
+## (e.g. -1006 I believe is the code for retail food license)
+inspectors[ , License := gsub('[A-z]+|-.+$', "", License)]
+
+## cleaning any leading zeros
+inspectors[ , License := gsub('^[0]+', "", License)]
+
+## removing possibly invalid license numbers
+inspectors <- inspectors[nchar(License) > 3 & Inspector_Assigned != " "]
+
+## if multiple inspections for same license number, then using the inspector 
+## on the first inspection
+inspectors <- inspectors[ , .N, by=list(License, Inspection_Date, Inspector_Assigned)]
+inspectors$N <- NULL
+
+## Convert to integer to match other data
+inspectors[ , License := as.integer(License)]
+setkey(inspectors, License, Inspection_Date)
+
+## Further deduplication
+inspectors_deduped <- inspectors[i = TRUE , 
+                                 j = list(Inspector_Assigned = Inspector_Assigned[1]), 
+                                 keyby = list(License, Inspection_Date)]
+
+## Merge in the Inspection_ID from the food records
+inspectors_deduped <- merge(x = food[ , list(License, Inspection_Date, Inspection_ID)],
+                            y = inspectors_deduped,
+                            by = c("License", "Inspection_Date"),
+                            # all.x = TRUE,
+                            sort = FALSE)
+inspectors_deduped[duplicated(Inspection_ID)]
+inspectors_deduped <- inspectors_deduped[!duplicated(Inspection_ID)]
+
+## Make the key columns of the inspector data match the output that the model
+## would have gotten from COC internal systems
+inspectors_deduped_renamed <- inspectors_deduped[i = TRUE,
+                                                 j = list(sanitarian = Inspector_Assigned),
+                                                 keyby = list(inspectionID = Inspection_ID)]
+
+geneorama::NAsummary(inspectors_deduped_renamed)
+
+##------------------------------------------------------------------------------
+## SAVE RESULT
+##------------------------------------------------------------------------------
+
+saveRDS(inspectors_deduped_renamed, "DATA/19_inspector_assignments.Rds")
+
+
+
diff --git a/DATA/11_bus_license.Rds b/DATA/11_bus_license.Rds
new file mode 100644
index 0000000..69a0cf4
Binary files /dev/null and b/DATA/11_bus_license.Rds differ
diff --git a/DATA/crime.Rds b/DATA/12_crime.Rds
similarity index 100%
rename from DATA/crime.Rds
rename to DATA/12_crime.Rds
diff --git a/DATA/food_inspections.Rds b/DATA/13_food_inspections.Rds
similarity index 100%
rename from DATA/food_inspections.Rds
rename to DATA/13_food_inspections.Rds
diff --git a/DATA/garbage_carts.Rds b/DATA/14_garbage_carts.Rds
similarity index 100%
rename from DATA/garbage_carts.Rds
rename to DATA/14_garbage_carts.Rds
diff --git a/DATA/sanitation_code.Rds b/DATA/15_sanitation_code.Rds
similarity index 100%
rename from DATA/sanitation_code.Rds
rename to DATA/15_sanitation_code.Rds
diff --git a/DATA/weather_20110401_20141031.Rds b/DATA/17_mongo_weather_update.Rds
similarity index 100%
rename from DATA/weather_20110401_20141031.Rds
rename to DATA/17_mongo_weather_update.Rds
diff --git a/DATA/19_inspector_assignments.Rds b/DATA/19_inspector_assignments.Rds
new file mode 100644
index 0000000..ba58ec2
Binary files /dev/null and b/DATA/19_inspector_assignments.Rds differ
diff --git a/DATA/21_food_inspection_violation_matrix.Rds b/DATA/21_food_inspection_violation_matrix.Rds
new file mode 100644
index 0000000..78edebe
Binary files /dev/null and b/DATA/21_food_inspection_violation_matrix.Rds differ
diff --git a/DATA/21_food_inspection_violation_matrix_nums.Rds b/DATA/21_food_inspection_violation_matrix_nums.Rds
new file mode 100644
index 0000000..1070c6e
Binary files /dev/null and b/DATA/21_food_inspection_violation_matrix_nums.Rds differ
diff --git a/DATA/burglary_heat.Rds b/DATA/22_burglary_heat.Rds
similarity index 100%
rename from DATA/burglary_heat.Rds
rename to DATA/22_burglary_heat.Rds
diff --git a/DATA/garbageCarts_heat.Rds b/DATA/22_garbageCarts_heat.Rds
similarity index 100%
rename from DATA/garbageCarts_heat.Rds
rename to DATA/22_garbageCarts_heat.Rds
diff --git a/DATA/sanitationComplaints_heat.Rds b/DATA/22_sanitationComplaints_heat.Rds
similarity index 100%
rename from DATA/sanitationComplaints_heat.Rds
rename to DATA/22_sanitationComplaints_heat.Rds
diff --git a/DATA/23_food_insp_features.Rds b/DATA/23_food_insp_features.Rds
new file mode 100644
index 0000000..268c65a
Binary files /dev/null and b/DATA/23_food_insp_features.Rds differ
diff --git a/DATA/24_bus_features.Rds b/DATA/24_bus_features.Rds
new file mode 100644
index 0000000..b0a20ca
Binary files /dev/null and b/DATA/24_bus_features.Rds differ
diff --git a/DATA/bus_license.Rds b/DATA/bus_license.Rds
deleted file mode 100644
index 3b4a7f6..0000000
Binary files a/DATA/bus_license.Rds and /dev/null differ
diff --git a/DATA/dat_model.Rds b/DATA/dat_model.Rds
deleted file mode 100644
index c1c1cd5..0000000
Binary files a/DATA/dat_model.Rds and /dev/null differ
diff --git a/DATA/violation_dat.Rds b/DATA/violation_dat.Rds
deleted file mode 100644
index 406f38f..0000000
Binary files a/DATA/violation_dat.Rds and /dev/null differ
diff --git a/REPORTS/CountComparison_aftersecondrefactor.Rmd b/REPORTS/CountComparison_aftersecondrefactor.Rmd
index 6c54918..61bb4b0 100644
--- a/REPORTS/CountComparison_aftersecondrefactor.Rmd
+++ b/REPORTS/CountComparison_aftersecondrefactor.Rmd
@@ -33,10 +33,10 @@ geneorama::sourceDir("CODE/functions/")
 ## LOAD CACHED RDS FILES
 ##==============================================================================
 ## Import the key data sets used for prediction
-foodInspect <- readRDS("DATA/food_inspections.Rds")
+foodInspect <- readRDS("DATA/13_food_inspections.Rds")
 foodInspect$Violations <- NULL
-violation_dat <- readRDS("DATA/violation_dat.Rds")
-dat_model <- readRDS("DATA/dat_model.Rds")
+violation_dat <- readRDS("DATA/22_violation_dat.Rds")
+dat_model <- readRDS("DATA/23_dat_model.Rds")
 
 ##==============================================================================
 ## ADD FIELDS AND APPLY FILTER TO foodInspect
diff --git a/REPORTS/Metric_Development.Rmd b/REPORTS/Metric_Development.Rmd
index 881f059..9721b53 100644
--- a/REPORTS/Metric_Development.Rmd
+++ b/REPORTS/Metric_Development.Rmd
@@ -293,7 +293,7 @@ ggplot(melt(comp_summary_cumsum_diff_multi,
 
 ```{r}
 geneorama::set_project_dir("food-inspections-evaluation")
-foodInspect <- readRDS("DATA/food_inspections.Rds")
+foodInspect <- readRDS("DATA/13_food_inspections.Rds")
 foodInspect <- foodInspect[ , Violations := NULL]
 
 setkey(foodInspect, Inspection_ID)
diff --git a/REPORTS/ModelSummary_20141204.Rmd b/REPORTS/ModelSummary_20141204.Rmd
index 93cd919..859f203 100644
--- a/REPORTS/ModelSummary_20141204.Rmd
+++ b/REPORTS/ModelSummary_20141204.Rmd
@@ -31,7 +31,7 @@ geneorama::set_project_dir("food-inspections-evaluation")
 ##==============================================================================
 ## LOAD CACHED RDS FILES
 ##==============================================================================
-dat <- readRDS("DATA/dat_model.Rds")
+dat <- readRDS("DATA/23_dat_model.Rds")
 
 ## Only keep "Retail Food Establishment"
 dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"]
diff --git a/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd b/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd
index c307789..ececcc4 100644
--- a/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd
+++ b/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd
@@ -70,7 +70,7 @@ rm(confusion_values_test, dat, errors, errorsTest, iiTest, iiTrain,
 geneorama::lll()
 
 ## Load Food Inspection Data
-food <- readRDS("DATA/food_inspections.Rds")
+food <- readRDS("DATA/13_food_inspections.Rds")
 ```