diff --git a/CODE/00_Startup.R b/CODE/00_Startup.R index ece0e83..4312130 100644 --- a/CODE/00_Startup.R +++ b/CODE/00_Startup.R @@ -1,15 +1,55 @@ -## INSTALL THESE DEPENDENCIES -install.packages("devtools", - dependencies = TRUE, - repos='http://cran.us.r-project.org') -install.packages("Rcpp", - dependencies = TRUE, - repos='http://cran.us.r-project.org') - -## Update two packages not on CRAN using the devtools package. -devtools::install_github(repo = 'geneorama/geneorama') -devtools::install_github(repo = 'yihui/printr') +##------------------------------------------------------------------------------ +## INSTALL DEPENDENCIES IF MISSING +##------------------------------------------------------------------------------ + +if(!"devtools" %in% rownames(installed.packages())){ + install.packages("devtools", + dependencies = TRUE, + repos = "https://cloud.r-project.org/") +} + +if(!"Rcpp" %in% rownames(installed.packages())){ + install.packages("Rcpp", + dependencies = TRUE, + repos = "https://cloud.r-project.org/") +} + +if(!"RSocrata" %in% rownames(installed.packages())){ + install.packages("RSocrata", + dependencies = TRUE, + repos = "https://cloud.r-project.org/") +} + +if(!"data.table" %in% rownames(installed.packages())){ + install.packages("data.table", + dependencies = TRUE, + repos = "https://cloud.r-project.org/") +} + +if(!"geneorama" %in% rownames(installed.packages())){ + devtools::install_github('geneorama/geneorama') +} + +if(!"printr" %in% rownames(installed.packages())){ + devtools::install_github(repo = 'yihui/printr') +} + +##------------------------------------------------------------------------------ +## UPDATE DEPENDENCIES IF MISSING +##------------------------------------------------------------------------------ ## Update to RSocrata 1.7.2-2 (or later) -## which is only on github as of March 8, 2016 -devtools::install_github(repo = 'chicago/RSocrata') +if(installed.packages()["RSocrata","Version"] < "1.7.2-2"){ + install.packages("RSocrata", + repos = "https://cloud.r-project.org/") +} + +## Needs recent version for foverlaps +if(installed.packages()["data.table","Version"] < "1.10.0"){ + install.packages("data.table", + repos = "https://cloud.r-project.org/") +} + +if(installed.packages()["geneorama","Version"] < "1.5.0"){ + devtools::install_github('geneorama/geneorama') +} diff --git a/CODE/11_business_download.R b/CODE/11_business_download.R index 446003f..aebcebc 100644 --- a/CODE/11_business_download.R +++ b/CODE/11_business_download.R @@ -1,13 +1,10 @@ -if(interactive()){ - ##========================================================================== - ## INITIALIZE - ##========================================================================== - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach any non-standard libraries - geneorama::detach_nonstandard_packages() -} +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + ## Load libraries & project functions geneorama::loadinstall_libraries(c("data.table", "RSocrata")) geneorama::sourceDir("CODE/functions/") @@ -38,4 +35,4 @@ business[ , LICENSE_TERM_START_DATE := as.IDate(LICENSE_TERM_START_DATE, "%m/%d/ business[ , LICENSE_TERM_EXPIRATION_DATE := as.IDate(LICENSE_TERM_EXPIRATION_DATE, "%m/%d/%Y")] ## SAVE RESULT -saveRDS(business, "DATA/bus_license.Rds") +saveRDS(business, "DATA/11_bus_license.Rds") diff --git a/CODE/12_crime_download.R b/CODE/12_crime_download.R index 9f2f489..d751481 100644 --- a/CODE/12_crime_download.R +++ b/CODE/12_crime_download.R @@ -1,13 +1,10 @@ -if(interactive()){ - ##========================================================================== - ## INITIALIZE - ##========================================================================== - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach any non-standard libraries - geneorama::detach_nonstandard_packages() -} +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + ## Load libraries & project functions geneorama::loadinstall_libraries(c("data.table", "RSocrata")) geneorama::sourceDir("CODE/functions/") @@ -38,4 +35,4 @@ crime[ , Arrest := as.logical(Arrest)] crime[ , Domestic := as.logical(Domestic)] ## SAVE RESULT -saveRDS(crime , "DATA/crime.Rds") +saveRDS(crime , "DATA/12_crime.Rds") diff --git a/CODE/13_food_inspection_download.R b/CODE/13_food_inspection_download.R index 67c6d8f..2fd665c 100644 --- a/CODE/13_food_inspection_download.R +++ b/CODE/13_food_inspection_download.R @@ -1,13 +1,10 @@ -if(interactive()){ - ##========================================================================== - ## INITIALIZE - ##========================================================================== - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach any non-standard libraries - geneorama::detach_nonstandard_packages() -} +##========================================================================== +## INITIALIZE +##========================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + ## Load libraries & project functions geneorama::loadinstall_libraries(c("data.table", "RSocrata")) geneorama::sourceDir("CODE/functions/") @@ -34,5 +31,5 @@ setnames(foodInspect, gsub("_+$","",colnames(foodInspect))) geneorama::convert_datatable_IntNum(foodInspect) geneorama::convert_datatable_DateIDate(foodInspect) -## SAVE ANSWER -saveRDS(foodInspect , "DATA/food_inspections.Rds") +## SAVE RESULT +saveRDS(foodInspect , "DATA/13_food_inspections.Rds") diff --git a/CODE/14_garbage_download.R b/CODE/14_garbage_download.R index b688e63..12685b6 100644 --- a/CODE/14_garbage_download.R +++ b/CODE/14_garbage_download.R @@ -1,13 +1,10 @@ -if(interactive()){ - ##========================================================================== - ## INITIALIZE - ##========================================================================== - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach any non-standard libraries - geneorama::detach_nonstandard_packages() -} +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + ## Load libraries & project functions geneorama::loadinstall_libraries(c("data.table", "RSocrata")) geneorama::sourceDir("CODE/functions/") @@ -34,4 +31,5 @@ geneorama::convert_datatable_IntNum(garbageCarts) geneorama::convert_datatable_DateIDate(garbageCarts) ## SAVE RESULT -saveRDS(garbageCarts , "DATA/garbage_carts.Rds") +saveRDS(garbageCarts , "DATA/14_garbage_carts.Rds") + diff --git a/CODE/15_sanitation_download.R b/CODE/15_sanitation_download.R index 8f38253..8f3099d 100644 --- a/CODE/15_sanitation_download.R +++ b/CODE/15_sanitation_download.R @@ -1,13 +1,10 @@ -if(interactive()){ - ##========================================================================== - ## INITIALIZE - ##========================================================================== - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach any non-standard libraries - geneorama::detach_nonstandard_packages() -} +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + ## Load libraries & project functions geneorama::loadinstall_libraries(c("data.table", "RSocrata")) geneorama::sourceDir("CODE/functions/") @@ -37,29 +34,5 @@ geneorama::convert_datatable_IntNum(sanitationComplaints) geneorama::convert_datatable_DateIDate(sanitationComplaints) ## SAVE RESULT -saveRDS(sanitationComplaints , "DATA/sanitation_code.Rds") - -# ## Quick fix to download creation date, which is needed for the heat map calc -# ## The following block can be removed after issue 68 is resolved in RSocrata -# ## https://github.com/Chicago/RSocrata/issues/68 -# crdate <- list() -# i <- 0 -# while(length(crdate)==0 || length(crdate[[length(crdate)]]) == 50000 ){ -# i <- i + 1 -# url <- paste0("https://data.cityofchicago.org/resource/me59-5fac.csv", -# "?$select=creation_date&$LIMIT=50000", -# "&$OFFSET=", (i - 1) * 50000) -# crdate[[i]] <- httr::content(httr::GET(url), as = "text") -# crdate[[i]] <- strsplit(crdate[[i]], "\n")[[1]][-2] -# print(i) -# print(length(crdate[[i]])) -# } -# crdate <- do.call(c, crdate) -# crdate <- crdate[-1] -# -# length(crdate) == nrow(sanitationComplaints) -# -# crdate <- as.IDate(crdate, "%m/%d/%Y") -# sanitationComplaints$Creation_Date <- crdate - +saveRDS(sanitationComplaints , "DATA/15_sanitation_code.Rds") diff --git a/CODE/21_calculate_violation_matrix.R b/CODE/21_calculate_violation_matrix.R index 69f7acd..d4eece3 100644 --- a/CODE/21_calculate_violation_matrix.R +++ b/CODE/21_calculate_violation_matrix.R @@ -8,25 +8,33 @@ ## Remove all objects; perform garbage collection rm(list=ls()) gc(reset=TRUE) -## Detach libraries that are not used -geneorama::detach_nonstandard_packages() -## Load libraries that are used + +## Load libraries & project functions geneorama::loadinstall_libraries(c("data.table", "MASS")) -## Load custom functions geneorama::sourceDir("CODE/functions/") ##============================================================================== ## LOAD CACHED RDS FILES ##============================================================================== -foodInspect <- readRDS("DATA/food_inspections.Rds") +foodInspect <- readRDS("DATA/13_food_inspections.Rds") +foodInspect <- filter_foodInspect(foodInspect) ##============================================================================== ## CALCULATE FEATURES BASED ON FOOD INSPECTION DATA ##============================================================================== ## Calculate violation matrix and put into data.table with inspection id as key +vio_mat <- calculate_violation_matrix(foodInspect[ , Violations]) + +## Add key column to vio_mat +vio_mat <- data.table(vio_mat, + Inspection_ID = foodInspect[ , Inspection_ID], + key = "Inspection_ID") + ## calculate_violation_types calculates violations by categories: ## Critical, serious, and minor violations -violation_dat <- calculate_violation_types(foodInspect$Violations, - Inspection_ID = foodInspect$Inspection_ID) -saveRDS(violation_dat, "DATA/violation_dat.Rds") +violation_dat <- calculate_violation_types(violation_mat =vio_mat) + +## Save results +saveRDS(vio_mat, "DATA/21_food_inspection_violation_matrix_nums.Rds") +saveRDS(violation_dat, "DATA/21_food_inspection_violation_matrix.Rds") diff --git a/CODE/22_calculate_heat_map_values.R b/CODE/22_calculate_heat_map_values.R index 673ef9b..99b9def 100644 --- a/CODE/22_calculate_heat_map_values.R +++ b/CODE/22_calculate_heat_map_values.R @@ -1,15 +1,12 @@ - ##============================================================================== ## INITIALIZE ##============================================================================== ## Remove all objects; perform garbage collection rm(list=ls()) gc(reset=TRUE) -## Detach libraries that are not used -geneorama::detach_nonstandard_packages() -## Load libraries that are used + +## Load libraries & project functions geneorama::loadinstall_libraries(c("data.table", "MASS")) -## Load custom functions geneorama::sourceDir("CODE/functions/") ##============================================================================== @@ -17,10 +14,10 @@ geneorama::sourceDir("CODE/functions/") ##============================================================================== ## Import the key data sets used for prediction -foodInspect <- readRDS("DATA/food_inspections.Rds") -crime <- readRDS("DATA/crime.Rds") -garbageCarts <- readRDS("DATA/garbage_carts.Rds") -sanitationComplaints <- readRDS("DATA/sanitation_code.Rds") +foodInspect <- readRDS("DATA/13_food_inspections.Rds") +crime <- readRDS("DATA/12_crime.Rds") +garbageCarts <- readRDS("DATA/14_garbage_carts.Rds") +sanitationComplaints <- readRDS("DATA/15_sanitation_code.Rds") ## Apply filters by omitting rows that are not used in the model foodInspect <- filter_foodInspect(foodInspect) @@ -58,8 +55,8 @@ sanitationComplaints_heat <- ##============================================================================== ## SAVE HEAT MAP VALUES ##============================================================================== -saveRDS(burglary_heat, "DATA/burglary_heat.Rds") -saveRDS(garbageCarts_heat, "DATA/garbageCarts_heat.Rds") -saveRDS(sanitationComplaints_heat, "DATA/sanitationComplaints_heat.Rds") +saveRDS(burglary_heat, "DATA/22_burglary_heat.Rds") +saveRDS(garbageCarts_heat, "DATA/22_garbageCarts_heat.Rds") +saveRDS(sanitationComplaints_heat, "DATA/22_sanitationComplaints_heat.Rds") diff --git a/CODE/23_food_insp_features.R b/CODE/23_food_insp_features.R new file mode 100644 index 0000000..fa32807 --- /dev/null +++ b/CODE/23_food_insp_features.R @@ -0,0 +1,74 @@ +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + +## Load libraries & project functions +geneorama::loadinstall_libraries(c("data.table", "MASS")) +geneorama::sourceDir("CODE/functions/") +## Import shift function +shift <- geneorama::shift + +##============================================================================== +## LOAD CACHED RDS FILES +##============================================================================== +foodInspect <- readRDS("DATA/13_food_inspections.Rds") + +## Apply row filter to remove invalid data +foodInspect <- filter_foodInspect(foodInspect) + +## Remove violations from food inspection, violations are caputured in the +## violation matrix data +foodInspect$Violations <- NULL + +## Import violation matrix which lists violations by categories: +## Critical, serious, and minor violations +violation_dat <- readRDS("DATA/21_food_inspection_violation_matrix.Rds") + +##============================================================================== +## CALCULATE FEATURES +##============================================================================== + +## Facility_Type_Clean: Anything that is not "restaurant" or "grocery" is "other" +foodInspect[ , Facility_Type_Clean := + categorize(x = Facility_Type, + primary = list(Restaurant = "restaurant", + Grocery_Store = "grocery"), + ignore.case = TRUE)] +## Join in the violation matrix +foodInspect <- merge(x = foodInspect, + y = violation_dat, + by = "Inspection_ID") +## Create pass / fail flags +foodInspect[ , pass_flag := ifelse(Results=="Pass",1, 0)] +foodInspect[ , fail_flag := ifelse(Results=="Fail",1, 0)] +## Set key to ensure that records are treated CHRONOLOGICALLY... +setkey(foodInspect, License, Inspection_Date) +## Then find previous info by "shifting" the columns (grouped by License) +foodInspect[ , pastFail := shift(fail_flag, -1, 0), by = License] +foodInspect[ , pastCritical := shift(criticalCount, -1, 0), by = License] +foodInspect[ , pastSerious := shift(seriousCount, -1, 0), by = License] +foodInspect[ , pastMinor := shift(minorCount, -1, 0), by = License] + +## Calcualte time since last inspection. +## If the time is NA, this means it's the first inspection; add an inicator +## variable to indicate that it's the first inspection. +foodInspect[i = TRUE , + j = timeSinceLast := as.numeric( + Inspection_Date - shift(Inspection_Date, -1, NA)) / 365, + by = License] +foodInspect[ , firstRecord := 0] +foodInspect[is.na(timeSinceLast), firstRecord := 1] +foodInspect[is.na(timeSinceLast), timeSinceLast := 2] +foodInspect[ , timeSinceLast := pmin(timeSinceLast, 2)] + +##============================================================================== +## SAVE RDS +##============================================================================== +setkey(foodInspect, Inspection_ID) +saveRDS(foodInspect, file.path("DATA/23_food_insp_features.Rds")) + + + diff --git a/CODE/23_generate_model_dat.R b/CODE/23_generate_model_dat.R deleted file mode 100644 index 6ebcf0a..0000000 --- a/CODE/23_generate_model_dat.R +++ /dev/null @@ -1,227 +0,0 @@ - -##============================================================================== -## INITIALIZE -##============================================================================== -## Remove all objects; perform garbage collection -rm(list=ls()) -gc(reset=TRUE) -## Detach libraries that are not used -geneorama::detach_nonstandard_packages() -## Load libraries that are used -geneorama::loadinstall_libraries(c("data.table", "MASS")) -## Load custom functions -geneorama::sourceDir("CODE/functions/") - -## Import shift function -shift <- geneorama::shift - -##============================================================================== -## LOAD CACHED RDS FILES -##============================================================================== - -## Import the key data sets used for prediction -foodInspect <- readRDS("DATA/food_inspections.Rds") -business <- readRDS("DATA/bus_license.Rds") - -## For clarity, remove violations from food inspection -## This information is caputured in the violation matrix (below) -foodInspect$Violations <- NULL - -## Import violation matrix which lists violations by categories: -## Critical, serious, and minor violations -violation_dat <- readRDS("DATA/violation_dat.Rds") - -## Import the inspectors -inspectors <- readRDS("DATA/inspectors.Rds") - -## Import weather data -weather <- readRDS("DATA/weather_20110401_20141031.Rds") -weather_3day <- weather_3day_calc(weather) -rm(weather) - -## Import the heat density values, previously calculated -burglary_heat <- readRDS("DATA/burglary_heat.Rds") -garbage_heat <- readRDS("DATA/garbageCarts_heat.Rds") -sanitation_heat <- readRDS("DATA/sanitationComplaints_heat.Rds") - -##============================================================================== -## Filter the primary data sets to remove unnecessary rows -##============================================================================== -foodInspect <- filter_foodInspect(foodInspect) -business <- filter_business(business) - -foodInspect[ , Facility_Type_Clean := - categorize(x = Facility_Type, - primary = list(Restaurant = "restaurant", - Grocery_Store = "grocery"), - ignore.case = TRUE)] - -##============================================================================== -## Create basis for dat_model, which is the data that will be used in the model -##============================================================================== -dat_model <- foodInspect[i = TRUE , - j = list(Inspection_Date, - License, - Inspection_Type, - Results), - keyby = Inspection_ID] - -##============================================================================== -## CALCULATE FEATURES BASED ON FOOD INSPECTION DATA -##============================================================================== - -## Join in the violation matrix -dat_model <- merge(x = dat_model, - y = violation_dat, - by = "Inspection_ID") - -## Join in the clean facility type -dat_model <- merge( - x = dat_model, - y = foodInspect[ , list(Inspection_ID, - Facility_Type = Facility_Type_Clean)], - by = "Inspection_ID") - -## Create pass / fail flags -dat_model[ , pass_flag := ifelse(Results=="Pass",1, 0)] -dat_model[ , fail_flag := ifelse(Results=="Fail",1, 0)] -## Set key to ensure that records are treated CHRONOLOGICALLY... -setkey(dat_model, License, Inspection_Date) -## Then find previous info by "shifting" the columns (grouped by License) -dat_model[ , pastFail := shift(fail_flag, -1, 0), by = License] -dat_model[ , pastCritical := shift(criticalCount, -1, 0), by = License] -dat_model[ , pastSerious := shift(seriousCount, -1, 0), by = License] -dat_model[ , pastMinor := shift(minorCount, -1, 0), by = License] - -## Calcualte time since last inspection. -## If the time is NA, this means it's the first inspection; add an inicator -## variable to indicate that it's the first inspection. -dat_model[i = TRUE , - j = timeSinceLast := as.numeric( - Inspection_Date - shift(Inspection_Date, -1, NA)) / 365, - by = License] -dat_model[ , firstRecord := 0] -dat_model[is.na(timeSinceLast), firstRecord := 1] -dat_model[is.na(timeSinceLast), timeSinceLast := 2] -dat_model[ , timeSinceLast := pmin(timeSinceLast, 2)] - -##============================================================================== -## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA -##============================================================================== - -## Create a table of matches between the food inspection and business license -## data, based on the where the Inspection_Date falls within the business -## license renewal -id_table_food2business <- find_bus_id_matches(business, foodInspect) - -## Join business ID to dat_model -dat_model <- merge(x = dat_model, - y = id_table_food2business, - by = "Inspection_ID") - -## Add LICENSE_DESCRIPTION to dat_model -license_dec <- business[i = TRUE, - j = LICENSE_DESCRIPTION, - keyby = list(Business_ID = ID)] -dat_model <- merge(dat_model, - license_dec, - by = "Business_ID", - all.x = TRUE) - -## Calculate min date (by license) -business[ , minDate := min(LICENSE_TERM_START_DATE), LICENSE_NUMBER] -business[ , maxDate := max(LICENSE_TERM_EXPIRATION_DATE), LICENSE_NUMBER] - -## Calculate age at inspection: -## Add minDate to dat_model -dat_model <- merge(x = dat_model, - y = business[ , list(Business_ID = ID, - minDate, - maxDate)], # maxDate's just nice to have - by = "Business_ID", - all.x = TRUE) -## Use minDate to calculate age -dat_model[ , ageAtInspection := as.numeric(Inspection_Date - minDate) / 365] - -## CALCULATE AND MERGE IN OTHER CATEGORIES -OtherCategories <- GenerateOtherLicenseInfo(dat_model, - business, - max_cat = 12) -## Merge in results -dat_model <- merge(x = dat_model, - y = OtherCategories, - by = "Inspection_ID", - all.x = T) -## Remove NAs in category columns and set max value to 1 -for (j in match(colnames(OtherCategories)[-1], colnames(dat_model))) { - set(x = dat_model, i = which(is.na(dat_model[[j]])), j = j, value = 0) - set(x = dat_model, j = j, value = pmin(dat_model[[j]], 1)) -} - -##============================================================================== -## ATTACH "HEAT MAP" DATA -##============================================================================== - -## Merge in results of heat calculations: -setnames(burglary_heat, "heat_values", "heat_burglary") -setnames(garbage_heat, "heat_values", "heat_garbage") -setnames(sanitation_heat, "heat_values", "heat_sanitation") - -setkey(dat_model, Inspection_ID) -setkey(burglary_heat, Inspection_ID) -setkey(garbage_heat, Inspection_ID) -setkey(sanitation_heat, Inspection_ID) -dat_model <- dat_model[burglary_heat][garbage_heat][sanitation_heat] - -##============================================================================== -## ATTACH INSPECTOR DATA -##============================================================================== -## Removing letters out front, and numbers trailing hyphen are not needed -## (e.g. -1006 I believe is the code for retail food license) -inspectors[ , License := gsub('[A-z]+|-.+$', "", License)] -## cleaning any leading zeros -inspectors[ , License := gsub('^[0]+', "", License)] -## removing possibly invalid license numbers -inspectors <- inspectors[nchar(License) > 3 & Inspector_Assigned != " "] -## if multiple inspections for same license number, then seeking the inspector -## on the first inspection -inspectors <- inspectors[ , .N, by=list(License, Inspection_Date, Inspector_Assigned)] -inspectors$N <- NULL -## Convert to integer to match -inspectors[ , License := as.integer(License)] -setkey(inspectors, License, Inspection_Date) -setkey(dat_model, License, Inspection_Date) - -inspectors_deduped <- inspectors[ - i = TRUE , - j = list(Inspector_Assigned = Inspector_Assigned[1]), - keyby = list(License, Inspection_Date)] -dat_model <- merge( - x = dat_model, - y = inspectors_deduped, - by = c("License", "Inspection_Date"), - all.x = TRUE) - -##============================================================================== -## MERGE IN WEATHER -##============================================================================== -dat_model <- merge(dat_model, - weather_3day, - by = "Inspection_Date", - all.x = TRUE) - -# geneorama::NAsummary(dat_model) -# str(na.omit(dat_model)) -# dat_model[ , .N, LICENSE_DESCRIPTION] -# str(na.omit(dat_model[LICENSE_DESCRIPTION=="Retail Food Establishment"])) -# geneorama::NAsummary(dat_model[LICENSE_DESCRIPTION=="Retail Food Establishment"]) - -##============================================================================== -## SAVE RDS -##============================================================================== -## Set the key for dat_model -setkey(dat_model, Inspection_ID) -saveRDS(dat_model, file.path("DATA/dat_model.Rds")) - - - diff --git a/CODE/24_bus_features.R b/CODE/24_bus_features.R new file mode 100644 index 0000000..cdfc2a2 --- /dev/null +++ b/CODE/24_bus_features.R @@ -0,0 +1,87 @@ +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + +## Load libraries & project functions +geneorama::loadinstall_libraries(c("data.table", "MASS")) +geneorama::sourceDir("CODE/functions/") +## Import shift function +shift <- geneorama::shift + +##============================================================================== +## LOAD CACHED RDS FILES +##============================================================================== +business <- readRDS("DATA/11_bus_license.Rds") + +## Apply filter to remove invalid / unused data +business <- filter_business(business) + +## Food inspection data needed for some feature calculations inspection date +foodInspect <- readRDS("DATA/23_food_insp_features.Rds") + +##============================================================================== +## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA +##============================================================================== + +## Calculate min date (by license) +business[ , minDate := min(LICENSE_TERM_START_DATE), LICENSE_NUMBER] +business[ , maxDate := max(LICENSE_TERM_EXPIRATION_DATE), LICENSE_NUMBER] + +##============================================================================== +## Use only the business data that pertains to food inspections +##============================================================================== +## Create a table of matches between the food inspection and business license +## data, based on the where the Inspection_Date falls within the business +## license renewal +id_table_food2business <- find_bus_id_matches(business, foodInspect) +geneorama::NAsummary(id_table_food2business) + +## Add food key to matched business data +bus_matched <- merge(x = id_table_food2business, + y = business, + by = "ID", + all.y = FALSE, + all.x = TRUE) +setkey(bus_matched, Inspection_ID) + +## Add business key to food data +foodInspect <- merge(x = id_table_food2business, + y = foodInspect, + by = "Inspection_ID") +setkey(foodInspect, Inspection_ID) + +## Use minDate and Inspection date to calculate age at +bus_matched <- bus_matched[foodInspect[,Inspection_Date,keyby=Inspection_ID]] +bus_matched[ , ageAtInspection := as.numeric(Inspection_Date - minDate) / 365] + +## Remove Inspection Date to avoid conflict names when merging later +bus_matched[ , Inspection_Date := NULL] + + +## CALCULATE AND MERGE IN OTHER CATEGORIES +OtherCategories <- GenerateOtherLicenseInfo(foodInspect, business, max_cat = 12) +geneorama::NAsummary(OtherCategories) + +## Merge in results +bus_matched <- merge(x = bus_matched, + y = OtherCategories, + by = "Inspection_ID", + all.x = T) +## Remove NAs in category columns and set max value to 1 +for (j in match(colnames(OtherCategories)[-1], colnames(bus_matched))) { + set(x = bus_matched, i = which(is.na(bus_matched[[j]])), j = j, value = 0) + set(x = bus_matched, j = j, value = pmin(bus_matched[[j]], 1)) +} + +bus_matched + +##============================================================================== +## SAVE RDS +##============================================================================== +## Set the key for dat_model +setkey(bus_matched, Inspection_ID) +saveRDS(bus_matched, file.path("DATA/24_bus_features.Rds")) + diff --git a/CODE/30_glmnet_model.R b/CODE/30_glmnet_model.R deleted file mode 100644 index c18691d..0000000 --- a/CODE/30_glmnet_model.R +++ /dev/null @@ -1,134 +0,0 @@ - -##============================================================================== -## INITIALIZE -##============================================================================== -if(interactive()){ - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach libraries that are not used - geneorama::detach_nonstandard_packages() -} -## Load libraries that are used -geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2")) -## Load custom functions -geneorama::sourceDir("CODE/functions/") - -##============================================================================== -## LOAD CACHED RDS FILES -##============================================================================== -dat <- readRDS("DATA/dat_model.Rds") - -## Only keep "Retail Food Establishment" -dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"] -## Remove License Description -dat[ , LICENSE_DESCRIPTION := NULL] -dat <- na.omit(dat) - -## Add criticalFound variable to dat: -dat[ , criticalFound := pmin(1, criticalCount)] - -## Set the key for dat -setkey(dat, Inspection_ID) - -## Match time period of original results -# dat <- dat[Inspection_Date < "2013-09-01" | Inspection_Date > "2014-07-01"] - -##============================================================================== -## CREATE MODEL DATA -##============================================================================== -# sort(colnames(dat)) -xmat <- dat[ , list(Inspector = Inspector_Assigned, - pastSerious = pmin(pastSerious, 1), - pastCritical = pmin(pastCritical, 1), - timeSinceLast, - ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), - consumption_on_premises_incidental_activity, - tobacco_retail_over_counter, - temperatureMax, - heat_burglary = pmin(heat_burglary, 70), - heat_sanitation = pmin(heat_sanitation, 70), - heat_garbage = pmin(heat_garbage, 50), - # Facility_Type, - criticalFound), - keyby = Inspection_ID] -mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F]) -mm <- as.data.table(mm) -str(mm) -colnames(mm) - -##============================================================================== -## CREATE TEST / TRAIN PARTITIONS -##============================================================================== -## 2014-07-01 is an easy separator -dat[Inspection_Date < "2014-07-01", range(Inspection_Date)] -dat[Inspection_Date > "2014-07-01", range(Inspection_Date)] - -iiTrain <- dat[ , which(Inspection_Date < "2014-07-01")] -iiTest <- dat[ , which(Inspection_Date > "2014-07-01")] - -## Check to see if any rows didn't make it through the model.matrix formula -nrow(dat) -nrow(xmat) -nrow(mm) - -##============================================================================== -## GLMNET MODEL -## FOR MORE INFO SEE: -## http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html -##============================================================================== - -# fit ridge regression, alpha = 0, only inspector coefficients penalized -penalty <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0) - -## Find best lambda based on CV results -## Note, The cvfit object includes the top model -cvfit <- cv.glmnet(x = as.matrix(mm[iiTrain]), - y = xmat[iiTrain, criticalFound], - family = "binomial", - alpha = 0, - penalty.factor = penalty, - type.measure = "deviance") - -## View of results -plot(cvfit) -cvfit$lambda -cvfit$lambda.min - -##============================================================================== -## ATTACH PREDICTIONS TO DAT -##============================================================================== - -## Attach predictions for top lambda choice to the data -dat$score <- predict(cvfit$glmnet.fit, - newx = as.matrix(mm), - s = cvfit$lambda.min, - type = "response")[,1] - -## Identify each row as test / train -dat$Test <- 1:nrow(dat) %in% iiTest -dat$Train <- 1:nrow(dat) %in% iiTrain - -## Calculate scores for all lambda values -allscores <- predict(cvfit$glmnet.fit, - newx = as.matrix(mm), - s = cvfit$glmnet.fit$lambda, - type = "response") - -allscores <- as.data.table(allscores) -setnames(allscores, - cvfit$glmnet.fit$beta@Dimnames[[2]]) - -## Identify each row as test / train -allscores$Test <- 1:nrow(allscores) %in% iiTest -allscores$Train <- 1:nrow(allscores) %in% iiTrain - -##============================================================================== -## SAVE RESULTS -##============================================================================== - -saveRDS(dat, "DATA/30_glmnet_data.Rds") -saveRDS(cvfit, "DATA/30_glmnet_cvfit.Rds") -saveRDS(allscores, "DATA/30_glmnet_allscores.Rds") - - diff --git a/CODE/30_random_forest_model.R b/CODE/30_random_forest_model.R deleted file mode 100644 index bc2568e..0000000 --- a/CODE/30_random_forest_model.R +++ /dev/null @@ -1,98 +0,0 @@ - -##============================================================================== -## INITIALIZE -##============================================================================== -if(interactive()){ - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach libraries that are not used - geneorama::detach_nonstandard_packages() -} -## Load libraries that are used -geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2")) -## Load custom functions -geneorama::sourceDir("CODE/functions/") - -##============================================================================== -## LOAD CACHED RDS FILES -##============================================================================== -dat <- readRDS("DATA/dat_model.Rds") - -## Only keep "Retail Food Establishment" -dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"] -## Remove License Description -dat[ , LICENSE_DESCRIPTION := NULL] -dat <- na.omit(dat) - -## Add criticalFound variable to dat: -dat[ , criticalFound := pmin(1, criticalCount)] - -## Set the key for dat -setkey(dat, Inspection_ID) - -## Match time period of original results -# dat <- dat[Inspection_Date < "2013-09-01" | Inspection_Date > "2014-07-01"] - -##============================================================================== -## CREATE MODEL DATA -##============================================================================== -# sort(colnames(dat)) -xmat <- dat[ , list(Inspector = Inspector_Assigned, - pastSerious = pmin(pastSerious, 1), - pastCritical = pmin(pastCritical, 1), - timeSinceLast, - ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), - consumption_on_premises_incidental_activity, - tobacco_retail_over_counter, - temperatureMax, - heat_burglary = pmin(heat_burglary, 70), - heat_sanitation = pmin(heat_sanitation, 70), - heat_garbage = pmin(heat_garbage, 50), - # Facility_Type, - criticalFound), - keyby = Inspection_ID] -mm <- model.matrix(criticalFound ~ . -1, data=xmat[ , -1, with=F]) -mm <- as.data.table(mm) -str(mm) -colnames(mm) - -##============================================================================== -## CREATE TEST / TRAIN PARTITIONS -##============================================================================== -## 2014-07-01 is an easy separator -dat[Inspection_Date < "2014-07-01", range(Inspection_Date)] -dat[Inspection_Date > "2014-07-01", range(Inspection_Date)] - -iiTrain <- dat[ , which(Inspection_Date < "2014-07-01")] -iiTest <- dat[ , which(Inspection_Date > "2014-07-01")] - -## Check to see if any rows didn't make it through the model.matrix formula -nrow(dat) -nrow(xmat) -nrow(mm) - -##============================================================================== -## RANDOM FOREST MODEL -##============================================================================== -model <- randomForest(x = as.matrix(mm[iiTrain]), - y = as.factor(xmat[iiTrain, criticalFound]), - importance=TRUE) - -## ATTACH PREDICTIONS TO DAT -dat$score <- predict(model, as.matrix(mm), - type="prob")[ , 2] - -## Identify each row as test / train -dat$Test <- 1:nrow(dat) %in% iiTest -dat$Train <- 1:nrow(dat) %in% iiTrain - -##============================================================================== -## SAVE RESULTS -##============================================================================== - -saveRDS(dat, "DATA/30_random_forest_data.Rds") -saveRDS(model, "DATA/30_random_forest_model.Rds") - - - diff --git a/CODE/30a_glmnet_model.R b/CODE/30a_glmnet_model.R new file mode 100644 index 0000000..4b5039f --- /dev/null +++ b/CODE/30a_glmnet_model.R @@ -0,0 +1,142 @@ +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + +## Load libraries & project functions +geneorama::loadinstall_libraries(c("data.table", "glmnet")) +geneorama::sourceDir("CODE/functions/") + +##============================================================================== +## LOAD CACHED RDS FILES +##============================================================================== +food <- readRDS("DATA/23_food_insp_features.Rds") +bus <- readRDS("DATA/24_bus_features.Rds") +sanitarians <- readRDS("DATA/19_inspector_assignments.Rds") +weather <- readRDS("DATA/17_mongo_weather_update.Rds") +heat_burglary <- readRDS("DATA/22_burglary_heat.Rds") +heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds") +heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds") + +##============================================================================== +## MERGE IN FEATURES +##============================================================================== +sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian] +setnames(heat_burglary, "heat_values", "heat_burglary") +setnames(heat_garbage, "heat_values", "heat_garbage") +setnames(heat_sanitation, "heat_values", "heat_sanitation") + +dat <- copy(food) +dat <- dat[bus] +dat <- merge(x = dat, y = sanitarians, by = "Inspection_ID") +dat <- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date") +dat <- merge(dat, na.omit(heat_burglary), by = "Inspection_ID") +dat <- merge(dat, na.omit(heat_garbage), by = "Inspection_ID") +dat <- merge(dat, na.omit(heat_sanitation), by = "Inspection_ID") + +## Set the key for dat +setkey(dat, Inspection_ID) + +## Remove unnecessary data +rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation) + +## Only the model data should be present +geneorama::lll() + +##============================================================================== +## FILTER ROWS +##============================================================================== +dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"] +dat + +##============================================================================== +## DISPLAY AVAILABLE VARIABLES +##============================================================================== +geneorama::NAsummary(dat) + +##============================================================================== +## Add criticalFound variable to dat: +##============================================================================== +dat[ , criticalFound := pmin(1, criticalCount)] + +##============================================================================== +## Calculate index for training data (last three months) +##============================================================================== +dat[ , Test := Inspection_Date >= (max(Inspection_Date) - 90)] + +##============================================================================== +## CREATE MODEL DATA +##============================================================================== +# sort(colnames(dat)) +xmat <- dat[ , list(Inspector = as.character(sanitarian), + pastSerious = pmin(pastSerious, 1), + pastCritical = pmin(pastCritical, 1), + timeSinceLast, + ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), + consumption_on_premises_incidental_activity, + tobacco, + temperatureMax, + heat_burglary = pmin(heat_burglary, 70), + heat_sanitation = pmin(heat_sanitation, 70), + heat_garbage = pmin(heat_garbage, 50), + criticalFound), + keyby = list(Inspection_ID, Test)] + +## View the structure of the final xmat +str(xmat) + +##============================================================================== +## GLMNET MODEL +##============================================================================== +## Construct model matrix without the key values in xmat +mm <- model.matrix(criticalFound ~ . -1, + data = xmat[ , .SD, .SDcol=-key(xmat)]) +str(mm) +colnames(mm) + +# fit ridge regression, alpha = 0, only inspector coefficients penalized +penalty <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0) + +## PRODUCTION version of the model which is not split for test / train +model <- cv.glmnet(x = mm, + y = xmat[ , criticalFound], + family = "binomial", + alpha = 0, + penalty.factor = penalty) + +## EVALUATION version of the model which is only fit on the training data +model_eval <- cv.glmnet(x = mm[xmat$Test==FALSE, ], + y = xmat[Test == FALSE , criticalFound], + family = "binomial", + alpha = 0, + penalty.factor = penalty) + +## Lambda +model$lambda +model$lambda.min + +## Attach predictions for top lambda choice to the data +dat$glm_pred <- predict(model$glmnet.fit, + newx = mm, + s = model$lambda.min, + type = "response")[,1] +dat$glm_pred_test <- predict(model_eval$glmnet.fit, + newx = mm, + s = model_eval$lambda.min, + type = "response")[,1] + +## Coefficients +coef <- coef(model)[,1] +inspCoef <- coef[grepl("^Inspector",names(coef))] +inspCoef <- inspCoef[order(-inspCoef)] + +## Save Results +saveRDS(dat, "DATA/30_dat.Rds") +saveRDS(xmat, "DATA/30_xmat.Rds") +saveRDS(mm, "DATA/30_modelmatrix.Rds") +saveRDS(coef, "DATA/30_coef.Rds") +saveRDS(inspCoef, "DATA/30_inspCoef.Rds") +saveRDS(model, "DATA/30_model.Rds") +saveRDS(model_eval, "DATA/30_model_eval.Rds") diff --git a/CODE/31_glmnet_model_evaluation.R b/CODE/30b_glmnet_model_evaluation.R similarity index 80% rename from CODE/31_glmnet_model_evaluation.R rename to CODE/30b_glmnet_model_evaluation.R index c69f719..5b04aa4 100644 --- a/CODE/31_glmnet_model_evaluation.R +++ b/CODE/30b_glmnet_model_evaluation.R @@ -1,14 +1,10 @@ - ##============================================================================== ## INITIALIZE ##============================================================================== -if(interactive()){ - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach libraries that are not used - geneorama::detach_nonstandard_packages() -} +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + ## Load libraries that are used geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "ROCR")) ## Load custom functions @@ -17,13 +13,25 @@ geneorama::sourceDir("CODE/functions/") ##============================================================================== ## LOAD CACHED RDS FILES ##============================================================================== -dat <- readRDS("DATA/30_glmnet_data.Rds") -cvfit <- readRDS("DATA/30_glmnet_cvfit.Rds") -allscores <- readRDS("DATA/30_glmnet_allscores.Rds") +dat <- readRDS("DATA/30_dat.Rds") +cvfit <- readRDS("DATA/30_model_eval.Rds") +mm <- readRDS("DATA/30_modelmatrix.Rds") str(cvfit) str(cvfit$glmnet.fit$beta) +## Calculate scores for all lambda values +allscores <- predict(cvfit$glmnet.fit, + newx = as.matrix(mm), + s = cvfit$glmnet.fit$lambda, + type = "response") +allscores <- as.data.table(allscores) +setnames(allscores, cvfit$glmnet.fit$beta@Dimnames[[2]]) + +## Identify each row as test / train +allscores$Test <- dat$Test +allscores$Train <- !dat$Test + ##============================================================================== ## GLMNET specific diagnostics ##============================================================================== @@ -48,7 +56,7 @@ plot(cvfit$cvm ~ log(cvfit$lambda)) loglik_errors <- sapply(1:100, function(i) { logLik(p = allscores[Train==TRUE][[i]], - y = dat[Train==TRUE, criticalFound]) + y = dat[Test==FALSE, criticalFound]) }) plot(loglik_errors ~ log(cvfit$glmnet.fit$lambda)) lines(x = log(cvfit$glmnet.fit$lambda), @@ -65,19 +73,19 @@ plot(cvfit$glmnet.fit, xvar = "dev", label = TRUE) ##============================================================================== # Show gini performance of inspector model on tune data set -dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)] -dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)] +dat[Test==FALSE, gini(glm_pred, criticalFound, plot=TRUE)] +dat[Test==TRUE, gini(glm_pred, criticalFound, plot=TRUE)] ## Calculate confusion matrix values for evaluation calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], - expected = dat[Test==TRUE, score], + expected = dat[Test==TRUE, glm_pred_test], r = .25) ## Calculate matrix of confusion matrix values for evaluation confusion_values_test <- t(sapply(seq(0, 1 ,.01), calculate_confusion_values, actual = dat[Test==TRUE, criticalFound], - expected = dat[Test==TRUE, score])) + expected = dat[Test==TRUE, glm_pred_test])) confusion_values_test <- as.data.table(confusion_values_test) confusion_values_test @@ -85,7 +93,6 @@ confusion_values_test ggplot(melt(confusion_values_test, id.vars="r")) + aes(x=r, y=value, colour=variable) + geom_line() + geom_hline(yintercept = c(0,1)) - ##============================================================================== ## MODEL EVALUATION ## - TIME SAVINGS @@ -95,10 +102,10 @@ ggplot(melt(confusion_values_test, id.vars="r")) + datTest <- dat[Test == TRUE] ## Mean time savings: -datTest[ , simulated_date_diff_mean(Inspection_Date, score, criticalFound)] +datTest[ , simulated_date_diff_mean(Inspection_Date, glm_pred_test, criticalFound)] ## Detailed time savings: -bins <- datTest[ , simulated_bin_summary(Inspection_Date, score, criticalFound)] +bins <- datTest[ , simulated_bin_summary(Inspection_Date, glm_pred_test, criticalFound)] bins ## This calculation is the weighted average date difference, which should match @@ -126,10 +133,10 @@ binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 ##============================================================================== ## Example with random values: -# predTest <- prediction(datTest[ ,list(score, runif(.N) )], +# predTest <- prediction(datTest[ ,list(glm_pred_test, runif(.N) )], # datTest[ ,list(criticalFound, criticalFound)]) -predTest <- prediction(datTest$score, datTest$criticalFound) +predTest <- prediction(datTest$glm_pred_test, datTest$criticalFound) ## precision / recall plot(performance(predTest, "prec", "rec"), main="precision recall") @@ -162,3 +169,4 @@ plot(performance(predTest, measure = "acc")) ## AUC performance(predTest, measure = "auc")@y.values[[1]] + diff --git a/CODE/31a_random_forest_model.R b/CODE/31a_random_forest_model.R new file mode 100644 index 0000000..75e7040 --- /dev/null +++ b/CODE/31a_random_forest_model.R @@ -0,0 +1,40 @@ +##============================================================================== +## INITIALIZE +##============================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + +## Load libraries that are used +geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2")) +## Load custom functions +geneorama::sourceDir("CODE/functions/") + +##============================================================================== +## LOAD CACHED RDS FILES +##============================================================================== +dat <- readRDS("DATA/30_dat.Rds") +mm <- readRDS("DATA/30_modelmatrix.Rds") +xmat <- readRDS("DATA/30_xmat.Rds") + +##============================================================================== +## RANDOM FOREST MODEL +##============================================================================== +model <- randomForest(x = mm[xmat$Test==FALSE, ], + y = as.factor(xmat[xmat$Test==FALSE, criticalFound]), + importance=TRUE) + +## ATTACH PREDICTIONS TO DAT +dat$rf_pred_test <- predict(model, + as.matrix(mm), + type="prob")[ , 2] + +##============================================================================== +## SAVE RESULTS +##============================================================================== + +saveRDS(dat, "DATA/31_random_forest_data.Rds") +saveRDS(model, "DATA/31_random_forest_model.Rds") + + + diff --git a/CODE/31_random_forest_evaluation.R b/CODE/31b_random_forest_evaluation.R similarity index 83% rename from CODE/31_random_forest_evaluation.R rename to CODE/31b_random_forest_evaluation.R index a368f02..889c59e 100644 --- a/CODE/31_random_forest_evaluation.R +++ b/CODE/31b_random_forest_evaluation.R @@ -1,14 +1,10 @@ - ##============================================================================== ## INITIALIZE ##============================================================================== -if(interactive()){ - ## Remove all objects; perform garbage collection - rm(list=ls()) - gc(reset=TRUE) - ## Detach libraries that are not used - geneorama::detach_nonstandard_packages() -} +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + ## Load libraries that are used geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2", "ROCR")) ## Load custom functions @@ -17,13 +13,14 @@ geneorama::sourceDir("CODE/functions/") ##============================================================================== ## LOAD CACHED RDS FILES ##============================================================================== -dat <- readRDS("DATA/30_random_forest_data.Rds") -model <- readRDS("DATA/30_random_forest_model.Rds") +dat <- readRDS("DATA/31_random_forest_data.Rds") +model <- readRDS("DATA/31_random_forest_model.Rds") ##============================================================================== ## Random Forest specific diagnostics ##============================================================================== - +varImpPlot(model) +importance(model) ##============================================================================== ## Gini and confusion matrix calculations @@ -31,19 +28,19 @@ model <- readRDS("DATA/30_random_forest_model.Rds") ## Plot of the errors by Lambda for the out of sample Test data # Show gini performance of inspector model on tune data set -dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)] -dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)] +dat[Test==FALSE, gini(rf_pred_test, criticalFound, plot=TRUE)] +dat[Test==TRUE, gini(rf_pred_test, criticalFound, plot=TRUE)] ## Calculate confusion matrix values for evaluation calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], - expected = dat[Test==TRUE, score], + expected = dat[Test==TRUE, rf_pred_test], r = .25) ## Calculate matrix of confusion matrix values for evaluation confusion_values_test <- t(sapply(seq(0, 1 ,.01), calculate_confusion_values, actual = dat[Test==TRUE, criticalFound], - expected = dat[Test==TRUE, score])) + expected = dat[Test==TRUE, rf_pred_test])) confusion_values_test <- as.data.table(confusion_values_test) confusion_values_test @@ -61,10 +58,10 @@ ggplot(melt(confusion_values_test, id.vars="r")) + datTest <- dat[Test == TRUE] ## Mean time savings: -datTest[ , simulated_date_diff_mean(Inspection_Date, score, criticalFound)] +datTest[ , simulated_date_diff_mean(Inspection_Date, rf_pred_test, criticalFound)] ## Detailed time savings: -bins <- datTest[ , simulated_bin_summary(Inspection_Date, score, criticalFound)] +bins <- datTest[ , simulated_bin_summary(Inspection_Date, rf_pred_test, criticalFound)] bins ## This calculation is the weighted average date difference, which should match @@ -94,10 +91,10 @@ binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 ##============================================================================== ## Example with random values: -# predTest <- prediction(datTest[ ,list(score, runif(.N) )], +# predTest <- prediction(datTest[ ,list(rf_pred_test, runif(.N) )], # datTest[ ,list(criticalFound, criticalFound)]) -predTest <- prediction(datTest$score, datTest$criticalFound) +predTest <- prediction(datTest$rf_pred_test, datTest$criticalFound) ## precision / recall plot(performance(predTest, "prec", "rec"), main="precision recall") diff --git a/CODE/functions/GenerateOtherLicenseInfo.R b/CODE/functions/GenerateOtherLicenseInfo.R index 6c501bf..04ffd21 100644 --- a/CODE/functions/GenerateOtherLicenseInfo.R +++ b/CODE/functions/GenerateOtherLicenseInfo.R @@ -49,7 +49,7 @@ GenerateOtherLicenseInfo <- function(inspection_data, keep.rownames = TRUE)[order(-V2)] setnames(category_totals, c("cat", "N")) ## LIMIT CATEGORY COLUMNS - categories_keep <- category_totals[1:min(max_cat, nrow(category_totals)-3), + categories_keep <- category_totals[1:max(min(max_cat, nrow(category_totals)-3), 1), cat] tab_final <- tab[,c("dba", "addr", "d_insp", categories_keep), with=F] diff --git a/CODE/functions/calculate_violation_matrix.R b/CODE/functions/calculate_violation_matrix.R new file mode 100644 index 0000000..44d645e --- /dev/null +++ b/CODE/functions/calculate_violation_matrix.R @@ -0,0 +1,26 @@ + +## Requires a vector of violations in text format, +## where each element is a collapsed list of violations +## separated by | + +calculate_violation_matrix <- function(violation_text){ + + require(data.table) + + ## Tabluate voilation types + ## 1) Split violoation description by "|" + ## 2) use regex to extract leading digits of code number + ## 3) create indicator matrix of code violations + vio <- strsplit(violation_text,"| ",fixed=T) + vio_nums <- lapply(vio, + function(item) regmatches(x = item, + m = gregexpr(pattern = "^[0-9]+", + text = item))) + vio_mat <- geneorama::list2matrix(vio_nums, count = T) + vio_mat <- vio_mat[ , order(as.numeric(colnames(vio_mat)))] + # colnames(vio_mat) + # range(vio_mat) + + return(vio_mat) +} + diff --git a/CODE/functions/calculate_violation_types.R b/CODE/functions/calculate_violation_types.R index aa34e5b..45cb5f5 100644 --- a/CODE/functions/calculate_violation_types.R +++ b/CODE/functions/calculate_violation_types.R @@ -1,54 +1,43 @@ -## Requires a vector of violations in text format, -## where each element is a collapsed list of violations -## separated by | -## Requires a vector of key values to be assigned to the -## result for merging after the function is run. +## Requirements for calculate_violation_types: +## 1 Requires a matrix (within a keyed data.table) of violation +## indicators where each row contains 0 or 1 to indicate the presence +## of a violation. There should be columns 1:44, named "1" to "44" +## (plus a key column) corresponding to the 44 violation types found in +## the data. +## +## Uses the function calculate_violation_matrix +## to calculate intermediate result (indicator matrix) ## -calculate_violation_types <- function(violation_text, ...){ +calculate_violation_types <- function(violation_mat){ require(data.table) - ## Tabluate voilation types - ## 1) Split violoation description by "|" - ## 2) use regex to extract leading digits of code number - ## 3) create indicator matrix of code violations - ## 4) use apply to total up each group of code violations - vio <- strsplit(violation_text,"| ",fixed=T) - vio_nums <- lapply(vio, - function(item) regmatches(x = item, - m = gregexpr(pattern = "^[0-9]+", - text = item))) - vio_mat <- geneorama::list2matrix(vio_nums, count = T) - vio_mat <- vio_mat[ , order(as.numeric(colnames(vio_mat)))] - # colnames(vio_mat) - # range(vio_mat) - - criticalCount <- apply(vio_mat[ , colnames(vio_mat) %in% 1:14], 1, sum) - seriousCount <- apply(vio_mat[ , colnames(vio_mat) %in% 15:29], 1, sum) - minorCount <- apply(vio_mat[ , colnames(vio_mat) %in% 30:44], 1, sum) - - ## Extract the key from the ...'s - key_vec <- list(...) - - ## Check if the key is in the ...'s - if(length(key_vec) != 1) { - stop("A key vector is required as the second argument") + ## Check that violation_mat is a data.table + if(!inherits(x = violation_mat, what = "data.table")){ + stop("violation_mat should be a data.table, and have a defined key") } - ## Check key length - if(length(key_vec[[1]]) != length(violation_text)){ - stop("The length of the key must match the length of the first argument") + ## Check for the key + if(length(key(violation_mat)) == 0) { + stop("The violation matrix should have a defined key") } + vio_mat <- as.matrix(nokey(violation_mat)) + vio_key <- violation_mat[ , key(violation_mat), with = FALSE] + + ## Tabluate voilation types + ## use apply to total up each group of code violations + criticalCount <- apply(vio_mat[ , colnames(vio_mat) %in% 1:14], 1, sum) + seriousCount <- apply(vio_mat[ , colnames(vio_mat) %in% 15:29], 1, sum) + minorCount <- apply(vio_mat[ , colnames(vio_mat) %in% 30:44], 1, sum) + ## Construct return values ret <- data.table(criticalCount, seriousCount, - minorCount) - data.table::set(x = ret, - j = names(key_vec), - value = key_vec[[1]]) - setkeyv(ret, names(key_vec)) + minorCount, + vio_key, + key = key(violation_mat)) return(ret) } diff --git a/CODE/functions/categorize.R b/CODE/functions/categorize.R index 482ad7d..6c1924b 100644 --- a/CODE/functions/categorize.R +++ b/CODE/functions/categorize.R @@ -22,11 +22,9 @@ #' package). #' #' @example -#' dat <- c("restaurant", "Restaurant and bar", "grocery", "Grocery Store", -#' "stadium", "school", "church", "high school", "school restaurant") -#' other = "Other" -#' -#' categorize(x = dat, +#' categorize(x = c("restaurant", "Restaurant and bar", "grocery", +#' "Grocery Store", "stadium", "school", "church", +#' "high school", "school restaurant"), #' primary = list(Restaurant = "restaurant", #' Grocery_Store = "grocery", #' "School"), diff --git a/CODE/functions/filter_business.R b/CODE/functions/filter_business.R index 61e7329..106ca2d 100644 --- a/CODE/functions/filter_business.R +++ b/CODE/functions/filter_business.R @@ -1,4 +1,8 @@ filter_business <- function(business){ + + ##============================================================================== + ## Filter rows + ##============================================================================== ## Remove rows with na values business <- business[!is.na(LICENSE_TERM_START_DATE)] business <- business[!is.na(LICENSE_TERM_EXPIRATION_DATE)] @@ -6,6 +10,21 @@ filter_business <- function(business){ business <- business[!(APPLICATION_TYPE %in% c("C_CAPA","C_SBA"))] ## Remove duplicate ids business <- business[!duplicated(ID)] + + ##============================================================================== + ## Remove columns that are never used + ##============================================================================== + business$SITE_NUMBER <- NULL + business$APPLICATION_CREATED_DATE <- NULL + business$APPLICATION_REQUIREMENTS_COMPLETE <- NULL + business$PAYMENT_DATE <- NULL + business$CONDITIONAL_APPROVAL <- NULL + business$LICENSE_APPROVED_FOR_ISSUANCE <- NULL + business$DATE_ISSUED <- NULL + business$LICENSE_STATUS_CHANGE_DATE <- NULL + business$SSA <- NULL + business$LOCATION <- NULL + ## Return results return(business) } diff --git a/CODE/functions/filter_foodInspect.R b/CODE/functions/filter_foodInspect.R index aed27d2..d8946ce 100644 --- a/CODE/functions/filter_foodInspect.R +++ b/CODE/functions/filter_foodInspect.R @@ -1,6 +1,13 @@ filter_foodInspect <- function(foodInspect){ foodInspect <- foodInspect[!is.na(Inspection_Date) & !is.na(License)] - foodInspect <- foodInspect[!duplicated(Inspection_ID)] + if(foodInspect[,any(duplicated(Inspection_ID))]){ + warning(paste0("Removing ", nrow(foodInspect[duplicated(Inspection_ID)]), + " duplicated records from foodInspect ", + "of ", nrow(foodInspect), " total records.\n", + "Duplication is based on the Inspection_ID")) + ## Remove any duplicated Inspection_ID values + foodInspect <- foodInspect[!duplicated(Inspection_ID)] + } foodInspect <- foodInspect[License != 0] foodInspect <- foodInspect[Inspection_Date > as.IDate("2011-09-01")] foodInspect <- foodInspect[Inspection_Type == "Canvass"] diff --git a/CODE/functions/find_bus_id_matches.R b/CODE/functions/find_bus_id_matches.R index e2ebbb6..2131238 100644 --- a/CODE/functions/find_bus_id_matches.R +++ b/CODE/functions/find_bus_id_matches.R @@ -6,6 +6,16 @@ ## find_bus_id_matches <- function(business, foodInspect) { + # browser() + ## Since many food businesses are inspected before they are issued a + ## license, so we need to move back the "License term start date" for + ## newly issued licenses. An adjustment of 365 days seems pretty + ## reasonable, and creates lots of matches. + ## Note also, the `as.integer` call. This is needed to avoid a warning. + business_copy <- copy(business) + business_copy <- business_copy[APPLICATION_TYPE=="ISSUE", + LICENSE_TERM_START_DATE := + as.integer(LICENSE_TERM_START_DATE - 365)] ## Merge over time periods dat <- foverlaps(foodInspect[i = TRUE, @@ -13,13 +23,13 @@ find_bus_id_matches <- function(business, foodInspect) { keyby = list(License, Inspection_Date = Inspection_Date, Inspection_Date_end = Inspection_Date)], - business[i = LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE, - j = list(Business_ID = ID), - keyby = list(LICENSE_NUMBER, - LICENSE_TERM_START_DATE, - LICENSE_TERM_EXPIRATION_DATE)], + business_copy[i = LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE, + j = list(ID), + keyby = list(LICENSE_NUMBER, + LICENSE_TERM_START_DATE, + LICENSE_TERM_EXPIRATION_DATE)], mult = "first", type = "any", nomatch = NA) - return(dat[ , list(Inspection_ID, Business_ID)]) + return(dat[ , list(Inspection_ID, ID)]) } diff --git a/CODE/functions/nokey.R b/CODE/functions/nokey.R new file mode 100644 index 0000000..0f02660 --- /dev/null +++ b/CODE/functions/nokey.R @@ -0,0 +1,16 @@ + +nokey <- function(dt, usestrings){ + require(data.table) + + if(!inherits(x = dt, what = "data.table")){ + stop("dt should be a data.table") + } + + if(haskey(dt)){ + ret <- dt[, .SD, .SDcol = -key(dt)] + } else { + ret <- dt + } + return(ret) +} + diff --git a/CODE/prep_inspectors_for_eval.R b/CODE/prep_inspectors_for_eval.R new file mode 100644 index 0000000..7984b46 --- /dev/null +++ b/CODE/prep_inspectors_for_eval.R @@ -0,0 +1,80 @@ + +## +## The sanitarian identies are not publically available, so measures are taken +## disguise their identities in this analysis. +## This script takes the original disguised data used in the evaluation and +## modifies it to match the format that became available in the city in early +## 2015 and is still available as of Dec 2017. +## + +##========================================================================== +## INITIALIZE +##========================================================================== +## Remove all objects; perform garbage collection +rm(list=ls()) +gc(reset=TRUE) + +## Load libraries & project functions +geneorama::loadinstall_libraries(c("data.table")) +geneorama::sourceDir("CODE/functions/") + +##============================================================================== +## LOAD CACHED RDS FILES +##============================================================================== + +inspectors <- readRDS("DATA/inspectors.Rds") +food <- readRDS("DATA/13_food_inspections.Rds") + +##============================================================================== +## PROCESS INSPECTOR DATA +##============================================================================== + +## Removing letters out front, and numbers trailing hyphen are not needed +## (e.g. -1006 I believe is the code for retail food license) +inspectors[ , License := gsub('[A-z]+|-.+$', "", License)] + +## cleaning any leading zeros +inspectors[ , License := gsub('^[0]+', "", License)] + +## removing possibly invalid license numbers +inspectors <- inspectors[nchar(License) > 3 & Inspector_Assigned != " "] + +## if multiple inspections for same license number, then using the inspector +## on the first inspection +inspectors <- inspectors[ , .N, by=list(License, Inspection_Date, Inspector_Assigned)] +inspectors$N <- NULL + +## Convert to integer to match other data +inspectors[ , License := as.integer(License)] +setkey(inspectors, License, Inspection_Date) + +## Further deduplication +inspectors_deduped <- inspectors[i = TRUE , + j = list(Inspector_Assigned = Inspector_Assigned[1]), + keyby = list(License, Inspection_Date)] + +## Merge in the Inspection_ID from the food records +inspectors_deduped <- merge(x = food[ , list(License, Inspection_Date, Inspection_ID)], + y = inspectors_deduped, + by = c("License", "Inspection_Date"), + # all.x = TRUE, + sort = FALSE) +inspectors_deduped[duplicated(Inspection_ID)] +inspectors_deduped <- inspectors_deduped[!duplicated(Inspection_ID)] + +## Make the key columns of the inspector data match the output that the model +## would have gotten from COC internal systems +inspectors_deduped_renamed <- inspectors_deduped[i = TRUE, + j = list(sanitarian = Inspector_Assigned), + keyby = list(inspectionID = Inspection_ID)] + +geneorama::NAsummary(inspectors_deduped_renamed) + +##------------------------------------------------------------------------------ +## SAVE RESULT +##------------------------------------------------------------------------------ + +saveRDS(inspectors_deduped_renamed, "DATA/19_inspector_assignments.Rds") + + + diff --git a/DATA/11_bus_license.Rds b/DATA/11_bus_license.Rds new file mode 100644 index 0000000..69a0cf4 Binary files /dev/null and b/DATA/11_bus_license.Rds differ diff --git a/DATA/crime.Rds b/DATA/12_crime.Rds similarity index 100% rename from DATA/crime.Rds rename to DATA/12_crime.Rds diff --git a/DATA/food_inspections.Rds b/DATA/13_food_inspections.Rds similarity index 100% rename from DATA/food_inspections.Rds rename to DATA/13_food_inspections.Rds diff --git a/DATA/garbage_carts.Rds b/DATA/14_garbage_carts.Rds similarity index 100% rename from DATA/garbage_carts.Rds rename to DATA/14_garbage_carts.Rds diff --git a/DATA/sanitation_code.Rds b/DATA/15_sanitation_code.Rds similarity index 100% rename from DATA/sanitation_code.Rds rename to DATA/15_sanitation_code.Rds diff --git a/DATA/weather_20110401_20141031.Rds b/DATA/17_mongo_weather_update.Rds similarity index 100% rename from DATA/weather_20110401_20141031.Rds rename to DATA/17_mongo_weather_update.Rds diff --git a/DATA/19_inspector_assignments.Rds b/DATA/19_inspector_assignments.Rds new file mode 100644 index 0000000..ba58ec2 Binary files /dev/null and b/DATA/19_inspector_assignments.Rds differ diff --git a/DATA/21_food_inspection_violation_matrix.Rds b/DATA/21_food_inspection_violation_matrix.Rds new file mode 100644 index 0000000..78edebe Binary files /dev/null and b/DATA/21_food_inspection_violation_matrix.Rds differ diff --git a/DATA/21_food_inspection_violation_matrix_nums.Rds b/DATA/21_food_inspection_violation_matrix_nums.Rds new file mode 100644 index 0000000..1070c6e Binary files /dev/null and b/DATA/21_food_inspection_violation_matrix_nums.Rds differ diff --git a/DATA/burglary_heat.Rds b/DATA/22_burglary_heat.Rds similarity index 100% rename from DATA/burglary_heat.Rds rename to DATA/22_burglary_heat.Rds diff --git a/DATA/garbageCarts_heat.Rds b/DATA/22_garbageCarts_heat.Rds similarity index 100% rename from DATA/garbageCarts_heat.Rds rename to DATA/22_garbageCarts_heat.Rds diff --git a/DATA/sanitationComplaints_heat.Rds b/DATA/22_sanitationComplaints_heat.Rds similarity index 100% rename from DATA/sanitationComplaints_heat.Rds rename to DATA/22_sanitationComplaints_heat.Rds diff --git a/DATA/23_food_insp_features.Rds b/DATA/23_food_insp_features.Rds new file mode 100644 index 0000000..268c65a Binary files /dev/null and b/DATA/23_food_insp_features.Rds differ diff --git a/DATA/24_bus_features.Rds b/DATA/24_bus_features.Rds new file mode 100644 index 0000000..b0a20ca Binary files /dev/null and b/DATA/24_bus_features.Rds differ diff --git a/DATA/bus_license.Rds b/DATA/bus_license.Rds deleted file mode 100644 index 3b4a7f6..0000000 Binary files a/DATA/bus_license.Rds and /dev/null differ diff --git a/DATA/dat_model.Rds b/DATA/dat_model.Rds deleted file mode 100644 index c1c1cd5..0000000 Binary files a/DATA/dat_model.Rds and /dev/null differ diff --git a/DATA/violation_dat.Rds b/DATA/violation_dat.Rds deleted file mode 100644 index 406f38f..0000000 Binary files a/DATA/violation_dat.Rds and /dev/null differ diff --git a/REPORTS/CountComparison_aftersecondrefactor.Rmd b/REPORTS/CountComparison_aftersecondrefactor.Rmd index 6c54918..61bb4b0 100644 --- a/REPORTS/CountComparison_aftersecondrefactor.Rmd +++ b/REPORTS/CountComparison_aftersecondrefactor.Rmd @@ -33,10 +33,10 @@ geneorama::sourceDir("CODE/functions/") ## LOAD CACHED RDS FILES ##============================================================================== ## Import the key data sets used for prediction -foodInspect <- readRDS("DATA/food_inspections.Rds") +foodInspect <- readRDS("DATA/13_food_inspections.Rds") foodInspect$Violations <- NULL -violation_dat <- readRDS("DATA/violation_dat.Rds") -dat_model <- readRDS("DATA/dat_model.Rds") +violation_dat <- readRDS("DATA/22_violation_dat.Rds") +dat_model <- readRDS("DATA/23_dat_model.Rds") ##============================================================================== ## ADD FIELDS AND APPLY FILTER TO foodInspect diff --git a/REPORTS/Metric_Development.Rmd b/REPORTS/Metric_Development.Rmd index 881f059..9721b53 100644 --- a/REPORTS/Metric_Development.Rmd +++ b/REPORTS/Metric_Development.Rmd @@ -293,7 +293,7 @@ ggplot(melt(comp_summary_cumsum_diff_multi, ```{r} geneorama::set_project_dir("food-inspections-evaluation") -foodInspect <- readRDS("DATA/food_inspections.Rds") +foodInspect <- readRDS("DATA/13_food_inspections.Rds") foodInspect <- foodInspect[ , Violations := NULL] setkey(foodInspect, Inspection_ID) diff --git a/REPORTS/ModelSummary_20141204.Rmd b/REPORTS/ModelSummary_20141204.Rmd index 93cd919..859f203 100644 --- a/REPORTS/ModelSummary_20141204.Rmd +++ b/REPORTS/ModelSummary_20141204.Rmd @@ -31,7 +31,7 @@ geneorama::set_project_dir("food-inspections-evaluation") ##============================================================================== ## LOAD CACHED RDS FILES ##============================================================================== -dat <- readRDS("DATA/dat_model.Rds") +dat <- readRDS("DATA/23_dat_model.Rds") ## Only keep "Retail Food Establishment" dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"] diff --git a/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd b/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd index c307789..ececcc4 100644 --- a/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd +++ b/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.Rmd @@ -70,7 +70,7 @@ rm(confusion_values_test, dat, errors, errorsTest, iiTest, iiTrain, geneorama::lll() ## Load Food Inspection Data -food <- readRDS("DATA/food_inspections.Rds") +food <- readRDS("DATA/13_food_inspections.Rds") ```