Improvements and bug fixes for epiweekly other hubverse tables and pl…

…ots (#312)
CDCgov · Jan 28, 2025 · df3648d · df3648d
1 parent 82cda4b
commit df3648d
Show file tree

Hide file tree

Showing 4 changed files with 199 additions and 41 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
   #####
   # R
   - repo: https://github.com/lorenzwalthert/precommit
-    rev: v0.4.3.9001
+    rev: v0.4.3
     hooks:
       - id: style-files
       - id: lintr

diff --git a/hewr/R/to_epiweekly_quantile_table.R b/hewr/R/to_epiweekly_quantile_table.R
@@ -65,7 +65,8 @@ to_epiweekly_quantiles <- function(model_run_dir,
       value_col = ".value"
     ) |>
     dplyr::mutate(
-      "location" = !!location
+      "location" = !!location,
+      "source_samples" = !!draws_file_name
     )
   message(glue::glue("Done processing {model_run_dir}"))
   return(epiweekly_quantiles)
@@ -98,7 +99,7 @@ to_epiweekly_quantile_table <- function(model_batch_dir,
                                         epiweekly_other_locations = c()) {
   model_runs_path <- fs::path(model_batch_dir, "model_runs")
 
-  locations_to_process <- fs::dir_ls(model_runs_path,
+  model_run_dirs_to_process <- fs::dir_ls(model_runs_path,
     type = "directory"
   ) |>
     purrr::discard(~ fs::path_file(.x) %in% exclude)
@@ -122,45 +123,57 @@ to_epiweekly_quantile_table <- function(model_batch_dir,
     day_of_week = 7
   )
 
-  get_location_table <- \(loc) {
-    epiweekly_other <- loc %in% epiweekly_other_locations
+  get_location_table <- \(model_run_dir) {
+    loc <- fs::path_file(model_run_dir)
+    use_epiweekly_other <- loc %in% epiweekly_other_locations
+    which_forecast <- ifelse(use_epiweekly_other,
+      "explicitly epiweekly",
+      "aggregated daily"
+    )
+    glue::glue(
+      "Using {which_forecast} non-target ED visit forecast ",
+      "for location {loc}"
+    )
+
     draws_file <- ifelse(
-      epiweekly_other,
+      use_epiweekly_other,
       "epiweekly_with_epiweekly_other_samples",
       "epiweekly_samples"
     )
     return(to_epiweekly_quantiles(
-      loc,
+      model_run_dir,
       report_date = report_date,
       max_lookback_days = 15,
       draws_file_name = draws_file,
       strict = strict
     ))
   }
 
-  hubverse_table <- purrr::map(
-    locations_to_process,
+  quant_table <- purrr::map(
+    model_run_dirs_to_process,
     get_location_table
   ) |>
-    dplyr::bind_rows() |>
+    dplyr::bind_rows()
+
+  loc_sources <- quant_table |>
+    dplyr::distinct(.data$location, .data$source_samples)
+
+  hubverse_table <- quant_table |>
     forecasttools::get_hubverse_table(
       report_epiweek_end,
       target_name =
         glue::glue("wk inc {disease_abbr} prop ed visits")
     ) |>
+    dplyr::inner_join(loc_sources,
+      by = "location"
+    ) |>
     dplyr::arrange(
       .data$target,
       .data$output_type,
       .data$location,
       .data$reference_date,
       .data$horizon,
       .data$output_type_id
-    ) |>
-    dplyr::mutate(other_ed_visit_forecast = ifelse(
-      .data$location %in% !!epiweekly_other_locations,
-      "direct_epiweekly_fit",
-      "aggregated_daily_fit"
-    ))
-
+    )
   return(hubverse_table)
 }
diff --git a/hewr/tests/testthat/test_to_epiweekly_quantile_table.R b/hewr/tests/testthat/test_to_epiweekly_quantile_table.R
@@ -39,9 +39,20 @@ test_that("to_epiweekly_quantiles works as expected", {
     ) |> suppressMessages()
 
     expect_s3_class(result, "tbl_df")
-    expect_setequal(c(
-      "epiweek", "epiyear", "quantile_value", "quantile_level", "location"
-    ), colnames(result))
+    checkmate::expect_names(
+      colnames(result),
+      identical.to = c(
+        "epiweek",
+        "epiyear",
+        "quantile_value",
+        "quantile_level",
+        "location",
+        "source_samples"
+      )
+    )
+
+    expect_equal(draws_file_name, unique(result$source_samples))
+
     expect_gt(nrow(result), 0)
   }
 
@@ -127,7 +138,11 @@ test_that("to_epiweekly_quantiles handles missing forecast files", {
 
 
 # tests for `to_epiweekly_quantile_table`
-test_that("to_epiweekly_quantile_table handles multiple locations", {
+test_that(paste0(
+  "to_epiweekly_quantile_table ",
+  "handles multiple locations ",
+  "and multiple source files"
+), {
   batch_dir_name <- "covid-19_r_2024-12-14_f_2024-12-08_t_2024-12-14"
   tempdir <- withr::local_tempdir()
 
@@ -142,6 +157,17 @@ test_that("to_epiweekly_quantile_table handles multiple locations", {
     if (loc != "loc3") {
       disease_cols <- c(disease_cols, "prop_disease_ed_visits")
     }
+    create_tidy_forecast_data(
+      directory = loc_dir,
+      filename = "epiweekly_with_epiweekly_other_samples.parquet",
+      date_cols = seq(
+        lubridate::ymd("2024-12-08"), lubridate::ymd("2024-12-14"),
+        by = "week"
+      ),
+      disease_cols = disease_cols,
+      n_draw = 25,
+      with_epiweek = TRUE
+    )
 
     create_tidy_forecast_data(
       directory = loc_dir,
@@ -157,7 +183,10 @@ test_that("to_epiweekly_quantile_table handles multiple locations", {
   })
 
   ## should succeed despite loc3 not having valid draws with strict = FALSE
-  result_w_both_locations <- to_epiweekly_quantile_table(temp_batch_dir) |>
+  result_w_both_locations <-
+    to_epiweekly_quantile_table(temp_batch_dir,
+      epiweekly_other_locations = "loc1"
+    ) |>
     suppressMessages()
 
   ## should error if strict = TRUE because loc3 does not have
@@ -168,6 +197,44 @@ test_that("to_epiweekly_quantile_table handles multiple locations", {
     "did not find valid draws"
   )
 
+  ## should succeed with strict = TRUE if loc3 is excluded
+  alt_result_w_both_locations <- (
+    to_epiweekly_quantile_table(temp_batch_dir,
+      strict = TRUE,
+      exclude = "loc3"
+  )) |>
+    suppressMessages()
+
+  ## results should be equivalent for loc2,
+  ## but not for loc1
+  expect_equal(
+    result_w_both_locations |>
+      dplyr::filter(location == "loc2"),
+    alt_result_w_both_locations |>
+      dplyr::filter(location == "loc2")
+  )
+
+  ## check that one used epiweekly
+  ## other for loc1 while other used
+  ## default, resulting in different values
+  loc1_a <- result_w_both_locations |>
+    dplyr::filter(location == "loc1") |>
+    dplyr::pull(.data$value)
+  loc1_b <- alt_result_w_both_locations |>
+    dplyr::filter(location == "loc1") |>
+    dplyr::pull(.data$value)
+
+  ## length checks ensure that the
+  ## number of allowed equalities _could_
+  ## be reached if the vectors were mostly
+  ## or entirely identical
+  expect_gt(length(loc1_a), 10)
+  expect_gt(length(loc1_b), 10)
+  expect_lt(
+    sum(loc1_a == loc1_b),
+    5
+  )
+
   expect_s3_class(result_w_both_locations, "tbl_df")
   expect_gt(nrow(result_w_both_locations), 0)
   checkmate::expect_names(
@@ -181,20 +248,32 @@ test_that("to_epiweekly_quantile_table handles multiple locations", {
       "output_type",
       "output_type_id",
       "value",
-      "other_ed_visit_forecast"
+      "source_samples"
     )
   )
   expect_setequal(
-    c("loc1", "loc2"),
-    result_w_both_locations$location
+    result_w_both_locations$location,
+    c("loc1", "loc2")
+  )
+  expect_setequal(
+    alt_result_w_both_locations$location,
+    c("loc1", "loc2")
   )
-  expect_false("loc3" %in% result_w_both_locations$location)
 
-  result_w_one_location <- to_epiweekly_quantile_table(
-    model_batch_dir = temp_batch_dir,
-    exclude = "loc1"
-  ) |>
-    suppressMessages()
-  expect_true("loc2" %in% result_w_one_location$location)
-  expect_false("loc1" %in% result_w_one_location$location)
+  expect_setequal(
+    result_w_both_locations$source_samples,
+    c(
+      "epiweekly_samples",
+      "epiweekly_with_epiweekly_other_samples"
+    )
+  )
+
+  expect_setequal(
+    alt_result_w_both_locations$source_samples,
+    "epiweekly_samples"
+  )
+
+
+  expect_false("loc3" %in% result_w_both_locations$location)
+  expect_false("loc3" %in% alt_result_w_both_locations$location)
 })