From fb2c8325056c01034ad8d0f2030425920804afa1 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 12 May 2023 03:33:38 -0700 Subject: [PATCH 01/21] Add WIP on latency&revisions&backtesting&backcasting materials --- DESCRIPTION | 2 + vignettes/backtesting.Rmd | 328 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 330 insertions(+) create mode 100644 vignettes/backtesting.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index d3d100a9..edbae226 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,6 +45,7 @@ Imports: Suggests: covidcast, epidatr, + epipredict, ggplot2, knitr, outbreaks, @@ -56,6 +57,7 @@ VignetteBuilder: knitr Remotes: cmu-delphi/epidatr, + cmu-delphi/epipredict, reconverse/outbreaks, glmgen/genlasso Config/testthat/edition: 3 diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd new file mode 100644 index 00000000..3644e073 --- /dev/null +++ b/vignettes/backtesting.Rmd @@ -0,0 +1,328 @@ +--- +title: Backtesting forecasters and other models +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Backtesting forecasters and other models} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +### Overview, and why to use `epix_slide()` + +The `epiprocess` package provides two functions that are useful for backtesting +forecasts and other models: `epi_slide()` and `epix_slide()`; however, we +recommend always using the latter, because: + +* `epix_slide()` faithfully reproduces latency in data reporting, while + `epi_slide()` does not; and +* `epix_slide()` faithfully reproduces the version of each available observation + that would have been available in real time, while `epi_slide()` does not. + +Using `epi_slide()` or a similar approach instead would typically produce overly +optimistic retrospective forecast evaluations, and might miss some run-time +errors that would have occurred in real time due to irregular latency in data +reporting. 
Thus, it is essential to use a version-aware approach like +`epix_slide()` for backtesting evaluations. + +Additionally, `epix_slide()` can be used to build and evaluate version-aware +models that use data on historical data revisions to try to improve their +accuracy. + +### Smaller reproducibility issues that may remain + +Even using version-aware approaches like `epix_slide()`, there may still be some +smaller reproducibility issues based on the data set/provider: + +* If the `version` tags are not datetime resolution, it is unclear what the + input to the forecaster would have looked like at a particular time. For + example, if `version`s are `Date`s, it can be unclear which (if any) data from + version `2023-01-01` were available at 8am Eastern Time on `2023-01-01`. +* Depending on the data source, the `version` tags may not reflect the exact + time that the data actually was available. For example, an upstream data + provider might have some different meaning for its `version` column, or might + use database replicas which may need time to process updates and make them + available to all data consumers. +* Even though we generally think of available version data as fixed in stone, + this isn't necessarily the case. For example, an upstream version data + provider may publish some initial form of version `2023-01-01` data early in + the morning of `2023-01-01`, but discover that it contains an error, and issue + a hotfix later in the day that overwrites the `2023-01-01` version data with a + correction (but still using the exact same version tag); in this situation, + the early-morning version wouldn't be reproducible if performing an analysis + at a later date. + +# Latency issues: most recent time available vs. `ref_time_value` + +Some forecasters select test-time predictors using the maximum time value +available (either overall, or separately for each individual time series for +different geographical/demographical units). 
E.g., using default settings, +`epipredict::arx_forecaster` will: +- Select test-time predictors for each time series corresponding to + `max_time_value - lags` for that time series. + + FIXME probably wrong/imprecise. Need to consider shifts involved. + +- Guess that the `forecast_date` is the maximum of these `max_time_value`s over + all the time series. Guess that the `target_date` is `forecast_date + ahead`. + +We'll demonstrate some issues that occur using these default settings, first +when dealing with a single time series, and then when dealing with multiple time +series with potentially different reporting latencies. + +### Single location, fixed latency + +First, we'll construct a synthetic data set with a single location where one +particular lag of a predictor is extremely useful, while all the others are not, +and show how ignoring latency can hurt forecast accuracy. + +```{r} +library(dplyr) +library(tidyr) +library(purrr) +library(epiprocess) +library(epipredict) + +# Make RNG reproducible: +invisible(withr::local_rng_version("3.0.0")) +invisible(withr::local_seed(42L)) + +# Settings for our synthetic data set and forecasting task: +great_lag_setting = 8L +target_ahead = 14L +fixed_latency = 3L +nrow_simulated = 100L + great_lag_setting + target_ahead + +edf = tibble( + geo_value = rep("nc", nrow_simulated), + time_value = as.Date("2021-01-01") + seq(0L, nrow_simulated - 1L), + predictor = rnorm(nrow_simulated), + outcome = + dplyr::lag(predictor, great_lag_setting + target_ahead) + + 0.01 * rnorm(nrow_simulated) +) %>% + drop_na(outcome) %>% + as_epi_df() + +ea = edf %>% + mutate(version = time_value + fixed_latency) %>% + as_epi_archive() + +latency_unaware_predictions = function(group_data, group_key) { + group_data %>% + arx_forecaster( + "outcome", "predictor", + args_list = arx_args_list( + lags = 0:9, + ahead = target_ahead + ) + ) %>% + `[[`("predictions") %>% + rename(max_time_value = time_value) +} + +actual_forecast_date = ea$versions_end 
+ +ea %>% + epix_slide( + ref_time_values = actual_forecast_date, + before = 36500L, # use an extremely large time window = no time windowing + f = latency_unaware_predictions, + names_sep = NULL # don't prefix output column names + ) + +cat("Actual forecast date:", toString(actual_forecast_date), "\n") +``` + +TODO finish + +TODO what effect this has in reality + +TODO also give example where this reproduces real-time errors for systems that +are more rigid and calculate from real `forecast_date` but encounter something not there + +# latency issue: mixed latency + +# pseudoprospective forecasting + +# revision MAPE curve + +# version-aware forecasting + +Version-aware forecasting: +```{r} +library(epidatr) + +# For reproducibility, we'll work with data reporting through some fixed issue +# we've already observed. For better reproducibility, that issue should be at +# least two days ago (from today), to incorporate any "hotfixes" to that data +# and allow up to a day for database replication delays: +ga_hhs_analysis_issue = as.Date("2023-04-05") - 2L +# Fetch issue data using `epidatr`; cache the response for politeness, and also +# note the data license for the requested signal. 
# TODO try cachem
ga_hhs_issue_data_path = "ga_hhs_issue_data.rds"
if (!file.exists(ga_hhs_issue_data_path)) {
  ga_hhs_issue_data =
    covidcast(
      data_source = "hhs", signal = "confirmed_admissions_influenza_1d",
      geo_type = "state", time_type = "day",
      geo_values = "ga",
      # all `time_value`s:
      time_values = "*",
      # all issues through the analysis issue
      issues = epirange(12340101, ga_hhs_analysis_issue)
    ) %>%
    fetch_tbl()
  saveRDS(ga_hhs_issue_data, ga_hhs_issue_data_path)
  writeLines(c("License: Public Domain US Government",
               "Accessed via COVIDcast Epidata API"),
             "LICENSE_ga_hhs_issue_data.txt")
} else {
  ga_hhs_issue_data = readRDS(ga_hhs_issue_data_path)
}

# Remove some data from a mostly-NA/0/low regime, and put version data into
# the `epi_archive` format:
hhs_start_time_value = as.Date("2020-12-01")
ga_hhs_archive = ga_hhs_issue_data %>%
  filter(time_value >= hhs_start_time_value) %>%
  select(geo_value, time_value, version = issue, admissions = value) %>%
  as_epi_archive(compactify = TRUE)

# We'll experiment with a pseudoprospective forecast as of a single past
# `forecast_date`. Later, we can apply this to many `forecast_date`s using an
# `epix_slide()`.
forecast_date = ga_hhs_analysis_issue - 7L

# Get what the archive would have looked like as of the `forecast_date`:
forecast_date_archive = ga_hhs_archive %>%
  epix_as_of(forecast_date, all_versions = TRUE)
# Get what the snapshot as of the `forecast_date` would have looked like:
forecast_date_edf = forecast_date_archive %>%
  epix_as_of(.$versions_end)
# Reporting latency for this signal in GA on this `forecast_date`:
forecast_date_latency = as.integer(forecast_date - max(forecast_date_edf$time_value))

# Our test-time predictors will correspond to `time_value`s in `forecast_date -
# forecast_date_latency - c(0L, 7L, 14L)`. We'll match them to analogous
# training data based on subbing in different `training_forecast_date`s. 
Calculate +# the desired lags relative to the `forecast_date`, and assign them names to be +# used later: +test_predictor_lags_rel_max_t = c(x1 = 0L, x2 = 7L, x3 = 14L) +predictor_lags_rel_fcst_date = forecast_date_latency + test_predictor_lags_rel_max_t +# (Edge case: while we probably want to include `forecast_date - +# forecast_date_latency - 0L` to take advantage of the most recent `time_value` +# available at test time, this could potentially lead to errors if a data source +# starts reporting with lower latency, because we will have trouble matching +# with analogous training data.) + +# We also want to limit our training instances to `training_forecast_date`s with +# the same weekday as our `forecast_date`: +training_and_testing_issues_start = max( + # try to go all the way back to the first version in the archive, + min(forecast_date_archive$DT$version), + # that we expect to possibly contain observations for the desired lags + hhs_start_time_value + max(predictor_lags_rel_fcst_date) +) +# Prepare our training and testing `forecast_date`s, making sure to match the +# wday of the desired `forecast_date`: +training_and_testing_issues = rev(seq(forecast_date, training_and_testing_issues_start, -7)) + +# Find analogous predictor values using `epix_slide()`; get both training and +# testing data simultaneously here as it ends up more convenient than trying to +# separately fetch test data from `forecast_date_edf`: +version_aware_predictors = + ga_hhs_archive %>% + epix_slide( + # Our `training_forecast_date`s + testing `forecast_date` + ref_time_values = training_and_testing_issues, + # For performance purposes, apply a time window that includes just enough + # `time_value`s that we can extract our predictors: + before = max(predictor_lags_rel_fcst_date), + function(edf, gk) { + # * `edf` is a time-windowed snapshot. + # * `gk` is the "group key". If we were performing a grouped slide, this + # could potentially be helpful. We won't need it here. 
+ ref_time_value = attr(edf, "metadata")$as_of + # `ref_time_value` is our training/actual `forecast_date`. + edf %>% + # We must make sure that there are no gaps in the `time_value`s to make + # `dplyr::lag` work properly. Additionally, we will extend the + # `time_value`s to the `ref_time_value`, so we can just use `dplyr::lag` + # plus filter to `time_value == ref_time_value` to get our desired data. + complete( + geo_value, + time_value = full_seq(c(time_value, ref_time_value), period=1L) + ) %>% + # We'd need this `group_by` if we actually had multiple `geo_value`s: + group_by(geo_value) %>% + # Prepare columns x1, x2, x3 by `dplyr::lag`ging `admissions`: + mutate(map_dfc( + predictor_lags_rel_fcst_date, + # test_predictor_lags_rel_max_t, + ~ dplyr::lag(admissions, .x) + )) %>% + # We're done with the extra rows used to calculate the lags: + filter(time_value == ref_time_value) %>% + # Make sure we actually output a row so joins are easier: + complete(time_value = ref_time_value) %>% + ungroup() %>% + select(geo_value, x1, x2, x3) + }, + names_sep = NULL # don't prefix the output column names + ) + +# Combine the version-aware predictors with the latest available version, from +# which we'll calculate targets as of the `forecast_date`. By predicting +# fully/mostly revised targets from real-time predictors, we are making a +# revision-aware forecast. For some models or evaluation schemes, we might want +# to downweight or discard the most recent data from our training sets, or take +# other more complex approaches. But for this model, since we don't use a +# training window, the current approach seems reasonable. 
+training_and_testing_edf = + left_join(version_aware_predictors, + # shift outcome data so `epipredict` indexes relative to forecast date: + forecast_date_edf %>% + mutate(time_value = time_value + forecast_date_latency), + by = c("geo_value", "time_value")) %>% + as_epi_df(as_of = forecast_date) +# (Caution: since we matched wdays, this is essentially a weekly data set, and +# certain forecasters that use `dplyr::lag` or `dplyr::ahead` without +# considering `time_value` will work differently than they would on daily data. +# This isn't the case with `epipredict` below.) + +# Use `epipredict` canned forecaster `arx_forecaster`. +forecaster_output = training_and_testing_edf %>% + arx_forecaster( + outcome = "admissions", + predictors = c("x1", "x2", "x3"), + args_list = arx_args_list( + lags = 0L, # the lags are already baked into x1, x2, x3 + # ahead = 14L, + # ahead = 7L, + ahead = 0L, + # Use defaults for everything else. + ) + ) + + +library(ggplot2) +training_and_testing_edf %>% + ggplot(aes(x1, admissions)) %>% + `+`(geom_point()) %>% + `+`(geom_jitter()) %>% + `+`(geom_abline(slope = 1, intercept = 0, linetype="dotted")) + +training_and_testing_edf %>% + filter(!is.na(x1) & !is.na(admissions)) %>% + with(sum(admissions) / sum(x1)) +# Generally initial observations are revised downward. + +# Our fit coefficients: +forecaster_output$epi_workflow %>% + parsnip::extract_fit_engine() + +# Actual backcasts are in `forecaster_output$predictions` +``` + +# pseudoprospective version-aware forecasting From 0bd8db368d564b8960a319668780c8e326f1e179 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 12 May 2023 03:46:48 -0700 Subject: [PATCH 02/21] Add WIP on intros, old dogfooding --- vignettes/forecasting-intro.Rmd | 215 ++++++++++++++++++ vignettes/old_dogfooding_debug001.Rmd | 299 ++++++++++++++++++++++++++ 2 files changed, 514 insertions(+) create mode 100644 vignettes/forecasting-intro.Rmd create mode 100644 vignettes/old_dogfooding_debug001.Rmd diff --git a/vignettes/forecasting-intro.Rmd b/vignettes/forecasting-intro.Rmd new file mode 100644 index 00000000..0a51fbbc --- /dev/null +++ b/vignettes/forecasting-intro.Rmd @@ -0,0 +1,215 @@ +```{r} +# if (Sys.getenv("GITHUB_PAT") == "") { +# warning("If you have trouble with `renv` restoration, please set GITHUB_PAT in a `.Renviron` file (see `help(.Rprofile)` for the format and possible locations) and make sure it's .gitignore'd / not shared.") +# } +# # Initial setup to get `renv::restore()` to fetch the desired versions for +# # this demo; doesn't need to be re-run as it's recorded in the `.Rprofile` and +# # `renv.lock` files and `renv` directory. Instead, to initialize or +# # synchronize these dependencies with the lockfile, just `renv::restore()`. +# # Between these updates, the `.Rprofile` will take care of actually using the +# # `renv` to source the dependencies. 
+# +# install.packages("renv") +# renv::init() +# renv::install("tidyverse") +# renv::install("MMWRweek") +# renv::install("reconverse/outbreaks") +# renv::install("cmu-delphi/epidatr") +# renv::install("cmu-delphi/epiprocess@953717fd2ad6317d130a8cabeaf3d6040aebb59a") +# renv::install("cmu-delphi/epipredict") +# renv::snapshot() # for this to record dependencies, we may need to actually +# # use them below or put them in a DESCRIPTION file +``` + +Fetch data: +```{r} +library(tidyverse) +library(magrittr) +library(MMWRweek) +library(epidatr) +library(epiprocess) +library(epipredict) + +# flusurv_analysis_issue = MMWRweek(as.Date("2023-04-05") - 7L) %>% +# {.$MMWRyear*100L + .$MMWRweek} +# flusurv_issue_data_path = "flusurv_issue_data.rds" +# if (!file.exists(flusurv_issue_data_path)) { +# flusurv_issue_data = +# flusurv("network_all", +# epiweeks=epirange(123401, 345601), +# issues=epirange(123401, flusurv_analysis_issue)) %>% +# fetch_tbl() +# saveRDS(flusurv_issue_data, flusurv_issue_data_path) +# writeLines("......", "LICENSE_flusurv_data.txt") +# } else { +# flusurv_issue_data = readRDS(flusurv_issue_data) +# } + +ga_hhs_analysis_issue = as.Date("2023-04-05") - 2L +ga_hhs_issue_data_path = "ga_hhs_issue_data.rds" +if (!file.exists(ga_hhs_issue_data_path)) { + ga_hhs_issue_data = + covidcast( + "ga_hhs", "confirmed_admissions_influenza_1d", + geo_type = "state", time_type = "day", + # TODO explain * in comments + geo_values = "ga", time_values = "*", + issues = "*" + ) %>% + fetch_tbl() + saveRDS(ga_hhs_issue_data, ga_hhs_issue_data_path) + writeLines(c("License: Public Domain US Government","Accessed via COVIDcast Epidata API"), + "LICENSE_ga_hhs_issue_data.txt") +} else { + ga_hhs_issue_data = readRDS(ga_hhs_issue_data_path) +} +# TODO use, e.g., cachem. And explain why caching. 
+``` + +Convert to `epi_archive` format: +```{r} +ga_hhs_archive = ga_hhs_issue_data %>% + select(geo_value, time_value, version = issue, admissions = value) %>% + as_epi_archive(compactify = TRUE) +``` + +Experiment with AR forecaster on latest data (as of the analysis issue): +```{r} +ga_hhs_latest_raw = ga_hhs_archive %>% + epix_as_of(.$versions_end) +ga_hhs_latest_raw +``` +(Note that this is in the `epi_df` format.) + +We forgot about the period in 2020 with a lot of NAs and a few low values (in a +few states) in the HHS Protect data. We could adjust `time_values` above to +something like `epirange(20201201, 34560101)` and regenerate the RDS to avoid. +For now, let's just try filtering them out from the data objects as needed. Some +NA filtering is automatic in epipredict; this is just an extra precaution that +some atypical low values don't somehow make their way into the training set. + +```{r} +hhs_start_time_value = as.Date("2020-12-01") + +ga_hhs_latest_processed = ga_hhs_latest_raw %>% + filter(time_value >= hhs_start_time_value) + +ga_hhs_latest_processed %>% + ggplot(aes(time_value, admissions)) %>% + `+`(geom_line()) +``` + +Using a canned forecaster: +```{r} +fc_info = arx_forecaster( + ga_hhs_latest_processed, + outcome = "admissions", + predictors = c("admissions"), + args_list = arx_args_list( + ahead = 14L, + forecast_date = attr(edf, "metadata")[["as_of"]] + # Use defaults for everything else. + ) +) +fc_info +``` +TODO consider vignette for covid forecasts daily, another parallel for flu weekly, details on deadlines, etc. 
+ +Extracting the forecast: +```{r} +fc_info %>% + extract2("predictions") %>% + mutate(.pred_distn = nested_quantiles(.pred_distn)) %>% + unnest(.pred_distn) +``` +TODO spell out arx_forecaster model a little more + +Pseudo-prospective version-unaware forecasting: +```{r} +forecast_dates = ga_hhs_analysis_issue - c(14L, 7L, 0L) +ga_hhs_archive %>% + epix_slide( + ref_time_values = forecast_dates, + before = 365000, # train on all past data + as_list_col = TRUE, + function(edf, gk) { + edf %>% + filter(time_value >= hhs_start_time_value) %>% + arx_forecaster( + outcome = "admissions", + predictors = c("admissions"), + args_list = arx_args_list( + ahead = 14L, + forecast_date = attr(edf, "metadata")[["as_of"]] + # Use defaults for everything else. + ) + ) %>% + extract2("predictions") %>% + mutate(.pred_distn = nested_quantiles(.pred_distn)) %>% + unnest(.pred_distn) + } + ) %>% + select(-time_value) %>% + unnest(slide_value) +``` +XXX arx_forecaster default to the 23 taus? + +Version-aware forecasting: +```{r} +forecast_date = ga_hhs_analysis_issue + +forecast_date_archive = ga_hhs_archive %>% epix_as_of(forecast_date, all_versions = TRUE) +forecast_date_edf = forecast_date_archive %>% epix_as_of(.$versions_end) +forecast_date_latency = as.integer(forecast_date - max(forecast_date_edf$time_value)) + +training_and_testing_issues_start = max(min(forecast_date_archive$DT$version), + hhs_start_time_value + 14L) +training_and_testing_issues = rev(seq(forecast_date, training_and_testing_issues_start, -7)) + +version_aware_covariates = + ga_hhs_archive %>% + epix_slide( + ref_time_values = training_and_testing_issues, + before = forecast_date_latency + 14L, # include just enough to get the lags we want + names_sep = NULL, + function(edf, gk) { + ref_time_value = attr(edf, "metadata")$as_of + edf %>% + filter(time_value >= hhs_start_time_value) %>% + complete(geo_value, time_value = full_seq(c(time_value, ref_time_value), period=1L)) %>% + group_by(geo_value) %>% + 
mutate(x1 = lag(admissions, forecast_date_latency + 0L), + x2 = lag(admissions, forecast_date_latency + 7L), + x3 = lag(admissions, forecast_date_latency + 14L)) %>% + ungroup() %>% + filter(time_value == ref_time_value) %>% + select(geo_value, x1, x2, x3) + } + ) + +full_join(version_aware_covariates, forecast_date_edf) %>% + as_epi_df(as_of = forecast_date) %>% + na.omit() %>% + arx_forecaster( + outcome = "admissions", + predictors = c("x1","x2","x3"), + args_list = arx_args_list( + lags = 0L, + ahead = 14L, + forecast_date = forecast_date, + # Use defaults for everything else. + ) +) +``` + + + + + + + +# other notes +https://cmu-delphi.github.io/epipredict/articles/sliding.html +- consider warnings with merge locf. especially if more meaningful amounts like 5 days +- explain [[1]] / explain `y` object. port over the better names from derived vignette +- k_week_ahead -> k_day_ahead diff --git a/vignettes/old_dogfooding_debug001.Rmd b/vignettes/old_dogfooding_debug001.Rmd new file mode 100644 index 00000000..dc53a3a7 --- /dev/null +++ b/vignettes/old_dogfooding_debug001.Rmd @@ -0,0 +1,299 @@ +--- +title: Dogfooding `epiprocess` with some data debugging +date: 2022-10-19 +output: html_document +--- + +TODO: compare how things look with slides on `dev` / in PRs + +This is an Rmd-ified version/retrospective of some data debugging for the influenza forecasts that went out yesterday. + +NOTE: Previously, I saw the primary use cases of `epi_archive` as performing + pseudoprospective forecasts and using version data to get more appropriate + training sets, but this might also rank up there. + +- TODO: evaluate how easy it is to do the former (vs. using `evalcast`). Consider + where the `weighted_interval_score` functions should live. Consider ways to + simplify querying&merging multiple data sources. +- Nat is working on tools for the latter. 
+ +```{r} +library(epidatr) +library(epiprocess) # installed a local version with `ref_time_values` filter removed; issue #153 +library(data.table) +library(dplyr) +library(ggplot2) +``` + +Fetch hhs data: +```{r hhs_influenza_tbl, cache=TRUE} +hhs_influenza_tbl = covidcast( + data_source = "hhs", + signals = "confirmed_admissions_influenza_1d", + time_type = "day", + geo_type = "state", + time_values = epirange(12340101, 34560101), + geo_values = "*", + issues = epirange(12340101, 34560101) +) %>% fetch_tbl() + +hhs_influenza_archive = hhs_influenza_tbl %>% + rename(version = issue) %>% + as_epi_archive() +``` +Notes: + +- `epirange(12340101, 34560101)` pattern: this is a range to cover all dates. + + - MINOR: Would look better as "*". I think this is already an issue somewhere. + - MINOR: For reproducibility, 34560101 should probably be the date of the analysis or + the date before it. A couple awkward things here: + + - `epidatr` / the API let me query version data it didn't have, and + repeating this query a week from now will include additional data. This is + definitely a bug for `as_of` queries, but for these `issues` queries, it + might be debatable. + + - If using the date of the analysis, it's including version data which could + be overwritten later in the day, and the analysis might not match later + reproductions. + + - Currently we have a (noisy, poorly worded) warning when using the + `epi_archive` data for those dates. But the warning wouldn't actually + apply to later reproductions when the version data in question has been + finalized (barring exceptional circumstances). + - We plan to change settings to remove this warning by default, and make + it up to the user to change internal archive settings to give a warning + if they want it. + +- Why a separate `tbl` object? This is awkward and memory-inefficient. + + - MEDIUM: there's no `filter` or comparable function in `epi_archive`. 
It + feels awkward to have to clone the archive, and overwrite / copy-and-mutate + the data table inside (or to extract the data table, filter, and convert + back to an `epi_archive`). + - MEDIUM: R6 annoyances during development, and for users that save archives + and update epiprocess: archive objects include their R6 code; if I load a + new version of epiprocess it only affects the S3 wrappers, not the R6 + methods underneath. This leads to reloading the package appearing to have no + affect, and could lead to errors from incompatibilities between updated + wrappers and non-updated R6 methods. + +```{r} +ref_time_values = hhs_influenza_archive$DT[as.POSIXlt(version)$wday==1L,unique(version)] +``` + +- MAJOR: I ran this on a Monday. I wanted the `ref_time_values` to include that + Monday. But `epix_slide` throws it out. I ran on a patched version of + epiprocess. + +```{r} +slide_result = + hhs_influenza_archive %>% + epix_slide(function(edf, grp) { + edf %>% + mutate(version_lag = attr(., "metadata")$as_of - .data$time_value) %>% + group_by(version_lag) %>% + summarize( + n_zero = sum(!is.na(.data$value) & .data$value == 0L), + n_na = sum(is.na(.data$value)), + n_nonzero_nonna = n() - .data$n_zero - .data$n_na, + n = n(), + .groups = "drop" + ) %>% + {tibble(stats = list(.))} + }, n = 60L, group_by=c(), ref_time_values = ref_time_values) +stats_on_specific_values = + slide_result %>% + group_by(time_value) %>% + slice_head(n = 1L) %>% + ungroup() %>% + unnest(slide_value_stats) %>% + group_by(version_lag) %>% + summarize(across(c(n_zero, n_na, n_nonzero_nonna, n), sum), + .groups = "drop") %>% + mutate(p_zero = .data$n_zero / .data$n, + p_na = .data$n_na / .data$n, + p_nonzero_nonna = .data$n_nonzero_nonna / .data$n) +print(stats_on_specific_values, n=100L) +``` + +- MAJOR: I struggled to get the computation result in an accepted format. I + don't have a record of what all I tried, but some of it was failing to have + success with a list-type result. 
Some was a stupid mistake forgetting to + remove the ` = ` when moving from failing to use that form to + providing an `f` function/formula. Some was due to the row count requirement + on the computation results, discussed next. +- MAJOR: computation result output requirements and slide output don't make + sense when we don't group by all non-version, non-time key columns. We require + the computation output to have either just one row, or one row per distinct + combination of the non-grouping, non-time, non-version key columns; however, + in the former case, we repeat that single row the latter number of times, but + don't attach those non-grouping key columns to the result. + + - In this case, we didn't want that repeating behavior, thus the grouped slice + operation. + - In fact, we wanted a different number of rows than the expectation, thus the + unnesting. +- MINOR: slide computation wasn't instantaneous. Thus I stored in a + `slide_result` object while I tried to figure out the rest of the operations + that would be required. (Plus retrying computing the slide result while trying + to figure out how to make things work.) +- MINOR: missing an archive filter function here too, to speed up + experimentation / help debugging. + + +```{r} +hhs_influenza_archive %>% + epix_as_of(as.Date("2022-10-17")) %>% + filter(.data$geo_value == "nv") %>% + select(time_value, value) %>% + print(n=2000L) +``` +- MINOR: missing an archive filter function here too. 
+ +```{r} +stats_on_deltas = + hhs_influenza_archive %>% + epix_slide(function(edf, grp) { + edf %>% + arrange(geo_value, time_value) %>% + mutate(no_change = + lag(.data$time_value) == .data$time_value - 1L & + lag(.data$value) == .data$value) %>% + group_by(version_lag = as.integer(attr(., "metadata")$as_of - .data$time_value)) %>% + summarize( + n_no_change = sum(!is.na(.data$no_change) & .data$no_change), + n_na_change = sum(is.na(.data$no_change)), + n = n(), + .groups = "drop" + ) %>% + {tibble(stats = list(.))} + }, n = 60L, group_by=c(), ref_time_values = ref_time_values) %>% + group_by(time_value) %>% + slice_head(n = 1L) %>% + ungroup() %>% + unnest(slide_value_stats) %>% + group_by(version_lag) %>% + summarize(across(starts_with("n"), sum), + .groups = "drop") %>% + mutate(across(starts_with("n_"), list(prop=~.x/n))) +print(stats_on_deltas, n=100L) +``` + +```{r} +hhs_influenza_tbl %>% + filter(geo_value == "nv") %>% + rename(version = issue) %>% + as_epi_archive() %>% + epix_slide(~ tibble(data=list(.x)), n = 100L, + # ref_time_values = seq(as.Date("2022-10-01"), as.Date("2022-10-17"), by="day")) %>% + ref_time_values = seq(as.Date("2021-01-01"), as.Date("2022-10-17"), by="day")) %>% + transmute(version=time_value, slide_value_data) %>% + unnest(slide_value_data) %>% + mutate(version_lag = as.integer(version - time_value)) %>% + # ggplot(aes(time_value, value, group=version, colour=version)) %>% + # `+`(geom_line()) %>% + # `+`(scale_colour_continuous(type = "viridis", trans = "date")) + # ggplot(aes(time_value, value)) %>% + # `+`(geom_line()) %>% + # `+`(scale_colour_continuous(type = "viridis")) %>% + # `+`(facet_wrap(~ version_lag)) + filter(time_value >= as.Date("2022-09-01")) %>% + ggplot(aes(version, value)) %>% + `+`(geom_line()) %>% + `+`(scale_colour_continuous(type = "viridis")) %>% + `+`(facet_wrap(~ time_value)) +``` + +- MEDIUM: the `ref_time_values` become the `time_value` column, but I was + outputting a (sensible) `time_value` column 
from the computation. Munging the column names is a hassle. +- MINOR: I don't like the `slide_value_`/any name prefix, and no prefix isn't even an option. +- MINOR: again, missing an archive filter function + +Here, I wanted "revision sequences" for some observations in NV. +```{r} +hhs_influenza_tbl %>% + filter(geo_value == "nv") %>% + filter(issue >= as.Date("2022-10-10")) %>% + select(issue, time_value, value, lag) %>% + arrange(time_value, issue) %>% + group_by(time_value) %>% + mutate(diff_value = c(NA, diff(value))) %>% + ungroup() +``` +- NOTE: this can't be done with an epi_archive's built-in functions. But I could + have just done `hhs_influenza_archive$DT` and used a data table; a separate + tibble wasn't required yet. + +Try to get a sense of the scale of available data in a couple of locations: +```{r} +hhs_influenza_archive %>% + epix_as_of(.$versions_end) %>% + group_by(geo_value) %>% + summarize(tibble(levels = (0:20)/20, + quantiles = quantile(value, levels, na.rm=TRUE))) %>% + filter(geo_value %in% c("al", "ar")) %>% + print(n=50L) +``` + +Compute and plot 7-day sum data: +```{r} +hhs_influenza_archive %>% + epix_as_of(.$versions_end) %>% + filter(geo_value %in% c("al", "ar")) %>% + tidyr::complete(geo_value, time_value = tidyr::full_seq(time_value, 1L)) %>% + dplyr::group_by(geo_value) %>% + dplyr::mutate(value_7dsum = frollsum(value, 7L)) %>% + # ggplot(aes(time_value, value)) %>% + ggplot(aes(time_value, value_7dsum)) %>% + `+`(geom_line()) %>% + `+`(facet_wrap(~ geo_value)) +``` + +- MEDIUM: I had to calculate the 7d sum myself; while this is simple, it's maybe + a little tedious, and might be error-prone if there are missing rows. There's + no 7d sum in the API. In the future we won't be able to fetch 7d average + issue data either, so that's out for use with `epi_archive`s. 
+ +```{r} +# chng_influenza_tbl = covidcast( +# data_source = "chng", +# signals = "smoothed_adj_outpatient_flu", +# time_type = "day", +# geo_type = "state", +# time_values = epirange(12340101, 34560101), +# geo_values = "*", +# issues = epirange(12340101, 34560101) +# ) %>% fetch_tbl() + +# chng_influenza_archive = chng_influenza_tbl %>% +# rename(version = issue) %>% +# as_epi_archive() +``` + +```{r} +# chng_influenza_tbl %>% +# filter(geo_value == "nv") %>% +# rename(version = issue) %>% +# as_epi_archive() %>% +# epix_slide(~ tibble(data=list(.x)), n = 100L, +# ref_time_values = seq(as.Date("2022-05-01"), as.Date("2022-10-13"), by="day")) %>% +# # ref_time_values = seq(as.Date("2021-01-01"), as.Date("2022-10-17"), by="day")) %>% +# transmute(version=time_value, slide_value_data) %>% +# unnest(slide_value_data) %>% +# mutate(version_lag = as.integer(version - time_value)) %>% +# ggplot(aes(time_value, value, group=version, colour=version)) %>% +# `+`(geom_line()) %>% +# `+`(scale_colour_continuous(type = "viridis", trans = "date")) +# # ggplot(aes(time_value, value)) %>% +# # `+`(geom_line()) %>% +# # `+`(scale_colour_continuous(type = "viridis")) %>% +# # `+`(facet_wrap(~ version_lag)) +# # filter(time_value >= as.Date("2022-09-01")) %>% +# # ggplot(aes(version, value)) %>% +# # `+`(geom_line()) %>% +# # `+`(scale_colour_continuous(type = "viridis")) %>% +# # `+`(facet_wrap(~ time_value)) +``` + From 0f62b9f820389996277c39bf1c8b3cc468004f07 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Fri, 12 May 2023 07:37:02 -0700 Subject: [PATCH 03/21] In backcasting example, note latency indexing issues if try forecasts --- vignettes/backtesting.Rmd | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd index 3644e073..e79090fb 100644 --- a/vignettes/backtesting.Rmd +++ b/vignettes/backtesting.Rmd @@ -305,6 +305,10 @@ forecaster_output = training_and_testing_edf %>% ) ) +# FIXME this works for backcasting revisions to the most recent value available, +# but will be predicting the wrong aheads if we were trying to forecast 14/7 +# days beyond the `forecast_date` (since we shifted `admissions`). + library(ggplot2) training_and_testing_edf %>% From f6e79db24948bd4e2d7eb7947ad538f4786214a2 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 13 Jun 2023 09:51:51 -0700 Subject: [PATCH 04/21] Add latency&revision stats&plotting Rmd --- vignettes/latency_revision_stats.Rmd | 194 +++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 vignettes/latency_revision_stats.Rmd diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd new file mode 100644 index 00000000..c73eec14 --- /dev/null +++ b/vignettes/latency_revision_stats.Rmd @@ -0,0 +1,194 @@ +--- +title: Calculating latency and revision statistics +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Calculating latency and revision statistics} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +# Load data + +```{r} +library(epidatr) +library(epiprocess) +library(dplyr) +library(tidyr) +library(ggplot2) + +# For reproducibility, we'll work with data reporting through some fixed issue +# we've already observed. 
For better reproducibility, that issue should be at +# least two days ago (from today), to incorporate any "hotfixes" to that data +# and allow up to a day for database replication delays: +hhs_flu_hosp_analysis_issue = as.Date("2023-05-31") - 2L + +# We'll query version data for this data set from the Delphi Epidata API using +# epidatr, caching the result so that we can re-run this chunk repeatedly a bit +# faster, while offline, and without placing undue load on the API servers. +hhs_flu_hosp_issue_data_path = "hhs_flu_hosp_issue_data.rds" +if (!file.exists(hhs_flu_hosp_issue_data_path)) { + cat("Fetching.\n") + hhs_flu_hosp_issue_data = + covidcast( + data_source = "hhs", signals = "confirmed_admissions_influenza_1d", + geo_type = "state", time_type = "day", + geo_values = "*", + # all `time_value`s: + time_values = "*", + # all issues through the analysis issue + issues = epirange(12340101, hhs_flu_hosp_analysis_issue) + ) %>% + fetch() + saveRDS(hhs_flu_hosp_issue_data, hhs_flu_hosp_issue_data_path) + writeLines(c("License: Public Domain US Government", + "Accessed via COVIDcast Epidata API"), + "LICENSE_hhs_flu_hosp_issue_data.txt") +} else { + cat("Using cache.\n") + hhs_flu_hosp_issue_data = readRDS(hhs_flu_hosp_issue_data_path) +} + +# Remove some data from a mostly-NA/0/low regime, and put version data into +# the `epi_archive` format: +hhs_flu_hosp_start_time_value = as.Date("2020-12-01") +hhs_flu_hosp_archive = hhs_flu_hosp_issue_data %>% + filter(time_value >= hhs_flu_hosp_start_time_value) %>% + select(geo_value, time_value, version = issue, admissions = value) %>% + as_epi_archive(compactify = TRUE) +``` + +What is the distribution of latency across all locations and versions? 
+```{r, fig.width=7, fig.height=5} +n_bools_true_at_tail = function(x) { + sum(cumprod(rev(x))) +} + +last_or_na = function(x) { + if (length(x) != 0L) { + x[[length(x)]] + } else { + vctrs::vec_cast(NA, x) + } +} + +latency_stats = function(window_data, group_key, ref_time_value) { + window_data %>% + group_by(geo_value) %>% + complete(time_value = full_seq(time_value, period = 1L)) %>% + ungroup() %>% + mutate( + is_na = is.na(admissions), + is_na0 = is_na | admissions == 0L + ) %>% + group_by(geo_value) %>% + mutate( + is_na_same = is_na0 | + admissions == last_or_na(admissions[!is_na]), + is_na0same = is_na0 | + admissions == last_or_na(admissions[!is_na0]) + ) %>% + summarize( + nominal_latency = as.integer(ref_time_value - max(time_value)), + na_latency = n_bools_true_at_tail(is_na) + nominal_latency, + na0_latency = n_bools_true_at_tail(is_na0) + nominal_latency, + na_dup_latency = n_bools_true_at_tail(is_na_same) - 1L + nominal_latency, + na0dup_latency = n_bools_true_at_tail(is_na0same) - 1L + nominal_latency + ) +} + +hhs_flu_hosp_latency_by_version_geo <- + hhs_flu_hosp_archive %>% + epix_slide( + latency_stats, + before = 27L, # don't count latency more than this far back + names_sep = NULL # don't add "slide_value_" colname prefixes + ) %>% + rename(version = time_value) + +hhs_flu_hosp_latency_by_version_geo %>% + ggplot(aes(nominal_latency)) + + geom_histogram(binwidth = 1L) +``` + +Over time: +```{r, fig.width = 7, fig.height = 5} +hhs_flu_hosp_latency_by_version_geo %>% + group_by(version) %>% + summarize(mean_nominal_latency = mean(nominal_latency)) %>% + ggplot(aes(version, mean_nominal_latency)) + + geom_line() + + expand_limits(y=0) + + xlab("Version") + + ylab("Mean Nominal Latency Across Locations") +``` +How accurate is `k`-day-old data on average? 
+```{r} +max_analysis_lag = 30L + +hhs_flu_hosp_version_lag_data = + hhs_flu_hosp_archive %>% + epix_slide( + ~ .x, + before = max_analysis_lag, + as_list_col = TRUE + ) %>% + rename(version = time_value) %>% + unnest(slide_value) %>% + mutate(version_lag = as.integer(version - time_value)) + +hhs_flu_hosp_latest_snapshot = + hhs_flu_hosp_archive %>% + epix_as_of(.$versions_end) + +left_join(hhs_flu_hosp_version_lag_data, + hhs_flu_hosp_latest_snapshot %>% + filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), + by = c("geo_value", "time_value"), + suffix = c("_lagged", "_latest")) %>% + tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% + group_by(version_lag) %>% + summarize(MAPE1 = 100*mean(abs((admissions_latest - admissions_lagged)/(admissions_latest + 1)))) %>% + ggplot(aes(version_lag, MAPE1)) + + geom_line() + +left_join(hhs_flu_hosp_version_lag_data, + hhs_flu_hosp_latest_snapshot %>% + filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), + by = c("geo_value", "time_value"), + suffix = c("_lagged", "_latest")) %>% + tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% + group_by(version_lag) %>% + summarize(MdAPE1 = 100*median(abs((admissions_latest - admissions_lagged)/(admissions_latest + 1)))) %>% + ggplot(aes(version_lag, MdAPE1)) + + geom_line() + +left_join(hhs_flu_hosp_version_lag_data, + hhs_flu_hosp_latest_snapshot %>% + filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), + by = c("geo_value", "time_value"), + suffix = c("_lagged", "_latest")) %>% + tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% + group_by(version_lag) %>% + summarize(Q3APE1 = 100*quantile(abs((admissions_latest - admissions_lagged)/(admissions_latest + 1)), 0.75)) %>% + ggplot(aes(version_lag, Q3APE1)) + + geom_line() + +left_join(hhs_flu_hosp_version_lag_data, + hhs_flu_hosp_latest_snapshot %>% + filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), + by = 
c("geo_value", "time_value"), + suffix = c("_lagged", "_latest")) %>% + tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% + group_by(version_lag) %>% + reframe( + tau = (1:9)/10, + QtauAPE1 = 100*quantile(abs((admissions_latest - admissions_lagged)/(admissions_latest + 1)), tau) + ) %>% + mutate(tau = as.factor(format(tau, digits = 3))) %>% + ggplot(aes(version_lag, QtauAPE1, colour = tau, group = tau, label = tau)) + + geom_line() + + scale_colour_discrete( + guide = guide_legend(reverse = TRUE, + override.aes = list(label = "")) + ) +``` From 85bf8123913c30c21c8ccf4e8a57eff8fea8ba9c Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Tue, 13 Jun 2023 09:57:14 -0700 Subject: [PATCH 05/21] Port meteor shower latency plot from slide --- vignettes/latency_revision_stats.Rmd | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index c73eec14..387d280b 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -121,6 +121,19 @@ hhs_flu_hosp_latency_by_version_geo %>% xlab("Version") + ylab("Mean Nominal Latency Across Locations") ``` + +```{r} +hhs_flu_hosp_latency_by_version_geo %>% + mutate(nominal_latency = case_when(nominal_latency <= 2 ~ NA, TRUE ~ nominal_latency)) %>% + ggplot(aes(time_value, geo_value, fill = nominal_latency)) + + geom_raster() + + scale_fill_viridis_c(na.value = NA) + + theme_bw() + + scale_x_date(date_labels = "%b %Y") + + xlab("Version") + + ylab("HHS Flu Hospitalization Nominal Latency") +``` + How accurate is `k`-day-old data on average? ```{r} max_analysis_lag = 30L From 8edf29ad56c757eb9d08ab0d3a9146c8378a38da Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 13 Jun 2023 09:59:55 -0700 Subject: [PATCH 06/21] Tweak meteor shower latency plots, add variants --- vignettes/latency_revision_stats.Rmd | 56 ++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 387d280b..ee96903c 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -124,16 +124,66 @@ hhs_flu_hosp_latency_by_version_geo %>% ```{r} hhs_flu_hosp_latency_by_version_geo %>% - mutate(nominal_latency = case_when(nominal_latency <= 2 ~ NA, TRUE ~ nominal_latency)) %>% - ggplot(aes(time_value, geo_value, fill = nominal_latency)) + + mutate(nominal_latency = case_when( + nominal_latency <= 2 ~ NA, + TRUE ~ nominal_latency + )) %>% + ggplot(aes(version, geo_value, fill = nominal_latency)) + geom_raster() + - scale_fill_viridis_c(na.value = NA) + + scale_fill_viridis_c(na.value = "transparent") + theme_bw() + scale_x_date(date_labels = "%b %Y") + xlab("Version") + ylab("HHS Flu Hospitalization Nominal Latency") ``` +```{r} +hhs_flu_hosp_latency_by_version_geo %>% + mutate(na_latency = case_when( + na_latency <= 2 ~ NA, + TRUE ~ na_latency + )) %>% + ggplot(aes(version, geo_value, fill = na_latency)) + + geom_raster() + + scale_fill_viridis_c(na.value = "transparent") + + theme_bw() + + scale_x_date(date_labels = "%b %Y") + + xlab("Version") + + ylab("HHS Flu Hospitalization Nonmissing Value Latency") +``` + +```{r} +hhs_flu_hosp_latency_by_version_geo %>% + mutate(na_dup_latency = case_when( + na_dup_latency <= 2 ~ NA, + TRUE ~ na_dup_latency + )) %>% + ggplot(aes(version, geo_value, fill = na_dup_latency)) + + geom_raster() + + scale_fill_viridis_c(na.value = "transparent") + + theme_bw() + + scale_x_date(date_labels = "%b %Y") + + xlab("Version") + + ylab("HHS Flu Hospitalization Nonmissing Nonduplicate Latency") +``` + +```{r} +hhs_flu_hosp_latency_by_version_geo %>% + mutate(na0dup_latency = 
case_when( + na0dup_latency <= 2 ~ NA, + TRUE ~ na0dup_latency + )) %>% + ggplot(aes(version, geo_value, fill = na0dup_latency)) + + geom_raster() + + scale_fill_viridis_c(na.value = "transparent") + + theme_bw() + + scale_x_date(date_labels = "%b %Y") + + xlab("Version") + + ylab("HHS Flu Hospitalization Nonmissing Nonzero Nonduplicate Latency") +``` + + + How accurate is `k`-day-old data on average? ```{r} max_analysis_lag = 30L From 9361e526e1e4aaa9dd2b831d5a8b62985eb11726 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 13 Jun 2023 10:37:23 -0700 Subject: [PATCH 07/21] Update latency stats analysis date, style, discussion --- vignettes/latency_revision_stats.Rmd | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index ee96903c..66830d4c 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -20,7 +20,7 @@ library(ggplot2) # we've already observed. For better reproducibility, that issue should be at # least two days ago (from today), to incorporate any "hotfixes" to that data # and allow up to a day for database replication delays: -hhs_flu_hosp_analysis_issue = as.Date("2023-05-31") - 2L +hhs_flu_hosp_analysis_issue = as.Date("2023-06-13") - 2L # We'll query version data for this data set from the Delphi Epidata API using # epidatr, caching the result so that we can re-run this chunk repeatedly a bit @@ -57,7 +57,18 @@ hhs_flu_hosp_archive = hhs_flu_hosp_issue_data %>% as_epi_archive(compactify = TRUE) ``` -What is the distribution of latency across all locations and versions? 
+There are several notions of latency, some of which include: +- "Nominal latency": the difference between the max `time_value` available for + some location, as of some version +- "Nonmissing latency": also considers explicitly-recorded `NA`s and any skipped + `time_value`s as latent +- "Nonmissing nonduplicate latency": considers `NA`s, skipped `time_value`s, and + duplications of the most recent non-`NA` value to be latent +- "Nonmissing nonzero nonduplicate latency": considers `NA`s, skipped + `time_value`s, zeros, and duplications of the most recent non-`NA`, nonzero + value to be latent + +We'll calculate all the above definitions of latency using `epix_slide()`: ```{r, fig.width=7, fig.height=5} n_bools_true_at_tail = function(x) { sum(cumprod(rev(x))) @@ -96,15 +107,18 @@ latency_stats = function(window_data, group_key, ref_time_value) { ) } -hhs_flu_hosp_latency_by_version_geo <- +hhs_flu_hosp_latency_by_version_geo = hhs_flu_hosp_archive %>% epix_slide( latency_stats, - before = 27L, # don't count latency more than this far back + before = 27L, # don't calculate latency more than this far back names_sep = NULL # don't add "slide_value_" colname prefixes ) %>% rename(version = time_value) +``` +Some summary statistics related to the "nominal latency": +```{r} hhs_flu_hosp_latency_by_version_geo %>% ggplot(aes(nominal_latency)) + geom_histogram(binwidth = 1L) @@ -122,6 +136,7 @@ hhs_flu_hosp_latency_by_version_geo %>% ylab("Mean Nominal Latency Across Locations") ``` +Finer-grained visualizations of different notions of latency: ```{r} hhs_flu_hosp_latency_by_version_geo %>% mutate(nominal_latency = case_when( From 84bb6bcd4b95f87fc2f56d5a976485d5fe032fb6 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 13 Jun 2023 12:53:17 -0700 Subject: [PATCH 08/21] Fix latency calc bugs, missing descriptions, apply ggplotly --- DESCRIPTION | 1 + vignettes/latency_revision_stats.Rmd | 133 +++++++++++++++++++-------- 2 files changed, 97 insertions(+), 37 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index edbae226..a3d8724f 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,6 +49,7 @@ Suggests: ggplot2, knitr, outbreaks, + plotly, rmarkdown, testthat (>= 3.1.5), waldo (>= 0.3.1), diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 66830d4c..22afc390 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -15,6 +15,7 @@ library(epiprocess) library(dplyr) library(tidyr) library(ggplot2) +library(plotly) # For reproducibility, we'll work with data reporting through some fixed issue # we've already observed. For better reproducibility, that issue should be at @@ -57,11 +58,15 @@ hhs_flu_hosp_archive = hhs_flu_hosp_issue_data %>% as_epi_archive(compactify = TRUE) ``` +# Latency statistics + There are several notions of latency, some of which include: - "Nominal latency": the difference between the max `time_value` available for some location, as of some version - "Nonmissing latency": also considers explicitly-recorded `NA`s and any skipped `time_value`s as latent +- "Nonmissing nonzero latency": considers explicitly-recorded `NA`s, skipped + `time_value`s, and zeros as latent - "Nonmissing nonduplicate latency": considers `NA`s, skipped `time_value`s, and duplications of the most recent non-`NA` value to be latent - "Nonmissing nonzero nonduplicate latency": considers `NA`s, skipped @@ -69,9 +74,9 @@ There are several notions of latency, some of which include: value to be latent We'll calculate all the above definitions of latency using `epix_slide()`: -```{r, fig.width=7, fig.height=5} +```{r latency-stats, cache=TRUE} n_bools_true_at_tail = function(x) { - 
sum(cumprod(rev(x))) + match(FALSE, rev(x), nomatch = length(x) + 1L) - 1L } last_or_na = function(x) { @@ -93,7 +98,7 @@ latency_stats = function(window_data, group_key, ref_time_value) { ) %>% group_by(geo_value) %>% mutate( - is_na_same = is_na0 | + is_na_same = is_na | admissions == last_or_na(admissions[!is_na]), is_na0same = is_na0 | admissions == last_or_na(admissions[!is_na0]) @@ -101,9 +106,13 @@ latency_stats = function(window_data, group_key, ref_time_value) { summarize( nominal_latency = as.integer(ref_time_value - max(time_value)), na_latency = n_bools_true_at_tail(is_na) + nominal_latency, - na0_latency = n_bools_true_at_tail(is_na0) + nominal_latency, - na_dup_latency = n_bools_true_at_tail(is_na_same) - 1L + nominal_latency, - na0dup_latency = n_bools_true_at_tail(is_na0same) - 1L + nominal_latency + na0_latency = n_bools_true_at_tail(is_na0) + nominal_latency, + na_dup_latency = + # (don't count NAs before the first copy of the last non-na) + sum(cumsum(is_na_same[rev(cumsum(rev(!is_na_same))) == 0L]) != 0L) - 1L, + na0dup_latency = + # (don't count NAs and 0s before the first copy of the last non-na0) + sum(cumsum(is_na0same[rev(cumsum(rev(!is_na0same))) == 0L]) != 0L) - 1L ) } @@ -117,28 +126,12 @@ hhs_flu_hosp_latency_by_version_geo = rename(version = time_value) ``` -Some summary statistics related to the "nominal latency": -```{r} -hhs_flu_hosp_latency_by_version_geo %>% - ggplot(aes(nominal_latency)) + - geom_histogram(binwidth = 1L) -``` - -Over time: -```{r, fig.width = 7, fig.height = 5} -hhs_flu_hosp_latency_by_version_geo %>% - group_by(version) %>% - summarize(mean_nominal_latency = mean(nominal_latency)) %>% - ggplot(aes(version, mean_nominal_latency)) + - geom_line() + - expand_limits(y=0) + - xlab("Version") + - ylab("Mean Nominal Latency Across Locations") -``` +## Finer-grained visualizations of different notions of latency -Finer-grained visualizations of different notions of latency: -```{r} -hhs_flu_hosp_latency_by_version_geo 
%>% +Nominal latency, where it is over 2 days: +```{r, out.width = "60em", out.height="60em"} +plt = + hhs_flu_hosp_latency_by_version_geo %>% mutate(nominal_latency = case_when( nominal_latency <= 2 ~ NA, TRUE ~ nominal_latency @@ -150,10 +143,13 @@ hhs_flu_hosp_latency_by_version_geo %>% scale_x_date(date_labels = "%b %Y") + xlab("Version") + ylab("HHS Flu Hospitalization Nominal Latency") +ggplotly(plt) ``` -```{r} -hhs_flu_hosp_latency_by_version_geo %>% +Nonmissing latency, where it is over 2 days: +```{r, out.width = "60em", out.height="60em"} +plt = + hhs_flu_hosp_latency_by_version_geo %>% mutate(na_latency = case_when( na_latency <= 2 ~ NA, TRUE ~ na_latency @@ -164,13 +160,35 @@ hhs_flu_hosp_latency_by_version_geo %>% theme_bw() + scale_x_date(date_labels = "%b %Y") + xlab("Version") + - ylab("HHS Flu Hospitalization Nonmissing Value Latency") + ylab("HHS Flu Hospitalization Nonmissing Latency") +ggplotly(plt) ``` -```{r} -hhs_flu_hosp_latency_by_version_geo %>% +Nonmissing nonzero latency, where it is over 2 days: +```{r, out.width = "60em", out.height="60em"} +plt = + hhs_flu_hosp_latency_by_version_geo %>% + mutate(na0_latency = case_when( + na0_latency <= 2 ~ NA, + TRUE ~ na0_latency + )) %>% + ggplot(aes(version, geo_value, fill = na0_latency)) + + geom_raster() + + scale_fill_viridis_c(na.value = "transparent") + + theme_bw() + + scale_x_date(date_labels = "%b %Y") + + xlab("Version") + + ylab("HHS Flu Hospitalization Nonmissing Nonzero Latency") +ggplotly(plt) +``` + + +Nonmissing nonduplicate latency, where it is over **3** days: +```{r, out.width = "60em", out.height="60em"} +plt = + hhs_flu_hosp_latency_by_version_geo %>% mutate(na_dup_latency = case_when( - na_dup_latency <= 2 ~ NA, + na_dup_latency <= 3 ~ NA, TRUE ~ na_dup_latency )) %>% ggplot(aes(version, geo_value, fill = na_dup_latency)) + @@ -180,12 +198,15 @@ hhs_flu_hosp_latency_by_version_geo %>% scale_x_date(date_labels = "%b %Y") + xlab("Version") + ylab("HHS Flu Hospitalization 
Nonmissing Nonduplicate Latency") +ggplotly(plt) ``` -```{r} -hhs_flu_hosp_latency_by_version_geo %>% +Nonmissing nonzero nonduplicate latency, where it is over **3** days: +```{r, out.width = "60em", out.height="60em"} +plt = + hhs_flu_hosp_latency_by_version_geo %>% mutate(na0dup_latency = case_when( - na0dup_latency <= 2 ~ NA, + na0dup_latency <= 3 ~ NA, TRUE ~ na0dup_latency )) %>% ggplot(aes(version, geo_value, fill = na0dup_latency)) + @@ -195,12 +216,50 @@ hhs_flu_hosp_latency_by_version_geo %>% scale_x_date(date_labels = "%b %Y") + xlab("Version") + ylab("HHS Flu Hospitalization Nonmissing Nonzero Nonduplicate Latency") +ggplotly(plt) +``` + +## Plotting individual time series snapshots of interest + +There's a time span of high latency according to some definitions, e.g., for DE as of 2023-05-11; let's see what the time series looked like as of that version: +```{r} +plt = + hhs_flu_hosp_archive %>% + epix_as_of(as.Date("2023-05-11"), + # plot just a year of data: + min_time_value = as.Date("2022-05-12")) %>% + filter(geo_value == "de") %>% + complete(time_value = full_seq(time_value, period = 1L)) %>% + ggplot(aes(time_value, admissions)) + + geom_point() +ggplotly(plt) +``` +We can see at later time values that the number of admissions alternates between 0 most days and 1 other days. In some definitions of latency, we consider all but the first 1 + +## Some summary statistics related to the nonmissing latency: + +```{r, fig.width = 7, fig.height = 5} +hhs_flu_hosp_latency_by_version_geo %>% + ggplot(aes(na_latency)) + + geom_histogram(binwidth = 1L) ``` +Over time: +```{r, fig.width = 7, fig.height = 5} +hhs_flu_hosp_latency_by_version_geo %>% + group_by(version) %>% + summarize(mean_na_latency = mean(na_latency)) %>% + ggplot(aes(version, mean_na_latency)) + + geom_line() + + expand_limits(y=0) + + xlab("Version") + + ylab("Mean Nonmissing Latency Across Locations") +``` +# Revision statistics How accurate is `k`-day-old data on average? 
-```{r} +```{r, fig.width = 7, fig.height = 5} max_analysis_lag = 30L hhs_flu_hosp_version_lag_data = From 0b940c1dce3ed52a2b9f371ed4b8b327dd348de1 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 13 Jun 2023 14:04:21 -0700 Subject: [PATCH 09/21] Fix nominal latency not being included in some measures --- vignettes/latency_revision_stats.Rmd | 51 +++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 22afc390..2bd419ef 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -107,12 +107,13 @@ latency_stats = function(window_data, group_key, ref_time_value) { nominal_latency = as.integer(ref_time_value - max(time_value)), na_latency = n_bools_true_at_tail(is_na) + nominal_latency, na0_latency = n_bools_true_at_tail(is_na0) + nominal_latency, + # FIXME this still looks buggy; why don't we have the same all-geo event lines as in the above? 
na_dup_latency = # (don't count NAs before the first copy of the last non-na) - sum(cumsum(is_na_same[rev(cumsum(rev(!is_na_same))) == 0L]) != 0L) - 1L, + sum(cumsum(is_na_same[rev(cumsum(rev(!is_na_same))) == 0L]) != 0L) - 1L + nominal_latency, na0dup_latency = # (don't count NAs and 0s before the first copy of the last non-na0) - sum(cumsum(is_na0same[rev(cumsum(rev(!is_na0same))) == 0L]) != 0L) - 1L + sum(cumsum(is_na0same[rev(cumsum(rev(!is_na0same))) == 0L]) != 0L) - 1L + nominal_latency ) } @@ -222,7 +223,7 @@ ggplotly(plt) ## Plotting individual time series snapshots of interest There's a time span of high latency according to some definitions, e.g., for DE as of 2023-05-11; let's see what the time series looked like as of that version: -```{r} +```{r, out.width = "100%"} plt = hhs_flu_hosp_archive %>% epix_as_of(as.Date("2023-05-11"), @@ -234,19 +235,52 @@ plt = geom_point() ggplotly(plt) ``` -We can see at later time values that the number of admissions alternates between 0 most days and 1 other days. In some definitions of latency, we consider all but the first 1 +We can see at later time values that the number of admissions alternates between 0 most days and 1 other days. In some definitions of latency, we consider all but the first 1 in these sections to be "latent". 
+ +There's also some interesting patterns where latency appears to decrease in AS, then suddenly jump up; let's visualize some snapshots here: +```{r} + hhs_flu_hosp_archive %>% + epix_slide( + ~ .x %>% filter(geo_value == "as") %>% rename(atv = time_value), + before = 27L, # don't calculate latency more than this far back + names_sep = NULL # don't add "slide_value_" colname prefixes + ) %>% + rename(version = time_value, time_value = atv) %>% + filter(version %in% as.Date(c("2023-03-20", "2023-04-02", "2023-04-03"))) %>% + ggplot(aes(time_value, admissions, colour = version)) + + geom_line() +``` + +Let's check some counts instead: +```{r} +hhs_flu_hosp_archive %>% + epix_slide( + ~ .x %>% + filter(geo_value == "as") %>% + complete(time_value = full_seq(time_value, period = 1L)) %>% + rename(atv = time_value), + before = 27L, # don't calculate latency more than this far back + names_sep = NULL # don't add "slide_value_" colname prefixes + ) %>% + rename(version = time_value, time_value = atv) %>% + filter(version %in% as.Date(c("2023-03-20", "2023-04-02", "2023-04-03"))) %>% + count(version) +``` ## Some summary statistics related to the nonmissing latency: -```{r, fig.width = 7, fig.height = 5} -hhs_flu_hosp_latency_by_version_geo %>% +```{r, out.width = "100%"} +plt = + hhs_flu_hosp_latency_by_version_geo %>% ggplot(aes(na_latency)) + geom_histogram(binwidth = 1L) +ggplotly(plt) ``` Over time: -```{r, fig.width = 7, fig.height = 5} -hhs_flu_hosp_latency_by_version_geo %>% +```{r, out.width = "100%"} +plt = + hhs_flu_hosp_latency_by_version_geo %>% group_by(version) %>% summarize(mean_na_latency = mean(na_latency)) %>% ggplot(aes(version, mean_na_latency)) + @@ -254,6 +288,7 @@ hhs_flu_hosp_latency_by_version_geo %>% expand_limits(y=0) + xlab("Version") + ylab("Mean Nonmissing Latency Across Locations") +ggplotly(plt) ``` # Revision statistics From 4f6c0773a3ea5037ad84028614749397a4cd1c29 Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Thu, 15 Jun 2023 14:59:10 -0700 Subject: [PATCH 10/21] Add `yaml` to `Suggests:` for `renv` dependency parsing --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a3d8724f..49197843 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,7 +53,8 @@ Suggests: rmarkdown, testthat (>= 3.1.5), waldo (>= 0.3.1), - withr + withr, + yaml VignetteBuilder: knitr Remotes: From 9def0922bc5eefc26987eb931b3fcdc6d8d422c9 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Aug 2023 09:59:06 -0700 Subject: [PATCH 11/21] Latency vignette: fix hhs flu start date, improve naming, more ggplotly --- vignettes/latency_revision_stats.Rmd | 105 ++++++++++++++++----------- 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 2bd419ef..6b68049f 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -37,30 +37,47 @@ if (!file.exists(hhs_flu_hosp_issue_data_path)) { # all `time_value`s: time_values = "*", # all issues through the analysis issue - issues = epirange(12340101, hhs_flu_hosp_analysis_issue) + issues = epirange(12340101, hhs_influenza_hosp_analysis_issue) ) %>% fetch() - saveRDS(hhs_flu_hosp_issue_data, hhs_flu_hosp_issue_data_path) + saveRDS(hhs_influenza_hosp_issue_data, hhs_influenza_hosp_issue_data_path) writeLines(c("License: Public Domain US Government", "Accessed via COVIDcast Epidata API"), - "LICENSE_hhs_flu_hosp_issue_data.txt") + "LICENSE_hhs_influenza_hosp_issue_data.txt") } else { cat("Using cache.\n") - hhs_flu_hosp_issue_data = readRDS(hhs_flu_hosp_issue_data_path) + hhs_influenza_hosp_issue_data = readRDS(hhs_influenza_hosp_issue_data_path) } -# Remove some data from a mostly-NA/0/low regime, and put version data into -# the `epi_archive` format: -hhs_flu_hosp_start_time_value = as.Date("2020-12-01") -hhs_flu_hosp_archive = hhs_flu_hosp_issue_data %>% - 
filter(time_value >= hhs_flu_hosp_start_time_value) %>% +# Use the compactification feature to see when values changed in the data set, +# not just re-reporting of the same values. (The Delphi Epidata API already +# compactifies much of the issue data, but not all of it.) +hhs_influenza_hosp_issue_data %>% select(geo_value, time_value, version = issue, admissions = value) %>% + as_epi_archive(compactify = TRUE) %>% + .$DT %>% + `[`(, .N, by = version) %>% + print(topn = 20L) + +# It looks like more regular reporting started around version 2021-09-21, with +# 2021-09-21 being a little different as well. It's currently not easy to filter +# diff-based issue data correctly based on the version, so let's just filter out +# some this reporting based on time values that we expect would be included. We +# might also consider one of the mandatory reporting start dates. We're working +# on ways to filter both version and time values more easily from an archive +# object directly. +hhs_influenza_hosp_start_time_value = as.Date("2021-09-21") +hhs_influenza_hosp_archive = hhs_influenza_hosp_issue_data %>% + filter(time_value >= hhs_influenza_hosp_start_time_value) %>% + select(geo_value, time_value, version = issue, admissions = value) %>% + mutate(geo_value = factor(geo_value, epidatasets::state_census %>% arrange(pop) %>% pull(abbr))) %>% as_epi_archive(compactify = TRUE) ``` # Latency statistics There are several notions of latency, some of which include: + - "Nominal latency": the difference between the max `time_value` available for some location, as of some version - "Nonmissing latency": also considers explicitly-recorded `NA`s and any skipped @@ -130,9 +147,11 @@ hhs_flu_hosp_latency_by_version_geo = ## Finer-grained visualizations of different notions of latency Nominal latency, where it is over 2 days: -```{r, out.width = "60em", out.height="60em"} +```{r, out.width = "100%", out.height="60em"} +class(hhs_influenza_hosp_latency_by_version_geo$geo_value) 
+sessioninfo::session_info() plt = - hhs_flu_hosp_latency_by_version_geo %>% + hhs_influenza_hosp_latency_by_version_geo %>% mutate(nominal_latency = case_when( nominal_latency <= 2 ~ NA, TRUE ~ nominal_latency @@ -143,14 +162,14 @@ plt = theme_bw() + scale_x_date(date_labels = "%b %Y") + xlab("Version") + - ylab("HHS Flu Hospitalization Nominal Latency") + ylab("HHS Influenza Hospitalization Nominal Latency") ggplotly(plt) ``` Nonmissing latency, where it is over 2 days: ```{r, out.width = "60em", out.height="60em"} plt = - hhs_flu_hosp_latency_by_version_geo %>% + hhs_influenza_hosp_latency_by_version_geo %>% mutate(na_latency = case_when( na_latency <= 2 ~ NA, TRUE ~ na_latency @@ -161,14 +180,14 @@ plt = theme_bw() + scale_x_date(date_labels = "%b %Y") + xlab("Version") + - ylab("HHS Flu Hospitalization Nonmissing Latency") + ylab("HHS Influenza Hospitalization Nonmissing Latency") ggplotly(plt) ``` Nonmissing nonzero latency, where it is over 2 days: ```{r, out.width = "60em", out.height="60em"} plt = - hhs_flu_hosp_latency_by_version_geo %>% + hhs_influenza_hosp_latency_by_version_geo %>% mutate(na0_latency = case_when( na0_latency <= 2 ~ NA, TRUE ~ na0_latency @@ -179,7 +198,7 @@ plt = theme_bw() + scale_x_date(date_labels = "%b %Y") + xlab("Version") + - ylab("HHS Flu Hospitalization Nonmissing Nonzero Latency") + ylab("HHS Influenza Hospitalization Nonmissing Nonzero Latency") ggplotly(plt) ``` @@ -187,7 +206,7 @@ ggplotly(plt) Nonmissing nonduplicate latency, where it is over **3** days: ```{r, out.width = "60em", out.height="60em"} plt = - hhs_flu_hosp_latency_by_version_geo %>% + hhs_influenza_hosp_latency_by_version_geo %>% mutate(na_dup_latency = case_when( na_dup_latency <= 3 ~ NA, TRUE ~ na_dup_latency @@ -198,14 +217,14 @@ plt = theme_bw() + scale_x_date(date_labels = "%b %Y") + xlab("Version") + - ylab("HHS Flu Hospitalization Nonmissing Nonduplicate Latency") + ylab("HHS Influenza Hospitalization Nonmissing Nonduplicate Latency") ggplotly(plt) 
``` Nonmissing nonzero nonduplicate latency, where it is over **3** days: ```{r, out.width = "60em", out.height="60em"} plt = - hhs_flu_hosp_latency_by_version_geo %>% + hhs_influenza_hosp_latency_by_version_geo %>% mutate(na0dup_latency = case_when( na0dup_latency <= 3 ~ NA, TRUE ~ na0dup_latency @@ -216,7 +235,7 @@ plt = theme_bw() + scale_x_date(date_labels = "%b %Y") + xlab("Version") + - ylab("HHS Flu Hospitalization Nonmissing Nonzero Nonduplicate Latency") + ylab("HHS Influenza Hospitalization Nonmissing Nonzero Nonduplicate Latency") ggplotly(plt) ``` @@ -225,7 +244,7 @@ ggplotly(plt) There's a time span of high latency according to some definitions, e.g., for DE as of 2023-05-11; let's see what the time series looked like as of that version: ```{r, out.width = "100%"} plt = - hhs_flu_hosp_archive %>% + hhs_influenza_hosp_archive %>% epix_as_of(as.Date("2023-05-11"), # plot just a year of data: min_time_value = as.Date("2022-05-12")) %>% @@ -239,7 +258,8 @@ We can see at later time values that the number of admissions alternates between There's also some interesting patterns where latency appears to decrease in AS, then suddenly jump up; let's visualize some snapshots here: ```{r} - hhs_flu_hosp_archive %>% +plt = + hhs_influenza_hosp_archive %>% epix_slide( ~ .x %>% filter(geo_value == "as") %>% rename(atv = time_value), before = 27L, # don't calculate latency more than this far back @@ -249,11 +269,12 @@ There's also some interesting patterns where latency appears to decrease in AS, filter(version %in% as.Date(c("2023-03-20", "2023-04-02", "2023-04-03"))) %>% ggplot(aes(time_value, admissions, colour = version)) + geom_line() +ggplotly(plt) ``` Let's check some counts instead: ```{r} -hhs_flu_hosp_archive %>% +hhs_influenza_hosp_archive %>% epix_slide( ~ .x %>% filter(geo_value == "as") %>% @@ -271,7 +292,7 @@ hhs_flu_hosp_archive %>% ```{r, out.width = "100%"} plt = - hhs_flu_hosp_latency_by_version_geo %>% + hhs_influenza_hosp_latency_by_version_geo %>% 
ggplot(aes(na_latency)) + geom_histogram(binwidth = 1L) ggplotly(plt) @@ -280,7 +301,7 @@ ggplotly(plt) Over time: ```{r, out.width = "100%"} plt = - hhs_flu_hosp_latency_by_version_geo %>% + hhs_influenza_hosp_latency_by_version_geo %>% group_by(version) %>% summarize(mean_na_latency = mean(na_latency)) %>% ggplot(aes(version, mean_na_latency)) + @@ -297,8 +318,8 @@ How accurate is `k`-day-old data on average? ```{r, fig.width = 7, fig.height = 5} max_analysis_lag = 30L -hhs_flu_hosp_version_lag_data = - hhs_flu_hosp_archive %>% +hhs_influenza_hosp_version_lag_data = + hhs_influenza_hosp_archive %>% epix_slide( ~ .x, before = max_analysis_lag, @@ -308,13 +329,14 @@ hhs_flu_hosp_version_lag_data = unnest(slide_value) %>% mutate(version_lag = as.integer(version - time_value)) -hhs_flu_hosp_latest_snapshot = - hhs_flu_hosp_archive %>% +hhs_influenza_hosp_latest_snapshot = + hhs_influenza_hosp_archive %>% epix_as_of(.$versions_end) -left_join(hhs_flu_hosp_version_lag_data, - hhs_flu_hosp_latest_snapshot %>% - filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), +plt = + left_join(hhs_influenza_hosp_version_lag_data, + hhs_influenza_hosp_latest_snapshot %>% + filter(time_value <= hhs_influenza_hosp_analysis_issue - max_analysis_lag * 2L), by = c("geo_value", "time_value"), suffix = c("_lagged", "_latest")) %>% tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% @@ -322,10 +344,11 @@ left_join(hhs_flu_hosp_version_lag_data, summarize(MAPE1 = 100*mean(abs((admissions_latest - admissions_lagged)/(admissions_latest + 1)))) %>% ggplot(aes(version_lag, MAPE1)) + geom_line() +ggplotly(plt) -left_join(hhs_flu_hosp_version_lag_data, - hhs_flu_hosp_latest_snapshot %>% - filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), +left_join(hhs_influenza_hosp_version_lag_data, + hhs_influenza_hosp_latest_snapshot %>% + filter(time_value <= hhs_influenza_hosp_analysis_issue - max_analysis_lag * 2L), by = c("geo_value", "time_value"), 
suffix = c("_lagged", "_latest")) %>% tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% @@ -334,9 +357,9 @@ left_join(hhs_flu_hosp_version_lag_data, ggplot(aes(version_lag, MdAPE1)) + geom_line() -left_join(hhs_flu_hosp_version_lag_data, - hhs_flu_hosp_latest_snapshot %>% - filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), +left_join(hhs_influenza_hosp_version_lag_data, + hhs_influenza_hosp_latest_snapshot %>% + filter(time_value <= hhs_influenza_hosp_analysis_issue - max_analysis_lag * 2L), by = c("geo_value", "time_value"), suffix = c("_lagged", "_latest")) %>% tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% @@ -345,9 +368,9 @@ left_join(hhs_flu_hosp_version_lag_data, ggplot(aes(version_lag, Q3APE1)) + geom_line() -left_join(hhs_flu_hosp_version_lag_data, - hhs_flu_hosp_latest_snapshot %>% - filter(time_value <= hhs_flu_hosp_analysis_issue - max_analysis_lag * 2L), +left_join(hhs_influenza_hosp_version_lag_data, + hhs_influenza_hosp_latest_snapshot %>% + filter(time_value <= hhs_influenza_hosp_analysis_issue - max_analysis_lag * 2L), by = c("geo_value", "time_value"), suffix = c("_lagged", "_latest")) %>% tidyr::drop_na(c(admissions_lagged, admissions_latest)) %>% From 07001e78ba17e8664c8793e3e2b464fc6b5c4e78 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Aug 2023 09:59:44 -0700 Subject: [PATCH 12/21] Fix same -> dup latency calcs, tweak helper functions --- vignettes/latency_revision_stats.Rmd | 38 +++++++++++++++++----------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 6b68049f..1852ce90 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -21,15 +21,15 @@ library(plotly) # we've already observed. 
For better reproducibility, that issue should be at # least two days ago (from today), to incorporate any "hotfixes" to that data # and allow up to a day for database replication delays: -hhs_flu_hosp_analysis_issue = as.Date("2023-06-13") - 2L +hhs_influenza_hosp_analysis_issue = as.Date("2023-06-28") - 2L # We'll query version data for this data set from the Delphi Epidata API using # epidatr, caching the result so that we can re-run this chunk repeatedly a bit # faster, while offline, and without placing undue load on the API servers. -hhs_flu_hosp_issue_data_path = "hhs_flu_hosp_issue_data.rds" -if (!file.exists(hhs_flu_hosp_issue_data_path)) { +hhs_influenza_hosp_issue_data_path = "hhs_influenza_hosp_issue_data.rds" +if (!file.exists(hhs_influenza_hosp_issue_data_path)) { cat("Fetching.\n") - hhs_flu_hosp_issue_data = + hhs_influenza_hosp_issue_data = covidcast( data_source = "hhs", signals = "confirmed_admissions_influenza_1d", geo_type = "state", time_type = "day", @@ -92,11 +92,11 @@ There are several notions of latency, some of which include: We'll calculate all the above definitions of latency using `epix_slide()`: ```{r latency-stats, cache=TRUE} -n_bools_true_at_tail = function(x) { +n_true_at_tail = function(x) { match(FALSE, rev(x), nomatch = length(x) + 1L) - 1L } -last_or_na = function(x) { +last_elt_or_na = function(x) { if (length(x) != 0L) { x[[length(x)]] } else { @@ -104,7 +104,16 @@ last_or_na = function(x) { } } +cumsum_from_tail = function(x) { + rev(cumsum(rev(x))) +} + latency_stats = function(window_data, group_key, ref_time_value) { + if (nrow(window_data) == 0L) { + # assume that this is from an early version that references a time range we + # excluded + return (tibble::tibble()) + } window_data %>% group_by(geo_value) %>% complete(time_value = full_seq(time_value, period = 1L)) %>% @@ -116,26 +125,25 @@ latency_stats = function(window_data, group_key, ref_time_value) { group_by(geo_value) %>% mutate( is_na_same = is_na | - admissions 
== last_or_na(admissions[!is_na]), + admissions == last_elt_or_na(admissions[!is_na]), is_na0same = is_na0 | - admissions == last_or_na(admissions[!is_na0]) + admissions == last_elt_or_na(admissions[!is_na0]) ) %>% summarize( nominal_latency = as.integer(ref_time_value - max(time_value)), - na_latency = n_bools_true_at_tail(is_na) + nominal_latency, - na0_latency = n_bools_true_at_tail(is_na0) + nominal_latency, - # FIXME this still looks buggy; why don't we have the same all-geo event lines as in the above? + na_latency = n_true_at_tail(is_na) + nominal_latency, + na0_latency = n_true_at_tail(is_na0) + nominal_latency, na_dup_latency = # (don't count NAs before the first copy of the last non-na) - sum(cumsum(is_na_same[rev(cumsum(rev(!is_na_same))) == 0L]) != 0L) - 1L + nominal_latency, + sum(cumsum(!is_na[cumsum_from_tail(!is_na_same) == 0L]) != 0L) - 1L + nominal_latency, na0dup_latency = # (don't count NAs and 0s before the first copy of the last non-na0) - sum(cumsum(is_na0same[rev(cumsum(rev(!is_na0same))) == 0L]) != 0L) - 1L + nominal_latency + sum(cumsum(!is_na0[cumsum_from_tail(!is_na0same) == 0L]) != 0L) - 1L + nominal_latency ) } -hhs_flu_hosp_latency_by_version_geo = - hhs_flu_hosp_archive %>% +hhs_influenza_hosp_latency_by_version_geo = + hhs_influenza_hosp_archive %>% epix_slide( latency_stats, before = 27L, # don't calculate latency more than this far back From dc74dad0cae9e1386e0a16f2a351fee47cfa249e Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 9 Aug 2023 10:00:55 -0700 Subject: [PATCH 13/21] Backfill: switch to hhs covid ga example, add backcasting, revision heat map --- vignettes/backtesting.Rmd | 177 +++++++++++++++++++++++++++++--------- 1 file changed, 134 insertions(+), 43 deletions(-) diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd index e79090fb..d77ff133 100644 --- a/vignettes/backtesting.Rmd +++ b/vignettes/backtesting.Rmd @@ -73,7 +73,10 @@ series with potentially different reporting latencies. 
First, we'll construct a synthetic data set with a single location where one particular lag of a predictor is extremely useful, while all the others are not, -and show how ignoring latency can hurt forecast accuracy. +and show how ignoring latency can hurt forecast accuracy. We'll assume for +simplicity that there are no data revisions, and that data about a particular +`time_value` is always published promptly at midnight on `time_value + +fixed_latency`. ```{r} library(dplyr) @@ -92,6 +95,7 @@ target_ahead = 14L fixed_latency = 3L nrow_simulated = 100L + great_lag_setting + target_ahead +# Generate the data in both formats: edf = tibble( geo_value = rep("nc", nrow_simulated), time_value = as.Date("2021-01-01") + seq(0L, nrow_simulated - 1L), @@ -106,9 +110,14 @@ edf = tibble( ea = edf %>% mutate(version = time_value + fixed_latency) %>% as_epi_archive() +``` -latency_unaware_predictions = function(group_data, group_key) { - group_data %>% +Our first forecaster will predict `target_ahead` ahead of the latest available +`time_value` for each `geo_value`, rather than the `forecast_date` (though with +some reservations/warnings): +```{r} +latency_unaware_predictions = function(window_data, group_key, ref_time_value) { + window_data %>% arx_forecaster( "outcome", "predictor", args_list = arx_args_list( @@ -117,28 +126,41 @@ latency_unaware_predictions = function(group_data, group_key) { ) ) %>% `[[`("predictions") %>% - rename(max_time_value = time_value) + rename( + guessed_forecast_date = forecast_date#, + # max_time_value = time_value + ) } +``` +`epix_slide()` lets us simulate making a forecast based on any previous or +current version of the data; we'll use it here to make a forecast as of the +last version available in this artificial data set. 
+```{r} actual_forecast_date = ea$versions_end ea %>% + # no grouping --- training will pool across locations epix_slide( ref_time_values = actual_forecast_date, before = 36500L, # use an extremely large time window = no time windowing f = latency_unaware_predictions, names_sep = NULL # don't prefix output column names - ) + ) %>% + select(ref_time_value = time_value, + # max_time_value, + guessed_forecast_date, + everything()) cat("Actual forecast date:", toString(actual_forecast_date), "\n") ``` -TODO finish + -TODO what effect this has in reality + -TODO also give example where this reproduces real-time errors for systems that -are more rigid and calculate from real `forecast_date` but encounter something not there + + # latency issue: mixed latency @@ -160,11 +182,11 @@ ga_hhs_analysis_issue = as.Date("2023-04-05") - 2L # Fetch issue data using `epidatr`; cache the response for politeness, and also # note the data license for the requested signal. # TODO try cachem -ga_hhs_issue_data_path = "ga_hhs_issue_data.rds" +ga_hhs_issue_data_path = "ga_hhs_covid_issue_data.rds" if (!file.exists(ga_hhs_issue_data_path)) { ga_hhs_issue_data = covidcast( - data_source = "hhs", signal = "confirmed_admissions_influenza_1d", + data_source = "hhs", signal = "confirmed_admissions_covid_1d", geo_type = "state", time_type = "day", geo_values = "ga", # all `time_value`s: @@ -172,7 +194,7 @@ if (!file.exists(ga_hhs_issue_data_path)) { # all issues through the analysis issue issues = epirange(12340101, ga_hhs_analysis_issue) ) %>% - fetch_tbl() + fetch() saveRDS(ga_hhs_issue_data, ga_hhs_issue_data_path) writeLines(c("License: Public Domain US Government", "Accessed via COVIDcast Epidata API"), @@ -181,9 +203,15 @@ if (!file.exists(ga_hhs_issue_data_path)) { ga_hhs_issue_data = readRDS(ga_hhs_issue_data_path) } -# Remove some data from a mostly-NA/0/low regime, and put version data into -# the `epi_archive` format: -hhs_start_time_value = as.Date("2020-12-01") +# It looks like more 
regular reporting started around version 2021-09-21, with +# 2021-09-21 being a little different as well. It's currently not easy to filter +# diff-based issue data correctly based on the version, so let's just filter out +# some this reporting based on time values that we expect would be included. We +# might also consider one of the mandatory reporting start dates. We're working +# on ways to filter both version and time values more easily from an archive +# object directly. +# hhs_start_time_value = as.Date("2021-09-21") # flu, approx +hhs_start_time_value = as.Date("2020-12-01") # covid, approx ga_hhs_archive = ga_hhs_issue_data %>% filter(time_value >= hhs_start_time_value) %>% select(geo_value, time_value, version = issue, admissions = value) %>% @@ -192,7 +220,11 @@ ga_hhs_archive = ga_hhs_issue_data %>% # We'll experiment with a pseudoprospective forecast as of a single past # `forecast_date`. Later, we can apply this to many `forecast_date`s using an # `epix_slide()`. -forecast_date = ga_hhs_analysis_issue - 7L +# forecast_date = ga_hhs_analysis_issue - 7L +# forecast_date = ga_hhs_analysis_issue - 90L +forecast_date = ga_hhs_analysis_issue - 100L +# forecast_date = ga_hhs_analysis_issue - 120L +# forecast_date = ga_hhs_analysis_issue - 150L # Get what the archive would have looked like as of the `forecast_date`: forecast_date_archive = ga_hhs_archive %>% @@ -239,12 +271,11 @@ version_aware_predictors = # For performance purposes, apply a time window that includes just enough # `time_value`s that we can extract our predictors: before = max(predictor_lags_rel_fcst_date), - function(edf, gk) { + function(edf, gk, ref_time_value) { # * `edf` is a time-windowed snapshot. # * `gk` is the "group key". If we were performing a grouped slide, this # could potentially be helpful. We won't need it here. - ref_time_value = attr(edf, "metadata")$as_of - # `ref_time_value` is our training/actual `forecast_date`. 
+ # `ref_time_value` is our training/actual `forecast_date`. edf %>% # We must make sure that there are no gaps in the `time_value`s to make # `dplyr::lag` work properly. Additionally, we will extend the @@ -270,7 +301,13 @@ version_aware_predictors = select(geo_value, x1, x2, x3) }, names_sep = NULL # don't prefix the output column names - ) + ) %>% + # suppose our aheads are relative to max t (so we can show a nowcast) + # + # XXX this works for backcasting revisions to the most recent value available, + # but will be predicting the wrong aheads if we were trying to forecast 14/7 + # days beyond the `forecast_date` (since we shifted `admissions`). + mutate(time_value = time_value - forecast_date_latency) # Combine the version-aware predictors with the latest available version, from # which we'll calculate targets as of the `forecast_date`. By predicting @@ -291,30 +328,74 @@ training_and_testing_edf = # considering `time_value` will work differently than they would on daily data. # This isn't the case with `epipredict` below.) -# Use `epipredict` canned forecaster `arx_forecaster`. -forecaster_output = training_and_testing_edf %>% - arx_forecaster( - outcome = "admissions", - predictors = c("x1", "x2", "x3"), - args_list = arx_args_list( - lags = 0L, # the lags are already baked into x1, x2, x3 - # ahead = 14L, - # ahead = 7L, - ahead = 0L, - # Use defaults for everything else. - ) - ) + # # ahead = 14L, + # # ahead = 7L, + # # ahead = 0L, + # ahead = 0, +aheads = c(0L, 7L, 14L, 21L, 28L) -# FIXME this works for backcasting revisions to the most recent value available, -# but will be predicting the wrong aheads if we were trying to forecast 14/7 -# days beyond the `forecast_date` (since we shifted `admissions`). 
+# can't use intermediate aheads due to way that join was set up +# can't use negative aheads directly yet +forecaster_outputs = map(aheads, function(ahead) { + training_and_testing_edf %>% + arx_forecaster( + outcome = "admissions", + predictors = c("x1", "x2", "x3"), + args_list = arx_args_list( + lags = 0L, # the lags are already baked into x1, x2, x3 + ahead = ahead + # Use defaults for everything else. + ) + ) %>% + { + .$predictions <- .$predictions %>% + mutate(forecast_date = .env$forecast_date) + . + } +}) library(ggplot2) +ga_hhs_archive %>% + epix_as_of(.$versions_end) %>% + ggplot(aes(time_value, admissions)) %>% + `+`(geom_point()) %>% + `+`(geom_point( + data = ga_hhs_archive %>% + epix_as_of(forecast_date), + colour = "blue", + shape = 4L + )) %>% + `+`(geom_point( + data = ga_hhs_archive %>% + epix_as_of(forecast_date) %>% + filter(as.integer(forecast_date - time_value) %in% predictor_lags_rel_fcst_date), + colour = "blue", + shape = 3L, size = 5 + )) %>% + `+`(xlim(c(forecast_date - 100L, forecast_date + max(aheads)))) %>% + `+`(geom_point( + inherit.aes = FALSE, + aes(x = target_date, y = .pred), + forecaster_outputs %>% + map_dfr("predictions"), + colour = "red" + )) %>% + `+`(geom_ribbon( + inherit.aes = FALSE, + aes(x = target_date, ymin = `0.05`, ymax = `0.95`), + forecaster_outputs %>% + map_dfr("predictions") %>% + pivot_quantiles(.pred_distn), + fill = "red", alpha = 0.4 + )) + training_and_testing_edf %>% ggplot(aes(x1, admissions)) %>% - `+`(geom_point()) %>% - `+`(geom_jitter()) %>% + # `+`(geom_point()) %>% + # `+`(geom_jitter()) %>% + `+`(geom_bin2d(binwidth = diff(range(training_and_testing_edf$admissions))/30)) %>% + `+`(scale_fill_continuous(type = "viridis")) %>% `+`(geom_abline(slope = 1, intercept = 0, linetype="dotted")) training_and_testing_edf %>% @@ -322,11 +403,21 @@ training_and_testing_edf %>% with(sum(admissions) / sum(x1)) # Generally initial observations are revised downward. 
-# Our fit coefficients: -forecaster_output$epi_workflow %>% - parsnip::extract_fit_engine() - -# Actual backcasts are in `forecaster_output$predictions` +# Our fit coefficients for the first of the `aheads`: +aheads[[1L]] +forecaster_outputs[[1L]]$epi_workflow %>% + parsnip::extract_fit_engine() %>% + coef() + +# Our fit coefficients for the last of the `aheads`: +aheads[[length(aheads)]] +forecaster_outputs[[length(aheads)]]$epi_workflow %>% + parsnip::extract_fit_engine() %>% + coef() ``` +Modeling ideas: we probably should have used 7dav features, or some "slope" +features, or lags that cover a longer window. Removing the intercept might also +make sense. + # pseudoprospective version-aware forecasting From e37e29cf27f374b9de9a76da26d5200554171373 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Fri, 11 Aug 2023 16:48:59 -0700 Subject: [PATCH 14/21] Bump latency vignette analysis date to show hhs cadence change --- vignettes/latency_revision_stats.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 1852ce90..6224f46c 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -21,7 +21,7 @@ library(plotly) # we've already observed. For better reproducibility, that issue should be at # least two days ago (from today), to incorporate any "hotfixes" to that data # and allow up to a day for database replication delays: -hhs_influenza_hosp_analysis_issue = as.Date("2023-06-28") - 2L +hhs_influenza_hosp_analysis_issue = as.Date("2023-08-11") - 2L # We'll query version data for this data set from the Delphi Epidata API using # epidatr, caching the result so that we can re-run this chunk repeatedly a bit From f6586f8229efd2135f392eede1cdf6ba511fb61c Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 22 Aug 2023 11:41:34 -0700 Subject: [PATCH 15/21] Remove version debug info from latency vignette --- vignettes/latency_revision_stats.Rmd | 2 -- 1 file changed, 2 deletions(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 6224f46c..025eba1d 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -156,8 +156,6 @@ hhs_influenza_hosp_latency_by_version_geo = Nominal latency, where it is over 2 days: ```{r, out.width = "100%", out.height="60em"} -class(hhs_influenza_hosp_latency_by_version_geo$geo_value) -sessioninfo::session_info() plt = hhs_influenza_hosp_latency_by_version_geo %>% mutate(nominal_latency = case_when( From d46ebad842f90cc338732812b585ce58a887ac81 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Tue, 22 Aug 2023 11:42:06 -0700 Subject: [PATCH 16/21] Partially flesh out latency vignette intro --- vignettes/latency_revision_stats.Rmd | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 025eba1d..872cb673 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -90,7 +90,14 @@ There are several notions of latency, some of which include: `time_value`s, zeros, and duplications of the most recent non-`NA`, nonzero value to be latent -We'll calculate all the above definitions of latency using `epix_slide()`: +These definitions are nonexhaustive, and knowledge about underlying reporting +mechanisms in a data sources are can help inform what an appropriate definition +of latency is. In epidemiological surveillance data sets, there will always be +some degree of nominal latency. In the current data set, some repeated values +also reflect data latency, and there may be other types of latency as well. +We'll plot all the above definitions of latency to investigate. 
+ +We'll calculate all of the above definitions of latency using `epix_slide()`: ```{r latency-stats, cache=TRUE} n_true_at_tail = function(x) { match(FALSE, rev(x), nomatch = length(x) + 1L) - 1L From d6d8c2f498ecf6c15b41fbb618adaf368e1b2e56 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Mon, 15 Apr 2024 11:00:19 -0700 Subject: [PATCH 17/21] Backtesting vignette editing pass + better epipredict-based example --- vignettes/backtesting.Rmd | 553 +++++++++++++------------------------- 1 file changed, 183 insertions(+), 370 deletions(-) diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd index d77ff133..9f5d7919 100644 --- a/vignettes/backtesting.Rmd +++ b/vignettes/backtesting.Rmd @@ -1,31 +1,32 @@ --- -title: Backtesting forecasters and other models +title: Backtesting models output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Backtesting forecasters and other models} + %\VignetteIndexEntry{Backtesting models} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ### Overview, and why to use `epix_slide()` -The `epiprocess` package provides two functions that are useful for backtesting -forecasts and other models: `epi_slide()` and `epix_slide()`; however, we -recommend always using the latter, because: +The `epiprocess` package provides two functions that can be used for backtesting +forecasts and other models across a range of dates: `epi_slide()` and +`epix_slide()`; however, we recommend always using the latter, because: * `epix_slide()` faithfully reproduces latency in data reporting, while `epi_slide()` does not; and * `epix_slide()` faithfully reproduces the version of each available observation that would have been available in real time, while `epi_slide()` does not. 
-Using `epi_slide()` or a similar approach instead would typically produce overly -optimistic retrospective forecast evaluations, and might miss some run-time -errors that would have occurred in real time due to irregular latency in data -reporting. Thus, it is essential to use a version-aware approach like -`epix_slide()` for backtesting evaluations. +Using `epi_slide()` or a similar approach instead will typically produce +**overly optimistic** retrospective forecast evaluations, and might miss some +run-time errors that would have occurred in real time due to irregular latency +in data reporting. Thus, it is important to use a versioning-faithful +"pseudoprospective" approach like `epix_slide()` for backtesting evaluations +whenever possible. Additionally, `epix_slide()` can be used to build and evaluate version-aware -models that use data on historical data revisions to try to improve their +models that factor in historical data revisions to try to improve their accuracy. ### Smaller reproducibility issues that may remain @@ -33,50 +34,38 @@ accuracy. Even using version-aware approaches like `epix_slide()`, there may still be some smaller reproducibility issues based on the data set/provider: -* If the `version` tags are not datetime resolution, it is unclear what the - input to the forecaster would have looked like at a particular time. For - example, if `version`s are `Date`s, it can be unclear which (if any) data from - version `2023-01-01` were available at 8am Eastern Time on `2023-01-01`. +* If the `version` tags are not date+times, it is unclear what the input to the + forecaster would have looked like at a particular date+time. For example, if + `version`s are `Date`s, it can be unclear which (if any) updates in version + `2023-01-01` were available at 8 a.m. Eastern Time on `2023-01-01`. * Depending on the data source, the `version` tags may not reflect the exact - time that the data actually was available. 
For example, an upstream data - provider might have some different meaning for its `version` column, or might - use database replicas which may need time to process updates and make them - available to all data consumers. + time that the data actually was available to you. For example, an upstream + data provider might have some different meaning for its `version` column, such + as when it first became available to them, rather than when they made it + available to others. There are also technical details that can come into play: + sometimes, data is hosted on multiple database or web server "replicas" that + need some time to mirror newly-published data, so the effective publication + time can be somewhat fuzzy. * Even though we generally think of available version data as fixed in stone, this isn't necessarily the case. For example, an upstream version data provider may publish some initial form of version `2023-01-01` data early in the morning of `2023-01-01`, but discover that it contains an error, and issue - a hotfix later in the day that overwrites the `2023-01-01` version data with a - correction (but still using the exact same version tag); in this situation, - the early-morning version wouldn't be reproducible if performing an analysis - at a later date. + a "hotfix" later in the day that overwrites the `2023-01-01` version data with + a correction (but still calling it the exact same version, `2023-01-01`); in + this situation, modeling based on the early-morning version wouldn't be + reproducible if backtesting at a later date. -# Latency issues: most recent time available vs. `ref_time_value` +### Pseudoprospective modeling with `epi_archive` -Some forecasters select test-time predictors using the maximum time value -available (either overall, or separately for each individual time series for -different geographical/demographical units). 
E.g., using default settings, -`epipredict::arx_forecaster` will: -- Select test-time predictors for each time series corresponding to - `max_time_value - lags` for that time series. - - FIXME probably wrong/imprecise. Need to consider shifts involved. - -- Guess that the `forecast_date` is the maximum of these `max_time_value`s over - all the time series. Guess that the `target_date` is `forecast_date + ahead`. +Pseudoprospective modeling can be performed with `epix_as_of()` or +`epix_slide()`. We'll demonstrate using `epix_slide()` to perform +pseudoprospective forecasting on COVID-19 incident case rate data from JHU-CSSE +for a few US states. This data stream had very low reporting latency, and in +most cases defined incident cases in a way that made revisions theoretically +impossible (the difference between cumulative reporting between consecutive +reports), but nevertheless, there were some exceptions such as large revisions. -We'll demonstrate some issues that occur using these default settings, first -when dealing with a single time series, and then when dealing with multiple time -series with potentially different reporting latencies. - -### Single location, fixed latency - -First, we'll construct a synthetic data set with a single location where one -particular lag of a predictor is extremely useful, while all the others are not, -and show how ignoring latency can hurt forecast accuracy. We'll assume for -simplicity that there are no data revisions, and that data about a particular -`time_value` is always published promptly at midnight on `time_value + -fixed_latency`. +FIXME discussion here. 
CLI if used ```{r} library(dplyr) @@ -84,340 +73,164 @@ library(tidyr) library(purrr) library(epiprocess) library(epipredict) +library(ggplot2) -# Make RNG reproducible: -invisible(withr::local_rng_version("3.0.0")) -invisible(withr::local_seed(42L)) - -# Settings for our synthetic data set and forecasting task: -great_lag_setting = 8L -target_ahead = 14L -fixed_latency = 3L -nrow_simulated = 100L + great_lag_setting + target_ahead - -# Generate the data in both formats: -edf = tibble( - geo_value = rep("nc", nrow_simulated), - time_value = as.Date("2021-01-01") + seq(0L, nrow_simulated - 1L), - predictor = rnorm(nrow_simulated), - outcome = - dplyr::lag(predictor, great_lag_setting + target_ahead) + - 0.01 * rnorm(nrow_simulated) -) %>% - drop_na(outcome) %>% - as_epi_df() - -ea = edf %>% - mutate(version = time_value + fixed_latency) %>% - as_epi_archive() -``` - -Our first forecaster will predict `target_ahead` ahead of the latest available -`time_value` for each `geo_value`, rather than the `forecast_date` (though with -some reservations/warnings): -```{r} -latency_unaware_predictions = function(window_data, group_key, ref_time_value) { - window_data %>% - arx_forecaster( - "outcome", "predictor", - args_list = arx_args_list( - lags = 0:9, - ahead = target_ahead - ) - ) %>% - `[[`("predictions") %>% - rename( - guessed_forecast_date = forecast_date#, - # max_time_value = time_value - ) +# Mondays +forecast_dates <- seq(as.Date("2020-11-02"), as.Date("2021-11-30"), by = "6 weeks") + +horizons <- c(0, 7, 14, 21, 28) # relative to forecast_date + +example_forecaster <- function(snapshot_edf, forecast_date) { + shared_reporting_latency <- forecast_date - max(snapshot_edf$time_value) + aheads <- horizons + shared_reporting_latency # relative to max time_value + # FIXME shifting technique instead + aheads %>% + map(function(ahead) { + snapshot_edf %>% + ## flatline_forecaster( + ## "case_rate_7d_av", + ## flatline_args_list( + ## ahead = ahead, + ## quantile_levels = 
c(0.1, 0.5, 0.9), + ## forecast_date = forecast_date + ## )) %>% + arx_forecaster( + outcome = "case_rate_7d_av", # via JHU-CSSE + ## predictors = "percent_cli", # from doctor-visits (claims) + predictors = c("case_rate_7d_av", "percent_cli"), # from doctor-visits (claims) + args_list = arx_args_list( + ahead = ahead, + quantile_levels = c(0.1, 0.5, 0.9), + forecast_date = forecast_date + )) %>% + .$predictions + }) %>% + bind_rows() + ## list() } -``` -`epix_slide()` lets us simulate making a forecast based on any previous or -current version of the data; we'll use it here to make a forecast as of the -last version available in this artificial data set. -```{r} -actual_forecast_date = ea$versions_end - -ea %>% - # no grouping --- training will pool across locations +pseudoprospective_forecasts <- + archive_cases_dv_subset %>% epix_slide( - ref_time_values = actual_forecast_date, - before = 36500L, # use an extremely large time window = no time windowing - f = latency_unaware_predictions, - names_sep = NULL # don't prefix output column names + ref_time_values = forecast_dates, + before = 365000L, # 1000-year time window --> don't filter out any `time_value`s + ~ example_forecaster(.x, .ref_time_value), + names_sep = NULL + ## as_list_col = TRUE ) %>% - select(ref_time_value = time_value, - # max_time_value, - guessed_forecast_date, - everything()) - -cat("Actual forecast date:", toString(actual_forecast_date), "\n") -``` - - - - - - - - -# latency issue: mixed latency - -# pseudoprospective forecasting - -# revision MAPE curve + select(-time_value) -# version-aware forecasting - -Version-aware forecasting: -```{r} -library(epidatr) - -# For reproducibility, we'll work with data reporting through some fixed issue -# we've already observed. 
For better reproducibility, that issue should be at -# least two days ago (from today), to incorporate any "hotfixes" to that data -# and allow up to a day for database replication delays: -ga_hhs_analysis_issue = as.Date("2023-04-05") - 2L -# Fetch issue data using `epidatr`; cache the response for politeness, and also -# note the data license for the requested signal. -# TODO try cachem -ga_hhs_issue_data_path = "ga_hhs_covid_issue_data.rds" -if (!file.exists(ga_hhs_issue_data_path)) { - ga_hhs_issue_data = - covidcast( - data_source = "hhs", signal = "confirmed_admissions_covid_1d", - geo_type = "state", time_type = "day", - geo_values = "ga", - # all `time_value`s: - time_values = "*", - # all issues through the analysis issue - issues = epirange(12340101, ga_hhs_analysis_issue) - ) %>% - fetch() - saveRDS(ga_hhs_issue_data, ga_hhs_issue_data_path) - writeLines(c("License: Public Domain US Government", - "Accessed via COVIDcast Epidata API"), - "LICENSE_ga_hhs_issue_data.txt") -} else { - ga_hhs_issue_data = readRDS(ga_hhs_issue_data_path) -} - -# It looks like more regular reporting started around version 2021-09-21, with -# 2021-09-21 being a little different as well. It's currently not easy to filter -# diff-based issue data correctly based on the version, so let's just filter out -# some this reporting based on time values that we expect would be included. We -# might also consider one of the mandatory reporting start dates. We're working -# on ways to filter both version and time values more easily from an archive -# object directly. -# hhs_start_time_value = as.Date("2021-09-21") # flu, approx -hhs_start_time_value = as.Date("2020-12-01") # covid, approx -ga_hhs_archive = ga_hhs_issue_data %>% - filter(time_value >= hhs_start_time_value) %>% - select(geo_value, time_value, version = issue, admissions = value) %>% - as_epi_archive(compactify = TRUE) - -# We'll experiment with a pseudoprospective forecast as of a single past -# `forecast_date`. 
Later, we can apply this to many `forecast_date`s using an -# `epix_slide()`. -# forecast_date = ga_hhs_analysis_issue - 7L -# forecast_date = ga_hhs_analysis_issue - 90L -forecast_date = ga_hhs_analysis_issue - 100L -# forecast_date = ga_hhs_analysis_issue - 120L -# forecast_date = ga_hhs_analysis_issue - 150L - -# Get what the archive would have looked like as of the `forecast_date`: -forecast_date_archive = ga_hhs_archive %>% - epix_as_of(forecast_date, all_versions = TRUE) -# Get what the snapshot as of the `forecast_date` would have looked like: -forecast_date_edf = forecast_date_archive %>% - epix_as_of(.$versions_end) -# Reporting latency for this signal in GA on this `forecast_date`: -forecast_date_latency = as.integer(forecast_date - max(forecast_date_edf$time_value)) - -# Our test-time predictors will correspond to `time_value`s in `forecast_date - -# forecast_date_latency - c(0L, 7L, 14L)`. We'll match them to analogous -# training data based on subbing in diffent `training_forecast_date`s. Calculate -# the desired lags relative to the `forecast_date`, and assign them names to be -# used later: -test_predictor_lags_rel_max_t = c(x1 = 0L, x2 = 7L, x3 = 14L) -predictor_lags_rel_fcst_date = forecast_date_latency + test_predictor_lags_rel_max_t -# (Edge case: while we probably want to include `forecast_date - -# forecast_date_latency - 0L` to take advantage of the most recent `time_value` -# available at test time, this could potentially lead to errors if a data source -# starts reporting with lower latency, because we will have trouble matching -# with analogous training data.) 
- -# We also want to limit our training instances to `training_forecast_date`s with -# the same weekday as our `forecast_date`: -training_and_testing_issues_start = max( - # try to go all the way back to the first version in the archive, - min(forecast_date_archive$DT$version), - # that we expect to possibly contain observations for the desired lags - hhs_start_time_value + max(predictor_lags_rel_fcst_date) -) -# Prepare our training and testing `forecast_date`s, making sure to match the -# wday of the desired `forecast_date`: -training_and_testing_issues = rev(seq(forecast_date, training_and_testing_issues_start, -7)) - -# Find analogous predictor values using `epix_slide()`; get both training and -# testing data simultaneously here as it ends up more convenient than trying to -# separately fetch test data from `forecast_date_edf`: -version_aware_predictors = - ga_hhs_archive %>% +snapshots <- + archive_cases_dv_subset %>% epix_slide( - # Our `training_forecast_date`s + testing `forecast_date` - ref_time_values = training_and_testing_issues, - # For performance purposes, apply a time window that includes just enough - # `time_value`s that we can extract our predictors: - before = max(predictor_lags_rel_fcst_date), - function(edf, gk, ref_time_value) { - # * `edf` is a time-windowed snapshot. - # * `gk` is the "group key". If we were performing a grouped slide, this - # could potentially be helpful. We won't need it here. - # `ref_time_value` is our training/actual `forecast_date`. - edf %>% - # We must make sure that there are no gaps in the `time_value`s to make - # `dplyr::lag` work properly. Additionally, we will extend the - # `time_value`s to the `ref_time_value`, so we can just use `dplyr::lag` - # plus filter to `time_value == ref_time_value` to get our desired data. 
- complete( - geo_value, - time_value = full_seq(c(time_value, ref_time_value), period=1L) - ) %>% - # We'd need this `group_by` if we actually had multiple `geo_value`s: - group_by(geo_value) %>% - # Prepare columns x1, x2, x3 by `dplyr::lag`ging `admissions`: - mutate(map_dfc( - predictor_lags_rel_fcst_date, - # test_predictor_lags_rel_max_t, - ~ dplyr::lag(admissions, .x) - )) %>% - # We're done with the extra rows used to calculate the lags: - filter(time_value == ref_time_value) %>% - # Make sure we actually output a row so joins are easier: - complete(time_value = ref_time_value) %>% - ungroup() %>% - select(geo_value, x1, x2, x3) - }, - names_sep = NULL # don't prefix the output column names + ref_time_values = forecast_dates, + before = 365000L, # 1000-year time window --> don't filter out any `time_value`s + ~ .x, + as_list_col = TRUE ) %>% - # suppose our aheads are relative to max t (so we can show a nowcast) - # - # XXX this works for backcasting revisions to the most recent value available, - # but will be predicting the wrong aheads if we were trying to forecast 14/7 - # days beyond the `forecast_date` (since we shifted `admissions`). - mutate(time_value = time_value - forecast_date_latency) + rename(forecast_date = time_value) %>% + unnest(slide_value) -# Combine the version-aware predictors with the latest available version, from -# which we'll calculate targets as of the `forecast_date`. By predicting -# fully/mostly revised targets from real-time predictors, we are making a -# revision-aware forecast. For some models or evaluation schemes, we might want -# to downweight or discard the most recent data from our training sets, or take -# other more complex approaches. But for this model, since we don't use a -# training window, the current approach seems reasonable. 
-training_and_testing_edf = - left_join(version_aware_predictors, - # shift outcome data so `epipredict` indexes relative to forecast date: - forecast_date_edf %>% - mutate(time_value = time_value + forecast_date_latency), - by = c("geo_value", "time_value")) %>% - as_epi_df(as_of = forecast_date) -# (Caution: since we matched wdays, this is essentially a weekly data set, and -# certain forecasters that use `dplyr::lag` or `dplyr::ahead` without -# considering `time_value` will work differently than they would on daily data. -# This isn't the case with `epipredict` below.) +latest_edf <- archive_cases_dv_subset %>% epix_as_of(.$versions_end) - # # ahead = 14L, - # # ahead = 7L, - # # ahead = 0L, - # ahead = 0, -aheads = c(0L, 7L, 14L, 21L, 28L) - -# can't use intermediate aheads due to way that join was set up -# can't use negative aheads directly yet - -forecaster_outputs = map(aheads, function(ahead) { - training_and_testing_edf %>% - arx_forecaster( - outcome = "admissions", - predictors = c("x1", "x2", "x3"), - args_list = arx_args_list( - lags = 0L, # the lags are already baked into x1, x2, x3 - ahead = ahead - # Use defaults for everything else. - ) - ) %>% - { - .$predictions <- .$predictions %>% - mutate(forecast_date = .env$forecast_date) - . 
- } -}) - -library(ggplot2) -ga_hhs_archive %>% - epix_as_of(.$versions_end) %>% - ggplot(aes(time_value, admissions)) %>% - `+`(geom_point()) %>% - `+`(geom_point( - data = ga_hhs_archive %>% - epix_as_of(forecast_date), - colour = "blue", - shape = 4L - )) %>% - `+`(geom_point( - data = ga_hhs_archive %>% - epix_as_of(forecast_date) %>% - filter(as.integer(forecast_date - time_value) %in% predictor_lags_rel_fcst_date), - colour = "blue", - shape = 3L, size = 5 - )) %>% - `+`(xlim(c(forecast_date - 100L, forecast_date + max(aheads)))) %>% - `+`(geom_point( - inherit.aes = FALSE, - aes(x = target_date, y = .pred), - forecaster_outputs %>% - map_dfr("predictions"), - colour = "red" - )) %>% - `+`(geom_ribbon( - inherit.aes = FALSE, - aes(x = target_date, ymin = `0.05`, ymax = `0.95`), - forecaster_outputs %>% - map_dfr("predictions") %>% - pivot_quantiles(.pred_distn), - fill = "red", alpha = 0.4 - )) - -training_and_testing_edf %>% - ggplot(aes(x1, admissions)) %>% - # `+`(geom_point()) %>% - # `+`(geom_jitter()) %>% - `+`(geom_bin2d(binwidth = diff(range(training_and_testing_edf$admissions))/30)) %>% - `+`(scale_fill_continuous(type = "viridis")) %>% - `+`(geom_abline(slope = 1, intercept = 0, linetype="dotted")) - -training_and_testing_edf %>% - filter(!is.na(x1) & !is.na(admissions)) %>% - with(sum(admissions) / sum(x1)) -# Generally initial observations are revised downward. 
- -# Our fit coefficients for the first of the `aheads`: -aheads[[1L]] -forecaster_outputs[[1L]]$epi_workflow %>% - parsnip::extract_fit_engine() %>% - coef() +unfaithful_forecasts <- latest_edf %>% + # pretend we get observations about today, on today, with no revisions + mutate(version = time_value) %>% + as_epi_archive() %>% + epix_slide( + ref_time_values = forecast_dates, + before = 365000L, # 1000-year time window --> don't filter out any `time_value`s + ~ example_forecaster(.x, .ref_time_value), + names_sep = NULL + ) %>% + select(-time_value) + +# Plot them, on top of latest COVID-19 case rates +bind_rows(list( + pseudoprospective = pseudoprospective_forecasts, + unfaithful = unfaithful_forecasts +), .id = "backtester") %>% + pivot_quantiles_wider(.pred_distn) %>% + ggplot(aes(x = target_date)) + + facet_wrap(vars(geo_value)) + + geom_ribbon(aes(group = interaction(backtester, forecast_date), + fill = backtester, + ymin = `0.1`, ymax = `0.9`), + alpha = 0.4) + + geom_line(aes(x = time_value, y = value, + ## colour = signal, + ## colour = forecast_date, + ## colour = interaction(signal, forecast_date), + colour = as.integer(forecast_date - time_value), + ## alpha = as.integer(forecast_date - time_value), + ## linetype = signal, + group = interaction(signal, forecast_date)), + data = + snapshots %>% + pivot_longer(c(percent_cli, case_rate_7d_av), + names_to = "signal") %>% + left_join( + latest_edf %>% + drop_na(c(percent_cli, case_rate_7d_av)) %>% + summarize( + ## case_rate_7d_av = 1, + across(c(percent_cli, case_rate_7d_av), ~ mean(case_rate_7d_av) / mean(.x)) + ) %>% + pivot_longer(c(percent_cli, case_rate_7d_av), + names_to = "signal", + values_to = "scale_factor"), + by = "signal" + ) %>% + mutate(value = value * scale_factor) %>% + ## mutate(forecast_date = as.factor(forecast_date)) %>% + {.} + ) + + geom_line(aes(y = case_rate_7d_av), + data = latest_edf %>% + rename(target_date = time_value)) + + geom_vline(aes(xintercept = forecast_date), + data = 
function(df) distinct(df, forecast_date), + linetype = 2, alpha = 0.5) + + ## scale_colour_brewer(palette = "Set2") + + ## scale_fill_brewer(palette = "Spectral") + + ## scale_fill_brewer(palette = "Set3") + + ## scale_colour_viridis_d(direction = -1) + + ## scale_colour_viridis_c(option = "H", direction = -1) + + scale_colour_continuous(type = "viridis", ) + + scale_x_date(minor_breaks = "month", date_labels = "%b %y") + + labs(x = "Date", y = "Reported COVID-19 case rates") # + +## theme(legend.position = "none") +## + +## geom_line(data = x_latest, aes(x = time_value, y = case_rate_7d_av), +## inherit.aes = FALSE, color = "gray50") + +## geom_line(aes(y = fc_point)) + geom_point(aes(y = fc_point), size = 0.5) + +## geom_vline(aes(xintercept = time_value), linetype = 2, alpha = 0.5) + +## facet_grid(vars(geo_value), vars(as_of), scales = "free") + +## scale_x_date(minor_breaks = "month", date_labels = "%b %y") + +## labs(x = "Date", y = "Reported COVID-19 case rates") + +## theme(legend.position = "none") + + +archive_cases_dv_subset %>% + epix_slide(ref_time_values = .$DT$version %>% unique() %>% `[`(as.POSIXlt(.)$wday == 1L), + before = 14L, ~ slice_max(.x, time_value, by = geo_value), as_list_col = TRUE) %>% + rename(version = time_value) %>% + unnest(slide_value) %>% + left_join( + archive_cases_dv_subset %>% epix_as_of(.$versions_end), + by = c("geo_value", "time_value") + ) %>% + mutate(reldiff = abs(case_rate_7d_av.x - case_rate_7d_av.y)/(10 + abs(case_rate_7d_av.y))) %>% + mutate(version_lag = as.integer(version - time_value)) %>% + ## filter(version_lag %in% c(7L, 14L)) %>% + ## slice_max(reldiff, n = 10L, by = version_lag) %>% + slice_max(reldiff, n = 10L) %>% + select(version_lag, geo_value, version, case_rate_7d_av.x, case_rate_7d_av.y, reldiff) %>% + print(n = 100L) -# Our fit coefficients for the last of the `aheads`: -aheads[[length(aheads)]] -forecaster_outputs[[length(aheads)]]$epi_workflow %>% - parsnip::extract_fit_engine() %>% - coef() ``` 
-Modeling ideas: we probably should have used 7dav features, or some "slope" -features, or lags that cover a longer window. Removing the intercept might also -make sense. +## Attribution +This document contains data that is a modified part of the [COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19) as [republished in the COVIDcast Epidata API](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html). This data set is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/) by the Johns Hopkins University on behalf of its Center for Systems Science in Engineering. Copyright Johns Hopkins University 2020. -# pseudoprospective version-aware forecasting +The `archive_cases_dv_subset` object also contains `percent_cli` data is a modified part of the [COVIDcast Epidata API Doctor Visits data](https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/doctor-visits.html). This dataset is licensed under the terms of the [Creative Commons Attribution 4.0 International license](https://creativecommons.org/licenses/by/4.0/). Copyright Delphi Research Group at Carnegie Mellon University 2020. From 0db7fa7b67eb4618948f46944ace5dea92b260ff Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Wed, 17 Apr 2024 09:53:45 -0700 Subject: [PATCH 18/21] Fix target_date labeling in backtesting.Rmd --- vignettes/backtesting.Rmd | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd index 9f5d7919..3f5ef886 100644 --- a/vignettes/backtesting.Rmd +++ b/vignettes/backtesting.Rmd @@ -81,7 +81,7 @@ forecast_dates <- seq(as.Date("2020-11-02"), as.Date("2021-11-30"), by = "6 week horizons <- c(0, 7, 14, 21, 28) # relative to forecast_date example_forecaster <- function(snapshot_edf, forecast_date) { - shared_reporting_latency <- forecast_date - max(snapshot_edf$time_value) + shared_reporting_latency <- as.integer(forecast_date - max(snapshot_edf$time_value)) aheads <- horizons + shared_reporting_latency # relative to max time_value # FIXME shifting technique instead aheads %>% @@ -99,11 +99,13 @@ example_forecaster <- function(snapshot_edf, forecast_date) { ## predictors = "percent_cli", # from doctor-visits (claims) predictors = c("case_rate_7d_av", "percent_cli"), # from doctor-visits (claims) args_list = arx_args_list( + # FIXME this is incomplete; latency often varies signficantly by covariate and can't be ignored, so we also need lag adjustment. ahead = ahead, - quantile_levels = c(0.1, 0.5, 0.9), - forecast_date = forecast_date + quantile_levels = c(0.1, 0.5, 0.9) + # can't use forecast_date = + target_date = ; target_date appears to be ignored )) %>% - .$predictions + .$predictions %>% + mutate(forecast_date = .env$forecast_date) }) %>% bind_rows() ## list() From 64891975e0c6e348c0aed9352269bd8f36b8b920 Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 5 Jun 2024 17:31:38 -0700 Subject: [PATCH 19/21] Add WIP on flusurv; analyzing hhs backfill, any bad instances? 
--- vignettes/backtesting.Rmd | 270 +++++++++++++++++++++------ vignettes/latency_revision_stats.Rmd | 7 +- 2 files changed, 213 insertions(+), 64 deletions(-) diff --git a/vignettes/backtesting.Rmd b/vignettes/backtesting.Rmd index 3f5ef886..7bafadb2 100644 --- a/vignettes/backtesting.Rmd +++ b/vignettes/backtesting.Rmd @@ -7,32 +7,34 @@ vignette: > %\VignetteEncoding{UTF-8} --- -### Overview, and why to use `epix_slide()` +### Backtesting overview, and why to use `epi_archive`s -The `epiprocess` package provides two functions that can be used for backtesting -forecasts and other models across a range of dates: `epi_slide()` and -`epix_slide()`; however, we recommend always using the latter, because: +The `epiprocess` package provides a few functions that can be used for +backtesting forecasts and other models: `epi_slide()` (on `epi_df`s), and +`epix_as_of()` and `epix_slide()` (on `epi_archive`s); however, we recommend +always using `epi_archive`s, because: -* `epix_slide()` faithfully reproduces latency in data reporting, while - `epi_slide()` does not; and -* `epix_slide()` faithfully reproduces the version of each available observation - that would have been available in real time, while `epi_slide()` does not. +* `epix_as_of()` and `epix_slide()` faithfully reproduce latency in data + reporting, while `epi_slide()` does not; and +* `epix_as_of()` and `epix_slide()` faithfully reproduce the version of each + available observation that would have been available in real time, while + `epi_slide()` does not. Using `epi_slide()` or a similar approach instead will typically produce **overly optimistic** retrospective forecast evaluations, and might miss some run-time errors that would have occurred in real time due to irregular latency in data reporting. Thus, it is important to use a versioning-faithful -"pseudoprospective" approach like `epix_slide()` for backtesting evaluations -whenever possible. 
+"pseudoprospective" approach like `epix_as_of()` and `epix_slide()` for +backtesting evaluations whenever possible. -Additionally, `epix_slide()` can be used to build and evaluate version-aware -models that factor in historical data revisions to try to improve their -accuracy. +Additionally, `epix_as_of()` and `epix_slide()` can be used to build and +evaluate version-aware models that factor in historical data revisions to try to +improve their accuracy. ### Smaller reproducibility issues that may remain -Even using version-aware approaches like `epix_slide()`, there may still be some -smaller reproducibility issues based on the data set/provider: +Even using version-aware approaches based on `epi_archive`s, there may still be +some smaller reproducibility issues based on the data set/provider: * If the `version` tags are not date+times, it is unclear what the input to the forecaster would have looked like at a particular date+time. For example, if @@ -55,13 +57,13 @@ smaller reproducibility issues based on the data set/provider: this situation, modeling based on the early-morning version wouldn't be reproducible if backtesting at a later date. -### Pseudoprospective modeling with `epi_archive` +### Pseudoprospective backtesting with `epi_archive`s -Pseudoprospective modeling can be performed with `epix_as_of()` or +Pseudoprospective backtesting can be performed with `epix_as_of()` or `epix_slide()`. We'll demonstrate using `epix_slide()` to perform pseudoprospective forecasting on COVID-19 incident case rate data from JHU-CSSE for a few US states. This data stream had very low reporting latency, and in -most cases defined incident cases in a way that made revisions theoretically +most instances defined incident cases in a way that made revisions theoretically impossible (the difference between cumulative reporting between consecutive reports), but nevertheless, there were some exceptions such as large revisions. @@ -71,21 +73,139 @@ FIXME discussion here. 
CLI if used library(dplyr) library(tidyr) library(purrr) +library(ggplot2) +library(epidatr) library(epiprocess) library(epipredict) -library(ggplot2) + + + + + + + + + + +# For reproducibility, we'll work with data reporting through some fixed issue +# we've already observed. For better reproducibility, that issue should be at +# least two days ago (from today), to incorporate any "hotfixes" to that data +# and allow up to a day for database replication delays: +hhs_analysis_issue = as.Date("2024-02-01") - 2L +# For speed & politness, we have enabled epidatr caching. +cache_info()[c("max_size", "max_age")] +# Fetch issue data using `epidatr`: +hhs_issue_data = + pub_covidcast( + source = "hhs", signals = "confirmed_admissions_covid_1d", + geo_type = "state", time_type = "day", + # all issues through the analysis issue + issues = epirange(12340101, hhs_analysis_issue) + ) +# ^ Data license: Public Domain US Government. Accessed via COVIDcast Epidata API. + +# It looks like more regular reporting started around version 2021-09-21, with +# 2021-09-21 being a little different as well. It's currently not easy to filter +# diff-based issue data correctly based on the version, so let's just filter out +# some this reporting based on time values that we expect would be included. We +# might also consider one of the mandatory reporting start dates. We're working +# on ways to filter both version and time values more easily from an archive +# object directly. 
+# hhs_start_time_value = as.Date("2021-09-21") # flu, approx +hhs_start_time_value = as.Date("2020-12-01") # covid, approx, though start of 2021 might also have some reporting quirks +hhs_archive = hhs_issue_data %>% + filter(time_value >= hhs_start_time_value) %>% + select(geo_value, time_value, version = issue, admissions = value) %>% + as_epi_archive(compactify = TRUE) + + +flusurv_analysis_issue <- as.Date("2019-08-01") %>% + MMWRweek::MMWRweek() %>% + {.$MMWRyear * 100L + .$MMWRweek} + +flusurv_issue_data <- + pub_flusurv( + locations = "network_all", + issues = epirange(123401, flusurv_analysis_issue) + ) + +flusurv_archive <- flusurv_issue_data %>% + select(geo_value = location, + time_value = epiweek, + ## version = release_date, + version = issue, + starts_with("rate_")) %>% + as_epi_archive(compactify = TRUE) + +flusurv_issue_data %>% + count(lag2 = release_date - issue) + +flusurv_issue_data %>% + count(release_wday = as.POSIXlt(release_date)$wday) + +as.POSIXlt(min(flusurv_issue_data$release_date))$wday + +# XXX surprises regarding release_date ranges (not actually turning off during off and early season); investigating: + +# any update: +tibble(date = flusurv_archive$DT$version %>% unique() %>% sort(), + datediff = c(NA, diff(date))) %>% + print(n=10000L) + +flusurv_archive$DT[, .N, by = version] %>% + ggplot(aes(version, N)) %>% + `+`(geom_point()) + +# adding for first time: +unique(flusurv_archive$DT, by = c("geo_value", "time_value")) %>% + .$version %>% unique() %>% sort() %>% + {tibble(date = ., datediff = c(NA, diff(date)))} %>% + print(n=10000L) + +# okay, this now makes sense + + + + + + + + + + + + + + + + # Mondays -forecast_dates <- seq(as.Date("2020-11-02"), as.Date("2021-11-30"), by = "6 weeks") +## forecast_dates <- seq(as.Date("2020-11-02"), as.Date("2021-11-30"), by = "6 weeks") + + + +## archive <- hhs_archive +archive <- flusurv_archive + +forecast_dates <- seq(min(archive$DT$version) + 120L, archive$versions_end, + by = "6 weeks") 
horizons <- c(0, 7, 14, 21, 28) # relative to forecast_date +## horizons <- 1 + c(0, 7, 14, 21, 28) # relative to forecast_date example_forecaster <- function(snapshot_edf, forecast_date) { + if (snapshot_edf %>% + distinct(time_value) %>% + filter(time_value >= forecast_date - 40L) %>% + nrow() < 3L) { + return (tibble::tibble()) + } shared_reporting_latency <- as.integer(forecast_date - max(snapshot_edf$time_value)) - aheads <- horizons + shared_reporting_latency # relative to max time_value + ## aheads <- horizons + shared_reporting_latency # relative to max time_value # FIXME shifting technique instead - aheads %>% - map(function(ahead) { + horizons %>% + map(function(horizon) { snapshot_edf %>% ## flatline_forecaster( ## "case_rate_7d_av", @@ -95,24 +215,33 @@ example_forecaster <- function(snapshot_edf, forecast_date) { ## forecast_date = forecast_date ## )) %>% arx_forecaster( - outcome = "case_rate_7d_av", # via JHU-CSSE + ## outcome = "case_rate_7d_av", # via JHU-CSSE ## predictors = "percent_cli", # from doctor-visits (claims) - predictors = c("case_rate_7d_av", "percent_cli"), # from doctor-visits (claims) + ## predictors = c("case_rate_7d_av", "percent_cli"), # cli from doctor-visits (claims) + ## outcome = "admissions", + outcome = "rate_overall", + ## predictors = c("case_rate_7d_av"), + ## predictors = "admissions", + predictors = "rate_overall", args_list = arx_args_list( # FIXME this is incomplete; latency often varies signficantly by covariate and can't be ignored, so we also need lag adjustment. 
- ahead = ahead, - quantile_levels = c(0.1, 0.5, 0.9) - # can't use forecast_date = + target_date = ; target_date appears to be ignored + ## ahead = shared_reporting_latency + horizon, + ahead = horizon, + quantile_levels = c(0.1, 0.5, 0.9)#, + ## forecast_date = forecast_date, + ## target_date = forecast_date + horizon )) %>% .$predictions %>% - mutate(forecast_date = .env$forecast_date) + mutate(forecast_date = forecast_date, + target_date = forecast_date + horizon) }) %>% bind_rows() ## list() } pseudoprospective_forecasts <- - archive_cases_dv_subset %>% + ## archive_cases_dv_subset %>% + archive %>% epix_slide( ref_time_values = forecast_dates, before = 365000L, # 1000-year time window --> don't filter out any `time_value`s @@ -123,7 +252,8 @@ pseudoprospective_forecasts <- select(-time_value) snapshots <- - archive_cases_dv_subset %>% + ## archive_cases_dv_subset %>% + archive %>% epix_slide( ref_time_values = forecast_dates, before = 365000L, # 1000-year time window --> don't filter out any `time_value`s @@ -133,13 +263,14 @@ snapshots <- rename(forecast_date = time_value) %>% unnest(slide_value) -latest_edf <- archive_cases_dv_subset %>% epix_as_of(.$versions_end) +latest_edf <- archive %>% epix_as_of(.$versions_end) unfaithful_forecasts <- latest_edf %>% # pretend we get observations about today, on today, with no revisions mutate(version = time_value) %>% - as_epi_archive() %>% + as_epi_archive(versions_end = max(forecast_dates)) %>% epix_slide( + # pretend version releases are on forecast dates ref_time_values = forecast_dates, before = 365000L, # 1000-year time window --> don't filter out any `time_value`s ~ example_forecaster(.x, .ref_time_value), @@ -147,14 +278,21 @@ unfaithful_forecasts <- latest_edf %>% ) %>% select(-time_value) +## plot_geo_values <- c("mi", "id", "ca", "ga", "pa") +## plot_geo_values <- c("ca") +plot_geo_values <- c("network_all") + # Plot them, on top of latest COVID-19 case rates -bind_rows(list( - pseudoprospective = 
pseudoprospective_forecasts, - unfaithful = unfaithful_forecasts -), .id = "backtester") %>% +plt <- + bind_rows(list( + pseudoprospective = pseudoprospective_forecasts, + unfaithful = unfaithful_forecasts + ), .id = "backtester") %>% + filter(geo_value %in% plot_geo_values) %>% pivot_quantiles_wider(.pred_distn) %>% ggplot(aes(x = target_date)) + - facet_wrap(vars(geo_value)) + + ## facet_wrap(vars(geo_value)) + + facet_wrap(vars(geo_value), scales = "free_y") + geom_ribbon(aes(group = interaction(backtester, forecast_date), fill = backtester, ymin = `0.1`, ymax = `0.9`), @@ -169,16 +307,25 @@ bind_rows(list( group = interaction(signal, forecast_date)), data = snapshots %>% - pivot_longer(c(percent_cli, case_rate_7d_av), + filter(geo_value %in% plot_geo_values) %>% + ## pivot_longer(c(percent_cli, case_rate_7d_av), + ## pivot_longer(c(admissions), + pivot_longer(c(rate_overall), names_to = "signal") %>% left_join( latest_edf %>% - drop_na(c(percent_cli, case_rate_7d_av)) %>% + ## drop_na(c(percent_cli, case_rate_7d_av)) %>% + ## drop_na(c(admissions)) %>% + drop_na(c(rate_overall)) %>% summarize( ## case_rate_7d_av = 1, - across(c(percent_cli, case_rate_7d_av), ~ mean(case_rate_7d_av) / mean(.x)) + ## across(c(percent_cli, case_rate_7d_av), ~ mean(case_rate_7d_av) / mean(.x)) + ## across(c(admissions), ~ mean(admissions) / mean(.x)) + across(c(rate_overall), ~ mean(rate_overall) / mean(.x)) ) %>% - pivot_longer(c(percent_cli, case_rate_7d_av), + ## pivot_longer(c(percent_cli, case_rate_7d_av), + ## pivot_longer(c(admissions), + pivot_longer(c(rate_overall), names_to = "signal", values_to = "scale_factor"), by = "signal" @@ -187,8 +334,11 @@ bind_rows(list( ## mutate(forecast_date = as.factor(forecast_date)) %>% {.} ) + - geom_line(aes(y = case_rate_7d_av), + ## geom_line(aes(y = case_rate_7d_av), + ## geom_line(aes(y = admissions), + geom_line(aes(y = rate_overall), data = latest_edf %>% + filter(geo_value %in% plot_geo_values) %>% rename(target_date = 
time_value)) + geom_vline(aes(xintercept = forecast_date), data = function(df) distinct(df, forecast_date), @@ -211,24 +361,24 @@ bind_rows(list( ## scale_x_date(minor_breaks = "month", date_labels = "%b %y") + ## labs(x = "Date", y = "Reported COVID-19 case rates") + ## theme(legend.position = "none") +plotly::ggplotly(plt) - -archive_cases_dv_subset %>% - epix_slide(ref_time_values = .$DT$version %>% unique() %>% `[`(as.POSIXlt(.)$wday == 1L), - before = 14L, ~ slice_max(.x, time_value, by = geo_value), as_list_col = TRUE) %>% - rename(version = time_value) %>% - unnest(slide_value) %>% - left_join( - archive_cases_dv_subset %>% epix_as_of(.$versions_end), - by = c("geo_value", "time_value") - ) %>% - mutate(reldiff = abs(case_rate_7d_av.x - case_rate_7d_av.y)/(10 + abs(case_rate_7d_av.y))) %>% - mutate(version_lag = as.integer(version - time_value)) %>% - ## filter(version_lag %in% c(7L, 14L)) %>% - ## slice_max(reldiff, n = 10L, by = version_lag) %>% - slice_max(reldiff, n = 10L) %>% - select(version_lag, geo_value, version, case_rate_7d_av.x, case_rate_7d_av.y, reldiff) %>% - print(n = 100L) +## archive_cases_dv_subset %>% +## epix_slide(ref_time_values = .$DT$version %>% unique() %>% `[`(as.POSIXlt(.)$wday == 1L), +## before = 14L, ~ slice_max(.x, time_value, by = geo_value), as_list_col = TRUE) %>% +## rename(version = time_value) %>% +## unnest(slide_value) %>% +## left_join( +## archive_cases_dv_subset %>% epix_as_of(.$versions_end), +## by = c("geo_value", "time_value") +## ) %>% +## mutate(reldiff = abs(case_rate_7d_av.x - case_rate_7d_av.y)/(10 + abs(case_rate_7d_av.y))) %>% +## mutate(version_lag = as.integer(version - time_value)) %>% +## ## filter(version_lag %in% c(7L, 14L)) %>% +## ## slice_max(reldiff, n = 10L, by = version_lag) %>% +## slice_max(reldiff, n = 10L) %>% +## select(version_lag, geo_value, version, case_rate_7d_av.x, case_rate_7d_av.y, reldiff) %>% +## print(n = 100L) ``` diff --git a/vignettes/latency_revision_stats.Rmd 
b/vignettes/latency_revision_stats.Rmd index 872cb673..84f5f1ee 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -30,16 +30,15 @@ hhs_influenza_hosp_issue_data_path = "hhs_influenza_hosp_issue_data.rds" if (!file.exists(hhs_influenza_hosp_issue_data_path)) { cat("Fetching.\n") hhs_influenza_hosp_issue_data = - covidcast( - data_source = "hhs", signals = "confirmed_admissions_influenza_1d", + pub_covidcast( + source = "hhs", signals = "confirmed_admissions_influenza_1d", geo_type = "state", time_type = "day", geo_values = "*", # all `time_value`s: time_values = "*", # all issues through the analysis issue issues = epirange(12340101, hhs_influenza_hosp_analysis_issue) - ) %>% - fetch() + ) saveRDS(hhs_influenza_hosp_issue_data, hhs_influenza_hosp_issue_data_path) writeLines(c("License: Public Domain US Government", "Accessed via COVIDcast Epidata API"), From ba546960b03a17cf5c6d9db1459c202b13de86ba Mon Sep 17 00:00:00 2001 From: "Logan C. Brooks" Date: Wed, 17 Jul 2024 12:51:26 -0700 Subject: [PATCH 20/21] Prominently mark buggy example `dup` latency definitions, pending deletion/fix --- vignettes/latency_revision_stats.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vignettes/latency_revision_stats.Rmd b/vignettes/latency_revision_stats.Rmd index 84f5f1ee..f40bd660 100644 --- a/vignettes/latency_revision_stats.Rmd +++ b/vignettes/latency_revision_stats.Rmd @@ -7,6 +7,8 @@ vignette: > %\VignetteEncoding{UTF-8} --- +# FIXME the `dup` latency definitions are buggy + # Load data ```{r} From 8ad31974ff2f14c31e4d24ecdc7ad12213d2b64b Mon Sep 17 00:00:00 2001 From: "Logan C. 
Brooks" Date: Tue, 30 Jul 2024 08:55:33 -0700 Subject: [PATCH 21/21] Update forecasting-intro.Rmd with later epidatr, epiprocess --- vignettes/forecasting-intro.Rmd | 51 ++++++++++++++------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/vignettes/forecasting-intro.Rmd b/vignettes/forecasting-intro.Rmd index 0a51fbbc..ea5c3e2a 100644 --- a/vignettes/forecasting-intro.Rmd +++ b/vignettes/forecasting-intro.Rmd @@ -47,23 +47,14 @@ library(epipredict) ga_hhs_analysis_issue = as.Date("2023-04-05") - 2L ga_hhs_issue_data_path = "ga_hhs_issue_data.rds" -if (!file.exists(ga_hhs_issue_data_path)) { - ga_hhs_issue_data = - covidcast( - "ga_hhs", "confirmed_admissions_influenza_1d", - geo_type = "state", time_type = "day", - # TODO explain * in comments - geo_values = "ga", time_values = "*", - issues = "*" - ) %>% - fetch_tbl() - saveRDS(ga_hhs_issue_data, ga_hhs_issue_data_path) - writeLines(c("License: Public Domain US Government","Accessed via COVIDcast Epidata API"), - "LICENSE_ga_hhs_issue_data.txt") -} else { - ga_hhs_issue_data = readRDS(ga_hhs_issue_data_path) -} -# TODO use, e.g., cachem. And explain why caching. 
+ga_hhs_issue_data = + pub_covidcast( + "hhs", "confirmed_admissions_influenza_1d", + geo_type = "state", time_type = "day", + # TODO explain * in comments + geo_values = "ga", time_values = "*", + issues = epirange(12340101, ga_hhs_analysis_issue) + ) ``` Convert to `epi_archive` format: @@ -132,7 +123,7 @@ ga_hhs_archive %>% ref_time_values = forecast_dates, before = 365000, # train on all past data as_list_col = TRUE, - function(edf, gk) { + function(edf, gk, v) { edf %>% filter(time_value >= hhs_start_time_value) %>% arx_forecaster( @@ -172,8 +163,7 @@ version_aware_covariates = ref_time_values = training_and_testing_issues, before = forecast_date_latency + 14L, # include just enough to get the lags we want names_sep = NULL, - function(edf, gk) { - ref_time_value = attr(edf, "metadata")$as_of + function(edf, gk, ref_time_value) { edf %>% filter(time_value >= hhs_start_time_value) %>% complete(geo_value, time_value = full_seq(c(time_value, ref_time_value), period=1L)) %>% @@ -187,19 +177,22 @@ version_aware_covariates = } ) -full_join(version_aware_covariates, forecast_date_edf) %>% +fc = full_join(version_aware_covariates, forecast_date_edf, by = c("geo_value", "time_value")) %>% as_epi_df(as_of = forecast_date) %>% na.omit() %>% arx_forecaster( - outcome = "admissions", - predictors = c("x1","x2","x3"), - args_list = arx_args_list( - lags = 0L, - ahead = 14L, - forecast_date = forecast_date, - # Use defaults for everything else. + outcome = "admissions", + predictors = c("x1","x2","x3"), + args_list = arx_args_list( + lags = 0L, + ahead = 14L, + forecast_date = forecast_date, + # Use defaults for everything else. + ) ) -) + +fc$epi_workflow %>% extract_fit_engine() # need to re-impl no-intercept model to get clear picture +fc %>% autoplot() ```