In case it helps, forecasttools-py already does this, though likely not in the exact manner that is needed for this repository.

So that the code is more findable, I am including some here:

Contents of pyproject.toml (in [tool.poetry])

# Bundled data files; each one is packaged into both the sdist and the wheel.
include = [
    { path = "forecasttools/location_table.parquet", format = ["sdist", "wheel"] },
    { path = "forecasttools/example_flusight_submission.parquet", format = ["sdist", "wheel"] },
    { path = "forecasttools/example_flu_forecast_wo_dates.nc", format = ["sdist", "wheel"] },
    { path = "forecasttools/example_flu_forecast_w_dates.nc", format = ["sdist", "wheel"] },
    { path = "forecasttools/nhsn_hosp_COVID.parquet", format = ["sdist", "wheel"] },
    { path = "forecasttools/nhsn_hosp_flu.parquet", format = ["sdist", "wheel"] },
]

Contents of __init__.py (parquet files inside the package)

# Example fitting data shipped inside the package
# (NHSN, as of 2024-09-26). The package resource
# root is resolved once and shared by both datasets.
_package_files = importlib.resources.files(__package__)

# example fitting data for COVID
nhsn_hosp_COVID_path = _package_files / "nhsn_hosp_COVID.parquet"
nhsn_hosp_COVID = pl.read_parquet(nhsn_hosp_COVID_path)

# example fitting data for influenza
nhsn_hosp_flu_path = _package_files / "nhsn_hosp_flu.parquet"
nhsn_hosp_flu = pl.read_parquet(nhsn_hosp_flu_path)

Contents of data.py

def make_nshn_fitting_dataset(
    dataset: str,
    nhsn_dataset_path: str,
    file_save_path: str,
) -> None:
    """
    Create a polars dataset with columns state,
    date, and hosp and save it as a CSV. Can be
    used for COVID or influenza. This function
    DOES NOT use the API endpoint, and instead
    expects a CSV.

    dataset
        Name of the dataset to create. Either
        "COVID" or "flu".
    nhsn_dataset_path
        Path to the NHSN dataset (csv file).
    file_save_path
        The path for where to save the output file.
        It is validated by check_file_save_path
        before anything is read.

    Raises
    ------
    AssertionError
        If dataset is not "COVID" or "flu".
    FileNotFoundError
        If nhsn_dataset_path does not exist.
    ValueError
        If the NHSN CSV lacks any of the required
        columns (state, date, and both pathogen
        admission columns).
    """
    # NHSN admissions column that becomes "hosp" for each dataset
    hosp_col_by_dataset = {
        "COVID": "previous_day_admission_adult_covid_confirmed",
        "flu": "previous_day_admission_influenza_confirmed",
    }
    # check that dataset parameter is possible; raised explicitly
    # (rather than with `assert`) so the check survives `python -O`,
    # while keeping the AssertionError type callers may rely on
    if dataset not in tuple(hosp_col_by_dataset):
        raise AssertionError(
            f'Dataset {dataset} must be one of "COVID", "flu"'
        )
    # check the file path is valid
    check_file_save_path(file_save_path)
    # check that a data file exists
    if not os.path.exists(nhsn_dataset_path):
        raise FileNotFoundError(
            f"The file {nhsn_dataset_path} does not exist."
        )
    # check that the CSV has the needed columns; the lazy scan
    # reads the header without loading the full file
    df_cols = pl.scan_csv(nhsn_dataset_path).columns
    required_cols = ["state", "date", *hosp_col_by_dataset.values()]
    missing_cols = set(required_cols) - set(df_cols)
    if missing_cols:
        raise ValueError(
            f"NHSN dataset missing required columns: {missing_cols}"
        )
    # fully load the NHSN dataframe
    df = pl.read_csv(nhsn_dataset_path)
    # swap "/" for "-" in the date strings (intended to yield
    # ISO8601; assumes the source dates are year-first)
    df = df.with_columns(df["date"].str.replace_all("/", "-"))
    # keep only the pathogen specific column, rename it to "hosp",
    # and save the sorted result
    hosp_col = hosp_col_by_dataset[dataset]
    df_out = (
        df.select(["state", "date", hosp_col])
        .rename({hosp_col: "hosp"})
        .sort(["state", "date"])
    )
    df_out.write_csv(file_save_path)
    print(f"The file {file_save_path} has been created.")

Save NHSN Data for model fitting #202

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions