diff --git a/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.meta.yml b/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.meta.yml index 6b524ea9038..37b6fdaec56 100644 --- a/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.meta.yml +++ b/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.meta.yml @@ -1,5 +1,12 @@ # NOTE: To learn more about the fields, hover over their names. definitions: + others: + group_notes: |- + <% if organism == 'RSV' %> + This includes RSV A and RSV B. + <% elif organism == 'Influenza A' %> + This includes Influenza A H1N1 and H3N2. + <% endif %> common: presentation: topic_tags: @@ -21,12 +28,12 @@ tables: title: Number of specimens with pathogen << organism >> unit: specimens description_short: |- - The number of specimens detected with respiratory pathogen << organism >>. + The number of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes} percentage: title: Percentage of specimens with pathogen << organism >> description_short: |- - The share of specimens detected with respiratory pathogen << organism >>. + The share of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes} unit: "%" short_unit: "%" display: @@ -34,7 +41,32 @@ tables: name: << organism >> tested: - title: Total number of specimens tested for pathogen << organism >> present + title: Total number of specimens tested for presence of pathogen << organism >> description_short: |- - The number of specimens tested to detect the respiratory pathogen << organism >>. Some of these specimens may have tested positive for the pathogen, while others (or all of them) may have tested negative. + The number of specimens tested to detect the respiratory pathogen << organism >>. Some of these specimens may have tested positive for the pathogen, while others (or all of them) may have tested negative. 
{definitions.others.group_notes} + unit: specimens + + + seattle_pathogens_month: + variables: + present_month: + title: Number of specimens with pathogen << organism >> (monthly) + unit: specimens + description_short: |- + The number of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes} + + percentage_month: + title: Percentage of specimens with pathogen << organism >> (monthly) + description_short: |- + The share of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes} + unit: "%" + short_unit: "%" + display: + numDecimalPlaces: 2 + name: << organism >> + + tested_month: + title: Total number of specimens tested for presence of pathogen << organism >> (monthly) + description_short: |- + The number of specimens tested to detect the respiratory pathogen << organism >>. Some of these specimens may have tested positive for the pathogen, while others (or all of them) may have tested negative. {definitions.others.group_notes} unit: specimens diff --git a/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.py b/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.py index fe75937746b..1acad32badc 100644 --- a/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.py +++ b/etl/steps/data/garden/health/2024-09-05/seattle_pathogens.py @@ -1,6 +1,10 @@ """Load a meadow dataset and create a garden dataset.""" +from typing import List + import pandas as pd +from owid.catalog import Table +from owid.catalog.processing import concat from etl.helpers import PathFinder, create_dataset @@ -38,35 +42,87 @@ def run(dest_dir: str) -> None: # # Process data. 
# - # Check all organisms are present assert set(tb["organism"].unique()) == set(ORGANISM_RENAME.keys()) # Rename organism tb["organism"] = tb["organism"].cat.rename_categories(ORGANISM_RENAME) - # Change date format from week YYYY-Www to YYYY-MM-DD tb["date"] = pd.to_datetime(tb["week"].astype("string") + "-1", format="%G-W%V-%w") - # Keep relevant columns tb = tb[["date", "organism", "present", "tested"]] - # Estimate percentage - assert (tb["tested"] != 0).all(), "Some zeroes in tested column! This can lead to division by zero." - tb["percentage"] = 100 * tb["present"] / tb["tested"] - - # Add entity - tb["country"] = "Seattle" - + # Month + tb_month = make_monthly_table(tb) + + # Weekly data + tb = process_table(tb) + # Monthly data + tb_month = process_table(tb_month) + tb_month = tb_month.rename( + columns={ + "present": "present_month", + "tested": "tested_month", + "percentage": "percentage_month", + } + ) # Format - tb = tb.format(["country", "date", "organism"]) + tables = [ + tb.format(["country", "date", "organism"]), + tb_month.format(["country", "date", "organism"], short_name="seattle_pathogens_month"), + ] # # Save outputs. # # Create a new garden dataset with the same metadata as the meadow dataset. ds_garden = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + dest_dir, + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, ) # Save changes in the new garden dataset. 
ds_garden.save() + + +def add_pathogen_group(tb: Table, group_name: str, group_pathogens: List[str]): + tb_group = tb[tb["organism"].isin(group_pathogens)].copy() + + tb_group = tb_group.groupby("date", as_index=False)[["present", "percentage"]].sum() + tb_group["organism"] = group_name + + tb = concat([tb, tb_group]) + return tb + + +def make_monthly_table(tb: Table) -> Table: + tb_month = tb.copy() + # Extract year and month from date + tb_month["date"] = tb_month["date"].dt.to_period("M") + # Group by 'date' and 'organism' and sum the 'present' and 'tested' values + tb_month = tb_month.groupby(["date", "organism"], as_index=False)[["present", "tested"]].sum() + # Convert the monthly period in 'date' to the date of the last day of that month + tb_month["date"] = tb_month["date"].dt.to_timestamp(how="end").dt.date + + return tb_month + + +def process_table(tb: Table) -> Table: + # Estimate percentage + assert (tb["tested"] != 0).all(), "Some zeroes in tested column! This can lead to division by zero." + tb["percentage"] = 100 * tb["present"] / tb["tested"] + + # Add groupings + groups = { + "RSV": ["RSV A", "RSV B"], + "Influenza A": ["Influenza A (H3N2)", "Influenza A (H1N1)"], + # # "Influenza": ["Influenza A (H3N2)", "Influenza A (H1N1)", "Influenza B"], + } + for group_name, group_pathogens in groups.items(): + tb = add_pathogen_group(tb, group_name, group_pathogens) + + # Add entity + tb["country"] = "Seattle" + + return tb diff --git a/etl/steps/data/grapher/health/2024-09-05/seattle_pathogens.py b/etl/steps/data/grapher/health/2024-09-05/seattle_pathogens.py index cc11f95d357..d6f03732a10 100644 --- a/etl/steps/data/grapher/health/2024-09-05/seattle_pathogens.py +++ b/etl/steps/data/grapher/health/2024-09-05/seattle_pathogens.py @@ -13,19 +13,17 @@ def run(dest_dir: str) -> None: # Load garden dataset. ds_garden = paths.load_dataset("seattle_pathogens") - # Read table from garden dataset. - tb = ds_garden["seattle_pathogens"] - # # Process data. # + tables = list(ds_garden) # # Save outputs. 
# # Create a new grapher dataset with the same metadata as the garden dataset. ds_grapher = create_dataset( - dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata ) # Save changes in the new grapher dataset.