Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

📊 covid: merge categories #3300

Merged
merged 4 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 36 additions & 4 deletions etl/steps/data/garden/health/2024-09-05/seattle_pathogens.meta.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
others:
group_notes: |-
<% if organism == 'RSV' %>
This includes RSV A and RSV B.
<% elif organism == 'Influenza A' %>
This includes Influenza A H1N1 and H3N2.
<% endif %>
common:
presentation:
topic_tags:
Expand All @@ -21,20 +28,45 @@ tables:
title: Number of specimens with pathogen <<organism>>
unit: specimens
description_short: |-
The number of specimens detected with respiratory pathogen << organism >>.
The number of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes}

percentage:
title: Percentage of specimens with pathogen <<organism>>
description_short: |-
The share of specimens detected with respiratory pathogen << organism >>.
The share of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes}
unit: "%"
short_unit: "%"
display:
numDecimalPlaces: 2
name: << organism >>

tested:
title: Total number of specimens tested for pathogen <<organism>> present
title: Total number of specimens tested for presence of pathogen <<organism>>
description_short: |-
The number of specimens tested to detect the respiratory pathogen << organism >>. Some of these specimens may have tested positive for the pathogen, while others (or all of them) may have tested negative.
The number of specimens tested to detect the respiratory pathogen << organism >>. Some of these specimens may have tested positive for the pathogen, while others (or all of them) may have tested negative. {definitions.others.group_notes}
unit: specimens


seattle_pathogens_month:
variables:
present_month:
title: Number of specimens with pathogen <<organism>> (monthly)
unit: specimens
description_short: |-
The number of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes}

percentage_month:
title: Percentage of specimens with pathogen <<organism>> (monthly)
description_short: |-
The share of specimens detected with respiratory pathogen << organism >>. {definitions.others.group_notes}
unit: "%"
short_unit: "%"
display:
numDecimalPlaces: 2
name: << organism >>

tested_month:
title: Total number of specimens tested for presence of pathogen <<organism>> (monthly)
description_short: |-
The number of specimens tested to detect the respiratory pathogen << organism >>. Some of these specimens may have tested positive for the pathogen, while others (or all of them) may have tested negative. {definitions.others.group_notes}
unit: specimens
80 changes: 68 additions & 12 deletions etl/steps/data/garden/health/2024-09-05/seattle_pathogens.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
"""Load a meadow dataset and create a garden dataset."""

from typing import List

import pandas as pd
from owid.catalog import Table
from owid.catalog.processing import concat

from etl.helpers import PathFinder, create_dataset

Expand Down Expand Up @@ -38,35 +42,87 @@ def run(dest_dir: str) -> None:
#
# Process data.
#

# Check all organisms are present
assert set(tb["organism"].unique()) == set(ORGANISM_RENAME.keys())
# Rename organism
tb["organism"] = tb["organism"].cat.rename_categories(ORGANISM_RENAME)

# Change date format from week YYYY-Www to YYYY-MM-DD
tb["date"] = pd.to_datetime(tb["week"].astype("string") + "-1", format="%G-W%V-%w")

# Keep relevant columns
tb = tb[["date", "organism", "present", "tested"]]

# Estimate percentage
assert (tb["tested"] != 0).all(), "Some zeroes in tested column! This can lead to division by zero."
tb["percentage"] = 100 * tb["present"] / tb["tested"]

# Add entity
tb["country"] = "Seattle"

# Month
tb_month = make_monthly_table(tb)

# Weekly data
tb = process_table(tb)
# Monthly data
tb_month = process_table(tb_month)
tb_month = tb_month.rename(
columns={
"present": "present_month",
"tested": "tested_month",
"percentage": "percentage_month",
}
)
# Format
tb = tb.format(["country", "date", "organism"])
tables = [
tb.format(["country", "date", "organism"]),
tb_month.format(["country", "date", "organism"], short_name="seattle_pathogens_month"),
]

#
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(
dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
dest_dir,
tables=tables,
check_variables_metadata=True,
default_metadata=ds_meadow.metadata,
)

# Save changes in the new garden dataset.
ds_garden.save()


def add_pathogen_group(tb: Table, group_name: str, group_pathogens: List[str]):
    """Append per-date aggregate rows for a group of organisms.

    Rows whose "organism" is listed in `group_pathogens` are selected, their
    "present" and "percentage" values are summed for each "date", and the
    resulting rows are labelled with `group_name` and appended to `tb`.

    Only "present" and "percentage" are aggregated; any other column of `tb`
    is not carried into the aggregate rows.

    Args:
        tb: Table with at least "date", "organism", "present" and
            "percentage" columns.
        group_name: Value written to "organism" for the aggregate rows.
        group_pathogens: Organism names that make up the group.

    Returns:
        The concatenation of `tb` and the new aggregate rows.
    """
    # Restrict to the organisms belonging to this group (on a copy).
    is_member = tb["organism"].isin(group_pathogens)
    members = tb[is_member].copy()

    # One aggregate row per date.
    summed_columns = ["present", "percentage"]
    aggregated = members.groupby("date", as_index=False)[summed_columns].sum()
    aggregated["organism"] = group_name

    return concat([tb, aggregated])


def make_monthly_table(tb: Table) -> Table:
    """Aggregate a table to monthly totals per organism.

    Works on a copy of `tb`. Each "date" is mapped to its calendar month,
    "present" and "tested" are summed per (month, organism), and the month is
    then written back to "date" as the date of the month's last day.

    Args:
        tb: Table with a datetime "date" column plus "organism", "present"
            and "tested" columns.

    Returns:
        A new table with columns "date", "organism", "present" and "tested".
    """
    monthly = tb.copy()

    # Collapse each date to its month period so rows of one month group together.
    monthly["date"] = monthly["date"].dt.to_period("M")

    # Sum both counts within each (month, organism) pair.
    value_columns = ["present", "tested"]
    monthly = monthly.groupby(["date", "organism"], as_index=False)[value_columns].sum()

    # Represent each month by the calendar date of its final day.
    month_end = monthly["date"].dt.to_timestamp(how="end")
    monthly["date"] = month_end.dt.date

    return monthly


def process_table(tb: Table) -> Table:
    """Add the percentage column, pathogen groups and the entity column.

    Steps, in order:
      1. Compute "percentage" as 100 * present / tested (written onto `tb`).
      2. Append aggregate rows for each pathogen group via
         `add_pathogen_group`.
      3. Set "country" to "Seattle" on every row.

    Args:
        tb: Table with "date", "organism", "present" and "tested" columns.

    Returns:
        The table with the extra "percentage" and "country" columns and the
        aggregate group rows appended.

    Raises:
        AssertionError: If any value in "tested" is zero.
    """
    # A zero denominator would make the share undefined, so fail early.
    assert (
        tb["tested"] != 0
    ).all(), "Some zeroes in tested column! This can lead to division by zero."
    tb["percentage"] = 100 * tb["present"] / tb["tested"]

    # Groups to append, as (group name, member organisms).
    # NOTE: a broader "Influenza" group (both A subtypes plus Influenza B) is
    # currently disabled.
    pathogen_groups = (
        ("RSV", ["RSV A", "RSV B"]),
        ("Influenza A", ["Influenza A (H3N2)", "Influenza A (H1N1)"]),
    )
    for name, members in pathogen_groups:
        tb = add_pathogen_group(tb, name, members)

    # Single entity for the whole dataset.
    tb["country"] = "Seattle"

    return tb
6 changes: 2 additions & 4 deletions etl/steps/data/grapher/health/2024-09-05/seattle_pathogens.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,17 @@ def run(dest_dir: str) -> None:
# Load garden dataset.
ds_garden = paths.load_dataset("seattle_pathogens")

# Read table from garden dataset.
tb = ds_garden["seattle_pathogens"]

#
# Process data.
#
tables = list(ds_garden)

#
# Save outputs.
#
# Create a new grapher dataset with the same metadata as the garden dataset.
ds_grapher = create_dataset(
dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata
)

# Save changes in the new grapher dataset.
Expand Down
Loading