From d2ff4f547a4aa5f755f7d8ca52459d7ea48122a3 Mon Sep 17 00:00:00 2001 From: Mojmir Vinkler Date: Thu, 22 Aug 2024 12:17:15 +0200 Subject: [PATCH] :bar_chart: Replace backported dataset by proper GHO (#3171) * :bar_chart: Replace backported dataset by proper GHO --- dag/health.yml | 2 +- .../data/garden/who/2023-06-01/cholera.py | 59 +++++++++++-------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/dag/health.yml b/dag/health.yml index eab71d6c93b..88430719abc 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -140,7 +140,7 @@ steps: # Cholera data://garden/who/2023-06-01/cholera: - - backport://backport/owid/latest/dataset_5676_global_health_observatory__world_health_organization__2022_08 + - data://garden/who/2024-01-03/gho - snapshot://fasttrack/2023-05-31/cholera.csv - data://garden/regions/2023-01-01/regions - data://garden/wb/2021-07-01/wb_income diff --git a/etl/steps/data/garden/who/2023-06-01/cholera.py b/etl/steps/data/garden/who/2023-06-01/cholera.py index 1467dbd247d..8ec3e6a13f2 100644 --- a/etl/steps/data/garden/who/2023-06-01/cholera.py +++ b/etl/steps/data/garden/who/2023-06-01/cholera.py @@ -2,7 +2,6 @@ import pandas as pd from owid.catalog import Dataset, Table -from owid.catalog.datasets import NULLABLE_DTYPES from structlog import get_logger from etl.data_helpers import geo @@ -21,30 +20,19 @@ def run(dest_dir: str) -> None: # Load inputs. # # Load backport dataset. - short_name = "dataset_5676_global_health_observatory__world_health_organization__2022_08" - who_gh_dataset: Dataset = paths.load_dependency(short_name) - who_gh = who_gh_dataset[short_name].reset_index() + who_gh_dataset = paths.load_dataset("gho") # Load fast track dataset - snap: Dataset = paths.load_dependency("cholera.csv") + snap = paths.load_snapshot("cholera.csv") cholera_ft = pd.read_csv(snap.path) # Load countries regions - regions_dataset: Dataset = paths.load_dependency("regions") + regions_dataset = paths.load_dataset("regions") regions = regions_dataset["regions"] - # Process backport dataset - cholera_cols = who_gh.columns[who_gh.columns.str.contains("cholera")].to_list() - cholera_bp = who_gh[["year", "entity_name"] + cholera_cols] - cholera_bp[cholera_cols] = cholera_bp[cholera_cols].apply(pd.to_numeric, errors="coerce") - cholera_bp = cholera_bp.dropna(how="all", axis=0, subset=cholera_cols).rename( - columns={ - "entity_name": "country", - "indicator__cholera_case_fatality_rate": "cholera_case_fatality_rate", - "indicator__number_of_reported_cases_of_cholera": "cholera_reported_cases", - "indicator__number_of_reported_deaths_from_cholera": "cholera_deaths", - } - ) + # Process GHO dataset + cholera_bp = process_gho_cholera(who_gh_dataset).reset_index() + # The regional and global data in the backport is only provided for 2013 so we remove it here and recalculate it cholera_bp = geo.harmonize_countries( df=cholera_bp, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path @@ -58,9 +46,6 @@ def run(dest_dir: str) -> None: tb_garden = Table(cholera_combined.set_index(["country", "year"], verify_integrity=True), short_name="cholera") - # Convert nullable types to float64, otherwise we risk pd.NA and np.nan being mixed up. - float64_cols = [col for col, dtype in tb_garden.dtypes.items() if dtype in NULLABLE_DTYPES] - tb_garden[float64_cols] = tb_garden[float64_cols].astype(float) # Save outputs. # # Create a new garden dataset with the same metadata as the meadow dataset. @@ -72,6 +57,30 @@ def run(dest_dir: str) -> None: log.info("cholera.end") +def process_gho_cholera(who_gh_dataset: Dataset) -> Table: + tb_names = [ + "cholera_case_fatality_rate", + "number_of_reported_cases_of_cholera", + "number_of_reported_deaths_from_cholera", + ] + cholera_bp = who_gh_dataset[tb_names[0]] + for tb_name in tb_names[1:]: + cholera_bp = cholera_bp.join(who_gh_dataset[tb_name], how="outer") + + return ( + cholera_bp.drop(columns=["comments"]) + .rename( + columns={ + "cholera_case_fatality_rate": "cholera_case_fatality_rate", + "number_of_reported_cases_of_cholera": "cholera_reported_cases", + "number_of_reported_deaths_from_cholera": "cholera_deaths", + } + ) + .dropna(how="all", axis=0) + .astype(float) + ) + + def add_global_total(df: pd.DataFrame, regions: Table) -> pd.DataFrame: """ Calculate global total of cholera cases and add it to the existing dataset @@ -85,7 +94,7 @@ def add_global_total(df: pd.DataFrame, regions: Table) -> pd.DataFrame: ), f"{df['country'][~df['country'].isin(countries)].drop_duplicates()}, is not a country" df_glob = df.groupby(["year"]).agg({"cholera_reported_cases": "sum", "cholera_deaths": "sum"}).reset_index() df_glob["country"] = "World" - df_glob["cholera_case_fatality_rate"] = (df_glob["cholera_deaths"] / df_glob["cholera_reported_cases"]) * 100 + df_glob["cholera_case_fatality_rate"] = cholera_case_fatality_rate(df_glob) df = pd.concat([df, df_glob]) return df @@ -122,8 +131,12 @@ def add_regions(df: pd.DataFrame, regions: Table) -> pd.DataFrame: ) df_cont = df_cont[df_cont["country"].isin(continents)] df_out = pd.concat([df_out, df_cont]) - df_out["cholera_case_fatality_rate"] = (df_out["cholera_deaths"] / df_out["cholera_reported_cases"]) * 100 + df_out["cholera_case_fatality_rate"] = cholera_case_fatality_rate(df_out) df = pd.concat([df, df_out]) return df + + +def cholera_case_fatality_rate(df: pd.DataFrame) -> pd.Series: + return (df["cholera_deaths"] / df["cholera_reported_cases"]) * 100