Skip to content

Commit

Permalink
📊 Replace backported dataset by proper GHO (#3171)
Browse files Browse the repository at this point in the history
* 📊 Replace backported dataset by proper GHO
  • Loading branch information
Marigold authored Aug 22, 2024
1 parent 984e074 commit d2ff4f5
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 24 deletions.
2 changes: 1 addition & 1 deletion dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ steps:

# Cholera
data://garden/who/2023-06-01/cholera:
- backport://backport/owid/latest/dataset_5676_global_health_observatory__world_health_organization__2022_08
- data://garden/who/2024-01-03/gho
- snapshot://fasttrack/2023-05-31/cholera.csv
- data://garden/regions/2023-01-01/regions
- data://garden/wb/2021-07-01/wb_income
Expand Down
59 changes: 36 additions & 23 deletions etl/steps/data/garden/who/2023-06-01/cholera.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pandas as pd
from owid.catalog import Dataset, Table
from owid.catalog.datasets import NULLABLE_DTYPES
from structlog import get_logger

from etl.data_helpers import geo
Expand All @@ -21,30 +20,19 @@ def run(dest_dir: str) -> None:
# Load inputs.
#
# Load GHO dataset (replaces the former backport dataset).
short_name = "dataset_5676_global_health_observatory__world_health_organization__2022_08"
who_gh_dataset: Dataset = paths.load_dependency(short_name)
who_gh = who_gh_dataset[short_name].reset_index()
who_gh_dataset = paths.load_dataset("gho")

# Load fast track dataset
snap: Dataset = paths.load_dependency("cholera.csv")
snap = paths.load_snapshot("cholera.csv")
cholera_ft = pd.read_csv(snap.path)

# Load countries regions
regions_dataset: Dataset = paths.load_dependency("regions")
regions_dataset = paths.load_dataset("regions")
regions = regions_dataset["regions"]

# Process backport dataset
cholera_cols = who_gh.columns[who_gh.columns.str.contains("cholera")].to_list()
cholera_bp = who_gh[["year", "entity_name"] + cholera_cols]
cholera_bp[cholera_cols] = cholera_bp[cholera_cols].apply(pd.to_numeric, errors="coerce")
cholera_bp = cholera_bp.dropna(how="all", axis=0, subset=cholera_cols).rename(
columns={
"entity_name": "country",
"indicator__cholera_case_fatality_rate": "cholera_case_fatality_rate",
"indicator__number_of_reported_cases_of_cholera": "cholera_reported_cases",
"indicator__number_of_reported_deaths_from_cholera": "cholera_deaths",
}
)
# Process GHO dataset
cholera_bp = process_gho_cholera(who_gh_dataset).reset_index()

# The regional and global data in the backport is only provided for 2013 so we remove it here and recalculate it
cholera_bp = geo.harmonize_countries(
df=cholera_bp, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
Expand All @@ -58,9 +46,6 @@ def run(dest_dir: str) -> None:

tb_garden = Table(cholera_combined.set_index(["country", "year"], verify_integrity=True), short_name="cholera")

# Convert nullable types to float64, otherwise we risk pd.NA and np.nan being mixed up.
float64_cols = [col for col, dtype in tb_garden.dtypes.items() if dtype in NULLABLE_DTYPES]
tb_garden[float64_cols] = tb_garden[float64_cols].astype(float)
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
Expand All @@ -72,6 +57,30 @@ def run(dest_dir: str) -> None:
log.info("cholera.end")


def process_gho_cholera(who_gh_dataset: Dataset) -> Table:
    """Assemble the cholera indicators from the GHO dataset into a single table.

    Three tables are read from ``who_gh_dataset`` by name and outer-joined one
    after another (presumably on their shared index, as with
    ``pandas.DataFrame.join`` -- confirm against ``Table.join``). The
    ``comments`` column is then dropped, the indicator columns are renamed to
    the short names used by the rest of this step, rows in which every
    remaining column is missing are removed, and all values are cast to float.

    Args:
        who_gh_dataset: Dataset exposing the tables
            ``cholera_case_fatality_rate``,
            ``number_of_reported_cases_of_cholera`` and
            ``number_of_reported_deaths_from_cholera``.

    Returns:
        Table with the columns ``cholera_case_fatality_rate``,
        ``cholera_reported_cases`` and ``cholera_deaths``.
    """
    # GHO table name -> output column name. Dict order is also the join order.
    output_columns = {
        "cholera_case_fatality_rate": "cholera_case_fatality_rate",
        "number_of_reported_cases_of_cholera": "cholera_reported_cases",
        "number_of_reported_deaths_from_cholera": "cholera_deaths",
    }
    first_table, *remaining_tables = output_columns

    combined = who_gh_dataset[first_table]
    for table_name in remaining_tables:
        combined = combined.join(who_gh_dataset[table_name], how="outer")

    combined = combined.drop(columns=["comments"])
    combined = combined.rename(columns=output_columns)
    # Discard rows that carry no value for any of the indicators.
    combined = combined.dropna(how="all", axis=0)
    return combined.astype(float)


def add_global_total(df: pd.DataFrame, regions: Table) -> pd.DataFrame:
"""
Calculate global total of cholera cases and add it to the existing dataset
Expand All @@ -85,7 +94,7 @@ def add_global_total(df: pd.DataFrame, regions: Table) -> pd.DataFrame:
), f"{df['country'][~df['country'].isin(countries)].drop_duplicates()}, is not a country"
df_glob = df.groupby(["year"]).agg({"cholera_reported_cases": "sum", "cholera_deaths": "sum"}).reset_index()
df_glob["country"] = "World"
df_glob["cholera_case_fatality_rate"] = (df_glob["cholera_deaths"] / df_glob["cholera_reported_cases"]) * 100
df_glob["cholera_case_fatality_rate"] = cholera_case_fatality_rate(df_glob)
df = pd.concat([df, df_glob])

return df
Expand Down Expand Up @@ -122,8 +131,12 @@ def add_regions(df: pd.DataFrame, regions: Table) -> pd.DataFrame:
)
df_cont = df_cont[df_cont["country"].isin(continents)]
df_out = pd.concat([df_out, df_cont])
df_out["cholera_case_fatality_rate"] = (df_out["cholera_deaths"] / df_out["cholera_reported_cases"]) * 100
df_out["cholera_case_fatality_rate"] = cholera_case_fatality_rate(df_out)

df = pd.concat([df, df_out])

return df


def cholera_case_fatality_rate(df: pd.DataFrame) -> pd.Series:
    """Return the cholera case fatality rate as a percentage.

    Computed row by row as ``cholera_deaths / cholera_reported_cases * 100``.
    There is no guard for zero reported cases; the result of such a division
    is whatever pandas produces for it.

    Args:
        df: Frame containing the columns ``cholera_deaths`` and
            ``cholera_reported_cases``.

    Returns:
        Series of percentages aligned with the index of ``df``.
    """
    deaths = df["cholera_deaths"]
    reported_cases = df["cholera_reported_cases"]
    fatality_ratio = deaths / reported_cases
    return fatality_ratio * 100

0 comments on commit d2ff4f5

Please sign in to comment.