Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

📊 Adding global health data for mpox #3229

Merged
merged 8 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -720,13 +720,20 @@ steps:
- data://meadow/who/latest/monkeypox
- data://garden/demography/2023-03-31/population
- data://garden/regions/2023-01-01/regions
- data://grapher/fasttrack/latest/africa_cdc
- data://garden/health/latest/global_health_mpox
data://grapher/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
data://explorers/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
export://github/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
# Mpox - Global.health
data://meadow/health/latest/global_health_mpox:
- snapshot://health/latest/global_health_mpox.csv
data://garden/health/latest/global_health_mpox:
- data://meadow/health/latest/global_health_mpox

# Eurostat cancer

# Eurostat Cancer Screening
data://meadow/health/2024-08-23/eurostat_cancer:
Expand All @@ -736,6 +743,7 @@ steps:
data://grapher/health/2024-08-23/eurostat_cancer:
- data://garden/health/2024-08-23/eurostat_cancer


# Multi-dim indicators
export://multidim/health/latest/causes_of_death:
- grapher://grapher/ihme_gbd/2024-05-20/gbd_cause
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"Burundi": "Burundi",
"Cameroon": "Cameroon",
"Central African Republic": "Central African Republic",
"Cote d'Ivoire": "Cote d'Ivoire",
"Democratic Republic of the Congo": "Democratic Republic of Congo",
"Gabon": "Gabon",
"Ghana": "Ghana",
"Kenya": "Kenya",
"Liberia": "Liberia",
"Nigeria": "Nigeria",
"Rwanda": "Rwanda",
"South Africa": "South Africa",
"Sweden": "Sweden",
"Thailand": "Thailand",
"Uganda": "Uganda",
"Burundi ": "Burundi",
"Republic of the Congo": "Congo"
}
25 changes: 25 additions & 0 deletions etl/steps/data/garden/health/latest/global_health_mpox.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Global Health
- Mpox (monkeypox)


# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 1


tables:
global_health_mpox:
variables:
reported_cases:
title: Reported suspected mpox cases
unit: cases
suspected_cases_cumulative:
title: Cumulative suspected mpox cases
unit: cases
description_processing: Data for 2024 is taken from Global.health, data for 2023 is taken from Africa CDC.
54 changes: 54 additions & 0 deletions etl/steps/data/garden/health/latest/global_health_mpox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Load a meadow dataset and create a garden dataset."""

from owid.catalog import Table
from owid.catalog import processing as pr

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step; `run` uses this object to
# load the meadow dataset and to locate the country mapping file.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the garden dataset of suspected mpox cases.

    Loads the ``global_health_mpox`` meadow table, harmonizes country names,
    keeps the rows whose ``case_status`` is ``"suspected"``, counts them per
    country and date, appends hard-coded entries dated 2023-12-24 for three
    countries, and derives a per-country cumulative total.

    Args:
        dest_dir: Destination directory handed to ``create_dataset``.
    """
    #
    # Load inputs.
    #
    meadow_ds = paths.load_dataset("global_health_mpox")
    cases = meadow_ds["global_health_mpox"].reset_index()

    #
    # Process data.
    #
    cases = geo.harmonize_countries(df=cases, countries_file=paths.country_mapping_path)

    # Only suspected cases are of interest for this dataset.
    suspected_mask = cases["case_status"] == "suspected"
    cases = cases[suspected_mask]

    # Count rows per country and reporting date. After the count, the
    # `case_status` column holds the number of suspected cases; the `id`
    # column carries the same count and is dropped.
    per_country_date = cases.groupby(["country", "date"], observed=True)
    daily_counts = per_country_date.count().reset_index()
    daily_counts = daily_counts.drop(columns=["id"])

    # Suspected cases for 2023, entered by hand as a single dated entry per country.
    # Values are kept as strings here and cast to integers after concatenation.
    counts_2023 = {
        "Cameroon": "113",
        "Congo": "74",
        "Democratic Republic of Congo": "12985",
    }
    rows_2023 = Table(
        {
            "country": list(counts_2023),
            "date": ["2023-12-24"] * len(counts_2023),
            "case_status": list(counts_2023.values()),
        }
    )

    combined = pr.concat([daily_counts, rows_2023])
    combined = combined.sort_values(["country", "date"])

    # Running total of suspected cases within each country, in date order.
    combined["case_status"] = combined["case_status"].astype("int")
    by_country = combined.groupby(["country"])
    combined["suspected_cases_cumulative"] = by_country["case_status"].cumsum()

    combined = combined.rename(columns={"case_status": "reported_cases"})
    combined = combined.format(["country", "date"])

    #
    # Save outputs.
    #
    # The garden dataset falls back to the meadow dataset's metadata.
    garden_ds = create_dataset(
        dest_dir,
        tables=[combined],
        check_variables_metadata=True,
        default_metadata=meadow_ds.metadata,
    )
    garden_ds.save()
4 changes: 2 additions & 2 deletions etl/steps/data/garden/who/latest/monkeypox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def run(dest_dir: str) -> None:
#
# Load meadow dataset.
ds_meadow = paths.load_dataset("monkeypox")
ds_suspected = paths.load_dataset("africa_cdc")
ds_suspected = paths.load_dataset("global_health_mpox")
# Read table from meadow dataset.
tb = ds_meadow["monkeypox"].reset_index()
tb_suspected = ds_suspected["africa_cdc"].reset_index()
tb_suspected = ds_suspected["global_health_mpox"].reset_index()
cols = ["country", "date", "suspected_cases_cumulative"]
tb_suspected = tb_suspected[cols]
assert tb_suspected.shape[1] == len(cols)
Expand Down
36 changes: 36 additions & 0 deletions etl/steps/data/meadow/health/latest/global_health_mpox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step; `run` uses this object to
# load the snapshot.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the meadow dataset from the ``global_health_mpox.csv`` snapshot.

    Keeps four columns of the snapshot, checks that every row has a report
    date, renames the location and date columns, and indexes the table by
    ``id``, ``country`` and ``date``.

    Args:
        dest_dir: Destination directory handed to ``create_dataset``.
    """
    #
    # Load inputs.
    #
    snapshot = paths.load_snapshot("global_health_mpox.csv")
    raw = snapshot.read(low_memory=False)

    # Restrict the table to the columns used downstream.
    wanted_columns = ["ID", "Case_status", "Location_Admin0", "Date_report_source_I"]
    table = raw[wanted_columns]

    # Every record must carry a report date.
    assert table["Date_report_source_I"].notna().all()

    new_names = {"Date_report_source_I": "date", "Location_Admin0": "country"}
    table = table.rename(columns=new_names)

    #
    # Process data.
    #
    # Snake-case the columns, set the index and sort. The table has one row
    # per individual; aggregation happens in the garden step, so the ID stays
    # in the index for now.
    table = table.format(["id", "country", "date"])

    #
    # Save outputs.
    #
    # The meadow dataset falls back to the snapshot's metadata.
    meadow_ds = create_dataset(
        dest_dir,
        tables=[table],
        check_variables_metadata=True,
        default_metadata=snapshot.metadata,
    )
    meadow_ds.save()
1 change: 1 addition & 0 deletions scripts/update-monkeypox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ start_time=$(date +%s)
echo '--- Update Monkeypox'
cd /home/owid/etl
poetry run python snapshots/who/latest/monkeypox.py
poetry run python snapshots/health/latest/global_health_mpox.py

# A commit to master triggers the ETL, which then runs the step.
echo '--- Commit and push changes'
Expand Down
27 changes: 27 additions & 0 deletions snapshots/health/latest/global_health_mpox.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: Mpox - 2024
date_published: "2024-09-02"

# Citation
producer: Global.health
citation_full: |-
Global.health Mpox (accessed on 2024-09-02)

# Files
url_main: https://global.health/
url_download: https://mpox-2024.s3.eu-central-1.amazonaws.com/latest.csv
date_accessed: 2024-09-02

# License
license:
name: CC BY 4.0
url: https://global.health/terms-of-use/

outs:
- md5: cf3c0ac7af89613fc2aa7e6dcdf954d0
size: 7902134
path: global_health_mpox.csv
24 changes: 24 additions & 0 deletions snapshots/health/latest/global_health_mpox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version for current snapshot dataset, taken from the name of the directory
# that contains this script.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Build the snapshot handle from its URI (namespace / version / file name).
    snapshot_uri = f"health/{SNAPSHOT_VERSION}/global_health_mpox.csv"
    snapshot = Snapshot(snapshot_uri)

    # Fetch the data from the source and register it with DVC; the upload to
    # S3 is controlled by the --upload/--skip-upload flag.
    snapshot.create_snapshot(upload=upload)


# Invoke the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()
Loading