📊 Adding global health data for mpox (#3229)

* adding global health data * using global health for explorer * update explorer * auto-update script * combine mpox scripts * adding metadata * update snapshot
owid · Sep 9, 2024 · 2957843 · 2957843
1 parent bf1a645
commit 2957843
Show file tree

Hide file tree

Showing 9 changed files with 197 additions and 3 deletions.
diff --git a/dag/health.yml b/dag/health.yml
@@ -720,13 +720,20 @@ steps:
     - data://meadow/who/latest/monkeypox
     - data://garden/demography/2023-03-31/population
     - data://garden/regions/2023-01-01/regions
-    - data://grapher/fasttrack/latest/africa_cdc
+    - data://garden/health/latest/global_health_mpox
   data://grapher/who/latest/monkeypox:
     - data://garden/who/latest/monkeypox
   data://explorers/who/latest/monkeypox:
     - data://garden/who/latest/monkeypox
   export://github/who/latest/monkeypox:
     - data://garden/who/latest/monkeypox
+  # Mpox - Global.health
+  data://meadow/health/latest/global_health_mpox:
+    - snapshot://health/latest/global_health_mpox.csv
+  data://garden/health/latest/global_health_mpox:
+    - data://meadow/health/latest/global_health_mpox
+
+  # Eurostat cancer
 
    # Eurostat Cancer Screening
   data://meadow/health/2024-08-23/eurostat_cancer:
@@ -736,6 +743,7 @@ steps:
   data://grapher/health/2024-08-23/eurostat_cancer:
     - data://garden/health/2024-08-23/eurostat_cancer
 
+
   # Multi-dim indicators
   export://multidim/health/latest/causes_of_death:
     - grapher://grapher/ihme_gbd/2024-05-20/gbd_cause

diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.countries.json b/etl/steps/data/garden/health/latest/global_health_mpox.countries.json
@@ -0,0 +1,19 @@
+{
+  "Burundi": "Burundi",
+  "Cameroon": "Cameroon",
+  "Central African Republic": "Central African Republic",
+  "Cote d'Ivoire": "Cote d'Ivoire",
+  "Democratic Republic of the Congo": "Democratic Republic of Congo",
+  "Gabon": "Gabon",
+  "Ghana": "Ghana",
+  "Kenya": "Kenya",
+  "Liberia": "Liberia",
+  "Nigeria": "Nigeria",
+  "Rwanda": "Rwanda",
+  "South Africa": "South Africa",
+  "Sweden": "Sweden",
+  "Thailand": "Thailand",
+  "Uganda": "Uganda",
+  "Burundi ": "Burundi",
+  "Republic of the Congo": "Congo"
+}
diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml
@@ -0,0 +1,25 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Global Health
+        - Mpox (monkeypox)
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 1
+
+
+tables:
+  global_health_mpox:
+    variables:
+      reported_cases:
+        title: Reported suspected mpox cases
+        unit: cases
+      suspected_cases_cumulative:
+        title: Cumulative suspected mpox cases
+        unit: cases
+        description_processing: Data for 2024 is taken from Global.health; data for 2023 is taken from Africa CDC.
diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.py b/etl/steps/data/garden/health/latest/global_health_mpox.py
@@ -0,0 +1,54 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Table
+from owid.catalog import processing as pr
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Build the garden dataset of suspected mpox cases from the Global.health meadow table.
+
+    The meadow table is filtered to suspected cases, counted per country and date,
+    extended with hard-coded 2023 rows, and given a per-country cumulative column.
+
+    Args:
+        dest_dir: Forwarded unchanged to `create_dataset` as the output location.
+    """
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset.
+    ds_meadow = paths.load_dataset("global_health_mpox")
+
+    # Read table from meadow dataset.
+    tb = ds_meadow["global_health_mpox"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Harmonize country names using this step's country mapping file.
+    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+    # Keep only rows whose case status is exactly "suspected".
+    tb = tb[tb["case_status"] == "suspected"]
+    # Count rows per country and date. After `.count()` the `case_status` column no longer
+    # holds a status label: it holds the number of non-null values in each group, i.e. the
+    # number of suspected-case rows (assuming Table follows pandas groupby semantics - TODO confirm).
+    # `id` would carry the same kind of count, so it is dropped.
+    tb = tb.groupby(["country", "date"], observed=True).count().reset_index().drop(columns=["id"])
+    # Add hard-coded suspected-case figures for 2023. They go into `case_status` because that is
+    # the count column at this point; the values are strings here and are cast to int below.
+    # NOTE(review): this step's metadata attributes 2023 data to Africa CDC - confirm the figures there.
+    tb_2023 = Table(
+        {
+            "country": ["Cameroon", "Congo", "Democratic Republic of Congo"],
+            "date": ["2023-12-24", "2023-12-24", "2023-12-24"],
+            "case_status": ["113", "74", "12985"],
+        }
+    )
+    # Sort by country and date so the cumulative sum below accumulates in date order.
+    tb = pr.concat([tb, tb_2023]).sort_values(["country", "date"])
+    # Calculate the cumulative
+    # The cast makes the string values from `tb_2023` numeric before summing.
+    tb["case_status"] = tb["case_status"].astype("int")
+    tb["suspected_cases_cumulative"] = tb.groupby(["country"])["case_status"].cumsum()
+    # The per-date counts get their final name only now; everything above refers to `case_status`.
+    tb = tb.rename(columns={"case_status": "reported_cases"})
+    # NOTE(review): `format` presumably sets (country, date) as the index - confirm against Table.format.
+    tb = tb.format(["country", "date"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(
+        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
+    )
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/garden/who/latest/monkeypox/__init__.py b/etl/steps/data/garden/who/latest/monkeypox/__init__.py
@@ -31,10 +31,10 @@ def run(dest_dir: str) -> None:
     #
     # Load meadow dataset.
     ds_meadow = paths.load_dataset("monkeypox")
-    ds_suspected = paths.load_dataset("africa_cdc")
+    ds_suspected = paths.load_dataset("global_health_mpox")
     # Read table from meadow dataset.
     tb = ds_meadow["monkeypox"].reset_index()
-    tb_suspected = ds_suspected["africa_cdc"].reset_index()
+    tb_suspected = ds_suspected["global_health_mpox"].reset_index()
     cols = ["country", "date", "suspected_cases_cumulative"]
     tb_suspected = tb_suspected[cols]
     assert tb_suspected.shape[1] == len(cols)

diff --git a/etl/steps/data/meadow/health/latest/global_health_mpox.py b/etl/steps/data/meadow/health/latest/global_health_mpox.py
@@ -0,0 +1,36 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Build the meadow dataset from the Global.health mpox snapshot.
+
+    Keeps four columns of the snapshot (one row per case ID), renames the report-date
+    and country columns, and saves the result as a meadow dataset.
+
+    Args:
+        dest_dir: Forwarded unchanged to `create_dataset` as the output location.
+    """
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap = paths.load_snapshot("global_health_mpox.csv")
+
+    # Load data from snapshot.
+    tb = snap.read(low_memory=False)
+    # Keep only the columns the garden step uses: case ID, case status, country and report date.
+    tb = tb[["ID", "Case_status", "Location_Admin0", "Date_report_source_I"]]
+    # Sanity check: fail if any row has a missing report date, since `date` is used in the index below.
+    assert all(tb["Date_report_source_I"].notna())
+
+    tb = tb.rename(columns={"Date_report_source_I": "date", "Location_Admin0": "country"})
+    #
+    # Process data.
+    #
+    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
+    # One row per individual case: aggregation happens in the garden step, so the ID stays in the index for now.
+    tb = tb.format(["id", "country", "date"])
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/scripts/update-monkeypox.sh b/scripts/update-monkeypox.sh
@@ -7,6 +7,7 @@ start_time=$(date +%s)
 echo '--- Update Monkeypox'
 cd /home/owid/etl
 poetry run python snapshots/who/latest/monkeypox.py
+poetry run python snapshots/health/latest/global_health_mpox.py
 
 # Committing to master triggers the ETL, which then runs the step.
 echo '--- Commit and push changes'

diff --git a/snapshots/health/latest/global_health_mpox.csv.dvc b/snapshots/health/latest/global_health_mpox.csv.dvc
@@ -0,0 +1,27 @@
+# Learn more at:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+meta:
+  origin:
+    # Data product / Snapshot
+    title: Mpox - 2024
+    date_published: "2024-09-02"
+
+    # Citation
+    producer: Global.health
+    citation_full: |-
+      Global.health Mpox (accessed on 2024-09-02)
+
+    # Files
+    url_main: https://global.health/
+    url_download: https://mpox-2024.s3.eu-central-1.amazonaws.com/latest.csv
+    date_accessed: 2024-09-02
+
+    # License
+    license:
+      name: CC BY 4.0
+      url: https://global.health/terms-of-use/
+
+outs:
+  - md5: cf3c0ac7af89613fc2aa7e6dcdf954d0
+    size: 7902134
+    path: global_health_mpox.csv
diff --git a/snapshots/health/latest/global_health_mpox.py b/snapshots/health/latest/global_health_mpox.py
@@ -0,0 +1,24 @@
+"""Script to create a snapshot of dataset."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+# Command-line entry point for creating the Global.health mpox snapshot.
+# `--upload/--skip-upload` (default: upload) is passed through to `create_snapshot`.
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    # The version segment of the snapshot path is the name of this script's parent directory.
+    snap = Snapshot(f"health/{SNAPSHOT_VERSION}/global_health_mpox.csv")
+
+    # Download data from source, add file to DVC and upload to S3.
+    snap.create_snapshot(upload=upload)
+
+
+if __name__ == "__main__":
+    main()