Skip to content

Commit

Permalink
📊 Adding global health data for mpox (#3229)
Browse files Browse the repository at this point in the history
* adding global health data

* using global health for explorer

* update explorer

* auto-update script

* combine mpox scripts

* adding metadata

* update snapshot
  • Loading branch information
spoonerf authored Sep 9, 2024
1 parent bf1a645 commit 2957843
Show file tree
Hide file tree
Showing 9 changed files with 197 additions and 3 deletions.
10 changes: 9 additions & 1 deletion dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -720,13 +720,20 @@ steps:
- data://meadow/who/latest/monkeypox
- data://garden/demography/2023-03-31/population
- data://garden/regions/2023-01-01/regions
- data://grapher/fasttrack/latest/africa_cdc
- data://garden/health/latest/global_health_mpox
data://grapher/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
data://explorers/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
export://github/who/latest/monkeypox:
- data://garden/who/latest/monkeypox
# Mpox - Global.health
data://meadow/health/latest/global_health_mpox:
- snapshot://health/latest/global_health_mpox.csv
data://garden/health/latest/global_health_mpox:
- data://meadow/health/latest/global_health_mpox

# Eurostat cancer

# Eurostat Cancer Screening
data://meadow/health/2024-08-23/eurostat_cancer:
Expand All @@ -736,6 +743,7 @@ steps:
data://grapher/health/2024-08-23/eurostat_cancer:
- data://garden/health/2024-08-23/eurostat_cancer


# Multi-dim indicators
export://multidim/health/latest/causes_of_death:
- grapher://grapher/ihme_gbd/2024-05-20/gbd_cause
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"Burundi": "Burundi",
"Cameroon": "Cameroon",
"Central African Republic": "Central African Republic",
"Cote d'Ivoire": "Cote d'Ivoire",
"Democratic Republic of the Congo": "Democratic Republic of Congo",
"Gabon": "Gabon",
"Ghana": "Ghana",
"Kenya": "Kenya",
"Liberia": "Liberia",
"Nigeria": "Nigeria",
"Rwanda": "Rwanda",
"South Africa": "South Africa",
"Sweden": "Sweden",
"Thailand": "Thailand",
"Uganda": "Uganda",
"Burundi ": "Burundi",
"Republic of the Congo": "Congo"
}
25 changes: 25 additions & 0 deletions etl/steps/data/garden/health/latest/global_health_mpox.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Global Health
- Mpox (monkeypox)


# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 1


tables:
global_health_mpox:
variables:
reported_cases:
title: Reported suspected mpox cases
unit: cases
suspected_cases_cumulative:
title: Cumulative suspected mpox cases
unit: cases
description_processing: Data for 2024 is taken from Global.health, data for 2023 is taken from Africa CDC.
54 changes: 54 additions & 0 deletions etl/steps/data/garden/health/latest/global_health_mpox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Load a meadow dataset and create a garden dataset."""

from owid.catalog import Table
from owid.catalog import processing as pr

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset

# Resolve the paths and naming conventions that belong to this step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the garden dataset of suspected mpox cases from the meadow dataset.

    The meadow table is harmonized by country, restricted to rows whose
    ``case_status`` is ``"suspected"``, and counted per country and date.
    Three hard-coded 2023 figures are appended before a per-country running
    total is computed.

    Args:
        dest_dir: Destination directory passed on to ``create_dataset``.
    """
    #
    # Load inputs.
    #
    ds_meadow = paths.load_dataset("global_health_mpox")
    tb_records = ds_meadow["global_health_mpox"].reset_index()

    #
    # Process data.
    #
    tb_records = geo.harmonize_countries(df=tb_records, countries_file=paths.country_mapping_path)

    # Only suspected cases are of interest here.
    is_suspected = tb_records["case_status"] == "suspected"
    tb_suspected = tb_records[is_suspected]

    # Count rows per country and report date. After counting, "case_status"
    # holds the number of suspected cases; "id" carries the same count and is dropped.
    tb_counts = tb_suspected.groupby(["country", "date"], observed=True).count()
    tb_counts = tb_counts.reset_index()
    tb_counts = tb_counts.drop(columns=["id"])

    # Suspected cases for 2023, entered by hand (as strings; cast to int further down).
    baseline_2023 = {
        "country": ["Cameroon", "Congo", "Democratic Republic of Congo"],
        "date": ["2023-12-24", "2023-12-24", "2023-12-24"],
        "case_status": ["113", "74", "12985"],
    }
    tb_2023 = Table(baseline_2023)

    # Stack both sources and order them so the running total follows the dates.
    tb = pr.concat([tb_counts, tb_2023])
    tb = tb.sort_values(["country", "date"])

    # Running total of suspected cases within each country.
    tb["case_status"] = tb["case_status"].astype("int")
    cumulative = tb.groupby(["country"])["case_status"].cumsum()
    tb["suspected_cases_cumulative"] = cumulative

    tb = tb.rename(columns={"case_status": "reported_cases"})
    tb = tb.format(["country", "date"])

    #
    # Save outputs.
    #
    # The garden dataset takes its default metadata from the meadow dataset.
    ds_garden = create_dataset(
        dest_dir,
        tables=[tb],
        check_variables_metadata=True,
        default_metadata=ds_meadow.metadata,
    )
    ds_garden.save()
4 changes: 2 additions & 2 deletions etl/steps/data/garden/who/latest/monkeypox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def run(dest_dir: str) -> None:
#
# Load meadow dataset.
ds_meadow = paths.load_dataset("monkeypox")
ds_suspected = paths.load_dataset("africa_cdc")
ds_suspected = paths.load_dataset("global_health_mpox")
# Read table from meadow dataset.
tb = ds_meadow["monkeypox"].reset_index()
tb_suspected = ds_suspected["africa_cdc"].reset_index()
tb_suspected = ds_suspected["global_health_mpox"].reset_index()
cols = ["country", "date", "suspected_cases_cumulative"]
tb_suspected = tb_suspected[cols]
assert tb_suspected.shape[1] == len(cols)
Expand Down
36 changes: 36 additions & 0 deletions etl/steps/data/meadow/health/latest/global_health_mpox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder, create_dataset

# Resolve the paths and naming conventions that belong to this step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the meadow dataset from the ``global_health_mpox.csv`` snapshot.

    Four columns are kept from the snapshot, the report date is checked for
    missing values, and the location and date columns are renamed to
    ``country`` and ``date``. Aggregation is left to the garden step.

    Args:
        dest_dir: Destination directory passed on to ``create_dataset``.
    """
    #
    # Load inputs.
    #
    snap = paths.load_snapshot("global_health_mpox.csv")
    tb_raw = snap.read(low_memory=False)

    # Keep only the columns used downstream.
    wanted_columns = ["ID", "Case_status", "Location_Admin0", "Date_report_source_I"]
    tb_selected = tb_raw[wanted_columns]

    # Every row must carry a report date.
    assert all(tb_selected["Date_report_source_I"].notna())

    new_names = {"Date_report_source_I": "date", "Location_Admin0": "country"}
    tb_selected = tb_selected.rename(columns=new_names)

    #
    # Process data.
    #
    # Snake-case the columns, set the index and sort.
    # One row per individual: the garden step aggregates, so ID stays in the index for now.
    tb = tb_selected.format(["id", "country", "date"])

    #
    # Save outputs.
    #
    # The meadow dataset takes its default metadata from the snapshot.
    ds_meadow = create_dataset(
        dest_dir,
        tables=[tb],
        check_variables_metadata=True,
        default_metadata=snap.metadata,
    )
    ds_meadow.save()
1 change: 1 addition & 0 deletions scripts/update-monkeypox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ start_time=$(date +%s)
echo '--- Update Monkeypox'
cd /home/owid/etl
poetry run python snapshots/who/latest/monkeypox.py
poetry run python snapshots/health/latest/global_health_mpox.py

# commit to master will trigger ETL which is gonna run the step
echo '--- Commit and push changes'
Expand Down
27 changes: 27 additions & 0 deletions snapshots/health/latest/global_health_mpox.csv.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: Mpox - 2024
date_published: "2024-09-02"

# Citation
producer: Global.health
citation_full: |-
Global.health Mpox (accessed on 2024-09-02)

# Files
url_main: https://global.health/
url_download: https://mpox-2024.s3.eu-central-1.amazonaws.com/latest.csv
date_accessed: 2024-09-02

# License
license:
name: CC BY 4.0
url: https://global.health/terms-of-use/

outs:
- md5: cf3c0ac7af89613fc2aa7e6dcdf954d0
size: 7902134
path: global_health_mpox.csv
24 changes: 24 additions & 0 deletions snapshots/health/latest/global_health_mpox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Snapshot version, taken from the name of the directory containing this script.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Documented with comments on purpose: click would surface a docstring as the command's help text.
    # `upload` is forwarded unchanged to `Snapshot.create_snapshot`.
    snapshot_uri = f"health/{SNAPSHOT_VERSION}/global_health_mpox.csv"
    snapshot = Snapshot(snapshot_uri)

    # Download data from source, add file to DVC and upload to S3.
    snapshot.create_snapshot(upload=upload)


if __name__ == "__main__":
    main()

0 comments on commit 2957843

Please sign in to comment.