From 42432fadeedbf7d02080e9bf7f61c779946f9451 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 2 Sep 2024 11:31:39 +0200 Subject: [PATCH 1/7] adding global health data --- dag/health.yml | 4 ++ .../latest/global_health_mpox.countries.json | 19 ++++++ .../health/latest/global_health_mpox.meta.yml | 59 +++++++++++++++++++ .../health/latest/global_health_mpox.py | 53 +++++++++++++++++ .../health/latest/global_health_mpox.py | 36 +++++++++++ .../health/latest/global_health_mpox.csv.dvc | 27 +++++++++ snapshots/health/latest/global_health_mpox.py | 24 ++++++++ 7 files changed, 222 insertions(+) create mode 100644 etl/steps/data/garden/health/latest/global_health_mpox.countries.json create mode 100644 etl/steps/data/garden/health/latest/global_health_mpox.meta.yml create mode 100644 etl/steps/data/garden/health/latest/global_health_mpox.py create mode 100644 etl/steps/data/meadow/health/latest/global_health_mpox.py create mode 100644 snapshots/health/latest/global_health_mpox.csv.dvc create mode 100644 snapshots/health/latest/global_health_mpox.py diff --git a/dag/health.yml b/dag/health.yml index 2f0e2e46cb5..38d2a715624 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -731,3 +731,7 @@ steps: # Multi-dim indicators export://multidim/health/latest/causes_of_death: - grapher://grapher/ihme_gbd/2024-05-20/gbd_cause + data://meadow/health/latest/global_health_mpox: + - snapshot://health/latest/global_health_mpox.csv + data://garden/health/latest/global_health_mpox: + - data://meadow/health/latest/global_health_mpox diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.countries.json b/etl/steps/data/garden/health/latest/global_health_mpox.countries.json new file mode 100644 index 00000000000..dc1f0d32139 --- /dev/null +++ b/etl/steps/data/garden/health/latest/global_health_mpox.countries.json @@ -0,0 +1,19 @@ +{ + "Burundi": "Burundi", + "Cameroon": "Cameroon", + "Central African Republic": "Central African Republic", + "Cote d'Ivoire": "Cote d'Ivoire", + "Democratic Republic of the Congo": "Democratic Republic of Congo", + "Gabon": "Gabon", + "Ghana": "Ghana", + "Kenya": "Kenya", + "Liberia": "Liberia", + "Nigeria": "Nigeria", + "Rwanda": "Rwanda", + "South Africa": "South Africa", + "Sweden": "Sweden", + "Thailand": "Thailand", + "Uganda": "Uganda", + "Burundi ": "Burundi", + "Republic of the Congo": "Congo" +} \ No newline at end of file diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml new file mode 100644 index 00000000000..aa5729f38c6 --- /dev/null +++ b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml @@ -0,0 +1,59 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Global Health + - Mpox (monkeypox) + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 1 + + +tables: + global_health_mpox: + variables: + # testing_variable: + # title: Testing variable title + # unit: arbitrary units + # short_unit: au + # description_short: Short description of testing variable. + # description_processing: Description of processing of testing variable. + # description_key: List of key points about the indicator. + # description_from_producer: Description of testing variable from producer. + # processing_level: minor + # type: + # sort: + # presentation: + # attribution: + # attribution_short: + # faqs: + # grapher_config: + # title_public: + # title_variant: + # topic_tags: + # display: + # name: Testing variable + # numDecimalPlaces: 0 + # tolerance: 0 + # color: + # conversionFactor: 1 + # description: + # entityAnnotationsMap: Test annotation + # includeInTable: + # isProjection: false + # unit: arbitrary units + # shortUnit: au + # tableDisplay: + # hideAbsoluteChange: + # hideRelativeChange: + # yearIsDay: false + # zeroDay: + # roundingMode: + # numSignificantFigures: + # + {} + diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.py b/etl/steps/data/garden/health/latest/global_health_mpox.py new file mode 100644 index 00000000000..7c062893670 --- /dev/null +++ b/etl/steps/data/garden/health/latest/global_health_mpox.py @@ -0,0 +1,53 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from owid.catalog import Table +from owid.catalog import processing as pr + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("global_health_mpox") + + # Read table from meadow dataset. + tb = ds_meadow["global_health_mpox"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path) + tb = tb[tb["case_status"] == "suspected"] + # Calculate the frequency of suspected cases per reported date + tb = tb.groupby(["country", "date"], observed=True).count().reset_index().drop(columns=["id"]) + # add suspected cases for 2023 + tb_2023 = Table( + { + "country": ["Cameroon", "Congo", "Democratic Republic of Congo"], + "date": ["2023-12-24", "2023-12-24", "2023-12-24"], + "case_status": ["113", "74", "12985"], + } + ) + tb = pr.concat([tb, tb_2023]).sort_values(["country", "date"]) + # Calculate the cumulative + tb["case_status"] = tb["case_status"].astype("int") + tb["cumulative_cases"] = tb.groupby(["country"])["case_status"].cumsum() + tb = tb.format(["country", "date"]) + + # + # Save outputs. + # + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() diff --git a/etl/steps/data/meadow/health/latest/global_health_mpox.py b/etl/steps/data/meadow/health/latest/global_health_mpox.py new file mode 100644 index 00000000000..1bb6d2ad308 --- /dev/null +++ b/etl/steps/data/meadow/health/latest/global_health_mpox.py @@ -0,0 +1,36 @@ +"""Load a snapshot and create a meadow dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Retrieve snapshot. + snap = paths.load_snapshot("global_health_mpox.csv") + + # Load data from snapshot. + tb = snap.read(low_memory=False) + tb = tb[["ID", "Case_status", "Location_Admin0", "Date_report_source_I"]] + assert all(tb["Date_report_source_I"].notna()) + + tb = tb.rename(columns={"Date_report_source_I": "date", "Location_Admin0": "country"}) + # + # Process data. + # + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + # Row per individual - will aggregate in garden step so will keep ID as index for now + tb = tb.format(["id", "country", "date"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. + ds_meadow.save() diff --git a/snapshots/health/latest/global_health_mpox.csv.dvc b/snapshots/health/latest/global_health_mpox.csv.dvc new file mode 100644 index 00000000000..fe2506c0129 --- /dev/null +++ b/snapshots/health/latest/global_health_mpox.csv.dvc @@ -0,0 +1,27 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: Mpox - 2024 + date_published: "2024-09-02" + + # Citation + producer: Global.health + citation_full: |- + Global.health Mpox (accessed on 2024-09-02) + + # Files + url_main: https://global.health/ + url_download: https://mpox-2024.s3.eu-central-1.amazonaws.com/latest.csv + date_accessed: 2024-09-02 + + # License + license: + name: CC BY 4.0 + url: https://global.health/terms-of-use/ + +outs: + - md5: d08ecccd90b6f83761adb67b1681471a + size: 6744395 + path: global_health_mpox.csv diff --git a/snapshots/health/latest/global_health_mpox.py b/snapshots/health/latest/global_health_mpox.py new file mode 100644 index 00000000000..eb8e11a15b6 --- /dev/null +++ b/snapshots/health/latest/global_health_mpox.py @@ -0,0 +1,24 @@ +"""Script to create a snapshot of dataset.""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"health/{SNAPSHOT_VERSION}/global_health_mpox.csv") + + # Download data from source, add file to DVC and upload to S3. + snap.create_snapshot(upload=upload) + + +if __name__ == "__main__": + main() From a9bd894d78585b51766c12670ef72edeeae95295 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 2 Sep 2024 11:40:08 +0200 Subject: [PATCH 2/7] using global health for explorer --- dag/health.yml | 16 +++++-- .../health/latest/global_health_mpox.meta.yml | 48 +++---------------- .../health/latest/global_health_mpox.py | 3 +- .../garden/who/latest/monkeypox/__init__.py | 2 +- 4 files changed, 21 insertions(+), 48 deletions(-) diff --git a/dag/health.yml b/dag/health.yml index 38d2a715624..337df088553 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -714,13 +714,20 @@ steps: - data://meadow/who/latest/monkeypox - data://garden/demography/2023-03-31/population - data://garden/regions/2023-01-01/regions - - data://grapher/fasttrack/latest/africa_cdc + - data://garden/health/latest/global_health_mpox data://grapher/who/latest/monkeypox: - data://garden/who/latest/monkeypox data://explorers/who/latest/monkeypox: - data://garden/who/latest/monkeypox export://github/who/latest/monkeypox: - data://garden/who/latest/monkeypox +# Mpox - Global.health + data://meadow/health/latest/global_health_mpox: + - snapshot://health/latest/global_health_mpox.csv + data://garden/health/latest/global_health_mpox: + - data://meadow/health/latest/global_health_mpox + + # Eurostat cancer data://meadow/health/2024-08-23/eurostat_cancer: - snapshot://health/2024-08-23/eurostat_cancer.csv data://garden/health/2024-08-23/eurostat_cancer: @@ -728,10 +735,9 @@ steps: data://grapher/health/2024-08-23/eurostat_cancer: - data://garden/health/2024-08-23/eurostat_cancer + # Multi-dim indicators export://multidim/health/latest/causes_of_death: - grapher://grapher/ihme_gbd/2024-05-20/gbd_cause - data://meadow/health/latest/global_health_mpox: - - snapshot://health/latest/global_health_mpox.csv - data://garden/health/latest/global_health_mpox: - - data://meadow/health/latest/global_health_mpox + + diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml index aa5729f38c6..5ab0f544202 100644 --- a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml +++ b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml @@ -16,44 +16,10 @@ dataset: tables: global_health_mpox: variables: - # testing_variable: - # title: Testing variable title - # unit: arbitrary units - # short_unit: au - # description_short: Short description of testing variable. - # description_processing: Description of processing of testing variable. - # description_key: List of key points about the indicator. - # description_from_producer: Description of testing variable from producer. - # processing_level: minor - # type: - # sort: - # presentation: - # attribution: - # attribution_short: - # faqs: - # grapher_config: - # title_public: - # title_variant: - # topic_tags: - # display: - # name: Testing variable - # numDecimalPlaces: 0 - # tolerance: 0 - # color: - # conversionFactor: 1 - # description: - # entityAnnotationsMap: Test annotation - # includeInTable: - # isProjection: false - # unit: arbitrary units - # shortUnit: au - # tableDisplay: - # hideAbsoluteChange: - # hideRelativeChange: - # yearIsDay: false - # zeroDay: - # roundingMode: - # numSignificantFigures: - # - {} - + reported_cases: + title: Reported mpox cases + unit: cases + cumulative_cases: + title: Cumulative mpox cases + unit: cases + description_processing: Data for 2024 is taken from Global.health, data for 2023 is taken from Africa CDC. diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.py b/etl/steps/data/garden/health/latest/global_health_mpox.py index 7c062893670..f81cba36052 100644 --- a/etl/steps/data/garden/health/latest/global_health_mpox.py +++ b/etl/steps/data/garden/health/latest/global_health_mpox.py @@ -38,7 +38,8 @@ def run(dest_dir: str) -> None: tb = pr.concat([tb, tb_2023]).sort_values(["country", "date"]) # Calculate the cumulative tb["case_status"] = tb["case_status"].astype("int") - tb["cumulative_cases"] = tb.groupby(["country"])["case_status"].cumsum() + tb["suspected_cases_cumulative_cases"] = tb.groupby(["country"])["case_status"].cumsum() + tb = tb.rename(columns={"case_status": "reported_cases"}) tb = tb.format(["country", "date"]) # diff --git a/etl/steps/data/garden/who/latest/monkeypox/__init__.py b/etl/steps/data/garden/who/latest/monkeypox/__init__.py index 77aa6331aa4..3322560e9a8 100644 --- a/etl/steps/data/garden/who/latest/monkeypox/__init__.py +++ b/etl/steps/data/garden/who/latest/monkeypox/__init__.py @@ -28,7 +28,7 @@ def run(dest_dir: str) -> None: # # Load meadow dataset. ds_meadow = paths.load_dataset("monkeypox") - ds_suspected = paths.load_dataset("africa_cdc") + ds_suspected = paths.load_dataset("global_health_mpox") # Read table from meadow dataset. tb = ds_meadow["monkeypox"].reset_index() tb_suspected = ds_suspected["africa_cdc"].reset_index() From 1bf94893c06c23497851a7249cf0450bc4e6adb6 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 2 Sep 2024 12:01:53 +0200 Subject: [PATCH 3/7] update explorer --- etl/steps/data/garden/health/latest/global_health_mpox.meta.yml | 2 +- etl/steps/data/garden/health/latest/global_health_mpox.py | 2 +- etl/steps/data/garden/who/latest/monkeypox/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml index 5ab0f544202..d3001d7ecdc 100644 --- a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml +++ b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml @@ -19,7 +19,7 @@ tables: reported_cases: title: Reported mpox cases unit: cases - cumulative_cases: + suspected_cases_cumulative: title: Cumulative mpox cases unit: cases description_processing: Data for 2024 is taken from Global.health, data for 2023 is taken from Africa CDC. diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.py b/etl/steps/data/garden/health/latest/global_health_mpox.py index f81cba36052..02d81668321 100644 --- a/etl/steps/data/garden/health/latest/global_health_mpox.py +++ b/etl/steps/data/garden/health/latest/global_health_mpox.py @@ -38,7 +38,7 @@ def run(dest_dir: str) -> None: tb = pr.concat([tb, tb_2023]).sort_values(["country", "date"]) # Calculate the cumulative tb["case_status"] = tb["case_status"].astype("int") - tb["suspected_cases_cumulative_cases"] = tb.groupby(["country"])["case_status"].cumsum() + tb["suspected_cases_cumulative"] = tb.groupby(["country"])["case_status"].cumsum() tb = tb.rename(columns={"case_status": "reported_cases"}) tb = tb.format(["country", "date"]) diff --git a/etl/steps/data/garden/who/latest/monkeypox/__init__.py b/etl/steps/data/garden/who/latest/monkeypox/__init__.py index 3322560e9a8..d560ef62846 100644 --- a/etl/steps/data/garden/who/latest/monkeypox/__init__.py +++ b/etl/steps/data/garden/who/latest/monkeypox/__init__.py @@ -31,7 +31,7 @@ def run(dest_dir: str) -> None: ds_suspected = paths.load_dataset("global_health_mpox") # Read table from meadow dataset. tb = ds_meadow["monkeypox"].reset_index() - tb_suspected = ds_suspected["africa_cdc"].reset_index() + tb_suspected = ds_suspected["global_health_mpox"].reset_index() cols = ["country", "date", "suspected_cases_cumulative"] tb_suspected = tb_suspected[cols] assert tb_suspected.shape[1] == len(cols) From a99c967fd5d903c55134d448079cbde1a5c2599f Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 2 Sep 2024 12:31:59 +0200 Subject: [PATCH 4/7] auto-update script --- scripts/update-global-health-mpox.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 scripts/update-global-health-mpox.sh diff --git a/scripts/update-global-health-mpox.sh b/scripts/update-global-health-mpox.sh new file mode 100644 index 00000000000..66966a5a925 --- /dev/null +++ b/scripts/update-global-health-mpox.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -e + +start_time=$(date +%s) + +echo '--- Update Mpox - Global Health' +cd /home/owid/etl +poetry run python snapshots/health/latest/global_health_mpox.py + +# commit to master will trigger ETL which is gonna run the step +echo '--- Commit and push changes' + +git add . +git commit -m ":robot: update: monkeypox" || true +git push origin master -q || true + +end_time=$(date +%s) + +echo "--- Done! ($(($end_time - $start_time))s)" From adf7144e712a15111a8ab9b741399e521524d1c5 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 2 Sep 2024 14:11:11 +0200 Subject: [PATCH 5/7] combine mpox scripts --- scripts/update-global-health-mpox.sh | 20 -------------------- scripts/update-monkeypox.sh | 1 + 2 files changed, 1 insertion(+), 20 deletions(-) delete mode 100644 scripts/update-global-health-mpox.sh diff --git a/scripts/update-global-health-mpox.sh b/scripts/update-global-health-mpox.sh deleted file mode 100644 index 66966a5a925..00000000000 --- a/scripts/update-global-health-mpox.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -e - -start_time=$(date +%s) - -echo '--- Update Mpox - Global Health' -cd /home/owid/etl -poetry run python snapshots/health/latest/global_health_mpox.py - -# commit to master will trigger ETL which is gonna run the step -echo '--- Commit and push changes' - -git add . -git commit -m ":robot: update: monkeypox" || true -git push origin master -q || true - -end_time=$(date +%s) - -echo "--- Done! ($(($end_time - $start_time))s)" diff --git a/scripts/update-monkeypox.sh b/scripts/update-monkeypox.sh index de167e9f905..b07fd8b7d5c 100755 --- a/scripts/update-monkeypox.sh +++ b/scripts/update-monkeypox.sh @@ -7,6 +7,7 @@ start_time=$(date +%s) echo '--- Update Monkeypox' cd /home/owid/etl poetry run python snapshots/who/latest/monkeypox.py +poetry run python snapshots/health/latest/global_health_mpox.py # commit to master will trigger ETL which is gonna run the step echo '--- Commit and push changes' From ab9299f21273729a7e1086e6b015376a52fd46fb Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 2 Sep 2024 14:58:19 +0200 Subject: [PATCH 6/7] adding metadata --- .../data/garden/health/latest/global_health_mpox.meta.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml index d3001d7ecdc..e2b44e192a7 100644 --- a/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml +++ b/etl/steps/data/garden/health/latest/global_health_mpox.meta.yml @@ -17,9 +17,9 @@ tables: global_health_mpox: variables: reported_cases: - title: Reported mpox cases + title: Reported suspected mpox cases unit: cases suspected_cases_cumulative: - title: Cumulative mpox cases + title: Cumulative suspected mpox cases unit: cases description_processing: Data for 2024 is taken from Global.health, data for 2023 is taken from Africa CDC. From 0d209436d32ea375c10cfdac81a12a588e1762e9 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 9 Sep 2024 11:31:56 +0100 Subject: [PATCH 7/7] update snapshot --- snapshots/health/latest/global_health_mpox.csv.dvc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshots/health/latest/global_health_mpox.csv.dvc b/snapshots/health/latest/global_health_mpox.csv.dvc index fe2506c0129..f11786237a3 100644 --- a/snapshots/health/latest/global_health_mpox.csv.dvc +++ b/snapshots/health/latest/global_health_mpox.csv.dvc @@ -22,6 +22,6 @@ meta: url: https://global.health/terms-of-use/ outs: - - md5: d08ecccd90b6f83761adb67b1681471a - size: 6744395 + - md5: cf3c0ac7af89613fc2aa7e6dcdf954d0 + size: 7902134 path: global_health_mpox.csv