diff --git a/dag/animal_welfare.yml b/dag/animal_welfare.yml
new file mode 100644
index 00000000000..1927810e370
--- /dev/null
+++ b/dag/animal_welfare.yml
@@ -0,0 +1,10 @@
+steps:
+  #
+  # Global hen inventory (Welfare Footprint Project, 2023).
+  #
+  data://meadow/animal_welfare/2023-08-01/global_hen_inventory:
+    - snapshot://animal_welfare/2023-08-01/global_hen_inventory.csv
+  data://garden/animal_welfare/2023-08-01/global_hen_inventory:
+    - data://meadow/animal_welfare/2023-08-01/global_hen_inventory
+  data://grapher/animal_welfare/2023-08-01/global_hen_inventory:
+    - data://garden/animal_welfare/2023-08-01/global_hen_inventory
diff --git a/dag/main.yml b/dag/main.yml
index 81f9d038ec2..441e6d460d8 100644
--- a/dag/main.yml
+++ b/dag/main.yml
@@ -562,3 +562,4 @@ include:
   - dag/space.yml
   - dag/artificial_intelligence.yml
   - dag/covid.yml
+  - dag/animal_welfare.yml
diff --git a/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.countries.json b/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.countries.json
new file mode 100644
index 00000000000..595ce477377
--- /dev/null
+++ b/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.countries.json
@@ -0,0 +1,82 @@
+{
+  "Algeria": "Algeria",
+  "Argentina": "Argentina",
+  "Armenia": "Armenia",
+  "Australia": "Australia",
+  "Austria": "Austria",
+  "Azerbaijan": "Azerbaijan",
+  "Bangladesh": "Bangladesh",
+  "Belarus": "Belarus",
+  "Belgium": "Belgium",
+  "Brazil": "Brazil",
+  "Brunei": "Brunei",
+  "Bulgaria": "Bulgaria",
+  "Canada": "Canada",
+  "Chile": "Chile",
+  "China": "China",
+  "Colombia": "Colombia",
+  "Croatia": "Croatia",
+  "Cuba": "Cuba",
+  "Cyprus": "Cyprus",
+  "Czech Republic": "Czechia",
+  "Denmark": "Denmark",
+  "Ecuador": "Ecuador",
+  "Egypt": "Egypt",
+  "Estonia": "Estonia",
+  "Finland": "Finland",
+  "France": "France",
+  "Georgia": "Georgia",
+  "Germany": "Germany",
+  "Ghana": "Ghana",
+  "Hungary": "Hungary",
+  "India": "India",
+  "Indonesia": "Indonesia",
+  "Iran": "Iran",
+  "Ireland": "Ireland",
+  "Italy": "Italy",
+  "Japan": "Japan",
+  "Kazakhstan": "Kazakhstan",
+  "Latvia": "Latvia",
+  "Libya": "Libya",
+  "Lithuania": "Lithuania",
+  "Luxembourg": "Luxembourg",
+  "Malaysia": "Malaysia",
+  "Malta": "Malta",
+  "Mexico": "Mexico",
+  "Netherlands": "Netherlands",
+  "New Zealand": "New Zealand",
+  "Nigeria": "Nigeria",
+  "Norway": "Norway",
+  "Pakistan": "Pakistan",
+  "Papua New Guinea": "Papua New Guinea",
+  "Peru": "Peru",
+  "Poland": "Poland",
+  "Portugal": "Portugal",
+  "Republic of Korea": "South Korea",
+  "Romania": "Romania",
+  "Russia": "Russia",
+  "Saudi Arabia": "Saudi Arabia",
+  "Singapore": "Singapore",
+  "Slovakia": "Slovakia",
+  "Slovenia": "Slovenia",
+  "South Africa": "South Africa",
+  "Spain": "Spain",
+  "Sweden": "Sweden",
+  "Switzerland": "Switzerland",
+  "Taiwan": "Taiwan",
+  "Thailand": "Thailand",
+  "Turkey": "Turkey",
+  "Turkmenistan": "Turkmenistan",
+  "Ukraine": "Ukraine",
+  "United Arab Emirates": "United Arab Emirates",
+  "United Kingdom": "United Kingdom",
+  "United States": "United States",
+  "Uruguay": "Uruguay",
+  "Uzbekistan": "Uzbekistan",
+  "Venezuela": "Venezuela",
+  "Vietnam": "Vietnam",
+  "Kyrgystan": "Kyrgyzstan",
+  "Morrocco": "Morocco",
+  "Philipines": "Philippines",
+  "Taijikistan": "Tajikistan"
+}
\ No newline at end of file
diff --git a/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.meta.yml b/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.meta.yml
new file mode 100644
index 00000000000..a6b91006d7f
--- /dev/null
+++ b/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.meta.yml
@@ -0,0 +1,47 @@
+dataset:
+  title: Global hen inventory (Welfare Footprint Project, 2022)
+
+tables:
+  global_hen_inventory:
+    title: Global hen inventory
+    variables:
+      share_of_hens_in_barns:
+        title: Share of hens housed in a barn or aviary
+        unit: '%'
+        short_unit: "%"
+      share_of_brown_hens:
+        title: Share of all hens that are brown
+        unit: '%'
+        short_unit: '%'
+      share_of_hens_in_cages:
+        title: Share of hens in cages
+        unit: '%'
+        short_unit: '%'
+      number_of_hens_cage_free:
+        title: Number of cage-free hens
+        unit: 'hens'
+        short_unit: ''
+      number_of_hens_in_cages:
+        title: Number of hens in cages
+        unit: 'hens'
+        short_unit: ''
+      number_of_commercial_egg_farms:
+        title: Number of commercial egg farms
+        unit: 'farms'
+        short_unit: ''
+      share_of_hens_free_range_not_organic:
+        title: Share of non-organic, free-range hens
+        unit: '%'
+        short_unit: '%'
+      share_of_hens_free_range_organic:
+        title: Share of organic, free-range hens
+        unit: '%'
+        short_unit: '%'
+      number_of_laying_hens:
+        title: Number of laying hens
+        unit: 'hens'
+        short_unit: ''
+      share_of_hens_in_unknown_housing:
+        title: Share of laying hens in unknown housing
+        unit: '%'
+        short_unit: '%'
diff --git a/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.py b/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.py
new file mode 100644
index 00000000000..0e4f7b191b1
--- /dev/null
+++ b/etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.py
@@ -0,0 +1,131 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Dataset, Table
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Columns to select from data, and how to rename them.
+COLUMNS = {
+    "country": "country",
+    "year": "year",
+    "barn": "share_of_hens_in_barns",
+    "brown__pct": "share_of_brown_hens",
+    "cage": "share_of_hens_in_cages",
+    "cage_free": "number_of_hens_cage_free",
+    "cages": "number_of_hens_in_cages",
+    "commercial_egg_farms": "number_of_commercial_egg_farms",
+    "free_range": "share_of_hens_free_range_not_organic",
+    "organic": "share_of_hens_free_range_organic",
+    "total_layers": "number_of_laying_hens",
+    "unknown": "share_of_hens_in_unknown_housing",
+    # The following columns will be used to extract sources and urls and add them to the source metadata description.
+    # Afterwards, they will be removed.
+    "available_at": "available_at",
+    "source": "source",
+    # "click_placements" : "click_placements",
+    # "number_of_records": "number_of_records",
+}
+
+
+def add_individual_sources_to_metadata(tb: Table) -> Table:
+    tb = tb.copy()
+    # Check that each country has only one source.
+    assert (tb.groupby("country").agg({"source": "nunique"})["source"] == 1).all(), "Expected one source per country."
+    # Gather the data source for each country.
+    original_sources = (
+        "- "
+        + tb["country"].astype(str)
+        + ": "
+        + tb["source"].astype(str)
+        + " Available at "
+        + tb["available_at"].astype(str)
+    )
+    # Check that each variable has only one source.
+    assert all(
+        [len(tb[column].metadata.sources) == 1 for column in tb.columns if column not in ["country", "year"]]
+    ), "Expected only one source. Something has changed."
+    # Take the source from any of those variables.
+    source = tb[tb.columns[-1]].metadata.sources[0]
+    # Add the full list of original sources to the variable source.
+    source.published_by = source.published_by + "\n" + "\n".join(original_sources)
+    # Replace the source of each variable with the new one that has the full list of original sources.
+    for column in tb.columns:
+        tb[column].metadata.sources = [source]
+
+    return tb
+
+
+def clean_values(tb: Table) -> Table:
+    tb = tb.copy()
+    # Remove the spurious "%" symbols from some of the values in some columns.
+    for column in tb.columns:
+        if tb[column].astype(str).str.contains("%").any():
+            tb[column] = tb[column].str.replace("%", "").astype(float)
+
+    return tb
+
+
+def run_sanity_checks_on_outputs(tb: Table) -> None:
+    assert all([tb[column].min() >= 0 for column in tb.columns]), "All numbers should be >= 0"
+    assert all([tb[column].max() <= 100 for column in tb.columns if "share" in column]), "Percentages should be <= 100"
+    # Check that the percentages of the different laying hens housings add up to 100%.
+    # Note: The share of brown hens is not related to all other shares about housing systems.
+    assert (
+        tb[
+            [
+                "share_of_hens_free_range_not_organic",
+                "share_of_hens_free_range_organic",
+                "share_of_hens_in_barns",
+                "share_of_hens_in_cages",
+                "share_of_hens_in_unknown_housing",
+            ]
+        ].sum(axis=1)
+        < 101
+    ).all()
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset and read its main table.
+    ds_meadow: Dataset = paths.load_dependency("global_hen_inventory")
+    tb = ds_meadow["global_hen_inventory"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Select and rename columns.
+    tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise")
+
+    # Harmonize country names.
+    tb: Table = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
+
+    # The sources and URLs of the data for each country are given as separate columns.
+    # Gather them and add them to the source description of each variable.
+    tb = add_individual_sources_to_metadata(tb=tb)
+
+    # Drop unnecessary columns.
+    tb = tb.drop(columns=["available_at", "source"])
+
+    # Clean data (remove spurious "%" in the data).
+    tb = clean_values(tb=tb)
+
+    # Set an appropriate index and sort conveniently.
+    tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1)
+
+    # Run sanity checks on outputs.
+    run_sanity_checks_on_outputs(tb=tb)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/animal_welfare/2023-08-01/global_hen_inventory.py b/etl/steps/data/grapher/animal_welfare/2023-08-01/global_hen_inventory.py
new file mode 100644
index 00000000000..66d425fb934
--- /dev/null
+++ b/etl/steps/data/grapher/animal_welfare/2023-08-01/global_hen_inventory.py
@@ -0,0 +1,33 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from typing import cast
+
+from owid.catalog import Dataset
+
+from etl.helpers import PathFinder, create_dataset, grapher_checks
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset and read its main table.
+    ds_garden = cast(Dataset, paths.load_dependency("global_hen_inventory"))
+    tb = ds_garden["global_hen_inventory"]
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
+
+    #
+    # Checks.
+    #
+    grapher_checks(ds_grapher)
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/animal_welfare/2023-08-01/global_hen_inventory.py b/etl/steps/data/meadow/animal_welfare/2023-08-01/global_hen_inventory.py
new file mode 100644
index 00000000000..63e6a322153
--- /dev/null
+++ b/etl/steps/data/meadow/animal_welfare/2023-08-01/global_hen_inventory.py
@@ -0,0 +1,38 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+from etl.snapshot import Snapshot
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap: Snapshot = paths.load_dependency("global_hen_inventory.csv")
+
+    # Load data from snapshot.
+    tb = pr.read_csv(snap.path, sep="\t", encoding="utf-16", engine="python", metadata=snap.to_table_metadata())
+
+    #
+    # Process data.
+    #
+    # Ensure all columns are snake-case.
+    tb = tb.underscore()
+
+    # Set an appropriate index and sort conveniently.
+    tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1)
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/animal_welfare/2023-08-01/global_hen_inventory.csv.dvc b/snapshots/animal_welfare/2023-08-01/global_hen_inventory.csv.dvc
new file mode 100644
index 00000000000..195010c97a7
--- /dev/null
+++ b/snapshots/animal_welfare/2023-08-01/global_hen_inventory.csv.dvc
@@ -0,0 +1,21 @@
+meta:
+  name: Global hen inventory
+  publication_year: 2022
+
+  publication_date: '2022-12-06'
+  source_name: Welfare Footprint Project
+  source_published_by: Welfare Footprint Project based on various sources.
+  url: https://welfarefootprint.org/research-projects/laying-hens/
+  source_data_url:
+  file_extension: csv
+  license_url: https://welfarefootprint.org/research-projects/laying-hens/
+  license_name: Creative Commons BY
+  date_accessed: 2023-08-01
+  is_public: true
+  description: |
+
+wdir: ../../../data/snapshots/animal_welfare/2023-08-01
+outs:
+- md5: 5f8cb302d78da91870da11136e9fdd5a
+  size: 41872
+  path: global_hen_inventory.csv
diff --git a/snapshots/animal_welfare/2023-08-01/global_hen_inventory.py b/snapshots/animal_welfare/2023-08-01/global_hen_inventory.py
new file mode 100644
index 00000000000..c75a5611d42
--- /dev/null
+++ b/snapshots/animal_welfare/2023-08-01/global_hen_inventory.py
@@ -0,0 +1,50 @@
+"""Script to create a snapshot of dataset 'Global hen inventory'.
+
+The data is manually downloaded from the first Tableau dashboard shown in:
+https://welfarefootprint.org/research-projects/laying-hens/
+
+To download the data:
+* Go to: https://public.tableau.com/views/GlobalHenInventory-Reduced/Dashboard1
+* Click on the download button on the bottom-right corner of the dashboard.
+* Select "Data" in the pop-up message, which will open another window showing a data table.
+* In that new window, there is a button "Show fields" on the upper-right corner of the window. Select all fields.
+* Next to that button, click on "Download", which will download a csv file.
+
+Then execute this script with the argument --path-to-file followed by the path to the downloaded file.
+
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option(
+    "--upload/--skip-upload",
+    default=True,
+    type=bool,
+    help="Upload dataset to Snapshot",
+)
+@click.option("--path-to-file", prompt=True, type=str, help="Path to local data file.")
+def main(path_to_file: str, upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"animal_welfare/{SNAPSHOT_VERSION}/global_hen_inventory.csv")
+
+    # Ensure destination folder exists.
+    snap.path.parent.mkdir(exist_ok=True, parents=True)
+
+    # Copy local data file to snapshots data folder.
+    snap.path.write_bytes(Path(path_to_file).read_bytes())
+
+    # Add file to DVC and upload to S3.
+    snap.dvc_add(upload=upload)
+
+
+if __name__ == "__main__":
+    main()