Add global hen inventory dataset #1404

Merged (3 commits) on Aug 2, 2023
10 changes: 10 additions & 0 deletions dag/animal_welfare.yml
@@ -0,0 +1,10 @@
steps:
  #
  # Global hen inventory (Welfare Footprint Project, 2023).
  #
  data://meadow/animal_welfare/2023-08-01/global_hen_inventory:
    - snapshot://animal_welfare/2023-08-01/global_hen_inventory.csv
  data://garden/animal_welfare/2023-08-01/global_hen_inventory:
    - data://meadow/animal_welfare/2023-08-01/global_hen_inventory
  data://grapher/animal_welfare/2023-08-01/global_hen_inventory:
    - data://garden/animal_welfare/2023-08-01/global_hen_inventory
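
Note on the step chain: these DAG entries define the usual snapshot → meadow → garden → grapher dependency chain for this dataset; the meadow step ingests the snapshot CSV, the garden step harmonizes and sanity-checks it, and the grapher step republishes the garden table for charting. As a rough sketch of how the garden output could be inspected locally (assuming the ETL has already been run and writes to its default data/ directory; the path below is an assumption, not part of this PR):

import owid.catalog

# Hypothetical local catalog path; adjust to wherever the ETL writes its output.
ds = owid.catalog.Dataset("data/garden/animal_welfare/2023-08-01/global_hen_inventory")
tb = ds["global_hen_inventory"]
print(tb.head())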
1 change: 1 addition & 0 deletions dag/main.yml
@@ -562,3 +562,4 @@ include:
  - dag/space.yml
  - dag/artificial_intelligence.yml
  - dag/covid.yml
  - dag/animal_welfare.yml
82 changes: 82 additions & 0 deletions etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.countries.json
@@ -0,0 +1,82 @@
{
"Algeria": "Algeria",
"Argentina": "Argentina",
"Armenia": "Armenia",
"Australia": "Australia",
"Austria": "Austria",
"Azerbaijan": "Azerbaijan",
"Bangladesh": "Bangladesh",
"Belarus": "Belarus",
"Belgium": "Belgium",
"Brazil": "Brazil",
"Brunei": "Brunei",
"Bulgaria": "Bulgaria",
"Canada": "Canada",
"Chile": "Chile",
"China": "China",
"Colombia": "Colombia",
"Croatia": "Croatia",
"Cuba": "Cuba",
"Cyprus": "Cyprus",
"Czech Republic": "Czechia",
"Denmark": "Denmark",
"Ecuador": "Ecuador",
"Egypt": "Egypt",
"Estonia": "Estonia",
"Finland": "Finland",
"France": "France",
"Georgia": "Georgia",
"Germany": "Germany",
"Ghana": "Ghana",
"Hungary": "Hungary",
"India": "India",
"Indonesia": "Indonesia",
"Iran": "Iran",
"Ireland": "Ireland",
"Italy": "Italy",
"Japan": "Japan",
"Kazakhstan": "Kazakhstan",
"Latvia": "Latvia",
"Libya": "Libya",
"Lithuania": "Lithuania",
"Luxembourg": "Luxembourg",
"Malaysia": "Malaysia",
"Malta": "Malta",
"Mexico": "Mexico",
"Netherlands": "Netherlands",
"New Zealand": "New Zealand",
"Nigeria": "Nigeria",
"Norway": "Norway",
"Pakistan": "Pakistan",
"Papua New Guinea": "Papua New Guinea",
"Peru": "Peru",
"Poland": "Poland",
"Portugal": "Portugal",
"Republic of Korea": "South Korea",
"Romania": "Romania",
"Russia": "Russia",
"Saudi Arabia": "Saudi Arabia",
"Singapore": "Singapore",
"Slovakia": "Slovakia",
"Slovenia": "Slovenia",
"South Africa": "South Africa",
"Spain": "Spain",
"Sweden": "Sweden",
"Switzerland": "Switzerland",
"Taiwan": "Taiwan",
"Thailand": "Thailand",
"Turkey": "Turkey",
"Turkmenistan": "Turkmenistan",
"Ukraine": "Ukraine",
"United Arab Emirates": "United Arab Emirates",
"United Kingdom": "United Kingdom",
"United States": "United States",
"Uruguay": "Uruguay",
"Uzbekistan": "Uzbekistan",
"Venezuela": "Venezuela",
"Vietnam": "Vietnam",
"Kyrgystan": "Kyrgyzstan",
"Morrocco": "Morocco",
"Philipines": "Philippines",
"Taijikistan": "Tajikistan"
}
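
Note on the mapping: keys are country names exactly as they appear in the source (including misspellings such as "Morrocco" and "Philipines"), and values are OWID's canonical names; the garden step applies it via geo.harmonize_countries. A simplified sketch of the effect, using a toy frame and a small subset of the mapping (the real helper also warns about source names missing from the file):

import pandas as pd

# Illustrative subset of the mapping above.
mapping = {"Czech Republic": "Czechia", "Morrocco": "Morocco", "Philipines": "Philippines"}
df = pd.DataFrame({"country": ["Czech Republic", "Morrocco", "Australia"], "year": [2022, 2022, 2022]})
# Map known names to their harmonized form and leave the rest unchanged.
df["country"] = df["country"].map(mapping).fillna(df["country"])
print(df)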
47 changes: 47 additions & 0 deletions etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.meta.yml
@@ -0,0 +1,47 @@
dataset:
  title: Global hen inventory (Welfare Footprint Project, 2022)

tables:
  global_hen_inventory:
    title: Global hen inventory
    variables:
      share_of_hens_in_barns:
        title: Share of hens housed in a barn or aviary
        unit: '%'
        short_unit: "%"
      share_of_brown_hens:
        title: Share of all hens that are brown
        unit: '%'
        short_unit: '%'
      share_of_hens_in_cages:
        title: Share of hens in cages
        unit: '%'
        short_unit: '%'
      number_of_hens_cage_free:
        title: Number of cage-free hens
        unit: 'hens'
        short_unit: ''
      number_of_hens_in_cages:
        title: Number of hens in cages
        unit: 'hens'
        short_unit: ''
      number_of_commercial_egg_farms:
        title: Number of commercial egg farms
        unit: 'farms'
        short_unit: ''
      share_of_hens_free_range_not_organic:
        title: Share of non-organic, free-range hens
        unit: '%'
        short_unit: '%'
      share_of_hens_free_range_organic:
        title: Share of organic, free-range hens
        unit: '%'
        short_unit: '%'
      number_of_laying_hens:
        title: Number of laying hens
        unit: 'hens'
        short_unit: ''
      share_of_hens_in_unknown_housing:
        title: Share of laying hens in unknown housing
        unit: '%'
        short_unit: '%'
131 changes: 131 additions & 0 deletions etl/steps/data/garden/animal_welfare/2023-08-01/global_hen_inventory.py
@@ -0,0 +1,131 @@
"""Load a meadow dataset and create a garden dataset."""

from owid.catalog import Dataset, Table

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Columns to select from data, and how to rename them.
COLUMNS = {
"country": "country",
"year": "year",
"barn": "share_of_hens_in_barns",
"brown__pct": "share_of_brown_hens",
"cage": "share_of_hens_in_cages",
"cage_free": "number_of_hens_cage_free",
"cages": "number_of_hens_in_cages",
"commercial_egg_farms": "number_of_commercial_egg_farms",
"free_range": "share_of_hens_free_range_not_organic",
"organic": "share_of_hens_free_range_organic",
"total_layers": "number_of_laying_hens",
"unknown": "share_of_hens_in_unknown_housing",
# The following columns will be used to extract sources and urls and add them to the source metadata description.
# Afterwards, they will be removed.
"available_at": "available_at",
"source": "source",
# "click_placements" : "click_placements",
# "number_of_records": "number_of_records",
}


def add_individual_sources_to_metadata(tb: Table) -> Table:
tb = tb.copy()
# Check that each country has only one source.
assert (tb.groupby("country").agg({"source": "nunique"})["source"] == 1).all(), "Expected one source per country."
# Gather the data source for each country.
original_sources = (
"- "
+ tb["country"].astype(str)
+ ": "
+ tb["source"].astype(str)
+ " Available at "
+ tb["available_at"].astype(str)
)
# Check that each variable has only one source.
assert all(
[len(tb[column].metadata.sources) == 1 for column in tb.columns if column not in ["country", "year"]]
), "Expected only one source. Something has changed."
# Take the source from any of those variables.
source = tb[tb.columns[-1]].metadata.sources[0]
# Add the full list of original sources to the variable source.
source.published_by = source.published_by + "\n" + "\n".join(original_sources)
# Replace the source of each variable with the new one that has the full list of original sources.
for column in tb.columns:
tb[column].metadata.sources = [source]

return tb


def clean_values(tb: Table) -> Table:
tb = tb.copy()
# Remove the spurious "%" symbols from some of the values in some columns.
for column in tb.columns:
if tb[column].astype(str).str.contains("%").any():
tb[column] = tb[column].str.replace("%", "").astype(float)

return tb


def run_sanity_checks_on_outputs(tb: Table) -> None:
assert all([tb[column].min() >= 0 for column in tb.columns]), "All numbers should be >0"
assert all([tb[column].max() <= 100 for column in tb.columns if "share" in column]), "Percentages should be <100"
# Check that the percentages of the different laying hens housings add up to 100%.
# Note: The share of brown hens is not related to all other shares about housing systems.
assert (
tb[
[
"share_of_hens_free_range_not_organic",
"share_of_hens_free_range_organic",
"share_of_hens_in_barns",
"share_of_hens_in_cages",
"share_of_hens_in_unknown_housing",
]
].sum(axis=1)
< 101
).all()


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Load meadow dataset and read its main table.
ds_meadow: Dataset = paths.load_dependency("global_hen_inventory")
tb = ds_meadow["global_hen_inventory"].reset_index()

#
# Process data.
#
# Select and rename columns.
tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise")

# Harmonize country names.
tb: Table = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)

# The sources and URLs of the data for each country are given as separate columns.
# Gather them and add them to the source description of each variable.
tb = add_individual_sources_to_metadata(tb=tb)

# Drop unnecessary columns.
tb = tb.drop(columns=["available_at", "source"])

# Clean data (remove spurious "%" in the data).
tb = clean_values(tb=tb)

# Set an appropriate index and sort conveniently.
tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1)

# Run sanity checks on outputs.
run_sanity_checks_on_outputs(tb=tb)

#
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)

# Save changes in the new garden dataset.
ds_garden.save()
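
Note on clean_values: some numeric columns arrive with cells formatted as strings like "45%", so the function strips the symbol and casts to float. A minimal sketch of the same logic on a plain pandas DataFrame with toy values (not taken from the source):

import pandas as pd

df = pd.DataFrame({"share_of_hens_in_cages": ["45%", "80%"], "number_of_laying_hens": [1000, 2000]})
for column in df.columns:
    # Only touch columns that actually contain a "%" somewhere.
    if df[column].astype(str).str.contains("%").any():
        df[column] = df[column].str.replace("%", "").astype(float)
print(df)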
33 changes: 33 additions & 0 deletions etl/steps/data/grapher/animal_welfare/2023-08-01/global_hen_inventory.py
@@ -0,0 +1,33 @@
"""Load a garden dataset and create a grapher dataset."""

from typing import cast

from owid.catalog import Dataset

from etl.helpers import PathFinder, create_dataset, grapher_checks

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Load garden dataset and read its main table.
ds_garden = cast(Dataset, paths.load_dependency("global_hen_inventory"))
tb = ds_garden["global_hen_inventory"]

#
# Save outputs.
#
# Create a new grapher dataset with the same metadata as the garden dataset.
ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)

#
# Checks.
#
grapher_checks(ds_grapher)

# Save changes in the new grapher dataset.
ds_grapher.save()
38 changes: 38 additions & 0 deletions etl/steps/data/meadow/animal_welfare/2023-08-01/global_hen_inventory.py
@@ -0,0 +1,38 @@
"""Load a snapshot and create a meadow dataset."""

import owid.catalog.processing as pr

from etl.helpers import PathFinder, create_dataset
from etl.snapshot import Snapshot

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
#
# Load inputs.
#
# Retrieve snapshot.
snap: Snapshot = paths.load_dependency("global_hen_inventory.csv")

# Load data from snapshot.
tb = pr.read_csv(snap.path, sep="\t", encoding="utf-16", engine="python", metadata=snap.to_table_metadata())

#
# Process data.
#
# Ensure all columns are snake-case.
tb = tb.underscore()

# Set an appropriate index and sort conveniently.
tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1)

#
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)

# Save changes in the new garden dataset.
ds_meadow.save()
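
Note on the snapshot format: despite the .csv extension, the file is tab-separated and UTF-16 encoded, hence the explicit sep and encoding arguments. For reference, a plain pandas equivalent of that read (dropping the catalog metadata that pr.read_csv attaches; the file name below is an assumption standing in for snap.path):

import pandas as pd

# Hypothetical local copy of the snapshot file.
df = pd.read_csv("global_hen_inventory.csv", sep="\t", encoding="utf-16", engine="python")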
21 changes: 21 additions & 0 deletions snapshots/animal_welfare/2023-08-01/global_hen_inventory.csv.dvc
@@ -0,0 +1,21 @@
meta:
  name: Global hen inventory
  publication_year: 2022

  publication_date: '2022-12-06'
  source_name: Welfare Footprint Project
  source_published_by: Welfare Footprint Project based on various sources.
  url: https://welfarefootprint.org/research-projects/laying-hens/
  source_data_url:
  file_extension: csv
  license_url: https://welfarefootprint.org/research-projects/laying-hens/
  license_name: Creative Commons BY
  date_accessed: 2023-08-01
  is_public: true
  description: |

wdir: ../../../data/snapshots/animal_welfare/2023-08-01
outs:
- md5: 5f8cb302d78da91870da11136e9fdd5a
  size: 41872
  path: global_hen_inventory.csv