Merge pull request #1413 from owid/add-uk-egg-statistics

Add uk egg statistics
owid · Aug 2, 2023 · 53c8908 · 53c8908
2 parents 0996d96 + 44cfb54
commit 53c8908
Show file tree

Hide file tree

Showing 9 changed files with 236 additions and 2 deletions.
diff --git a/dag/animal_welfare.yml b/dag/animal_welfare.yml
@@ -1,10 +1,19 @@
 steps:
   #
-  # Global hen inventory (Welfare Footprint Project, 2023).
+  # Global hen inventory (Welfare Footprint Project, 2022).
   #
   data://meadow/animal_welfare/2023-08-01/global_hen_inventory:
     - snapshot://animal_welfare/2023-08-01/global_hen_inventory.csv
   data://garden/animal_welfare/2023-08-01/global_hen_inventory:
     - data://meadow/animal_welfare/2023-08-01/global_hen_inventory
   data://grapher/animal_welfare/2023-08-01/global_hen_inventory:
     - data://garden/animal_welfare/2023-08-01/global_hen_inventory
+  #
+  # UK egg statistics (Defra, 2023).
+  #
+  data://meadow/animal_welfare/2023-08-01/uk_egg_statistics:
+    - snapshot://animal_welfare/2023-08-01/uk_egg_statistics.ods
+  data://garden/animal_welfare/2023-08-01/uk_egg_statistics:
+    - data://meadow/animal_welfare/2023-08-01/uk_egg_statistics
+  data://grapher/animal_welfare/2023-08-01/uk_egg_statistics:
+    - data://garden/animal_welfare/2023-08-01/uk_egg_statistics
diff --git a/etl/steps/data/garden/animal_welfare/2023-08-01/uk_egg_statistics.meta.yml b/etl/steps/data/garden/animal_welfare/2023-08-01/uk_egg_statistics.meta.yml
@@ -0,0 +1,22 @@
+dataset:
+  title: "UK egg statistics (Defra, 2023)"
+
+tables:
+  uk_egg_statistics:
+    variables:
+      number_of_eggs_from_barns:
+        title: Number of eggs from hens in barns
+        unit: eggs
+        short_unit: ''
+      number_of_eggs_from_enriched_cages:
+        title: Number of eggs from hens in (enriched) cages
+        unit: eggs
+        short_unit: ''
+      number_of_eggs_from_non_organic_free_range_farms:
+        title: Number of eggs from hens in non-organic, free-range farms
+        unit: eggs
+        short_unit: ''
+      number_of_eggs_from_organic_free_range_farms:
+        title: Number of eggs from hens in organic, free-range farms
+        unit: eggs
+        short_unit: ''
diff --git a/etl/steps/data/garden/animal_welfare/2023-08-01/uk_egg_statistics.py b/etl/steps/data/garden/animal_welfare/2023-08-01/uk_egg_statistics.py
@@ -0,0 +1,51 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Dataset
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step (PathFinder is given this file's path).
+paths = PathFinder(__file__)
+
+# Columns to select from the input table (keys), and the names they are renamed to (values).
+COLUMNS = {
+    "year": "year",
+    "enriched": "number_of_eggs_from_enriched_cages",
+    "barn": "number_of_eggs_from_barns",
+    "free_range": "number_of_eggs_from_non_organic_free_range_farms",
+    "organic": "number_of_eggs_from_organic_free_range_farms",
+}
+
+
+def run(dest_dir: str) -> None:
+    """Create the garden dataset of UK egg statistics from the meadow dataset.
+
+    The meadow table is restricted to the columns listed in COLUMNS (and renamed accordingly), every column
+    except "year" is multiplied by 12e6, a "country" column with the value "United Kingdom" is added, and the
+    resulting table (indexed by country and year) is saved as a new dataset.
+
+    Parameters
+    ----------
+    dest_dir : str
+        Destination passed to `create_dataset` for the new garden dataset.
+
+    """
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset and read its main table.
+    ds_meadow: Dataset = paths.load_dependency("uk_egg_statistics")
+    tb = ds_meadow["uk_egg_statistics"].reset_index()
+
+    #
+    # Process data.
+    #
+    # Select and rename columns.
+    tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise")
+
+    # Convert million dozens of eggs to eggs.
+    # One million dozens corresponds to 1e6 * 12 = 12e6 eggs; "year" is excluded from the conversion.
+    for column in tb.drop(columns="year").columns:
+        tb[column] *= 12e6
+
+    # Add a country column.
+    tb["country"] = "United Kingdom"
+
+    # Set an appropriate index and sort conveniently.
+    # verify_integrity=True makes set_index fail if any (country, year) pair is duplicated.
+    tb = tb.set_index(["country", "year"], verify_integrity=True).sort_index().sort_index(axis=1)
+
+    #
+    # Save outputs.
+    #
+    # Create a new garden dataset with the same metadata as the meadow dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
+
+    # Save changes in the new garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/animal_welfare/2023-08-01/uk_egg_statistics.py b/etl/steps/data/grapher/animal_welfare/2023-08-01/uk_egg_statistics.py
@@ -0,0 +1,31 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from owid.catalog import Dataset
+
+from etl.helpers import PathFinder, create_dataset, grapher_checks
+
+# Get paths and naming conventions for current step (PathFinder is given this file's path).
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    """Create the grapher dataset of UK egg statistics from the garden dataset.
+
+    The garden table is passed on unchanged; the new dataset is checked with `grapher_checks` before saving.
+
+    Parameters
+    ----------
+    dest_dir : str
+        Destination passed to `create_dataset` for the new grapher dataset.
+
+    """
+    #
+    # Load inputs.
+    #
+    # Load garden dataset and read its main table.
+    ds_garden: Dataset = paths.load_dependency("uk_egg_statistics")
+    tb = ds_garden["uk_egg_statistics"]
+
+    #
+    # Save outputs.
+    #
+    # Create a new grapher dataset with the same metadata as the garden dataset.
+    ds_grapher = create_dataset(dest_dir, tables=[tb], default_metadata=ds_garden.metadata)
+
+    #
+    # Checks.
+    #
+    # Run the checks before saving the dataset.
+    grapher_checks(ds_grapher)
+
+    # Save changes in the new grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/animal_welfare/2023-08-01/uk_egg_statistics.py b/etl/steps/data/meadow/animal_welfare/2023-08-01/uk_egg_statistics.py
@@ -0,0 +1,51 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+from etl.snapshot import Snapshot
+
+# Get paths and naming conventions for current step (PathFinder is given this file's path).
+paths = PathFinder(__file__)
+
+# Columns to select from the spreadsheet (keys), and the names they are renamed to (values).
+COLUMNS = {
+    "Year": "year",
+    "Enriched": "enriched",
+    "Barn": "barn",
+    "Free Range": "free_range",
+    "Organic": "organic",
+}
+
+
+def run(dest_dir: str) -> None:
+    """Create the meadow dataset of UK egg statistics from the snapshot.
+
+    Reads the "Packers_Annual" sheet of the snapshot file, keeps and renames the columns listed in COLUMNS,
+    filters out rows whose year does not look like a year, indexes the table by year and saves it as a new
+    dataset.
+
+    Parameters
+    ----------
+    dest_dir : str
+        Destination passed to `create_dataset` for the new meadow dataset.
+
+    """
+    #
+    # Load inputs.
+    #
+    # Retrieve snapshot.
+    snap: Snapshot = paths.load_dependency("uk_egg_statistics.ods")
+
+    # Load data from snapshot (skipping the first two rows of the sheet).
+    tb = pr.read_excel(snap.path, sheet_name="Packers_Annual", skiprows=2, metadata=snap.to_table_metadata())
+
+    #
+    # Process data.
+    #
+    # Select and rename columns.
+    tb = tb[list(COLUMNS)].rename(columns=COLUMNS, errors="raise")
+
+    # Remove spurious rows at the bottom of the file, containing footnotes.
+    # To achieve that, keep only rows where the year column starts with four digits.
+    # NOTE(review): na=True also keeps rows for which the match result is missing (presumably years that were
+    # read as numbers rather than text) - confirm against the actual file.
+    tb = tb[tb["year"].str.match(r"\d{4}", na=True)].reset_index(drop=True)
+
+    # Set an appropriate index and sort conveniently.
+    # verify_integrity=True makes set_index fail if any year is duplicated.
+    tb = tb.set_index(["year"], verify_integrity=True).sort_index().sort_index(axis=1)
+
+    #
+    # Save outputs.
+    #
+    # Create a new meadow dataset with the same metadata as the snapshot.
+    ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
+
+    # Save changes in the new meadow dataset.
+    ds_meadow.save()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -73,6 +73,7 @@ fsspec = "2022.11.0"
 openai = "^0.27.7"
 pdfplumber = "^0.9.0"
 wbgapi = "^1.0.12"
+odfpy = "^1.4.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.1.2"

diff --git a/snapshots/animal_welfare/2023-08-01/uk_egg_statistics.ods.dvc b/snapshots/animal_welfare/2023-08-01/uk_egg_statistics.ods.dvc
@@ -0,0 +1,24 @@
+meta:
+  name: UK egg statistics
+  publication_year: 2023
+
+  publication_date: '2023-07-27'
+  source_name: Department for Environment, Food & Rural Affairs of the United Kingdom
+  source_published_by: Quarterly UK statistics on Egg Packing Station Throughput and
+    Prices, by the Department for Environment, Food & Rural Affairs of the United
+    Kingdom
+  url: https://www.gov.uk/government/statistics/egg-statistics
+  source_data_url: 
+    https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1173898/egg-packers-27july23.ods
+  license_url: 
+    https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/1173898/egg-packers-27july23.ods
+  license_name: Public domain
+  date_accessed: 2023-08-01
+  is_public: true
+  description: |
+    DEFRA runs a quarterly survey of registered UK egg packing stations. It is a voluntary sample survey of 27 respondents that collects information on throughput by production type and prices of graded eggs and sales of ungraded eggs. The response rate is typically 100 per cent and the survey accounts for 75 per cent of eggs packed in the UK. The survey figures are raised up to give UK estimates using information on the number of commercial laying hens, average egg yields, average mortality rates, and the proportion of UK eggs that go through packing stations. Throughput by egg type for packing stations not surveyed is calculated using data provided by packing stations responding to the survey. The raised figures are published in this statistics notice and the associated datasets. The figures in this notice therefore represent all Class A eggs passed over a grader in the UK, including seconds.
+wdir: ../../../data/snapshots/animal_welfare/2023-08-01
+outs:
+- md5: fa90508145ab75c9fddb46c8d6a560fe
+  size: 33319
+  path: uk_egg_statistics.ods
diff --git a/snapshots/animal_welfare/2023-08-01/uk_egg_statistics.py b/snapshots/animal_welfare/2023-08-01/uk_egg_statistics.py
@@ -0,0 +1,32 @@
+"""Script to create a snapshot of dataset 'UK egg statistics'."""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset (the name of the folder that contains this script).
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+# Command-line entry point: creates the snapshot of the UK egg statistics file, downloads the data from its
+# source, and adds the file to DVC. The --upload/--skip-upload flag (upload by default) is forwarded to
+# `dvc_add`.
+# NOTE: this is documented with comments rather than a docstring, because click would show a docstring as the
+# command's help text.
+@click.command()
+@click.option(
+    "--upload/--skip-upload",
+    default=True,
+    type=bool,
+    help="Upload dataset to Snapshot",
+)
+def main(upload: bool) -> None:
+    # Create a new snapshot.
+    snap = Snapshot(f"animal_welfare/{SNAPSHOT_VERSION}/uk_egg_statistics.ods")
+
+    # Download data from source.
+    snap.download_from_source()
+
+    # Add file to DVC and upload to S3.
+    snap.dvc_add(upload=upload)
+
+
+# Run the command only when this file is executed as a script.
+if __name__ == "__main__":
+    main()