🔨 refactor walkthrough
Marigold committed Aug 4, 2023
1 parent f2f30dd commit bb01bbd
Showing 43 changed files with 105 additions and 335 deletions.
5 changes: 0 additions & 5 deletions backport/migrate/garden_cookiecutter/cookiecutter.json

This file was deleted.

5 changes: 0 additions & 5 deletions backport/migrate/grapher_cookiecutter/cookiecutter.json

This file was deleted.

33 changes: 3 additions & 30 deletions backport/migrate/migrate.py
@@ -1,21 +1,19 @@
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional, cast
from typing import Optional, cast

import click
import structlog
from cookiecutter.main import cookiecutter
from owid.catalog.utils import underscore
from sqlalchemy.engine import Engine

from etl import config
from etl.backport_helpers import create_dataset
from etl.db import get_engine
from etl.files import apply_black_formatter_to_files, yaml_dump
from etl.files import yaml_dump
from etl.metadata_export import metadata_export
from etl.paths import DAG_DIR, SNAPSHOTS_DIR, STEP_DIR
from walkthrough.utils import add_to_dag
from walkthrough.utils import add_to_dag, generate_step

from ..backport import PotentialBackport

@@ -89,31 +87,6 @@ def cli(
)


def generate_step(cookiecutter_path: Path, data: Dict[str, Any], target_dir: Path) -> None:
with tempfile.TemporaryDirectory() as temp_dir:
OUTPUT_DIR = temp_dir

# generate ingest scripts
cookiecutter(
cookiecutter_path.as_posix(),
no_input=True,
output_dir=temp_dir,
overwrite_if_exists=True,
extra_context=data,
)

shutil.copytree(
Path(OUTPUT_DIR),
target_dir,
dirs_exist_ok=True,
)

DATASET_DIR = target_dir / data["namespace"] / data["version"]

# Apply black formatter to generated files.
apply_black_formatter_to_files(file_paths=DATASET_DIR.glob("*.py"))


def migrate(
dataset_id: int,
namespace: str,
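Note: the `generate_step` helper deleted above was moved into `walkthrough/utils.py` (see the new `from walkthrough.utils import add_to_dag, generate_step` import). Below is a minimal sketch of the shared helper, reconstructed from the deleted body; the exact code in `walkthrough.utils` may differ, and the `generate_step_to_channel` variant used by callers elsewhere in this commit presumably wraps it to derive the target directory from the `channel` field.

```python
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict

from cookiecutter.main import cookiecutter

from etl.files import apply_black_formatter_to_files


def generate_step(cookiecutter_path: Path, data: Dict[str, Any], target_dir: Path) -> None:
    """Render a cookiecutter template into target_dir and format the generated files."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Generate ingest scripts from the template.
        cookiecutter(
            cookiecutter_path.as_posix(),
            no_input=True,
            output_dir=temp_dir,
            overwrite_if_exists=True,
            extra_context=data,
        )
        # Copy the rendered files into the target directory before the temp dir is removed.
        shutil.copytree(Path(temp_dir), target_dir, dirs_exist_ok=True)

    # Apply black formatter to the generated files.
    dataset_dir = target_dir / data["namespace"] / data["version"]
    apply_black_formatter_to_files(file_paths=dataset_dir.glob("*.py"))
```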
6 changes: 0 additions & 6 deletions backport/migrate/snapshot_cookiecutter/cookiecutter.json

This file was deleted.

2 changes: 1 addition & 1 deletion dag/walkthrough.yml
@@ -7,4 +7,4 @@ steps:
data://grapher/dummy/2020-01-01/dummy:
- data://garden/dummy/2020-01-01/dummy
data://explorers/dummy/2020-01-01/dummy:
- data://garden/dummy/2020-01-01/dummy
- data://garden/dummy/2020-01-01/dummy
47 changes: 1 addition & 46 deletions etl/steps/data/garden/dummy/2020-01-01/dummy.meta.yml
@@ -1,50 +1,5 @@
# (Inherited from meadow, remove if not different.)
all_sources:
dataset:
title: Dummy dataset
tables:
dummy:
# (Inherited from meadow, remove if not different.)
variables:
dummy_variable:
title: Dummy
description: This is a dummy indicator with full metadata. # Description can be a long text if need be.
licenses: [] # Licenses is an obsolete field - use origin.license in the snapshot to record upstream licenses and license to specify the redistribution license
unit: Dummy unit
short_unit: Du
display:
isProjection: true
conversionFactor: 1000
numDecimalPlaces: 1
tolerance: 5
yearIsDay: false
zeroDay: 1900-01-01
entityAnnotationsMap: "Germany: dummy annotation"
includeInTable: true
description_short: Short description of the dummy indicator.
description_from_producer: The description of the dummy indicator by the producer, shown separately on a data page.
processing_level: major
license:
name: CC-BY 4.0
url: ""
presentation:
grapher_config: # Note that the fields in here use camelCase, not snake_case. All fields of the schema can be used: https://github.com/owid/owid-grapher/tree/master/packages/%40ourworldindata/grapher/src/schema
title: The dummy indicator - chart title
subtitle: You'll never guess where the line will go
hasMapTab: true
selectedEntityNames:
- Germany
- Italy
- France
title_public: The dummy indicator - data page title
title_variant: historical data # This is useful to distinguish between similar indicators, e.g. if some are projections and some are historical data
producer_short: ACME
attribution: ACME project # This is what we show in places like the lower left side of Grapher charts to say who produced the data. Often this can be empty; in that case we construct the text from the attribution fields on the Origins.
topic_tags_links: # These should exist in the tags table in the grapher db and use the same spelling and case as they do there
- "Internet"
key_info_text:
- "First bullet point info about the data. [Detail on demand link](#dod:primaryenergy)"
- "Second bullet point with **bold** text and a [normal link](https://ourworldindata.org)"
faqs:
- fragment_id: cherries
gdoc_id: 16uGVylqtS-Ipc3OCxqapJ3BEVGjWf648wvZpzio1QFE
unit: dummy unit
6 changes: 4 additions & 2 deletions etl/steps/data/garden/dummy/2020-01-01/dummy.py
@@ -19,7 +19,7 @@ def run(dest_dir: str) -> None:
ds_meadow = cast(Dataset, paths.load_dependency("dummy"))

# Read table from meadow dataset.
tb = ds_meadow["dummy"]
tb = ds_meadow["dummy"].reset_index()

#
# Process data.
@@ -32,7 +32,9 @@ def run(dest_dir: str) -> None:
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
ds_garden = create_dataset(
dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=ds_meadow.metadata
)

# Save changes in the new garden dataset.
ds_garden.save()
2 changes: 0 additions & 2 deletions etl/steps/data/garden/ggdc/2020-10-01/ggdc_maddison.meta.yml
@@ -1,5 +1,3 @@
dataset:
title: Maddison Project Database (Bolt and van Zanden, 2020)
tables:
maddison_gdp:
variables:
2 changes: 1 addition & 1 deletion etl/steps/data/meadow/dummy/2020-01-01/dummy.py
@@ -32,7 +32,7 @@ def run(dest_dir: str) -> None:
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
ds_meadow = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata)

# Save changes in the new meadow dataset.
ds_meadow.save()
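Both dummy steps now follow the same table-index convention: tables are saved with a `(country, year)` index, and the index is reset when a downstream step loads them. A toy sketch of the round-trip (the data and `short_name` are invented; `Table` is the `owid.catalog` class these steps operate on):

```python
import pandas as pd
from owid.catalog import Table

# Made-up data in the shape of the dummy dataset.
tb = Table(
    pd.DataFrame({"country": ["France"], "year": [2020], "dummy_variable": [1.0]}),
    short_name="dummy",
)

# Steps save tables indexed by (country, year)...
tb = tb.set_index(["country", "year"])

# ...and the next step resets the index before processing.
tb = tb.reset_index()
```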
2 changes: 1 addition & 1 deletion fasttrack/cli.py
@@ -239,7 +239,7 @@ def app(dummy_data: bool, commit: bool) -> None:
dag_content = _add_to_dag(meta.dataset, form.is_private)

# create step and metadata file
walkthrough_utils.generate_step(
walkthrough_utils.generate_step_to_channel(
CURRENT_DIR / "grapher_cookiecutter/", dict(**meta.dataset.dict(), channel="grapher")
)
fast_import.save_metadata()
2 changes: 1 addition & 1 deletion lib/catalog/owid/catalog/meta.py
@@ -115,7 +115,7 @@ def update(self, **kwargs: Dict[str, Any]) -> None:
class Origin:
# Dataset title written by OWID (without a year)
dataset_title_owid: Optional[str] = None
# Dataset title written producer (without a year)
# Dataset title written by producer (without a year)
dataset_title_producer: Optional[str] = None
# Our description of the dataset
dataset_description_owid: Optional[str] = None
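`Origin` is a class in `owid.catalog.meta`, so origin-style metadata can also be constructed in code. A hedged sketch using only fields visible in this hunk, with invented values (assuming the class accepts keyword arguments, e.g. as a dataclass):

```python
from owid.catalog.meta import Origin

# Invented values; only fields visible in the hunk above are used.
origin = Origin(
    dataset_title_producer="The best dummy dataset",  # title as used by the producer
    dataset_title_owid="Dummy",  # our title, if the producer's does not fit our needs
    dataset_description_owid="Our own description of the dataset.",
)
```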
36 changes: 15 additions & 21 deletions snapshots/dummy/2020-01-01/dummy.csv.dvc
@@ -1,26 +1,20 @@
meta:
name: Dummy dataset
origin:
dataset_title_producer: The best dummy dataset # The dataset title as used by the producer. This is the field to fill by default.
dataset_title_owid: Dummy # What we call the dataset if the producer's title does not fit our needs.
dataset_description_producer: This is a description of the best dummy dataset # How the producer describes this dataset
dataset_description_owid: | # How we describe this dataset - also a place to collect important information about the entire dataset.
This is a description of this dummy dataset as we would word it.

It can be a few paragraphs long if need be. Citation information should not go in here. For
specific information about indicators, prefer to add indicator-level descriptions in meadow or garden.
attribution: ACME project # The text we want to appear when we want to credit the "origin" of the data. Should be reasonably short. Can be a project, an institution, actual people. Can include versions or years if that is important.
attribution_short: ACME # The shortest version of the attribution that we think is acceptable. Used in places like "This database is based on data by ..."
producer: Max Mustermann # Name of the institution or people who are the authors of this data. Should never be the name of a project.
citation_producer: Max Mustermann et al. based on John Doe. # The citation that the producer asks for, verbatim.
dataset_url_main: http://dummy-project.org # The URL where this dataset is described
dataset_url_download: http://dummy-project.org/download # The URL from where we downloaded the data
date_accessed: 2023-06-29
date_published: 2020-01-01
version: "1" # Some datasets release versions over time - if so then here we capture the version identifier the producer used for this release
license:
name: CC-BY 4.0 # The license that governs this dataset
url: http://dummy-project.org/license # The URL where the licensing terms are given
publication_date: '2020-01-01'
source_name: Dummy short source citation
source_published_by: Dummy full source citation
url: https://www.url-dummy.com/
source_data_url: https://raw.githubusercontent.com/owid/etl/master/walkthrough/dummy_data.csv
license_url:
license_name: ''
date_accessed: 2023-08-03
is_public: true
description: |
This
is
a
dummy
dataset
wdir: ../../../data/snapshots/dummy/2020-01-01
outs:
- md5: becb9bc64792f7372580683b384e5411
2 changes: 1 addition & 1 deletion snapshots/dummy/2020-01-01/dummy.py
@@ -1,4 +1,4 @@
"""Script to create a snapshot of dataset 'Dummy dataset'."""
"""Script to create a snapshot of dataset."""

from pathlib import Path

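Only the module docstring changes here. For orientation, a snapshot script in this repo typically follows the pattern sketched below; treat the `Snapshot` method names (`download_from_source`, `dvc_add`) as assumptions based on the usual pattern, not as something this diff confirms:

```python
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version is taken from the directory name, e.g. "2020-01-01".
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Create a new snapshot from the .dvc metadata file next to this script.
    snap = Snapshot(f"dummy/{SNAPSHOT_VERSION}/dummy.csv")

    # Download the file from the source and add it to DVC (optionally uploading it).
    snap.download_from_source()
    snap.dvc_add(upload=upload)


if __name__ == "__main__":
    main()
```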
1 change: 0 additions & 1 deletion snapshots/ggdc/2020-10-01/ggdc_maddison.xlsx.dvc
@@ -1,5 +1,4 @@
meta:
name: Maddison Project Database (Bolt and van Zanden, 2020)
origin:
dataset_title_producer: Maddison Project Database
dataset_description_owid: |
4 changes: 0 additions & 4 deletions walkthrough/explorers.md
@@ -1,7 +1,3 @@
# Walkthrough - Explorers

## Explorers step

Data explorers are Grapher charts expanded with additional functionality to facilitate exploration, such as dynamic entity filters or customizable menus. They are powered by CSV files generated by ETL and [served from S3](https://cloud.digitalocean.com/spaces/owid-catalog?path=explorers/). The explorers data step in ETL is responsible for generating these CSV files. It works the same way as, e.g., a garden step, but its transformations are meant to get the data ready for the data explorer (rather than for consumption by users of the catalog).

Check out docs about creating [Data explorers](https://www.notion.so/owid/Creating-Data-Explorers-cf47a5ef90f14c1fba8fc243aba79be7).
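Structurally, an explorers step is a garden-style `run()` whose output is written as CSV. A hedged sketch follows; the `formats=["csv"]` argument to `create_dataset` is an assumption about this repo's helper, not something confirmed by this diff:

```python
from etl.helpers import PathFinder, create_dataset

# Same PathFinder pattern the garden steps in this commit use.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    # Load the garden dataset the explorer is based on.
    ds_garden = paths.load_dependency("dummy")
    tb = ds_garden["dummy"].reset_index()

    # Reshape the data for the data explorer here, if needed.

    # Save as CSV so the files can be served from S3 (assumed `formats` argument).
    ds_explorer = create_dataset(
        dest_dir, tables=[tb], default_metadata=ds_garden.metadata, formats=["csv"]
    )
    ds_explorer.save()
```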
7 changes: 5 additions & 2 deletions walkthrough/explorers.py
@@ -38,8 +38,9 @@ def __init__(self, **data: Any) -> None:
def app(run_checks: bool) -> None:
state = utils.APP_STATE

po.put_markdown("# Walkthrough - Explorers")
with open(CURRENT_DIR / "explorers.md", "r") as f:
po.put_markdown(f.read())
po.put_collapse("Instructions", [po.put_markdown(f.read())])

data = pi.input_group(
"Options",
@@ -100,7 +101,9 @@ def app(run_checks: bool) -> None:
else:
dag_content = ""

DATASET_DIR = utils.generate_step(CURRENT_DIR / "explorers_cookiecutter/", dict(**form.dict(), channel="explorers"))
DATASET_DIR = utils.generate_step_to_channel(
CURRENT_DIR / "explorers_cookiecutter/", dict(**form.dict(), channel="explorers")
)

step_path = DATASET_DIR / (form.short_name + ".py")

Expand Down
7 changes: 0 additions & 7 deletions walkthrough/explorers_cookiecutter/cookiecutter.json

This file was deleted.

2 changes: 0 additions & 2 deletions walkthrough/garden.md
@@ -1,5 +1,3 @@
# Walkthrough - Garden

Here's a summary of this walkthrough. You don't have to execute anything manually; all of it will be done automatically after you submit the form below.

1. **Create a new garden step** (e.g. `etl/etl/steps/data/garden/example_institution/YYYY-MM-DD/example_dataset.py`). The step must contain a `run(dest_dir)` function that loads data from the last `meadow` step, processes the data and creates a dataset with one or more tables and the necessary metadata.
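The `run(dest_dir)` contract described above, condensed into a skeleton pieced together from the dummy garden step earlier in this commit (country harmonization and metadata handling elided; `example_dataset` is a placeholder name):

```python
from typing import cast

from owid.catalog import Dataset

from etl.helpers import PathFinder, create_dataset

paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    # Load the table from the upstream meadow dataset.
    ds_meadow = cast(Dataset, paths.load_dependency("example_dataset"))
    tb = ds_meadow["example_dataset"].reset_index()

    # ... process the data: harmonize countries, derive indicators, etc. ...

    # Create the garden dataset, storing the table indexed by (country, year).
    ds_garden = create_dataset(
        dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=ds_meadow.metadata
    )
    ds_garden.save()
```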
9 changes: 5 additions & 4 deletions walkthrough/garden.py
@@ -56,8 +56,9 @@ def __init__(self, **data: Any) -> None:
def app(run_checks: bool) -> None:
state = utils.APP_STATE

po.put_markdown("# Walkthrough - Garden")
with open(CURRENT_DIR / "garden.md", "r") as f:
po.put_markdown(f.read())
po.put_collapse("Instructions", [po.put_markdown(f.read())])

data = pi.input_group(
"Options",
@@ -136,7 +137,9 @@ def app(run_checks: bool) -> None:
else:
dag_content = ""

DATASET_DIR = utils.generate_step(CURRENT_DIR / "garden_cookiecutter/", dict(**form.dict(), channel="garden"))
DATASET_DIR = utils.generate_step_to_channel(
CURRENT_DIR / "garden_cookiecutter/", dict(**form.dict(), channel="garden")
)

step_path = DATASET_DIR / (form.short_name + ".py")
notebook_path = DATASET_DIR / "playground.ipynb"
@@ -245,9 +248,7 @@ def _fill_dummy_metadata_yaml(metadata_path: Path) -> None:
with open(metadata_path, "r") as f:
doc = ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader)

doc["dataset"]["title"] = "Dummy dataset"
doc["tables"]["dummy"]["variables"] = {"dummy_variable": {"unit": "dummy unit"}}
doc["all_sources"][0]["source_testing"]["name"] = "Dummy source"

with open(metadata_path, "w") as f:
ruamel.yaml.dump(doc, f, Dumper=ruamel.yaml.RoundTripDumper)
10 changes: 0 additions & 10 deletions walkthrough/garden_cookiecutter/cookiecutter.json

This file was deleted.

This file was deleted.

@@ -0,0 +1,13 @@
tables:
{{cookiecutter.short_name}}:
variables:
# testing_variable:
# title: Testing variable title
# unit: arbitrary units
# short_unit: au
# description: Full description of testing variable.
# sources:
# - *source-testing
# display:
# entityAnnotationsMap: Test annotation
# numDecimalPlaces: 0
@@ -39,7 +39,7 @@ def run(dest_dir: str) -> None:
ds_meadow = cast(Dataset, paths.load_dependency("{{cookiecutter.short_name}}"))

# Read table from meadow dataset.
tb = ds_meadow["{{cookiecutter.short_name}}"]
tb = ds_meadow["{{cookiecutter.short_name}}"].reset_index()

#
# Process data.
@@ -52,7 +52,7 @@ def run(dest_dir: str) -> None:
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
ds_garden = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=ds_meadow.metadata)

# Save changes in the new garden dataset.
ds_garden.save()