🔨 refactor walkthrough
Marigold committed Aug 4, 2023
1 parent f2f30dd commit bb01bbd
Showing 43 changed files with 105 additions and 335 deletions.
5 changes: 0 additions & 5 deletions backport/migrate/garden_cookiecutter/cookiecutter.json

This file was deleted.

5 changes: 0 additions & 5 deletions backport/migrate/grapher_cookiecutter/cookiecutter.json

This file was deleted.

33 changes: 3 additions & 30 deletions backport/migrate/migrate.py
@@ -1,21 +1,19 @@
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, Optional, cast
from typing import Optional, cast

import click
import structlog
from cookiecutter.main import cookiecutter
from owid.catalog.utils import underscore
from sqlalchemy.engine import Engine

from etl import config
from etl.backport_helpers import create_dataset
from etl.db import get_engine
from etl.files import apply_black_formatter_to_files, yaml_dump
from etl.files import yaml_dump
from etl.metadata_export import metadata_export
from etl.paths import DAG_DIR, SNAPSHOTS_DIR, STEP_DIR
from walkthrough.utils import add_to_dag
from walkthrough.utils import add_to_dag, generate_step

from ..backport import PotentialBackport

@@ -89,31 +87,6 @@ def cli(
)


def generate_step(cookiecutter_path: Path, data: Dict[str, Any], target_dir: Path) -> None:
with tempfile.TemporaryDirectory() as temp_dir:
OUTPUT_DIR = temp_dir

# generate ingest scripts
cookiecutter(
cookiecutter_path.as_posix(),
no_input=True,
output_dir=temp_dir,
overwrite_if_exists=True,
extra_context=data,
)

shutil.copytree(
Path(OUTPUT_DIR),
target_dir,
dirs_exist_ok=True,
)

DATASET_DIR = target_dir / data["namespace"] / data["version"]

# Apply black formatter to generated files.
apply_black_formatter_to_files(file_paths=DATASET_DIR.glob("*.py"))


def migrate(
dataset_id: int,
namespace: str,
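Note: the `generate_step` helper deleted above was moved into `walkthrough/utils.py` (see the new `from walkthrough.utils import add_to_dag, generate_step` import). Below is a minimal sketch of the shared helper, reconstructed from the deleted body; the exact code in `walkthrough.utils` may differ, and the `generate_step_to_channel` variant used by callers elsewhere in this commit presumably wraps it to derive the target directory from the `channel` field.

```python
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict

from cookiecutter.main import cookiecutter

from etl.files import apply_black_formatter_to_files


def generate_step(cookiecutter_path: Path, data: Dict[str, Any], target_dir: Path) -> None:
    """Render a cookiecutter template into target_dir and format the generated files."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Generate ingest scripts from the template.
        cookiecutter(
            cookiecutter_path.as_posix(),
            no_input=True,
            output_dir=temp_dir,
            overwrite_if_exists=True,
            extra_context=data,
        )
        # Copy the rendered files into the target directory before the temp dir is removed.
        shutil.copytree(Path(temp_dir), target_dir, dirs_exist_ok=True)

    # Apply black formatter to the generated files.
    dataset_dir = target_dir / data["namespace"] / data["version"]
    apply_black_formatter_to_files(file_paths=dataset_dir.glob("*.py"))
```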
6 changes: 0 additions & 6 deletions backport/migrate/snapshot_cookiecutter/cookiecutter.json

This file was deleted.

2 changes: 1 addition & 1 deletion dag/walkthrough.yml
@@ -7,4 +7,4 @@ steps:
data://grapher/dummy/2020-01-01/dummy:
- data://garden/dummy/2020-01-01/dummy
data://explorers/dummy/2020-01-01/dummy:
- data://garden/dummy/2020-01-01/dummy
- data://garden/dummy/2020-01-01/dummy
47 changes: 1 addition & 46 deletions etl/steps/data/garden/dummy/2020-01-01/dummy.meta.yml
@@ -1,50 +1,5 @@
# (Inherited from meadow, remove if not different.)
all_sources:
dataset:
title: Dummy dataset
tables:
dummy:
# (Inherited from meadow, remove if not different.)
variables:
dummy_variable:
title: Dummy
description: This is a dummy indicator with full metadata. # Description can be a long text if need be.
licenses: [] # Licenses is an obsolete field - use origin.license in the snapshot to record upstream licenses and license to specify the redistribution license
unit: Dummy unit
short_unit: Du
display:
isProjection: true
conversionFactor: 1000
numDecimalPlaces: 1
tolerance: 5
yearIsDay: false
zeroDay: 1900-01-01
entityAnnotationsMap: "Germany: dummy annotation"
includeInTable: true
description_short: Short description of the dummy indicator.
description_from_producer: The description of the dummy indicator by the producer, shown separately on a data page.
processing_level: major
license:
name: CC-BY 4.0
url: ""
presentation:
grapher_config: # Note that the fields in here use camelCase, not snake_case. All fields of the schema can be used: https://github.com/owid/owid-grapher/tree/master/packages/%40ourworldindata/grapher/src/schema
title: The dummy indicator - chart title
subtitle: You'll never guess where the line will go
hasMapTab: true
selectedEntityNames:
- Germany
- Italy
- France
title_public: The dummy indicator - data page title
title_variant: historical data # This is useful to distinguish between similar indicators, e.g. if some are projections and some are historical data
producer_short: ACME
attribution: ACME project # This is what we show in places like the lower left side of Grapher charts to say who produced the data. Often this can be empty; in that case we construct the text from the attribution fields on the Origins.
topic_tags_links: # These should exist in the tags table in the grapher db and use the same spelling and case as they do there
- "Internet"
key_info_text:
- "First bullet point info about the data. [Detail on demand link](#dod:primaryenergy)"
- "Second bullet point with **bold** text and a [normal link](https://ourworldindata.org)"
faqs:
- fragment_id: cherries
gdoc_id: 16uGVylqtS-Ipc3OCxqapJ3BEVGjWf648wvZpzio1QFE
unit: dummy unit
6 changes: 4 additions & 2 deletions etl/steps/data/garden/dummy/2020-01-01/dummy.py
@@ -19,7 +19,7 @@ def run(dest_dir: str) -> None:
ds_meadow = cast(Dataset, paths.load_dependency("dummy"))

# Read table from meadow dataset.
tb = ds_meadow["dummy"]
tb = ds_meadow["dummy"].reset_index()

#
# Process data.
@@ -32,7 +32,9 @@ def run(dest_dir: str) -> None:
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
ds_garden = create_dataset(
dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=ds_meadow.metadata
)

# Save changes in the new garden dataset.
ds_garden.save()
2 changes: 0 additions & 2 deletions etl/steps/data/garden/ggdc/2020-10-01/ggdc_maddison.meta.yml
@@ -1,5 +1,3 @@
dataset:
title: Maddison Project Database (Bolt and van Zanden, 2020)
tables:
maddison_gdp:
variables:
2 changes: 1 addition & 1 deletion etl/steps/data/meadow/dummy/2020-01-01/dummy.py
@@ -32,7 +32,7 @@ def run(dest_dir: str) -> None:
# Save outputs.
#
# Create a new meadow dataset with the same metadata as the snapshot.
ds_meadow = create_dataset(dest_dir, tables=[tb], default_metadata=snap.metadata)
ds_meadow = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=snap.metadata)

# Save changes in the new meadow dataset.
ds_meadow.save()
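Both dummy steps now follow the same table-index convention: tables are saved with a `(country, year)` index, and the index is reset when a downstream step loads them. A toy sketch of the round-trip (the data and `short_name` are invented; `Table` is the `owid.catalog` class these steps operate on):

```python
import pandas as pd
from owid.catalog import Table

# Made-up data in the shape of the dummy dataset.
tb = Table(
    pd.DataFrame({"country": ["France"], "year": [2020], "dummy_variable": [1.0]}),
    short_name="dummy",
)

# Steps save tables indexed by (country, year)...
tb = tb.set_index(["country", "year"])

# ...and the next step resets the index before processing.
tb = tb.reset_index()
```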
2 changes: 1 addition & 1 deletion fasttrack/cli.py
@@ -239,7 +239,7 @@ def app(dummy_data: bool, commit: bool) -> None:
dag_content = _add_to_dag(meta.dataset, form.is_private)

# create step and metadata file
walkthrough_utils.generate_step(
walkthrough_utils.generate_step_to_channel(
CURRENT_DIR / "grapher_cookiecutter/", dict(**meta.dataset.dict(), channel="grapher")
)
fast_import.save_metadata()
2 changes: 1 addition & 1 deletion lib/catalog/owid/catalog/meta.py
@@ -115,7 +115,7 @@ def update(self, **kwargs: Dict[str, Any]) -> None:
class Origin:
# Dataset title written by OWID (without a year)
dataset_title_owid: Optional[str] = None
# Dataset title written producer (without a year)
# Dataset title written by producer (without a year)
dataset_title_producer: Optional[str] = None
# Our description of the dataset
dataset_description_owid: Optional[str] = None
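`Origin` is a class in `owid.catalog.meta`, so origin-style metadata can also be constructed in code. A hedged sketch using only fields visible in this hunk, with invented values (assuming the class accepts keyword arguments, e.g. as a dataclass):

```python
from owid.catalog.meta import Origin

# Invented values; only fields visible in the hunk above are used.
origin = Origin(
    dataset_title_producer="The best dummy dataset",  # title as used by the producer
    dataset_title_owid="Dummy",  # our title, if the producer's does not fit our needs
    dataset_description_owid="Our own description of the dataset.",
)
```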
36 changes: 15 additions & 21 deletions snapshots/dummy/2020-01-01/dummy.csv.dvc
@@ -1,26 +1,20 @@
meta:
name: Dummy dataset
origin:
dataset_title_producer: The best dummy dataset # The dataset title as used by the producer. This is the field to fill by default.
dataset_title_owid: Dummy # What we call the dataset if the producer's title does not fit our needs.
dataset_description_producer: This is a description of the best dummy dataset # How the producer describes this dataset
dataset_description_owid: | # How we describe this dataset - also a place to collect important information about the entire dataset.
This is a description of this dummy dataset as we would word it.

It can be a few paragraphs long if need be. Citation information should not go in here. For
specific information about indicators, prefer to add indicator-level descriptions in meadow or garden.
attribution: ACME project # The text we want to appear when we want to credit the "origin" of the data. Should be reasonably short. Can be a project, an institution, actual people. Can include versions or years if that is important.
attribution_short: ACME # The shortest version of the attribution that we think is acceptable. Used in places like "This database is based on data by ..."
producer: Max Mustermann # Name of the institution or people who are the authors of this data. Should never be the name of a project.
citation_producer: Max Mustermann et al. based on John Doe. # The citation that the producer asks for, verbatim.
dataset_url_main: http://dummy-project.org # The URL where this dataset is described
dataset_url_download: http://dummy-project.org/download # The URL from where we downloaded the data
date_accessed: 2023-06-29
date_published: 2020-01-01
version: "1" # Some datasets release versions over time - if so then here we capture the version identifier the producer used for this release
license:
name: CC-BY 4.0 # The license that governs this dataset
url: http://dummy-project.org/license # The URL where the licensing terms are given
publication_date: '2020-01-01'
source_name: Dummy short source citation
source_published_by: Dummy full source citation
url: https://www.url-dummy.com/
source_data_url: https://raw.githubusercontent.com/owid/etl/master/walkthrough/dummy_data.csv
license_url:
license_name: ''
date_accessed: 2023-08-03
is_public: true
description: |
This
is
a
dummy
dataset
wdir: ../../../data/snapshots/dummy/2020-01-01
outs:
- md5: becb9bc64792f7372580683b384e5411
2 changes: 1 addition & 1 deletion snapshots/dummy/2020-01-01/dummy.py
@@ -1,4 +1,4 @@
"""Script to create a snapshot of dataset 'Dummy dataset'."""
"""Script to create a snapshot of dataset."""

from pathlib import Path

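Only the module docstring changes here. For orientation, a snapshot script in this repo typically follows the pattern sketched below; treat the `Snapshot` method names (`download_from_source`, `dvc_add`) as assumptions based on the usual pattern, not as something this diff confirms:

```python
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version is taken from the directory name, e.g. "2020-01-01".
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Create a new snapshot from the .dvc metadata file next to this script.
    snap = Snapshot(f"dummy/{SNAPSHOT_VERSION}/dummy.csv")

    # Download the file from the source and add it to DVC (optionally uploading it).
    snap.download_from_source()
    snap.dvc_add(upload=upload)


if __name__ == "__main__":
    main()
```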
1 change: 0 additions & 1 deletion snapshots/ggdc/2020-10-01/ggdc_maddison.xlsx.dvc
@@ -1,5 +1,4 @@
meta:
name: Maddison Project Database (Bolt and van Zanden, 2020)
origin:
dataset_title_producer: Maddison Project Database
dataset_description_owid: |
4 changes: 0 additions & 4 deletions walkthrough/explorers.md
@@ -1,7 +1,3 @@
# Walkthrough - Explorers

## Explorers step

Data explorers are Grapher charts expanded with additional functionality to facilitate exploration, such as dynamic entity filters or customizable menus. They are powered by CSV files generated by ETL and [served from S3](https://cloud.digitalocean.com/spaces/owid-catalog?path=explorers/). The explorers data step in ETL is responsible for generating these CSV files. It works the same way as, e.g., a garden step, but its transformations are meant to get the data ready for the data explorer (rather than for consumption by users of the catalog).

Check out docs about creating [Data explorers](https://www.notion.so/owid/Creating-Data-Explorers-cf47a5ef90f14c1fba8fc243aba79be7).
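Structurally, an explorers step is a garden-style `run()` whose output is written as CSV. A hedged sketch follows; the `formats=["csv"]` argument to `create_dataset` is an assumption about this repo's helper, not something confirmed by this diff:

```python
from etl.helpers import PathFinder, create_dataset

# Same PathFinder pattern the garden steps in this commit use.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    # Load the garden dataset the explorer is based on.
    ds_garden = paths.load_dependency("dummy")
    tb = ds_garden["dummy"].reset_index()

    # Reshape the data for the data explorer here, if needed.

    # Save as CSV so the files can be served from S3 (assumed `formats` argument).
    ds_explorer = create_dataset(
        dest_dir, tables=[tb], default_metadata=ds_garden.metadata, formats=["csv"]
    )
    ds_explorer.save()
```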
7 changes: 5 additions & 2 deletions walkthrough/explorers.py
@@ -38,8 +38,9 @@ def __init__(self, **data: Any) -> None:
def app(run_checks: bool) -> None:
state = utils.APP_STATE

po.put_markdown("# Walkthrough - Explorers")
with open(CURRENT_DIR / "explorers.md", "r") as f:
po.put_markdown(f.read())
po.put_collapse("Instructions", [po.put_markdown(f.read())])

data = pi.input_group(
"Options",
@@ -100,7 +101,9 @@ def app(run_checks: bool) -> None:
else:
dag_content = ""

DATASET_DIR = utils.generate_step(CURRENT_DIR / "explorers_cookiecutter/", dict(**form.dict(), channel="explorers"))
DATASET_DIR = utils.generate_step_to_channel(
CURRENT_DIR / "explorers_cookiecutter/", dict(**form.dict(), channel="explorers")
)

step_path = DATASET_DIR / (form.short_name + ".py")

Expand Down
7 changes: 0 additions & 7 deletions walkthrough/explorers_cookiecutter/cookiecutter.json

This file was deleted.

2 changes: 0 additions & 2 deletions walkthrough/garden.md
@@ -1,5 +1,3 @@
# Walkthrough - Garden

Here's a summary of this walkthrough. You don't have to execute anything manually; all of it will be done automatically after you submit the form below.

1. **Create a new garden step** (e.g. `etl/etl/steps/data/garden/example_institution/YYYY-MM-DD/example_dataset.py`). The step must contain a `run(dest_dir)` function that loads data from the last `meadow` step, processes the data and creates a dataset with one or more tables and the necessary metadata.
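The `run(dest_dir)` contract described above, condensed into a skeleton pieced together from the dummy garden step earlier in this commit (country harmonization and metadata handling elided; `example_dataset` is a placeholder name):

```python
from typing import cast

from owid.catalog import Dataset

from etl.helpers import PathFinder, create_dataset

paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    # Load the table from the upstream meadow dataset.
    ds_meadow = cast(Dataset, paths.load_dependency("example_dataset"))
    tb = ds_meadow["example_dataset"].reset_index()

    # ... process the data: harmonize countries, derive indicators, etc. ...

    # Create the garden dataset, storing the table indexed by (country, year).
    ds_garden = create_dataset(
        dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=ds_meadow.metadata
    )
    ds_garden.save()
```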
9 changes: 5 additions & 4 deletions walkthrough/garden.py
@@ -56,8 +56,9 @@ def __init__(self, **data: Any) -> None:
def app(run_checks: bool) -> None:
state = utils.APP_STATE

po.put_markdown("# Walkthrough - Garden")
with open(CURRENT_DIR / "garden.md", "r") as f:
po.put_markdown(f.read())
po.put_collapse("Instructions", [po.put_markdown(f.read())])

data = pi.input_group(
"Options",
@@ -136,7 +137,9 @@ def app(run_checks: bool) -> None:
else:
dag_content = ""

DATASET_DIR = utils.generate_step(CURRENT_DIR / "garden_cookiecutter/", dict(**form.dict(), channel="garden"))
DATASET_DIR = utils.generate_step_to_channel(
CURRENT_DIR / "garden_cookiecutter/", dict(**form.dict(), channel="garden")
)

step_path = DATASET_DIR / (form.short_name + ".py")
notebook_path = DATASET_DIR / "playground.ipynb"
@@ -245,9 +248,7 @@ def _fill_dummy_metadata_yaml(metadata_path: Path) -> None:
with open(metadata_path, "r") as f:
doc = ruamel.yaml.load(f, Loader=ruamel.yaml.RoundTripLoader)

doc["dataset"]["title"] = "Dummy dataset"
doc["tables"]["dummy"]["variables"] = {"dummy_variable": {"unit": "dummy unit"}}
doc["all_sources"][0]["source_testing"]["name"] = "Dummy source"

with open(metadata_path, "w") as f:
ruamel.yaml.dump(doc, f, Dumper=ruamel.yaml.RoundTripDumper)
10 changes: 0 additions & 10 deletions walkthrough/garden_cookiecutter/cookiecutter.json

This file was deleted.

This file was deleted.

@@ -0,0 +1,13 @@
tables:
{{cookiecutter.short_name}}:
variables:
# testing_variable:
# title: Testing variable title
# unit: arbitrary units
# short_unit: au
# description: Full description of testing variable.
# sources:
# - *source-testing
# display:
# entityAnnotationsMap: Test annotation
# numDecimalPlaces: 0
@@ -39,7 +39,7 @@ def run(dest_dir: str) -> None:
ds_meadow = cast(Dataset, paths.load_dependency("{{cookiecutter.short_name}}"))

# Read table from meadow dataset.
tb = ds_meadow["{{cookiecutter.short_name}}"]
tb = ds_meadow["{{cookiecutter.short_name}}"].reset_index()

#
# Process data.
@@ -52,7 +52,7 @@ def run(dest_dir: str) -> None:
# Save outputs.
#
# Create a new garden dataset with the same metadata as the meadow dataset.
ds_garden = create_dataset(dest_dir, tables=[tb], default_metadata=ds_meadow.metadata)
ds_garden = create_dataset(dest_dir, tables=[tb.set_index(["country", "year"])], default_metadata=ds_meadow.metadata)

# Save changes in the new garden dataset.
ds_garden.save()