Skip to content

Commit

Permalink
🔨 Allow passing parameters to YAML file (#3231)
Browse files Browse the repository at this point in the history
* 🔨 Allow passing parameters to YAML file
  • Loading branch information
Marigold authored Sep 3, 2024
1 parent b73141e commit c1050f9
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 13 deletions.
6 changes: 6 additions & 0 deletions etl/datadiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@ def _diff_tables(self, ds_a: Dataset, ds_b: Dataset, table_name: str):
col_a = table_a[col]
col_b = table_b[col]

# sort origins
for tab in (table_a, table_b):
tab[col].m.origins = sorted(
tab[col].m.origins, key=lambda x: (x.title or "", x.title_snapshot or "")
)

# metadata diff
meta_diff = _dict_diff(
_column_metadata_dict(col_a.metadata), _column_metadata_dict(col_b.metadata), tabs=4
Expand Down
4 changes: 3 additions & 1 deletion etl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ def create_dataset(
formats: List[FileFormat] = DEFAULT_FORMATS,
check_variables_metadata: bool = False,
run_grapher_checks: bool = True,
yaml_params: Optional[Dict[str, Any]] = None,
if_origins_exist: SOURCE_EXISTS_OPTIONS = "replace",
errors: Literal["ignore", "warn", "raise"] = "raise",
repack: bool = True,
Expand All @@ -200,6 +201,7 @@ def create_dataset(
:param camel_to_snake: Whether to convert camel case to snake case for the table name.
:param check_variables_metadata: Check that all variables in tables have metadata; raise a warning otherwise.
:param run_grapher_checks: Run grapher checks on the dataset, only applies to grapher channel.
:param yaml_params: Dictionary of parameters that can be used in the metadata yaml file.
:param if_origins_exist: What to do if origins already exist in the dataset metadata.
:param repack: Repack dataframe before adding it to the dataset.
Expand Down Expand Up @@ -250,7 +252,7 @@ def create_dataset(

meta_path = get_metadata_path(str(dest_dir))
if meta_path.exists():
ds.update_metadata(meta_path, if_origins_exist=if_origins_exist, errors=errors)
ds.update_metadata(meta_path, if_origins_exist=if_origins_exist, yaml_params=yaml_params, errors=errors)

# another override YAML file with higher priority
meta_override_path = get_metadata_path(str(dest_dir)).with_suffix(".override.yml")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
desc_update: The 2024 data is incomplete and was last updated {TODAY}.
desc_update: The 2024 data is incomplete and was last updated {date_accessed}.
common:
processing_level: major
presentation:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Generate aggregated table for total yearly and cumulative number of notable AI systems in each category of researcher affiliation."""

import datetime as dt

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
Expand Down Expand Up @@ -58,10 +60,16 @@ def run(dest_dir: str) -> None:
# Set the index to year and organization categorization
tb_agg = tb_agg.format(["year", "organization_categorization"])

date_acessed = tb_agg.yearly_count.m.origins[0].date_accessed

#
# Save outputs.
#
ds_garden = create_dataset(dest_dir, tables=[tb_agg])
ds_garden = create_dataset(
dest_dir,
tables=[tb_agg],
yaml_params={"date_accessed": dt.datetime.strptime(date_acessed, "%Y-%m-%d").strftime("%d %B %Y")},
)
ds_garden.save()

paths.log.info("epoch_aggregates_affiliation.end")
4 changes: 2 additions & 2 deletions etl/steps/data/meadow/oecd/2016-06-01/regional_wellbeing.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
meta.title = ind.loc[ind_code]
short_unit = ind_unit_code[ind_code]
meta.short_unit = short_unit if not pd.isnull(short_unit) else None
meta.unit = ind_unit_name[ind_code]
meta.unit = ind_unit_name[ind_code] if not pd.isnull(ind_unit_name[ind_code]) else None

t.head()

Expand All @@ -111,7 +111,7 @@
assert t[col].metadata.short_name
assert t[col].metadata.title
assert t[col].metadata.short_unit or col == "unem_ra"
assert t[col].metadata.unit
assert t[col].metadata.unit or col == "unem_ra"

# ## Save the dataset

Expand Down
8 changes: 6 additions & 2 deletions lib/catalog/owid/catalog/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from os import environ
from os.path import join
from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union
from typing import Any, Dict, Iterator, List, Literal, Optional, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -213,6 +213,7 @@ def save(self) -> None:
def update_metadata(
self,
metadata_path: Path,
yaml_params: Optional[Dict[str, Any]] = None,
if_source_exists: SOURCE_EXISTS_OPTIONS = "replace",
if_origins_exist: SOURCE_EXISTS_OPTIONS = "replace",
errors: Literal["ignore", "warn", "raise"] = "raise",
Expand All @@ -222,6 +223,7 @@ def update_metadata(
:param metadata_path: Path to *.meta.yml file with metadata. Check out other metadata files
for examples; this function doesn't do schema validation
:param yaml_params: Additional parameters to pass to the YAML loader
:param if_source_exists: What to do if source already exists in metadata. Possible values:
- "replace" (default): replace existing source with new one
- "append": append new source to existing ones
Expand Down Expand Up @@ -250,7 +252,9 @@ def update_metadata(
if errors == "warn":
warnings.warn(str(e))
continue
table.update_metadata_from_yaml(metadata_path, table_name, if_origins_exist=if_origins_exist)
table.update_metadata_from_yaml(
metadata_path, table_name, if_origins_exist=if_origins_exist, yaml_params=yaml_params
)
table._save_metadata(join(self.path, table.metadata.checked_name + ".meta.json"))

def index(self, catalog_path: Path = Path("/")) -> pd.DataFrame:
Expand Down
7 changes: 6 additions & 1 deletion lib/catalog/owid/catalog/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,10 @@ def _save_metadata(self, filename: str) -> None:
metadata = self.metadata.to_dict() # type: ignore
metadata["primary_key"] = self.primary_key
metadata["fields"] = self._get_fields_as_dict()
json.dump(metadata, ostream, indent=2, default=str)
try:
json.dump(metadata, ostream, indent=2, default=str, allow_nan=False)
except ValueError as e:
raise ValueError(f"metadata contains NaNs:\n{metadata}") from e

@classmethod
def read_csv(cls, path: Union[str, Path], **kwargs) -> "Table":
Expand Down Expand Up @@ -512,6 +515,7 @@ def update_metadata_from_yaml(
self,
path: Union[Path, str],
table_name: str,
yaml_params: Optional[Dict[str, Any]] = None,
extra_variables: Literal["raise", "ignore"] = "raise",
if_origins_exist: SOURCE_EXISTS_OPTIONS = "replace",
) -> None:
Expand All @@ -526,6 +530,7 @@ def update_metadata_from_yaml(
path=path,
table_name=table_name,
extra_variables=extra_variables,
yaml_params=yaml_params,
if_origins_exist=if_origins_exist,
)

Expand Down
13 changes: 8 additions & 5 deletions lib/catalog/owid/catalog/yaml_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import Any, List, Literal, Union
from typing import Any, Dict, List, Literal, Optional, Union

from owid.catalog.meta import SOURCE_EXISTS_OPTIONS

Expand All @@ -12,6 +12,7 @@ def update_metadata_from_yaml(
tb: Table,
path: Union[Path, str],
table_name: str,
yaml_params: Optional[Dict[str, Any]] = None,
extra_variables: Literal["raise", "ignore"] = "raise",
if_origins_exist: SOURCE_EXISTS_OPTIONS = "replace",
) -> None:
Expand All @@ -29,11 +30,13 @@ def update_metadata_from_yaml(
:param path: Path to YAML file.
:param table_name: Name of table, also updates this in the metadata.
"""
# load YAML file as dictionary, add parameters from dataset metadata
# Add parameters from dataset metadata
params = DatasetMeta._params_yaml(tb.metadata.dataset or DatasetMeta())
params.update(yaml_params or {})

# load YAML file as dictionary
# TODO: tb.metadata.dataset reference shouldn't exist
annot = dynamic_yaml_to_dict(
dynamic_yaml_load(path, DatasetMeta._params_yaml(tb.metadata.dataset or DatasetMeta()))
)
annot = dynamic_yaml_to_dict(dynamic_yaml_load(path, params))

tb.metadata.short_name = table_name

Expand Down

0 comments on commit c1050f9

Please sign in to comment.