From 37de655ec2e03a696e218bd2dd9f603962aa2f60 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 23 Aug 2023 23:42:07 -0400 Subject: [PATCH 01/69] Re-architecting the loader classes --- STACpopulator/input.py | 45 +++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index f59328f..2b76a26 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -1,8 +1,13 @@ import logging from abc import ABC, abstractmethod -from typing import Optional +from tempfile import NamedTemporaryFile +from typing import Any, Iterator, MutableMapping, Optional, Tuple +import requests +import siphon +import xncml from colorlog import ColoredFormatter +from numpy import extract from siphon.catalog import TDSCatalog LOGGER = logging.getLogger(__name__) @@ -58,9 +63,12 @@ def reset(self): """Reset the generator.""" self.catalog_head = self.catalog - def __iter__(self): + def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" - yield from self.catalog_head.datasets.items() + if self.catalog_head.datasets.items(): + for item_name, ds in self.catalog_head.datasets.items(): + attrs = self.extract_metadata(ds) + yield item_name, attrs if self._depth > 0: for name, ref in self.catalog_head.catalog_refs.items(): @@ -68,11 +76,34 @@ def __iter__(self): self._depth -= 1 yield from self + def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: + # Get URL for NCML service + url = ds.access_urls["NCML"] + + LOGGER.info("Requesting NcML dataset description") + r = requests.get(url) + + # Write response to temporary file + f = NamedTemporaryFile() + f.write(r.content) + + # Convert NcML to CF-compliant dictionary + attrs = xncml.Dataset(f.name).to_cf_dict() -class RemoteTHREDDSLoader(THREDDSLoader): - def __init__(self, thredds_catalog_url: str, depth: int | None = None) -> None: 
- super().__init__(thredds_catalog_url, depth) - # more stuff to follow based on needs of a concrete implementation + attrs["access_urls"] = ds.access_urls + + return attrs + + +class STACLoader(GenericLoader): + def __init__(self) -> None: + super().__init__() + + def __iter__(self): + raise NotImplementedError + + def reset(self): + raise NotImplementedError class GeoServerLoader(GenericLoader): From 12305af249c7c2d30416f4f78f221fdb5b3aeaa2 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 23 Aug 2023 23:42:27 -0400 Subject: [PATCH 02/69] updating gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 80f0926..0344ff2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ *.pyc STACpopulator.egg-info/ .vscode/ +.venv/ +jupyter/ \ No newline at end of file From ca45cc3b7a09eed006b72766b70a28f7f99ed9e3 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 23 Aug 2023 23:43:11 -0400 Subject: [PATCH 03/69] further developing the ingestion loop --- STACpopulator/populator_base.py | 28 +++++++++++++++++++------ implementations/CMIP6-UofT/add_CMIP6.py | 8 +++++-- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 8c6465c..75a657d 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,6 +1,7 @@ import hashlib import logging from abc import ABC, abstractmethod +from typing import Any, MutableMapping import yaml from colorlog import ColoredFormatter @@ -88,17 +89,32 @@ def ingest(self) -> None: LOGGER.info("Collection successfully created") else: LOGGER.info(f"Collection '{self.collection_name}' already exists") - # for item in self.crawler(self.catalog, **self._crawler_args): - # stac_item = self.process_STAC_item(item) - # self.post_item(stac_item) - def post_item(self, data: dict[str, dict]) -> None: + # Item ingestion loop + for item_name, item_data in self._ingest_pipeline: + 
LOGGER.info(f"Creating STAC representation for {item_name}") + stac_item = self.create_stac_item(item_name, item_data) + if self.validate_stac_item_cv(stac_item): + if self.post_item(stac_item): + LOGGER.info(f"{item_name} successfully posted") + else: + LOGGER.error(f"Posting {item_name} failed") + self.handle_ingestion_error("Posting Error", item_name, item_data) + else: + LOGGER.error(f"Validation failed for item {item_name}") + self.handle_ingestion_error("Validation Error", item_name, item_data) + + def post_item(self, data: dict[str, dict]) -> bool: pass @abstractmethod - def process_stac_item(self): # noqa N802 + def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): pass @abstractmethod - def validate_stac_item_cv(self): # noqa N802 + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + pass + + @abstractmethod + def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: pass diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 349540d..2b74241 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,5 +1,6 @@ import argparse import logging +from typing import Any, MutableMapping from colorlog import ColoredFormatter @@ -39,11 +40,14 @@ def __init__( print(item) super().__init__(stac_host, data_loader, config_filename) - def process_stac_item(self): # noqa N802 + def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): + pass + + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: # TODO: next step is to implement this print("here") - def validate_stac_item_cv(self): + def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # TODO: next step is to implement this pass From 6e500d87a9714c661e13a06c42452295b45fbd52 Mon Sep 17 00:00:00 
2001 From: Deepak Chandan Date: Wed, 23 Aug 2023 23:48:03 -0400 Subject: [PATCH 04/69] moving post_stac_item to stac_utils --- STACpopulator/populator_base.py | 6 ++---- STACpopulator/stac_utils.py | 4 ++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 75a657d..7dfc687 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -10,6 +10,7 @@ from STACpopulator.stac_utils import ( create_stac_collection, post_collection, + post_stac_item, stac_collection_exists, stac_host_reachable, url_validate, @@ -95,7 +96,7 @@ def ingest(self) -> None: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) if self.validate_stac_item_cv(stac_item): - if self.post_item(stac_item): + if post_stac_item(self.stac_host, self.collection_id, stac_item): LOGGER.info(f"{item_name} successfully posted") else: LOGGER.error(f"Posting {item_name} failed") @@ -104,9 +105,6 @@ def ingest(self) -> None: LOGGER.error(f"Validation failed for item {item_name}") self.handle_ingestion_error("Validation Error", item_name, item_data) - def post_item(self, data: dict[str, dict]) -> bool: - pass - @abstractmethod def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): pass diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 743f53a..1a36783 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -97,3 +97,7 @@ def post_collection(stac_host: str, json_data: dict[str, Any]) -> None: r.raise_for_status() else: r.raise_for_status() + + +def post_stac_item(stac_host: str, collection_id: str, data: dict[str, dict]) -> bool: + pass From f93b10d9378102f797f051420919cd3eb916df2d Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 23 Aug 2023 23:49:02 -0400 Subject: [PATCH 05/69] renaming post_collection to post_stac_collection --- 
STACpopulator/populator_base.py | 4 ++-- STACpopulator/stac_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 7dfc687..587b624 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -9,7 +9,7 @@ from STACpopulator.input import GenericLoader from STACpopulator.stac_utils import ( create_stac_collection, - post_collection, + post_stac_collection, post_stac_item, stac_collection_exists, stac_host_reachable, @@ -86,7 +86,7 @@ def ingest(self) -> None: if not stac_collection_exists(self.stac_host, self.collection_id): LOGGER.info(f"Creating collection '{self.collection_name}'") pystac_collection = create_stac_collection(self.collection_id, self._collection_info) - post_collection(self.stac_host, pystac_collection) + post_stac_collection(self.stac_host, pystac_collection) LOGGER.info("Collection successfully created") else: LOGGER.info(f"Collection '{self.collection_name}' already exists") diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 1a36783..86cf0f4 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -76,7 +76,7 @@ def create_stac_collection(collection_id: str, collection_info: dict[str, Any]) return collection.to_dict() -def post_collection(stac_host: str, json_data: dict[str, Any]) -> None: +def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: """ Post a STAC collection. 
From d5a3d2d13d6dde71b178ae1696fa4d2dda2b1317 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 25 Aug 2023 11:42:41 -0400 Subject: [PATCH 06/69] moving collection creation to seaparate function --- STACpopulator/populator_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 587b624..a7a2012 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -60,6 +60,7 @@ def __init__( self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") + self.create_collection() @property def collection_name(self) -> str: @@ -81,8 +82,7 @@ def validate_host(self, stac_host: str) -> str: return stac_host - def ingest(self) -> None: - # First create collection if it doesn't exist + def create_collection(self): if not stac_collection_exists(self.stac_host, self.collection_id): LOGGER.info(f"Creating collection '{self.collection_name}'") pystac_collection = create_stac_collection(self.collection_id, self._collection_info) @@ -91,7 +91,7 @@ def ingest(self) -> None: else: LOGGER.info(f"Collection '{self.collection_name}' already exists") - # Item ingestion loop + def ingest(self) -> None: for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) From 946e72aea117a6f36f940dc31cbbdf3c7d9013d9 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 25 Aug 2023 12:25:30 -0400 Subject: [PATCH 07/69] moving create_stac_collection to STACpopulatorBase --- STACpopulator/populator_base.py | 35 +++++++++++++++++++++++++-------- STACpopulator/stac_utils.py | 27 ------------------------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/STACpopulator/populator_base.py 
b/STACpopulator/populator_base.py index a7a2012..b287b13 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,14 +1,15 @@ import hashlib import logging from abc import ABC, abstractmethod +from datetime import datetime from typing import Any, MutableMapping +import pystac import yaml from colorlog import ColoredFormatter from STACpopulator.input import GenericLoader from STACpopulator.stac_utils import ( - create_stac_collection, post_stac_collection, post_stac_item, stac_collection_exists, @@ -60,7 +61,7 @@ def __init__( self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") - self.create_collection() + self.create_stac_collection() @property def collection_name(self) -> str: @@ -82,14 +83,32 @@ def validate_host(self, stac_host: str) -> str: return stac_host - def create_collection(self): - if not stac_collection_exists(self.stac_host, self.collection_id): + def create_stac_collection(self): + """ + Create a basic STAC collection. + + Returns the collection. 
+ """ + if stac_collection_exists(self.stac_host, self.collection_id): + LOGGER.info(f"Collection '{self.collection_name}' already exists") + else: LOGGER.info(f"Creating collection '{self.collection_name}'") - pystac_collection = create_stac_collection(self.collection_id, self._collection_info) - post_stac_collection(self.stac_host, pystac_collection) + sp_extent = pystac.SpatialExtent([self._collection_info.pop("spatialextent")]) + tmp = self._collection_info.pop("temporalextent") + tmp_extent = pystac.TemporalExtent( + [ + [ + datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, + datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, + ] + ] + ) + self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) + self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) + + collection = pystac.Collection(id=self.collection_id, **self._collection_info) LOGGER.info("Collection successfully created") - else: - LOGGER.info(f"Collection '{self.collection_name}' already exists") + post_stac_collection(self.stac_host, collection.to_dict()) def ingest(self) -> None: for item_name, item_data in self._ingest_pipeline: diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 86cf0f4..37f7071 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,9 +1,7 @@ import os import re -from datetime import datetime from typing import Any -import pystac import requests @@ -51,31 +49,6 @@ def stac_collection_exists(stac_host: str, collection_id: str) -> bool: return r.status_code == 200 -def create_stac_collection(collection_id: str, collection_info: dict[str, Any]) -> dict[str, Any]: - """ - Create a basic STAC collection. - - Returns the collection. 
- """ - - sp_extent = pystac.SpatialExtent([collection_info.pop("spatialextent")]) - tmp = collection_info.pop("temporalextent") - tmp_extent = pystac.TemporalExtent( - [ - [ - datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, - datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, - ] - ] - ) - collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) - collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) - - collection = pystac.Collection(id=collection_id, **collection_info) - - return collection.to_dict() - - def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: """ Post a STAC collection. From 3ee1e6ec0e580e337009cb3c8831af401931b3d8 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 25 Aug 2023 12:33:40 -0400 Subject: [PATCH 08/69] moving all STAC API calls to separate file --- STACpopulator/api_requests.py | 51 +++++++++++++++++++++++++++++++++ STACpopulator/populator_base.py | 6 ++-- STACpopulator/stac_utils.py | 51 --------------------------------- 3 files changed, 54 insertions(+), 54 deletions(-) create mode 100644 STACpopulator/api_requests.py diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py new file mode 100644 index 0000000..59ffc98 --- /dev/null +++ b/STACpopulator/api_requests.py @@ -0,0 +1,51 @@ +import os +from typing import Any + +import requests + + +def stac_host_reachable(url: str) -> bool: + try: + registry = requests.get(url) + registry.raise_for_status() + return True + except (requests.exceptions.RequestException, requests.exceptions.ConnectionError): + return False + + +def stac_collection_exists(stac_host: str, collection_id: str) -> bool: + """ + Get a STAC collection + + Returns the collection JSON. 
+ """ + r = requests.get(os.path.join(stac_host, "collections", collection_id), verify=False) + + return r.status_code == 200 + + +def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: + """ + Post a STAC collection. + + Returns the collection id. + """ + collection_id = json_data["id"] + r = requests.post(os.path.join(stac_host, "collections"), json=json_data, verify=False) + + if r.status_code == 200: + print( + f"{bcolors.OKGREEN}[INFO] Pushed STAC collection [{collection_id}] to [{stac_host}] ({r.status_code}){bcolors.ENDC}" + ) + elif r.status_code == 409: + print( + f"{bcolors.WARNING}[INFO] STAC collection [{collection_id}] already exists on [{stac_host}] ({r.status_code}), updating..{bcolors.ENDC}" + ) + r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) + r.raise_for_status() + else: + r.raise_for_status() + + +def post_stac_item(stac_host: str, collection_id: str, data: dict[str, dict]) -> bool: + pass diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index b287b13..f80a23b 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -8,14 +8,14 @@ import yaml from colorlog import ColoredFormatter -from STACpopulator.input import GenericLoader -from STACpopulator.stac_utils import ( +from STACpopulator.api_requests import ( post_stac_collection, post_stac_item, stac_collection_exists, stac_host_reachable, - url_validate, ) +from STACpopulator.input import GenericLoader +from STACpopulator.stac_utils import url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 37f7071..4f7faba 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,8 +1,4 @@ -import os import re -from typing import Any - -import requests def url_validate(target: str) -> bool: @@ -27,50 
+23,3 @@ def url_validate(target: str) -> bool: re.IGNORECASE, ) return True if re.match(url_regex, target) else False - - -def stac_host_reachable(url: str) -> bool: - try: - registry = requests.get(url) - registry.raise_for_status() - return True - except (requests.exceptions.RequestException, requests.exceptions.ConnectionError): - return False - - -def stac_collection_exists(stac_host: str, collection_id: str) -> bool: - """ - Get a STAC collection - - Returns the collection JSON. - """ - r = requests.get(os.path.join(stac_host, "collections", collection_id), verify=False) - - return r.status_code == 200 - - -def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: - """ - Post a STAC collection. - - Returns the collection id. - """ - collection_id = json_data["id"] - r = requests.post(os.path.join(stac_host, "collections"), json=json_data, verify=False) - - if r.status_code == 200: - print( - f"{bcolors.OKGREEN}[INFO] Pushed STAC collection [{collection_id}] to [{stac_host}] ({r.status_code}){bcolors.ENDC}" - ) - elif r.status_code == 409: - print( - f"{bcolors.WARNING}[INFO] STAC collection [{collection_id}] already exists on [{stac_host}] ({r.status_code}), updating..{bcolors.ENDC}" - ) - r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) - r.raise_for_status() - else: - r.raise_for_status() - - -def post_stac_item(stac_host: str, collection_id: str, data: dict[str, dict]) -> bool: - pass From db008a51460bab5ed95d36521cea56bd3426671f Mon Sep 17 00:00:00 2001 From: David Huard Date: Mon, 28 Aug 2023 10:12:33 -0400 Subject: [PATCH 09/69] Create pydantic data model for CMIP6 CV --- implementations/CMIP6-UofT/add_CMIP6.py | 11 ++- implementations/CMIP6-UofT/datamodel.py | 106 ++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 implementations/CMIP6-UofT/datamodel.py diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 
2b74241..74423c0 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -25,6 +25,7 @@ def __init__( stac_host: str, thredds_catalog_url: str, config_filename: str, + validator: callable = None ) -> None: """Constructor @@ -34,8 +35,12 @@ def __init__( :type thredds_catalog_url: str :param config_filename: Yaml file containing the information about the collection to populate :type config_filename: str + :param: validator: a function that validates and returns a dictionary of attributes. """ + data_loader = THREDDSLoader(thredds_catalog_url) + self.validator = validator + for item in data_loader: print(item) super().__init__(stac_host, data_loader, config_filename) @@ -45,13 +50,17 @@ def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableM def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: # TODO: next step is to implement this - print("here") + self.validator(**item_data) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # TODO: next step is to implement this pass + + + + if __name__ == "__main__": parser = argparse.ArgumentParser(prog="CMIP6 STAC populator") parser.add_argument("stac_host", type=str, help="STAC API address") diff --git a/implementations/CMIP6-UofT/datamodel.py b/implementations/CMIP6-UofT/datamodel.py new file mode 100644 index 0000000..1e54ee9 --- /dev/null +++ b/implementations/CMIP6-UofT/datamodel.py @@ -0,0 +1,106 @@ +""" +Data model for the attributes of a collection. +""" +# TODO: Make this data model compatible with STAC Items and Collections. + +from pydantic import BaseModel, HttpUrl, constr, validator, Field, field_validator +import datetime as dt +from typing import Literal, Optional, Dict +from collections import OrderedDict +import pyessv +from enum import Enum + + +def collection2enum(collection): + """Create Enum based on terms from pyessv collection. 
+ + Parameters + ---------- + collection : pyessv.model.collection.Collection + pyessv collection of terms. + + Returns + ------- + Enum + Enum storing terms and their labels from collection. + """ + mp = {term.name: term.label for term in collection} + return Enum(collection.raw_name.capitalize(), mp, module="base") + + +# CMIP6 controlled vocabulary (CV) +CV = pyessv.WCRP.CMIP6 + +# Enum classes built from the pyessv' CV +Activity = collection2enum(CV.activity_id) +Experiment = collection2enum(CV.experiment_id) +Frequency = collection2enum(CV.frequency) +GridLabel = collection2enum(CV.grid_label) +Institute = collection2enum(CV.institution_id) +Member = collection2enum(CV.member_id) +Resolution = collection2enum(CV.nominal_resolution) +Realm = collection2enum(CV.realm) +Source = collection2enum(CV.source_id) +SourceType = collection2enum(CV.source_type) +SubExperiment = collection2enum(CV.sub_experiment_id) +Table = collection2enum(CV.table_id) +Variable = collection2enum(CV.variable_id) + + +class Attributes(BaseModel): + """Should be extended for each collection.""" + path_: HttpUrl + date_start: dt.datetime + date_end: dt.datetime + version: str = None + license: str = None + + +class CMIP6Attributes(Attributes): + """Data model for catalog entries for CMIP5 simulations. 
+ """ + activity: Activity = Field(..., alias="activity_id") + experiment: Experiment = Field(..., alias="experiment_id") + frequency: Frequency + grid_label: GridLabel + institute: Institute = Field(..., alias="institute_id") + member: Member = Field(..., alias="member_id") + resolution: Resolution = Field(..., alias="nominal_resolution") + realm: Realm = Field(..., alias="realm") + source: Source = Field(..., alias="source_id") + source_type: SourceType = Field(..., alias="source_type") + sub_experiment: SubExperiment = Field(..., alias="sub_experiment_id") + table: Table = Field(..., alias="table_id") + variable: Variable = Field(..., alias="variable_id") + + + +class CatalogEntry(BaseModel): + attributes: Attributes + variables: Dict[str, CFVariable] + + def __init__(self, **kwargs): + # Copy attributes that are deeply nested within groups. + if "THREDDSMetadata" in kwargs["groups"]: + kwargs["attributes"]["path_"] = kwargs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"]["opendap_service"] + kwargs["attributes"]["date_start"] = kwargs["groups"]["CFMetadata"]["attributes"][ + "time_coverage_start"] + kwargs["attributes"]["date_end"] = kwargs["groups"]["CFMetadata"]["attributes"]["time_coverage_end"] + else: + kwargs["attributes"]["path_"] = kwargs["@location"] + + # Ingest data variables only. 
+ variables = OrderedDict() + bounds = [v.get("attributes", {}).get("bounds") for v in kwargs["variables"].values()] + + for name, var in kwargs["variables"].items(): + # Select data variables only + if ('_CoordinateAxisType' not in var.get("attributes", {}) and + name not in var["shape"] and + name not in bounds): + variables[name] = var + variables[name]["name"] = name + + kwargs["variables"] = variables + + super().__init__(**kwargs) From fc2daf359943b1a3affe20fc3e4a6577b1252616 Mon Sep 17 00:00:00 2001 From: David Huard Date: Mon, 28 Aug 2023 11:48:14 -0400 Subject: [PATCH 10/69] create STAC item from TDS NcML response - untested due to missing STAC host --- implementations/CMIP6-UofT/add_CMIP6.py | 136 ++++++++++++++++++++++-- implementations/CMIP6-UofT/datamodel.py | 106 ------------------ pyproject.toml | 4 +- 3 files changed, 132 insertions(+), 114 deletions(-) delete mode 100644 implementations/CMIP6-UofT/datamodel.py diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 74423c0..f4dcd32 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,11 +1,14 @@ import argparse import logging from typing import Any, MutableMapping - +from enum import Enum from colorlog import ColoredFormatter - +from collections import OrderedDict +import pystac +from pydantic import BaseModel, Field from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader +import pyessv # from STACpopulator.metadata_parsers import nc_attrs_from_ncml @@ -19,6 +22,116 @@ LOGGER.propagate = False +def collection2enum(collection): + """Create Enum based on terms from pyessv collection. + + Parameters + ---------- + collection : pyessv.model.collection.Collection + pyessv collection of terms. + + Returns + ------- + Enum + Enum storing terms and their labels from collection. 
+ """ + mp = {term.name: term.label for term in collection} + return Enum(collection.raw_name.capitalize(), mp, module="base") + + +# CMIP6 controlled vocabulary (CV) +CV = pyessv.WCRP.CMIP6 + +# Enum classes built from the pyessv' CV +Activity = collection2enum(CV.activity_id) +Experiment = collection2enum(CV.experiment_id) +Frequency = collection2enum(CV.frequency) +GridLabel = collection2enum(CV.grid_label) +Institute = collection2enum(CV.institution_id) +Member = collection2enum(CV.member_id) +Resolution = collection2enum(CV.nominal_resolution) +Realm = collection2enum(CV.realm) +Source = collection2enum(CV.source_id) +SourceType = collection2enum(CV.source_type) +SubExperiment = collection2enum(CV.sub_experiment_id) +Table = collection2enum(CV.table_id) +Variable = collection2enum(CV.variable_id) + + +class Properties(BaseModel): + """Data model for CMIP6 Controlled Vocabulary. + """ + activity: Activity = Field(..., alias="activity_id") + experiment: Experiment = Field(..., alias="experiment_id") + frequency: Frequency + grid_label: GridLabel + institute: Institute = Field(..., alias="institute_id") + member: Member = Field(..., alias="member_id") + resolution: Resolution = Field(..., alias="nominal_resolution") + realm: Realm = Field(..., alias="realm") + source: Source = Field(..., alias="source_id") + source_type: SourceType = Field(..., alias="source_type") + sub_experiment: SubExperiment = Field(..., alias="sub_experiment_id") + table: Table = Field(..., alias="table_id") + variable: Variable = Field(..., alias="variable_id") + initialization_index: int + physics_index: int + realization_index: int + forcing_index: int + variant_label: str + version: str + license: str = None + grid: str = None + tracking_id: str = Field(..., alias="tracking_id") + + +def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """Create Polygon geometry from CFMetadata.""" + return { + "type": "Polygon", + "coordinates": [ + [ + [ + 
attrs["geospatial_lon_min"], + attrs["geospatial_lat_min"], + ], + [ + attrs["geospatial_lon_min"], + attrs["geospatial_lat_max"], + ], + [ + attrs["geospatial_lon_max"], + attrs["geospatial_lat_max"], + ], + [ + attrs["geospatial_lon_max"], + attrs["geospatial_lat_min"], + ], + [ + attrs["geospatial_lon_min"], + attrs["geospatial_lat_min"], + ], + ] + ], + } + + +def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: + """Create BBOX from CFMetadata.""" + return [ + attrs["geospatial_lon_min"], + attrs["geospatial_lat_min"], + attrs["geospatial_lon_max"], + attrs["geospatial_lat_max"], + ] + + +def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: + """Return unique ID for CMIP6 data collection (multiple variables).""" + keys = ["activity_id", "institution_id", "source_id", "experiment_id", "member_id", "table_id", "grid_label", "version"] + return "_".join(attrs[k] for k in keys) + + class CMIP6populator(STACpopulatorBase): def __init__( self, @@ -41,6 +154,7 @@ def __init__( data_loader = THREDDSLoader(thredds_catalog_url) self.validator = validator + for item in data_loader: print(item) super().__init__(stac_host, data_loader, config_filename) @@ -50,15 +164,23 @@ def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableM def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: # TODO: next step is to implement this - self.validator(**item_data) - - def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: - # TODO: next step is to implement this - pass + # Create STAC item geometry from CFMetadata + item = dict( + id = make_cmip6_id(item_data["attributes"]), + geometry = ncattrs_to_geometry(item_data["groups"]["CFMetadata"]["attributes"]), + bbox = ncattrs_to_bbox(item_data["groups"]["CFMetadata"]["attributes"]), + properties = Properties(item_data["attributes"]).dump_model(), + start_datetime = 
item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_start"], + end_datetime = item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_end"], + ) + return pystac.Item(**item) + def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: + # Validation is done at the item creating stage, using the Properties class. + return True if __name__ == "__main__": diff --git a/implementations/CMIP6-UofT/datamodel.py b/implementations/CMIP6-UofT/datamodel.py deleted file mode 100644 index 1e54ee9..0000000 --- a/implementations/CMIP6-UofT/datamodel.py +++ /dev/null @@ -1,106 +0,0 @@ -""" -Data model for the attributes of a collection. -""" -# TODO: Make this data model compatible with STAC Items and Collections. - -from pydantic import BaseModel, HttpUrl, constr, validator, Field, field_validator -import datetime as dt -from typing import Literal, Optional, Dict -from collections import OrderedDict -import pyessv -from enum import Enum - - -def collection2enum(collection): - """Create Enum based on terms from pyessv collection. - - Parameters - ---------- - collection : pyessv.model.collection.Collection - pyessv collection of terms. - - Returns - ------- - Enum - Enum storing terms and their labels from collection. 
- """ - mp = {term.name: term.label for term in collection} - return Enum(collection.raw_name.capitalize(), mp, module="base") - - -# CMIP6 controlled vocabulary (CV) -CV = pyessv.WCRP.CMIP6 - -# Enum classes built from the pyessv' CV -Activity = collection2enum(CV.activity_id) -Experiment = collection2enum(CV.experiment_id) -Frequency = collection2enum(CV.frequency) -GridLabel = collection2enum(CV.grid_label) -Institute = collection2enum(CV.institution_id) -Member = collection2enum(CV.member_id) -Resolution = collection2enum(CV.nominal_resolution) -Realm = collection2enum(CV.realm) -Source = collection2enum(CV.source_id) -SourceType = collection2enum(CV.source_type) -SubExperiment = collection2enum(CV.sub_experiment_id) -Table = collection2enum(CV.table_id) -Variable = collection2enum(CV.variable_id) - - -class Attributes(BaseModel): - """Should be extended for each collection.""" - path_: HttpUrl - date_start: dt.datetime - date_end: dt.datetime - version: str = None - license: str = None - - -class CMIP6Attributes(Attributes): - """Data model for catalog entries for CMIP5 simulations. - """ - activity: Activity = Field(..., alias="activity_id") - experiment: Experiment = Field(..., alias="experiment_id") - frequency: Frequency - grid_label: GridLabel - institute: Institute = Field(..., alias="institute_id") - member: Member = Field(..., alias="member_id") - resolution: Resolution = Field(..., alias="nominal_resolution") - realm: Realm = Field(..., alias="realm") - source: Source = Field(..., alias="source_id") - source_type: SourceType = Field(..., alias="source_type") - sub_experiment: SubExperiment = Field(..., alias="sub_experiment_id") - table: Table = Field(..., alias="table_id") - variable: Variable = Field(..., alias="variable_id") - - - -class CatalogEntry(BaseModel): - attributes: Attributes - variables: Dict[str, CFVariable] - - def __init__(self, **kwargs): - # Copy attributes that are deeply nested within groups. 
- if "THREDDSMetadata" in kwargs["groups"]: - kwargs["attributes"]["path_"] = kwargs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"]["opendap_service"] - kwargs["attributes"]["date_start"] = kwargs["groups"]["CFMetadata"]["attributes"][ - "time_coverage_start"] - kwargs["attributes"]["date_end"] = kwargs["groups"]["CFMetadata"]["attributes"]["time_coverage_end"] - else: - kwargs["attributes"]["path_"] = kwargs["@location"] - - # Ingest data variables only. - variables = OrderedDict() - bounds = [v.get("attributes", {}).get("bounds") for v in kwargs["variables"].values()] - - for name, var in kwargs["variables"].items(): - # Select data variables only - if ('_CoordinateAxisType' not in var.get("attributes", {}) and - name not in var["shape"] and - name not in bounds): - variables[name] = var - variables[name]["name"] = name - - kwargs["variables"] = variables - - super().__init__(**kwargs) diff --git a/pyproject.toml b/pyproject.toml index 1c94eaf..d1d10f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,9 @@ dependencies = [ "colorlog", "pyyaml", "siphon", - "pystac" + "pystac", + "pydantic", + "pyessv" ] [tool.setuptools] From ab09580c43e9e1a5b119e0daa6ccbc6af552124f Mon Sep 17 00:00:00 2001 From: David Huard Date: Mon, 28 Aug 2023 13:33:20 -0400 Subject: [PATCH 11/69] Suggestions from Deepak. 
Use xncml 0.3 from_text --- STACpopulator/input.py | 48 ++++++++++++++++--- STACpopulator/stac_utils.py | 18 ++++++++ implementations/CMIP6-UofT/add_CMIP6.py | 61 ++----------------------- pyproject.toml | 1 + 4 files changed, 66 insertions(+), 62 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 2b76a26..807a335 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -1,6 +1,5 @@ import logging from abc import ABC, abstractmethod -from tempfile import NamedTemporaryFile from typing import Any, Iterator, MutableMapping, Optional, Tuple import requests @@ -83,17 +82,54 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An LOGGER.info("Requesting NcML dataset description") r = requests.get(url) - # Write response to temporary file - f = NamedTemporaryFile() - f.write(r.content) - # Convert NcML to CF-compliant dictionary - attrs = xncml.Dataset(f.name).to_cf_dict() + attrs = xncml.Dataset.from_text(r.content).to_cf_dict() attrs["access_urls"] = ds.access_urls return attrs + @staticmethod + def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """Create Polygon geometry from CFMetadata.""" + return { + "type": "Polygon", + "coordinates": [ + [ + [ + attrs["geospatial_lon_min"], + attrs["geospatial_lat_min"], + ], + [ + attrs["geospatial_lon_min"], + attrs["geospatial_lat_max"], + ], + [ + attrs["geospatial_lon_max"], + attrs["geospatial_lat_max"], + ], + [ + attrs["geospatial_lon_max"], + attrs["geospatial_lat_min"], + ], + [ + attrs["geospatial_lon_min"], + attrs["geospatial_lat_min"], + ], + ] + ], + } + + @staticmethod + def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: + """Create BBOX from CFMetadata.""" + return [ + attrs["geospatial_lon_min"], + attrs["geospatial_lat_min"], + attrs["geospatial_lon_max"], + attrs["geospatial_lat_max"], + ] + class STACLoader(GenericLoader): def __init__(self) -> None: diff --git a/STACpopulator/stac_utils.py 
b/STACpopulator/stac_utils.py index 4f7faba..fd59677 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,4 +1,5 @@ import re +from enum import Enum def url_validate(target: str) -> bool: @@ -23,3 +24,20 @@ def url_validate(target: str) -> bool: re.IGNORECASE, ) return True if re.match(url_regex, target) else False + + +def collection2enum(collection): + """Create Enum based on terms from pyessv collection. + + Parameters + ---------- + collection : pyessv.model.collection.Collection + pyessv collection of terms. + + Returns + ------- + Enum + Enum storing terms and their labels from collection. + """ + mp = {term.name: term.label for term in collection} + return Enum(collection.raw_name.capitalize(), mp, module="base") diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index f4dcd32..78e7988 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,13 +1,14 @@ import argparse import logging from typing import Any, MutableMapping -from enum import Enum + from colorlog import ColoredFormatter from collections import OrderedDict import pystac from pydantic import BaseModel, Field from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader +from STACpopulator.stac_utils import collection2enum import pyessv # from STACpopulator.metadata_parsers import nc_attrs_from_ncml @@ -22,21 +23,7 @@ LOGGER.propagate = False -def collection2enum(collection): - """Create Enum based on terms from pyessv collection. - - Parameters - ---------- - collection : pyessv.model.collection.Collection - pyessv collection of terms. - Returns - ------- - Enum - Enum storing terms and their labels from collection. 
- """ - mp = {term.name: term.label for term in collection} - return Enum(collection.raw_name.capitalize(), mp, module="base") # CMIP6 controlled vocabulary (CV) @@ -85,45 +72,7 @@ class Properties(BaseModel): tracking_id: str = Field(..., alias="tracking_id") -def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - """Create Polygon geometry from CFMetadata.""" - return { - "type": "Polygon", - "coordinates": [ - [ - [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], - ], - [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_max"], - ], - [ - attrs["geospatial_lon_max"], - attrs["geospatial_lat_max"], - ], - [ - attrs["geospatial_lon_max"], - attrs["geospatial_lat_min"], - ], - [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], - ], - ] - ], - } - - -def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: - """Create BBOX from CFMetadata.""" - return [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], - attrs["geospatial_lon_max"], - attrs["geospatial_lat_max"], - ] + def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: @@ -168,8 +117,8 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) # Create STAC item geometry from CFMetadata item = dict( id = make_cmip6_id(item_data["attributes"]), - geometry = ncattrs_to_geometry(item_data["groups"]["CFMetadata"]["attributes"]), - bbox = ncattrs_to_bbox(item_data["groups"]["CFMetadata"]["attributes"]), + geometry = THREDDSLoader.ncattrs_to_geometry(item_data["groups"]["CFMetadata"]["attributes"]), + bbox = THREDDSLoader.ncattrs_to_bbox(item_data["groups"]["CFMetadata"]["attributes"]), properties = Properties(item_data["attributes"]).dump_model(), start_datetime = item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_start"], end_datetime = item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_end"], diff --git a/pyproject.toml b/pyproject.toml index d1d10f0..dc08b7b 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "pyyaml", "siphon", "pystac", + "xncml", "pydantic", "pyessv" ] From 86dddd42260f8979b27799a62748815db5c5a801 Mon Sep 17 00:00:00 2001 From: David Huard Date: Mon, 28 Aug 2023 13:34:05 -0400 Subject: [PATCH 12/69] black --- implementations/CMIP6-UofT/add_CMIP6.py | 34 ++++++++++--------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 78e7988..b6cebff 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -22,10 +22,6 @@ LOGGER.setLevel(logging.INFO) LOGGER.propagate = False - - - - # CMIP6 controlled vocabulary (CV) CV = pyessv.WCRP.CMIP6 @@ -72,22 +68,20 @@ class Properties(BaseModel): tracking_id: str = Field(..., alias="tracking_id") - - - def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: """Return unique ID for CMIP6 data collection (multiple variables).""" - keys = ["activity_id", "institution_id", "source_id", "experiment_id", "member_id", "table_id", "grid_label", "version"] + keys = ["activity_id", "institution_id", "source_id", "experiment_id", "member_id", "table_id", "grid_label", + "version"] return "_".join(attrs[k] for k in keys) class CMIP6populator(STACpopulatorBase): def __init__( - self, - stac_host: str, - thredds_catalog_url: str, - config_filename: str, - validator: callable = None + self, + stac_host: str, + thredds_catalog_url: str, + config_filename: str, + validator: callable = None ) -> None: """Constructor @@ -103,7 +97,6 @@ def __init__( data_loader = THREDDSLoader(thredds_catalog_url) self.validator = validator - for item in data_loader: print(item) super().__init__(stac_host, data_loader, config_filename) @@ -116,17 +109,16 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) # Create STAC item geometry from CFMetadata item = dict( - id = 
make_cmip6_id(item_data["attributes"]), - geometry = THREDDSLoader.ncattrs_to_geometry(item_data["groups"]["CFMetadata"]["attributes"]), - bbox = THREDDSLoader.ncattrs_to_bbox(item_data["groups"]["CFMetadata"]["attributes"]), - properties = Properties(item_data["attributes"]).dump_model(), - start_datetime = item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_start"], - end_datetime = item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_end"], + id=make_cmip6_id(item_data["attributes"]), + geometry=THREDDSLoader.ncattrs_to_geometry(item_data["groups"]["CFMetadata"]["attributes"]), + bbox=THREDDSLoader.ncattrs_to_bbox(item_data["groups"]["CFMetadata"]["attributes"]), + properties=Properties(item_data["attributes"]).dump_model(), + start_datetime=item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_start"], + end_datetime=item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_end"], ) return pystac.Item(**item) - def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. 
return True From 1c7aa53bfb671a8599fe1fc20f5ceac7fdacbaca Mon Sep 17 00:00:00 2001 From: David Huard Date: Mon, 28 Aug 2023 14:17:56 -0400 Subject: [PATCH 13/69] Fix errors in metadata parsing --- implementations/CMIP6-UofT/add_CMIP6.py | 71 ++++++++++++++++--------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index b6cebff..55c7054 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,17 +1,16 @@ import argparse import logging -from typing import Any, MutableMapping +from typing import Any, MutableMapping, Literal, List +import datetime as dt from colorlog import ColoredFormatter -from collections import OrderedDict import pystac -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, FieldValidationInfo, field_validator, ValidationError from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader from STACpopulator.stac_utils import collection2enum import pyessv -# from STACpopulator.metadata_parsers import nc_attrs_from_ncml LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -30,15 +29,15 @@ Experiment = collection2enum(CV.experiment_id) Frequency = collection2enum(CV.frequency) GridLabel = collection2enum(CV.grid_label) -Institute = collection2enum(CV.institution_id) -Member = collection2enum(CV.member_id) +Institution = collection2enum(CV.institution_id) +# Member = collection2enum(CV.member_id) # This is empty Resolution = collection2enum(CV.nominal_resolution) Realm = collection2enum(CV.realm) Source = collection2enum(CV.source_id) SourceType = collection2enum(CV.source_type) SubExperiment = collection2enum(CV.sub_experiment_id) Table = collection2enum(CV.table_id) -Variable = collection2enum(CV.variable_id) +Variable = collection2enum(CV.variable_id) # This is empty class 
Properties(BaseModel): @@ -48,30 +47,47 @@ class Properties(BaseModel): experiment: Experiment = Field(..., alias="experiment_id") frequency: Frequency grid_label: GridLabel - institute: Institute = Field(..., alias="institute_id") - member: Member = Field(..., alias="member_id") + institution: Institution = Field(..., alias="institution_id") resolution: Resolution = Field(..., alias="nominal_resolution") - realm: Realm = Field(..., alias="realm") + realm: List[Realm] = Field(..., alias="realm") source: Source = Field(..., alias="source_id") - source_type: SourceType = Field(..., alias="source_type") - sub_experiment: SubExperiment = Field(..., alias="sub_experiment_id") + source_type: List[SourceType] = Field(..., alias="source_type") + sub_experiment: SubExperiment | Literal['none'] = Field(..., alias="sub_experiment_id") table: Table = Field(..., alias="table_id") - variable: Variable = Field(..., alias="variable_id") + variable: Variable = str # Field(..., alias="variable_id") + variant_label: str initialization_index: int physics_index: int realization_index: int forcing_index: int variant_label: str - version: str + tracking_id: str + version: str = None license: str = None grid: str = None - tracking_id: str = Field(..., alias="tracking_id") + + @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") + @classmethod + def first_item(cls, v: list, info: FieldValidationInfo): + """Pick single item from list.""" + assert len(v) == 1, f"{info.field_name} must have one item only." 
+ return v[0] + + @field_validator("realm", "source_type", mode="before") + @classmethod + def split(cls, v: str, info: FieldValidationInfo): + """Split string into list.""" + return v.split(" ") + + +class STACItem(BaseModel): + start_datetime: dt.datetime + end_datetime: dt.datetime def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: """Return unique ID for CMIP6 data collection (multiple variables).""" - keys = ["activity_id", "institution_id", "source_id", "experiment_id", "member_id", "table_id", "grid_label", - "version"] + keys = ["activity_id", "institution_id", "source_id", "experiment_id", "variant_label", "table_id", "grid_label",] return "_".join(attrs[k] for k in keys) @@ -97,8 +113,9 @@ def __init__( data_loader = THREDDSLoader(thredds_catalog_url) self.validator = validator - for item in data_loader: - print(item) + for name, item in data_loader: + # self.create_stac_item(name, item) + print(name) super().__init__(stac_host, data_loader, config_filename) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): @@ -106,17 +123,21 @@ def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableM def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: # TODO: next step is to implement this + attrs = item_data["attributes"] + meta = item_data["groups"]["CFMetadata"]["attributes"] # Create STAC item geometry from CFMetadata item = dict( - id=make_cmip6_id(item_data["attributes"]), - geometry=THREDDSLoader.ncattrs_to_geometry(item_data["groups"]["CFMetadata"]["attributes"]), - bbox=THREDDSLoader.ncattrs_to_bbox(item_data["groups"]["CFMetadata"]["attributes"]), - properties=Properties(item_data["attributes"]).dump_model(), - start_datetime=item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_start"], - end_datetime=item_data["groups"]["CFMetadata"]["attributes"]["time_coverage_end"], + id=make_cmip6_id(attrs), + 
geometry=THREDDSLoader.ncattrs_to_geometry(meta), + bbox=THREDDSLoader.ncattrs_to_bbox(meta), + properties=Properties(**attrs).model_dump(), + datetime=None, ) + item.update(STACItem(start_datetime=meta["time_coverage_start"], + end_datetime=meta["time_coverage_end"],).model_dump()) + return pystac.Item(**item) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: From dd171a6ee55da8408b46d48ab2ec5b95ac5d4fb7 Mon Sep 17 00:00:00 2001 From: David Huard Date: Mon, 28 Aug 2023 16:15:02 -0400 Subject: [PATCH 14/69] change dchandan user to testuser --- docker-compose.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 23ffae5..0fc7d25 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,3 +1,5 @@ +version: "3.4" + x-logging: &default-logging driver: "json-file" options: @@ -13,7 +15,7 @@ services: ports: - "8880:8000" environment: - - POSTGRES_USER=dchandan + - POSTGRES_USER=testuser - POSTGRES_PASS=password - POSTGRES_DBNAME=postgis - POSTGRES_HOST_READER=stac-db @@ -30,7 +32,7 @@ services: - POSTGRES_USER=testuser - POSTGRES_PASSWORD=password - POSTGRES_DB=postgis - - PGUSER=dchandan + - PGUSER=testuser - PGPASSWORD=password - PGHOST=localhost - PGDATABASE=postgis From 99d17b5a751648ce78f5ee8e104c0516ec2ceb82 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 30 Aug 2023 17:38:18 -0400 Subject: [PATCH 15/69] adding type hints to collection2enum --- STACpopulator/stac_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index fd59677..160f07b 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,6 +1,14 @@ import re from enum import Enum +import pyessv + +try: + from enum import EnumType as enumtype +except ImportError: + # < Python 3.11 + from enum import EnumMeta as enumtype + def url_validate(target: str) -> bool: """Validate whether a supplied URL 
is reliably written. @@ -26,7 +34,7 @@ def url_validate(target: str) -> bool: return True if re.match(url_regex, target) else False -def collection2enum(collection): +def collection2enum(collection: pyessv.model.collection.Collection) -> enumtype: """Create Enum based on terms from pyessv collection. Parameters From 94a43a56da7cb9f87e67ff71ca60f03ae342a237 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 30 Aug 2023 17:48:13 -0400 Subject: [PATCH 16/69] implementation for post_stac_item + logger changes --- STACpopulator/api_requests.py | 51 ++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py index 59ffc98..773cca5 100644 --- a/STACpopulator/api_requests.py +++ b/STACpopulator/api_requests.py @@ -1,7 +1,19 @@ +import logging import os -from typing import Any +from typing import Any, Optional +from urllib.parse import urljoin import requests +from colorlog import ColoredFormatter + +LOGGER = logging.getLogger(__name__) +LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" +formatter = ColoredFormatter(LOGFORMAT) +stream = logging.StreamHandler() +stream.setFormatter(formatter) +LOGGER.addHandler(stream) +LOGGER.setLevel(logging.INFO) +LOGGER.propagate = False def stac_host_reachable(url: str) -> bool: @@ -24,7 +36,7 @@ def stac_collection_exists(stac_host: str, collection_id: str) -> bool: return r.status_code == 200 -def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: +def post_stac_collection(stac_host: str, json_data: dict[str, Any], update: Optional[bool] = True) -> None: """ Post a STAC collection. 
@@ -34,18 +46,33 @@ def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: r = requests.post(os.path.join(stac_host, "collections"), json=json_data, verify=False) if r.status_code == 200: - print( - f"{bcolors.OKGREEN}[INFO] Pushed STAC collection [{collection_id}] to [{stac_host}] ({r.status_code}){bcolors.ENDC}" - ) + LOGGER.info(f"Created STAC collection {collection_id}") elif r.status_code == 409: - print( - f"{bcolors.WARNING}[INFO] STAC collection [{collection_id}] already exists on [{stac_host}] ({r.status_code}), updating..{bcolors.ENDC}" - ) - r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) - r.raise_for_status() + if update: + LOGGER.info(f"STAC collection {collection_id} already exists. Updating.") + r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) + r.raise_for_status() + else: + LOGGER.info(f"STAC collection {collection_id} already exists.") else: r.raise_for_status() -def post_stac_item(stac_host: str, collection_id: str, data: dict[str, dict]) -> bool: - pass +def post_stac_item( + stac_host: str, collection_id: str, json_data: dict[str, dict], update: Optional[bool] = True +) -> bool: + item_id = json_data["id"] + + r = requests.post(urljoin(stac_host, f"collections/{collection_id}/items"), json=json_data) + + if r.status_code == 200: + LOGGER.info(f"Created item {item_id}") + elif r.status_code == 409: + if update: + LOGGER.info(f"Item {item_id} already exists. 
Updating.") + r = requests.put(urljoin(stac_host, f"collections/{collection_id}/items"), json=json_data) + r.raise_for_status() + else: + LOGGER.info(f"Item {item_id} already exists.") + else: + r.raise_for_status() From e193c281a68a2a7abf0b0bc93378398234f28742 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 30 Aug 2023 17:48:40 -0400 Subject: [PATCH 17/69] removing validator from CMIP6populator --- implementations/CMIP6-UofT/add_CMIP6.py | 53 +++++++++++++++---------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 55c7054..b2e483d 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,16 +1,21 @@ import argparse -import logging -from typing import Any, MutableMapping, Literal, List import datetime as dt +import logging +from typing import Any, List, Literal, MutableMapping -from colorlog import ColoredFormatter +import pyessv import pystac -from pydantic import BaseModel, Field, FieldValidationInfo, field_validator, ValidationError +from colorlog import ColoredFormatter +from pydantic import ( + BaseModel, + Field, + FieldValidationInfo, + ValidationError, + field_validator, +) + from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader -from STACpopulator.stac_utils import collection2enum -import pyessv - LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -41,8 +46,8 @@ class Properties(BaseModel): - """Data model for CMIP6 Controlled Vocabulary. 
- """ + """Data model for CMIP6 Controlled Vocabulary.""" + activity: Activity = Field(..., alias="activity_id") experiment: Experiment = Field(..., alias="experiment_id") frequency: Frequency @@ -52,7 +57,7 @@ class Properties(BaseModel): realm: List[Realm] = Field(..., alias="realm") source: Source = Field(..., alias="source_id") source_type: List[SourceType] = Field(..., alias="source_type") - sub_experiment: SubExperiment | Literal['none'] = Field(..., alias="sub_experiment_id") + sub_experiment: SubExperiment | Literal["none"] = Field(..., alias="sub_experiment_id") table: Table = Field(..., alias="table_id") variable: Variable = str # Field(..., alias="variable_id") variant_label: str @@ -87,18 +92,20 @@ class STACItem(BaseModel): def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: """Return unique ID for CMIP6 data collection (multiple variables).""" - keys = ["activity_id", "institution_id", "source_id", "experiment_id", "variant_label", "table_id", "grid_label",] + keys = [ + "activity_id", + "institution_id", + "source_id", + "experiment_id", + "variant_label", + "table_id", + "grid_label", + ] return "_".join(attrs[k] for k in keys) class CMIP6populator(STACpopulatorBase): - def __init__( - self, - stac_host: str, - thredds_catalog_url: str, - config_filename: str, - validator: callable = None - ) -> None: + def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: str) -> None: """Constructor :param stac_host: URL to the STAC API @@ -107,11 +114,9 @@ def __init__( :type thredds_catalog_url: str :param config_filename: Yaml file containing the information about the collection to populate :type config_filename: str - :param: validator: a function that validates and returns a dictionary of attributes. 
""" data_loader = THREDDSLoader(thredds_catalog_url) - self.validator = validator for name, item in data_loader: # self.create_stac_item(name, item) @@ -135,8 +140,12 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) datetime=None, ) - item.update(STACItem(start_datetime=meta["time_coverage_start"], - end_datetime=meta["time_coverage_end"],).model_dump()) + item.update( + STACItem( + start_datetime=meta["time_coverage_start"], + end_datetime=meta["time_coverage_end"], + ).model_dump() + ) return pystac.Item(**item) From a0770a72ea6626da91a493fc46990e9a6ab08ba0 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 30 Aug 2023 17:48:51 -0400 Subject: [PATCH 18/69] makefile --- Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e9e1f6f --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +IMP_DIR = /Users/dchandan/DACCS/Codes/stac-populator/implementations +STAC_HOST = http://localhost:8880/stac + +testcmip6: + python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html $(IMP_DIR)/CMIP6-UofT/CMIP6.yml + + +starthost: + docker compose up + +stophost: + docker compose down + +del_docker_volume: stophost + docker volume rm stac-populator_stac-db + +resethost: del_docker_volume starthost From b481f370c386f91f6c696a80859ba7a06aa4cc16 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 31 Aug 2023 14:58:46 -0400 Subject: [PATCH 19/69] simplifying the ingestion logic --- STACpopulator/populator_base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index f80a23b..b9f6ec9 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -107,22 +107,17 @@ def create_stac_collection(self): 
self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) collection = pystac.Collection(id=self.collection_id, **self._collection_info) - LOGGER.info("Collection successfully created") post_stac_collection(self.stac_host, collection.to_dict()) def ingest(self) -> None: for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) - if self.validate_stac_item_cv(stac_item): - if post_stac_item(self.stac_host, self.collection_id, stac_item): - LOGGER.info(f"{item_name} successfully posted") - else: - LOGGER.error(f"Posting {item_name} failed") - self.handle_ingestion_error("Posting Error", item_name, item_data) - else: - LOGGER.error(f"Validation failed for item {item_name}") - self.handle_ingestion_error("Validation Error", item_name, item_data) + try: + post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) + except Exception: + LOGGER.error(f"Failed adding STAC item {item_name}") + self.handle_ingestion_error("Posting Error", item_name, item_data) @abstractmethod def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): From d28ab7492bdcbb481b1fbbd98606a9bd449cf2e7 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 31 Aug 2023 14:59:56 -0400 Subject: [PATCH 20/69] comments and small changes to the posting functions --- STACpopulator/api_requests.py | 37 +++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py index 773cca5..f2dcb0a 100644 --- a/STACpopulator/api_requests.py +++ b/STACpopulator/api_requests.py @@ -37,36 +37,53 @@ def stac_collection_exists(stac_host: str, collection_id: str) -> bool: def post_stac_collection(stac_host: str, json_data: dict[str, Any], update: Optional[bool] = True) -> None: - """ - Post a STAC collection. 
- - Returns the collection id. + """Post/create a collection on the STAC host + + :param stac_host: address of the STAC host + :type stac_host: str + :param json_data: JSON representation of the STAC collection + :type json_data: dict[str, Any] + :param update: if True, update the collection on the host server if it is already present, defaults to True + :type update: Optional[bool], optional """ collection_id = json_data["id"] r = requests.post(os.path.join(stac_host, "collections"), json=json_data, verify=False) if r.status_code == 200: - LOGGER.info(f"Created STAC collection {collection_id}") + LOGGER.info(f"Collection {collection_id} successfully created") elif r.status_code == 409: if update: - LOGGER.info(f"STAC collection {collection_id} already exists. Updating.") + LOGGER.info(f"Collection {collection_id} already exists. Updating.") r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) r.raise_for_status() else: - LOGGER.info(f"STAC collection {collection_id} already exists.") + LOGGER.info(f"Collection {collection_id} already exists.") else: r.raise_for_status() def post_stac_item( - stac_host: str, collection_id: str, json_data: dict[str, dict], update: Optional[bool] = True -) -> bool: + stac_host: str, collection_id: str, item_name: str, json_data: dict[str, dict], update: Optional[bool] = True +) -> None: + """Post a STAC item to the host server. 
+ + :param stac_host: address of the STAC host + :type stac_host: str + :param collection_id: ID of the collection to which to post this item + :type collection_id: str + :param item_name: name of the STAC item + :type item_name: str + :param json_data: JSON representation of the STAC item + :type json_data: dict[str, dict] + :param update: if True, update the item on the host server if it is already present, defaults to True + :type update: Optional[bool], optional + """ item_id = json_data["id"] r = requests.post(urljoin(stac_host, f"collections/{collection_id}/items"), json=json_data) if r.status_code == 200: - LOGGER.info(f"Created item {item_id}") + LOGGER.info(f"Item {item_name} successfully added") elif r.status_code == 409: if update: LOGGER.info(f"Item {item_id} already exists. Updating.") From 933d00346f8e3f9e76c482b3152fe05c4e6c6477 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 1 Sep 2023 09:10:26 -0400 Subject: [PATCH 21/69] fixing issue with thredds metadata for attributes with type tag --- STACpopulator/input.py | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 807a335..c83344e 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -92,30 +92,22 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An @staticmethod def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Create Polygon geometry from CFMetadata.""" + # Oddly, for any attribute tag that has a "value" attribute, ncml metadata is returned + # as a list (of length 1). So, here, I convert the list to a value. 
+ lon_min = attrs["geospatial_lon_min"][0] + lon_max = attrs["geospatial_lon_max"][0] + lat_min = attrs["geospatial_lat_min"][0] + lat_max = attrs["geospatial_lat_max"][0] + return { "type": "Polygon", "coordinates": [ [ - [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], - ], - [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_max"], - ], - [ - attrs["geospatial_lon_max"], - attrs["geospatial_lat_max"], - ], - [ - attrs["geospatial_lon_max"], - attrs["geospatial_lat_min"], - ], - [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], - ], + [lon_min, lat_min], + [lon_min, lat_max], + [lon_max, lat_max], + [lon_max, lat_min], + [lon_min, lat_min], ] ], } @@ -124,10 +116,10 @@ def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: """Create BBOX from CFMetadata.""" return [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], - attrs["geospatial_lon_max"], - attrs["geospatial_lat_max"], + attrs["geospatial_lon_min"][0], + attrs["geospatial_lat_min"][0], + attrs["geospatial_lon_max"][0], + attrs["geospatial_lat_max"][0], ] From f697be9c00845ab93c9faafee8424cb9b76ecfb4 Mon Sep 17 00:00:00 2001 From: David Huard Date: Tue, 5 Sep 2023 17:37:33 -0400 Subject: [PATCH 22/69] Replaced enums by literal for CMIP6 CV --- README.md | 4 +-- STACpopulator/stac_utils.py | 7 +++++ implementations/CMIP6-UofT/add_CMIP6.py | 34 +++++++++++++------------ 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index bcac544..808926c 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,6 @@ Currently, one implementation of `STACpopulatorBase` is provided in [add_CMIP6.p The provided `docker-compose` file can be used to launch a test STAC server. 
The `add_CMIP6.py` script can be run as: ``` -python implementations/add_CMIP6.py http://localhost:8880/stac/ https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/datasets/simulations/bias_adjusted/catalog.html implementations/CMIP6.yml +python implementations/CMIP6-UofT/add_CMIP6.py http://localhost:8880/stac/ https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html implementations/CMIP6-UofT/CMIP6.yml ``` -Note: in the script above, I am currently using a sample THREDDS catalog URL and not one relevant to the global scale CMIP6 data. \ No newline at end of file +Note: in the script above, I am currently using a sample THREDDS catalog URL and not one relevant to the global scale CMIP6 data. diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index fd59677..d0ee5a9 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -41,3 +41,10 @@ def collection2enum(collection): """ mp = {term.name: term.label for term in collection} return Enum(collection.raw_name.capitalize(), mp, module="base") + + +def collection2literal(collection): + import typing + terms = tuple(term.label for term in collection) + return typing.Literal[terms] + diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 55c7054..c3ce0d0 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, Field, FieldValidationInfo, field_validator, ValidationError from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader -from STACpopulator.stac_utils import collection2enum +from STACpopulator.stac_utils import collection2literal import pyessv @@ -24,20 +24,22 @@ # CMIP6 controlled vocabulary (CV) CV = pyessv.WCRP.CMIP6 + + # Enum classes built from the pyessv' CV -Activity = collection2enum(CV.activity_id) -Experiment = collection2enum(CV.experiment_id) 
-Frequency = collection2enum(CV.frequency) -GridLabel = collection2enum(CV.grid_label) -Institution = collection2enum(CV.institution_id) -# Member = collection2enum(CV.member_id) # This is empty -Resolution = collection2enum(CV.nominal_resolution) -Realm = collection2enum(CV.realm) -Source = collection2enum(CV.source_id) -SourceType = collection2enum(CV.source_type) -SubExperiment = collection2enum(CV.sub_experiment_id) -Table = collection2enum(CV.table_id) -Variable = collection2enum(CV.variable_id) # This is empty +Activity = collection2literal(CV.activity_id) +Experiment = collection2literal(CV.experiment_id) +Frequency = collection2literal(CV.frequency) +GridLabel = collection2literal(CV.grid_label) +Institution = collection2literal(CV.institution_id) +# Member = collection2literal(CV.member_id) # This is empty +Resolution = collection2literal(CV.nominal_resolution) +Realm = collection2literal(CV.realm) +Source = collection2literal(CV.source_id) +SourceType = collection2literal(CV.source_type) +SubExperiment = collection2literal(CV.sub_experiment_id) +Table = collection2literal(CV.table_id) +# Variable = collection2literal(CV.variable_id) # This is empty class Properties(BaseModel): @@ -54,7 +56,7 @@ class Properties(BaseModel): source_type: List[SourceType] = Field(..., alias="source_type") sub_experiment: SubExperiment | Literal['none'] = Field(..., alias="sub_experiment_id") table: Table = Field(..., alias="table_id") - variable: Variable = str # Field(..., alias="variable_id") + # variable: str # Field(..., alias="variable_id") variant_label: str initialization_index: int physics_index: int @@ -137,7 +139,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) item.update(STACItem(start_datetime=meta["time_coverage_start"], end_datetime=meta["time_coverage_end"],).model_dump()) - + return pystac.Item(**item) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: From 74ad5941385dcb0624f2caaf7e26d889283b5f8c Mon 
Sep 17 00:00:00 2001 From: David Huard Date: Wed, 20 Sep 2023 17:36:05 -0400 Subject: [PATCH 23/69] Implemented post_stac_item, using a hash of the item attributes as the ID. --- STACpopulator/api_requests.py | 35 ++++++++++++++++++++++--- STACpopulator/input.py | 28 ++++++++++---------- implementations/CMIP6-UofT/add_CMIP6.py | 9 ++++--- 3 files changed, 51 insertions(+), 21 deletions(-) diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py index 59ffc98..503798a 100644 --- a/STACpopulator/api_requests.py +++ b/STACpopulator/api_requests.py @@ -1,6 +1,5 @@ import os from typing import Any - import requests @@ -35,11 +34,11 @@ def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: if r.status_code == 200: print( - f"{bcolors.OKGREEN}[INFO] Pushed STAC collection [{collection_id}] to [{stac_host}] ({r.status_code}){bcolors.ENDC}" + f"[INFO] Pushed STAC collection [{collection_id}] to [{stac_host}] ({r.status_code})" ) elif r.status_code == 409: print( - f"{bcolors.WARNING}[INFO] STAC collection [{collection_id}] already exists on [{stac_host}] ({r.status_code}), updating..{bcolors.ENDC}" + f"[INFO] STAC collection [{collection_id}] already exists on [{stac_host}] ({r.status_code}), updating.." ) r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) r.raise_for_status() @@ -48,4 +47,32 @@ def post_stac_collection(stac_host: str, json_data: dict[str, Any]) -> None: def post_stac_item(stac_host: str, collection_id: str, data: dict[str, dict]) -> bool: - pass + """ + Post a STAC item. + """ + item_id = data["id"] + r = requests.post( + os.path.join(stac_host, "collections", collection_id, "items"), + json=data, + verify=False, + ) + + if r.status_code == 200: + print( + f"[INFO] Pushed STAC item [{item_id}] to [{stac_host}] ({r.status_code})" + ) + return True + elif r.status_code == 409: + print( + f"[INFO] STAC item [{item_id}] already exists on [{stac_host}] ({r.status_code}), updating.." 
+ ) + r = requests.put( + os.path.join(stac_host, "collections", collection_id, "items", item_id), + json=data, + verify=False, + ) + r.raise_for_status() + return True + else: + r.raise_for_status() + return False diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 807a335..4fb73ca 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -97,24 +97,24 @@ def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, "coordinates": [ [ [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), ], [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_max"], + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_max"][0]), ], [ - attrs["geospatial_lon_max"], - attrs["geospatial_lat_max"], + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), ], [ - attrs["geospatial_lon_max"], - attrs["geospatial_lat_min"], + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_min"][0]), ], [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), ], ] ], @@ -124,10 +124,10 @@ def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: """Create BBOX from CFMetadata.""" return [ - attrs["geospatial_lon_min"], - attrs["geospatial_lat_min"], - attrs["geospatial_lon_max"], - attrs["geospatial_lat_max"], + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), ] diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index c3ce0d0..65b7df2 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -2,6 +2,7 @@ import logging from typing import 
Any, MutableMapping, Literal, List import datetime as dt +import hashlib from colorlog import ColoredFormatter import pystac @@ -90,7 +91,8 @@ class STACItem(BaseModel): def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: """Return unique ID for CMIP6 data collection (multiple variables).""" keys = ["activity_id", "institution_id", "source_id", "experiment_id", "variant_label", "table_id", "grid_label",] - return "_".join(attrs[k] for k in keys) + item_name = "_".join(attrs[k] for k in keys) + return hashlib.md5(item_name.encode("utf-8")).hexdigest() class CMIP6populator(STACpopulatorBase): @@ -128,6 +130,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) attrs = item_data["attributes"] meta = item_data["groups"]["CFMetadata"]["attributes"] + # uuid # Create STAC item geometry from CFMetadata item = dict( id=make_cmip6_id(attrs), @@ -139,8 +142,8 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) item.update(STACItem(start_datetime=meta["time_coverage_start"], end_datetime=meta["time_coverage_end"],).model_dump()) - - return pystac.Item(**item) + + return pystac.Item(**item).to_dict() def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. 
From ccf1c88a3e2a59fcea34404d982fb1e7474a71ad Mon Sep 17 00:00:00 2001 From: David Huard Date: Wed, 20 Sep 2023 18:02:42 -0400 Subject: [PATCH 24/69] added assets --- implementations/CMIP6-UofT/add_CMIP6.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 65b7df2..a93211c 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -13,6 +13,20 @@ import pyessv +media_types = {"httpserver_service": "application/x-netcdf", + "opendap_service": pystac.MediaType.HTML, + "wcs_service": pystac.MediaType.XML, + "wms_service": pystac.MediaType.XML, + "nccs_service": "application/x-netcdf", + "HTTPServer": "application/x-netcdf", + "OPENDAP": pystac.MediaType.HTML, + "NCML": pystac.MediaType.XML, + "WCS": pystac.MediaType.XML, + "ISO": pystac.MediaType.XML, + "WMS": pystac.MediaType.XML, + "NetcdfSubset": "application/x-netcdf", + } + LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" formatter = ColoredFormatter(LOGFORMAT) @@ -143,7 +157,14 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) item.update(STACItem(start_datetime=meta["time_coverage_start"], end_datetime=meta["time_coverage_end"],).model_dump()) - return pystac.Item(**item).to_dict() + stac_item = pystac.Item(**item) + + # Add assets + for name, url in item_data["access_urls"].items(): + asset = pystac.Asset(href=url, media_type=media_types.get(name, None)) + stac_item.add_asset(name, asset) + + return stac_item.to_dict() def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. 
From e7b67a1633acf5166e916768f1312f55264e3e63 Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 21 Sep 2023 14:14:26 -0400 Subject: [PATCH 25/69] added Datacube extension to CMIP6. Did some code clean-up --- STACpopulator/input.py | 44 +--- STACpopulator/stac_utils.py | 277 ++++++++++++++++++++++++ implementations/CMIP6-UofT/add_CMIP6.py | 56 +---- 3 files changed, 293 insertions(+), 84 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 4fb73ca..a3d5dd8 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -37,6 +37,10 @@ def reset(self): pass + + + + class THREDDSLoader(GenericLoader): def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> None: """Constructor @@ -89,46 +93,6 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An return attrs - @staticmethod - def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - """Create Polygon geometry from CFMetadata.""" - return { - "type": "Polygon", - "coordinates": [ - [ - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_max"][0]), - ], - [ - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_max"][0]), - ], - [ - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - ] - ], - } - - @staticmethod - def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: - """Create BBOX from CFMetadata.""" - return [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_max"][0]), - ] class STACLoader(GenericLoader): diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index d0ee5a9..371f35a 100644 --- a/STACpopulator/stac_utils.py +++ 
b/STACpopulator/stac_utils.py @@ -1,5 +1,10 @@ import re +import datetime as dt from enum import Enum +from typing import Any, Iterator, MutableMapping, Optional, Tuple +import pystac +from pystac.extensions.datacube import Dimension, DimensionType, VariableType, Variable, DatacubeExtension +from pydantic import BaseModel def url_validate(target: str) -> bool: @@ -48,3 +53,275 @@ def collection2literal(collection): terms = tuple(term.label for term in collection) return typing.Literal[terms] + +class STACItem(BaseModel): + start_datetime: dt.datetime + end_datetime: dt.datetime + + +class CFJsonItem: + """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + def __init__(self, iid: str, attrs: dict, datamodel=None): + self.attrs = attrs + + # Global attributes + gattrs = attrs["attributes"] + + # Validate using pydantic data model if given + if datamodel: + props = datamodel(**gattrs).model_dump() + else: + props = gattrs + + + # Create STAC item + itemd = dict( + id=iid, + geometry=self.ncattrs_to_geometry(), + bbox=self.ncattrs_to_bbox(), + properties=props, + datetime=None, + ) + + cfmeta = attrs["groups"]["CFMetadata"]["attributes"] + itemd.update(STACItem(start_datetime=cfmeta["time_coverage_start"], + end_datetime=cfmeta["time_coverage_end"],).model_dump()) + + item = pystac.Item(**itemd) + + # Add assets + for name, url in attrs["access_urls"].items(): + asset = pystac.Asset(href=url, media_type=media_types.get(name, None)) + item.add_asset(name, asset) + + self.item = item + + def ncattrs_to_geometry(self) -> MutableMapping[str, Any]: + """Create Polygon geometry from CFMetadata.""" + attrs = self.attrs["groups"]["CFMetadata"]["attributes"] + return { + "type": "Polygon", + "coordinates": [ + [ + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + 
float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + ] + ], + } + + def ncattrs_to_bbox(self) -> list: + """Create BBOX from CFMetadata.""" + attrs = self.attrs["groups"]["CFMetadata"]["attributes"] + return [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), + ] + + +class CFJsonDatacube(CFJsonItem): + """Return STAC Item with Datacube extension from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} + + def __init__(self, *args, **kwds): + super().__init__(*args, **kwds) + + self.ext = DatacubeExtension.ext(self.item, add_if_missing=True) + self.ext.apply(dimensions=self.dimensions(), variables=self.variables()) + + def dimensions(self) -> dict: + """Return Dimension objects.""" + + dims = {} + for name, length in self.attrs["dimensions"].items(): + v = self.attrs["variables"][name] + bbox = self.ncattrs_to_bbox() + + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if v['attributes'].get(criterion, None) in expected: + axis = self.axis[key] + type_ = DimensionType.SPATIAL if axis in ['x', 'y', 'z'] else DimensionType.TEMPORAL + + if v['type'] == 'int': + extent = [0, int(length)] + else: # Not clear the logic is sound + if key == 'X': + extent = bbox[0], bbox[2] + elif key == "Y": + extent = bbox[1], bbox[3] + else: + extent = None + + dims[name] = Dimension(properties=dict( + axis = axis, + type = type_, + extent = extent, + description=v.get("description", v.get("long_name", criteria["standard_name"])) + ) + ) + + + return dims + + + def is_coordinate(self, attrs: dict)-> bool: + """Return whether or 
not variable is a coordinate.""" + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if attrs.get(criterion, None) in expected: + return True + return False + + def variables(self)->dict: + """Return Variable objects""" + variables = {} + + for name, attrs in self.attrs["variables"].items(): + if name in self.attrs["dimensions"]: + continue + + variables[name] = Variable(properties=dict( + dimensions=attrs["shape"], + type = VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, + description=attrs.get("description", attrs.get("long_name", None)), + unit=attrs.get("units", None) + )) + return variables + + + + +# From CF-Xarray +coordinate_criteria = { + 'latitude': {'standard_name': ('latitude',), + 'units': ('degree_north', + 'degree_N', + 'degreeN', + 'degrees_north', + 'degrees_N', + 'degreesN'), + '_CoordinateAxisType': ('Lat',), + 'long_name': ('latitude',)}, + 'longitude': {'standard_name': ('longitude',), + 'units': ('degree_east', + 'degree_E', + 'degreeE', + 'degrees_east', + 'degrees_E', + 'degreesE'), + '_CoordinateAxisType': ('Lon',), + 'long_name': ('longitude',)}, + 'Z': {'standard_name': ('model_level_number', + 'atmosphere_ln_pressure_coordinate', + 'atmosphere_sigma_coordinate', + 'atmosphere_hybrid_sigma_pressure_coordinate', + 'atmosphere_hybrid_height_coordinate', + 'atmosphere_sleve_coordinate', + 'ocean_sigma_coordinate', + 'ocean_s_coordinate', + 'ocean_s_coordinate_g1', + 'ocean_s_coordinate_g2', + 'ocean_sigma_z_coordinate', + 'ocean_double_sigma_coordinate'), + '_CoordinateAxisType': ('GeoZ', 'Height', 'Pressure'), + 'axis': ('Z',), + 'cartesian_axis': ('Z',), + 'grads_dim': ('z',), + 'long_name': ('model_level_number', + 'atmosphere_ln_pressure_coordinate', + 'atmosphere_sigma_coordinate', + 'atmosphere_hybrid_sigma_pressure_coordinate', + 'atmosphere_hybrid_height_coordinate', + 'atmosphere_sleve_coordinate', + 'ocean_sigma_coordinate', + 
'ocean_s_coordinate', + 'ocean_s_coordinate_g1', + 'ocean_s_coordinate_g2', + 'ocean_sigma_z_coordinate', + 'ocean_double_sigma_coordinate')}, + 'vertical': {'standard_name': ('air_pressure', + 'height', + 'depth', + 'geopotential_height', + 'altitude', + 'height_above_geopotential_datum', + 'height_above_reference_ellipsoid', + 'height_above_mean_sea_level'), + 'positive': ('up', 'down'), + 'long_name': ('air_pressure', + 'height', + 'depth', + 'geopotential_height', + 'altitude', + 'height_above_geopotential_datum', + 'height_above_reference_ellipsoid', + 'height_above_mean_sea_level')}, + 'X': {'standard_name': ('projection_x_coordinate', + 'grid_longitude', + 'projection_x_angular_coordinate'), + '_CoordinateAxisType': ('GeoX',), + 'axis': ('X',), + 'cartesian_axis': ('X',), + 'grads_dim': ('x',), + 'long_name': ('projection_x_coordinate', + 'grid_longitude', + 'projection_x_angular_coordinate', + 'cell index along first dimension')}, + 'Y': {'standard_name': ('projection_y_coordinate', + 'grid_latitude', + 'projection_y_angular_coordinate'), + '_CoordinateAxisType': ('GeoY',), + 'axis': ('Y',), + 'cartesian_axis': ('Y',), + 'grads_dim': ('y',), + 'long_name': ('projection_y_coordinate', + 'grid_latitude', + 'projection_y_angular_coordinate', + 'cell index along second dimension')}, + 'T': {'standard_name': ('time',), + '_CoordinateAxisType': ('Time',), + 'axis': ('T',), + 'cartesian_axis': ('T',), + 'grads_dim': ('t',), + 'long_name': ('time',)}, + 'time': {'standard_name': ('time',), + '_CoordinateAxisType': ('Time',), + 'axis': ('T',), + 'cartesian_axis': ('T',), + 'grads_dim': ('t',), + 'long_name': ('time',)}} + + +media_types = {"httpserver_service": "application/x-netcdf", + "opendap_service": pystac.MediaType.HTML, + "wcs_service": pystac.MediaType.XML, + "wms_service": pystac.MediaType.XML, + "nccs_service": "application/x-netcdf", + "HTTPServer": "application/x-netcdf", + "OPENDAP": pystac.MediaType.HTML, + "NCML": pystac.MediaType.XML, + "WCS": 
pystac.MediaType.XML, + "ISO": pystac.MediaType.XML, + "WMS": pystac.MediaType.XML, + "NetcdfSubset": "application/x-netcdf", + } diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index a93211c..d469383 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -6,27 +6,13 @@ from colorlog import ColoredFormatter import pystac + from pydantic import BaseModel, Field, FieldValidationInfo, field_validator, ValidationError from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader -from STACpopulator.stac_utils import collection2literal +from STACpopulator.stac_utils import collection2literal, CFJsonDatacube, CFJsonItem import pyessv - -media_types = {"httpserver_service": "application/x-netcdf", - "opendap_service": pystac.MediaType.HTML, - "wcs_service": pystac.MediaType.XML, - "wms_service": pystac.MediaType.XML, - "nccs_service": "application/x-netcdf", - "HTTPServer": "application/x-netcdf", - "OPENDAP": pystac.MediaType.HTML, - "NCML": pystac.MediaType.XML, - "WCS": pystac.MediaType.XML, - "ISO": pystac.MediaType.XML, - "WMS": pystac.MediaType.XML, - "NetcdfSubset": "application/x-netcdf", - } - LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" formatter = ColoredFormatter(LOGFORMAT) @@ -97,9 +83,6 @@ def split(cls, v: str, info: FieldValidationInfo): return v.split(" ") -class STACItem(BaseModel): - start_datetime: dt.datetime - end_datetime: dt.datetime def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: @@ -140,31 +123,16 @@ def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableM pass def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - # TODO: next step is to implement this - attrs = item_data["attributes"] - meta = item_data["groups"]["CFMetadata"]["attributes"] - - # 
uuid - # Create STAC item geometry from CFMetadata - item = dict( - id=make_cmip6_id(attrs), - geometry=THREDDSLoader.ncattrs_to_geometry(meta), - bbox=THREDDSLoader.ncattrs_to_bbox(meta), - properties=Properties(**attrs).model_dump(), - datetime=None, - ) - - item.update(STACItem(start_datetime=meta["time_coverage_start"], - end_datetime=meta["time_coverage_end"],).model_dump()) - - stac_item = pystac.Item(**item) - - # Add assets - for name, url in item_data["access_urls"].items(): - asset = pystac.Asset(href=url, media_type=media_types.get(name, None)) - stac_item.add_asset(name, asset) - - return stac_item.to_dict() + # TODO: This is agnostic to the data collection, should not be in CMIP6 specific class. + iid = make_cmip6_id(item_data["attributes"]) + m = CFJsonDatacube(iid, item_data, Properties) + try: + m = CFJsonDatacube(iid, item_data, Properties) + except: + LOGGER.warning(f"Failed to add Datacube extention to item {item_name}") + m = CFJsonItem(iid, item_data, Properties) + + return m.item.to_dict() def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. 
From ac441f7b4ffb18866a16720cd9248a5ed7e48068 Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 21 Sep 2023 14:38:40 -0400 Subject: [PATCH 26/69] refactoring to support multiple extensions later --- STACpopulator/stac_utils.py | 19 ++++++++----------- implementations/CMIP6-UofT/add_CMIP6.py | 11 ++++++----- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 371f35a..88dc1ac 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -73,7 +73,6 @@ def __init__(self, iid: str, attrs: dict, datamodel=None): else: props = gattrs - # Create STAC item itemd = dict( id=iid, @@ -138,14 +137,15 @@ def ncattrs_to_bbox(self) -> list: ] -class CFJsonDatacube(CFJsonItem): - """Return STAC Item with Datacube extension from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" +class DatacubeExt: + """Extend STAC Item with Datacube properties.""" axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} - def __init__(self, *args, **kwds): - super().__init__(*args, **kwds) + def __init__(self, obj: CFJsonItem): + self.obj = obj + self.attrs = obj.attrs - self.ext = DatacubeExtension.ext(self.item, add_if_missing=True) + self.ext = DatacubeExtension.ext(self.obj.item, add_if_missing=True) self.ext.apply(dimensions=self.dimensions(), variables=self.variables()) def dimensions(self) -> dict: @@ -154,7 +154,7 @@ def dimensions(self) -> dict: dims = {} for name, length in self.attrs["dimensions"].items(): v = self.attrs["variables"][name] - bbox = self.ncattrs_to_bbox() + bbox = self.obj.ncattrs_to_bbox() for key, criteria in coordinate_criteria.items(): for criterion, expected in criteria.items(): @@ -180,12 +180,10 @@ def dimensions(self) -> dict: ) ) - return dims - def is_coordinate(self, attrs: dict)-> bool: - """Return whether or not variable is a coordinate.""" + """Return whether variable is a coordinate.""" for 
key, criteria in coordinate_criteria.items(): for criterion, expected in criteria.items(): if attrs.get(criterion, None) in expected: @@ -210,7 +208,6 @@ def variables(self)->dict: - # From CF-Xarray coordinate_criteria = { 'latitude': {'standard_name': ('latitude',), diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index d469383..f595b21 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -10,7 +10,7 @@ from pydantic import BaseModel, Field, FieldValidationInfo, field_validator, ValidationError from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader -from STACpopulator.stac_utils import collection2literal, CFJsonDatacube, CFJsonItem +from STACpopulator.stac_utils import collection2literal, DatacubeExt, CFJsonItem import pyessv LOGGER = logging.getLogger(__name__) @@ -125,14 +125,15 @@ def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableM def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: # TODO: This is agnostic to the data collection, should not be in CMIP6 specific class. iid = make_cmip6_id(item_data["attributes"]) - m = CFJsonDatacube(iid, item_data, Properties) + + obj = CFJsonItem(iid, item_data, Properties) + try: - m = CFJsonDatacube(iid, item_data, Properties) + DatacubeExt(obj) except: LOGGER.warning(f"Failed to add Datacube extention to item {item_name}") - m = CFJsonItem(iid, item_data, Properties) - return m.item.to_dict() + return obj.item.to_dict() def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. From 6719b1005030fe68e7f999c3e89eca3060d1831a Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 21 Sep 2023 17:18:34 -0400 Subject: [PATCH 27/69] fixed bugs in datacube logic. 
--- STACpopulator/stac_utils.py | 75 ++++++++++++++----------- implementations/CMIP6-UofT/add_CMIP6.py | 15 +---- tests/test_client.py | 5 ++ 3 files changed, 49 insertions(+), 46 deletions(-) create mode 100644 tests/test_client.py diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 88dc1ac..fbac003 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -89,9 +89,14 @@ def __init__(self, iid: str, attrs: dict, datamodel=None): item = pystac.Item(**itemd) # Add assets - for name, url in attrs["access_urls"].items(): - asset = pystac.Asset(href=url, media_type=media_types.get(name, None)) - item.add_asset(name, asset) + if "access_urls" in attrs: + for name, url in attrs["access_urls"].items(): + asset = pystac.Asset(href=url, media_type=media_types.get(name, None)) + item.add_asset(name, asset) + elif 'THREDDSMetadata' in attrs["groups"]: + for name, url in attrs["groups"]['THREDDSMetadata']['groups']['services']['attributes'].items(): + asset = pystac.Asset(href=url, media_type=media_types.get(name, None)) + item.add_asset(name, asset) self.item = item @@ -153,34 +158,34 @@ def dimensions(self) -> dict: dims = {} for name, length in self.attrs["dimensions"].items(): - v = self.attrs["variables"][name] - bbox = self.obj.ncattrs_to_bbox() - - for key, criteria in coordinate_criteria.items(): - for criterion, expected in criteria.items(): - if v['attributes'].get(criterion, None) in expected: - axis = self.axis[key] - type_ = DimensionType.SPATIAL if axis in ['x', 'y', 'z'] else DimensionType.TEMPORAL - - if v['type'] == 'int': - extent = [0, int(length)] - else: # Not clear the logic is sound - if key == 'X': - extent = bbox[0], bbox[2] - elif key == "Y": - extent = bbox[1], bbox[3] - else: - extent = None - - dims[name] = Dimension(properties=dict( - axis = axis, - type = type_, - extent = extent, - description=v.get("description", v.get("long_name", criteria["standard_name"])) + v = self.attrs["variables"].get(name) 
+ if v: + bbox = self.obj.ncattrs_to_bbox() + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if v['attributes'].get(criterion, None) in expected: + axis = self.axis[key] + type_ = DimensionType.SPATIAL if axis in ['x', 'y', 'z'] else DimensionType.TEMPORAL + + if v['type'] == 'int': + extent = [0, int(length)] + else: # Not clear the logic is sound + if key == 'X': + extent = bbox[0], bbox[2] + elif key == "Y": + extent = bbox[1], bbox[3] + else: + extent = None + + dims[name] = Dimension(properties=dict( + axis = axis, + type = type_, + extent = extent, + description=v.get("description", v.get("long_name", criteria["standard_name"])) + ) ) - ) - return dims + return dims def is_coordinate(self, attrs: dict)-> bool: """Return whether variable is a coordinate.""" @@ -194,14 +199,16 @@ def variables(self)->dict: """Return Variable objects""" variables = {} - for name, attrs in self.attrs["variables"].items(): + for name, meta in self.attrs["variables"].items(): if name in self.attrs["dimensions"]: continue - + + attrs = meta['attributes'] variables[name] = Variable(properties=dict( - dimensions=attrs["shape"], - type = VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, - description=attrs.get("description", attrs.get("long_name", None)), + dimensions=meta["shape"], + type = VariableType.AUXILIARY.value if self.is_coordinate(attrs) else + VariableType.DATA.value, + description=attrs.get("description", attrs.get("long_name")), unit=attrs.get("units", None) )) return variables diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index f595b21..4196eb6 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -25,8 +25,6 @@ # CMIP6 controlled vocabulary (CV) CV = pyessv.WCRP.CMIP6 - - # Enum classes built from the pyessv' CV Activity = collection2literal(CV.activity_id) Experiment = 
collection2literal(CV.experiment_id) @@ -83,8 +81,6 @@ def split(cls, v: str, info: FieldValidationInfo): return v.split(" ") - - def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: """Return unique ID for CMIP6 data collection (multiple variables).""" keys = ["activity_id", "institution_id", "source_id", "experiment_id", "variant_label", "table_id", "grid_label",] @@ -98,7 +94,6 @@ def __init__( stac_host: str, thredds_catalog_url: str, config_filename: str, - validator: callable = None ) -> None: """Constructor @@ -112,11 +107,7 @@ def __init__( """ data_loader = THREDDSLoader(thredds_catalog_url) - self.validator = validator - - for name, item in data_loader: - # self.create_stac_item(name, item) - print(name) + self.props_model = Properties super().__init__(stac_host, data_loader, config_filename) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): @@ -126,12 +117,12 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) # TODO: This is agnostic to the data collection, should not be in CMIP6 specific class. 
iid = make_cmip6_id(item_data["attributes"]) - obj = CFJsonItem(iid, item_data, Properties) + obj = CFJsonItem(iid, item_data, self.props_model) try: DatacubeExt(obj) except: - LOGGER.warning(f"Failed to add Datacube extention to item {item_name}") + LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") return obj.item.to_dict() diff --git a/tests/test_client.py b/tests/test_client.py new file mode 100644 index 0000000..b35f9ac --- /dev/null +++ b/tests/test_client.py @@ -0,0 +1,5 @@ +from pystac_client import Client + +def test_cmip6(): + """Assume some CMIP6 has been ingested.""" + c = Client.open("http://localhost:8880/stac") From 7133cb615628bb48aa9810cb3d1d4f196f34f4a1 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 21 Sep 2023 23:30:10 +0200 Subject: [PATCH 28/69] revised pydantic data model --- implementations/CMIP6-UofT/add_CMIP6.py | 168 +++++++++++++++--------- 1 file changed, 105 insertions(+), 63 deletions(-) diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index b2e483d..75e57bc 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,21 +1,17 @@ import argparse -import datetime as dt +import json import logging -from typing import Any, List, Literal, MutableMapping +from datetime import datetime +from typing import Any, Dict, List, Literal, MutableMapping import pyessv -import pystac from colorlog import ColoredFormatter -from pydantic import ( - BaseModel, - Field, - FieldValidationInfo, - ValidationError, - field_validator, -) +from pydantic import AnyHttpUrl, BaseModel, Field, FieldValidationInfo, field_validator +from typing_extensions import TypedDict from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader +from STACpopulator.stac_utils import collection2enum LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ 
-30,46 +26,64 @@ CV = pyessv.WCRP.CMIP6 # Enum classes built from the pyessv' CV -Activity = collection2enum(CV.activity_id) -Experiment = collection2enum(CV.experiment_id) +ActivityID = collection2enum(CV.activity_id) +ExperimentID = collection2enum(CV.experiment_id) Frequency = collection2enum(CV.frequency) GridLabel = collection2enum(CV.grid_label) -Institution = collection2enum(CV.institution_id) +InstitutionID = collection2enum(CV.institution_id) # Member = collection2enum(CV.member_id) # This is empty -Resolution = collection2enum(CV.nominal_resolution) +NominalResolution = collection2enum(CV.nominal_resolution) Realm = collection2enum(CV.realm) -Source = collection2enum(CV.source_id) +SourceID = collection2enum(CV.source_id) SourceType = collection2enum(CV.source_type) -SubExperiment = collection2enum(CV.sub_experiment_id) -Table = collection2enum(CV.table_id) -Variable = collection2enum(CV.variable_id) # This is empty +SubExperimentID = collection2enum(CV.sub_experiment_id) +TableID = collection2enum(CV.table_id) +# Variable = collection2enum(CV.variable_id) # This is empty -class Properties(BaseModel): +class STACAsset(BaseModel): + href: AnyHttpUrl + media_type: str + title: str + roles: List[str] + + +class Properties(BaseModel, validate_assignment=True): """Data model for CMIP6 Controlled Vocabulary.""" - activity: Activity = Field(..., alias="activity_id") - experiment: Experiment = Field(..., alias="experiment_id") - frequency: Frequency - grid_label: GridLabel - institution: Institution = Field(..., alias="institution_id") - resolution: Resolution = Field(..., alias="nominal_resolution") - realm: List[Realm] = Field(..., alias="realm") - source: Source = Field(..., alias="source_id") - source_type: List[SourceType] = Field(..., alias="source_type") - sub_experiment: SubExperiment | Literal["none"] = Field(..., alias="sub_experiment_id") - table: Table = Field(..., alias="table_id") - variable: Variable = str # Field(..., alias="variable_id") - 
variant_label: str - initialization_index: int - physics_index: int - realization_index: int - forcing_index: int - variant_label: str - tracking_id: str - version: str = None - license: str = None - grid: str = None + start_datetime: datetime + end_datetime: datetime + Conventions: str = Field(..., serialization_alias="cmip6:Conventions") + activity_id: ActivityID = Field(..., serialization_alias="cmip6:activity_id") + creation_date: datetime = Field(..., serialization_alias="cmip6:creation_date") + data_specs_version: str = Field(..., serialization_alias="cmip6:data_specs_version") + experiment: str = Field(..., serialization_alias="cmip6:experiment") + experiment_id: ExperimentID = Field(..., serialization_alias="cmip6:experiment_id") + frequency: Frequency = Field(..., serialization_alias="cmip6:frequency") + further_info_url: AnyHttpUrl = Field(..., serialization_alias="cmip6:further_info_url") + grid_label: GridLabel = Field(..., serialization_alias="cmip6:grid_label") + institution: str = Field(..., serialization_alias="cmip6:institution") + institution_id: InstitutionID = Field(..., serialization_alias="cmip6:institution_id") + nominal_resolution: NominalResolution = Field(..., serialization_alias="cmip6:nominal_resolution") + realm: List[Realm] = Field(..., serialization_alias="cmip6:realm") + source: str = Field(..., serialization_alias="cmip6:source") + source_id: SourceID = Field(..., serialization_alias="cmip6:source_id") + source_type: List[SourceType] = Field(..., serialization_alias="cmip6:source_type") + sub_experiment: str | Literal["none"] = Field(..., serialization_alias="cmip6:sub_experiment") + sub_experiment_id: SubExperimentID | Literal["none"] = Field(..., serialization_alias="cmip6:sub_experiment_id") + table_id: TableID = Field(..., serialization_alias="cmip6:table_id") + variable_id: str = Field(..., serialization_alias="cmip6:variable_id") + variant_label: str = Field(..., serialization_alias="cmip6:variant_label") + 
initialization_index: int = Field(..., serialization_alias="cmip6:initialization_index") + physics_index: int = Field(..., serialization_alias="cmip6:physics_index") + realization_index: int = Field(..., serialization_alias="cmip6:realization_index") + forcing_index: int = Field(..., serialization_alias="cmip6:forcing_index") + tracking_id: str = Field(..., serialization_alias="cmip6:tracking_id") + version: str = Field(..., serialization_alias="cmip6:version") + product: str = Field(..., serialization_alias="cmip6:product") + license: str = Field(..., serialization_alias="cmip6:license") + grid: str = Field(..., serialization_alias="cmip6:grid") + mip_era: str = Field(..., serialization_alias="cmip6:mip_era") @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") @classmethod @@ -84,14 +98,29 @@ def split(cls, v: str, info: FieldValidationInfo): """Split string into list.""" return v.split(" ") + @field_validator("version") + @classmethod + def validate_version(cls, v: str, info: FieldValidationInfo): + assert v[0] == "v", "Version string should begin with a lower case 'v'" + assert v[1:].isdigit(), "All characters in version string, except first, should be digits" + return v + + +class Geometry(TypedDict): + type: str + coordinates: List[List[List[float]]] + class STACItem(BaseModel): - start_datetime: dt.datetime - end_datetime: dt.datetime + id: str + geometry: Geometry + bbox: List[float] + properties: Properties + assets: Dict[str, STACAsset] -def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: - """Return unique ID for CMIP6 data collection (multiple variables).""" +def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: + """Return a unique ID for CMIP6 data item.""" keys = [ "activity_id", "institution_id", @@ -99,6 +128,7 @@ def make_cmip6_id(attrs: MutableMapping[str, Any]) -> str: "experiment_id", "variant_label", "table_id", + "variable_id", "grid_label", ] return "_".join(attrs[k] 
for k in keys) @@ -117,37 +147,49 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: st """ data_loader = THREDDSLoader(thredds_catalog_url) - - for name, item in data_loader: - # self.create_stac_item(name, item) - print(name) super().__init__(stac_host, data_loader, config_filename) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): pass def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - # TODO: next step is to implement this + """Creates the STAC item. + + :param item_name: name of the STAC item. Interpretation of name is left to the input loader implementation + :type item_name: str + :param item_data: dictionary like representation of all information on the item + :type item_data: MutableMapping[str, Any] + :return: _description_ + :rtype: MutableMapping[str, Any] + """ + attrs = item_data["attributes"] meta = item_data["groups"]["CFMetadata"]["attributes"] - # Create STAC item geometry from CFMetadata - item = dict( - id=make_cmip6_id(attrs), + props = Properties( + attrs, + start_datetime=meta["time_coverage_start"], + end_datetime=meta["time_coverage_end"], + ) + + a = STACAsset( + href=item_data["access_urls"]["HTTPServer"], + media_type="application/netcdf", + title="HTTP Server", + roles=["data"], + ) + + item = STACItem( + id="sdfdd", + properties=props, geometry=THREDDSLoader.ncattrs_to_geometry(meta), bbox=THREDDSLoader.ncattrs_to_bbox(meta), - properties=Properties(**attrs).model_dump(), - datetime=None, + assets={"http": a}, ) - item.update( - STACItem( - start_datetime=meta["time_coverage_start"], - end_datetime=meta["time_coverage_end"], - ).model_dump() - ) + stac_item_json = json.loads(item.model_dump_json(by_alias=True)) - return pystac.Item(**item) + return stac_item_json def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the 
Properties class. @@ -163,4 +205,4 @@ def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: args = parser.parse_args() LOGGER.info(f"Arguments to call: {args}") c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file) - c.ingest() + # c.ingest() From 8fd1825803cedf1870028089d4ec7b8e26506c3e Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 21 Sep 2023 17:34:26 -0400 Subject: [PATCH 29/69] docstrings --- STACpopulator/stac_utils.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index fbac003..199f712 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -62,6 +62,18 @@ class STACItem(BaseModel): class CFJsonItem: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" def __init__(self, iid: str, attrs: dict, datamodel=None): + """ + Create STAC Item from CF JSON metadata. + + Parameters + ---------- + iid : str + Unique item ID. + attrs: dict + CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. + datamodel : pydantic.BaseModel, optional + Data model for validating global attributes. + """ self.attrs = attrs # Global attributes @@ -147,6 +159,14 @@ class DatacubeExt: axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} def __init__(self, obj: CFJsonItem): + """ + Add Datacube extension to STAC Item. + + Parameters + ---------- + obj : CFJsonItem + STAC Item created from CF JSON metadata. 
+ """ self.obj = obj self.attrs = obj.attrs @@ -187,14 +207,6 @@ def dimensions(self) -> dict: return dims - def is_coordinate(self, attrs: dict)-> bool: - """Return whether variable is a coordinate.""" - for key, criteria in coordinate_criteria.items(): - for criterion, expected in criteria.items(): - if attrs.get(criterion, None) in expected: - return True - return False - def variables(self)->dict: """Return Variable objects""" variables = {} @@ -202,7 +214,7 @@ def variables(self)->dict: for name, meta in self.attrs["variables"].items(): if name in self.attrs["dimensions"]: continue - + attrs = meta['attributes'] variables[name] = Variable(properties=dict( dimensions=meta["shape"], @@ -213,6 +225,13 @@ def variables(self)->dict: )) return variables + def is_coordinate(self, attrs: dict)-> bool: + """Return whether variable is a coordinate.""" + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if attrs.get(criterion, None) in expected: + return True + return False # From CF-Xarray From ce268cdcde6030b3813f858ab1342b7cafa463e3 Mon Sep 17 00:00:00 2001 From: David Huard Date: Tue, 26 Sep 2023 16:48:38 -0400 Subject: [PATCH 30/69] work on cmip6 extension --- STACpopulator/extensions/__init__.py | 0 STACpopulator/extensions/cmip6.py | 167 ++++++++++++++++ STACpopulator/populator_base.py | 3 +- STACpopulator/stac_utils.py | 22 +-- ..._historical_r1i1p1f1_gr1_185001-194912.xml | 183 ++++++++++++++++++ tests/test_cmip6_extension.py | 18 ++ 6 files changed, 373 insertions(+), 20 deletions(-) create mode 100644 STACpopulator/extensions/__init__.py create mode 100644 STACpopulator/extensions/cmip6.py create mode 100644 tests/data/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml create mode 100644 tests/test_cmip6_extension.py diff --git a/STACpopulator/extensions/__init__.py b/STACpopulator/extensions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/STACpopulator/extensions/cmip6.py 
b/STACpopulator/extensions/cmip6.py new file mode 100644 index 0000000..96cf74a --- /dev/null +++ b/STACpopulator/extensions/cmip6.py @@ -0,0 +1,167 @@ +"""CMIP6 extension based on https://stac-extensions.github.io/cmip6/v1.0.0/schema.json""" + +from typing import Generic, TypeVar, Dict, Any, cast + +import pystac +from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension +from pystac.extensions.hooks import ExtensionHooks + +from datetime import datetime +from typing import Any, Dict, List, Literal +import pyessv +from pydantic import (AnyHttpUrl, BaseModel, Field, FieldValidationInfo, field_validator, model_serializer, + FieldSerializationInfo) + + +from STACpopulator.stac_utils import ItemProperties +from STACpopulator.stac_utils import collection2literal + +T = TypeVar("T", pystac.Collection, pystac.Item, pystac.Asset) + +SCHEMA_URI = "https://stac-extensions.github.io/cmip6/v1.0.0/schema.json" + +prefix: str = "cmip6:" + + +# CMIP6 controlled vocabulary (CV) +CV = pyessv.WCRP.CMIP6 + +# Enum classes built from the pyessv' CV +ActivityID = collection2literal(CV.activity_id) +ExperimentID = collection2literal(CV.experiment_id) +Frequency = collection2literal(CV.frequency) +GridLabel = collection2literal(CV.grid_label) +InstitutionID = collection2literal(CV.institution_id) +NominalResolution = collection2literal(CV.nominal_resolution) +Realm = collection2literal(CV.realm) +SourceID = collection2literal(CV.source_id) +SourceType = collection2literal(CV.source_type) +SubExperimentID = collection2literal(CV.sub_experiment_id) +TableID = collection2literal(CV.table_id) + + +class Properties(ItemProperties, validate_assignment=True): + """Data model for CMIP6 Controlled Vocabulary.""" + + Conventions: str + activity_id: ActivityID + creation_date: datetime + data_specs_version: str + experiment: str + experiment_id: ExperimentID + frequency: Frequency + further_info_url: AnyHttpUrl + grid_label: GridLabel + institution: str + institution_id: 
InstitutionID + nominal_resolution: NominalResolution + realm: List[Realm] + source: str + source_id: SourceID + source_type: List[SourceType] + sub_experiment: str | Literal["none"] + sub_experiment_id: SubExperimentID | Literal["none"] + table_id: TableID + variable_id: str + variant_label: str + initialization_index: int + physics_index: int + realization_index: int + forcing_index: int + tracking_id: str + version: str + product: str + license: str + grid: str + mip_era: str + + @model_serializer + def serialize_extension(self): + """Add prefix to all fields.""" + return {prefix + k: v for k, v in self.model_dump_json()} + + @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") + @classmethod + def first_item(cls, v: list, info: FieldValidationInfo): + """Pick single item from list.""" + assert len(v) == 1, f"{info.field_name} must have one item only." + return v[0] + + @field_validator("realm", "source_type", mode="before") + @classmethod + def split(cls, v: str, info: FieldValidationInfo): + """Split string into list.""" + return v.split(" ") + + @field_validator("version") + @classmethod + def validate_version(cls, v: str, info: FieldValidationInfo): + assert v[0] == "v", "Version string should begin with a lower case 'v'" + assert v[1:].isdigit(), "All characters in version string, except first, should be digits" + return v + + + +class CMIP6Extension(Generic[T], ExtensionManagementMixin[pystac.Item], PropertiesExtension): + """An abstract class that can be used to extend the properties of a + :class:`~pystac.Item` with properties from the :stac-ext:`CMIP6 Extension `. + + To create an instance of :class:`CMIP6Extension`, use the :meth:`CMIP6Extension.ext` method. + """ + def apply(self, attrs: Dict[str, Any]) -> None: + """Applies Datacube Extension properties to the extended + :class:`~pystac.Collection`, :class:`~pystac.Item` or :class:`~pystac.Asset`. 
+ + Args: + dimensions : Dictionary mapping dimension name to :class:`Dimension` + objects. + variables : Dictionary mapping variable name to a :class:`Variable` + object. + """ + self.properties.update(**Properties(**attrs).model_dump_json()) + + @classmethod + def get_schema_uri(cls) -> str: + return SCHEMA_URI + @classmethod + def ext(cls, obj: T, add_if_missing: bool = False): + """Extends the given STAC Object with properties from the :stac-ext:`CMIP6 + Extension `. + + This extension can be applied to instances of :class:`~pystac.Item`. + + Raises: + pystac.ExtensionTypeError : If an invalid object type is passed. + """ + if isinstance(obj, pystac.Item): + cls.validate_has_extension(obj, add_if_missing) + return cast(CMIP6Extension[T], ItemCMIP6Extension(obj)) + else: + raise pystac.ExtensionTypeError(cls._ext_error_message(obj)) + +class ItemCMIP6Extension(CMIP6Extension[pystac.Item]): + """A concrete implementation of :class:`DatacubeExtension` on an + :class:`~pystac.Item` that extends the properties of the Item to include properties + defined in the :stac-ext:`Datacube Extension `. + + This class should generally not be instantiated directly. Instead, call + :meth:`DatacubeExtension.ext` on an :class:`~pystac.Item` to extend it. 
+ """ + + item: pystac.Item + properties: Dict[str, Any] + + def __init__(self, item: pystac.Item): + self.item = item + self.properties = item.properties + + def __repr__(self) -> str: + return "".format(self.item.id) + + +class CMIP6ExtensionHooks(ExtensionHooks): + schema_uri: str = SCHEMA_URI + prev_extension_ids = {"cmip6"} + stac_object_types = {pystac.STACObjectType.ITEM} + +CMIP6_EXTENSION_HOOKS: ExtensionHooks = CMIP6ExtensionHooks() diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index a15ac51..2541fe7 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -58,7 +58,8 @@ def __init__( self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) - self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() + #self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() + self._collection_id = self.collection_name LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") self.create_stac_collection() diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 9f17250..52f94c5 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -124,30 +124,14 @@ def resolve(self, base_url: str) -> None: """resolve a link to the given base URL""" self.href = urljoin(base_url, self.href) -class PaginationMethods(str, AutoValueEnum): - """ - https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension - """ - - GET = auto() - POST = auto() - - -class PaginationRelations(str, AutoValueEnum): - """ - https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension - """ - - next = auto() - previous = auto() class PaginationLink(Link): """ https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension """ - rel: PaginationRelations - method: PaginationMethods + rel: 
Literal["next", "previous"] + method: Literal["GET", "POST"] body: Optional[Dict[Any, Any]] = None merge: bool = False @@ -158,7 +142,7 @@ class Item(BaseModel): id: str = Field(..., alias="id", min_length=1) geometry: Optional[Geometry] = None bbox: Optional[List[float]] = None - properties: ItemProperties + properties: Optional[ItemProperties] = None assets: Dict[str, Asset] = None stac_extensions: Optional[List[AnyUrl]] = [] collection: Optional[str] = None diff --git a/tests/data/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml b/tests/data/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml new file mode 100644 index 0000000..6aa0ae6 --- /dev/null +++ b/tests/data/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml @@ -0,0 +1,183 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_cmip6_extension.py b/tests/test_cmip6_extension.py new file mode 100644 index 0000000..67e3dc8 --- /dev/null +++ b/tests/test_cmip6_extension.py @@ -0,0 +1,18 @@ +from STACpopulator.extensions import cmip6 +import xncml +from pathlib import Path +from pystac import Item + +TEST_DATA = Path(__file__).parent / "data" + +def test_extension(): + ds = xncml.Dataset(TEST_DATA / "o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml") + attrs = ds.to_cf_dict() + cfmeta = attrs["groups"]["CFMetadata"]["attributes"] + + item = Item(id="test", start_datetime=cfmeta["time_coverage_start"], end_datetime=cfmeta["time_coverage_end"]) + + ext = cmip6.CMIP6Extension.ext(item, add_if_missing=True) + ext.apply(attrs) + + From dba86a520be15bba39c4bc329943a03847c9d912 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Wed, 
27 Sep 2023 18:14:29 -0400 Subject: [PATCH 31/69] fixes to CMIP6 Properties model to allow serialize --- .gitignore | 4 +++- STACpopulator/extensions/cmip6.py | 38 ++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 0344ff2..7e12211 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ STACpopulator.egg-info/ .vscode/ .venv/ -jupyter/ \ No newline at end of file +jupyter/ +.idea +.vscode diff --git a/STACpopulator/extensions/cmip6.py b/STACpopulator/extensions/cmip6.py index 96cf74a..9868039 100644 --- a/STACpopulator/extensions/cmip6.py +++ b/STACpopulator/extensions/cmip6.py @@ -1,16 +1,22 @@ """CMIP6 extension based on https://stac-extensions.github.io/cmip6/v1.0.0/schema.json""" -from typing import Generic, TypeVar, Dict, Any, cast +import json +from typing import Generic, TypeVar, Union, cast import pystac from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension from pystac.extensions.hooks import ExtensionHooks -from datetime import datetime +from datetime import date, datetime from typing import Any, Dict, List, Literal import pyessv -from pydantic import (AnyHttpUrl, BaseModel, Field, FieldValidationInfo, field_validator, model_serializer, - FieldSerializationInfo) +from pydantic import ( + AnyHttpUrl, + FieldValidationInfo, + field_validator, + model_serializer, +) +from pydantic.networks import Url from STACpopulator.stac_utils import ItemProperties @@ -59,8 +65,8 @@ class Properties(ItemProperties, validate_assignment=True): source: str source_id: SourceID source_type: List[SourceType] - sub_experiment: str | Literal["none"] - sub_experiment_id: SubExperimentID | Literal["none"] + sub_experiment: Union[str, Literal["none"]] + sub_experiment_id: Union[SubExperimentID, Literal["none"]] table_id: TableID variable_id: str variant_label: str @@ -75,10 +81,19 @@ class Properties(ItemProperties, validate_assignment=True): grid: str mip_era: str - 
@model_serializer - def serialize_extension(self): + #@model_serializer + def serialize_extension(self) -> str: """Add prefix to all fields.""" - return {prefix + k: v for k, v in self.model_dump_json()} + + def json_encode(obj): + if isinstance(obj, Url): + return str(obj) + if isinstance(obj, (datetime, date)): + return obj.isoformat() + raise TypeError(f"Type {type(obj)} not serializable") + + data = {prefix + k: v for k, v in self.model_dump().items()} + return json.dumps(data, default=json_encode) @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") @classmethod @@ -101,7 +116,6 @@ def validate_version(cls, v: str, info: FieldValidationInfo): return v - class CMIP6Extension(Generic[T], ExtensionManagementMixin[pystac.Item], PropertiesExtension): """An abstract class that can be used to extend the properties of a :class:`~pystac.Item` with properties from the :stac-ext:`CMIP6 Extension `. @@ -118,7 +132,7 @@ def apply(self, attrs: Dict[str, Any]) -> None: variables : Dictionary mapping variable name to a :class:`Variable` object. 
""" - self.properties.update(**Properties(**attrs).model_dump_json()) + self.properties.update(**Properties(**attrs).model_dump()) @classmethod def get_schema_uri(cls) -> str: @@ -139,6 +153,7 @@ def ext(cls, obj: T, add_if_missing: bool = False): else: raise pystac.ExtensionTypeError(cls._ext_error_message(obj)) + class ItemCMIP6Extension(CMIP6Extension[pystac.Item]): """A concrete implementation of :class:`DatacubeExtension` on an :class:`~pystac.Item` that extends the properties of the Item to include properties @@ -164,4 +179,5 @@ class CMIP6ExtensionHooks(ExtensionHooks): prev_extension_ids = {"cmip6"} stac_object_types = {pystac.STACObjectType.ITEM} + CMIP6_EXTENSION_HOOKS: ExtensionHooks = CMIP6ExtensionHooks() From ee7a339ff5bcd00432f40109b1adb7057b8a5ebb Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 28 Sep 2023 10:11:02 -0400 Subject: [PATCH 32/69] fix prefix addition. test cmip6 extension --- STACpopulator/extensions/cmip6.py | 20 ++++++++++++-------- STACpopulator/stac_utils.py | 7 ++++--- tests/test_cmip6_extension.py | 10 ++++++---- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/STACpopulator/extensions/cmip6.py b/STACpopulator/extensions/cmip6.py index 96cf74a..2f0bdf8 100644 --- a/STACpopulator/extensions/cmip6.py +++ b/STACpopulator/extensions/cmip6.py @@ -20,8 +20,6 @@ SCHEMA_URI = "https://stac-extensions.github.io/cmip6/v1.0.0/schema.json" -prefix: str = "cmip6:" - # CMIP6 controlled vocabulary (CV) CV = pyessv.WCRP.CMIP6 @@ -75,11 +73,6 @@ class Properties(ItemProperties, validate_assignment=True): grid: str mip_era: str - @model_serializer - def serialize_extension(self): - """Add prefix to all fields.""" - return {prefix + k: v for k, v in self.model_dump_json()} - @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") @classmethod def first_item(cls, v: list, info: FieldValidationInfo): @@ -108,6 +101,8 @@ class CMIP6Extension(Generic[T], 
ExtensionManagementMixin[pystac.Item], Properti To create an instance of :class:`CMIP6Extension`, use the :meth:`CMIP6Extension.ext` method. """ + prefix: str = "cmip6:" + def apply(self, attrs: Dict[str, Any]) -> None: """Applies Datacube Extension properties to the extended :class:`~pystac.Collection`, :class:`~pystac.Item` or :class:`~pystac.Asset`. @@ -118,11 +113,20 @@ def apply(self, attrs: Dict[str, Any]) -> None: variables : Dictionary mapping variable name to a :class:`Variable` object. """ - self.properties.update(**Properties(**attrs).model_dump_json()) + import json + + p = Properties(**attrs) + + # Add prefix + objs = {self.prefix + k: v for (k, v) in json.loads(p.model_dump_json()).items()} + + # Update item properties + self.properties.update(**objs) @classmethod def get_schema_uri(cls) -> str: return SCHEMA_URI + @classmethod def ext(cls, obj: T, add_if_missing: bool = False): """Extends the given STAC Object with properties from the :stac-ext:`CMIP6 diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 52f94c5..c26542f 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -169,9 +169,10 @@ def __init__(self, iid: str, attrs: dict, datamodel=None): cfmeta = attrs["groups"]["CFMetadata"]["attributes"] # Global attributes - gattrs = {**attrs["attributes"], - "start_datetime": cfmeta["time_coverage_start"], - "end_datetime": cfmeta["time_coverage_end"]} + gattrs = {"start_datetime": cfmeta["time_coverage_start"], + "end_datetime": cfmeta["time_coverage_end"], + **attrs["attributes"], + } # Validate using pydantic data model if given datamodel = datamodel or dict diff --git a/tests/test_cmip6_extension.py b/tests/test_cmip6_extension.py index 67e3dc8..f899a33 100644 --- a/tests/test_cmip6_extension.py +++ b/tests/test_cmip6_extension.py @@ -1,18 +1,20 @@ from STACpopulator.extensions import cmip6 +from STACpopulator.stac_utils import CFJsonItem import xncml from pathlib import Path -from pystac import Item 
+from pystac import Item, validation TEST_DATA = Path(__file__).parent / "data" def test_extension(): ds = xncml.Dataset(TEST_DATA / "o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml") attrs = ds.to_cf_dict() - cfmeta = attrs["groups"]["CFMetadata"]["attributes"] - item = Item(id="test", start_datetime=cfmeta["time_coverage_start"], end_datetime=cfmeta["time_coverage_end"]) + item = CFJsonItem("test", attrs).item + validation.validate(item) ext = cmip6.CMIP6Extension.ext(item, add_if_missing=True) - ext.apply(attrs) + ext.apply(attrs["attributes"]) + assert "cmip6:realm" in item.properties From 0c829a1bf3196360567da22109f040bca3511bb7 Mon Sep 17 00:00:00 2001 From: David Huard Date: Fri, 29 Sep 2023 13:43:10 -0400 Subject: [PATCH 33/69] harmonized datacube and cmip extensions --- STACpopulator/stac_utils.py | 26 ++++--------------------- implementations/CMIP6-UofT/add_CMIP6.py | 16 ++++++++++++--- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index c26542f..24efb07 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -152,6 +152,8 @@ class Item(BaseModel): class CFJsonItem: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} + def __init__(self, iid: str, attrs: dict, datamodel=None): """ Create STAC Item from CF JSON metadata. @@ -249,28 +251,8 @@ def ncattrs_to_bbox(self) -> list: float(attrs["geospatial_lat_max"][0]), ] - -class DatacubeExt: - """Extend STAC Item with Datacube properties.""" - axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} - - def __init__(self, obj: CFJsonItem): - """ - Add Datacube extension to STAC Item. - - Parameters - ---------- - obj : CFJsonItem - STAC Item created from CF JSON metadata. 
- """ - self.obj = obj - self.attrs = obj.attrs - - self.ext = DatacubeExtension.ext(self.obj.item, add_if_missing=True) - self.ext.apply(dimensions=self.dimensions(), variables=self.variables()) - def dimensions(self) -> dict: - """Return Dimension objects.""" + """Return Dimension objects required for Datacube extension.""" dims = {} for name, length in self.attrs["dimensions"].items(): @@ -304,7 +286,7 @@ def dimensions(self) -> dict: return dims def variables(self)->dict: - """Return Variable objects""" + """Return Variable objects required for Datacube extension.""" variables = {} for name, meta in self.attrs["variables"].items(): diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 1b219ee..4fdea40 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -6,12 +6,13 @@ import argparse import pyessv from pydantic import AnyHttpUrl, BaseModel, Field, FieldValidationInfo, field_validator - +from pystac.extensions.datacube import DatacubeExtension from STACpopulator import STACpopulatorBase +from STACpopulator.extensions import cmip6 from STACpopulator.input import THREDDSLoader from STACpopulator.stac_utils import ItemProperties -from STACpopulator.stac_utils import collection2literal, DatacubeExt, CFJsonItem +from STACpopulator.stac_utils import collection2literal, CFJsonItem LOGGER = logging.getLogger(__name__) @@ -148,8 +149,17 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) obj = CFJsonItem(iid, item_data, self.props_model) + # Add CMIP6 extension + try: + cmip6_ext = cmip6.CMIP6Extension.ext(obj.item, add_if_missing=True) + cmip6_ext.apply(item_data["attributes"]) + except: + LOGGER.warning(f"Failed to add CMIP6 extension to item {item_name}") + + # Add datacube extension try: - DatacubeExt(obj) + dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) + dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) 
except: LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") From 71ce3e076185bb413c580060e36ee27990648d59 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:03:00 -0400 Subject: [PATCH 34/69] adding numpy types to python types conversion for metadata --- STACpopulator/input.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 088051f..3b2c11c 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import Any, Iterator, MutableMapping, Optional, Tuple +import numpy as np import requests import siphon import xncml @@ -37,10 +38,6 @@ def reset(self): pass - - - - class THREDDSLoader(GenericLoader): def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> None: """Constructor @@ -89,6 +86,21 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() + # Converting numpy datatypes to python standard datatypes + for key, value in attrs["attributes"].items(): + if isinstance(value, list): + newlist = [] + for item in value: + if issubclass(type(item), np.integer): + newlist.append(int(item)) + elif issubclass(type(item), np.floating): + newlist.append(float(item)) + else: + newlist.append(item) + attrs["attributes"][key] = newlist + elif isinstance(type(value), np.integer): + attrs["attributes"][key] = int(value) + attrs["access_urls"] = ds.access_urls return attrs From 350b4f4cb26f064fe83e25eb436417ccc89c5312 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:04:37 -0400 Subject: [PATCH 35/69] removing collection2enum --- STACpopulator/stac_utils.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 24efb07..cb424c2 100644 --- 
a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -47,23 +47,6 @@ def url_validate(target: str) -> bool: return True if re.match(url_regex, target) else False -def collection2enum(collection: pyessv.model.collection.Collection) -> enumtype: - """Create Enum based on terms from pyessv collection. - - Parameters - ---------- - collection : pyessv.model.collection.Collection - pyessv collection of terms. - - Returns - ------- - Enum - Enum storing terms and their labels from collection. - """ - mp = {term.name: term.label for term in collection} - return Enum(collection.raw_name.capitalize(), mp, module="base") - - def collection2literal(collection): import typing terms = tuple(term.label for term in collection) From 2a445f04b7f4c12dd50401baf8ce4502aa696aff Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:08:24 -0400 Subject: [PATCH 36/69] black --- STACpopulator/stac_utils.py | 318 +++++++++++++++++++----------------- 1 file changed, 170 insertions(+), 148 deletions(-) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index cb424c2..6c4f67a 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -135,6 +135,7 @@ class Item(BaseModel): class CFJsonItem: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} def __init__(self, iid: str, attrs: dict, datamodel=None): @@ -154,10 +155,11 @@ def __init__(self, iid: str, attrs: dict, datamodel=None): cfmeta = attrs["groups"]["CFMetadata"]["attributes"] # Global attributes - gattrs = {"start_datetime": cfmeta["time_coverage_start"], - "end_datetime": cfmeta["time_coverage_end"], - **attrs["attributes"], - } + gattrs = { + "start_datetime": cfmeta["time_coverage_start"], + "end_datetime": cfmeta["time_coverage_end"], + **attrs["attributes"], + } # Validate using pydantic data model if given 
datamodel = datamodel or dict @@ -179,8 +181,8 @@ class MySTACItem(Item): # Add assets if "access_urls" in attrs: root = attrs["access_urls"] - elif 'THREDDSMetadata' in attrs["groups"]: - root = attrs["groups"]['THREDDSMetadata']['groups']['services']['attributes'] + elif "THREDDSMetadata" in attrs["groups"]: + root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] else: root = {} @@ -244,31 +246,32 @@ def dimensions(self) -> dict: bbox = self.obj.ncattrs_to_bbox() for key, criteria in coordinate_criteria.items(): for criterion, expected in criteria.items(): - if v['attributes'].get(criterion, None) in expected: + if v["attributes"].get(criterion, None) in expected: axis = self.axis[key] - type_ = DimensionType.SPATIAL if axis in ['x', 'y', 'z'] else DimensionType.TEMPORAL + type_ = DimensionType.SPATIAL if axis in ["x", "y", "z"] else DimensionType.TEMPORAL - if v['type'] == 'int': + if v["type"] == "int": extent = [0, int(length)] else: # Not clear the logic is sound - if key == 'X': + if key == "X": extent = bbox[0], bbox[2] elif key == "Y": extent = bbox[1], bbox[3] else: extent = None - dims[name] = Dimension(properties=dict( - axis = axis, - type = type_, - extent = extent, - description=v.get("description", v.get("long_name", criteria["standard_name"])) + dims[name] = Dimension( + properties=dict( + axis=axis, + type=type_, + extent=extent, + description=v.get("description", v.get("long_name", criteria["standard_name"])), ) ) return dims - def variables(self)->dict: + def variables(self) -> dict: """Return Variable objects required for Datacube extension.""" variables = {} @@ -276,17 +279,18 @@ def variables(self)->dict: if name in self.attrs["dimensions"]: continue - attrs = meta['attributes'] - variables[name] = Variable(properties=dict( + attrs = meta["attributes"] + variables[name] = Variable( + properties=dict( dimensions=meta["shape"], - type = VariableType.AUXILIARY.value if self.is_coordinate(attrs) else - 
VariableType.DATA.value, + type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, description=attrs.get("description", attrs.get("long_name")), - unit=attrs.get("units", None) - )) + unit=attrs.get("units", None), + ) + ) return variables - def is_coordinate(self, attrs: dict)-> bool: + def is_coordinate(self, attrs: dict) -> bool: """Return whether variable is a coordinate.""" for key, criteria in coordinate_criteria.items(): for criterion, expected in criteria.items(): @@ -297,128 +301,146 @@ def is_coordinate(self, attrs: dict)-> bool: # From CF-Xarray coordinate_criteria = { - 'latitude': {'standard_name': ('latitude',), - 'units': ('degree_north', - 'degree_N', - 'degreeN', - 'degrees_north', - 'degrees_N', - 'degreesN'), - '_CoordinateAxisType': ('Lat',), - 'long_name': ('latitude',)}, - 'longitude': {'standard_name': ('longitude',), - 'units': ('degree_east', - 'degree_E', - 'degreeE', - 'degrees_east', - 'degrees_E', - 'degreesE'), - '_CoordinateAxisType': ('Lon',), - 'long_name': ('longitude',)}, - 'Z': {'standard_name': ('model_level_number', - 'atmosphere_ln_pressure_coordinate', - 'atmosphere_sigma_coordinate', - 'atmosphere_hybrid_sigma_pressure_coordinate', - 'atmosphere_hybrid_height_coordinate', - 'atmosphere_sleve_coordinate', - 'ocean_sigma_coordinate', - 'ocean_s_coordinate', - 'ocean_s_coordinate_g1', - 'ocean_s_coordinate_g2', - 'ocean_sigma_z_coordinate', - 'ocean_double_sigma_coordinate'), - '_CoordinateAxisType': ('GeoZ', 'Height', 'Pressure'), - 'axis': ('Z',), - 'cartesian_axis': ('Z',), - 'grads_dim': ('z',), - 'long_name': ('model_level_number', - 'atmosphere_ln_pressure_coordinate', - 'atmosphere_sigma_coordinate', - 'atmosphere_hybrid_sigma_pressure_coordinate', - 'atmosphere_hybrid_height_coordinate', - 'atmosphere_sleve_coordinate', - 'ocean_sigma_coordinate', - 'ocean_s_coordinate', - 'ocean_s_coordinate_g1', - 'ocean_s_coordinate_g2', - 'ocean_sigma_z_coordinate', - 
'ocean_double_sigma_coordinate')}, - 'vertical': {'standard_name': ('air_pressure', - 'height', - 'depth', - 'geopotential_height', - 'altitude', - 'height_above_geopotential_datum', - 'height_above_reference_ellipsoid', - 'height_above_mean_sea_level'), - 'positive': ('up', 'down'), - 'long_name': ('air_pressure', - 'height', - 'depth', - 'geopotential_height', - 'altitude', - 'height_above_geopotential_datum', - 'height_above_reference_ellipsoid', - 'height_above_mean_sea_level')}, - 'X': {'standard_name': ('projection_x_coordinate', - 'grid_longitude', - 'projection_x_angular_coordinate'), - '_CoordinateAxisType': ('GeoX',), - 'axis': ('X',), - 'cartesian_axis': ('X',), - 'grads_dim': ('x',), - 'long_name': ('projection_x_coordinate', - 'grid_longitude', - 'projection_x_angular_coordinate', - 'cell index along first dimension')}, - 'Y': {'standard_name': ('projection_y_coordinate', - 'grid_latitude', - 'projection_y_angular_coordinate'), - '_CoordinateAxisType': ('GeoY',), - 'axis': ('Y',), - 'cartesian_axis': ('Y',), - 'grads_dim': ('y',), - 'long_name': ('projection_y_coordinate', - 'grid_latitude', - 'projection_y_angular_coordinate', - 'cell index along second dimension')}, - 'T': {'standard_name': ('time',), - '_CoordinateAxisType': ('Time',), - 'axis': ('T',), - 'cartesian_axis': ('T',), - 'grads_dim': ('t',), - 'long_name': ('time',)}, - 'time': {'standard_name': ('time',), - '_CoordinateAxisType': ('Time',), - 'axis': ('T',), - 'cartesian_axis': ('T',), - 'grads_dim': ('t',), - 'long_name': ('time',)}} - - -media_types = {"httpserver_service": "application/x-netcdf", - "opendap_service": pystac.MediaType.HTML, - "wcs_service": pystac.MediaType.XML, - "wms_service": pystac.MediaType.XML, - "nccs_service": "application/x-netcdf", - "HTTPServer": "application/x-netcdf", - "OPENDAP": pystac.MediaType.HTML, - "NCML": pystac.MediaType.XML, - "WCS": pystac.MediaType.XML, - "ISO": pystac.MediaType.XML, - "WMS": pystac.MediaType.XML, - "NetcdfSubset": 
"application/x-netcdf", - } - -asset_roles = {"httpserver_service": ["data"], - "opendap_service": ["data"], - "wcs_service": ["data"], - "wms_service": ["visual"], - "nccs_service": ["data"], - "HTTPServer": ["data"], - "OPENDAP": ["data"], - "NCML": ["metadata"], - "WCS": ["data"], - "ISO": ["metadata"], - "WMS": ["visual"], - "NetcdfSubset": ["data"],} + "latitude": { + "standard_name": ("latitude",), + "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), + "_CoordinateAxisType": ("Lat",), + "long_name": ("latitude",), + }, + "longitude": { + "standard_name": ("longitude",), + "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), + "_CoordinateAxisType": ("Lon",), + "long_name": ("longitude",), + }, + "Z": { + "standard_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), + "axis": ("Z",), + "cartesian_axis": ("Z",), + "grads_dim": ("z",), + "long_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + }, + "vertical": { + "standard_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + "positive": 
("up", "down"), + "long_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + }, + "X": { + "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), + "_CoordinateAxisType": ("GeoX",), + "axis": ("X",), + "cartesian_axis": ("X",), + "grads_dim": ("x",), + "long_name": ( + "projection_x_coordinate", + "grid_longitude", + "projection_x_angular_coordinate", + "cell index along first dimension", + ), + }, + "Y": { + "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), + "_CoordinateAxisType": ("GeoY",), + "axis": ("Y",), + "cartesian_axis": ("Y",), + "grads_dim": ("y",), + "long_name": ( + "projection_y_coordinate", + "grid_latitude", + "projection_y_angular_coordinate", + "cell index along second dimension", + ), + }, + "T": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + "time": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, +} + + +media_types = { + "httpserver_service": "application/x-netcdf", + "opendap_service": pystac.MediaType.HTML, + "wcs_service": pystac.MediaType.XML, + "wms_service": pystac.MediaType.XML, + "nccs_service": "application/x-netcdf", + "HTTPServer": "application/x-netcdf", + "OPENDAP": pystac.MediaType.HTML, + "NCML": pystac.MediaType.XML, + "WCS": pystac.MediaType.XML, + "ISO": pystac.MediaType.XML, + "WMS": pystac.MediaType.XML, + "NetcdfSubset": "application/x-netcdf", +} + +asset_roles = { + "httpserver_service": ["data"], + "opendap_service": ["data"], + "wcs_service": ["data"], + "wms_service": ["visual"], + "nccs_service": ["data"], + "HTTPServer": ["data"], + 
"OPENDAP": ["data"], + "NCML": ["metadata"], + "WCS": ["data"], + "ISO": ["metadata"], + "WMS": ["visual"], + "NetcdfSubset": ["data"], +} From 2728ce664e83ab212eb0209841fbec2ff29462fc Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:13:51 -0400 Subject: [PATCH 37/69] extracting pydantic base models to models.py --- STACpopulator/models.py | 74 ++++++++++++++++ STACpopulator/stac_utils.py | 112 ++---------------------- implementations/CMIP6-UofT/add_CMIP6.py | 43 ++++----- 3 files changed, 99 insertions(+), 130 deletions(-) create mode 100644 STACpopulator/models.py diff --git a/STACpopulator/models.py b/STACpopulator/models.py new file mode 100644 index 0000000..2b617b1 --- /dev/null +++ b/STACpopulator/models.py @@ -0,0 +1,74 @@ +import datetime as dt +from typing import Any, Dict, List, Optional, Union + +from pydantic import AnyHttpUrl, AnyUrl, BaseModel, Field, field_validator +from typing_extensions import TypedDict + + +class Geometry(TypedDict): + type: str + coordinates: List[List[List[float]]] + + +class Asset(BaseModel): + href: AnyHttpUrl + media_type: Optional[str] = None + title: Optional[str] = None + description: Optional[str] = None + roles: Optional[List[str]] = None + + +class STACItemProperties(BaseModel): + start_datetime: Optional[dt.datetime] = None + end_datetime: Optional[dt.datetime] = None + datetime: Optional[dt.datetime] = None + + @field_validator("datetime", mode="before") + @classmethod + def validate_datetime(cls, v: Union[dt.datetime, str], values: Dict[str, Any]) -> dt: + if v == "null": + if not values["start_datetime"] and not values["end_datetime"]: + raise ValueError("start_datetime and end_datetime must be specified when datetime is null") + + +# class Link(BaseModel): +# """ +# https://github.com/radiantearth/stac-spec/blob/v1.0.0/collection-spec/collection-spec.md#link-object +# """ + +# href: str = Field(..., alias="href", min_length=1) +# rel: str = Field(..., alias="rel", min_length=1) +# type: 
Optional[str] = None +# title: Optional[str] = None +# # Label extension +# label: Optional[str] = Field(None, alias="label:assets") +# model_config = ConfigDict(use_enum_values=True) + +# def resolve(self, base_url: str) -> None: +# """resolve a link to the given base URL""" +# self.href = urljoin(base_url, self.href) + + +# class PaginationLink(Link): +# """ +# https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension +# """ + +# rel: Literal["next", "previous"] +# method: Literal["GET", "POST"] +# body: Optional[Dict[Any, Any]] = None +# merge: bool = False + + +# Links = RootModel[List[Union[PaginationLink, Link]]] + + +class STACItem(BaseModel): + id: str = Field(..., alias="id", min_length=1) + geometry: Optional[Geometry] = None + bbox: Optional[List[float]] = None + properties: Optional[STACItemProperties] = None + assets: Dict[str, Asset] = None + stac_extensions: Optional[List[AnyUrl]] = [] + collection: Optional[str] = None + datetime: Optional[dt.datetime] = None # Not in the spec, but needed by pystac.Item. 
diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 6c4f67a..df84f10 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,27 +1,12 @@ -import re import json -import datetime as dt -from enum import Enum, auto -from typing import Any, Iterator, MutableMapping, Optional, Tuple, Union -from typing import Any, Dict, List, Literal, MutableMapping -from typing_extensions import TypedDict -import pystac -from pystac.extensions.datacube import Dimension, DimensionType, VariableType, Variable, DatacubeExtension -from pydantic import AnyHttpUrl, BaseModel, field_validator, Field, ConfigDict, RootModel, AnyUrl -from urllib.parse import urljoin - - -import pyessv - +import re +from typing import Any, Literal, MutableMapping -try: - from enum import EnumType as enumtype -except ImportError: - # < Python 3.11 - from enum import EnumMeta as enumtype +import pystac +from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType +from STACpopulator.models import STACItem, STACItemProperties -STAC_VERSION = "1.0.0" def url_validate(target: str) -> bool: """Validate whether a supplied URL is reliably written. @@ -48,89 +33,8 @@ def url_validate(target: str) -> bool: def collection2literal(collection): - import typing terms = tuple(term.label for term in collection) - return typing.Literal[terms] - - -class AutoValueEnum(Enum): - def _generate_next_value_( # type: ignore - name: str, start: int, count: int, last_values: List[Any] - ) -> Any: - return name - - -# DH: There is a question here whether we want to use pystac.Item or not. -# pystac.Item takes datetime, start_datetime and end_datetime as optional parameters, and then copies them into -# properties. -# If we use pystac.Item, we don't have to put start_datetime and end_datetime into Properties, we can let pystac do -# that. 
-class ItemProperties(BaseModel): - start_datetime: Optional[dt.datetime] = None - end_datetime: Optional[dt.datetime] = None - datetime: Optional[dt.datetime] = None - - @field_validator("datetime", mode="before") - def validate_datetime(cls, v: Union[dt.datetime, str], values: Dict[str, Any]) -> dt: - if v == "null": - if not values["start_datetime"] and not values["end_datetime"]: - raise ValueError( - "start_datetime and end_datetime must be specified when datetime is null" - ) - - -class Geometry(TypedDict): - type: str - coordinates: List[List[List[float]]] - -class Asset(BaseModel): - href: AnyHttpUrl - media_type: Optional[str] = None - title: Optional[str] = None - description: Optional[str] = None - roles: Optional[List[str]] = None - -class Link(BaseModel): - """ - https://github.com/radiantearth/stac-spec/blob/v1.0.0/collection-spec/collection-spec.md#link-object - """ - - href: str = Field(..., alias="href", min_length=1) - rel: str = Field(..., alias="rel", min_length=1) - type: Optional[str] = None - title: Optional[str] = None - # Label extension - label: Optional[str] = Field(None, alias="label:assets") - model_config = ConfigDict(use_enum_values=True) - - def resolve(self, base_url: str) -> None: - """resolve a link to the given base URL""" - self.href = urljoin(base_url, self.href) - - -class PaginationLink(Link): - """ - https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension - """ - - rel: Literal["next", "previous"] - method: Literal["GET", "POST"] - body: Optional[Dict[Any, Any]] = None - merge: bool = False - -Links = RootModel[List[Union[PaginationLink, Link]]] - - -class Item(BaseModel): - id: str = Field(..., alias="id", min_length=1) - geometry: Optional[Geometry] = None - bbox: Optional[List[float]] = None - properties: Optional[ItemProperties] = None - assets: Dict[str, Asset] = None - stac_extensions: Optional[List[AnyUrl]] = [] - collection: Optional[str] = None - datetime: Optional[dt.datetime] = None 
# Not in the spec, but needed by pystac.Item. - + return Literal[terms] class CFJsonItem: @@ -162,9 +66,9 @@ def __init__(self, iid: str, attrs: dict, datamodel=None): } # Validate using pydantic data model if given - datamodel = datamodel or dict + datamodel = datamodel or STACItemProperties - class MySTACItem(Item): + class MySTACItem(STACItem): properties: datamodel # Create STAC item diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 4fdea40..55162f6 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,19 +1,18 @@ -import logging +import argparse import hashlib +import logging from datetime import datetime from typing import Any, Dict, List, Literal, MutableMapping -from colorlog import ColoredFormatter -import argparse + import pyessv -from pydantic import AnyHttpUrl, BaseModel, Field, FieldValidationInfo, field_validator -from pystac.extensions.datacube import DatacubeExtension +from colorlog import ColoredFormatter +from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase from STACpopulator.extensions import cmip6 from STACpopulator.input import THREDDSLoader -from STACpopulator.stac_utils import ItemProperties -from STACpopulator.stac_utils import collection2literal, CFJsonItem - +from STACpopulator.models import STACItemProperties +from STACpopulator.stac_utils import CFJsonItem, collection2literal LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -33,17 +32,15 @@ Frequency = collection2literal(CV.frequency) GridLabel = collection2literal(CV.grid_label) InstitutionID = collection2literal(CV.institution_id) -# Member = collection2literal(CV.member_id) # This is empty NominalResolution = collection2literal(CV.nominal_resolution) Realm = collection2literal(CV.realm) SourceID = collection2literal(CV.source_id) 
SourceType = collection2literal(CV.source_type) SubExperimentID = collection2literal(CV.sub_experiment_id) TableID = collection2literal(CV.table_id) -# Variable = collection2literal(CV.variable_id) # This is empty -class Properties(ItemProperties, validate_assignment=True): +class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): """Data model for CMIP6 Controlled Vocabulary.""" Conventions: str = Field(..., serialization_alias="cmip6:Conventions") @@ -129,7 +126,7 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: st """ data_loader = THREDDSLoader(thredds_catalog_url) - self.props_model = Properties + self.item_properties_model = CMIP6ItemProperties super().__init__(stac_host, data_loader, config_filename) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): @@ -147,22 +144,16 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = make_cmip6_item_id(item_data["attributes"]) - obj = CFJsonItem(iid, item_data, self.props_model) - - # Add CMIP6 extension - try: - cmip6_ext = cmip6.CMIP6Extension.ext(obj.item, add_if_missing=True) - cmip6_ext.apply(item_data["attributes"]) - except: - LOGGER.warning(f"Failed to add CMIP6 extension to item {item_name}") + obj = CFJsonItem(iid, item_data, self.item_properties_model) - # Add datacube extension - try: - dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) - dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) - except: - LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") + # # Add datacube extension + # try: + # dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) + # dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) + # except: + # LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") + print(obj.item.to_dict()) return obj.item.to_dict() def validate_stac_item_cv(self, data: MutableMapping[str, Any]) 
-> bool: From b47d613e659767149807fed8984bfbf709f94388 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 4 Oct 2023 22:17:11 -0400 Subject: [PATCH 38/69] removing cmip6 extension code --- STACpopulator/extensions/__init__.py | 0 STACpopulator/extensions/cmip6.py | 179 ------------------------ implementations/CMIP6-UofT/add_CMIP6.py | 1 - 3 files changed, 180 deletions(-) delete mode 100644 STACpopulator/extensions/__init__.py delete mode 100644 STACpopulator/extensions/cmip6.py diff --git a/STACpopulator/extensions/__init__.py b/STACpopulator/extensions/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/STACpopulator/extensions/cmip6.py b/STACpopulator/extensions/cmip6.py deleted file mode 100644 index 2e75020..0000000 --- a/STACpopulator/extensions/cmip6.py +++ /dev/null @@ -1,179 +0,0 @@ -"""CMIP6 extension based on https://stac-extensions.github.io/cmip6/v1.0.0/schema.json""" - -import json -from typing import Generic, TypeVar, Union, cast - -import pystac -from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension -from pystac.extensions.hooks import ExtensionHooks - -from datetime import date, datetime -from typing import Any, Dict, List, Literal -import pyessv -from pydantic import ( - AnyHttpUrl, - FieldValidationInfo, - field_validator, - model_serializer, -) -from pydantic.networks import Url - - -from STACpopulator.stac_utils import ItemProperties -from STACpopulator.stac_utils import collection2literal - -T = TypeVar("T", pystac.Collection, pystac.Item, pystac.Asset) - -SCHEMA_URI = "https://stac-extensions.github.io/cmip6/v1.0.0/schema.json" - - -# CMIP6 controlled vocabulary (CV) -CV = pyessv.WCRP.CMIP6 - -# Enum classes built from the pyessv' CV -ActivityID = collection2literal(CV.activity_id) -ExperimentID = collection2literal(CV.experiment_id) -Frequency = collection2literal(CV.frequency) -GridLabel = collection2literal(CV.grid_label) -InstitutionID = collection2literal(CV.institution_id) 
-NominalResolution = collection2literal(CV.nominal_resolution) -Realm = collection2literal(CV.realm) -SourceID = collection2literal(CV.source_id) -SourceType = collection2literal(CV.source_type) -SubExperimentID = collection2literal(CV.sub_experiment_id) -TableID = collection2literal(CV.table_id) - - -class Properties(ItemProperties, validate_assignment=True): - """Data model for CMIP6 Controlled Vocabulary.""" - - Conventions: str - activity_id: ActivityID - creation_date: datetime - data_specs_version: str - experiment: str - experiment_id: ExperimentID - frequency: Frequency - further_info_url: AnyHttpUrl - grid_label: GridLabel - institution: str - institution_id: InstitutionID - nominal_resolution: NominalResolution - realm: List[Realm] - source: str - source_id: SourceID - source_type: List[SourceType] - sub_experiment: Union[str, Literal["none"]] - sub_experiment_id: Union[SubExperimentID, Literal["none"]] - table_id: TableID - variable_id: str - variant_label: str - initialization_index: int - physics_index: int - realization_index: int - forcing_index: int - tracking_id: str - version: str - product: str - license: str - grid: str - mip_era: str - - - @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") - @classmethod - def first_item(cls, v: list, info: FieldValidationInfo): - """Pick single item from list.""" - assert len(v) == 1, f"{info.field_name} must have one item only." 
- return v[0] - - @field_validator("realm", "source_type", mode="before") - @classmethod - def split(cls, v: str, info: FieldValidationInfo): - """Split string into list.""" - return v.split(" ") - - @field_validator("version") - @classmethod - def validate_version(cls, v: str, info: FieldValidationInfo): - assert v[0] == "v", "Version string should begin with a lower case 'v'" - assert v[1:].isdigit(), "All characters in version string, except first, should be digits" - return v - - -class CMIP6Extension(Generic[T], ExtensionManagementMixin[pystac.Item], PropertiesExtension): - """An abstract class that can be used to extend the properties of a - :class:`~pystac.Item` with properties from the :stac-ext:`CMIP6 Extension `. - - To create an instance of :class:`CMIP6Extension`, use the :meth:`CMIP6Extension.ext` method. - """ - prefix: str = "cmip6:" - - def apply(self, attrs: Dict[str, Any]) -> None: - """Applies Datacube Extension properties to the extended - :class:`~pystac.Collection`, :class:`~pystac.Item` or :class:`~pystac.Asset`. - - Args: - dimensions : Dictionary mapping dimension name to :class:`Dimension` - objects. - variables : Dictionary mapping variable name to a :class:`Variable` - object. - """ - import json - - p = Properties(**attrs) - - # Add prefix - objs = {self.prefix + k: v for (k, v) in json.loads(p.model_dump_json()).items()} - - # Update item properties - self.properties.update(**objs) - - @classmethod - def get_schema_uri(cls) -> str: - return SCHEMA_URI - - @classmethod - def ext(cls, obj: T, add_if_missing: bool = False): - """Extends the given STAC Object with properties from the :stac-ext:`CMIP6 - Extension `. - - This extension can be applied to instances of :class:`~pystac.Item`. - - Raises: - pystac.ExtensionTypeError : If an invalid object type is passed. 
- """ - if isinstance(obj, pystac.Item): - cls.validate_has_extension(obj, add_if_missing) - return cast(CMIP6Extension[T], ItemCMIP6Extension(obj)) - else: - raise pystac.ExtensionTypeError(cls._ext_error_message(obj)) - - -class ItemCMIP6Extension(CMIP6Extension[pystac.Item]): - """A concrete implementation of :class:`DatacubeExtension` on an - :class:`~pystac.Item` that extends the properties of the Item to include properties - defined in the :stac-ext:`Datacube Extension `. - - This class should generally not be instantiated directly. Instead, call - :meth:`DatacubeExtension.ext` on an :class:`~pystac.Item` to extend it. - """ - - item: pystac.Item - properties: Dict[str, Any] - - def __init__(self, item: pystac.Item): - self.item = item - self.properties = item.properties - - def __repr__(self) -> str: - return "".format(self.item.id) - - -class CMIP6ExtensionHooks(ExtensionHooks): - schema_uri: str = SCHEMA_URI - prev_extension_ids = {"cmip6"} - stac_object_types = {pystac.STACObjectType.ITEM} - - -CMIP6_EXTENSION_HOOKS: ExtensionHooks = CMIP6ExtensionHooks() diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 55162f6..8cf0298 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -9,7 +9,6 @@ from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase -from STACpopulator.extensions import cmip6 from STACpopulator.input import THREDDSLoader from STACpopulator.models import STACItemProperties from STACpopulator.stac_utils import CFJsonItem, collection2literal From 2f5dc39598f13b8f028a31723030306f287da5ce Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 6 Oct 2023 14:34:15 -0400 Subject: [PATCH 39/69] Breaking CFJsonItem part 1: extracting STAC item creation --- STACpopulator/models.py | 17 ++++- STACpopulator/populator_base.py | 21 ++++-- STACpopulator/stac_utils.py | 90 +++++++++++++++++++++++++ 
implementations/CMIP6-UofT/add_CMIP6.py | 19 ++++-- 4 files changed, 131 insertions(+), 16 deletions(-) diff --git a/STACpopulator/models.py b/STACpopulator/models.py index 2b617b1..625efc2 100644 --- a/STACpopulator/models.py +++ b/STACpopulator/models.py @@ -1,7 +1,14 @@ import datetime as dt from typing import Any, Dict, List, Optional, Union -from pydantic import AnyHttpUrl, AnyUrl, BaseModel, Field, field_validator +from pydantic import ( + AnyHttpUrl, + AnyUrl, + BaseModel, + Field, + SerializeAsAny, + field_validator, +) from typing_extensions import TypedDict @@ -19,6 +26,10 @@ class Asset(BaseModel): class STACItemProperties(BaseModel): + """Base STAC Item properties data model. In concrete implementations, users would want to define a new + data model that inherits from this base model and extends it with properties tailored to the data they are + ingesting.""" + start_datetime: Optional[dt.datetime] = None end_datetime: Optional[dt.datetime] = None datetime: Optional[dt.datetime] = None @@ -64,10 +75,12 @@ def validate_datetime(cls, v: Union[dt.datetime, str], values: Dict[str, Any]) - class STACItem(BaseModel): + """STAC Item data model.""" + id: str = Field(..., alias="id", min_length=1) geometry: Optional[Geometry] = None bbox: Optional[List[float]] = None - properties: Optional[STACItemProperties] = None + properties: Optional[SerializeAsAny[STACItemProperties]] = None assets: Dict[str, Asset] = None stac_extensions: Optional[List[AnyUrl]] = [] collection: Optional[str] = None diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 2541fe7..3d8f50c 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -58,7 +58,7 @@ def __init__( self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) - #self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() + # self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() 
self._collection_id = self.collection_name LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") @@ -76,6 +76,13 @@ def stac_host(self) -> str: def collection_id(self) -> str: return self._collection_id + @property + @abstractmethod + def item_properties_model(self): + """In derived classes, this property should be defined as a pydantic data model that derives from + models.STACItemProperties.""" + pass + def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): raise ValueError("stac_host URL is not appropriately formatted") @@ -115,12 +122,12 @@ def ingest(self) -> None: for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) - post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) - try: - pass - except Exception: - LOGGER.error(f"Failed adding STAC item {item_name}") - self.handle_ingestion_error("Posting Error", item_name, item_data) + # post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) + # try: + # pass + # except Exception: + # LOGGER.error(f"Failed adding STAC item {item_name}") + # self.handle_ingestion_error("Posting Error", item_name, item_data) @abstractmethod def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index df84f10..361eead 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -37,6 +37,96 @@ def collection2literal(collection): return Literal[terms] +def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """Create Polygon geometry from CFMetadata.""" + attrs = attrs["groups"]["CFMetadata"]["attributes"] + return { + "type": "Polygon", + "coordinates": [ + [ + [ + float(attrs["geospatial_lon_min"][0]), + 
float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + ] + ], + } + + +def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: + """Create BBOX from CFMetadata.""" + attrs = attrs["groups"]["CFMetadata"]["attributes"] + return [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), + ] + + +def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel): + """ + Create STAC Item from CF JSON metadata. + + Parameters + ---------- + iid : str + Unique item ID. + attrs: dict + CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. + datamodel : pydantic.BaseModel, optional + Data model for validating global attributes. 
+ """ + + cfmeta = attrs["groups"]["CFMetadata"]["attributes"] + + # Create pydantic STAC item + item = STACItem( + id=iid, + geometry=ncattrs_to_geometry(attrs), + bbox=ncattrs_to_bbox(attrs), + properties=item_props_datamodel( + start_datetime=cfmeta["time_coverage_start"], + end_datetime=cfmeta["time_coverage_end"], + **attrs["attributes"], + ), + datetime=None, + ) + + # Convert pydantic STAC item to a PySTAC Item + item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) + + # Add assets + if "access_urls" in attrs: + root = attrs["access_urls"] + elif "THREDDSMetadata" in attrs["groups"]: + root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] + else: + root = {} + + for name, url in root.items(): + asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) + item.add_asset(name, asset) + + return item + + class CFJsonItem: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 8cf0298..bfb5d53 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -11,7 +11,11 @@ from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader from STACpopulator.models import STACItemProperties -from STACpopulator.stac_utils import CFJsonItem, collection2literal +from STACpopulator.stac_utils import ( + CFJsonItem, + STAC_item_from_metadata, + collection2literal, +) LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -109,10 +113,11 @@ def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: ] name = "_".join(attrs[k] for k in keys) return name - return hashlib.md5(name.encode("utf-8")).hexdigest() class CMIP6populator(STACpopulatorBase): + item_properties_model = CMIP6ItemProperties + def __init__(self, 
stac_host: str, thredds_catalog_url: str, config_filename: str) -> None: """Constructor @@ -125,7 +130,6 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: st """ data_loader = THREDDSLoader(thredds_catalog_url) - self.item_properties_model = CMIP6ItemProperties super().__init__(stac_host, data_loader, config_filename) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): @@ -143,17 +147,18 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = make_cmip6_item_id(item_data["attributes"]) - obj = CFJsonItem(iid, item_data, self.item_properties_model) + item = STAC_item_from_metadata(iid, item_data, self.item_properties_model) - # # Add datacube extension + # Add datacube extension # try: # dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) # dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) # except: # LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") - print(obj.item.to_dict()) - return obj.item.to_dict() + # print(obj.item.to_dict()) + # return obj.item.to_dict() + print(item.to_dict()) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. 
From 3f821ce73439062ae568b857680b4dbedb6c05fd Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 6 Oct 2023 14:50:24 -0400 Subject: [PATCH 40/69] Breaking CFJsonItem part 2: extracting datacube extension code --- STACpopulator/stac_utils.py | 285 +---------------------- implementations/CMIP6-UofT/add_CMIP6.py | 19 +- implementations/CMIP6-UofT/extensions.py | 201 ++++++++++++++++ 3 files changed, 210 insertions(+), 295 deletions(-) create mode 100644 implementations/CMIP6-UofT/extensions.py diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 361eead..7cf3ed9 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -3,9 +3,8 @@ from typing import Any, Literal, MutableMapping import pystac -from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType -from STACpopulator.models import STACItem, STACItemProperties +from STACpopulator.models import STACItem def url_validate(target: str) -> bool: @@ -127,288 +126,6 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop return item -class CFJsonItem: - """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" - - axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} - - def __init__(self, iid: str, attrs: dict, datamodel=None): - """ - Create STAC Item from CF JSON metadata. - - Parameters - ---------- - iid : str - Unique item ID. - attrs: dict - CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. - datamodel : pydantic.BaseModel, optional - Data model for validating global attributes. 
- """ - self.attrs = attrs - cfmeta = attrs["groups"]["CFMetadata"]["attributes"] - - # Global attributes - gattrs = { - "start_datetime": cfmeta["time_coverage_start"], - "end_datetime": cfmeta["time_coverage_end"], - **attrs["attributes"], - } - - # Validate using pydantic data model if given - datamodel = datamodel or STACItemProperties - - class MySTACItem(STACItem): - properties: datamodel - - # Create STAC item - item = MySTACItem( - id=iid, - geometry=self.ncattrs_to_geometry(), - bbox=self.ncattrs_to_bbox(), - properties=gattrs, - datetime=None, - ) - - item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - - # Add assets - if "access_urls" in attrs: - root = attrs["access_urls"] - elif "THREDDSMetadata" in attrs["groups"]: - root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] - else: - root = {} - - for name, url in root.items(): - asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) - item.add_asset(name, asset) - - self.item = item - - def to_json(self) -> str: - self.item.model_dump_json() - - def ncattrs_to_geometry(self) -> MutableMapping[str, Any]: - """Create Polygon geometry from CFMetadata.""" - attrs = self.attrs["groups"]["CFMetadata"]["attributes"] - return { - "type": "Polygon", - "coordinates": [ - [ - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_max"][0]), - ], - [ - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_max"][0]), - ], - [ - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - [ - float(attrs["geospatial_lon_min"][0]), - float(attrs["geospatial_lat_min"][0]), - ], - ] - ], - } - - def ncattrs_to_bbox(self) -> list: - """Create BBOX from CFMetadata.""" - attrs = self.attrs["groups"]["CFMetadata"]["attributes"] - return [ - float(attrs["geospatial_lon_min"][0]), - 
float(attrs["geospatial_lat_min"][0]), - float(attrs["geospatial_lon_max"][0]), - float(attrs["geospatial_lat_max"][0]), - ] - - def dimensions(self) -> dict: - """Return Dimension objects required for Datacube extension.""" - - dims = {} - for name, length in self.attrs["dimensions"].items(): - v = self.attrs["variables"].get(name) - if v: - bbox = self.obj.ncattrs_to_bbox() - for key, criteria in coordinate_criteria.items(): - for criterion, expected in criteria.items(): - if v["attributes"].get(criterion, None) in expected: - axis = self.axis[key] - type_ = DimensionType.SPATIAL if axis in ["x", "y", "z"] else DimensionType.TEMPORAL - - if v["type"] == "int": - extent = [0, int(length)] - else: # Not clear the logic is sound - if key == "X": - extent = bbox[0], bbox[2] - elif key == "Y": - extent = bbox[1], bbox[3] - else: - extent = None - - dims[name] = Dimension( - properties=dict( - axis=axis, - type=type_, - extent=extent, - description=v.get("description", v.get("long_name", criteria["standard_name"])), - ) - ) - - return dims - - def variables(self) -> dict: - """Return Variable objects required for Datacube extension.""" - variables = {} - - for name, meta in self.attrs["variables"].items(): - if name in self.attrs["dimensions"]: - continue - - attrs = meta["attributes"] - variables[name] = Variable( - properties=dict( - dimensions=meta["shape"], - type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, - description=attrs.get("description", attrs.get("long_name")), - unit=attrs.get("units", None), - ) - ) - return variables - - def is_coordinate(self, attrs: dict) -> bool: - """Return whether variable is a coordinate.""" - for key, criteria in coordinate_criteria.items(): - for criterion, expected in criteria.items(): - if attrs.get(criterion, None) in expected: - return True - return False - - -# From CF-Xarray -coordinate_criteria = { - "latitude": { - "standard_name": ("latitude",), - "units": ("degree_north", 
"degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), - "_CoordinateAxisType": ("Lat",), - "long_name": ("latitude",), - }, - "longitude": { - "standard_name": ("longitude",), - "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), - "_CoordinateAxisType": ("Lon",), - "long_name": ("longitude",), - }, - "Z": { - "standard_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), - "axis": ("Z",), - "cartesian_axis": ("Z",), - "grads_dim": ("z",), - "long_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - }, - "vertical": { - "standard_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - "positive": ("up", "down"), - "long_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - }, - "X": { - "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), - "_CoordinateAxisType": ("GeoX",), - "axis": ("X",), - "cartesian_axis": ("X",), - 
"grads_dim": ("x",), - "long_name": ( - "projection_x_coordinate", - "grid_longitude", - "projection_x_angular_coordinate", - "cell index along first dimension", - ), - }, - "Y": { - "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), - "_CoordinateAxisType": ("GeoY",), - "axis": ("Y",), - "cartesian_axis": ("Y",), - "grads_dim": ("y",), - "long_name": ( - "projection_y_coordinate", - "grid_latitude", - "projection_y_angular_coordinate", - "cell index along second dimension", - ), - }, - "T": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, - "time": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, -} - - media_types = { "httpserver_service": "application/x-netcdf", "opendap_service": pystac.MediaType.HTML, diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index bfb5d53..93d6072 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,21 +1,17 @@ import argparse -import hashlib import logging from datetime import datetime from typing import Any, Dict, List, Literal, MutableMapping import pyessv from colorlog import ColoredFormatter +from extensions import DataCubeHelper from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader from STACpopulator.models import STACItemProperties -from STACpopulator.stac_utils import ( - CFJsonItem, - STAC_item_from_metadata, - collection2literal, -) +from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s 
%(message)s" @@ -150,11 +146,12 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) item = STAC_item_from_metadata(iid, item_data, self.item_properties_model) # Add datacube extension - # try: - # dc_ext = DatacubeExtension.ext(obj.item, add_if_missing=True) - # dc_ext.apply(dimensions=obj.dimensions(), variables=obj.variables()) - # except: - # LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") + try: + dchelper = DataCubeHelper(item_data) + dc_ext = DatacubeExtension.ext(item, add_if_missing=True) + dc_ext.apply(dimensions=dchelper.dimensions(), variables=dchelper.variables()) + except: + LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") # print(obj.item.to_dict()) # return obj.item.to_dict() diff --git a/implementations/CMIP6-UofT/extensions.py b/implementations/CMIP6-UofT/extensions.py new file mode 100644 index 0000000..e09f9b2 --- /dev/null +++ b/implementations/CMIP6-UofT/extensions.py @@ -0,0 +1,201 @@ +import pystac +from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType + + +class DataCubeHelper: + """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + + axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} + + def __init__(self, attrs: dict): + """ + Create STAC Item from CF JSON metadata. + + Parameters + ---------- + iid : str + Unique item ID. + attrs: dict + CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. + datamodel : pydantic.BaseModel, optional + Data model for validating global attributes. 
+ """ + self.attrs = attrs + + def dimensions(self) -> dict: + """Return Dimension objects required for Datacube extension.""" + + dims = {} + for name, length in self.attrs["dimensions"].items(): + v = self.attrs["variables"].get(name) + if v: + bbox = self.obj.ncattrs_to_bbox() + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if v["attributes"].get(criterion, None) in expected: + axis = self.axis[key] + type_ = DimensionType.SPATIAL if axis in ["x", "y", "z"] else DimensionType.TEMPORAL + + if v["type"] == "int": + extent = [0, int(length)] + else: # Not clear the logic is sound + if key == "X": + extent = bbox[0], bbox[2] + elif key == "Y": + extent = bbox[1], bbox[3] + else: + extent = None + + dims[name] = Dimension( + properties=dict( + axis=axis, + type=type_, + extent=extent, + description=v.get("description", v.get("long_name", criteria["standard_name"])), + ) + ) + + return dims + + def variables(self) -> dict: + """Return Variable objects required for Datacube extension.""" + variables = {} + + for name, meta in self.attrs["variables"].items(): + if name in self.attrs["dimensions"]: + continue + + attrs = meta["attributes"] + variables[name] = Variable( + properties=dict( + dimensions=meta["shape"], + type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, + description=attrs.get("description", attrs.get("long_name")), + unit=attrs.get("units", None), + ) + ) + return variables + + def is_coordinate(self, attrs: dict) -> bool: + """Return whether variable is a coordinate.""" + for key, criteria in coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if attrs.get(criterion, None) in expected: + return True + return False + + +# From CF-Xarray +coordinate_criteria = { + "latitude": { + "standard_name": ("latitude",), + "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), + "_CoordinateAxisType": ("Lat",), 
+ "long_name": ("latitude",), + }, + "longitude": { + "standard_name": ("longitude",), + "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), + "_CoordinateAxisType": ("Lon",), + "long_name": ("longitude",), + }, + "Z": { + "standard_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), + "axis": ("Z",), + "cartesian_axis": ("Z",), + "grads_dim": ("z",), + "long_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + }, + "vertical": { + "standard_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + "positive": ("up", "down"), + "long_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + }, + "X": { + "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), + "_CoordinateAxisType": ("GeoX",), + "axis": ("X",), + "cartesian_axis": ("X",), + "grads_dim": ("x",), + "long_name": ( + "projection_x_coordinate", + "grid_longitude", + 
"projection_x_angular_coordinate", + "cell index along first dimension", + ), + }, + "Y": { + "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), + "_CoordinateAxisType": ("GeoY",), + "axis": ("Y",), + "cartesian_axis": ("Y",), + "grads_dim": ("y",), + "long_name": ( + "projection_y_coordinate", + "grid_latitude", + "projection_y_angular_coordinate", + "cell index along second dimension", + ), + }, + "T": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + "time": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, +} From 3c584cc424c5276c8cad6f28487525785e8de19e Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:03:49 -0400 Subject: [PATCH 41/69] updating geometry structure --- STACpopulator/models.py | 29 +++++++++++++++++++++---- STACpopulator/stac_utils.py | 10 +++++---- implementations/CMIP6-UofT/add_CMIP6.py | 5 +++-- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/STACpopulator/models.py b/STACpopulator/models.py index 625efc2..3e93802 100644 --- a/STACpopulator/models.py +++ b/STACpopulator/models.py @@ -1,6 +1,7 @@ import datetime as dt -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union +from annotated_types import Ge from pydantic import ( AnyHttpUrl, AnyUrl, @@ -9,14 +10,34 @@ SerializeAsAny, field_validator, ) -from typing_extensions import TypedDict +from xarray import Coordinates -class Geometry(TypedDict): +class Geometry(BaseModel): type: str + coordinates: List + + +class GeoJSONPoint(Geometry): + type: Literal["Point"] + coordinates: List[float] + + +class GeoJSONMultiPoint(Geometry): + type: Literal["MultiPoint"] + coordinates: List[List[float]] + + +class 
GeoJSONPolygon(Geometry): + type: Literal["Polygon"] coordinates: List[List[List[float]]] +class GeoJSONMultiPolygon(Geometry): + type: Literal["MultiPolygon"] + coordinates: List[List[List[List[float]]]] + + class Asset(BaseModel): href: AnyHttpUrl media_type: Optional[str] = None @@ -78,7 +99,7 @@ class STACItem(BaseModel): """STAC Item data model.""" id: str = Field(..., alias="id", min_length=1) - geometry: Optional[Geometry] = None + geometry: Optional[SerializeAsAny[Geometry]] = None bbox: Optional[List[float]] = None properties: Optional[SerializeAsAny[STACItemProperties]] = None assets: Dict[str, Asset] = None diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 7cf3ed9..9f0198b 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -79,7 +79,7 @@ def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: ] -def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel): +def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel, item_geometry_model): """ Create STAC Item from CF JSON metadata. @@ -89,8 +89,10 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop Unique item ID. attrs: dict CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. - datamodel : pydantic.BaseModel, optional - Data model for validating global attributes. + item_props_datamodel : pydantic.BaseModel + Data model describing the properties of the STAC item. + item_geometry_model : pydantic.BaseModel + Data model describing the geometry of the STAC item. 
""" cfmeta = attrs["groups"]["CFMetadata"]["attributes"] @@ -98,7 +100,7 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Create pydantic STAC item item = STACItem( id=iid, - geometry=ncattrs_to_geometry(attrs), + geometry=item_geometry_model(**ncattrs_to_geometry(attrs)), bbox=ncattrs_to_bbox(attrs), properties=item_props_datamodel( start_datetime=cfmeta["time_coverage_start"], diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index 93d6072..fee24ce 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -10,7 +10,7 @@ from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader -from STACpopulator.models import STACItemProperties +from STACpopulator.models import GeoJSONPolygon, STACItemProperties from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal LOGGER = logging.getLogger(__name__) @@ -113,6 +113,7 @@ def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties + item_geometry_model = GeoJSONPolygon def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: str) -> None: """Constructor @@ -143,7 +144,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = make_cmip6_item_id(item_data["attributes"]) - item = STAC_item_from_metadata(iid, item_data, self.item_properties_model) + item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) # Add datacube extension try: From b7a7ed94ea846505d541bd01ec5c729f67c5f3ce Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:25:06 -0400 Subject: [PATCH 42/69] moving np datatype conversion to a separate function --- STACpopulator/input.py | 19 +++---------------- STACpopulator/stac_utils.py | 20 ++++++++++++++++++++ 2 files changed, 23 
insertions(+), 16 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 3b2c11c..2e61257 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -2,14 +2,14 @@ from abc import ABC, abstractmethod from typing import Any, Iterator, MutableMapping, Optional, Tuple -import numpy as np import requests import siphon import xncml from colorlog import ColoredFormatter -from numpy import extract from siphon.catalog import TDSCatalog +from STACpopulator.stac_utils import numpy_to_python_datatypes + LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" formatter = ColoredFormatter(LOGFORMAT) @@ -86,20 +86,7 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() - # Converting numpy datatypes to python standard datatypes - for key, value in attrs["attributes"].items(): - if isinstance(value, list): - newlist = [] - for item in value: - if issubclass(type(item), np.integer): - newlist.append(int(item)) - elif issubclass(type(item), np.floating): - newlist.append(float(item)) - else: - newlist.append(item) - attrs["attributes"][key] = newlist - elif isinstance(type(value), np.integer): - attrs["attributes"][key] = int(value) + attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) attrs["access_urls"] = ds.access_urls diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 9f0198b..50871c6 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -2,6 +2,7 @@ import re from typing import Any, Literal, MutableMapping +import numpy as np import pystac from STACpopulator.models import STACItem @@ -79,6 +80,25 @@ def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: ] +def numpy_to_python_datatypes(data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + # Converting numpy 
datatypes to python standard datatypes + for key, value in data.items(): + if isinstance(value, list): + newlist = [] + for item in value: + if issubclass(type(item), np.integer): + newlist.append(int(item)) + elif issubclass(type(item), np.floating): + newlist.append(float(item)) + else: + newlist.append(item) + data[key] = newlist + elif isinstance(type(value), np.integer): + data[key] = int(value) + + return data + + def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel, item_geometry_model): """ Create STAC Item from CF JSON metadata. From 48598ae49a31ede1e5803ecbbb434962b474e9a2 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:26:10 -0400 Subject: [PATCH 43/69] modifications to datacube extension helper functions as per Francis's comments --- implementations/CMIP6-UofT/extensions.py | 244 ++++++++++++----------- 1 file changed, 125 insertions(+), 119 deletions(-) diff --git a/implementations/CMIP6-UofT/extensions.py b/implementations/CMIP6-UofT/extensions.py index e09f9b2..9f77b0f 100644 --- a/implementations/CMIP6-UofT/extensions.py +++ b/implementations/CMIP6-UofT/extensions.py @@ -1,4 +1,5 @@ -import pystac +import functools + from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType @@ -22,6 +23,123 @@ def __init__(self, attrs: dict): """ self.attrs = attrs + # From CF-Xarray + self.coordinate_criteria = { + "latitude": { + "standard_name": ("latitude",), + "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), + "_CoordinateAxisType": ("Lat",), + "long_name": ("latitude",), + }, + "longitude": { + "standard_name": ("longitude",), + "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), + "_CoordinateAxisType": ("Lon",), + "long_name": ("longitude",), + }, + "Z": { + "standard_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + 
"atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), + "axis": ("Z",), + "cartesian_axis": ("Z",), + "grads_dim": ("z",), + "long_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + }, + "vertical": { + "standard_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + "positive": ("up", "down"), + "long_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + }, + "X": { + "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), + "_CoordinateAxisType": ("GeoX",), + "axis": ("X",), + "cartesian_axis": ("X",), + "grads_dim": ("x",), + "long_name": ( + "projection_x_coordinate", + "grid_longitude", + "projection_x_angular_coordinate", + "cell index along first dimension", + ), + }, + "Y": { + "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), + "_CoordinateAxisType": ("GeoY",), + "axis": ("Y",), + "cartesian_axis": ("Y",), + "grads_dim": ("y",), + "long_name": ( + "projection_y_coordinate", + "grid_latitude", + "projection_y_angular_coordinate", + 
"cell index along second dimension", + ), + }, + "T": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + "time": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + } + + @property + @functools.cache def dimensions(self) -> dict: """Return Dimension objects required for Datacube extension.""" @@ -30,7 +148,7 @@ def dimensions(self) -> dict: v = self.attrs["variables"].get(name) if v: bbox = self.obj.ncattrs_to_bbox() - for key, criteria in coordinate_criteria.items(): + for key, criteria in self.coordinate_criteria.items(): for criterion, expected in criteria.items(): if v["attributes"].get(criterion, None) in expected: axis = self.axis[key] @@ -57,6 +175,8 @@ def dimensions(self) -> dict: return dims + @property + @functools.cache def variables(self) -> dict: """Return Variable objects required for Datacube extension.""" variables = {} @@ -76,126 +196,12 @@ def variables(self) -> dict: ) return variables + @property + @functools.cache def is_coordinate(self, attrs: dict) -> bool: """Return whether variable is a coordinate.""" - for key, criteria in coordinate_criteria.items(): + for key, criteria in self.coordinate_criteria.items(): for criterion, expected in criteria.items(): if attrs.get(criterion, None) in expected: return True return False - - -# From CF-Xarray -coordinate_criteria = { - "latitude": { - "standard_name": ("latitude",), - "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), - "_CoordinateAxisType": ("Lat",), - "long_name": ("latitude",), - }, - "longitude": { - "standard_name": ("longitude",), - "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), - "_CoordinateAxisType": ("Lon",), - "long_name": ("longitude",), - }, - "Z": { - 
"standard_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), - "axis": ("Z",), - "cartesian_axis": ("Z",), - "grads_dim": ("z",), - "long_name": ( - "model_level_number", - "atmosphere_ln_pressure_coordinate", - "atmosphere_sigma_coordinate", - "atmosphere_hybrid_sigma_pressure_coordinate", - "atmosphere_hybrid_height_coordinate", - "atmosphere_sleve_coordinate", - "ocean_sigma_coordinate", - "ocean_s_coordinate", - "ocean_s_coordinate_g1", - "ocean_s_coordinate_g2", - "ocean_sigma_z_coordinate", - "ocean_double_sigma_coordinate", - ), - }, - "vertical": { - "standard_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - "positive": ("up", "down"), - "long_name": ( - "air_pressure", - "height", - "depth", - "geopotential_height", - "altitude", - "height_above_geopotential_datum", - "height_above_reference_ellipsoid", - "height_above_mean_sea_level", - ), - }, - "X": { - "standard_name": ("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), - "_CoordinateAxisType": ("GeoX",), - "axis": ("X",), - "cartesian_axis": ("X",), - "grads_dim": ("x",), - "long_name": ( - "projection_x_coordinate", - "grid_longitude", - "projection_x_angular_coordinate", - "cell index along first dimension", - ), - }, - "Y": { - "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), - "_CoordinateAxisType": ("GeoY",), - "axis": ("Y",), - "cartesian_axis": ("Y",), - 
"grads_dim": ("y",), - "long_name": ( - "projection_y_coordinate", - "grid_latitude", - "projection_y_angular_coordinate", - "cell index along second dimension", - ), - }, - "T": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, - "time": { - "standard_name": ("time",), - "_CoordinateAxisType": ("Time",), - "axis": ("T",), - "cartesian_axis": ("T",), - "grads_dim": ("t",), - "long_name": ("time",), - }, -} From 94eb521e783d9c67a0e293c391b1ab98fbffc710 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 11:30:43 -0400 Subject: [PATCH 44/69] code cleanup --- STACpopulator/metadata_parsers.py | 61 ------------------------------- STACpopulator/models.py | 2 - STACpopulator/populator_base.py | 1 - 3 files changed, 64 deletions(-) delete mode 100644 STACpopulator/metadata_parsers.py diff --git a/STACpopulator/metadata_parsers.py b/STACpopulator/metadata_parsers.py deleted file mode 100644 index 84636f8..0000000 --- a/STACpopulator/metadata_parsers.py +++ /dev/null @@ -1,61 +0,0 @@ -import lxml.etree -import requests - - -def nc_attrs_from_ncml(url): - """Extract attributes from NcML file. - - Parameters - ---------- - url : str - Link to NcML service of THREDDS server for a dataset. - - Returns - ------- - dict - Global attribute values keyed by facet names, with variable attributes in `__variable__` nested dict, and - additional specialized attributes in `__group__` nested dict. 
- """ - parser = lxml.etree.XMLParser(encoding="UTF-8") - - ns = {"ncml": "http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2"} - - # Parse XML content - UTF-8 encoded documents need to be read as bytes - xml = requests.get(url).content - doc = lxml.etree.fromstring(xml, parser=parser) - nc = doc.xpath("/ncml:netcdf", namespaces=ns)[0] - - # Extract global attributes - out = _attrib_to_dict(nc.xpath("ncml:attribute", namespaces=ns)) - - # Extract group attributes - gr = {} - for group in nc.xpath("ncml:group", namespaces=ns): - gr[group.attrib["name"]] = _attrib_to_dict(group.xpath("ncml:attribute", namespaces=ns)) - - # Extract variable attributes - va = {} - for variable in nc.xpath("ncml:variable", namespaces=ns): - if "_CoordinateAxisType" in variable.xpath("ncml:attribute/@name", namespaces=ns): - continue - va[variable.attrib["name"]] = _attrib_to_dict(variable.xpath("ncml:attribute", namespaces=ns)) - - out["__group__"] = gr - out["__variable__"] = va - - return out - - -def _attrib_to_dict(elems): - """Convert element attributes to dictionary. 
- - Ignore attributes with names starting with _ - """ - hidden_prefix = "_" - out = {} - for e in elems: - a = e.attrib - if a["name"].startswith(hidden_prefix): - continue - out[a["name"]] = a["value"] - return out diff --git a/STACpopulator/models.py b/STACpopulator/models.py index 3e93802..f91dab5 100644 --- a/STACpopulator/models.py +++ b/STACpopulator/models.py @@ -1,7 +1,6 @@ import datetime as dt from typing import Any, Dict, List, Literal, Optional, Union -from annotated_types import Ge from pydantic import ( AnyHttpUrl, AnyUrl, @@ -10,7 +9,6 @@ SerializeAsAny, field_validator, ) -from xarray import Coordinates class Geometry(BaseModel): diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 3d8f50c..404d610 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,4 +1,3 @@ -import hashlib import logging from abc import ABC, abstractmethod from datetime import datetime From a64a2265a42726aaa824a2364b09397ea3a83647 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 12 Oct 2023 16:23:04 -0400 Subject: [PATCH 45/69] change how prefix is applied --- implementations/CMIP6-UofT/add_CMIP6.py | 70 ++++++++++++++----------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py index fee24ce..1a5dbd8 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/implementations/CMIP6-UofT/add_CMIP6.py @@ -6,7 +6,7 @@ import pyessv from colorlog import ColoredFormatter from extensions import DataCubeHelper -from pydantic import AnyHttpUrl, Field, FieldValidationInfo, field_validator +from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase from STACpopulator.input import THREDDSLoader @@ -39,40 +39,46 @@ TableID = collection2literal(CV.table_id) +def add_cmip6_prefix(name: str) -> str: + return "cmip6:" + name if "datetime" not in name else 
name + + class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): """Data model for CMIP6 Controlled Vocabulary.""" - Conventions: str = Field(..., serialization_alias="cmip6:Conventions") - activity_id: ActivityID = Field(..., serialization_alias="cmip6:activity_id") - creation_date: datetime = Field(..., serialization_alias="cmip6:creation_date") - data_specs_version: str = Field(..., serialization_alias="cmip6:data_specs_version") - experiment: str = Field(..., serialization_alias="cmip6:experiment") - experiment_id: ExperimentID = Field(..., serialization_alias="cmip6:experiment_id") - frequency: Frequency = Field(..., serialization_alias="cmip6:frequency") - further_info_url: AnyHttpUrl = Field(..., serialization_alias="cmip6:further_info_url") - grid_label: GridLabel = Field(..., serialization_alias="cmip6:grid_label") - institution: str = Field(..., serialization_alias="cmip6:institution") - institution_id: InstitutionID = Field(..., serialization_alias="cmip6:institution_id") - nominal_resolution: NominalResolution = Field(..., serialization_alias="cmip6:nominal_resolution") - realm: List[Realm] = Field(..., serialization_alias="cmip6:realm") - source: str = Field(..., serialization_alias="cmip6:source") - source_id: SourceID = Field(..., serialization_alias="cmip6:source_id") - source_type: List[SourceType] = Field(..., serialization_alias="cmip6:source_type") - sub_experiment: str | Literal["none"] = Field(..., serialization_alias="cmip6:sub_experiment") - sub_experiment_id: SubExperimentID | Literal["none"] = Field(..., serialization_alias="cmip6:sub_experiment_id") - table_id: TableID = Field(..., serialization_alias="cmip6:table_id") - variable_id: str = Field(..., serialization_alias="cmip6:variable_id") - variant_label: str = Field(..., serialization_alias="cmip6:variant_label") - initialization_index: int = Field(..., serialization_alias="cmip6:initialization_index") - physics_index: int = Field(..., 
serialization_alias="cmip6:physics_index") - realization_index: int = Field(..., serialization_alias="cmip6:realization_index") - forcing_index: int = Field(..., serialization_alias="cmip6:forcing_index") - tracking_id: str = Field(..., serialization_alias="cmip6:tracking_id") - version: str = Field("", serialization_alias="cmip6:version") - product: str = Field(..., serialization_alias="cmip6:product") - license: str = Field(..., serialization_alias="cmip6:license") - grid: str = Field(..., serialization_alias="cmip6:grid") - mip_era: str = Field(..., serialization_alias="cmip6:mip_era") + Conventions: str + activity_id: ActivityID + creation_date: datetime + data_specs_version: str + experiment: str + experiment_id: ExperimentID + frequency: Frequency + further_info_url: AnyHttpUrl + grid_label: GridLabel + institution: str + institution_id: InstitutionID + nominal_resolution: NominalResolution + realm: List[Realm] + source: str + source_id: SourceID + source_type: List[SourceType] + sub_experiment: str | Literal["none"] + sub_experiment_id: SubExperimentID | Literal["none"] + table_id: TableID + variable_id: str + variant_label: str + initialization_index: int + physics_index: int + realization_index: int + forcing_index: int + tracking_id: str + version: str = Field("") + product: str + license: str + grid: str + mip_era: str + + model_config = ConfigDict(alias_generator=add_cmip6_prefix, populate_by_name=True) @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") @classmethod From f22c1a20b10eb6dd371e33e24271f0e6c0f2d122 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Fri, 13 Oct 2023 10:17:56 -0400 Subject: [PATCH 46/69] PR changes --- Makefile | 2 +- .../implementations}/CMIP6-UofT/CMIP6.yml | 0 .../implementations}/CMIP6-UofT/add_CMIP6.py | 2 +- .../implementations}/CMIP6-UofT/extensions.py | 0 .../implementations}/NEX-GDDP-UofT/add_NEX-GDDP.py | 0 STACpopulator/stac_utils.py | 2 +- 6 files 
changed, 3 insertions(+), 3 deletions(-) rename {implementations => STACpopulator/implementations}/CMIP6-UofT/CMIP6.yml (100%) rename {implementations => STACpopulator/implementations}/CMIP6-UofT/add_CMIP6.py (99%) rename {implementations => STACpopulator/implementations}/CMIP6-UofT/extensions.py (100%) rename {implementations => STACpopulator/implementations}/NEX-GDDP-UofT/add_NEX-GDDP.py (100%) diff --git a/Makefile b/Makefile index e9e1f6f..ca5cf52 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -IMP_DIR = /Users/dchandan/DACCS/Codes/stac-populator/implementations +IMP_DIR = STACpopulator/implementations STAC_HOST = http://localhost:8880/stac testcmip6: diff --git a/implementations/CMIP6-UofT/CMIP6.yml b/STACpopulator/implementations/CMIP6-UofT/CMIP6.yml similarity index 100% rename from implementations/CMIP6-UofT/CMIP6.yml rename to STACpopulator/implementations/CMIP6-UofT/CMIP6.yml diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py similarity index 99% rename from implementations/CMIP6-UofT/add_CMIP6.py rename to STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 1a5dbd8..8a8f171 100644 --- a/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -82,7 +82,7 @@ class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") @classmethod - def first_item(cls, v: list, info: FieldValidationInfo): + def only_item(cls, v: list[int], info: FieldValidationInfo): """Pick single item from list.""" assert len(v) == 1, f"{info.field_name} must have one item only." 
return v[0] diff --git a/implementations/CMIP6-UofT/extensions.py b/STACpopulator/implementations/CMIP6-UofT/extensions.py similarity index 100% rename from implementations/CMIP6-UofT/extensions.py rename to STACpopulator/implementations/CMIP6-UofT/extensions.py diff --git a/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py b/STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py similarity index 100% rename from implementations/NEX-GDDP-UofT/add_NEX-GDDP.py rename to STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 50871c6..cf3a8c2 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -69,7 +69,7 @@ def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, } -def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list: +def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list[float]: """Create BBOX from CFMetadata.""" attrs = attrs["groups"]["CFMetadata"]["attributes"] return [ From efd9230823314450e3ed47777cbd5f363b0c7723 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Tue, 17 Oct 2023 12:07:28 -0400 Subject: [PATCH 47/69] fixing output media type and roles output for assets --- STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 6 +++--- STACpopulator/stac_utils.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 8a8f171..a137a0b 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -1,4 +1,5 @@ import argparse +import json import logging from datetime import datetime from typing import Any, Dict, List, Literal, MutableMapping @@ -160,9 +161,8 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) except: LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") - # 
print(obj.item.to_dict()) - # return obj.item.to_dict() - print(item.to_dict()) + # return json.dumps(item.to_dict()) + print(json.dumps(item.to_dict())) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index cf3a8c2..76b6c9f 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -142,6 +142,7 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop root = {} for name, url in root.items(): + name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) item.add_asset(name, asset) From 3e88591f26cd9ce278d28f8da0e55a13890b0476 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Tue, 17 Oct 2023 17:42:22 -0400 Subject: [PATCH 48/69] adding magpie resource link --- STACpopulator/stac_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 76b6c9f..fe8c650 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -99,6 +99,22 @@ def numpy_to_python_datatypes(data: MutableMapping[str, Any]) -> MutableMapping[ return data +def magpie_resource_link(url: str) -> pystac.Link: + """Creates a link that will be used by Cowbird to create a resource in Magpie + associated with the STAC item. 
+ + :param url: HTTPServer access URL for a STAC item + :type url: str + :return: A PySTAC Link + :rtype: pystac.Link + """ + url_ = url.replace("fileServer", "*") + i = url_.find("*") + title = url_[i + 2 :] + link = pystac.Link(rel="source", title=title, target=url, media_type="application/x-netcdf") + return link + + def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel, item_geometry_model): """ Create STAC Item from CF JSON metadata. @@ -146,6 +162,9 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) item.add_asset(name, asset) + if root: + item.add_link(magpie_resource_link(root["HTTPServer"])) + return item From 8d66fba8877ecf8c166379b91a16b4bc49039b86 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 18 Oct 2023 13:18:18 -0400 Subject: [PATCH 49/69] adding collection resource link for Magpie --- Makefile | 3 +++ STACpopulator/input.py | 16 ++++++++++++++-- STACpopulator/populator_base.py | 3 +++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ca5cf52..588e160 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,9 @@ STAC_HOST = http://localhost:8880/stac testcmip6: python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html $(IMP_DIR)/CMIP6-UofT/CMIP6.yml +delcmip6: + curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' + @echo "" starthost: docker compose up diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 2e61257..54f0d68 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from typing import Any, Iterator, MutableMapping, Optional, Tuple +import pystac import requests import siphon import xncml @@ -22,7 +23,7 @@ class GenericLoader(ABC): def __init__(self) -> 
None: - pass + self.links = [] @abstractmethod def __iter__(self): @@ -58,6 +59,16 @@ def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> Non self.thredds_catalog_URL = thredds_catalog_url self.catalog = TDSCatalog(self.thredds_catalog_URL) self.catalog_head = self.catalog + self.links.append(self.magpie_collection_link()) + + def magpie_collection_link(self): + """Return Link to THREDDS catalog.""" + url = self.thredds_catalog_URL + parts = url.split("/") + i = parts.index("catalog") + service = parts[i - 1] + path = "/".join(parts[i + 1 : -1]) + return pystac.Link(rel="source", target=url, media_type="text/xml", title=f"{service}:{path}") def reset(self): """Reset the generator.""" @@ -81,7 +92,8 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An url = ds.access_urls["NCML"] LOGGER.info("Requesting NcML dataset description") - r = requests.get(url) + # r = requests.get(url) + r = requests.get(url, params={"catalog": self.catalog_head, "dataset": ds}) # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 404d610..07841bc 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -114,6 +114,9 @@ def create_stac_collection(self): self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) collection = pystac.Collection(id=self.collection_id, **self._collection_info) + + collection.add_links(self._ingest_pipeline.links) + post_stac_collection(self.stac_host, collection.to_dict()) def ingest(self) -> None: From 00a968a33e14400f07c7d53f0cf02ff993346b57 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 18 Oct 2023 23:43:10 -0400 Subject: [PATCH 50/69] posting items fixes --- STACpopulator/api_requests.py | 6 +++--- STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 4 ++-- STACpopulator/populator_base.py | 2 +- 3 
files changed, 6 insertions(+), 6 deletions(-) diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py index fdd9a65..23a7371 100644 --- a/STACpopulator/api_requests.py +++ b/STACpopulator/api_requests.py @@ -2,6 +2,7 @@ import os from typing import Any, Optional from urllib.parse import urljoin + import requests from colorlog import ColoredFormatter @@ -79,17 +80,16 @@ def post_stac_item( """ item_id = json_data["id"] - r = requests.post(urljoin(stac_host, f"collections/{collection_id}/items"), json=json_data) + r = requests.post(os.path.join(stac_host, f"collections/{collection_id}/items"), json=json_data) if r.status_code == 200: LOGGER.info(f"Item {item_name} successfully added") elif r.status_code == 409: if update: LOGGER.info(f"Item {item_id} already exists. Updating.") - r = requests.put(urljoin(stac_host, f"collections/{collection_id}/items/{item_id}"), json=json_data) + r = requests.put(os.path.join(stac_host, f"collections/{collection_id}/items/{item_id}"), json=json_data) r.raise_for_status() else: LOGGER.info(f"Item {item_id} already exists.") else: r.raise_for_status() - diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index a137a0b..3f2f115 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -161,8 +161,8 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) except: LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") - # return json.dumps(item.to_dict()) - print(json.dumps(item.to_dict())) + # print(json.dumps(item.to_dict())) + return json.loads(json.dumps(item.to_dict())) def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: # Validation is done at the item creating stage, using the Properties class. 
diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 07841bc..beb1541 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -124,7 +124,7 @@ def ingest(self) -> None: for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) - # post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) + post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) # try: # pass # except Exception: From 2c3b49de9808ba1e842fd5a93231e458e7374767 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 11:05:06 -0400 Subject: [PATCH 51/69] removing function no longer in use --- STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 4 ---- STACpopulator/populator_base.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 3f2f115..31495cc 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -164,10 +164,6 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) # print(json.dumps(item.to_dict())) return json.loads(json.dumps(item.to_dict())) - def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: - # Validation is done at the item creating stage, using the Properties class. 
- return True - if __name__ == "__main__": parser = argparse.ArgumentParser(prog="CMIP6 STAC populator") diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index beb1541..cc84403 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -138,7 +138,3 @@ def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableM @abstractmethod def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: pass - - @abstractmethod - def validate_stac_item_cv(self, data: MutableMapping[str, Any]) -> bool: - pass From 6908d5548da395359a9a8f394be6a2ca4566257a Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 11:23:12 -0400 Subject: [PATCH 52/69] implemented updating stac collection and items --- .../implementations/CMIP6-UofT/add_CMIP6.py | 12 ++++-- STACpopulator/populator_base.py | 37 +++++++++---------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 31495cc..25e3ac5 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -2,7 +2,7 @@ import json import logging from datetime import datetime -from typing import Any, Dict, List, Literal, MutableMapping +from typing import Any, Dict, List, Literal, MutableMapping, Optional import pyessv from colorlog import ColoredFormatter @@ -122,7 +122,9 @@ class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon - def __init__(self, stac_host: str, thredds_catalog_url: str, config_filename: str) -> None: + def __init__( + self, stac_host: str, thredds_catalog_url: str, config_filename: str, update: Optional[bool] = False + ) -> None: """Constructor :param stac_host: URL to the STAC API @@ -134,7 +136,8 @@ def __init__(self, stac_host: str, 
thredds_catalog_url: str, config_filename: st """ data_loader = THREDDSLoader(thredds_catalog_url) - super().__init__(stac_host, data_loader, config_filename) + + super().__init__(stac_host, data_loader, config_filename, update) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): pass @@ -170,8 +173,9 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) parser.add_argument("stac_host", type=str, help="STAC API address") parser.add_argument("thredds_catalog_URL", type=str, help="URL to the CMIP6 THREDDS catalog") parser.add_argument("config_file", type=str, help="Name of the configuration file") + parser.add_argument("--update", action="store_true", help="Update collection and its items") args = parser.parse_args() LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file) + c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file, args.update) c.ingest() diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index cc84403..a1fdc85 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,7 +1,7 @@ import logging from abc import ABC, abstractmethod from datetime import datetime -from typing import Any, MutableMapping +from typing import Any, MutableMapping, Optional import pystac import yaml @@ -32,6 +32,7 @@ def __init__( stac_host: str, data_loader: GenericLoader, collection_info_filename: str, + update: Optional[bool] = False, ) -> None: """Constructor @@ -56,6 +57,7 @@ def __init__( self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) + self.update = update # self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() self._collection_id = self.collection_name @@ -96,35 +98,32 @@ def create_stac_collection(self): Returns the collection. 
""" - if stac_collection_exists(self.stac_host, self.collection_id): - LOGGER.info(f"Collection '{self.collection_name}' already exists") - else: - LOGGER.info(f"Creating collection '{self.collection_name}'") - sp_extent = pystac.SpatialExtent([self._collection_info.pop("spatialextent")]) - tmp = self._collection_info.pop("temporalextent") - tmp_extent = pystac.TemporalExtent( + LOGGER.info(f"Creating collection '{self.collection_name}'") + sp_extent = pystac.SpatialExtent([self._collection_info.pop("spatialextent")]) + tmp = self._collection_info.pop("temporalextent") + tmp_extent = pystac.TemporalExtent( + [ [ - [ - datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, - datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, - ] + datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, + datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, ] - ) - self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) - self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) + ] + ) + self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) + self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) - collection = pystac.Collection(id=self.collection_id, **self._collection_info) + collection = pystac.Collection(id=self.collection_id, **self._collection_info) - collection.add_links(self._ingest_pipeline.links) + collection.add_links(self._ingest_pipeline.links) - post_stac_collection(self.stac_host, collection.to_dict()) + post_stac_collection(self.stac_host, collection.to_dict(), self.update) def ingest(self) -> None: LOGGER.info("Data ingestion") for item_name, item_data in self._ingest_pipeline: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) - post_stac_item(self.stac_host, self.collection_id, item_name, stac_item) + 
post_stac_item(self.stac_host, self.collection_id, item_name, stac_item, self.update) # try: # pass # except Exception: From 0c959ea6c20718217ff3e0a1f16f29548c38272a Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 13:27:55 -0400 Subject: [PATCH 53/69] removing need to pass yml file to app on command line --- Makefile | 2 +- .../implementations/CMIP6-UofT/add_CMIP6.py | 13 ++++--------- .../CMIP6-UofT/{CMIP6.yml => collection_config.yml} | 0 STACpopulator/populator_base.py | 13 +++++++++---- 4 files changed, 14 insertions(+), 14 deletions(-) rename STACpopulator/implementations/CMIP6-UofT/{CMIP6.yml => collection_config.yml} (100%) diff --git a/Makefile b/Makefile index 588e160..914e513 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ IMP_DIR = STACpopulator/implementations STAC_HOST = http://localhost:8880/stac testcmip6: - python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html $(IMP_DIR)/CMIP6-UofT/CMIP6.yml + python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html delcmip6: curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 25e3ac5..532cfb4 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -122,22 +122,17 @@ class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon - def __init__( - self, stac_host: str, thredds_catalog_url: str, config_filename: str, update: Optional[bool] = False - ) -> None: + def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bool] = False) -> None: """Constructor :param stac_host: URL to the STAC API 
:type stac_host: str :param thredds_catalog_url: the URL to the THREDDS catalog to ingest :type thredds_catalog_url: str - :param config_filename: Yaml file containing the information about the collection to populate - :type config_filename: str """ - data_loader = THREDDSLoader(thredds_catalog_url) - super().__init__(stac_host, data_loader, config_filename, update) + super().__init__(stac_host, data_loader, update) def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): pass @@ -172,10 +167,10 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) parser = argparse.ArgumentParser(prog="CMIP6 STAC populator") parser.add_argument("stac_host", type=str, help="STAC API address") parser.add_argument("thredds_catalog_URL", type=str, help="URL to the CMIP6 THREDDS catalog") - parser.add_argument("config_file", type=str, help="Name of the configuration file") parser.add_argument("--update", action="store_true", help="Update collection and its items") args = parser.parse_args() + LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file, args.update) + c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.update) c.ingest() diff --git a/STACpopulator/implementations/CMIP6-UofT/CMIP6.yml b/STACpopulator/implementations/CMIP6-UofT/collection_config.yml similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/CMIP6.yml rename to STACpopulator/implementations/CMIP6-UofT/collection_config.yml diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index a1fdc85..38ef8be 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,4 +1,6 @@ import logging +import os +import sys from abc import ABC, abstractmethod from datetime import datetime from typing import Any, MutableMapping, Optional @@ -31,7 +33,6 @@ def __init__( self, stac_host: str, data_loader: 
GenericLoader, - collection_info_filename: str, update: Optional[bool] = False, ) -> None: """Constructor @@ -40,13 +41,17 @@ def __init__( :type stac_host: str :param data_loader: A concrete implementation of the GenericLoader abstract base class :type data_loader: GenericLoader - :param collection_info_filename: Yaml file containing the information about the collection to populate - :type collection_info_filename: str :raises RuntimeError: Raised if one of the required definitions is not found in the collection info filename """ super().__init__() - with open(collection_info_filename) as f: + self._collection_info_filename = "collection_config.yml" + self._app_directory = os.path.dirname(sys.argv[0]) + + if not os.path.exists(os.path.join(self._app_directory, self._collection_info_filename)): + raise RuntimeError(f"Missing {self._collection_info_filename} file for this implementation") + + with open(os.path.join(self._app_directory, self._collection_info_filename)) as f: self._collection_info = yaml.load(f, yaml.Loader) req_definitions = ["title", "description", "keywords", "license"] From 73b277337cc820e0333297e4d9be2442e503b8c3 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 13:29:34 -0400 Subject: [PATCH 54/69] code cleanup --- STACpopulator/api_requests.py | 1 - STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py | 2 +- STACpopulator/populator_base.py | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py index 23a7371..35b0dc2 100644 --- a/STACpopulator/api_requests.py +++ b/STACpopulator/api_requests.py @@ -1,7 +1,6 @@ import logging import os from typing import Any, Optional -from urllib.parse import urljoin import requests from colorlog import ColoredFormatter diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py index 532cfb4..207add8 100644 --- 
a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py @@ -2,7 +2,7 @@ import json import logging from datetime import datetime -from typing import Any, Dict, List, Literal, MutableMapping, Optional +from typing import Any, List, Literal, MutableMapping, Optional import pyessv from colorlog import ColoredFormatter diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 38ef8be..4e75cb1 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -12,7 +12,6 @@ from STACpopulator.api_requests import ( post_stac_collection, post_stac_item, - stac_collection_exists, stac_host_reachable, ) from STACpopulator.input import GenericLoader @@ -64,7 +63,6 @@ def __init__( self._stac_host = self.validate_host(stac_host) self.update = update - # self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() self._collection_id = self.collection_name LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") From 9e919c25df954306cedcfa813bc65e1d4ee2c06b Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 14:01:56 -0400 Subject: [PATCH 55/69] adding __init__ files --- STACpopulator/implementations/CMIP6-UofT/__init__.py | 0 STACpopulator/implementations/NEX-GDDP-UofT/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 STACpopulator/implementations/CMIP6-UofT/__init__.py create mode 100644 STACpopulator/implementations/NEX-GDDP-UofT/__init__.py diff --git a/STACpopulator/implementations/CMIP6-UofT/__init__.py b/STACpopulator/implementations/CMIP6-UofT/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/STACpopulator/implementations/NEX-GDDP-UofT/__init__.py b/STACpopulator/implementations/NEX-GDDP-UofT/__init__.py new file mode 100644 index 0000000..e69de29 From c62fb801439b2d4900550e157df431984122654e Mon Sep 17 
00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 14:07:47 -0400 Subject: [PATCH 56/69] fix --- STACpopulator/implementations/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 STACpopulator/implementations/__init__.py diff --git a/STACpopulator/implementations/__init__.py b/STACpopulator/implementations/__init__.py new file mode 100644 index 0000000..e69de29 From 10db1281b46298828727293159e7a5a52e71cb89 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 19 Oct 2023 16:20:58 -0400 Subject: [PATCH 57/69] more fixes --- Makefile | 2 +- .../implementations/{CMIP6-UofT => CMIP6_UofT}/__init__.py | 0 .../implementations/{CMIP6-UofT => CMIP6_UofT}/add_CMIP6.py | 2 +- .../{CMIP6-UofT => CMIP6_UofT}/collection_config.yml | 0 .../implementations/{CMIP6-UofT => CMIP6_UofT}/extensions.py | 0 .../{NEX-GDDP-UofT => NEX_GDDP_UofT}/__init__.py | 0 .../{NEX-GDDP-UofT => NEX_GDDP_UofT}/add_NEX-GDDP.py | 0 7 files changed, 2 insertions(+), 2 deletions(-) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/__init__.py (100%) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/add_CMIP6.py (98%) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/collection_config.yml (100%) rename STACpopulator/implementations/{CMIP6-UofT => CMIP6_UofT}/extensions.py (100%) rename STACpopulator/implementations/{NEX-GDDP-UofT => NEX_GDDP_UofT}/__init__.py (100%) rename STACpopulator/implementations/{NEX-GDDP-UofT => NEX_GDDP_UofT}/add_NEX-GDDP.py (100%) diff --git a/Makefile b/Makefile index 914e513..b08e1b8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ IMP_DIR = STACpopulator/implementations STAC_HOST = http://localhost:8880/stac testcmip6: - python $(IMP_DIR)/CMIP6-UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html + python $(IMP_DIR)/CMIP6_UofT/add_CMIP6.py $(STAC_HOST) 
https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html delcmip6: curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' diff --git a/STACpopulator/implementations/CMIP6-UofT/__init__.py b/STACpopulator/implementations/CMIP6_UofT/__init__.py similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/__init__.py rename to STACpopulator/implementations/CMIP6_UofT/__init__.py diff --git a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py similarity index 98% rename from STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py rename to STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 207add8..56bf4e6 100644 --- a/STACpopulator/implementations/CMIP6-UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -6,10 +6,10 @@ import pyessv from colorlog import ColoredFormatter -from extensions import DataCubeHelper from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator from STACpopulator import STACpopulatorBase +from STACpopulator.implementations.CMIP6_UofT.extensions import DataCubeHelper from STACpopulator.input import THREDDSLoader from STACpopulator.models import GeoJSONPolygon, STACItemProperties from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal diff --git a/STACpopulator/implementations/CMIP6-UofT/collection_config.yml b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/collection_config.yml rename to STACpopulator/implementations/CMIP6_UofT/collection_config.yml diff --git a/STACpopulator/implementations/CMIP6-UofT/extensions.py b/STACpopulator/implementations/CMIP6_UofT/extensions.py similarity index 100% rename from STACpopulator/implementations/CMIP6-UofT/extensions.py rename to STACpopulator/implementations/CMIP6_UofT/extensions.py diff --git 
a/STACpopulator/implementations/NEX-GDDP-UofT/__init__.py b/STACpopulator/implementations/NEX_GDDP_UofT/__init__.py similarity index 100% rename from STACpopulator/implementations/NEX-GDDP-UofT/__init__.py rename to STACpopulator/implementations/NEX_GDDP_UofT/__init__.py diff --git a/STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py b/STACpopulator/implementations/NEX_GDDP_UofT/add_NEX-GDDP.py similarity index 100% rename from STACpopulator/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py rename to STACpopulator/implementations/NEX_GDDP_UofT/add_NEX-GDDP.py From 25985dbbf27aaeeae1c53d8f82ac648e6d31a379 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Mon, 23 Oct 2023 12:00:06 -0400 Subject: [PATCH 58/69] diagnostics --- STACpopulator/input.py | 8 ++++++++ STACpopulator/stac_utils.py | 6 ++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 54f0d68..be67ede 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -76,6 +76,8 @@ def reset(self): def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" + # print(f"At START catalog head is: {self.catalog_head}") + print(self.catalog_head.__dict__) if self.catalog_head.datasets.items(): for item_name, ds in self.catalog_head.datasets.items(): attrs = self.extract_metadata(ds) @@ -84,6 +86,7 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: if self._depth > 0: for name, ref in self.catalog_head.catalog_refs.items(): self.catalog_head = ref.follow() + print(f"catalog head is: {self.catalog_head}") self._depth -= 1 yield from self @@ -91,6 +94,11 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An # Get URL for NCML service url = ds.access_urls["NCML"] + print(url) + # print(self.catalog_head) + print(f"ds = {ds}") + print(ds.__dict__) + print(self.catalog_head.catalog_url) LOGGER.info("Requesting 
NcML dataset description") # r = requests.get(url) r = requests.get(url, params={"catalog": self.catalog_head, "dataset": ds}) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index fe8c650..62b795f 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -151,8 +151,10 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Add assets if "access_urls" in attrs: + print("access_urls") root = attrs["access_urls"] elif "THREDDSMetadata" in attrs["groups"]: + print("THREDDSMetadata") root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] else: root = {} @@ -162,8 +164,8 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) item.add_asset(name, asset) - if root: - item.add_link(magpie_resource_link(root["HTTPServer"])) + # if root: + # item.add_link(magpie_resource_link(root["HTTPServer"])) return item From 6d675bcfe156bd633e665bb514052a245596156f Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Mon, 23 Oct 2023 15:59:35 -0400 Subject: [PATCH 59/69] removing unused code --- tests/test_client.py | 5 ----- tests/test_cmip6_extension.py | 20 -------------------- 2 files changed, 25 deletions(-) delete mode 100644 tests/test_client.py delete mode 100644 tests/test_cmip6_extension.py diff --git a/tests/test_client.py b/tests/test_client.py deleted file mode 100644 index b35f9ac..0000000 --- a/tests/test_client.py +++ /dev/null @@ -1,5 +0,0 @@ -from pystac_client import Client - -def test_cmip6(): - """Assume some CMIP6 has been ingested.""" - c = Client.open("http://localhost:8880/stac") diff --git a/tests/test_cmip6_extension.py b/tests/test_cmip6_extension.py deleted file mode 100644 index f899a33..0000000 --- a/tests/test_cmip6_extension.py +++ /dev/null @@ -1,20 +0,0 @@ -from STACpopulator.extensions import cmip6 -from STACpopulator.stac_utils import 
CFJsonItem -import xncml -from pathlib import Path -from pystac import Item, validation - -TEST_DATA = Path(__file__).parent / "data" - -def test_extension(): - ds = xncml.Dataset(TEST_DATA / "o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml") - attrs = ds.to_cf_dict() - - item = CFJsonItem("test", attrs).item - validation.validate(item) - - ext = cmip6.CMIP6Extension.ext(item, add_if_missing=True) - ext.apply(attrs["attributes"]) - assert "cmip6:realm" in item.properties - - From 65bd5bbaac751e535c08fb1441cdef1087e85db2 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Mon, 23 Oct 2023 17:43:44 -0400 Subject: [PATCH 60/69] refactoring to allow more flexibility --- Makefile | 2 +- .../implementations/CMIP6_UofT/add_CMIP6.py | 20 +-- .../CMIP6_UofT/collection_config.yml | 1 + STACpopulator/input.py | 60 ++++----- STACpopulator/populator_base.py | 53 +++----- STACpopulator/stac_utils.py | 79 +++++++---- tests/ref.txt | 124 ++++++++++++++++++ tests/test_standalone_stac_item.py | 30 +++++ 8 files changed, 269 insertions(+), 100 deletions(-) create mode 100644 tests/ref.txt create mode 100644 tests/test_standalone_stac_item.py diff --git a/Makefile b/Makefile index b08e1b8..439f93e 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ testcmip6: python $(IMP_DIR)/CMIP6_UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html delcmip6: - curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' + curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6_UofT' @echo "" starthost: diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 56bf4e6..fc39baf 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -10,7 +10,7 @@ from STACpopulator import STACpopulatorBase from STACpopulator.implementations.CMIP6_UofT.extensions 
import DataCubeHelper -from STACpopulator.input import THREDDSLoader +from STACpopulator.input import GenericLoader, THREDDSLoader from STACpopulator.models import GeoJSONPolygon, STACItemProperties from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal @@ -122,7 +122,7 @@ class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon - def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bool] = False) -> None: + def __init__(self, stac_host: str, data_loader: GenericLoader, update: Optional[bool] = False) -> None: """Constructor :param stac_host: URL to the STAC API @@ -130,13 +130,8 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bo :param thredds_catalog_url: the URL to the THREDDS catalog to ingest :type thredds_catalog_url: str """ - data_loader = THREDDSLoader(thredds_catalog_url) - super().__init__(stac_host, data_loader, update) - def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): - pass - def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Creates the STAC item. 
@@ -172,5 +167,14 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) args = parser.parse_args() LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.update) + + mode = "full" + + if mode == "full": + data_loader = THREDDSLoader(args.thredds_catalog_URL) + else: + # To be implemented + data_loader = ErrorLoader(args.error_file) + + c = CMIP6populator(args.stac_host, data_loader, args.update) c.ingest() diff --git a/STACpopulator/implementations/CMIP6_UofT/collection_config.yml b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml index a57875b..0f43c78 100644 --- a/STACpopulator/implementations/CMIP6_UofT/collection_config.yml +++ b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml @@ -1,4 +1,5 @@ title: CMIP6 +id: CMIP6_UofT description: Coupled Model Intercomparison Project phase 6 keywords: ['CMIP', 'CMIP6', 'WCRP', 'Climate Change'] license: "CC-BY-4.0" diff --git a/STACpopulator/input.py b/STACpopulator/input.py index be67ede..272f9ad 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -4,12 +4,11 @@ import pystac import requests -import siphon import xncml from colorlog import ColoredFormatter from siphon.catalog import TDSCatalog -from STACpopulator.stac_utils import numpy_to_python_datatypes +from STACpopulator.stac_utils import numpy_to_python_datatypes, url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -52,23 +51,41 @@ def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> Non super().__init__() self._depth = depth if depth is not None else 1000 - if thredds_catalog_url.endswith(".html"): - thredds_catalog_url = thredds_catalog_url.replace(".html", ".xml") - LOGGER.info("Converting catalog URL from html to xml") + self.thredds_catalog_URL = self.validate_catalog_url(thredds_catalog_url) - 
self.thredds_catalog_URL = thredds_catalog_url self.catalog = TDSCatalog(self.thredds_catalog_URL) self.catalog_head = self.catalog self.links.append(self.magpie_collection_link()) - def magpie_collection_link(self): - """Return Link to THREDDS catalog.""" + def validate_catalog_url(self, url: str) -> str: + """Validate the user-provided catalog URL. + + :param url: URL to the THREDDS catalog + :type url: str + :raises RuntimeError: if URL is invalid or contains query parameters. + :return: a valid URL + :rtype: str + """ + if url_validate(url): + if "?" in url: + raise RuntimeError("THREDDS catalog URL should not contain query parameter") + else: + raise RuntimeError("Invalid URL") + + return url.replace(".html", ".xml") if url.endswith(".html") else url + + def magpie_collection_link(self) -> pystac.Link: + """Creates a PySTAC Link for the collection that is used by Cowbird and Magpie. + + :return: A PySTAC Link + :rtype: pystac.Link + """ url = self.thredds_catalog_URL parts = url.split("/") i = parts.index("catalog") - service = parts[i - 1] + # service = parts[i - 1] path = "/".join(parts[i + 1 : -1]) - return pystac.Link(rel="source", target=url, media_type="text/xml", title=f"{service}:{path}") + return pystac.Link(rel="source", target=url, media_type="text/xml", title=path) def reset(self): """Reset the generator.""" @@ -76,40 +93,23 @@ def reset(self): def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" - # print(f"At START catalog head is: {self.catalog_head}") - print(self.catalog_head.__dict__) if self.catalog_head.datasets.items(): for item_name, ds in self.catalog_head.datasets.items(): - attrs = self.extract_metadata(ds) + attrs = self.extract_metadata(ds.access_urls["NCML"], self.catalog_head.catalog_url, ds.url_path) yield item_name, attrs if self._depth > 0: for name, ref in self.catalog_head.catalog_refs.items(): self.catalog_head = ref.follow() - 
print(f"catalog head is: {self.catalog_head}") self._depth -= 1 yield from self - def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: - # Get URL for NCML service - url = ds.access_urls["NCML"] - - print(url) - # print(self.catalog_head) - print(f"ds = {ds}") - print(ds.__dict__) - print(self.catalog_head.catalog_url) + def extract_metadata(self, ncml_url: str, catalog_url: str, dataset_path: str) -> MutableMapping[str, Any]: LOGGER.info("Requesting NcML dataset description") - # r = requests.get(url) - r = requests.get(url, params={"catalog": self.catalog_head, "dataset": ds}) - + r = requests.get(ncml_url, params={"catalog": catalog_url, "dataset": dataset_path}) # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() - attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) - - attrs["access_urls"] = ds.access_urls - return attrs diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 4e75cb1..e6b795d 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,12 +1,9 @@ import logging -import os -import sys from abc import ABC, abstractmethod from datetime import datetime from typing import Any, MutableMapping, Optional import pystac -import yaml from colorlog import ColoredFormatter from STACpopulator.api_requests import ( @@ -15,7 +12,7 @@ stac_host_reachable, ) from STACpopulator.input import GenericLoader -from STACpopulator.stac_utils import url_validate +from STACpopulator.stac_utils import load_collection_configuration, url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -44,20 +41,7 @@ def __init__( """ super().__init__() - self._collection_info_filename = "collection_config.yml" - self._app_directory = os.path.dirname(sys.argv[0]) - - if not os.path.exists(os.path.join(self._app_directory, 
self._collection_info_filename)): - raise RuntimeError(f"Missing {self._collection_info_filename} file for this implementation") - - with open(os.path.join(self._app_directory, self._collection_info_filename)) as f: - self._collection_info = yaml.load(f, yaml.Loader) - - req_definitions = ["title", "description", "keywords", "license"] - for req in req_definitions: - if req not in self._collection_info.keys(): - LOGGER.error(f"'{req}' is required in the configuration file") - raise RuntimeError(f"'{req}' is required in the configuration file") + self._collection_info = load_collection_configuration() self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) @@ -78,7 +62,7 @@ def stac_host(self) -> str: @property def collection_id(self) -> str: - return self._collection_id + return self._collection_info["id"] @property @abstractmethod @@ -87,15 +71,26 @@ def item_properties_model(self): models.STACItemProperties.""" pass + @property + @abstractmethod + def item_geometry_model(self): + """In derived classes, this property should be defined as a pydantic data model that derives from + models.STACItemProperties.""" + pass + + @abstractmethod + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + pass + def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): raise ValueError("stac_host URL is not appropriately formatted") if not stac_host_reachable(stac_host): - raise ValueError("stac_host is not reachable") + raise RuntimeError("stac_host is not reachable") return stac_host - def create_stac_collection(self): + def create_stac_collection(self) -> None: """ Create a basic STAC collection. 
@@ -114,8 +109,7 @@ def create_stac_collection(self): ) self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) - - collection = pystac.Collection(id=self.collection_id, **self._collection_info) + collection = pystac.Collection(**self._collection_info) collection.add_links(self._ingest_pipeline.links) @@ -127,16 +121,3 @@ def ingest(self) -> None: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) post_stac_item(self.stac_host, self.collection_id, item_name, stac_item, self.update) - # try: - # pass - # except Exception: - # LOGGER.error(f"Failed adding STAC item {item_name}") - # self.handle_ingestion_error("Posting Error", item_name, item_data) - - @abstractmethod - def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): - pass - - @abstractmethod - def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - pass diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 62b795f..d3786e1 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,12 +1,27 @@ +import datetime import json +import logging +import os import re +import sys from typing import Any, Literal, MutableMapping import numpy as np import pystac +import yaml +from colorlog import ColoredFormatter from STACpopulator.models import STACItem +LOGGER = logging.getLogger(__name__) +LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" +formatter = ColoredFormatter(LOGFORMAT) +stream = logging.StreamHandler() +stream.setFormatter(formatter) +LOGGER.addHandler(stream) +LOGGER.setLevel(logging.INFO) +LOGGER.propagate = False + def url_validate(target: str) -> bool: """Validate whether a supplied URL is reliably written. 
@@ -32,6 +47,33 @@ def url_validate(target: str) -> bool: return True if re.match(url_regex, target) else False +def load_collection_configuration() -> MutableMapping[str, Any]: + """Reads details of the STAC Collection to be created from a configuration file. the + code expects a "collection_config.yml" file to be present in the app directory. + + :raises RuntimeError: If the configuration file is not present + :raises RuntimeError: If required values are not present in the configuration file + :return: A python dictionary describing the details of the Collection + :rtype: MutableMapping[str, Any] + """ + collection_info_filename = "collection_config.yml" + app_directory = os.path.dirname(sys.argv[0]) + + if not os.path.exists(os.path.join(app_directory, collection_info_filename)): + raise RuntimeError(f"Missing {collection_info_filename} file for this implementation") + + with open(os.path.join(app_directory, collection_info_filename)) as f: + collection_info = yaml.load(f, yaml.Loader) + + req_definitions = ["title", "id", "description", "keywords", "license"] + for req in req_definitions: + if req not in collection_info.keys(): + LOGGER.error(f"'{req}' is required in the configuration file") + raise RuntimeError(f"'{req}' is required in the configuration file") + + return collection_info + + def collection2literal(collection): terms = tuple(term.label for term in collection) return Literal[terms] @@ -149,40 +191,34 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Convert pydantic STAC item to a PySTAC Item item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - # Add assets - if "access_urls" in attrs: - print("access_urls") - root = attrs["access_urls"] - elif "THREDDSMetadata" in attrs["groups"]: - print("THREDDSMetadata") - root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] - else: - root = {} + root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] for 
name, url in root.items(): name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) + + name = asset_name_remaps[name] if name in asset_name_remaps.keys() else name item.add_asset(name, asset) - # if root: - # item.add_link(magpie_resource_link(root["HTTPServer"])) + item.add_link(magpie_resource_link(root["httpserver_service"])) return item +asset_name_remaps = { + "httpserver_service": "HTTPServer", + "opendap_service": "OPENDAP", + "wcs_service": "WCS", + "wms_service": "WMS", + "nccs_service": "NetcdfSubset", +} + media_types = { "httpserver_service": "application/x-netcdf", "opendap_service": pystac.MediaType.HTML, "wcs_service": pystac.MediaType.XML, "wms_service": pystac.MediaType.XML, "nccs_service": "application/x-netcdf", - "HTTPServer": "application/x-netcdf", - "OPENDAP": pystac.MediaType.HTML, - "NCML": pystac.MediaType.XML, - "WCS": pystac.MediaType.XML, - "ISO": pystac.MediaType.XML, - "WMS": pystac.MediaType.XML, - "NetcdfSubset": "application/x-netcdf", } asset_roles = { @@ -191,11 +227,4 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop "wcs_service": ["data"], "wms_service": ["visual"], "nccs_service": ["data"], - "HTTPServer": ["data"], - "OPENDAP": ["data"], - "NCML": ["metadata"], - "WCS": ["data"], - "ISO": ["metadata"], - "WMS": ["visual"], - "NetcdfSubset": ["data"], } diff --git a/tests/ref.txt b/tests/ref.txt new file mode 100644 index 0000000..f3b8c23 --- /dev/null +++ b/tests/ref.txt @@ -0,0 +1,124 @@ +{ + "type": "Feature", + "stac_version": "1.0.0", + "id": "ScenarioMIP_CCCma_CanESM5_ssp245_r13i1p2f1_SImon_siconc_gn", + "properties": { + "start_datetime": "2019-12-06T12:00:00Z", + "end_datetime": "2020-11-04T12:00:00Z", + "datetime": null, + "cmip6:Conventions": "CF-1.7 CMIP-6.2", + "cmip6:activity_id": "ScenarioMIP", + "cmip6:creation_date": "2019-09-25T23:01:33Z", + 
"cmip6:data_specs_version": "01.00.30", + "cmip6:experiment": "update of RCP4.5 based on SSP2", + "cmip6:experiment_id": "ssp245", + "cmip6:frequency": "mon", + "cmip6:further_info_url": "https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1", + "cmip6:grid_label": "gn", + "cmip6:institution": "Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada", + "cmip6:institution_id": "CCCma", + "cmip6:nominal_resolution": "100 km", + "cmip6:realm": [ + "seaIce" + ], + "cmip6:source": "CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\nseaIce: LIM2", + "cmip6:source_id": "CanESM5", + "cmip6:source_type": [ + "AOGCM" + ], + "cmip6:sub_experiment": "none", + "cmip6:sub_experiment_id": "none", + "cmip6:table_id": "SImon", + "cmip6:variable_id": "siconc", + "cmip6:variant_label": "r13i1p2f1", + "cmip6:initialization_index": 1, + "cmip6:physics_index": 2, + "cmip6:realization_index": 13, + "cmip6:forcing_index": 1, + "cmip6:tracking_id": "hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95", + "cmip6:version": "v20190429", + "cmip6:product": "model-output", + "cmip6:license": "CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). 
Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.", + "cmip6:grid": "ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m", + "cmip6:mip_era": "CMIP6" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + 0.049800001084804535, + -78.39350128173828 + ], + [ + 0.049800001084804535, + 89.74176788330078 + ], + [ + 359.99493408203125, + 89.74176788330078 + ], + [ + 359.99493408203125, + -78.39350128173828 + ], + [ + 0.049800001084804535, + -78.39350128173828 + ] + ] + ] + }, + "links": [ + { + "rel": "source", + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "title": "birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + } + ], + "assets": { + "HTTPServer": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + }, + "OPENDAP": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "text/html", + 
"roles": [ + "data" + ] + }, + "WCS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WCS&version=1.0.0&request=GetCapabilities", + "type": "application/xml", + "roles": [ + "data" + ] + }, + "WMS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WMS&version=1.3.0&request=GetCapabilities", + "type": "application/xml", + "roles": [ + "visual" + ] + }, + "NetcdfSubset": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc/dataset.html", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + } + }, + "bbox": [ + 0.049800001084804535, + -78.39350128173828, + 359.99493408203125, + 89.74176788330078 + ], + "stac_extensions": [] +} \ No newline at end of file diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py new file mode 100644 index 0000000..f0dc3c8 --- /dev/null +++ b/tests/test_standalone_stac_item.py @@ -0,0 +1,30 @@ +import json + +import requests +import xncml + +from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import ( + CMIP6ItemProperties, + make_cmip6_item_id, +) +from STACpopulator.models import GeoJSONPolygon +from STACpopulator.stac_utils import STAC_item_from_metadata + + +def test_standalone_stac_item(): + url = ( + "https://pavics.ouranos.ca/twitcher/ows/proxy/" + "thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + "?catalog=https%3A%2F%2Fpavics.ouranos.ca%2Ftwitcher%2Fows%2Fproxy%2F" + "thredds%2Fcatalog%2Fbirdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fcatalog.html" + "&dataset=birdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fsic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + ) + + attrs = xncml.Dataset.from_text(requests.get(url).content).to_cf_dict() + stac_item_id 
= make_cmip6_item_id(attrs["attributes"]) + stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) + + with open("tests/ref.txt", "r") as ff: + reference = json.load(ff) + + assert stac_item.to_dict() == reference From f540dbe1ef7f4e17ad736e743bfb77c184616fd3 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 26 Oct 2023 13:53:22 -0400 Subject: [PATCH 61/69] fix datacube extension --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 3 ++- STACpopulator/implementations/CMIP6_UofT/extensions.py | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index fc39baf..32f8577 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -7,6 +7,7 @@ import pyessv from colorlog import ColoredFormatter from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator +from pystac.extensions.datacube import DatacubeExtension from STACpopulator import STACpopulatorBase from STACpopulator.implementations.CMIP6_UofT.extensions import DataCubeHelper @@ -150,7 +151,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) try: dchelper = DataCubeHelper(item_data) dc_ext = DatacubeExtension.ext(item, add_if_missing=True) - dc_ext.apply(dimensions=dchelper.dimensions(), variables=dchelper.variables()) + dc_ext.apply(dimensions=dchelper.dimensions, variables=dchelper.variables) except: LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") diff --git a/STACpopulator/implementations/CMIP6_UofT/extensions.py b/STACpopulator/implementations/CMIP6_UofT/extensions.py index 9f77b0f..31450a6 100644 --- a/STACpopulator/implementations/CMIP6_UofT/extensions.py +++ b/STACpopulator/implementations/CMIP6_UofT/extensions.py @@ -2,6 +2,8 @@ from pystac.extensions.datacube import 
Dimension, DimensionType, Variable, VariableType +from STACpopulator.stac_utils import ncattrs_to_bbox + class DataCubeHelper: """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" @@ -147,7 +149,7 @@ def dimensions(self) -> dict: for name, length in self.attrs["dimensions"].items(): v = self.attrs["variables"].get(name) if v: - bbox = self.obj.ncattrs_to_bbox() + bbox = ncattrs_to_bbox(self.attrs) for key, criteria in self.coordinate_criteria.items(): for criterion, expected in criteria.items(): if v["attributes"].get(criterion, None) in expected: @@ -196,8 +198,8 @@ def variables(self) -> dict: ) return variables - @property - @functools.cache + # @property + # @functools.cache def is_coordinate(self, attrs: dict) -> bool: """Return whether variable is a coordinate.""" for key, criteria in self.coordinate_criteria.items(): From 323c9453afb78a67461a6d8a2a992a7b02108b98 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 26 Oct 2023 20:12:01 -0400 Subject: [PATCH 62/69] pr changes --- STACpopulator/populator_base.py | 6 +++--- tests/{ref.txt => ref.json} | 0 tests/test_standalone_stac_item.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename tests/{ref.txt => ref.json} (100%) diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index e6b795d..f8ccb1c 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -69,18 +69,18 @@ def collection_id(self) -> str: def item_properties_model(self): """In derived classes, this property should be defined as a pydantic data model that derives from models.STACItemProperties.""" - pass + raise NotImplementedError @property @abstractmethod def item_geometry_model(self): """In derived classes, this property should be defined as a pydantic data model that derives from models.STACItemProperties.""" - pass + raise NotImplementedError @abstractmethod def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> 
MutableMapping[str, Any]: - pass + raise NotImplementedError def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): diff --git a/tests/ref.txt b/tests/ref.json similarity index 100% rename from tests/ref.txt rename to tests/ref.json diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py index f0dc3c8..d7239a8 100644 --- a/tests/test_standalone_stac_item.py +++ b/tests/test_standalone_stac_item.py @@ -24,7 +24,7 @@ def test_standalone_stac_item(): stac_item_id = make_cmip6_item_id(attrs["attributes"]) stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) - with open("tests/ref.txt", "r") as ff: + with open("tests/ref.json", "r") as ff: reference = json.load(ff) assert stac_item.to_dict() == reference From 0581c615c9959bc0d1f898b275d4f8f93c66c15d Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Thu, 26 Oct 2023 20:13:50 -0400 Subject: [PATCH 63/69] reverting to old way to read thredds access links --- STACpopulator/input.py | 9 ++++++--- STACpopulator/stac_utils.py | 25 ++++++++++++------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 272f9ad..2522f15 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -4,6 +4,7 @@ import pystac import requests +import siphon import xncml from colorlog import ColoredFormatter from siphon.catalog import TDSCatalog @@ -95,7 +96,7 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" if self.catalog_head.datasets.items(): for item_name, ds in self.catalog_head.datasets.items(): - attrs = self.extract_metadata(ds.access_urls["NCML"], self.catalog_head.catalog_url, ds.url_path) + attrs = self.extract_metadata(ds) yield item_name, attrs if self._depth > 0: @@ -104,12 +105,14 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: self._depth -= 1 
yield from self - def extract_metadata(self, ncml_url: str, catalog_url: str, dataset_path: str) -> MutableMapping[str, Any]: + def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: LOGGER.info("Requesting NcML dataset description") - r = requests.get(ncml_url, params={"catalog": catalog_url, "dataset": dataset_path}) + url = ds.access_urls["NCML"] + r = requests.get(url) # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) + attrs["access_urls"] = ds.access_urls return attrs diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index d3786e1..c245ed1 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -191,16 +191,15 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Convert pydantic STAC item to a PySTAC Item item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] + root = attrs["access_urls"] for name, url in root.items(): name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) - name = asset_name_remaps[name] if name in asset_name_remaps.keys() else name item.add_asset(name, asset) - item.add_link(magpie_resource_link(root["httpserver_service"])) + item.add_link(magpie_resource_link(root["HTTPServer"])) return item @@ -214,17 +213,17 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop } media_types = { - "httpserver_service": "application/x-netcdf", - "opendap_service": pystac.MediaType.HTML, - "wcs_service": pystac.MediaType.XML, - "wms_service": pystac.MediaType.XML, - "nccs_service": "application/x-netcdf", + "HTTPServer": "application/x-netcdf", + "OPENDAP": pystac.MediaType.HTML, + "WCS": 
pystac.MediaType.XML, + "WMS": pystac.MediaType.XML, + "NetcdfSubset": "application/x-netcdf", } asset_roles = { - "httpserver_service": ["data"], - "opendap_service": ["data"], - "wcs_service": ["data"], - "wms_service": ["visual"], - "nccs_service": ["data"], + "HTTPServer": ["data"], + "OPENDAP": ["data"], + "WCS": ["data"], + "WMS": ["visual"], + "NetcdfSubset": ["data"], } From 37a26e19d50c8c129f78dce995344b400ca1ee8a Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:11:13 -0500 Subject: [PATCH 64/69] adding ability to get single file from THREDDS loader --- STACpopulator/input.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/STACpopulator/input.py b/STACpopulator/input.py index 2522f15..25750c0 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -105,6 +105,9 @@ def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: self._depth -= 1 yield from self + def __getitem__(self, dataset): + return self.catalog.datasets[dataset] + def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: LOGGER.info("Requesting NcML dataset description") url = ds.access_urls["NCML"] From e55591dd0b7f7db6cd4ee7256512d5693d282145 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:12:15 -0500 Subject: [PATCH 65/69] making make_cmip6_item_id a staticmethod --- .../implementations/CMIP6_UofT/add_CMIP6.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 32f8577..a4285bf 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -133,6 +133,22 @@ def __init__(self, stac_host: str, data_loader: GenericLoader, update: Optional[ """ super().__init__(stac_host, data_loader, update) + @staticmethod + def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: +
"""Return a unique ID for CMIP6 data item.""" + keys = [ + "activity_id", + "institution_id", + "source_id", + "experiment_id", + "variant_label", + "table_id", + "variable_id", + "grid_label", + ] + name = "_".join(attrs[k] for k in keys) + return name + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Creates the STAC item. @@ -143,7 +159,7 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) :return: _description_ :rtype: MutableMapping[str, Any] """ - iid = make_cmip6_item_id(item_data["attributes"]) + iid = self.make_cmip6_item_id(item_data["attributes"]) item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) From f1e28db4c22a6ba6082ba11a9acf896cb550accf Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:13:34 -0500 Subject: [PATCH 66/69] wrapping call to make STAC item with a try-except block --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index a4285bf..65c1457 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -161,7 +161,12 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) """ iid = self.make_cmip6_item_id(item_data["attributes"]) - item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) + try: + item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) + except pydantic_core._pydantic_core.ValidationError: + print(f"ERROR: ValidationError for {iid}") + return -1 + # Add datacube extension try: From 8bb21e11b6d7642cce11c6f3a1f0aeda684716ce Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:15:21
-0500 Subject: [PATCH 67/69] fixing commit e55591dd0b7f7db6cd4ee7256512d5693d282145 --- .../implementations/CMIP6_UofT/add_CMIP6.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 65c1457..3de2435 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -103,22 +103,6 @@ def validate_version(cls, v: str, info: FieldValidationInfo): return v -def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: - """Return a unique ID for CMIP6 data item.""" - keys = [ - "activity_id", - "institution_id", - "source_id", - "experiment_id", - "variant_label", - "table_id", - "variable_id", - "grid_label", - ] - name = "_".join(attrs[k] for k in keys) - return name - - class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon From 3055afc382bcb27a34abd0a63d0428371a84a034 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:16:51 -0500 Subject: [PATCH 68/69] more fixes to previous commits --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 3de2435..eedecc9 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -4,6 +4,7 @@ from datetime import datetime from typing import Any, List, Literal, MutableMapping, Optional +import pydantic_core import pyessv from colorlog import ColoredFormatter from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator @@ -151,6 +152,10 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) print(f"ERROR: ValidationError for {iid}") return -1 + # Add the CMIP6 
STAC extension + item.stac_extensions.append( + "https://raw.githubusercontent.com/TomAugspurger/cmip6/main/json-schema/schema.json" + ) # Add datacube extension try: From 3f1d2843fc0a887f5ab74ae06af034d70fda01a6 Mon Sep 17 00:00:00 2001 From: Deepak Chandan Date: Wed, 8 Nov 2023 16:18:02 -0500 Subject: [PATCH 69/69] making tracking_id optional in CMIP6ItemProperties --- STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index eedecc9..6d6fedb 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -74,7 +74,7 @@ class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): physics_index: int realization_index: int forcing_index: int - tracking_id: str + tracking_id: str = "" version: str = Field("") product: str license: str