diff --git a/.gitignore b/.gitignore index 80f0926..7e12211 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ *.pyc STACpopulator.egg-info/ .vscode/ +.venv/ +jupyter/ +.idea +.vscode diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..439f93e --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +IMP_DIR = STACpopulator/implementations +STAC_HOST = http://localhost:8880/stac + +testcmip6: + python $(IMP_DIR)/CMIP6_UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html + +delcmip6: + curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6_UofT' + @echo "" + +starthost: + docker compose up + +stophost: + docker compose down + +del_docker_volume: stophost + docker volume rm stac-populator_stac-db + +resethost: del_docker_volume starthost diff --git a/README.md b/README.md index bcac544..808926c 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,6 @@ Currently, one implementation of `STACpopulatorBase` is provided in [add_CMIP6.p The provided `docker-compose` file can be used to launch a test STAC server. The `add_CMIP6.py` script can be run as: ``` -python implementations/add_CMIP6.py http://localhost:8880/stac/ https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/datasets/simulations/bias_adjusted/catalog.html implementations/CMIP6.yml +python implementations/CMIP6-UofT/add_CMIP6.py http://localhost:8880/stac/ https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html implementations/CMIP6-UofT/CMIP6.yml ``` -Note: in the script above, I am currently using a sample THREDDS catalog URL and not one relevant to the global scale CMIP6 data. \ No newline at end of file +Note: in the script above, I am currently using a sample THREDDS catalog URL and not one relevant to the global scale CMIP6 data. 
diff --git a/STACpopulator/api_requests.py b/STACpopulator/api_requests.py new file mode 100644 index 0000000..35b0dc2 --- /dev/null +++ b/STACpopulator/api_requests.py @@ -0,0 +1,94 @@ +import logging +import os +from typing import Any, Optional + +import requests +from colorlog import ColoredFormatter + +LOGGER = logging.getLogger(__name__) +LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" +formatter = ColoredFormatter(LOGFORMAT) +stream = logging.StreamHandler() +stream.setFormatter(formatter) +LOGGER.addHandler(stream) +LOGGER.setLevel(logging.INFO) +LOGGER.propagate = False + + +def stac_host_reachable(url: str) -> bool: + try: + registry = requests.get(url) + registry.raise_for_status() + return True + except (requests.exceptions.RequestException, requests.exceptions.ConnectionError): + return False + + +def stac_collection_exists(stac_host: str, collection_id: str) -> bool: + """ + Get a STAC collection + + Returns the collection JSON. + """ + r = requests.get(os.path.join(stac_host, "collections", collection_id), verify=False) + + return r.status_code == 200 + + +def post_stac_collection(stac_host: str, json_data: dict[str, Any], update: Optional[bool] = True) -> None: + """Post/create a collection on the STAC host + + :param stac_host: address of the STAC host + :type stac_host: str + :param json_data: JSON representation of the STAC collection + :type json_data: dict[str, Any] + :param update: if True, update the collection on the host server if it is already present, defaults to True + :type update: Optional[bool], optional + """ + collection_id = json_data["id"] + r = requests.post(os.path.join(stac_host, "collections"), json=json_data, verify=False) + + if r.status_code == 200: + LOGGER.info(f"Collection {collection_id} successfully created") + elif r.status_code == 409: + if update: + LOGGER.info(f"Collection {collection_id} already exists. 
Updating.") + r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) + r.raise_for_status() + else: + LOGGER.info(f"Collection {collection_id} already exists.") + else: + r.raise_for_status() + + +def post_stac_item( + stac_host: str, collection_id: str, item_name: str, json_data: dict[str, dict], update: Optional[bool] = True +) -> None: + """Post a STAC item to the host server. + + :param stac_host: address of the STAC host + :type stac_host: str + :param collection_id: ID of the collection to which to post this item + :type collection_id: str + :param item_name: name of the STAC item + :type item_name: str + :param json_data: JSON representation of the STAC item + :type json_data: dict[str, dict] + :param update: if True, update the item on the host server if it is already present, defaults to True + :type update: Optional[bool], optional + """ + item_id = json_data["id"] + + r = requests.post(os.path.join(stac_host, f"collections/{collection_id}/items"), json=json_data) + + if r.status_code == 200: + LOGGER.info(f"Item {item_name} successfully added") + elif r.status_code == 409: + if update: + LOGGER.info(f"Item {item_id} already exists. 
Updating.") + r = requests.put(os.path.join(stac_host, f"collections/{collection_id}/items/{item_id}"), json=json_data) + r.raise_for_status() + else: + LOGGER.info(f"Item {item_id} already exists.") + else: + r.raise_for_status() diff --git a/implementations/NEX-GDDP-UofT/add_NEX-GDDP.py b/STACpopulator/implementations/CMIP6_UofT/__init__.py similarity index 100% rename from implementations/NEX-GDDP-UofT/add_NEX-GDDP.py rename to STACpopulator/implementations/CMIP6_UofT/__init__.py diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py new file mode 100644 index 0000000..6d6fedb --- /dev/null +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -0,0 +1,191 @@ +import argparse +import json +import logging +from datetime import datetime +from typing import Any, List, Literal, MutableMapping, Optional + +import pydantic_core +import pyessv +from colorlog import ColoredFormatter +from pydantic import AnyHttpUrl, ConfigDict, Field, FieldValidationInfo, field_validator +from pystac.extensions.datacube import DatacubeExtension + +from STACpopulator import STACpopulatorBase +from STACpopulator.implementations.CMIP6_UofT.extensions import DataCubeHelper +from STACpopulator.input import GenericLoader, THREDDSLoader +from STACpopulator.models import GeoJSONPolygon, STACItemProperties +from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal + +LOGGER = logging.getLogger(__name__) +LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" +formatter = ColoredFormatter(LOGFORMAT) +stream = logging.StreamHandler() +stream.setFormatter(formatter) +LOGGER.addHandler(stream) +LOGGER.setLevel(logging.INFO) +LOGGER.propagate = False + +# CMIP6 controlled vocabulary (CV) +CV = pyessv.WCRP.CMIP6 + +# Enum classes built from the pyessv' CV +ActivityID = collection2literal(CV.activity_id) +ExperimentID = collection2literal(CV.experiment_id) 
+Frequency = collection2literal(CV.frequency) +GridLabel = collection2literal(CV.grid_label) +InstitutionID = collection2literal(CV.institution_id) +NominalResolution = collection2literal(CV.nominal_resolution) +Realm = collection2literal(CV.realm) +SourceID = collection2literal(CV.source_id) +SourceType = collection2literal(CV.source_type) +SubExperimentID = collection2literal(CV.sub_experiment_id) +TableID = collection2literal(CV.table_id) + + +def add_cmip6_prefix(name: str) -> str: + return "cmip6:" + name if "datetime" not in name else name + + +class CMIP6ItemProperties(STACItemProperties, validate_assignment=True): + """Data model for CMIP6 Controlled Vocabulary.""" + + Conventions: str + activity_id: ActivityID + creation_date: datetime + data_specs_version: str + experiment: str + experiment_id: ExperimentID + frequency: Frequency + further_info_url: AnyHttpUrl + grid_label: GridLabel + institution: str + institution_id: InstitutionID + nominal_resolution: NominalResolution + realm: List[Realm] + source: str + source_id: SourceID + source_type: List[SourceType] + sub_experiment: str | Literal["none"] + sub_experiment_id: SubExperimentID | Literal["none"] + table_id: TableID + variable_id: str + variant_label: str + initialization_index: int + physics_index: int + realization_index: int + forcing_index: int + tracking_id: str = "" + version: str = Field("") + product: str + license: str + grid: str + mip_era: str + + model_config = ConfigDict(alias_generator=add_cmip6_prefix, populate_by_name=True) + + @field_validator("initialization_index", "physics_index", "realization_index", "forcing_index", mode="before") + @classmethod + def only_item(cls, v: list[int], info: FieldValidationInfo): + """Pick single item from list.""" + assert len(v) == 1, f"{info.field_name} must have one item only." 
+ return v[0] + + @field_validator("realm", "source_type", mode="before") + @classmethod + def split(cls, v: str, info: FieldValidationInfo): + """Split string into list.""" + return v.split(" ") + + @field_validator("version") + @classmethod + def validate_version(cls, v: str, info: FieldValidationInfo): + assert v[0] == "v", "Version string should begin with a lower case 'v'" + assert v[1:].isdigit(), "All characters in version string, except first, should be digits" + return v + + +class CMIP6populator(STACpopulatorBase): + item_properties_model = CMIP6ItemProperties + item_geometry_model = GeoJSONPolygon + + def __init__(self, stac_host: str, data_loader: GenericLoader, update: Optional[bool] = False) -> None: + """Constructor + + :param stac_host: URL to the STAC API + :type stac_host: str + :param thredds_catalog_url: the URL to the THREDDS catalog to ingest + :type thredds_catalog_url: str + """ + super().__init__(stac_host, data_loader, update) + + @staticmethod + def make_cmip6_item_id(attrs: MutableMapping[str, Any]) -> str: + """Return a unique ID for CMIP6 data item.""" + keys = [ + "activity_id", + "institution_id", + "source_id", + "experiment_id", + "variant_label", + "table_id", + "variable_id", + "grid_label", + ] + name = "_".join(attrs[k] for k in keys) + return name + + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """Creates the STAC item. + + :param item_name: name of the STAC item. 
Interpretation of name is left to the input loader implementation + :type item_name: str + :param item_data: dictionary like representation of all information on the item + :type item_data: MutableMapping[str, Any] + :return: _description_ + :rtype: MutableMapping[str, Any] + """ + iid = self.make_cmip6_item_id(item_data["attributes"]) + + try: + item = STAC_item_from_metadata(iid, item_data, self.item_properties_model, self.item_geometry_model) + except pydantic_core._pydantic_core.ValidationError: + print(f"ERROR: ValidationError for {iid}") + return -1 + + # Add the CMIP6 STAC extension + item.stac_extensions.append( + "https://raw.githubusercontent.com/TomAugspurger/cmip6/main/json-schema/schema.json" + ) + + # Add datacube extension + try: + dchelper = DataCubeHelper(item_data) + dc_ext = DatacubeExtension.ext(item, add_if_missing=True) + dc_ext.apply(dimensions=dchelper.dimensions, variables=dchelper.variables) + except Exception: + LOGGER.warning(f"Failed to add Datacube extension to item {item_name}") + + # print(json.dumps(item.to_dict())) + return json.loads(json.dumps(item.to_dict())) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog="CMIP6 STAC populator") + parser.add_argument("stac_host", type=str, help="STAC API address") + parser.add_argument("thredds_catalog_URL", type=str, help="URL to the CMIP6 THREDDS catalog") + parser.add_argument("--update", action="store_true", help="Update collection and its items") + + args = parser.parse_args() + + LOGGER.info(f"Arguments to call: {args}") + + mode = "full" + + if mode == "full": + data_loader = THREDDSLoader(args.thredds_catalog_URL) + else: + # To be implemented + raise NotImplementedError("error-file loader is not yet implemented") + + c = CMIP6populator(args.stac_host, data_loader, args.update) + c.ingest() diff --git a/implementations/CMIP6-UofT/CMIP6.yml b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml similarity index 93% rename from implementations/CMIP6-UofT/CMIP6.yml rename to
STACpopulator/implementations/CMIP6_UofT/collection_config.yml index a57875b..0f43c78 100644 --- a/implementations/CMIP6-UofT/CMIP6.yml +++ b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml @@ -1,4 +1,5 @@ title: CMIP6 +id: CMIP6_UofT description: Coupled Model Intercomparison Project phase 6 keywords: ['CMIP', 'CMIP6', 'WCRP', 'Climate Change'] license: "CC-BY-4.0" diff --git a/STACpopulator/implementations/CMIP6_UofT/extensions.py b/STACpopulator/implementations/CMIP6_UofT/extensions.py new file mode 100644 index 0000000..31450a6 --- /dev/null +++ b/STACpopulator/implementations/CMIP6_UofT/extensions.py @@ -0,0 +1,209 @@ +import functools + +from pystac.extensions.datacube import Dimension, DimensionType, Variable, VariableType + +from STACpopulator.stac_utils import ncattrs_to_bbox + + +class DataCubeHelper: + """Return STAC Item from CF JSON metadata, as provided by `xncml.Dataset.to_cf_dict`.""" + + axis = {"X": "x", "Y": "y", "Z": "z", "T": "t", "longitude": "x", "latitude": "y", "vertical": "z", "time": "t"} + + def __init__(self, attrs: dict): + """ + Create STAC Item from CF JSON metadata. + + Parameters + ---------- + iid : str + Unique item ID. + attrs: dict + CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. + datamodel : pydantic.BaseModel, optional + Data model for validating global attributes. 
+ """ + self.attrs = attrs + + # From CF-Xarray + self.coordinate_criteria = { + "latitude": { + "standard_name": ("latitude",), + "units": ("degree_north", "degree_N", "degreeN", "degrees_north", "degrees_N", "degreesN"), + "_CoordinateAxisType": ("Lat",), + "long_name": ("latitude",), + }, + "longitude": { + "standard_name": ("longitude",), + "units": ("degree_east", "degree_E", "degreeE", "degrees_east", "degrees_E", "degreesE"), + "_CoordinateAxisType": ("Lon",), + "long_name": ("longitude",), + }, + "Z": { + "standard_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + "_CoordinateAxisType": ("GeoZ", "Height", "Pressure"), + "axis": ("Z",), + "cartesian_axis": ("Z",), + "grads_dim": ("z",), + "long_name": ( + "model_level_number", + "atmosphere_ln_pressure_coordinate", + "atmosphere_sigma_coordinate", + "atmosphere_hybrid_sigma_pressure_coordinate", + "atmosphere_hybrid_height_coordinate", + "atmosphere_sleve_coordinate", + "ocean_sigma_coordinate", + "ocean_s_coordinate", + "ocean_s_coordinate_g1", + "ocean_s_coordinate_g2", + "ocean_sigma_z_coordinate", + "ocean_double_sigma_coordinate", + ), + }, + "vertical": { + "standard_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + "positive": ("up", "down"), + "long_name": ( + "air_pressure", + "height", + "depth", + "geopotential_height", + "altitude", + "height_above_geopotential_datum", + "height_above_reference_ellipsoid", + "height_above_mean_sea_level", + ), + }, + "X": { + "standard_name": 
("projection_x_coordinate", "grid_longitude", "projection_x_angular_coordinate"), + "_CoordinateAxisType": ("GeoX",), + "axis": ("X",), + "cartesian_axis": ("X",), + "grads_dim": ("x",), + "long_name": ( + "projection_x_coordinate", + "grid_longitude", + "projection_x_angular_coordinate", + "cell index along first dimension", + ), + }, + "Y": { + "standard_name": ("projection_y_coordinate", "grid_latitude", "projection_y_angular_coordinate"), + "_CoordinateAxisType": ("GeoY",), + "axis": ("Y",), + "cartesian_axis": ("Y",), + "grads_dim": ("y",), + "long_name": ( + "projection_y_coordinate", + "grid_latitude", + "projection_y_angular_coordinate", + "cell index along second dimension", + ), + }, + "T": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + "time": { + "standard_name": ("time",), + "_CoordinateAxisType": ("Time",), + "axis": ("T",), + "cartesian_axis": ("T",), + "grads_dim": ("t",), + "long_name": ("time",), + }, + } + + @property + @functools.cache + def dimensions(self) -> dict: + """Return Dimension objects required for Datacube extension.""" + + dims = {} + for name, length in self.attrs["dimensions"].items(): + v = self.attrs["variables"].get(name) + if v: + bbox = ncattrs_to_bbox(self.attrs) + for key, criteria in self.coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if v["attributes"].get(criterion, None) in expected: + axis = self.axis[key] + type_ = DimensionType.SPATIAL if axis in ["x", "y", "z"] else DimensionType.TEMPORAL + + if v["type"] == "int": + extent = [0, int(length)] + else: # Not clear the logic is sound + if key == "X": + extent = bbox[0], bbox[2] + elif key == "Y": + extent = bbox[1], bbox[3] + else: + extent = None + + dims[name] = Dimension( + properties=dict( + axis=axis, + type=type_, + extent=extent, + description=v.get("description", v.get("long_name", criteria["standard_name"])), 
+ ) + ) + + return dims + + @property + @functools.cache + def variables(self) -> dict: + """Return Variable objects required for Datacube extension.""" + variables = {} + + for name, meta in self.attrs["variables"].items(): + if name in self.attrs["dimensions"]: + continue + + attrs = meta["attributes"] + variables[name] = Variable( + properties=dict( + dimensions=meta["shape"], + type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, + description=attrs.get("description", attrs.get("long_name")), + unit=attrs.get("units", None), + ) + ) + return variables + + # @property + # @functools.cache + def is_coordinate(self, attrs: dict) -> bool: + """Return whether variable is a coordinate.""" + for key, criteria in self.coordinate_criteria.items(): + for criterion, expected in criteria.items(): + if attrs.get(criterion, None) in expected: + return True + return False diff --git a/STACpopulator/implementations/NEX_GDDP_UofT/__init__.py b/STACpopulator/implementations/NEX_GDDP_UofT/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/STACpopulator/implementations/NEX_GDDP_UofT/add_NEX-GDDP.py b/STACpopulator/implementations/NEX_GDDP_UofT/add_NEX-GDDP.py new file mode 100644 index 0000000..e69de29 diff --git a/STACpopulator/implementations/__init__.py b/STACpopulator/implementations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/STACpopulator/input.py b/STACpopulator/input.py index f59328f..25750c0 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -1,10 +1,16 @@ import logging from abc import ABC, abstractmethod -from typing import Optional +from typing import Any, Iterator, MutableMapping, Optional, Tuple +import pystac +import requests +import siphon +import xncml from colorlog import ColoredFormatter from siphon.catalog import TDSCatalog +from STACpopulator.stac_utils import numpy_to_python_datatypes, url_validate + LOGGER = logging.getLogger(__name__) LOGFORMAT = " 
%(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" formatter = ColoredFormatter(LOGFORMAT) @@ -17,7 +23,7 @@ class GenericLoader(ABC): def __init__(self) -> None: - pass + self.links = [] @abstractmethod def __iter__(self): @@ -46,21 +52,52 @@ def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> Non super().__init__() self._depth = depth if depth is not None else 1000 - if thredds_catalog_url.endswith(".html"): - thredds_catalog_url = thredds_catalog_url.replace(".html", ".xml") - LOGGER.info("Converting catalog URL from html to xml") + self.thredds_catalog_URL = self.validate_catalog_url(thredds_catalog_url) - self.thredds_catalog_URL = thredds_catalog_url self.catalog = TDSCatalog(self.thredds_catalog_URL) self.catalog_head = self.catalog + self.links.append(self.magpie_collection_link()) + + def validate_catalog_url(self, url: str) -> str: + """Validate the user-provided catalog URL. + + :param url: URL to the THREDDS catalog + :type url: str + :raises RuntimeError: if URL is invalid or contains query parameters. + :return: a valid URL + :rtype: str + """ + if url_validate(url): + if "?" in url: + raise RuntimeError("THREDDS catalog URL should not contain query parameter") + else: + raise RuntimeError("Invalid URL") + + return url.replace(".html", ".xml") if url.endswith(".html") else url + + def magpie_collection_link(self) -> pystac.Link: + """Creates a PySTAC Link for the collection that is used by Cowbird and Magpie. 
+ + :return: A PySTAC Link + :rtype: pystac.Link + """ + url = self.thredds_catalog_URL + parts = url.split("/") + i = parts.index("catalog") + # service = parts[i - 1] + path = "/".join(parts[i + 1 : -1]) + return pystac.Link(rel="source", target=url, media_type="text/xml", title=path) def reset(self): """Reset the generator.""" self.catalog_head = self.catalog - def __iter__(self): + def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" - yield from self.catalog_head.datasets.items() + if self.catalog_head.datasets.items(): + for item_name, ds in self.catalog_head.datasets.items(): + attrs = self.extract_metadata(ds) + yield item_name, attrs if self._depth > 0: for name, ref in self.catalog_head.catalog_refs.items(): @@ -68,11 +105,29 @@ def __iter__(self): self._depth -= 1 yield from self + def __getitem__(self, dataset): + return self.catalog.datasets[dataset] -class RemoteTHREDDSLoader(THREDDSLoader): - def __init__(self, thredds_catalog_url: str, depth: int | None = None) -> None: - super().__init__(thredds_catalog_url, depth) - # more stuff to follow based on needs of a concrete implementation + def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: + LOGGER.info("Requesting NcML dataset description") + url = ds.access_urls["NCML"] + r = requests.get(url) + # Convert NcML to CF-compliant dictionary + attrs = xncml.Dataset.from_text(r.content).to_cf_dict() + attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) + attrs["access_urls"] = ds.access_urls + return attrs + + +class STACLoader(GenericLoader): + def __init__(self) -> None: + super().__init__() + + def __iter__(self): + raise NotImplementedError + + def reset(self): + raise NotImplementedError class GeoServerLoader(GenericLoader): diff --git a/STACpopulator/metadata_parsers.py b/STACpopulator/metadata_parsers.py deleted file mode 100644 index 84636f8..0000000 --- 
a/STACpopulator/metadata_parsers.py +++ /dev/null @@ -1,61 +0,0 @@ -import lxml.etree -import requests - - -def nc_attrs_from_ncml(url): - """Extract attributes from NcML file. - - Parameters - ---------- - url : str - Link to NcML service of THREDDS server for a dataset. - - Returns - ------- - dict - Global attribute values keyed by facet names, with variable attributes in `__variable__` nested dict, and - additional specialized attributes in `__group__` nested dict. - """ - parser = lxml.etree.XMLParser(encoding="UTF-8") - - ns = {"ncml": "http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2"} - - # Parse XML content - UTF-8 encoded documents need to be read as bytes - xml = requests.get(url).content - doc = lxml.etree.fromstring(xml, parser=parser) - nc = doc.xpath("/ncml:netcdf", namespaces=ns)[0] - - # Extract global attributes - out = _attrib_to_dict(nc.xpath("ncml:attribute", namespaces=ns)) - - # Extract group attributes - gr = {} - for group in nc.xpath("ncml:group", namespaces=ns): - gr[group.attrib["name"]] = _attrib_to_dict(group.xpath("ncml:attribute", namespaces=ns)) - - # Extract variable attributes - va = {} - for variable in nc.xpath("ncml:variable", namespaces=ns): - if "_CoordinateAxisType" in variable.xpath("ncml:attribute/@name", namespaces=ns): - continue - va[variable.attrib["name"]] = _attrib_to_dict(variable.xpath("ncml:attribute", namespaces=ns)) - - out["__group__"] = gr - out["__variable__"] = va - - return out - - -def _attrib_to_dict(elems): - """Convert element attributes to dictionary. 
- - Ignore attributes with names starting with _ - """ - hidden_prefix = "_" - out = {} - for e in elems: - a = e.attrib - if a["name"].startswith(hidden_prefix): - continue - out[a["name"]] = a["value"] - return out diff --git a/STACpopulator/models.py b/STACpopulator/models.py new file mode 100644 index 0000000..f91dab5 --- /dev/null +++ b/STACpopulator/models.py @@ -0,0 +1,106 @@ +import datetime as dt +from typing import Any, Dict, List, Literal, Optional, Union + +from pydantic import ( + AnyHttpUrl, + AnyUrl, + BaseModel, + Field, + SerializeAsAny, + field_validator, +) + + +class Geometry(BaseModel): + type: str + coordinates: List + + +class GeoJSONPoint(Geometry): + type: Literal["Point"] + coordinates: List[float] + + +class GeoJSONMultiPoint(Geometry): + type: Literal["MultiPoint"] + coordinates: List[List[float]] + + +class GeoJSONPolygon(Geometry): + type: Literal["Polygon"] + coordinates: List[List[List[float]]] + + +class GeoJSONMultiPolygon(Geometry): + type: Literal["MultiPolygon"] + coordinates: List[List[List[List[float]]]] + + +class Asset(BaseModel): + href: AnyHttpUrl + media_type: Optional[str] = None + title: Optional[str] = None + description: Optional[str] = None + roles: Optional[List[str]] = None + + +class STACItemProperties(BaseModel): + """Base STAC Item properties data model. 
In concrete implementations, users would want to define a new + data model that inherits from this base model and extends it with properties tailored to the data they are + ingesting.""" + + start_datetime: Optional[dt.datetime] = None + end_datetime: Optional[dt.datetime] = None + datetime: Optional[dt.datetime] = None + + @field_validator("datetime", mode="before") + @classmethod + def validate_datetime(cls, v: Union[dt.datetime, str], values: Dict[str, Any]) -> dt: + if v == "null": + if not values["start_datetime"] and not values["end_datetime"]: + raise ValueError("start_datetime and end_datetime must be specified when datetime is null") + + +# class Link(BaseModel): +# """ +# https://github.com/radiantearth/stac-spec/blob/v1.0.0/collection-spec/collection-spec.md#link-object +# """ + +# href: str = Field(..., alias="href", min_length=1) +# rel: str = Field(..., alias="rel", min_length=1) +# type: Optional[str] = None +# title: Optional[str] = None +# # Label extension +# label: Optional[str] = Field(None, alias="label:assets") +# model_config = ConfigDict(use_enum_values=True) + +# def resolve(self, base_url: str) -> None: +# """resolve a link to the given base URL""" +# self.href = urljoin(base_url, self.href) + + +# class PaginationLink(Link): +# """ +# https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension +# """ + +# rel: Literal["next", "previous"] +# method: Literal["GET", "POST"] +# body: Optional[Dict[Any, Any]] = None +# merge: bool = False + + +# Links = RootModel[List[Union[PaginationLink, Link]]] + + +class STACItem(BaseModel): + """STAC Item data model.""" + + id: str = Field(..., alias="id", min_length=1) + geometry: Optional[SerializeAsAny[Geometry]] = None + bbox: Optional[List[float]] = None + properties: Optional[SerializeAsAny[STACItemProperties]] = None + assets: Dict[str, Asset] = None + stac_extensions: Optional[List[AnyUrl]] = [] + collection: Optional[str] = None + datetime: Optional[dt.datetime] = None 
# Not in the spec, but needed by pystac.Item. diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 8c6465c..f8ccb1c 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,18 +1,18 @@ -import hashlib import logging from abc import ABC, abstractmethod +from datetime import datetime +from typing import Any, MutableMapping, Optional -import yaml +import pystac from colorlog import ColoredFormatter -from STACpopulator.input import GenericLoader -from STACpopulator.stac_utils import ( - create_stac_collection, - post_collection, - stac_collection_exists, +from STACpopulator.api_requests import ( + post_stac_collection, + post_stac_item, stac_host_reachable, - url_validate, ) +from STACpopulator.input import GenericLoader +from STACpopulator.stac_utils import load_collection_configuration, url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -29,7 +29,7 @@ def __init__( self, stac_host: str, data_loader: GenericLoader, - collection_info_filename: str, + update: Optional[bool] = False, ) -> None: """Constructor @@ -37,27 +37,20 @@ def __init__( :type stac_host: str :param data_loader: A concrete implementation of the GenericLoader abstract base class :type data_loader: GenericLoader - :param collection_info_filename: Yaml file containing the information about the collection to populate - :type collection_info_filename: str :raises RuntimeError: Raised if one of the required definitions is not found in the collection info filename """ super().__init__() - with open(collection_info_filename) as f: - self._collection_info = yaml.load(f, yaml.Loader) - - req_definitions = ["title", "description", "keywords", "license"] - for req in req_definitions: - if req not in self._collection_info.keys(): - LOGGER.error(f"'{req}' is required in the configuration file") - raise RuntimeError(f"'{req}' is required in the 
configuration file") + self._collection_info = load_collection_configuration() self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) + self.update = update - self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest() + self._collection_id = self.collection_name LOGGER.info("Initialization complete") LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}") + self.create_stac_collection() @property def collection_name(self) -> str: @@ -69,36 +62,62 @@ def stac_host(self) -> str: @property def collection_id(self) -> str: - return self._collection_id + return self._collection_info["id"] + + @property + @abstractmethod + def item_properties_model(self): + """In derived classes, this property should be defined as a pydantic data model that derives from + models.STACItemProperties.""" + raise NotImplementedError + + @property + @abstractmethod + def item_geometry_model(self): + """In derived classes, this property should be defined as a pydantic data model that derives from + models.Geometry.""" + raise NotImplementedError + + @abstractmethod + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + raise NotImplementedError def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): raise ValueError("stac_host URL is not appropriately formatted") if not stac_host_reachable(stac_host): - raise ValueError("stac_host is not reachable") + raise RuntimeError("stac_host is not reachable") return stac_host - def ingest(self) -> None: - # First create collection if it doesn't exist - if not stac_collection_exists(self.stac_host, self.collection_id): - LOGGER.info(f"Creating collection '{self.collection_name}'") - pystac_collection = create_stac_collection(self.collection_id, self._collection_info) - post_collection(self.stac_host, pystac_collection) - LOGGER.info("Collection successfully created") - else: - 
LOGGER.info(f"Collection '{self.collection_name}' already exists") - # for item in self.crawler(self.catalog, **self._crawler_args): - # stac_item = self.process_STAC_item(item) - # self.post_item(stac_item) - - def post_item(self, data: dict[str, dict]) -> None: - pass + def create_stac_collection(self) -> None: + """ + Create a basic STAC collection. - @abstractmethod - def process_stac_item(self): # noqa N802 - pass + Returns the collection. + """ + LOGGER.info(f"Creating collection '{self.collection_name}'") + sp_extent = pystac.SpatialExtent([self._collection_info.pop("spatialextent")]) + tmp = self._collection_info.pop("temporalextent") + tmp_extent = pystac.TemporalExtent( + [ + [ + datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, + datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, + ] + ] + ) + self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) + self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) + collection = pystac.Collection(**self._collection_info) + + collection.add_links(self._ingest_pipeline.links) + + post_stac_collection(self.stac_host, collection.to_dict(), self.update) - @abstractmethod - def validate_stac_item_cv(self): # noqa N802 - pass + def ingest(self) -> None: + LOGGER.info("Data ingestion") + for item_name, item_data in self._ingest_pipeline: + LOGGER.info(f"Creating STAC representation for {item_name}") + stac_item = self.create_stac_item(item_name, item_data) + post_stac_item(self.stac_host, self.collection_id, item_name, stac_item, self.update) diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 743f53a..c245ed1 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,10 +1,26 @@ +import datetime +import json +import logging import os import re -from datetime import datetime -from typing import Any +import sys +from typing import Any, Literal, MutableMapping +import numpy as np import 
pystac -import requests +import yaml +from colorlog import ColoredFormatter + +from STACpopulator.models import STACItem + +LOGGER = logging.getLogger(__name__) +LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" +formatter = ColoredFormatter(LOGFORMAT) +stream = logging.StreamHandler() +stream.setFormatter(formatter) +LOGGER.addHandler(stream) +LOGGER.setLevel(logging.INFO) +LOGGER.propagate = False def url_validate(target: str) -> bool: @@ -31,69 +47,183 @@ def url_validate(target: str) -> bool: return True if re.match(url_regex, target) else False -def stac_host_reachable(url: str) -> bool: - try: - registry = requests.get(url) - registry.raise_for_status() - return True - except (requests.exceptions.RequestException, requests.exceptions.ConnectionError): - return False - +def load_collection_configuration() -> MutableMapping[str, Any]: + """Reads details of the STAC Collection to be created from a configuration file. the + code expects a "collection_config.yml" file to be present in the app directory. -def stac_collection_exists(stac_host: str, collection_id: str) -> bool: + :raises RuntimeError: If the configuration file is not present + :raises RuntimeError: If required values are not present in the configuration file + :return: A python dictionary describing the details of the Collection + :rtype: MutableMapping[str, Any] """ - Get a STAC collection + collection_info_filename = "collection_config.yml" + app_directory = os.path.dirname(sys.argv[0]) - Returns the collection JSON. 
- """ - r = requests.get(os.path.join(stac_host, "collections", collection_id), verify=False) + if not os.path.exists(os.path.join(app_directory, collection_info_filename)): + raise RuntimeError(f"Missing {collection_info_filename} file for this implementation") - return r.status_code == 200 + with open(os.path.join(app_directory, collection_info_filename)) as f: + collection_info = yaml.load(f, yaml.Loader) + req_definitions = ["title", "id", "description", "keywords", "license"] + for req in req_definitions: + if req not in collection_info.keys(): + LOGGER.error(f"'{req}' is required in the configuration file") + raise RuntimeError(f"'{req}' is required in the configuration file") -def create_stac_collection(collection_id: str, collection_info: dict[str, Any]) -> dict[str, Any]: - """ - Create a basic STAC collection. + return collection_info - Returns the collection. - """ - sp_extent = pystac.SpatialExtent([collection_info.pop("spatialextent")]) - tmp = collection_info.pop("temporalextent") - tmp_extent = pystac.TemporalExtent( - [ +def collection2literal(collection): + terms = tuple(term.label for term in collection) + return Literal[terms] + + +def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + """Create Polygon geometry from CFMetadata.""" + attrs = attrs["groups"]["CFMetadata"]["attributes"] + return { + "type": "Polygon", + "coordinates": [ [ - datetime.strptime(tmp[0], "%Y-%m-%d") if tmp[0] is not None else None, - datetime.strptime(tmp[1], "%Y-%m-%d") if tmp[1] is not None else None, + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), + ], + [ + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_min"][0]), + ], + [ + float(attrs["geospatial_lon_min"][0]), + 
float(attrs["geospatial_lat_min"][0]), + ], ] - ] + ], + } + + +def ncattrs_to_bbox(attrs: MutableMapping[str, Any]) -> list[float]: + """Create BBOX from CFMetadata.""" + attrs = attrs["groups"]["CFMetadata"]["attributes"] + return [ + float(attrs["geospatial_lon_min"][0]), + float(attrs["geospatial_lat_min"][0]), + float(attrs["geospatial_lon_max"][0]), + float(attrs["geospatial_lat_max"][0]), + ] + + +def numpy_to_python_datatypes(data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + # Converting numpy datatypes to python standard datatypes + for key, value in data.items(): + if isinstance(value, list): + newlist = [] + for item in value: + if issubclass(type(item), np.integer): + newlist.append(int(item)) + elif issubclass(type(item), np.floating): + newlist.append(float(item)) + else: + newlist.append(item) + data[key] = newlist + elif isinstance(type(value), np.integer): + data[key] = int(value) + + return data + + +def magpie_resource_link(url: str) -> pystac.Link: + """Creates a link that will be used by Cowbird to create a resource in Magpie + associated with the STAC item. + + :param url: HTTPServer access URL for a STAC item + :type url: str + :return: A PySTAC Link + :rtype: pystac.Link + """ + url_ = url.replace("fileServer", "*") + i = url_.find("*") + title = url_[i + 2 :] + link = pystac.Link(rel="source", title=title, target=url, media_type="application/x-netcdf") + return link + + +def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_props_datamodel, item_geometry_model): + """ + Create STAC Item from CF JSON metadata. + + Parameters + ---------- + iid : str + Unique item ID. + attrs: dict + CF JSON metadata returned by `xncml.Dataset.to_cf_dict`. + item_props_datamodel : pydantic.BaseModel + Data model describing the properties of the STAC item. + item_geometry_model : pydantic.BaseModel + Data model describing the geometry of the STAC item. 
+ """ + + cfmeta = attrs["groups"]["CFMetadata"]["attributes"] + + # Create pydantic STAC item + item = STACItem( + id=iid, + geometry=item_geometry_model(**ncattrs_to_geometry(attrs)), + bbox=ncattrs_to_bbox(attrs), + properties=item_props_datamodel( + start_datetime=cfmeta["time_coverage_start"], + end_datetime=cfmeta["time_coverage_end"], + **attrs["attributes"], + ), + datetime=None, ) - collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) - collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) - collection = pystac.Collection(id=collection_id, **collection_info) + # Convert pydantic STAC item to a PySTAC Item + item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - return collection.to_dict() + root = attrs["access_urls"] + for name, url in root.items(): + name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str + asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) -def post_collection(stac_host: str, json_data: dict[str, Any]) -> None: - """ - Post a STAC collection. + item.add_asset(name, asset) - Returns the collection id. 
- """ - collection_id = json_data["id"] - r = requests.post(os.path.join(stac_host, "collections"), json=json_data, verify=False) - - if r.status_code == 200: - print( - f"{bcolors.OKGREEN}[INFO] Pushed STAC collection [{collection_id}] to [{stac_host}] ({r.status_code}){bcolors.ENDC}" - ) - elif r.status_code == 409: - print( - f"{bcolors.WARNING}[INFO] STAC collection [{collection_id}] already exists on [{stac_host}] ({r.status_code}), updating..{bcolors.ENDC}" - ) - r = requests.put(os.path.join(stac_host, "collections"), json=json_data, verify=False) - r.raise_for_status() - else: - r.raise_for_status() + item.add_link(magpie_resource_link(root["HTTPServer"])) + + return item + + +asset_name_remaps = { + "httpserver_service": "HTTPServer", + "opendap_service": "OPENDAP", + "wcs_service": "WCS", + "wms_service": "WMS", + "nccs_service": "NetcdfSubset", +} + +media_types = { + "HTTPServer": "application/x-netcdf", + "OPENDAP": pystac.MediaType.HTML, + "WCS": pystac.MediaType.XML, + "WMS": pystac.MediaType.XML, + "NetcdfSubset": "application/x-netcdf", +} + +asset_roles = { + "HTTPServer": ["data"], + "OPENDAP": ["data"], + "WCS": ["data"], + "WMS": ["visual"], + "NetcdfSubset": ["data"], +} diff --git a/docker-compose.yml b/docker-compose.yml index 23ffae5..0fc7d25 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,3 +1,5 @@ +version: "3.4" + x-logging: &default-logging driver: "json-file" options: @@ -13,7 +15,7 @@ services: ports: - "8880:8000" environment: - - POSTGRES_USER=dchandan + - POSTGRES_USER=testuser - POSTGRES_PASS=password - POSTGRES_DBNAME=postgis - POSTGRES_HOST_READER=stac-db @@ -30,7 +32,7 @@ services: - POSTGRES_USER=testuser - POSTGRES_PASSWORD=password - POSTGRES_DB=postgis - - PGUSER=dchandan + - PGUSER=testuser - PGPASSWORD=password - PGHOST=localhost - PGDATABASE=postgis diff --git a/implementations/CMIP6-UofT/add_CMIP6.py b/implementations/CMIP6-UofT/add_CMIP6.py deleted file mode 100644 index 349540d..0000000 --- 
a/implementations/CMIP6-UofT/add_CMIP6.py +++ /dev/null @@ -1,60 +0,0 @@ -import argparse -import logging - -from colorlog import ColoredFormatter - -from STACpopulator import STACpopulatorBase -from STACpopulator.input import THREDDSLoader - -# from STACpopulator.metadata_parsers import nc_attrs_from_ncml - -LOGGER = logging.getLogger(__name__) -LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" -formatter = ColoredFormatter(LOGFORMAT) -stream = logging.StreamHandler() -stream.setFormatter(formatter) -LOGGER.addHandler(stream) -LOGGER.setLevel(logging.INFO) -LOGGER.propagate = False - - -class CMIP6populator(STACpopulatorBase): - def __init__( - self, - stac_host: str, - thredds_catalog_url: str, - config_filename: str, - ) -> None: - """Constructor - - :param stac_host: URL to the STAC API - :type stac_host: str - :param thredds_catalog_url: the URL to the THREDDS catalog to ingest - :type thredds_catalog_url: str - :param config_filename: Yaml file containing the information about the collection to populate - :type config_filename: str - """ - data_loader = THREDDSLoader(thredds_catalog_url) - for item in data_loader: - print(item) - super().__init__(stac_host, data_loader, config_filename) - - def process_stac_item(self): # noqa N802 - # TODO: next step is to implement this - print("here") - - def validate_stac_item_cv(self): - # TODO: next step is to implement this - pass - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(prog="CMIP6 STAC populator") - parser.add_argument("stac_host", type=str, help="STAC API address") - parser.add_argument("thredds_catalog_URL", type=str, help="URL to the CMIP6 THREDDS catalog") - parser.add_argument("config_file", type=str, help="Name of the configuration file") - - args = parser.parse_args() - LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.config_file) - c.ingest() diff --git a/pyproject.toml 
b/pyproject.toml index 1c94eaf..dc08b7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,10 @@ dependencies = [ "colorlog", "pyyaml", "siphon", - "pystac" + "pystac", + "xncml", + "pydantic", + "pyessv" ] [tool.setuptools] diff --git a/tests/data/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml b/tests/data/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml new file mode 100644 index 0000000..6aa0ae6 --- /dev/null +++ b/tests/data/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.xml @@ -0,0 +1,183 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/ref.json b/tests/ref.json new file mode 100644 index 0000000..f3b8c23 --- /dev/null +++ b/tests/ref.json @@ -0,0 +1,124 @@ +{ + "type": "Feature", + "stac_version": "1.0.0", + "id": "ScenarioMIP_CCCma_CanESM5_ssp245_r13i1p2f1_SImon_siconc_gn", + "properties": { + "start_datetime": "2019-12-06T12:00:00Z", + "end_datetime": "2020-11-04T12:00:00Z", + "datetime": null, + "cmip6:Conventions": "CF-1.7 CMIP-6.2", + "cmip6:activity_id": "ScenarioMIP", + "cmip6:creation_date": "2019-09-25T23:01:33Z", + "cmip6:data_specs_version": "01.00.30", + "cmip6:experiment": "update of RCP4.5 based on SSP2", + "cmip6:experiment_id": "ssp245", + "cmip6:frequency": "mon", + "cmip6:further_info_url": "https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1", + "cmip6:grid_label": "gn", + "cmip6:institution": "Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada", + "cmip6:institution_id": "CCCma", + "cmip6:nominal_resolution": "100 km", + "cmip6:realm": [ + "seaIce" + ], + "cmip6:source": 
"CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\nseaIce: LIM2", + "cmip6:source_id": "CanESM5", + "cmip6:source_type": [ + "AOGCM" + ], + "cmip6:sub_experiment": "none", + "cmip6:sub_experiment_id": "none", + "cmip6:table_id": "SImon", + "cmip6:variable_id": "siconc", + "cmip6:variant_label": "r13i1p2f1", + "cmip6:initialization_index": 1, + "cmip6:physics_index": 2, + "cmip6:realization_index": 13, + "cmip6:forcing_index": 1, + "cmip6:tracking_id": "hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95", + "cmip6:version": "v20190429", + "cmip6:product": "model-output", + "cmip6:license": "CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. 
All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.", + "cmip6:grid": "ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m", + "cmip6:mip_era": "CMIP6" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + 0.049800001084804535, + -78.39350128173828 + ], + [ + 0.049800001084804535, + 89.74176788330078 + ], + [ + 359.99493408203125, + 89.74176788330078 + ], + [ + 359.99493408203125, + -78.39350128173828 + ], + [ + 0.049800001084804535, + -78.39350128173828 + ] + ] + ] + }, + "links": [ + { + "rel": "source", + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "title": "birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + } + ], + "assets": { + "HTTPServer": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + }, + "OPENDAP": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "text/html", + "roles": [ + "data" + ] + }, + "WCS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WCS&version=1.0.0&request=GetCapabilities", + "type": "application/xml", + "roles": [ + "data" + ] + }, + "WMS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WMS&version=1.3.0&request=GetCapabilities", + 
"type": "application/xml", + "roles": [ + "visual" + ] + }, + "NetcdfSubset": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc/dataset.html", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + } + }, + "bbox": [ + 0.049800001084804535, + -78.39350128173828, + 359.99493408203125, + 89.74176788330078 + ], + "stac_extensions": [] +} \ No newline at end of file diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py new file mode 100644 index 0000000..d7239a8 --- /dev/null +++ b/tests/test_standalone_stac_item.py @@ -0,0 +1,30 @@ +import json + +import requests +import xncml + +from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import ( + CMIP6ItemProperties, + make_cmip6_item_id, +) +from STACpopulator.models import GeoJSONPolygon +from STACpopulator.stac_utils import STAC_item_from_metadata + + +def test_standalone_stac_item(): + url = ( + "https://pavics.ouranos.ca/twitcher/ows/proxy/" + "thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + "?catalog=https%3A%2F%2Fpavics.ouranos.ca%2Ftwitcher%2Fows%2Fproxy%2F" + "thredds%2Fcatalog%2Fbirdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fcatalog.html" + "&dataset=birdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fsic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + ) + + attrs = xncml.Dataset.from_text(requests.get(url).content).to_cf_dict() + stac_item_id = make_cmip6_item_id(attrs["attributes"]) + stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) + + with open("tests/ref.json", "r") as ff: + reference = json.load(ff) + + assert stac_item.to_dict() == reference