diff --git a/Makefile b/Makefile index b08e1b8..439f93e 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ testcmip6: python $(IMP_DIR)/CMIP6_UofT/add_CMIP6.py $(STAC_HOST) https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html delcmip6: - curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6' + curl --location --request DELETE '$(STAC_HOST)/collections/CMIP6_UofT' @echo "" starthost: diff --git a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py index 56bf4e6..fc39baf 100644 --- a/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py +++ b/STACpopulator/implementations/CMIP6_UofT/add_CMIP6.py @@ -10,7 +10,7 @@ from STACpopulator import STACpopulatorBase from STACpopulator.implementations.CMIP6_UofT.extensions import DataCubeHelper -from STACpopulator.input import THREDDSLoader +from STACpopulator.input import GenericLoader, THREDDSLoader from STACpopulator.models import GeoJSONPolygon, STACItemProperties from STACpopulator.stac_utils import STAC_item_from_metadata, collection2literal @@ -122,7 +122,7 @@ class CMIP6populator(STACpopulatorBase): item_properties_model = CMIP6ItemProperties item_geometry_model = GeoJSONPolygon - def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bool] = False) -> None: + def __init__(self, stac_host: str, data_loader: GenericLoader, update: Optional[bool] = False) -> None: """Constructor :param stac_host: URL to the STAC API @@ -130,13 +130,8 @@ def __init__(self, stac_host: str, thredds_catalog_url: str, update: Optional[bo :param thredds_catalog_url: the URL to the THREDDS catalog to ingest :type thredds_catalog_url: str """ - data_loader = THREDDSLoader(thredds_catalog_url) - super().__init__(stac_host, data_loader, update) - def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): - pass - def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """Creates the STAC item. @@ -172,5 +167,14 @@ def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) args = parser.parse_args() LOGGER.info(f"Arguments to call: {args}") - c = CMIP6populator(args.stac_host, args.thredds_catalog_URL, args.update) + + mode = "full" + + if mode == "full": + data_loader = THREDDSLoader(args.thredds_catalog_URL) + else: + # To be implemented + data_loader = ErrorLoader(args.error_file) + + c = CMIP6populator(args.stac_host, data_loader, args.update) c.ingest() diff --git a/STACpopulator/implementations/CMIP6_UofT/collection_config.yml b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml index a57875b..0f43c78 100644 --- a/STACpopulator/implementations/CMIP6_UofT/collection_config.yml +++ b/STACpopulator/implementations/CMIP6_UofT/collection_config.yml @@ -1,4 +1,5 @@ title: CMIP6 +id: CMIP6_UofT description: Coupled Model Intercomparison Project phase 6 keywords: ['CMIP', 'CMIP6', 'WCRP', 'Climate Change'] license: "CC-BY-4.0" diff --git a/STACpopulator/input.py b/STACpopulator/input.py index be67ede..272f9ad 100644 --- a/STACpopulator/input.py +++ b/STACpopulator/input.py @@ -4,12 +4,11 @@ import pystac import requests -import siphon import xncml from colorlog import ColoredFormatter from siphon.catalog import TDSCatalog -from STACpopulator.stac_utils import numpy_to_python_datatypes +from STACpopulator.stac_utils import numpy_to_python_datatypes, url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -52,23 +51,41 @@ def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> Non super().__init__() self._depth = depth if depth is not None else 1000 - if thredds_catalog_url.endswith(".html"): - thredds_catalog_url = thredds_catalog_url.replace(".html", ".xml") - LOGGER.info("Converting catalog URL from html to xml") + self.thredds_catalog_URL = self.validate_catalog_url(thredds_catalog_url) - self.thredds_catalog_URL = thredds_catalog_url self.catalog = TDSCatalog(self.thredds_catalog_URL) self.catalog_head = self.catalog self.links.append(self.magpie_collection_link()) - def magpie_collection_link(self): - """Return Link to THREDDS catalog.""" + def validate_catalog_url(self, url: str) -> str: + """Validate the user-provided catalog URL. + + :param url: URL to the THREDDS catalog + :type url: str + :raises RuntimeError: if URL is invalid or contains query parameters. + :return: a valid URL + :rtype: str + """ + if url_validate(url): + if "?" in url: + raise RuntimeError("THREDDS catalog URL should not contain query parameter") + else: + raise RuntimeError("Invalid URL") + + return url.replace(".html", ".xml") if url.endswith(".html") else url + + def magpie_collection_link(self) -> pystac.Link: + """Creates a PySTAC Link for the collection that is used by Cowbird and Magpie. + + :return: A PySTAC Link + :rtype: pystac.Link + """ url = self.thredds_catalog_URL parts = url.split("/") i = parts.index("catalog") - service = parts[i - 1] + # service = parts[i - 1] path = "/".join(parts[i + 1 : -1]) - return pystac.Link(rel="source", target=url, media_type="text/xml", title=f"{service}:{path}") + return pystac.Link(rel="source", target=url, media_type="text/xml", title=path) def reset(self): """Reset the generator.""" @@ -76,40 +93,23 @@ def reset(self): def __iter__(self) -> Iterator[Tuple[str, MutableMapping[str, Any]]]: """Return a generator walking a THREDDS data catalog for datasets.""" - # print(f"At START catalog head is: {self.catalog_head}") - print(self.catalog_head.__dict__) if self.catalog_head.datasets.items(): for item_name, ds in self.catalog_head.datasets.items(): - attrs = self.extract_metadata(ds) + attrs = self.extract_metadata(ds.access_urls["NCML"], self.catalog_head.catalog_url, ds.url_path) yield item_name, attrs if self._depth > 0: for name, ref in self.catalog_head.catalog_refs.items(): self.catalog_head = ref.follow() - print(f"catalog head is: {self.catalog_head}") self._depth -= 1 yield from self - def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, Any]: - # Get URL for NCML service - url = ds.access_urls["NCML"] - - print(url) - # print(self.catalog_head) - print(f"ds = {ds}") - print(ds.__dict__) - print(self.catalog_head.catalog_url) + def extract_metadata(self, ncml_url: str, catalog_url: str, dataset_path: str) -> MutableMapping[str, Any]: LOGGER.info("Requesting NcML dataset description") - # r = requests.get(url) - r = requests.get(url, params={"catalog": self.catalog_head, "dataset": ds}) - + r = requests.get(ncml_url, params={"catalog": catalog_url, "dataset": dataset_path}) # Convert NcML to CF-compliant dictionary attrs = xncml.Dataset.from_text(r.content).to_cf_dict() - attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) - - attrs["access_urls"] = ds.access_urls - return attrs diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index 4e75cb1..e6b795d 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -1,12 +1,9 @@ import logging -import os -import sys from abc import ABC, abstractmethod from datetime import datetime from typing import Any, MutableMapping, Optional import pystac -import yaml from colorlog import ColoredFormatter from STACpopulator.api_requests import ( @@ -15,7 +12,7 @@ stac_host_reachable, ) from STACpopulator.input import GenericLoader -from STACpopulator.stac_utils import url_validate +from STACpopulator.stac_utils import load_collection_configuration, url_validate LOGGER = logging.getLogger(__name__) LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" @@ -44,20 +41,7 @@ def __init__( """ super().__init__() - self._collection_info_filename = "collection_config.yml" - self._app_directory = os.path.dirname(sys.argv[0]) - - if not os.path.exists(os.path.join(self._app_directory, self._collection_info_filename)): - raise RuntimeError(f"Missing {self._collection_info_filename} file for this implementation") - - with open(os.path.join(self._app_directory, self._collection_info_filename)) as f: - self._collection_info = yaml.load(f, yaml.Loader) - - req_definitions = ["title", "description", "keywords", "license"] - for req in req_definitions: - if req not in self._collection_info.keys(): - LOGGER.error(f"'{req}' is required in the configuration file") - raise RuntimeError(f"'{req}' is required in the configuration file") + self._collection_info = load_collection_configuration() self._ingest_pipeline = data_loader self._stac_host = self.validate_host(stac_host) @@ -78,7 +62,7 @@ def stac_host(self) -> str: @property def collection_id(self) -> str: - return self._collection_id + return self._collection_info["id"] @property @abstractmethod @@ -87,15 +71,26 @@ def item_properties_model(self): models.STACItemProperties.""" pass + @property + @abstractmethod + def item_geometry_model(self): + """In derived classes, this property should be defined as a pydantic data model that derives from + models.STACItemProperties.""" + pass + + @abstractmethod + def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: + pass + def validate_host(self, stac_host: str) -> str: if not url_validate(stac_host): raise ValueError("stac_host URL is not appropriately formatted") if not stac_host_reachable(stac_host): - raise ValueError("stac_host is not reachable") + raise RuntimeError("stac_host is not reachable") return stac_host - def create_stac_collection(self): + def create_stac_collection(self) -> None: """ Create a basic STAC collection. @@ -114,8 +109,7 @@ def create_stac_collection(self): ) self._collection_info["extent"] = pystac.Extent(sp_extent, tmp_extent) self._collection_info["summaries"] = pystac.Summaries({"needs_summaries_update": ["true"]}) - - collection = pystac.Collection(id=self.collection_id, **self._collection_info) + collection = pystac.Collection(**self._collection_info) collection.add_links(self._ingest_pipeline.links) @@ -127,16 +121,3 @@ def ingest(self) -> None: LOGGER.info(f"Creating STAC representation for {item_name}") stac_item = self.create_stac_item(item_name, item_data) post_stac_item(self.stac_host, self.collection_id, item_name, stac_item, self.update) - # try: - # pass - # except Exception: - # LOGGER.error(f"Failed adding STAC item {item_name}") - # self.handle_ingestion_error("Posting Error", item_name, item_data) - - @abstractmethod - def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]): - pass - - @abstractmethod - def create_stac_item(self, item_name: str, item_data: MutableMapping[str, Any]) -> MutableMapping[str, Any]: - pass diff --git a/STACpopulator/stac_utils.py b/STACpopulator/stac_utils.py index 62b795f..d3786e1 100644 --- a/STACpopulator/stac_utils.py +++ b/STACpopulator/stac_utils.py @@ -1,12 +1,27 @@ +import datetime import json +import logging +import os import re +import sys from typing import Any, Literal, MutableMapping import numpy as np import pystac +import yaml +from colorlog import ColoredFormatter from STACpopulator.models import STACItem +LOGGER = logging.getLogger(__name__) +LOGFORMAT = " %(log_color)s%(levelname)s:%(reset)s %(blue)s[%(name)-30s]%(reset)s %(message)s" +formatter = ColoredFormatter(LOGFORMAT) +stream = logging.StreamHandler() +stream.setFormatter(formatter) +LOGGER.addHandler(stream) +LOGGER.setLevel(logging.INFO) +LOGGER.propagate = False + def url_validate(target: str) -> bool: """Validate whether a supplied URL is reliably written. @@ -32,6 +47,33 @@ def url_validate(target: str) -> bool: return True if re.match(url_regex, target) else False +def load_collection_configuration() -> MutableMapping[str, Any]: + """Reads details of the STAC Collection to be created from a configuration file. the + code expects a "collection_config.yml" file to be present in the app directory. + + :raises RuntimeError: If the configuration file is not present + :raises RuntimeError: If required values are not present in the configuration file + :return: A python dictionary describing the details of the Collection + :rtype: MutableMapping[str, Any] + """ + collection_info_filename = "collection_config.yml" + app_directory = os.path.dirname(sys.argv[0]) + + if not os.path.exists(os.path.join(app_directory, collection_info_filename)): + raise RuntimeError(f"Missing {collection_info_filename} file for this implementation") + + with open(os.path.join(app_directory, collection_info_filename)) as f: + collection_info = yaml.load(f, yaml.Loader) + + req_definitions = ["title", "id", "description", "keywords", "license"] + for req in req_definitions: + if req not in collection_info.keys(): + LOGGER.error(f"'{req}' is required in the configuration file") + raise RuntimeError(f"'{req}' is required in the configuration file") + + return collection_info + + def collection2literal(collection): terms = tuple(term.label for term in collection) return Literal[terms] @@ -149,40 +191,34 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop # Convert pydantic STAC item to a PySTAC Item item = pystac.Item(**json.loads(item.model_dump_json(by_alias=True))) - # Add assets - if "access_urls" in attrs: - print("access_urls") - root = attrs["access_urls"] - elif "THREDDSMetadata" in attrs["groups"]: - print("THREDDSMetadata") - root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] - else: - root = {} + root = attrs["groups"]["THREDDSMetadata"]["groups"]["services"]["attributes"] for name, url in root.items(): name = str(name) # converting name from siphon.catalog.CaseInsensitiveStr to str asset = pystac.Asset(href=url, media_type=media_types.get(name), roles=asset_roles.get(name)) + + name = asset_name_remaps[name] if name in asset_name_remaps.keys() else name item.add_asset(name, asset) - # if root: - # item.add_link(magpie_resource_link(root["HTTPServer"])) + item.add_link(magpie_resource_link(root["httpserver_service"])) return item +asset_name_remaps = { + "httpserver_service": "HTTPServer", + "opendap_service": "OPENDAP", + "wcs_service": "WCS", + "wms_service": "WMS", + "nccs_service": "NetcdfSubset", +} + media_types = { "httpserver_service": "application/x-netcdf", "opendap_service": pystac.MediaType.HTML, "wcs_service": pystac.MediaType.XML, "wms_service": pystac.MediaType.XML, "nccs_service": "application/x-netcdf", - "HTTPServer": "application/x-netcdf", - "OPENDAP": pystac.MediaType.HTML, - "NCML": pystac.MediaType.XML, - "WCS": pystac.MediaType.XML, - "ISO": pystac.MediaType.XML, - "WMS": pystac.MediaType.XML, - "NetcdfSubset": "application/x-netcdf", } asset_roles = { @@ -191,11 +227,4 @@ def STAC_item_from_metadata(iid: str, attrs: MutableMapping[str, Any], item_prop "wcs_service": ["data"], "wms_service": ["visual"], "nccs_service": ["data"], - "HTTPServer": ["data"], - "OPENDAP": ["data"], - "NCML": ["metadata"], - "WCS": ["data"], - "ISO": ["metadata"], - "WMS": ["visual"], - "NetcdfSubset": ["data"], } diff --git a/tests/ref.txt b/tests/ref.txt new file mode 100644 index 0000000..f3b8c23 --- /dev/null +++ b/tests/ref.txt @@ -0,0 +1,124 @@ +{ + "type": "Feature", + "stac_version": "1.0.0", + "id": "ScenarioMIP_CCCma_CanESM5_ssp245_r13i1p2f1_SImon_siconc_gn", + "properties": { + "start_datetime": "2019-12-06T12:00:00Z", + "end_datetime": "2020-11-04T12:00:00Z", + "datetime": null, + "cmip6:Conventions": "CF-1.7 CMIP-6.2", + "cmip6:activity_id": "ScenarioMIP", + "cmip6:creation_date": "2019-09-25T23:01:33Z", + "cmip6:data_specs_version": "01.00.30", + "cmip6:experiment": "update of RCP4.5 based on SSP2", + "cmip6:experiment_id": "ssp245", + "cmip6:frequency": "mon", + "cmip6:further_info_url": "https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1", + "cmip6:grid_label": "gn", + "cmip6:institution": "Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada", + "cmip6:institution_id": "CCCma", + "cmip6:nominal_resolution": "100 km", + "cmip6:realm": [ + "seaIce" + ], + "cmip6:source": "CanESM5 (2019): \naerosol: interactive\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\natmosChem: specified oxidants for aerosols\nland: CLASS3.6/CTEM1.2\nlandIce: specified ice sheets\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\nseaIce: LIM2", + "cmip6:source_id": "CanESM5", + "cmip6:source_type": [ + "AOGCM" + ], + "cmip6:sub_experiment": "none", + "cmip6:sub_experiment_id": "none", + "cmip6:table_id": "SImon", + "cmip6:variable_id": "siconc", + "cmip6:variant_label": "r13i1p2f1", + "cmip6:initialization_index": 1, + "cmip6:physics_index": 2, + "cmip6:realization_index": 13, + "cmip6:forcing_index": 1, + "cmip6:tracking_id": "hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95", + "cmip6:version": "v20190429", + "cmip6:product": "model-output", + "cmip6:license": "CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.", + "cmip6:grid": "ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m", + "cmip6:mip_era": "CMIP6" + }, + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + 0.049800001084804535, + -78.39350128173828 + ], + [ + 0.049800001084804535, + 89.74176788330078 + ], + [ + 359.99493408203125, + 89.74176788330078 + ], + [ + 359.99493408203125, + -78.39350128173828 + ], + [ + 0.049800001084804535, + -78.39350128173828 + ] + ] + ] + }, + "links": [ + { + "rel": "source", + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "title": "birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + } + ], + "assets": { + "HTTPServer": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + }, + "OPENDAP": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc", + "type": "text/html", + "roles": [ + "data" + ] + }, + "WCS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WCS&version=1.0.0&request=GetCapabilities", + "type": "application/xml", + "roles": [ + "data" + ] + }, + "WMS": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WMS&version=1.3.0&request=GetCapabilities", + "type": "application/xml", + "roles": [ + "visual" + ] + }, + "NetcdfSubset": { + "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc/dataset.html", + "type": "application/x-netcdf", + "roles": [ + "data" + ] + } + }, + "bbox": [ + 0.049800001084804535, + -78.39350128173828, + 359.99493408203125, + 89.74176788330078 + ], + "stac_extensions": [] +} \ No newline at end of file diff --git a/tests/test_standalone_stac_item.py b/tests/test_standalone_stac_item.py new file mode 100644 index 0000000..f0dc3c8 --- /dev/null +++ b/tests/test_standalone_stac_item.py @@ -0,0 +1,30 @@ +import json + +import requests +import xncml + +from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import ( + CMIP6ItemProperties, + make_cmip6_item_id, +) +from STACpopulator.models import GeoJSONPolygon +from STACpopulator.stac_utils import STAC_item_from_metadata + + +def test_standalone_stac_item(): + url = ( + "https://pavics.ouranos.ca/twitcher/ows/proxy/" + "thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + "?catalog=https%3A%2F%2Fpavics.ouranos.ca%2Ftwitcher%2Fows%2Fproxy%2F" + "thredds%2Fcatalog%2Fbirdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fcatalog.html" + "&dataset=birdhouse%2Ftestdata%2Fxclim%2Fcmip6%2Fsic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc" + ) + + attrs = xncml.Dataset.from_text(requests.get(url).content).to_cf_dict() + stac_item_id = make_cmip6_item_id(attrs["attributes"]) + stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon) + + with open("tests/ref.txt", "r") as ff: + reference = json.load(ff) + + assert stac_item.to_dict() == reference