From 9da080bbe912dc54654e5bab9d2b29880f8d027b Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Thu, 19 Oct 2023 17:30:40 -0400 Subject: [PATCH 1/6] [WIP] trying to make STACpopulator work with 'arch-finalization-proposal' branch --- Makefile | 3 +- notebooks/ncml2stac.ipynb | 470 +++++++++++--------------------------- requirements.txt | 2 +- 3 files changed, 140 insertions(+), 335 deletions(-) diff --git a/Makefile b/Makefile index d996aed..5c0a603 100644 --- a/Makefile +++ b/Makefile @@ -71,7 +71,8 @@ PIP_USE_FEATURE := `python -c '\ except ImportError: \ from distutils.version import LooseVersion as Version \ print(Version(pip.__version__) < Version("21.0"))'` -PIP_XARGS ?= +# when a repository must be cloned locally to build/install it, (w)ipe if path conflicts +PIP_XARGS ?= --exists-action=w ifeq ("$(PIP_USE_FEATURE)", "True") PIP_XARGS := --use-feature=2020-resolver $(PIP_XARGS) endif diff --git a/notebooks/ncml2stac.ipynb b/notebooks/ncml2stac.ipynb index cb65f15..ca4fb8f 100644 --- a/notebooks/ncml2stac.ipynb +++ b/notebooks/ncml2stac.ipynb @@ -38,9 +38,11 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 1, "outputs": [], "source": [ + "import os.path\n", + "\n", "# NOTE:\n", "# If using indented code block here (eg: 'if TYPE_CHECKING:'),\n", "# it is important to have other things than 'ipython2cwl' imports.\n", @@ -71,8 +73,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-09-29T22:45:33.099193431Z", - "start_time": "2023-09-29T22:45:33.014184569Z" + "end_time": "2023-10-19T21:24:19.361222561Z", + "start_time": "2023-10-19T21:24:19.358206330Z" } }, "id": "61f43c81dc3aa6c2" @@ -94,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 2, "outputs": [ { "name": "stdout", @@ -105,7 +107,7 @@ "remote: Counting objects: 100% (1557/1557), done.\u001B[K\r\n", "remote: Compressing objects: 100% (476/476), done.\u001B[K\r\n", "remote: Total 63068 (delta 1258), reused 1327 (delta 1070), pack-reused 61511\u001B[K\r\n", - "Receiving objects: 100% (63068/63068), 6.06 MiB | 5.05 MiB/s, done.\r\n", + "Receiving objects: 100% (63068/63068), 6.06 MiB | 11.17 MiB/s, done.\r\n", "Resolving deltas: 100% (60270/60270), done.\r\n", "\r\n", "Local identity for pyessv-archive set to \"Francis Charette Migneault \"\r\n" @@ -120,44 +122,54 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-09-29T22:45:37.397132140Z", - "start_time": "2023-09-29T22:45:33.063477776Z" + "end_time": "2023-10-19T21:24:21.492635070Z", + "start_time": "2023-10-19T21:24:19.358363899Z" } }, "id": "f10d85e12b47da43" }, { "cell_type": "code", - "execution_count": 87, - "outputs": [], + "execution_count": 3, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-10-19 21:24:21.957844 [INFO] :: PYESSV :: Loading vocabularies from /home/francis/.esdoc/pyessv-archive ... please wait\n" + ] + } + ], "source": [ "import hashlib\n", "import json\n", + "import os\n", "import tempfile\n", "from datetime import datetime, date\n", "from enum import Enum\n", + "from urllib.parse import parse_qs, urlparse, unquote\n", "\n", "import numpy as np\n", "import pystac\n", "import requests\n", - "import xncml\n", + "import siphon.catalog\n", "from pydantic.networks import Url\n", "\n", - "from STACpopulator.extensions import cmip6\n", - "from STACpopulator.stac_utils import CFJsonItem, DatacubeExt" + "from STACpopulator.input import THREDDSLoader\n", + "from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import CMIP6populator" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-09-29T22:45:37.444764964Z", - "start_time": "2023-09-29T22:45:37.400457899Z" + "end_time": "2023-10-19T21:24:22.260016495Z", + "start_time": "2023-10-19T21:24:21.483709731Z" } }, "id": "f68ea4339c5e4a9d" }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 4, "outputs": [ { "name": "stdout", @@ -244,7 +256,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -350,6 +362,7 @@ ], "source": [ "# retrieve the file contents\n", + "input_ncml_href = input_ncml\n", "if not (input_ncml.startswith(\"/\") or input_ncml.startswith(\"file:///\")):\n", " resp = requests.get(input_ncml, headers={\"Accept\": \"text/xml, application/xml\"}, timeout=5)\n", " if not resp.status_code == 200 and resp.text.startswith(\"" - }, - "execution_count": 89, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "outputs": [], "source": [ - "# FIXME: duplicate code\n", - "# this is defined in:\n", - "# https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L102-L116\n", - "# but we cannot import it since outside of installed 'STACpopulator' module\n", - "def make_cmip6_item_id(_attrs: \"JsonLike\") -> str:\n", - " \"\"\"Return a unique ID for CMIP6 data item.\"\"\"\n", - " keys = [\n", - " \"activity_id\",\n", - " \"institution_id\",\n", - " \"source_id\",\n", - " \"experiment_id\",\n", - " \"variant_label\",\n", - " \"table_id\",\n", - " \"variable_id\",\n", - " \"grid_label\",\n", - " ]\n", - " name = \"_\".join(_attrs[k] for k in keys)\n", - " return hashlib.md5(name.encode(\"utf-8\")).hexdigest()\n", + "# # FIXME: duplicate code\n", + "# # this is defined in:\n", + "# # https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L102-L116\n", + "# # but we cannot import it since outside of installed 'STACpopulator' module\n", + "# def make_cmip6_item_id(_attrs: \"JsonLike\") -> str:\n", + "# \"\"\"Return a unique ID for CMIP6 data item.\"\"\"\n", + "# keys = [\n", + "# \"activity_id\",\n", + "# \"institution_id\",\n", + "# \"source_id\",\n", + "# \"experiment_id\",\n", + "# \"variant_label\",\n", + "# \"table_id\",\n", + "# \"variable_id\",\n", + "# \"grid_label\",\n", + "# ]\n", + "# name = \"_\".join(_attrs[k] for k in keys)\n", + "# return hashlib.md5(name.encode(\"utf-8\")).hexdigest()\n", + "#\n", + "#\n", + "# # FIXME: temporary patch of URL/Media-Type\n", + "# # https://github.com/crim-ca/stac-populator/pull/23#discussion_r1341819744\n", + "# class CFJsonItemNetCDF(CFJsonItem):\n", + "# def item_link(self) -> pystac.Link:\n", + "# url = self.attrs[\"@location\"] # NetCDF URL\n", + "# name = self.attrs[\"groups\"][\"THREDDSMetadata\"][\"attributes\"][\"id\"]\n", + "# path = url.split(name, 1)[0]\n", + "# parts = list(filter(lambda _: bool(_), path.rsplit(\"/\", 3)))\n", + "# service = parts[-2] # always 1 path part for the service\n", + "# link = pystac.Link(\n", + "# rel=\"source\",\n", + "# target=url,\n", + "# media_type=\"application/x-netcdf\",\n", + "# title=f\"{service}:{name}\"\n", + "# )\n", + "# return link\n", + "#\n", + "#\n", + "# # FIXME: partial duplicate code\n", + "# # https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L138-L165\n", + "# # should be combined into a single callable function that doesn't depend on the rest of the THREDDS crawling iterator\n", + "# ds = xncml.Dataset(input_ncml)\n", + "# input_ncml_data = ds.to_cf_dict()\n", + "#\n", + "# # FIXME: AttributeError\n", + "# nc_services = getattr(ds, \"access_urls\", None)\n", + "# if nc_services:\n", + "# attrs[\"access_urls\"] = nc_services\n", + "#\n", + "# stac_item_id = make_cmip6_item_id(attrs[\"attributes\"])\n", + "# attrs[\"id\"] = stac_item_id\n", + "# stac_item = CFJsonItemNetCDF(stac_item_id, attrs, cmip6.Properties)\n", + "# DatacubeExt(stac_item)\n", "\n", + "class NCMLSingleFileLoader(THREDDSLoader):\n", + " def __init__(self, *_, **__):\n", + " # ignore original THREDDSLoader init, move directly to its parent\n", + " # don't automatically parse whole THREDDS catalog!\n", + " super(THREDDSLoader, self).__init__()\n", "\n", - "# FIXME: temporary patch of URL/Media-Type\n", - "# https://github.com/crim-ca/stac-populator/pull/23#discussion_r1341819744\n", - "class CFJsonItemNetCDF(CFJsonItem):\n", - " def item_link(self) -> pystac.Link:\n", - " url = self.attrs[\"@location\"] # NetCDF URL\n", - " name = self.attrs[\"groups\"][\"THREDDSMetadata\"][\"attributes\"][\"id\"]\n", - " path = url.split(name, 1)[0]\n", - " parts = list(filter(lambda _: bool(_), path.rsplit(\"/\", 3)))\n", - " service = parts[-2] # always 1 path part for the service\n", - " link = pystac.Link(\n", - " rel=\"source\",\n", - " target=url,\n", - " media_type=\"application/x-netcdf\",\n", - " title=f\"{service}:{name}\"\n", - " )\n", - " return link\n", + "\n", + "class CMIP6SingleFilePopulator(CMIP6populator):\n", + " def __init__(self, stac_host, thredds_catalog_url, update=False):\n", + " self.data_loader = NCMLSingleFileLoader(thredds_catalog_url, depth=0)\n", + "\n", + " # ignore original CMIP6populator init, as well as its parent STACpopulatorBase\n", + " # - don't automatically parse whole THREDDS catalog!\n", + " # - don't check for unnecessary 'collection_config.yml' file\n", + " #super(CMIP6populator, self).__init__(stac_host, self.data_loader, update)\n", "\n", "\n", - "# FIXME: partial duplicate code\n", - "# https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L138-L165\n", - "# should be combined into a single callable function that doesn't depend on the rest of the THREDDS crawling iterator\n", - "ds = xncml.Dataset(input_ncml)\n", - "attrs = ds.to_cf_dict()\n", + "input_ncml_href_parsed = urlparse(input_ncml_href)\n", + "input_ncml_href_params = parse_qs(input_ncml_href_parsed.query)\n", + "if \"catalog\" in input_ncml_href_params:\n", + " input_ncml_catalog_href = unquote(input_ncml_href_params[\"catalog\"][0])\n", + " input_ncml_catalog_href = os.path.splitext(input_ncml_catalog_href)[0] + \".xml\" # in case it was HTML\n", + "else:\n", + " input_ncml_catalog_href = input_ncml_href.split(\"?\", 1)[0]\n", + " input_ncml_catalog_href = input_ncml_catalog_href.replace(\"/ncml\", \"/catalog/\")\n", + " input_ncml_catalog_href = os.path.join(os.path.dirname(input_ncml_catalog_href), \"catalog.xml\")\n", + "input_ncml_catalog_xml = requests.get(input_ncml_catalog_href, headers={\"Accept\": \"text/xml, application/xml\"}, timeout=5).text\n", + "catalog_xml = siphon.catalog.ET.fromstring(input_ncml_catalog_xml)\n", "\n", - "# FIXME: AttributeError\n", - "nc_services = getattr(ds, \"access_urls\", None)\n", - "if nc_services:\n", - " attrs[\"access_urls\"] = nc_services\n", + "# technically invalid STAC host, but just need something for URL schema validation\n", + "stac_host = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n", + "cmip6_pop = CMIP6SingleFilePopulator(stac_host, input_ncml_catalog_href)\n", + "ncml_xml = siphon.catalog.ET.fromstring(input_ncml_xml)\n", "\n", - "stac_item_id = make_cmip6_item_id(attrs[\"attributes\"])\n", - "attrs[\"id\"] = stac_item_id\n", - "stac_item = CFJsonItemNetCDF(stac_item_id, attrs, cmip6.Properties)\n", - "DatacubeExt(stac_item)" + "# FIXME: hack, missing 'name' in XML header when accessing NCML directly, but available in nested 'dataset' attribute\n", + "ncml_name = ncml_xml.attrib.get(\"name\") or catalog_xml.find(catalog_xml.tag.rsplit(\"catalog\", 1)[0] + \"dataset\").attrib[\"ID\"]\n", + "ncml_xml.attrib.setdefault(\"name\", ncml_name)\n", + "catalog_xml.attrib.setdefault(\"name\", ncml_name)\n", + "\n", + "catalog_ds = siphon.catalog.Dataset(catalog_xml)\n", + "ncml_data = cmip6_pop.data_loader.extract_metadata(catalog_ds)\n", + "stac_item = cmip6_pop.create_stac_item(ncml_name, ncml_data)" ], "metadata": { "collapsed": false, + "is_executing": true, "ExecuteTime": { - "end_time": "2023-09-29T22:45:37.711546409Z", - "start_time": "2023-09-29T22:45:37.646200547Z" + "start_time": "2023-10-19T21:27:02.720168481Z" } }, "id": "299946ccd58e2efc" @@ -463,257 +511,15 @@ }, { "cell_type": "code", - "execution_count": 90, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"type\": \"Feature\",\n", - " \"stac_version\": \"1.0.0\",\n", - " \"id\": \"36c83a8bb9d382ff2ffed7b9ba422cd3\",\n", - " \"properties\": {\n", - " \"start_datetime\": \"2019-12-06T12:00:00Z\",\n", - " \"end_datetime\": \"2020-11-04T12:00:00Z\",\n", - " \"datetime\": null,\n", - " \"Conventions\": \"CF-1.7 CMIP-6.2\",\n", - " \"activity_id\": \"ScenarioMIP\",\n", - " \"creation_date\": \"2019-09-25T23:01:33Z\",\n", - " \"data_specs_version\": \"01.00.30\",\n", - " \"experiment\": \"update of RCP4.5 based on SSP2\",\n", - " \"experiment_id\": \"ssp245\",\n", - " \"frequency\": \"mon\",\n", - " \"further_info_url\": \"https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1\",\n", - " \"grid_label\": \"gn\",\n", - " \"institution\": \"Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada\",\n", - " \"institution_id\": \"CCCma\",\n", - " \"nominal_resolution\": \"100 km\",\n", - " \"realm\": [\n", - " \"seaIce\"\n", - " ],\n", - " \"source\": \"CanESM5 (2019): \\naerosol: interactive\\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\\natmosChem: specified oxidants for aerosols\\nland: CLASS3.6/CTEM1.2\\nlandIce: specified ice sheets\\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\\nseaIce: LIM2\",\n", - " \"source_id\": \"CanESM5\",\n", - " \"source_type\": [\n", - " \"AOGCM\"\n", - " ],\n", - " \"sub_experiment\": \"none\",\n", - " \"sub_experiment_id\": \"none\",\n", - " \"table_id\": \"SImon\",\n", - " \"variable_id\": \"siconc\",\n", - " \"variant_label\": \"r13i1p2f1\",\n", - " \"initialization_index\": 1,\n", - " \"physics_index\": 2,\n", - " \"realization_index\": 13,\n", - " \"forcing_index\": 1,\n", - " \"tracking_id\": \"hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95\",\n", - " \"version\": \"v20190429\",\n", - " \"product\": \"model-output\",\n", - " \"license\": \"CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.\",\n", - " \"grid\": \"ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m\",\n", - " \"mip_era\": \"CMIP6\",\n", - " \"cube:dimensions\": {\n", - " \"time\": {\n", - " \"axis\": \"t\",\n", - " \"type\": \"temporal\",\n", - " \"extent\": null,\n", - " \"description\": [\n", - " \"time\"\n", - " ]\n", - " },\n", - " \"j\": {\n", - " \"axis\": \"y\",\n", - " \"type\": \"spatial\",\n", - " \"extent\": [\n", - " 0,\n", - " 291\n", - " ],\n", - " \"description\": [\n", - " \"projection_y_coordinate\",\n", - " \"grid_latitude\",\n", - " \"projection_y_angular_coordinate\"\n", - " ]\n", - " },\n", - " \"i\": {\n", - " \"axis\": \"x\",\n", - " \"type\": \"spatial\",\n", - " \"extent\": [\n", - " 0,\n", - " 360\n", - " ],\n", - " \"description\": [\n", - " \"projection_x_coordinate\",\n", - " \"grid_longitude\",\n", - " \"projection_x_angular_coordinate\"\n", - " ]\n", - " }\n", - " },\n", - " \"cube:variables\": {\n", - " \"time_bnds\": {\n", - " \"dimensions\": [\n", - " \"time\",\n", - " \"bnds\"\n", - " ],\n", - " \"type\": \"data\",\n", - " \"description\": null,\n", - " \"unit\": null\n", - " },\n", - " \"vertices_latitude\": {\n", - " \"dimensions\": [\n", - " \"j\",\n", - " \"i\",\n", - " \"vertices\"\n", - " ],\n", - " \"type\": \"data\",\n", - " \"description\": null,\n", - " \"unit\": null\n", - " },\n", - " \"vertices_longitude\": {\n", - " \"dimensions\": [\n", - " \"j\",\n", - " \"i\",\n", - " \"vertices\"\n", - " ],\n", - " \"type\": \"data\",\n", - " \"description\": null,\n", - " \"unit\": null\n", - " },\n", - " \"siconc\": {\n", - " \"dimensions\": [\n", - " \"time\",\n", - " \"j\",\n", - " \"i\"\n", - " ],\n", - " \"type\": \"data\",\n", - " \"description\": \"Sea-Ice Area Percentage (Ocean Grid)\",\n", - " \"unit\": \"%\"\n", - " },\n", - " \"areacello\": {\n", - " \"dimensions\": [\n", - " \"j\",\n", - " \"i\"\n", - " ],\n", - " \"type\": \"data\",\n", - " \"description\": \"Grid-Cell Area for Ocean Variables\",\n", - " \"unit\": \"m2\"\n", - " },\n", - " \"type\": {\n", - " \"dimensions\": [\n", - " \"maxStrlen64\"\n", - " ],\n", - " \"type\": \"data\",\n", - " \"description\": \"Sea Ice area type\",\n", - " \"unit\": null\n", - " },\n", - " \"latitude\": {\n", - " \"dimensions\": [\n", - " \"j\",\n", - " \"i\"\n", - " ],\n", - " \"type\": \"auxiliary\",\n", - " \"description\": \"latitude\",\n", - " \"unit\": \"degrees_north\"\n", - " },\n", - " \"longitude\": {\n", - " \"dimensions\": [\n", - " \"j\",\n", - " \"i\"\n", - " ],\n", - " \"type\": \"auxiliary\",\n", - " \"description\": \"longitude\",\n", - " \"unit\": \"degrees_east\"\n", - " }\n", - " }\n", - " },\n", - " \"geometry\": {\n", - " \"type\": \"Polygon\",\n", - " \"coordinates\": [\n", - " [\n", - " [\n", - " 0.049800001084804535,\n", - " -78.39350128173828\n", - " ],\n", - " [\n", - " 0.049800001084804535,\n", - " 89.74176788330078\n", - " ],\n", - " [\n", - " 359.99493408203125,\n", - " 89.74176788330078\n", - " ],\n", - " [\n", - " 359.99493408203125,\n", - " -78.39350128173828\n", - " ],\n", - " [\n", - " 0.049800001084804535,\n", - " -78.39350128173828\n", - " ]\n", - " ]\n", - " ]\n", - " },\n", - " \"links\": [\n", - " {\n", - " \"rel\": \"source\",\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", - " \"type\": \"application/x-netcdf\",\n", - " \"title\": \"thredds:birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\"\n", - " }\n", - " ],\n", - " \"assets\": {\n", - " \"httpserver_service\": {\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", - " \"type\": \"application/x-netcdf\",\n", - " \"roles\": [\n", - " \"data\"\n", - " ]\n", - " },\n", - " \"opendap_service\": {\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", - " \"type\": \"text/html\",\n", - " \"roles\": [\n", - " \"data\"\n", - " ]\n", - " },\n", - " \"wcs_service\": {\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WCS&version=1.0.0&request=GetCapabilities\",\n", - " \"type\": \"application/xml\",\n", - " \"roles\": [\n", - " \"data\"\n", - " ]\n", - " },\n", - " \"wms_service\": {\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc?service=WMS&version=1.3.0&request=GetCapabilities\",\n", - " \"type\": \"application/xml\",\n", - " \"roles\": [\n", - " \"visual\"\n", - " ]\n", - " },\n", - " \"nccs_service\": {\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc/dataset.html\",\n", - " \"type\": \"application/x-netcdf\",\n", - " \"roles\": [\n", - " \"data\"\n", - " ]\n", - " }\n", - " },\n", - " \"bbox\": [\n", - " 0.049800001084804535,\n", - " -78.39350128173828,\n", - " 359.99493408203125,\n", - " 89.74176788330078\n", - " ],\n", - " \"stac_extensions\": [\n", - " \"https://stac-extensions.github.io/datacube/v2.0.0/schema.json\"\n", - " ]\n", - "}\n" - ] - } - ], + "execution_count": null, + "outputs": [], "source": [ + "AnyDateTime = Union[datetime, date]\n", + "AnyJsonEncodable = Union[pystac.Item, np.ndarray, np.number, Url, Enum, AnyDateTime, \"JsonLike\"]\n", + "\n", "stac_item_data = stac_item.item.to_dict()\n", "\n", - "def json_encode(obj: \"pystac.Item\") -> Union[\"JsonLike\", str]:\n", + "def json_encode(obj: \"AnyJsonEncodable\") -> Union[\"JsonLike\", str]:\n", " if isinstance(obj, (np.ndarray, np.number)):\n", " return obj.tolist()\n", " if isinstance(obj, (Url, Enum)):\n", @@ -728,8 +534,7 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-09-29T22:45:37.741924891Z", - "start_time": "2023-09-29T22:45:37.672054869Z" + "start_time": "2023-10-19T18:24:46.424220761Z" } }, "id": "4eeb52c23edccb31" @@ -746,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": null, "outputs": [], "source": [ "# NOTE:\n", @@ -759,8 +564,7 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-09-29T22:45:37.742086738Z", - "start_time": "2023-09-29T22:45:37.715603867Z" + "start_time": "2023-10-19T18:24:46.424257773Z" } }, "id": "e4fa98fcad8b5556" @@ -768,7 +572,7 @@ ], "metadata": { "kernelspec": { - "name": "python3", + "name": "ncml2stac", "language": "python", "display_name": "ncml2stac" }, diff --git a/requirements.txt b/requirements.txt index 0920de8..fc5def8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ # Following does not work # STACpopulator @ https://github.com/crim-ca/stac-populator/archive/refs/heads/weaver-repo2cwl-ncml2stac.zip # Also, editable '-e' required, otherwise module still not found... --e git+https://github.com/Ouranosinc/stac-populator@collection_link#egg=STACpopulator +-e git+https://github.com/crim-ca/stac-populator@arch-finalization-proposal#egg=STACpopulator From 5b3a3baabb41901b47b53d3e80e30479816ecb9c Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Thu, 19 Oct 2023 18:58:55 -0400 Subject: [PATCH 2/6] [WIP] working version with hacked STACpopulator of 'arch-finalization-proposal' branch --- notebooks/ncml2stac.ipynb | 292 +++++++++++++++++++++++++++++++------- 1 file changed, 239 insertions(+), 53 deletions(-) diff --git a/notebooks/ncml2stac.ipynb b/notebooks/ncml2stac.ipynb index ca4fb8f..56483f5 100644 --- a/notebooks/ncml2stac.ipynb +++ b/notebooks/ncml2stac.ipynb @@ -38,11 +38,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "outputs": [], "source": [ - "import os.path\n", - "\n", "# NOTE:\n", "# If using indented code block here (eg: 'if TYPE_CHECKING:'),\n", "# it is important to have other things than 'ipython2cwl' imports.\n", @@ -73,8 +71,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T21:24:19.361222561Z", - "start_time": "2023-10-19T21:24:19.358206330Z" + "end_time": "2023-10-19T22:52:47.572589682Z", + "start_time": "2023-10-19T22:52:47.529149722Z" } }, "id": "61f43c81dc3aa6c2" @@ -96,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "outputs": [ { "name": "stdout", @@ -106,8 +104,8 @@ "remote: Enumerating objects: 63068, done.\u001B[K\r\n", "remote: Counting objects: 100% (1557/1557), done.\u001B[K\r\n", "remote: Compressing objects: 100% (476/476), done.\u001B[K\r\n", - "remote: Total 63068 (delta 1258), reused 1327 (delta 1070), pack-reused 61511\u001B[K\r\n", - "Receiving objects: 100% (63068/63068), 6.06 MiB | 11.17 MiB/s, done.\r\n", + "remote: Total 63068 (delta 1258), reused 1327 (delta 1070), pack-reused 61511\u001B[Ks: 22% (13875/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 24% (15137/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 26% (16398/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 28% (17660/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 29% (18684/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 31% (19552/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 34% (21444/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 36% (22705/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 40% (25228/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 42% (26489/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 45% (28381/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 48% (30273/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 50% (31534/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 53% (33427/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 55% (34688/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 58% (36580/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 61% (38472/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 66% (41625/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 68% (42887/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 70% (44148/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 74% (46671/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 76% (47932/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 79% (49824/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 81% (51086/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 83% (52347/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 85% (53608/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 87% (54870/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 90% (56762/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 93% (58654/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 94% (59744/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 96% (60546/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 98% (61807/63068), 5.50 MiB | 1.34 MiB/s\r\n", + "Receiving objects: 100% (63068/63068), 6.06 MiB | 1.40 MiB/s, done.\r\n", "Resolving deltas: 100% (60270/60270), done.\r\n", "\r\n", "Local identity for pyessv-archive set to \"Francis Charette Migneault \"\r\n" @@ -122,24 +120,16 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T21:24:21.492635070Z", - "start_time": "2023-10-19T21:24:19.358363899Z" + "end_time": "2023-10-19T22:52:53.446668050Z", + "start_time": "2023-10-19T22:52:47.571096999Z" } }, "id": "f10d85e12b47da43" }, { "cell_type": "code", - "execution_count": 3, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-10-19 21:24:21.957844 [INFO] :: PYESSV :: Loading vocabularies from /home/francis/.esdoc/pyessv-archive ... please wait\n" - ] - } - ], + "execution_count": 17, + "outputs": [], "source": [ "import hashlib\n", "import json\n", @@ -161,15 +151,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T21:24:22.260016495Z", - "start_time": "2023-10-19T21:24:21.483709731Z" + "end_time": "2023-10-19T22:52:53.452740519Z", + "start_time": "2023-10-19T22:52:53.448504173Z" } }, "id": "f68ea4339c5e4a9d" }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "outputs": [ { "name": "stdout", @@ -382,16 +372,25 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T21:24:22.400593718Z", - "start_time": "2023-10-19T21:24:22.262694266Z" + "end_time": "2023-10-19T22:52:53.626782964Z", + "start_time": "2023-10-19T22:52:53.453091991Z" } }, "id": "4fc2f66493dc56c5" }, { "cell_type": "code", - "execution_count": null, - "outputs": [], + "execution_count": 23, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \u001B[32mINFO:\u001B[0m \u001B[34m[STACpopulator.input ]\u001B[0m Requesting NcML dataset description\u001B[0m\n", + " \u001B[33mWARNING:\u001B[0m \u001B[34m[STACpopulator.implementations.CMIP6_UofT.add_CMIP6]\u001B[0m Failed to add Datacube extension to item sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\u001B[0m\n" + ] + } + ], "source": [ "# # FIXME: duplicate code\n", "# # this is defined in:\n", @@ -447,54 +446,92 @@ "# stac_item = CFJsonItemNetCDF(stac_item_id, attrs, cmip6.Properties)\n", "# DatacubeExt(stac_item)\n", "\n", - "class NCMLSingleFileLoader(THREDDSLoader):\n", - " def __init__(self, *_, **__):\n", - " # ignore original THREDDSLoader init, move directly to its parent\n", - " # don't automatically parse whole THREDDS catalog!\n", - " super(THREDDSLoader, self).__init__()\n", + "# class NCMLSingleFileLoader(THREDDSLoader):\n", + "# def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None):\n", + "# # ignore original THREDDSLoader init, move directly to its parent\n", + "# # don't automatically parse whole THREDDS catalog!\n", + "# super(THREDDSLoader, self).__init__()\n", "\n", "\n", "class CMIP6SingleFilePopulator(CMIP6populator):\n", - " def __init__(self, stac_host, thredds_catalog_url, update=False):\n", - " self.data_loader = NCMLSingleFileLoader(thredds_catalog_url, depth=0)\n", - "\n", + " # WARNING:\n", + " # to limit as much as possible how many useless iterations crawling the datasets is done\n", + " # this implementation enforces a depth=1.\n", + " # therefore, the 'thredds_catalog_url' must be \"right above\" the target NCML file URL\n", + " # example:\n", + " # To describe:\n", + " # https://svc.com/thredds/ncml/some/nested/netcdf.nc\"\n", + " # thredds_catalog_url should be:\n", + " # https://svc.com/thredds/catalog/some/nested/catalog.xml\"\n", + " def __init__(self, stac_host, thredds_catalog_url, target_item, update=False):\n", + " ##self.data_loader = NCMLSingleFileLoader(thredds_catalog_url, depth=0)\n", " # ignore original CMIP6populator init, as well as its parent STACpopulatorBase\n", " # - don't automatically parse whole THREDDS catalog!\n", " # - don't check for unnecessary 'collection_config.yml' file\n", " #super(CMIP6populator, self).__init__(stac_host, self.data_loader, update)\n", + " ###super().__init__(stac_host, thredds_catalog_url, update=update)\n", + "\n", + " # FIXME: just reimplement what is needed (not config needed, don't care about STAC Collections...)\n", + " self.target_item = target_item\n", + " self._stac_host = stac_host\n", + " self._ingest_pipeline = THREDDSLoader(thredds_catalog_url, depth=1)\n", + "\n", + " # FIXME: just reimplement it as needed\n", + " def __iter__(self) -> \"Iterator[Tuple[str, MutableMapping[str, Any]]]\":\n", + " \"\"\"Return a generator walking a THREDDS data catalog for datasets.\"\"\"\n", + " if self.catalog_head.datasets.items():\n", + " for item_name, ds in self.catalog_head.datasets.items():\n", + " # FIXME: filter for our item, ignore irrelevant entries (hopefully there's not too many...)\n", + " if item_name != self.target_item:\n", + " continue\n", + " attrs = self.extract_metadata(ds)\n", + " yield item_name, attrs\n", + "\n", + " if self._depth > 0:\n", + " for name, ref in self.catalog_head.catalog_refs.items():\n", + " self.catalog_head = ref.follow()\n", + " self._depth -= 1\n", + " yield from self\n", "\n", "\n", "input_ncml_href_parsed = urlparse(input_ncml_href)\n", "input_ncml_href_params = parse_qs(input_ncml_href_parsed.query)\n", "if \"catalog\" in input_ncml_href_params:\n", + " input_ncml_target_href = input_ncml_href.split(\"?\")[0]\n", " input_ncml_catalog_href = unquote(input_ncml_href_params[\"catalog\"][0])\n", " input_ncml_catalog_href = os.path.splitext(input_ncml_catalog_href)[0] + \".xml\" # in case it was HTML\n", "else:\n", - " input_ncml_catalog_href = input_ncml_href.split(\"?\", 1)[0]\n", + " input_ncml_target_href = input_ncml_href\n", + " input_ncml_catalog_href = input_ncml_href.split(\"?\", 1)[0] # just in case there's extra query params\n", " input_ncml_catalog_href = input_ncml_catalog_href.replace(\"/ncml\", \"/catalog/\")\n", " input_ncml_catalog_href = os.path.join(os.path.dirname(input_ncml_catalog_href), \"catalog.xml\")\n", "input_ncml_catalog_xml = requests.get(input_ncml_catalog_href, headers={\"Accept\": \"text/xml, application/xml\"}, timeout=5).text\n", - "catalog_xml = siphon.catalog.ET.fromstring(input_ncml_catalog_xml)\n", + "input_ncml_target_name = os.path.split(input_ncml_target_href)[-1]\n", + "## #catalog_xml = siphon.catalog.ET.fromstring(input_ncml_catalog_xml)\n", "\n", "# technically invalid STAC host, but just need something for URL schema validation\n", "stac_host = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n", - "cmip6_pop = CMIP6SingleFilePopulator(stac_host, input_ncml_catalog_href)\n", - "ncml_xml = siphon.catalog.ET.fromstring(input_ncml_xml)\n", - "\n", - "# FIXME: hack, missing 'name' in XML header when accessing NCML directly, but available in nested 'dataset' attribute\n", - "ncml_name = ncml_xml.attrib.get(\"name\") or catalog_xml.find(catalog_xml.tag.rsplit(\"catalog\", 1)[0] + \"dataset\").attrib[\"ID\"]\n", - "ncml_xml.attrib.setdefault(\"name\", ncml_name)\n", - "catalog_xml.attrib.setdefault(\"name\", ncml_name)\n", + "cmip6_pop = CMIP6SingleFilePopulator(stac_host, input_ncml_catalog_href, target_item=input_ncml_target_name)\n", + "# ncml_xml = siphon.catalog.ET.fromstring(input_ncml_xml)\n", + "#\n", + "# # FIXME: hack, missing 'name' in XML header when accessing NCML directly, but available in nested 'dataset' attribute\n", + "# ncml_name = ncml_xml.attrib.get(\"name\") or catalog_xml.find(catalog_xml.tag.rsplit(\"catalog\", 1)[0] + \"dataset\").attrib[\"ID\"]\n", + "# ncml_xml.attrib.setdefault(\"name\", ncml_name)\n", + "# catalog_xml.attrib.setdefault(\"name\", ncml_name)\n", + "# catalog_ds = siphon.catalog.Dataset(catalog_xml)\n", + "# catalog_loader = siphon.catalog.TDSCatalog(input_ncml_catalog_href)\n", + "# ncml_data = cmip6_pop.data_loader.extract_metadata(catalog_ds)\n", "\n", - "catalog_ds = siphon.catalog.Dataset(catalog_xml)\n", - "ncml_data = cmip6_pop.data_loader.extract_metadata(catalog_ds)\n", - "stac_item = cmip6_pop.create_stac_item(ncml_name, ncml_data)" + "# FIXME: do what ingest(), would to, triggering the full processing chains via iter, but without the POST steps\n", + "for ncml_name, ncml_data in cmip6_pop._ingest_pipeline:\n", + " stac_item_data = cmip6_pop.create_stac_item(ncml_name, ncml_data)\n", + " break" ], "metadata": { "collapsed": false, - "is_executing": true, "ExecuteTime": { - "start_time": "2023-10-19T21:27:02.720168481Z" + "end_time": "2023-10-19T22:58:02.461289611Z", + "start_time": "2023-10-19T22:58:02.017205859Z" } }, "id": "299946ccd58e2efc" @@ -511,13 +548,161 @@ }, { "cell_type": "code", - "execution_count": null, - "outputs": [], + "execution_count": 24, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"type\": \"Feature\",\n", + " \"stac_version\": \"1.0.0\",\n", + " \"id\": \"ScenarioMIP_CCCma_CanESM5_ssp245_r13i1p2f1_SImon_siconc_gn\",\n", + " \"properties\": {\n", + " \"start_datetime\": \"2019-12-06T12:00:00Z\",\n", + " \"end_datetime\": \"2020-11-04T12:00:00Z\",\n", + " \"datetime\": null,\n", + " \"cmip6:Conventions\": \"CF-1.7 CMIP-6.2\",\n", + " \"cmip6:activity_id\": \"ScenarioMIP\",\n", + " \"cmip6:creation_date\": \"2019-09-25T23:01:33Z\",\n", + " \"cmip6:data_specs_version\": \"01.00.30\",\n", + " \"cmip6:experiment\": \"update of RCP4.5 based on SSP2\",\n", + " \"cmip6:experiment_id\": \"ssp245\",\n", + " \"cmip6:frequency\": \"mon\",\n", + " \"cmip6:further_info_url\": \"https://furtherinfo.es-doc.org/CMIP6.CCCma.CanESM5.ssp245.none.r13i1p2f1\",\n", + " \"cmip6:grid_label\": \"gn\",\n", + " \"cmip6:institution\": \"Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada, Victoria, BC V8P 5C2, Canada\",\n", + " \"cmip6:institution_id\": \"CCCma\",\n", + " \"cmip6:nominal_resolution\": \"100 km\",\n", + " \"cmip6:realm\": [\n", + " \"seaIce\"\n", + " ],\n", + " \"cmip6:source\": \"CanESM5 (2019): \\naerosol: interactive\\natmos: CanAM5 (T63L49 native atmosphere, T63 Linear Gaussian Grid; 128 x 64 longitude/latitude; 49 levels; top level 1 hPa)\\natmosChem: specified oxidants for aerosols\\nland: CLASS3.6/CTEM1.2\\nlandIce: specified ice sheets\\nocean: NEMO3.4.1 (ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m)\\nocnBgchem: Canadian Model of Ocean Carbon (CMOC); NPZD ecosystem with OMIP prescribed carbonate chemistry\\nseaIce: LIM2\",\n", + " \"cmip6:source_id\": \"CanESM5\",\n", + " \"cmip6:source_type\": [\n", + " \"AOGCM\"\n", + " ],\n", + " \"cmip6:sub_experiment\": \"none\",\n", + " \"cmip6:sub_experiment_id\": \"none\",\n", + " \"cmip6:table_id\": \"SImon\",\n", + " \"cmip6:variable_id\": \"siconc\",\n", + " \"cmip6:variant_label\": \"r13i1p2f1\",\n", + " \"cmip6:initialization_index\": 1,\n", + " \"cmip6:physics_index\": 2,\n", + " \"cmip6:realization_index\": 13,\n", + " \"cmip6:forcing_index\": 1,\n", + " \"cmip6:tracking_id\": \"hdl:21.14100/9e4f804b-c161-44fa-acd1-c2e94e220c95\",\n", + " \"cmip6:version\": \"v20190429\",\n", + " \"cmip6:product\": \"model-output\",\n", + " \"cmip6:license\": \"CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.\",\n", + " \"cmip6:grid\": \"ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m\",\n", + " \"cmip6:mip_era\": \"CMIP6\"\n", + " },\n", + " \"geometry\": {\n", + " \"type\": \"Polygon\",\n", + " \"coordinates\": [\n", + " [\n", + " [\n", + " 0.049800001084804535,\n", + " -78.39350128173828\n", + " ],\n", + " [\n", + " 0.049800001084804535,\n", + " 89.74176788330078\n", + " ],\n", + " [\n", + " 359.99493408203125,\n", + " 89.74176788330078\n", + " ],\n", + " [\n", + " 359.99493408203125,\n", + " -78.39350128173828\n", + " ],\n", + " [\n", + " 0.049800001084804535,\n", + " -78.39350128173828\n", + " ]\n", + " ]\n", + " ]\n", + " },\n", + " \"links\": [\n", + " {\n", + " \"rel\": \"source\",\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"application/x-netcdf\",\n", + " \"title\": \"birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\"\n", + " }\n", + " ],\n", + " \"assets\": {\n", + " \"HTTPServer\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"application/x-netcdf\",\n", + " \"roles\": [\n", + " \"data\"\n", + " ]\n", + " },\n", + " \"OPENDAP\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"text/html\",\n", + " \"roles\": [\n", + " \"data\"\n", + " ]\n", + " },\n", + " \"NCML\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"application/xml\",\n", + " \"roles\": [\n", + " \"metadata\"\n", + " ]\n", + " },\n", + " \"UDDC\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\"\n", + " },\n", + " \"ISO\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"application/xml\",\n", + " \"roles\": [\n", + " \"metadata\"\n", + " ]\n", + " },\n", + " \"WCS\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"application/xml\",\n", + " \"roles\": [\n", + " \"data\"\n", + " ]\n", + " },\n", + " \"WMS\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wms/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"application/xml\",\n", + " \"roles\": [\n", + " \"visual\"\n", + " ]\n", + " },\n", + " \"NetcdfSubset\": {\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncss/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"application/x-netcdf\",\n", + " \"roles\": [\n", + " \"data\"\n", + " ]\n", + " }\n", + " },\n", + " \"bbox\": [\n", + " 0.049800001084804535,\n", + " -78.39350128173828,\n", + " 359.99493408203125,\n", + " 89.74176788330078\n", + " ],\n", + " \"stac_extensions\": []\n", + "}\n" + ] + } + ], "source": [ "AnyDateTime = Union[datetime, date]\n", "AnyJsonEncodable = Union[pystac.Item, np.ndarray, np.number, Url, Enum, AnyDateTime, \"JsonLike\"]\n", "\n", - "stac_item_data = stac_item.item.to_dict()\n", + "##stac_item_data = stac_item.item.to_dict()\n", "\n", "def json_encode(obj: \"AnyJsonEncodable\") -> Union[\"JsonLike\", str]:\n", " if isinstance(obj, (np.ndarray, np.number)):\n", @@ -534,7 +719,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "start_time": "2023-10-19T18:24:46.424220761Z" + "end_time": "2023-10-19T22:58:04.279186938Z", + "start_time": "2023-10-19T22:58:04.267912228Z" } }, "id": "4eeb52c23edccb31" From 2849e7ffd178dae6b74c0d07f4db81e395c3e209 Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Tue, 9 Jan 2024 16:02:12 -0500 Subject: [PATCH 3/6] update ncml2stac notebook working with STACPopulator 0.5.0 --- CHANGES.md | 2 +- notebooks/ncml2stac.ipynb | 288 +++++++++++++++++++++----------------- requirements.txt | 6 +- 3 files changed, 163 insertions(+), 133 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index c57bad3..363f5ff 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,7 +4,7 @@ Changes [Unreleased](https://github.com/crim-ca/ncml2stac/tree/master) (latest) ------------------------------------------------------------------------------------------------------------------ - +- Update STAC Item generation from NCML using `STACpopulator==0.5.0` to employ all latest fixes. [0.2.0](https://github.com/crim-ca/ncml2stac/tree/0.2.0) (2023-10-02) ------------------------------------------------------------------------------------------------------------------ diff --git a/notebooks/ncml2stac.ipynb b/notebooks/ncml2stac.ipynb index 56483f5..d1e60d5 100644 --- a/notebooks/ncml2stac.ipynb +++ b/notebooks/ncml2stac.ipynb @@ -8,7 +8,7 @@ "This notebook should be compiled into a standalone *CWL* definition using the following command:\n", "\n", "```shell\n", - "jupyter-repo2cwl \"https://github.com/crim-ca/ncml2sta\" -o /tmp\n", + "jupyter-repo2cwl \"https://github.com/crim-ca/ncml2stac\" -o /tmp\n", "```\n", "(replace the Git repository URL by the path if the clone locally)\n", "\n", @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "outputs": [], "source": [ "# NOTE:\n", @@ -71,8 +71,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:47.572589682Z", - "start_time": "2023-10-19T22:52:47.529149722Z" + "end_time": "2024-01-09T20:52:53.340694868Z", + "start_time": "2024-01-09T20:52:53.336885229Z" } }, "id": "61f43c81dc3aa6c2" @@ -94,19 +94,19 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into '/home/francis/.esdoc/pyessv-archive'...\r\n", - "remote: Enumerating objects: 63068, done.\u001B[K\r\n", - "remote: Counting objects: 100% (1557/1557), done.\u001B[K\r\n", - "remote: Compressing objects: 100% (476/476), done.\u001B[K\r\n", - "remote: Total 63068 (delta 1258), reused 1327 (delta 1070), pack-reused 61511\u001B[Ks: 22% (13875/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 24% (15137/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 26% (16398/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 28% (17660/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 29% (18684/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 31% (19552/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 34% (21444/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 36% (22705/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 40% (25228/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 42% (26489/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 45% (28381/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 48% (30273/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 50% (31534/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 53% (33427/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 55% (34688/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 58% (36580/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 61% (38472/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 66% (41625/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 68% (42887/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 70% (44148/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 74% (46671/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 76% (47932/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 79% (49824/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 81% (51086/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 83% (52347/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 85% (53608/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 87% (54870/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 90% (56762/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 93% (58654/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 94% (59744/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 96% (60546/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 98% (61807/63068), 5.50 MiB | 1.34 MiB/s\r\n", - "Receiving objects: 100% (63068/63068), 6.06 MiB | 1.40 MiB/s, done.\r\n", - "Resolving deltas: 100% (60270/60270), done.\r\n", + "remote: Enumerating objects: 7728, done.\u001B[K\r\n", + "remote: Counting objects: 100% (7728/7728), done.\u001B[K\r\n", + "remote: Compressing objects: 100% (2840/2840), done.\u001B[K\r\n", + "remote: Total 7728 (delta 6653), reused 5274 (delta 4866), pack-reused 0\u001B[K\r\n", + "Receiving objects: 100% (7728/7728), 806.86 KiB | 5.68 MiB/s, done.\r\n", + "Resolving deltas: 100% (6653/6653), done.\r\n", "\r\n", "Local identity for pyessv-archive set to \"Francis Charette Migneault \"\r\n" ] @@ -115,20 +115,20 @@ "source": [ "!rm -fr ~/.esdoc/pyessv-archive\n", "!mkdir -p ~/.esdoc/\n", - "!git clone https://github.com/ES-DOC/pyessv-archive ~/.esdoc/pyessv-archive" + "!git clone --depth 1 https://github.com/ES-DOC/pyessv-archive ~/.esdoc/pyessv-archive" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:53.446668050Z", - "start_time": "2023-10-19T22:52:47.571096999Z" + "end_time": "2024-01-09T20:46:18.837092675Z", + "start_time": "2024-01-09T20:46:17.329893874Z" } }, "id": "f10d85e12b47da43" }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "outputs": [], "source": [ "import hashlib\n", @@ -151,15 +151,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:53.452740519Z", - "start_time": "2023-10-19T22:52:53.448504173Z" + "end_time": "2024-01-09T20:53:47.648766094Z", + "start_time": "2024-01-09T20:53:47.274668578Z" } }, "id": "f68ea4339c5e4a9d" }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "outputs": [ { "name": "stdout", @@ -246,7 +246,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -372,86 +372,29 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:53.626782964Z", - "start_time": "2023-10-19T22:52:53.453091991Z" + "end_time": "2024-01-09T20:53:00.747428182Z", + "start_time": "2024-01-09T20:53:00.626261513Z" } }, "id": "4fc2f66493dc56c5" }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " \u001B[32mINFO:\u001B[0m \u001B[34m[STACpopulator.input ]\u001B[0m Requesting NcML dataset description\u001B[0m\n", - " \u001B[33mWARNING:\u001B[0m \u001B[34m[STACpopulator.implementations.CMIP6_UofT.add_CMIP6]\u001B[0m Failed to add Datacube extension to item sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\u001B[0m\n" + " \u001B[32mINFO:\u001B[0m \u001B[34m[STACpopulator.input ]\u001B[0m Requesting NcML dataset description\u001B[0m\n" ] } ], "source": [ - "# # FIXME: duplicate code\n", - "# # this is defined in:\n", - "# # https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L102-L116\n", - "# # but we cannot import it since outside of installed 'STACpopulator' module\n", - "# def make_cmip6_item_id(_attrs: \"JsonLike\") -> str:\n", - "# \"\"\"Return a unique ID for CMIP6 data item.\"\"\"\n", - "# keys = [\n", - "# \"activity_id\",\n", - "# \"institution_id\",\n", - "# \"source_id\",\n", - "# \"experiment_id\",\n", - "# \"variant_label\",\n", - "# \"table_id\",\n", - "# \"variable_id\",\n", - "# \"grid_label\",\n", - "# ]\n", - "# name = \"_\".join(_attrs[k] for k in keys)\n", - "# return hashlib.md5(name.encode(\"utf-8\")).hexdigest()\n", - "#\n", - "#\n", - "# # FIXME: temporary patch of URL/Media-Type\n", - "# # https://github.com/crim-ca/stac-populator/pull/23#discussion_r1341819744\n", - "# class CFJsonItemNetCDF(CFJsonItem):\n", - "# def item_link(self) -> pystac.Link:\n", - "# url = self.attrs[\"@location\"] # NetCDF URL\n", - "# name = self.attrs[\"groups\"][\"THREDDSMetadata\"][\"attributes\"][\"id\"]\n", - "# path = url.split(name, 1)[0]\n", - "# parts = list(filter(lambda _: bool(_), path.rsplit(\"/\", 3)))\n", - "# service = parts[-2] # always 1 path part for the service\n", - "# link = pystac.Link(\n", - "# rel=\"source\",\n", - "# target=url,\n", - "# media_type=\"application/x-netcdf\",\n", - "# title=f\"{service}:{name}\"\n", - "# )\n", - "# return link\n", - "#\n", - "#\n", - "# # FIXME: partial duplicate code\n", - "# # https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L138-L165\n", - "# # should be combined into a single callable function that doesn't depend on the rest of the THREDDS crawling iterator\n", - "# ds = xncml.Dataset(input_ncml)\n", - "# input_ncml_data = ds.to_cf_dict()\n", - "#\n", - "# # FIXME: AttributeError\n", - "# nc_services = getattr(ds, \"access_urls\", None)\n", - "# if nc_services:\n", - "# attrs[\"access_urls\"] = nc_services\n", - "#\n", - "# stac_item_id = make_cmip6_item_id(attrs[\"attributes\"])\n", - "# attrs[\"id\"] = stac_item_id\n", - "# stac_item = CFJsonItemNetCDF(stac_item_id, attrs, cmip6.Properties)\n", - "# DatacubeExt(stac_item)\n", - "\n", - "# class NCMLSingleFileLoader(THREDDSLoader):\n", - "# def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None):\n", - "# # ignore original THREDDSLoader init, move directly to its parent\n", - "# # don't automatically parse whole THREDDS catalog!\n", - "# super(THREDDSLoader, self).__init__()\n", - "\n", + "# NOTE:\n", + "# Since we are only interested to convert a single NCML to STAC Item,\n", + "# override the logic of the provided populator such that it does not\n", + "# automatically iterate over the complete THREDDS catalog contents.\n", "\n", "class CMIP6SingleFilePopulator(CMIP6populator):\n", " # WARNING:\n", @@ -464,14 +407,7 @@ " # thredds_catalog_url should be:\n", " # https://svc.com/thredds/catalog/some/nested/catalog.xml\"\n", " def __init__(self, stac_host, thredds_catalog_url, target_item, update=False):\n", - " ##self.data_loader = NCMLSingleFileLoader(thredds_catalog_url, depth=0)\n", - " # ignore original CMIP6populator init, as well as its parent STACpopulatorBase\n", - " # - don't automatically parse whole THREDDS catalog!\n", - " # - don't check for unnecessary 'collection_config.yml' file\n", - " #super(CMIP6populator, self).__init__(stac_host, self.data_loader, update)\n", - " ###super().__init__(stac_host, thredds_catalog_url, update=update)\n", - "\n", - " # FIXME: just reimplement what is needed (not config needed, don't care about STAC Collections...)\n", + " # FIXME: just reimplement what is needed (no config needed, we don't care about STAC Collections...)\n", " self.target_item = target_item\n", " self._stac_host = stac_host\n", " self._ingest_pipeline = THREDDSLoader(thredds_catalog_url, depth=1)\n", @@ -512,26 +448,16 @@ "# technically invalid STAC host, but just need something for URL schema validation\n", "stac_host = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n", "cmip6_pop = CMIP6SingleFilePopulator(stac_host, input_ncml_catalog_href, target_item=input_ncml_target_name)\n", - "# ncml_xml = siphon.catalog.ET.fromstring(input_ncml_xml)\n", - "#\n", - "# # FIXME: hack, missing 'name' in XML header when accessing NCML directly, but available in nested 'dataset' attribute\n", - "# ncml_name = ncml_xml.attrib.get(\"name\") or catalog_xml.find(catalog_xml.tag.rsplit(\"catalog\", 1)[0] + \"dataset\").attrib[\"ID\"]\n", - "# ncml_xml.attrib.setdefault(\"name\", ncml_name)\n", - "# catalog_xml.attrib.setdefault(\"name\", ncml_name)\n", - "# catalog_ds = siphon.catalog.Dataset(catalog_xml)\n", - "# catalog_loader = siphon.catalog.TDSCatalog(input_ncml_catalog_href)\n", - "# ncml_data = cmip6_pop.data_loader.extract_metadata(catalog_ds)\n", "\n", - "# FIXME: do what ingest(), would to, triggering the full processing chains via iter, but without the POST steps\n", - "for ncml_name, ncml_data in cmip6_pop._ingest_pipeline:\n", - " stac_item_data = cmip6_pop.create_stac_item(ncml_name, ncml_data)\n", - " break" + "# FIXME: do what ingest() would do, triggering the full processing chains via iter, but without the POST step to STAC API\n", + "ncml_name, ncml_data = next(iter(cmip6_pop._ingest_pipeline))\n", + "stac_item_data = cmip6_pop.create_stac_item(ncml_name, ncml_data)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:58:02.461289611Z", - "start_time": "2023-10-19T22:58:02.017205859Z" + "end_time": "2024-01-09T21:00:20.939457054Z", + "start_time": "2024-01-09T21:00:20.611135424Z" } }, "id": "299946ccd58e2efc" @@ -548,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "outputs": [ { "name": "stdout", @@ -561,7 +487,6 @@ " \"properties\": {\n", " \"start_datetime\": \"2019-12-06T12:00:00Z\",\n", " \"end_datetime\": \"2020-11-04T12:00:00Z\",\n", - " \"datetime\": null,\n", " \"cmip6:Conventions\": \"CF-1.7 CMIP-6.2\",\n", " \"cmip6:activity_id\": \"ScenarioMIP\",\n", " \"cmip6:creation_date\": \"2019-09-25T23:01:33Z\",\n", @@ -596,7 +521,112 @@ " \"cmip6:product\": \"model-output\",\n", " \"cmip6:license\": \"CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.\",\n", " \"cmip6:grid\": \"ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m\",\n", - " \"cmip6:mip_era\": \"CMIP6\"\n", + " \"cmip6:mip_era\": \"CMIP6\",\n", + " \"cube:dimensions\": {\n", + " \"time\": {\n", + " \"type\": \"temporal\",\n", + " \"extent\": [\n", + " \"2019-12-06T12:00:00Z\",\n", + " \"2020-11-04T12:00:00Z\"\n", + " ],\n", + " \"description\": \"time\"\n", + " },\n", + " \"j\": {\n", + " \"type\": \"spatial\",\n", + " \"extent\": [\n", + " 0,\n", + " 291\n", + " ],\n", + " \"description\": \"projection_y_coordinate\",\n", + " \"axis\": \"y\"\n", + " },\n", + " \"i\": {\n", + " \"type\": \"spatial\",\n", + " \"extent\": [\n", + " 0,\n", + " 360\n", + " ],\n", + " \"description\": \"projection_x_coordinate\",\n", + " \"axis\": \"x\"\n", + " }\n", + " },\n", + " \"cube:variables\": {\n", + " \"time_bnds\": {\n", + " \"dimensions\": [\n", + " \"time\",\n", + " \"bnds\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"vertices_latitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\",\n", + " \"vertices\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"vertices_longitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\",\n", + " \"vertices\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"siconc\": {\n", + " \"dimensions\": [\n", + " \"time\",\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"Sea-Ice Area Percentage (Ocean Grid)\",\n", + " \"unit\": \"%\"\n", + " },\n", + " \"areacello\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"Grid-Cell Area for Ocean Variables\",\n", + " \"unit\": \"m2\"\n", + " },\n", + " \"type\": {\n", + " \"dimensions\": [\n", + " \"maxStrlen64\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"Sea Ice area type\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"latitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"auxiliary\",\n", + " \"description\": \"latitude\",\n", + " \"unit\": \"degrees_north\"\n", + " },\n", + " \"longitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"auxiliary\",\n", + " \"description\": \"longitude\",\n", + " \"unit\": \"degrees_east\"\n", + " }\n", + " },\n", + " \"datetime\": null\n", " },\n", " \"geometry\": {\n", " \"type\": \"Polygon\",\n", @@ -641,29 +671,27 @@ " \"data\"\n", " ]\n", " },\n", - " \"OPENDAP\": {\n", + " \"OpenDAP\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", " \"type\": \"text/html\",\n", " \"roles\": [\n", " \"data\"\n", " ]\n", " },\n", - " \"NCML\": {\n", + " \"NcML\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", - " \"type\": \"application/xml\",\n", - " \"roles\": [\n", - " \"metadata\"\n", - " ]\n", + " \"type\": \"\",\n", + " \"roles\": []\n", " },\n", " \"UDDC\": {\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\"\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"\",\n", + " \"roles\": []\n", " },\n", " \"ISO\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", - " \"type\": \"application/xml\",\n", - " \"roles\": [\n", - " \"metadata\"\n", - " ]\n", + " \"type\": \"\",\n", + " \"roles\": []\n", " },\n", " \"WCS\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", @@ -693,7 +721,10 @@ " 359.99493408203125,\n", " 89.74176788330078\n", " ],\n", - " \"stac_extensions\": []\n", + " \"stac_extensions\": [\n", + " \"https://raw.githubusercontent.com/TomAugspurger/cmip6/main/json-schema/schema.json\",\n", + " \"https://stac-extensions.github.io/datacube/v2.2.0/schema.json\"\n", + " ]\n", "}\n" ] } @@ -719,8 +750,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:58:04.279186938Z", - "start_time": "2023-10-19T22:58:04.267912228Z" + "end_time": "2024-01-09T21:00:30.701468612Z", + "start_time": "2024-01-09T21:00:30.695291176Z" } }, "id": "4eeb52c23edccb31" @@ -737,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "outputs": [], "source": [ "# NOTE:\n", @@ -750,7 +781,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "start_time": "2023-10-19T18:24:46.424257773Z" + "end_time": "2024-01-09T20:54:12.730333186Z", + "start_time": "2024-01-09T20:54:12.724897702Z" } }, "id": "e4fa98fcad8b5556" @@ -758,9 +790,9 @@ ], "metadata": { "kernelspec": { - "name": "ncml2stac", + "name": "stac", "language": "python", - "display_name": "ncml2stac" + "display_name": "stac" }, "language_info": { "codemirror_mode": { diff --git a/requirements.txt b/requirements.txt index fc5def8..67bb53b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ -r requirements-sys.txt -# Following does not work -# STACpopulator @ https://github.com/crim-ca/stac-populator/archive/refs/heads/weaver-repo2cwl-ncml2stac.zip -# Also, editable '-e' required, otherwise module still not found... --e git+https://github.com/crim-ca/stac-populator@arch-finalization-proposal#egg=STACpopulator +# editable '-e' required, otherwise module still not found... +-e git+https://github.com/crim-ca/stac-populator@0.5.0#egg=STACpopulator From 3a0c531d5ee8d6dd2b8a4c159de7cf52c183b71a Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Tue, 9 Jan 2024 16:45:40 -0500 Subject: [PATCH 4/6] update ncml2stac notebook with simpler definition (less overrides) + fix linting --- notebooks/ncml2stac.ipynb | 94 +++++++++++++++++++-------------------- requirements-dev.txt | 2 +- 2 files changed, 47 insertions(+), 49 deletions(-) diff --git a/notebooks/ncml2stac.ipynb b/notebooks/ncml2stac.ipynb index d1e60d5..bfa3c8b 100644 --- a/notebooks/ncml2stac.ipynb +++ b/notebooks/ncml2stac.ipynb @@ -38,11 +38,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "outputs": [], "source": [ "# NOTE:\n", - "# If using indented code block here (eg: 'if TYPE_CHECKING:'),\n", + "# If using code that is not preserved at runtime (eg: 'if TYPE_CHECKING:'),\n", "# it is important to have other things than 'ipython2cwl' imports.\n", "# When ported into the generated python script, imports from 'ipython2cwl' are removed,\n", "# which can cause syntax/indent errors.\n", @@ -71,8 +71,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-01-09T20:52:53.340694868Z", - "start_time": "2024-01-09T20:52:53.336885229Z" + "end_time": "2024-01-09T21:23:06.458322329Z", + "start_time": "2024-01-09T21:23:06.322097107Z" } }, "id": "61f43c81dc3aa6c2" @@ -128,10 +128,17 @@ }, { "cell_type": "code", - "execution_count": 5, - "outputs": [], + "execution_count": 2, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-01-09 21:23:12.406510 [INFO] :: PYESSV :: Loading vocabularies from /home/francis/.esdoc/pyessv-archive ... please wait\n" + ] + } + ], "source": [ - "import hashlib\n", "import json\n", "import os\n", "import tempfile\n", @@ -142,7 +149,6 @@ "import numpy as np\n", "import pystac\n", "import requests\n", - "import siphon.catalog\n", "from pydantic.networks import Url\n", "\n", "from STACpopulator.input import THREDDSLoader\n", @@ -151,15 +157,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-01-09T20:53:47.648766094Z", - "start_time": "2024-01-09T20:53:47.274668578Z" + "end_time": "2024-01-09T21:23:13.576072319Z", + "start_time": "2024-01-09T21:23:11.298045355Z" } }, "id": "f68ea4339c5e4a9d" }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "outputs": [ { "name": "stdout", @@ -372,15 +378,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-01-09T20:53:00.747428182Z", - "start_time": "2024-01-09T20:53:00.626261513Z" + "end_time": "2024-01-09T21:23:20.489912892Z", + "start_time": "2024-01-09T21:23:19.994789137Z" } }, "id": "4fc2f66493dc56c5" }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "outputs": [ { "name": "stderr", @@ -406,34 +412,29 @@ " # https://svc.com/thredds/ncml/some/nested/netcdf.nc\"\n", " # thredds_catalog_url should be:\n", " # https://svc.com/thredds/catalog/some/nested/catalog.xml\"\n", - " def __init__(self, stac_host, thredds_catalog_url, target_item, update=False):\n", + " def __init__( # pylint: disable=W0231 # super init not called on purpose to avoid loading missing config\n", + " self,\n", + " stac_host,\n", + " thredds_catalog_url,\n", + " update=False,\n", + " ):\n", " # FIXME: just reimplement what is needed (no config needed, we don't care about STAC Collections...)\n", - " self.target_item = target_item\n", " self._stac_host = stac_host\n", - " self._ingest_pipeline = THREDDSLoader(thredds_catalog_url, depth=1)\n", - "\n", - " # FIXME: just reimplement it as needed\n", - " def __iter__(self) -> \"Iterator[Tuple[str, MutableMapping[str, Any]]]\":\n", - " \"\"\"Return a generator walking a THREDDS data catalog for datasets.\"\"\"\n", - " if self.catalog_head.datasets.items():\n", - " for item_name, ds in self.catalog_head.datasets.items():\n", - " # FIXME: filter for our item, ignore irrelevant entries (hopefully there's not too many...)\n", - " if item_name != self.target_item:\n", - " continue\n", - " attrs = self.extract_metadata(ds)\n", - " yield item_name, attrs\n", + " self._ingest_pipeline = THREDDSLoader(thredds_catalog_url, depth=0)\n", + " self.update = update\n", "\n", - " if self._depth > 0:\n", - " for name, ref in self.catalog_head.catalog_refs.items():\n", - " self.catalog_head = ref.follow()\n", - " self._depth -= 1\n", - " yield from self\n", + " # FIXME: perform what ingest() does, but only for a single item and without STAC API POST request\n", + " def ncml2stac(self, target_item: str):\n", + " ds = self._ingest_pipeline[target_item]\n", + " ncml_data = self._ingest_pipeline.extract_metadata(ds)\n", + " stac_item = self.create_stac_item(target_item, ncml_data)\n", + " return stac_item\n", "\n", "\n", "input_ncml_href_parsed = urlparse(input_ncml_href)\n", "input_ncml_href_params = parse_qs(input_ncml_href_parsed.query)\n", "if \"catalog\" in input_ncml_href_params:\n", - " input_ncml_target_href = input_ncml_href.split(\"?\")[0]\n", + " input_ncml_target_href = input_ncml_href.split(\"?\", 1)[0]\n", " input_ncml_catalog_href = unquote(input_ncml_href_params[\"catalog\"][0])\n", " input_ncml_catalog_href = os.path.splitext(input_ncml_catalog_href)[0] + \".xml\" # in case it was HTML\n", "else:\n", @@ -441,23 +442,22 @@ " input_ncml_catalog_href = input_ncml_href.split(\"?\", 1)[0] # just in case there's extra query params\n", " input_ncml_catalog_href = input_ncml_catalog_href.replace(\"/ncml\", \"/catalog/\")\n", " input_ncml_catalog_href = os.path.join(os.path.dirname(input_ncml_catalog_href), \"catalog.xml\")\n", - "input_ncml_catalog_xml = requests.get(input_ncml_catalog_href, headers={\"Accept\": \"text/xml, application/xml\"}, timeout=5).text\n", + "input_ncml_headers = {\"Accept\": \"text/xml, application/xml\"}\n", + "input_ncml_catalog_xml = requests.get(input_ncml_catalog_href, headers=input_ncml_headers, timeout=5).text\n", "input_ncml_target_name = os.path.split(input_ncml_target_href)[-1]\n", - "## #catalog_xml = siphon.catalog.ET.fromstring(input_ncml_catalog_xml)\n", "\n", "# technically invalid STAC host, but just need something for URL schema validation\n", - "stac_host = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n", - "cmip6_pop = CMIP6SingleFilePopulator(stac_host, input_ncml_catalog_href, target_item=input_ncml_target_name)\n", + "stac_host_url = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n", + "cmip6_pop = CMIP6SingleFilePopulator(stac_host_url, input_ncml_catalog_href)\n", "\n", - "# FIXME: do what ingest() would do, triggering the full processing chains via iter, but without the POST step to STAC API\n", - "ncml_name, ncml_data = next(iter(cmip6_pop._ingest_pipeline))\n", - "stac_item_data = cmip6_pop.create_stac_item(ncml_name, ncml_data)" + "# generate the STAC Item definition corresponding to the NCML content\n", + "stac_item_data = cmip6_pop.ncml2stac(input_ncml_target_name)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-01-09T21:00:20.939457054Z", - "start_time": "2024-01-09T21:00:20.611135424Z" + "end_time": "2024-01-09T21:43:16.613980591Z", + "start_time": "2024-01-09T21:43:16.278982130Z" } }, "id": "299946ccd58e2efc" @@ -474,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "outputs": [ { "name": "stdout", @@ -733,8 +733,6 @@ "AnyDateTime = Union[datetime, date]\n", "AnyJsonEncodable = Union[pystac.Item, np.ndarray, np.number, Url, Enum, AnyDateTime, \"JsonLike\"]\n", "\n", - "##stac_item_data = stac_item.item.to_dict()\n", - "\n", "def json_encode(obj: \"AnyJsonEncodable\") -> Union[\"JsonLike\", str]:\n", " if isinstance(obj, (np.ndarray, np.number)):\n", " return obj.tolist()\n", @@ -750,8 +748,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-01-09T21:00:30.701468612Z", - "start_time": "2024-01-09T21:00:30.695291176Z" + "end_time": "2024-01-09T21:43:20.412846636Z", + "start_time": "2024-01-09T21:43:20.405259220Z" } }, "id": "4eeb52c23edccb31" diff --git a/requirements-dev.txt b/requirements-dev.txt index 0f848dc..6999cc9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -35,7 +35,7 @@ pytest-notebook pytest-rerunfailures pycodestyle pydocstyle -pylint>=2.15.4; python_version >= "3.7" +pylint>=2.15.4,<3 pylint-per-file-ignores; python_version >= "3.7" pylint_quotes safety From 86004de44b9546efe0ae2c84d078bcec601c7c8c Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Tue, 9 Jan 2024 18:12:57 -0500 Subject: [PATCH 5/6] patch notebook kernel --- notebooks/ncml2stac.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/ncml2stac.ipynb b/notebooks/ncml2stac.ipynb index bfa3c8b..8ee8c2d 100644 --- a/notebooks/ncml2stac.ipynb +++ b/notebooks/ncml2stac.ipynb @@ -788,9 +788,9 @@ ], "metadata": { "kernelspec": { - "name": "stac", + "name": "ncml2stac", "language": "python", - "display_name": "stac" + "display_name": "ncml2stac" }, "language_info": { "codemirror_mode": { From a1faa5371e75dd7f2c68d0af73f54e862a220b9a Mon Sep 17 00:00:00 2001 From: Francis Charette Migneault Date: Tue, 9 Jan 2024 18:56:42 -0500 Subject: [PATCH 6/6] adjust notebook to use default python kernel --- notebooks/ncml2stac.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/ncml2stac.ipynb b/notebooks/ncml2stac.ipynb index 8ee8c2d..8da8ed4 100644 --- a/notebooks/ncml2stac.ipynb +++ b/notebooks/ncml2stac.ipynb @@ -788,7 +788,7 @@ ], "metadata": { "kernelspec": { - "name": "ncml2stac", + "name": "python3", "language": "python", "display_name": "ncml2stac" },