diff --git a/notebooks/ncml2stac.ipynb b/notebooks/ncml2stac.ipynb index 56483f5..d1e60d5 100644 --- a/notebooks/ncml2stac.ipynb +++ b/notebooks/ncml2stac.ipynb @@ -8,7 +8,7 @@ "This notebook should be compiled into a standalone *CWL* definition using the following command:\n", "\n", "```shell\n", - "jupyter-repo2cwl \"https://github.com/crim-ca/ncml2sta\" -o /tmp\n", + "jupyter-repo2cwl \"https://github.com/crim-ca/ncml2stac\" -o /tmp\n", "```\n", "(replace the Git repository URL by the path if the clone locally)\n", "\n", @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "outputs": [], "source": [ "# NOTE:\n", @@ -71,8 +71,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:47.572589682Z", - "start_time": "2023-10-19T22:52:47.529149722Z" + "end_time": "2024-01-09T20:52:53.340694868Z", + "start_time": "2024-01-09T20:52:53.336885229Z" } }, "id": "61f43c81dc3aa6c2" @@ -94,19 +94,19 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cloning into '/home/francis/.esdoc/pyessv-archive'...\r\n", - "remote: Enumerating objects: 63068, done.\u001B[K\r\n", - "remote: Counting objects: 100% (1557/1557), done.\u001B[K\r\n", - "remote: Compressing objects: 100% (476/476), done.\u001B[K\r\n", - "remote: Total 63068 (delta 1258), reused 1327 (delta 1070), pack-reused 61511\u001B[Ks: 22% (13875/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 24% (15137/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 26% (16398/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 28% (17660/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 29% (18684/63068), 1.07 MiB | 708.00 KiB/sReceiving objects: 31% (19552/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 34% (21444/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 36% (22705/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 40% (25228/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 42% (26489/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 45% (28381/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 48% (30273/63068), 1.71 MiB | 849.00 KiB/sReceiving objects: 50% (31534/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 53% (33427/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 55% (34688/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 58% (36580/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 61% (38472/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 66% (41625/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 68% (42887/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 70% (44148/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 74% (46671/63068), 2.49 MiB | 986.00 KiB/sReceiving objects: 76% (47932/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 79% (49824/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 81% (51086/63068), 3.43 MiB | 1.11 MiB/sReceiving objects: 83% (52347/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 85% (53608/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 87% (54870/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 90% (56762/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 93% (58654/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 94% (59744/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 96% (60546/63068), 4.40 MiB | 1.22 MiB/sReceiving objects: 98% (61807/63068), 5.50 MiB | 1.34 MiB/s\r\n", - "Receiving objects: 100% (63068/63068), 6.06 MiB | 1.40 MiB/s, done.\r\n", - "Resolving deltas: 100% (60270/60270), done.\r\n", + "remote: Enumerating objects: 7728, done.\u001B[K\r\n", + "remote: Counting objects: 100% (7728/7728), done.\u001B[K\r\n", + "remote: Compressing objects: 100% (2840/2840), done.\u001B[K\r\n", + "remote: Total 7728 (delta 6653), reused 5274 (delta 4866), pack-reused 0\u001B[K\r\n", + "Receiving objects: 100% (7728/7728), 806.86 KiB | 5.68 MiB/s, done.\r\n", + "Resolving deltas: 100% (6653/6653), done.\r\n", "\r\n", "Local identity for pyessv-archive set to \"Francis Charette Migneault \"\r\n" ] @@ -115,20 +115,20 @@ "source": [ "!rm -fr ~/.esdoc/pyessv-archive\n", "!mkdir -p ~/.esdoc/\n", - "!git clone https://github.com/ES-DOC/pyessv-archive ~/.esdoc/pyessv-archive" + "!git clone --depth 1 https://github.com/ES-DOC/pyessv-archive ~/.esdoc/pyessv-archive" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:53.446668050Z", - "start_time": "2023-10-19T22:52:47.571096999Z" + "end_time": "2024-01-09T20:46:18.837092675Z", + "start_time": "2024-01-09T20:46:17.329893874Z" } }, "id": "f10d85e12b47da43" }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "outputs": [], "source": [ "import hashlib\n", @@ -151,15 +151,15 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:53.452740519Z", - "start_time": "2023-10-19T22:52:53.448504173Z" + "end_time": "2024-01-09T20:53:47.648766094Z", + "start_time": "2024-01-09T20:53:47.274668578Z" } }, "id": "f68ea4339c5e4a9d" }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "outputs": [ { "name": "stdout", @@ -246,7 +246,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -372,86 +372,29 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:52:53.626782964Z", - "start_time": "2023-10-19T22:52:53.453091991Z" + "end_time": "2024-01-09T20:53:00.747428182Z", + "start_time": "2024-01-09T20:53:00.626261513Z" } }, "id": "4fc2f66493dc56c5" }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " \u001B[32mINFO:\u001B[0m \u001B[34m[STACpopulator.input ]\u001B[0m Requesting NcML dataset description\u001B[0m\n", - " \u001B[33mWARNING:\u001B[0m \u001B[34m[STACpopulator.implementations.CMIP6_UofT.add_CMIP6]\u001B[0m Failed to add Datacube extension to item sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\u001B[0m\n" + " \u001B[32mINFO:\u001B[0m \u001B[34m[STACpopulator.input ]\u001B[0m Requesting NcML dataset description\u001B[0m\n" ] } ], "source": [ - "# # FIXME: duplicate code\n", - "# # this is defined in:\n", - "# # https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L102-L116\n", - "# # but we cannot import it since outside of installed 'STACpopulator' module\n", - "# def make_cmip6_item_id(_attrs: \"JsonLike\") -> str:\n", - "# \"\"\"Return a unique ID for CMIP6 data item.\"\"\"\n", - "# keys = [\n", - "# \"activity_id\",\n", - "# \"institution_id\",\n", - "# \"source_id\",\n", - "# \"experiment_id\",\n", - "# \"variant_label\",\n", - "# \"table_id\",\n", - "# \"variable_id\",\n", - "# \"grid_label\",\n", - "# ]\n", - "# name = \"_\".join(_attrs[k] for k in keys)\n", - "# return hashlib.md5(name.encode(\"utf-8\")).hexdigest()\n", - "#\n", - "#\n", - "# # FIXME: temporary patch of URL/Media-Type\n", - "# # https://github.com/crim-ca/stac-populator/pull/23#discussion_r1341819744\n", - "# class CFJsonItemNetCDF(CFJsonItem):\n", - "# def item_link(self) -> pystac.Link:\n", - "# url = self.attrs[\"@location\"] # NetCDF URL\n", - "# name = self.attrs[\"groups\"][\"THREDDSMetadata\"][\"attributes\"][\"id\"]\n", - "# path = url.split(name, 1)[0]\n", - "# parts = list(filter(lambda _: bool(_), path.rsplit(\"/\", 3)))\n", - "# service = parts[-2] # always 1 path part for the service\n", - "# link = pystac.Link(\n", - "# rel=\"source\",\n", - "# target=url,\n", - "# media_type=\"application/x-netcdf\",\n", - "# title=f\"{service}:{name}\"\n", - "# )\n", - "# return link\n", - "#\n", - "#\n", - "# # FIXME: partial duplicate code\n", - "# # https://github.com/crim-ca/stac-populator/blob/arch-changes/implementations/CMIP6-UofT/add_CMIP6.py#L138-L165\n", - "# # should be combined into a single callable function that doesn't depend on the rest of the THREDDS crawling iterator\n", - "# ds = xncml.Dataset(input_ncml)\n", - "# input_ncml_data = ds.to_cf_dict()\n", - "#\n", - "# # FIXME: AttributeError\n", - "# nc_services = getattr(ds, \"access_urls\", None)\n", - "# if nc_services:\n", - "# attrs[\"access_urls\"] = nc_services\n", - "#\n", - "# stac_item_id = make_cmip6_item_id(attrs[\"attributes\"])\n", - "# attrs[\"id\"] = stac_item_id\n", - "# stac_item = CFJsonItemNetCDF(stac_item_id, attrs, cmip6.Properties)\n", - "# DatacubeExt(stac_item)\n", - "\n", - "# class NCMLSingleFileLoader(THREDDSLoader):\n", - "# def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None):\n", - "# # ignore original THREDDSLoader init, move directly to its parent\n", - "# # don't automatically parse whole THREDDS catalog!\n", - "# super(THREDDSLoader, self).__init__()\n", - "\n", + "# NOTE:\n", + "# Since we are only interested to convert a single NCML to STAC Item,\n", + "# override the logic of the provided populator such that it does not\n", + "# automatically iterate over the complete THREDDS catalog contents.\n", "\n", "class CMIP6SingleFilePopulator(CMIP6populator):\n", " # WARNING:\n", @@ -464,14 +407,7 @@ " # thredds_catalog_url should be:\n", " # https://svc.com/thredds/catalog/some/nested/catalog.xml\"\n", " def __init__(self, stac_host, thredds_catalog_url, target_item, update=False):\n", - " ##self.data_loader = NCMLSingleFileLoader(thredds_catalog_url, depth=0)\n", - " # ignore original CMIP6populator init, as well as its parent STACpopulatorBase\n", - " # - don't automatically parse whole THREDDS catalog!\n", - " # - don't check for unnecessary 'collection_config.yml' file\n", - " #super(CMIP6populator, self).__init__(stac_host, self.data_loader, update)\n", - " ###super().__init__(stac_host, thredds_catalog_url, update=update)\n", - "\n", - " # FIXME: just reimplement what is needed (not config needed, don't care about STAC Collections...)\n", + " # FIXME: just reimplement what is needed (no config needed, we don't care about STAC Collections...)\n", " self.target_item = target_item\n", " self._stac_host = stac_host\n", " self._ingest_pipeline = THREDDSLoader(thredds_catalog_url, depth=1)\n", @@ -512,26 +448,16 @@ "# technically invalid STAC host, but just need something for URL schema validation\n", "stac_host = f\"{input_ncml_href_parsed.scheme}://{input_ncml_href_parsed.netloc}\"\n", "cmip6_pop = CMIP6SingleFilePopulator(stac_host, input_ncml_catalog_href, target_item=input_ncml_target_name)\n", - "# ncml_xml = siphon.catalog.ET.fromstring(input_ncml_xml)\n", - "#\n", - "# # FIXME: hack, missing 'name' in XML header when accessing NCML directly, but available in nested 'dataset' attribute\n", - "# ncml_name = ncml_xml.attrib.get(\"name\") or catalog_xml.find(catalog_xml.tag.rsplit(\"catalog\", 1)[0] + \"dataset\").attrib[\"ID\"]\n", - "# ncml_xml.attrib.setdefault(\"name\", ncml_name)\n", - "# catalog_xml.attrib.setdefault(\"name\", ncml_name)\n", - "# catalog_ds = siphon.catalog.Dataset(catalog_xml)\n", - "# catalog_loader = siphon.catalog.TDSCatalog(input_ncml_catalog_href)\n", - "# ncml_data = cmip6_pop.data_loader.extract_metadata(catalog_ds)\n", "\n", - "# FIXME: do what ingest(), would to, triggering the full processing chains via iter, but without the POST steps\n", - "for ncml_name, ncml_data in cmip6_pop._ingest_pipeline:\n", - " stac_item_data = cmip6_pop.create_stac_item(ncml_name, ncml_data)\n", - " break" + "# FIXME: do what ingest() would do, triggering the full processing chains via iter, but without the POST step to STAC API\n", + "ncml_name, ncml_data = next(iter(cmip6_pop._ingest_pipeline))\n", + "stac_item_data = cmip6_pop.create_stac_item(ncml_name, ncml_data)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:58:02.461289611Z", - "start_time": "2023-10-19T22:58:02.017205859Z" + "end_time": "2024-01-09T21:00:20.939457054Z", + "start_time": "2024-01-09T21:00:20.611135424Z" } }, "id": "299946ccd58e2efc" @@ -548,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "outputs": [ { "name": "stdout", @@ -561,7 +487,6 @@ " \"properties\": {\n", " \"start_datetime\": \"2019-12-06T12:00:00Z\",\n", " \"end_datetime\": \"2020-11-04T12:00:00Z\",\n", - " \"datetime\": null,\n", " \"cmip6:Conventions\": \"CF-1.7 CMIP-6.2\",\n", " \"cmip6:activity_id\": \"ScenarioMIP\",\n", " \"cmip6:creation_date\": \"2019-09-25T23:01:33Z\",\n", @@ -596,7 +521,112 @@ " \"cmip6:product\": \"model-output\",\n", " \"cmip6:license\": \"CMIP6 model data produced by The Government of Canada (Canadian Centre for Climate Modelling and Analysis, Environment and Climate Change Canada) is licensed under a Creative Commons Attribution ShareAlike 4.0 International License (https://creativecommons.org/licenses). Consult https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 output, including citation requirements and proper acknowledgment. Further information about this data, including some limitations, can be found via the further_info_url (recorded as a global attribute in this file) and at https:///pcmdi.llnl.gov/. The data producers and data providers make no warranty, either express or implied, including, but not limited to, warranties of merchantability and fitness for a particular purpose. All liabilities arising from the supply of the information (including any liability arising in negligence) are excluded to the fullest extent permitted by law.\",\n", " \"cmip6:grid\": \"ORCA1 tripolar grid, 1 deg with refinement to 1/3 deg within 20 degrees of the equator; 361 x 290 longitude/latitude; 45 vertical levels; top grid cell 0-6.19 m\",\n", - " \"cmip6:mip_era\": \"CMIP6\"\n", + " \"cmip6:mip_era\": \"CMIP6\",\n", + " \"cube:dimensions\": {\n", + " \"time\": {\n", + " \"type\": \"temporal\",\n", + " \"extent\": [\n", + " \"2019-12-06T12:00:00Z\",\n", + " \"2020-11-04T12:00:00Z\"\n", + " ],\n", + " \"description\": \"time\"\n", + " },\n", + " \"j\": {\n", + " \"type\": \"spatial\",\n", + " \"extent\": [\n", + " 0,\n", + " 291\n", + " ],\n", + " \"description\": \"projection_y_coordinate\",\n", + " \"axis\": \"y\"\n", + " },\n", + " \"i\": {\n", + " \"type\": \"spatial\",\n", + " \"extent\": [\n", + " 0,\n", + " 360\n", + " ],\n", + " \"description\": \"projection_x_coordinate\",\n", + " \"axis\": \"x\"\n", + " }\n", + " },\n", + " \"cube:variables\": {\n", + " \"time_bnds\": {\n", + " \"dimensions\": [\n", + " \"time\",\n", + " \"bnds\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"vertices_latitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\",\n", + " \"vertices\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"vertices_longitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\",\n", + " \"vertices\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"siconc\": {\n", + " \"dimensions\": [\n", + " \"time\",\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"Sea-Ice Area Percentage (Ocean Grid)\",\n", + " \"unit\": \"%\"\n", + " },\n", + " \"areacello\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"Grid-Cell Area for Ocean Variables\",\n", + " \"unit\": \"m2\"\n", + " },\n", + " \"type\": {\n", + " \"dimensions\": [\n", + " \"maxStrlen64\"\n", + " ],\n", + " \"type\": \"data\",\n", + " \"description\": \"Sea Ice area type\",\n", + " \"unit\": \"\"\n", + " },\n", + " \"latitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"auxiliary\",\n", + " \"description\": \"latitude\",\n", + " \"unit\": \"degrees_north\"\n", + " },\n", + " \"longitude\": {\n", + " \"dimensions\": [\n", + " \"j\",\n", + " \"i\"\n", + " ],\n", + " \"type\": \"auxiliary\",\n", + " \"description\": \"longitude\",\n", + " \"unit\": \"degrees_east\"\n", + " }\n", + " },\n", + " \"datetime\": null\n", " },\n", " \"geometry\": {\n", " \"type\": \"Polygon\",\n", @@ -641,29 +671,27 @@ " \"data\"\n", " ]\n", " },\n", - " \"OPENDAP\": {\n", + " \"OpenDAP\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/dodsC/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", " \"type\": \"text/html\",\n", " \"roles\": [\n", " \"data\"\n", " ]\n", " },\n", - " \"NCML\": {\n", + " \"NcML\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/ncml/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", - " \"type\": \"application/xml\",\n", - " \"roles\": [\n", - " \"metadata\"\n", - " ]\n", + " \"type\": \"\",\n", + " \"roles\": []\n", " },\n", " \"UDDC\": {\n", - " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\"\n", + " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/uddc/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", + " \"type\": \"\",\n", + " \"roles\": []\n", " },\n", " \"ISO\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/iso/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", - " \"type\": \"application/xml\",\n", - " \"roles\": [\n", - " \"metadata\"\n", - " ]\n", + " \"type\": \"\",\n", + " \"roles\": []\n", " },\n", " \"WCS\": {\n", " \"href\": \"https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/wcs/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc\",\n", @@ -693,7 +721,10 @@ " 359.99493408203125,\n", " 89.74176788330078\n", " ],\n", - " \"stac_extensions\": []\n", + " \"stac_extensions\": [\n", + " \"https://raw.githubusercontent.com/TomAugspurger/cmip6/main/json-schema/schema.json\",\n", + " \"https://stac-extensions.github.io/datacube/v2.2.0/schema.json\"\n", + " ]\n", "}\n" ] } @@ -719,8 +750,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2023-10-19T22:58:04.279186938Z", - "start_time": "2023-10-19T22:58:04.267912228Z" + "end_time": "2024-01-09T21:00:30.701468612Z", + "start_time": "2024-01-09T21:00:30.695291176Z" } }, "id": "4eeb52c23edccb31" @@ -737,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "outputs": [], "source": [ "# NOTE:\n", @@ -750,7 +781,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "start_time": "2023-10-19T18:24:46.424257773Z" + "end_time": "2024-01-09T20:54:12.730333186Z", + "start_time": "2024-01-09T20:54:12.724897702Z" } }, "id": "e4fa98fcad8b5556" @@ -758,9 +790,9 @@ ], "metadata": { "kernelspec": { - "name": "ncml2stac", + "name": "stac", "language": "python", - "display_name": "ncml2stac" + "display_name": "stac" }, "language_info": { "codemirror_mode": {