From a585d9ecd27cd96b628d7eaaaf7ad098ccbf9d7f Mon Sep 17 00:00:00 2001 From: David Huard Date: Tue, 27 Feb 2024 16:56:25 -0500 Subject: [PATCH 1/2] use cf-xarray heuristic to identify bound variables. Add test --- CHANGES.md | 3 +- STACpopulator/extensions/datacube.py | 54 +++---- ...3_historical_r2i1p1f1_gr_185001-201412.xml | 133 ++++++++++++++++++ tests/test_cmip6_datacube.py | 19 +++ 4 files changed, 183 insertions(+), 26 deletions(-) create mode 100644 tests/data/clt_Amon_EC-Earth3_historical_r2i1p1f1_gr_185001-201412.xml diff --git a/CHANGES.md b/CHANGES.md index 8cb305a..2a23b4f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,7 +2,8 @@ ## [Unreleased](https://github.com/crim-ca/stac-populator) (latest) - +* Make sure *bounds* variables are given the auxiliary type attribute. +* Fix for variables that have no attributes. ## [0.6.0](https://github.com/crim-ca/stac-populator/tree/0.6.0) (2024-02-22) diff --git a/STACpopulator/extensions/datacube.py b/STACpopulator/extensions/datacube.py index 2d52bb4..e504c7a 100644 --- a/STACpopulator/extensions/datacube.py +++ b/STACpopulator/extensions/datacube.py @@ -184,6 +184,7 @@ def dimensions(self) -> dict[str, Dimension]: def variables(self) -> dict[str, Variable]: """Return Variable objects required for Datacube extension.""" variables = {} + bounds = self.bounds() for name, meta in self.attrs["variables"].items(): if name in self.attrs["dimensions"]: @@ -192,44 +193,47 @@ def variables(self) -> dict[str, Variable]: # Some variables like "time_bnds" in some model files do not have any attributes. attrs = meta.get("attributes", {}) - self._infer_variable_units_description(name, attrs) + if name in bounds: + # Bounds are auxiliary variables + dtype = VariableType.AUXILIARY.value + + # We can safely assume that the bounds variable has the same units as the variable it bounds. 
+ if "units" not in attrs: + if (u := self.attrs["variables"][bounds[name]].get("attributes", {}).get("units")) is not None: + attrs["units"] = u + + elif self.is_coordinate(attrs): + # Using the CF-xarray heuristics to determine if variable is a coordinate. + dtype = VariableType.AUXILIARY.value + else: + dtype = VariableType.DATA.value variables[name] = Variable( properties=dict( dimensions=meta["shape"], - type=VariableType.AUXILIARY.value if self.is_coordinate(attrs) else VariableType.DATA.value, + type=dtype, description=attrs.get("description", attrs.get("long_name", "")), unit=attrs.get("units", ""), ) ) return variables - def _infer_variable_units_description(self, name, attrs): - """Try to infer the units and description of some simple coordinate variables.""" - if name == "time_bnds": - related_variable = "time" - attrs["description"] = "bounds for the time coordinate" - elif name == "lat_bnds": - related_variable = "lat" - attrs["description"] = "bounds for the latitude coordinate" - elif name == "lon_bnds": - related_variable = "lon" - attrs["description"] = "bounds for the longitude coordinate" - else: - return - - try: - attrs["units"] = self.attrs["variables"][related_variable]["attributes"]["units"] - except KeyError: - pass + def bounds(self): + """Return a dict mapping the value of each variable's "bounds" attribute to the name of the variable that declares it.""" + out = {} + for name, meta in self.attrs["variables"].items(): + attrs = meta.get("attributes", {}) + if "bounds" in attrs: + out[attrs["bounds"]] = name + return out - def is_coordinate(self, attrs: MutableMapping[str, Any]) -> bool: - """Return whether variable is a coordinate.""" - if (desc := attrs.get("description", None)) is not None: - if "bounds for" in desc: - return True + def is_coordinate(self, attrs: MutableMapping[str, Any]) -> bool: + """Return whether variable is a coordinate. + - data: a variable indicating some measured value, for example "precipitation", "temperature", etc. 
+ - auxiliary: a variable that contains coordinate data, but isn't a dimension in cube:dimensions. + """ for key, criteria in self.coordinate_criteria.items(): for criterion, expected in criteria.items(): if attrs.get(criterion, None) in expected: diff --git a/tests/data/clt_Amon_EC-Earth3_historical_r2i1p1f1_gr_185001-201412.xml b/tests/data/clt_Amon_EC-Earth3_historical_r2i1p1f1_gr_185001-201412.xml new file mode 100644 index 0000000..9e63c91 --- /dev/null +++ b/tests/data/clt_Amon_EC-Earth3_historical_r2i1p1f1_gr_185001-201412.xml @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_cmip6_datacube.py b/tests/test_cmip6_datacube.py index 9adafa0..bf8b908 100644 --- a/tests/test_cmip6_datacube.py +++ b/tests/test_cmip6_datacube.py @@ -35,3 +35,22 @@ def test_datacube_helper(): assert len(schemas) >= 2 assert "item.json" in schemas[0] assert "datacube" in schemas[1] + + +def test_auxiliary_variables(): + # https://github.com/crim-ca/stac-populator/issues/52 + + file_path = DIR / "data" / "clt_Amon_EC-Earth3_historical_r2i1p1f1_gr_185001-201412.xml" + + ds = xncml.Dataset(filepath=str(file_path)) + attrs = ds.to_cf_dict() + attrs["access_urls"] = {"HTTPServer": "http://example.com"} + item = CMIP6Helper(attrs, GeoJSONPolygon).stac_item() + + dc = DataCubeHelper(attrs) + dc_ext = DatacubeExtension.ext(item, add_if_missing=True) + dc_ext.apply(dimensions=dc.dimensions, variables=dc.variables) + + p = dc_ext.properties + assert set(['time', 'lat', 'lon']) == set(p['cube:dimensions'].keys()) + assert p["cube:variables"]["lon_bnds"]["unit"] == "degrees_east" From c99e29f1d70f83e2c068c661a4576d0b52b7219b Mon Sep 17 00:00:00 2001 From: David Huard Date: Thu, 29 Feb 2024 14:11:21 -0500 Subject: [PATCH 2/2] added bounds 
description --- STACpopulator/extensions/datacube.py | 3 +++ tests/test_cmip6_datacube.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/STACpopulator/extensions/datacube.py b/STACpopulator/extensions/datacube.py index e504c7a..b394416 100644 --- a/STACpopulator/extensions/datacube.py +++ b/STACpopulator/extensions/datacube.py @@ -202,6 +202,9 @@ def variables(self) -> dict[str, Variable]: if (u := self.attrs["variables"][bounds[name]].get("attributes", {}).get("units")) is not None: attrs["units"] = u + if "description" not in attrs: + attrs["description"] = f"bounds for the {bounds[name]} coordinate" + elif self.is_coordinate(attrs): # Using the CF-xarray heuristics to determine if variable is a coordinate. dtype = VariableType.AUXILIARY.value diff --git a/tests/test_cmip6_datacube.py b/tests/test_cmip6_datacube.py index d074417..511a378 100644 --- a/tests/test_cmip6_datacube.py +++ b/tests/test_cmip6_datacube.py @@ -55,5 +55,6 @@ def test_auxiliary_variables(): assert set(['time', 'lat', 'lon']) == set(p['cube:dimensions'].keys()) assert p["cube:variables"]["lon_bnds"]["unit"] == "degrees_east" assert p["cube:variables"]["time_bnds"]["unit"] == "days since 1850-01-01" - assert p["cube:variables"]["clt"]["type"] == "data" assert p["cube:variables"]["time_bnds"]["type"] == "auxiliary" + assert p["cube:variables"]["time_bnds"]["description"] == "bounds for the time coordinate" + assert p["cube:variables"]["clt"]["type"] == "data"