Skip to content

Commit

Permalink
Fix subgroup dims in HDF reader (#366)
Browse files Browse the repository at this point in the history
* regression test

* pass regression test

* fix other tests of internal methods

* cleaning minor nits

* test that opening an empty group raises a NotImplementedError

* change incorrect-looking return type

* release note

* fix bad merge
  • Loading branch information
TomNicholas authored Jan 22, 2025
1 parent 0d2d6ab commit 8f64962
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 16 deletions.
2 changes: 2 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Bug fixes

- Fix bug preventing generating references for the root group of a file when a subgroup exists.
(:issue:`336`, :pull:`338`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Fix bug in the HDF reader where the names of dimensions inside a subgroup were incorrect
  (prefixed with the group path). (:issue:`364`, :pull:`366`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Fix bug in dmrpp reader so _FillValue is included in variables' encodings.
(:pull:`369`) By `Aimee Barciauskas <https://github.com/abarciauskas-bgse>`_.
- Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.
Expand Down
44 changes: 31 additions & 13 deletions virtualizarr/readers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def add_chunk_info(blob):
return chunk_manifest

@staticmethod
def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
"""
Get a list of dimension scale names attached to input HDF5 dataset.
Expand All @@ -181,10 +181,12 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
----------
dataset : h5py.Dataset
An h5py dataset.
group : str
Name of the group we are pulling these dimensions from. Required for potentially removing subgroup prefixes.
Returns
-------
list
list[str]
List with HDF5 path names of dimension scales attached to input
dataset.
"""
Expand All @@ -208,7 +210,11 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
# In this case, we mimic netCDF4 and assign phony dimension names.
# See https://github.com/fsspec/kerchunk/issues/41
dims.append(f"phony_dim_{n}")
return dims

if not group.endswith("/"):
group += "/"

return [dim.removeprefix(group) for dim in dims]

@staticmethod
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
Expand Down Expand Up @@ -257,20 +263,25 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
return attrs

@staticmethod
def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]:
def _dataset_to_variable(
path: str,
dataset: H5Dataset,
group: str,
) -> Optional[xr.Variable]:
"""
Extract an xarray Variable with ManifestArray data from an h5py dataset
Parameters
----------
dataset : h5py.Dataset
An h5py dataset.
group : str
Name of the group containing this h5py.Dataset.
Returns
-------
list: xarray.Variable
A list of xarray variables.
"""
# This chunk determination logic mirrors zarr-python's create
# https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
Expand Down Expand Up @@ -305,7 +316,7 @@ def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]
shape=dataset.shape,
zarr_format=2,
)
dims = HDFVirtualBackend._dataset_dims(dataset)
dims = HDFVirtualBackend._dataset_dims(dataset, group=group)
manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
if manifest:
marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
Expand All @@ -330,37 +341,44 @@ def _virtual_vars_from_hdf(
----------
path: str
The path of the hdf5 file.
group: str
The name of the group for which to extract variables.
group: str, optional
The name of the group for which to extract variables. None refers to the root group.
drop_variables: list of str
A list of variable names to skip extracting.
reader_options: dict
A dictionary of reader options passed to fsspec when opening the
file.
A dictionary of reader options passed to fsspec when opening the file.
Returns
-------
dict
A dictionary of Xarray Variables with the variable names as keys.
"""
if drop_variables is None:
drop_variables = []

open_file = _FsspecFSFromFilepath(
filepath=path, reader_options=reader_options
).open_file()
f = h5py.File(open_file, mode="r")
if group:

if group is not None:
g = f[group]
group_name = group
if not isinstance(g, h5py.Group):
raise ValueError("The provided group is not an HDF group")
else:
g = f
group_name = ""

variables = {}
for key in g.keys():
if key not in drop_variables:
if isinstance(g[key], h5py.Dataset):
variable = HDFVirtualBackend._dataset_to_variable(path, g[key])
variable = HDFVirtualBackend._dataset_to_variable(
path=path,
dataset=g[key],
group=group_name,
)
if variable is not None:
variables[key] = variable
else:
Expand Down
36 changes: 33 additions & 3 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import h5py # type: ignore
import pytest

from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
requires_hdf5plugin,
Expand Down Expand Up @@ -90,22 +91,24 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
f = h5py.File(chunked_dimensions_netcdf4_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
chunked_dimensions_netcdf4_file, ds
chunked_dimensions_netcdf4_file, ds, group=""
)
assert var.chunks == (50, 50)

def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file):
f = h5py.File(single_dimension_scale_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
single_dimension_scale_hdf5_file, ds
single_dimension_scale_hdf5_file, ds, group=""
)
assert var.chunks == (2,)

def test_dataset_attributes(self, string_attributes_hdf5_file):
f = h5py.File(string_attributes_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(string_attributes_hdf5_file, ds)
var = HDFVirtualBackend._dataset_to_variable(
string_attributes_hdf5_file, ds, group=""
)
assert var.attrs["attribute_name"] == "attribute_name"


Expand Down Expand Up @@ -178,3 +181,30 @@ def test_coord_names(
maybe_open_loadable_vars_and_indexes.return_value = (0, 0)
HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file)
assert construct_virtual_dataset.call_args[1]["coord_names"] == ["lat", "lon"]


@requires_hdf5plugin
@requires_imagecodecs
@pytest.mark.parametrize("group", ["subgroup", "subgroup/"])
def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group):
    # Regression test for GH issue #364: dimension names read from a subgroup
    # must not carry the group-path prefix, with or without a trailing slash.
    virtual_ds = open_virtual_dataset(
        netcdf4_file_with_data_in_multiple_groups,
        backend=HDFVirtualBackend,
        group=group,
    )
    assert list(virtual_ds.dims) == ["dim_0"]


@requires_hdf5plugin
@requires_imagecodecs
def test_nested_groups(hdf5_groups_file):
    # Opening the root group of a file that contains nested groups is not yet
    # supported; it should raise rather than silently return partial results.
    expectation = pytest.raises(
        NotImplementedError, match="Nested groups are not yet supported"
    )
    with expectation:
        open_virtual_dataset(
            hdf5_groups_file, group="/", backend=HDFVirtualBackend
        )

0 comments on commit 8f64962

Please sign in to comment.