diff --git a/docs/releases.rst b/docs/releases.rst
index 02b270bf..4a42bea4 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -35,6 +35,8 @@ Bug fixes
 - Fix bug preventing generating references for the root group of a file when a subgroup exists.
   (:issue:`336`, :pull:`338`) By `Tom Nicholas `_.
+- Fix bug in HDF reader where dimension names of dimensions in a subgroup would be incorrect.
+  (:issue:`364`, :pull:`366`) By `Tom Nicholas `_.
 - Fix bug in dmrpp reader so _FillValue is included in variables' encodings.
   (:pull:`369`) By `Aimee Barciauskas `_.
 - Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.
diff --git a/virtualizarr/readers/hdf/hdf.py b/virtualizarr/readers/hdf/hdf.py
index 7b03ea8e..98da515e 100644
--- a/virtualizarr/readers/hdf/hdf.py
+++ b/virtualizarr/readers/hdf/hdf.py
@@ -169,7 +169,7 @@ def add_chunk_info(blob):
         return chunk_manifest
 
     @staticmethod
-    def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
+    def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
         """
         Get a list of dimension scale names attached to input HDF5 dataset.
 
@@ -181,10 +181,12 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
         ----------
         dataset : h5py.Dataset
             An h5py dataset.
+        group : str
+            Name of the group containing the dataset; used to strip the subgroup prefix from dimension-scale paths.
 
         Returns
         -------
-        list
+        list[str]
             List with HDF5 path names of dimension scales attached to input
             dataset.
         """
@@ -208,7 +210,11 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
                 # In this case, we mimic netCDF4 and assign phony dimension names.
                 # See https://github.com/fsspec/kerchunk/issues/41
                 dims.append(f"phony_dim_{n}")
-        return dims
+
+        if not group.endswith("/"):
+            group += "/"
+
+        return [dim.removeprefix(group) for dim in dims]
 
     @staticmethod
     def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
@@ -257,7 +263,11 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
         return attrs
 
     @staticmethod
-    def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]:
+    def _dataset_to_variable(
+        path: str,
+        dataset: H5Dataset,
+        group: str,
+    ) -> Optional[xr.Variable]:
         """
         Extract an xarray Variable with ManifestArray data from an h5py dataset
 
@@ -265,12 +275,13 @@ def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]:
         ----------
         dataset : h5py.Dataset
             An h5py dataset.
+        group : str
+            Name of the group containing this h5py.Dataset.
 
         Returns
         -------
         list: xarray.Variable
             A list of xarray variables.
-
         """
         # This chunk determination logic mirrors zarr-python's create
         # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
@@ -305,7 +316,7 @@ def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]:
             shape=dataset.shape,
             zarr_format=2,
         )
-        dims = HDFVirtualBackend._dataset_dims(dataset)
+        dims = HDFVirtualBackend._dataset_dims(dataset, group=group)
         manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
         if manifest:
             marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
@@ -330,37 +341,44 @@ def _virtual_vars_from_hdf(
         ----------
         path: str
             The path of the hdf5 file.
-        group: str
-            The name of the group for which to extract variables.
+        group: str, optional
+            The name of the group for which to extract variables. None refers to the root group.
         drop_variables: list of str
             A list of variable names to skip extracting.
         reader_options: dict
-            A dictionary of reader options passed to fsspec when opening the
-            file.
+            A dictionary of reader options passed to fsspec when opening the file.
 
         Returns
         -------
         dict
             A dictionary of Xarray Variables with the variable names as keys.
-
         """
         if drop_variables is None:
             drop_variables = []
+
         open_file = _FsspecFSFromFilepath(
             filepath=path, reader_options=reader_options
         ).open_file()
         f = h5py.File(open_file, mode="r")
-        if group:
+
+        if group is not None:
             g = f[group]
+            group_name = group
             if not isinstance(g, h5py.Group):
                 raise ValueError("The provided group is not an HDF group")
         else:
             g = f
+            group_name = ""
+
         variables = {}
         for key in g.keys():
             if key not in drop_variables:
                 if isinstance(g[key], h5py.Dataset):
-                    variable = HDFVirtualBackend._dataset_to_variable(path, g[key])
+                    variable = HDFVirtualBackend._dataset_to_variable(
+                        path=path,
+                        dataset=g[key],
+                        group=group_name,
+                    )
                     if variable is not None:
                         variables[key] = variable
                 else:
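For illustration, here is a minimal standalone sketch of the prefix-stripping rule that _dataset_dims now applies to dimension-scale paths. The helper name strip_group_prefix is hypothetical (the reader inlines these two lines rather than defining a helper), and str.removeprefix requires Python 3.9+:

    def strip_group_prefix(dims: list[str], group: str = "") -> list[str]:
        # Normalise the group so it matches the "subgroup/" prefix that HDF5
        # dimension-scale paths carry for datasets stored inside that group.
        if not group.endswith("/"):
            group += "/"
        return [dim.removeprefix(group) for dim in dims]

    # Hypothetical paths: the subgroup prefix is stripped whether or not the
    # caller spells the group with a trailing slash...
    assert strip_group_prefix(["subgroup/dim_0"], "subgroup") == ["dim_0"]
    assert strip_group_prefix(["subgroup/dim_0"], "subgroup/") == ["dim_0"]
    # ...while root-group dimension names pass through unchanged.
    assert strip_group_prefix(["dim_0"]) == ["dim_0"]
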
diff --git a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
index 13a051d4..2e6c7c33 100644
--- a/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf/test_hdf.py
@@ -3,6 +3,7 @@
 import h5py  # type: ignore
 import pytest
 
+from virtualizarr import open_virtual_dataset
 from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     requires_hdf5plugin,
@@ -90,7 +91,7 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
         f = h5py.File(chunked_dimensions_netcdf4_file)
         ds = f["data"]
         var = HDFVirtualBackend._dataset_to_variable(
-            chunked_dimensions_netcdf4_file, ds
+            chunked_dimensions_netcdf4_file, ds, group=""
         )
         assert var.chunks == (50, 50)
 
@@ -98,14 +99,16 @@ def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file):
         f = h5py.File(single_dimension_scale_hdf5_file)
         ds = f["data"]
         var = HDFVirtualBackend._dataset_to_variable(
-            single_dimension_scale_hdf5_file, ds
+            single_dimension_scale_hdf5_file, ds, group=""
         )
         assert var.chunks == (2,)
 
     def test_dataset_attributes(self, string_attributes_hdf5_file):
         f = h5py.File(string_attributes_hdf5_file)
         ds = f["data"]
-        var = HDFVirtualBackend._dataset_to_variable(string_attributes_hdf5_file, ds)
+        var = HDFVirtualBackend._dataset_to_variable(
+            string_attributes_hdf5_file, ds, group=""
+        )
         assert var.attrs["attribute_name"] == "attribute_name"
 
 
@@ -178,3 +181,30 @@ def test_coord_names(
         maybe_open_loadable_vars_and_indexes.return_value = (0, 0)
         HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file)
         assert construct_virtual_dataset.call_args[1]["coord_names"] == ["lat", "lon"]
+
+
+@requires_hdf5plugin
+@requires_imagecodecs
+@pytest.mark.parametrize("group", ["subgroup", "subgroup/"])
+def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group):
+    # regression test for GH issue #364
+    vds = open_virtual_dataset(
+        netcdf4_file_with_data_in_multiple_groups,
+        group=group,
+        backend=HDFVirtualBackend,
+    )
+    assert list(vds.dims) == ["dim_0"]
+
+
+@requires_hdf5plugin
+@requires_imagecodecs
+def test_nested_groups(hdf5_groups_file):
+    # opening the root group of a file with nested subgroups is not yet supported
+    with pytest.raises(
+        NotImplementedError, match="Nested groups are not yet supported"
+    ):
+        open_virtual_dataset(
+            hdf5_groups_file,
+            group="/",
+            backend=HDFVirtualBackend,
+        )
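A hedged usage sketch of the fixed behaviour, mirroring the new regression test; the file name data.nc and its "subgroup" layout are assumptions for illustration, not fixtures from the repository:

    from virtualizarr import open_virtual_dataset
    from virtualizarr.readers.hdf import HDFVirtualBackend

    # Before this fix, a variable stored in "subgroup" would report dimension
    # names like "subgroup/dim_0"; with the fix the plain names come through.
    vds = open_virtual_dataset("data.nc", group="subgroup", backend=HDFVirtualBackend)
    print(list(vds.dims))  # e.g. ["dim_0"]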