Skip to content

Commit

Permalink
Fix subgroup dims in HDF reader (#366)
Browse files Browse the repository at this point in the history
* regression test

* pass regression test

* fix other tests of internal methods

* cleaning minor nits

* test that opening an empty group raises a NotImplementedError

* change incorrect-looking return type

* release note

* fix bad merge
  • Loading branch information
TomNicholas authored Jan 22, 2025
1 parent 0d2d6ab commit 8f64962
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 16 deletions.
2 changes: 2 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Bug fixes

- Fix bug preventing generating references for the root group of a file when a subgroup exists.
(:issue:`336`, :pull:`338`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Fix bug in the HDF reader where the names of dimensions inside a subgroup were incorrect
  (prefixed with the group path). (:issue:`364`, :pull:`366`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Fix bug in dmrpp reader so _FillValue is included in variables' encodings.
(:pull:`369`) By `Aimee Barciauskas <https://github.com/abarciauskas-bgse>`_.
- Fix bug passing arguments to FITS reader, and test it on Hubble Space Telescope data.
Expand Down
44 changes: 31 additions & 13 deletions virtualizarr/readers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def add_chunk_info(blob):
return chunk_manifest

@staticmethod
def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
def _dataset_dims(dataset: H5Dataset, group: str = "") -> List[str]:
"""
Get a list of dimension scale names attached to input HDF5 dataset.
Expand All @@ -181,10 +181,12 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
----------
dataset : h5py.Dataset
An h5py dataset.
group : str
Name of the group we are pulling these dimensions from. Required for potentially removing subgroup prefixes.
Returns
-------
list
list[str]
List with HDF5 path names of dimension scales attached to input
dataset.
"""
Expand All @@ -208,7 +210,11 @@ def _dataset_dims(dataset: H5Dataset) -> Union[List[str], List[None]]:
# In this case, we mimic netCDF4 and assign phony dimension names.
# See https://github.com/fsspec/kerchunk/issues/41
dims.append(f"phony_dim_{n}")
return dims

if not group.endswith("/"):
group += "/"

return [dim.removeprefix(group) for dim in dims]

@staticmethod
def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
Expand Down Expand Up @@ -257,20 +263,25 @@ def _extract_attrs(h5obj: Union[H5Dataset, H5Group]):
return attrs

@staticmethod
def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]:
def _dataset_to_variable(
path: str,
dataset: H5Dataset,
group: str,
) -> Optional[xr.Variable]:
"""
Extract an xarray Variable with ManifestArray data from an h5py dataset
Parameters
----------
dataset : h5py.Dataset
An h5py dataset.
group : str
Name of the group containing this h5py.Dataset.
Returns
-------
list: xarray.Variable
A list of xarray variables.
"""
# This chunk determination logic mirrors zarr-python's create
# https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
Expand Down Expand Up @@ -305,7 +316,7 @@ def _dataset_to_variable(path: str, dataset: H5Dataset) -> Optional[xr.Variable]
shape=dataset.shape,
zarr_format=2,
)
dims = HDFVirtualBackend._dataset_dims(dataset)
dims = HDFVirtualBackend._dataset_dims(dataset, group=group)
manifest = HDFVirtualBackend._dataset_chunk_manifest(path, dataset)
if manifest:
marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
Expand All @@ -330,37 +341,44 @@ def _virtual_vars_from_hdf(
----------
path: str
The path of the hdf5 file.
group: str
The name of the group for which to extract variables.
group: str, optional
The name of the group for which to extract variables. None refers to the root group.
drop_variables: list of str
A list of variable names to skip extracting.
reader_options: dict
A dictionary of reader options passed to fsspec when opening the
file.
A dictionary of reader options passed to fsspec when opening the file.
Returns
-------
dict
A dictionary of Xarray Variables with the variable names as keys.
"""
if drop_variables is None:
drop_variables = []

open_file = _FsspecFSFromFilepath(
filepath=path, reader_options=reader_options
).open_file()
f = h5py.File(open_file, mode="r")
if group:

if group is not None:
g = f[group]
group_name = group
if not isinstance(g, h5py.Group):
raise ValueError("The provided group is not an HDF group")
else:
g = f
group_name = ""

variables = {}
for key in g.keys():
if key not in drop_variables:
if isinstance(g[key], h5py.Dataset):
variable = HDFVirtualBackend._dataset_to_variable(path, g[key])
variable = HDFVirtualBackend._dataset_to_variable(
path=path,
dataset=g[key],
group=group_name,
)
if variable is not None:
variables[key] = variable
else:
Expand Down
36 changes: 33 additions & 3 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import h5py # type: ignore
import pytest

from virtualizarr import open_virtual_dataset
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
requires_hdf5plugin,
Expand Down Expand Up @@ -90,22 +91,24 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
f = h5py.File(chunked_dimensions_netcdf4_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
chunked_dimensions_netcdf4_file, ds
chunked_dimensions_netcdf4_file, ds, group=""
)
assert var.chunks == (50, 50)

def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file):
f = h5py.File(single_dimension_scale_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(
single_dimension_scale_hdf5_file, ds
single_dimension_scale_hdf5_file, ds, group=""
)
assert var.chunks == (2,)

def test_dataset_attributes(self, string_attributes_hdf5_file):
f = h5py.File(string_attributes_hdf5_file)
ds = f["data"]
var = HDFVirtualBackend._dataset_to_variable(string_attributes_hdf5_file, ds)
var = HDFVirtualBackend._dataset_to_variable(
string_attributes_hdf5_file, ds, group=""
)
assert var.attrs["attribute_name"] == "attribute_name"


Expand Down Expand Up @@ -178,3 +181,30 @@ def test_coord_names(
maybe_open_loadable_vars_and_indexes.return_value = (0, 0)
HDFVirtualBackend.open_virtual_dataset(root_coordinates_hdf5_file)
assert construct_virtual_dataset.call_args[1]["coord_names"] == ["lat", "lon"]


@requires_hdf5plugin
@requires_imagecodecs
@pytest.mark.parametrize("group", ["subgroup", "subgroup/"])
def test_subgroup_variable_names(netcdf4_file_with_data_in_multiple_groups, group):
    # Regression test for GH issue #364: dimension names read from a subgroup
    # must not carry the group-path prefix, with or without a trailing slash.
    virtual_ds = open_virtual_dataset(
        netcdf4_file_with_data_in_multiple_groups,
        backend=HDFVirtualBackend,
        group=group,
    )
    assert list(virtual_ds.dims) == ["dim_0"]


@requires_hdf5plugin
@requires_imagecodecs
def test_nested_groups(hdf5_groups_file):
    # Opening the root group of a file that contains nested groups is not yet
    # supported; it should raise rather than silently return partial results.
    expectation = pytest.raises(
        NotImplementedError, match="Nested groups are not yet supported"
    )
    with expectation:
        open_virtual_dataset(
            hdf5_groups_file, group="/", backend=HDFVirtualBackend
        )

0 comments on commit 8f64962

Please sign in to comment.