warn and return bytes undecoded in case of UnicodeDecodeError in h5netcdf-backend (#8874)

* warn and return bytes undecoded in case of UnicodeDecodeError in h5netcdf-backend
* add whats-new.rst entry
* merge maybe_decode_bytes function into _read_attributes, add attribute and variable name to warning
pydata · Mar 26, 2024 · 55173e8 · 55173e8
1 parent ee02113
commit 55173e8
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 12 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -60,15 +60,17 @@ Bug fixes
   `CFMaskCoder`/`CFScaleOffsetCoder` (:issue:`2304`, :issue:`5597`,
   :issue:`7691`, :pull:`8713`, see also discussion in :pull:`7654`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
-- do not cast `_FillValue`/`missing_value` in `CFMaskCoder` if `_Unsigned` is provided
+- Do not cast `_FillValue`/`missing_value` in `CFMaskCoder` if `_Unsigned` is provided
   (:issue:`8844`, :pull:`8852`).
 - Adapt handling of copy keyword argument for numpy >= 2.0dev
-  (:issue:`8844`, :pull:`8851`, :pull:`8865``).
+  (:issue:`8844`, :pull:`8851`, :pull:`8865`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
-- import trapz/trapezoid depending on numpy version.
+- Import trapz/trapezoid depending on numpy version
   (:issue:`8844`, :pull:`8865`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
-
+- Warn and return bytes undecoded in case of UnicodeDecodeError in h5netcdf-backend
+  (:issue:`5563`, :pull:`8874`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 
 
 Documentation

diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
@@ -28,6 +28,7 @@
 from xarray.core import indexing
 from xarray.core.utils import (
     FrozenDict,
+    emit_user_level_warning,
     is_remote_uri,
     read_magic_number_from_file,
     try_read_magic_number_from_file_or_path,
@@ -58,21 +59,23 @@ def _getitem(self, key):
             return array[key]
 
 
-def maybe_decode_bytes(txt):
-    if isinstance(txt, bytes):
-        return txt.decode("utf-8")
-    else:
-        return txt
-
-
 def _read_attributes(h5netcdf_var):
     # GH451
     # to ensure conventions decoding works properly on Python 3, decode all
     # bytes attributes to strings
     attrs = {}
     for k, v in h5netcdf_var.attrs.items():
         if k not in ["_FillValue", "missing_value"]:
-            v = maybe_decode_bytes(v)
+            if isinstance(v, bytes):
+                try:
+                    v = v.decode("utf-8")
+                except UnicodeDecodeError:
+                    emit_user_level_warning(
+                        f"'utf-8' codec can't decode bytes for attribute "
+                        f"{k!r} of h5netcdf object {h5netcdf_var.name!r}, "
+                        f"returning bytes undecoded.",
+                        UnicodeWarning,
+                    )
         attrs[k] = v
     return attrs
 

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -3560,6 +3560,16 @@ def test_dump_encodings_h5py(self) -> None:
             assert actual.x.encoding["compression"] == "lzf"
             assert actual.x.encoding["compression_opts"] is None
 
+    def test_decode_utf8_warning(self) -> None:
+        title = b"\xc3"
+        with create_tmp_file() as tmp_file:
+            with nc4.Dataset(tmp_file, "w") as f:
+                f.title = title
+            with pytest.warns(UnicodeWarning, match="returning bytes undecoded") as w:
+                ds = xr.load_dataset(tmp_file, engine="h5netcdf")
+                assert ds.title == title
+                assert "attribute 'title' of h5netcdf object '/'" in str(w[0].message)
+
 
 @requires_h5netcdf
 @requires_netCDF4