Skip to content

Commit

Permalink
Support for the new compression arguments. (#7551)
Browse files Browse the repository at this point in the history
* Support for the new compression arguments.

Use a dict for the arguments and update it with the encoding, so all variables are passed.

* significant_digit and other missing keys added

Should close #7634

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* test for the new compression argument

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* move the new test to TestNetCDF4Data

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* simplify this line (code review)

* Added entry to whats-new

Also removed an unnecessary call to the monkeypatch fixture.

* bump netcdf4 to 1.6.2 in min-all-deps.yml

* parametrize compression in test

* Revert "bump netcdf4 to 1.6.2 in min-all-deps.yml"

This reverts commit c2ce8d5.

* check netCDF4 version and skip test if netcdf4 version <1.6.2

* fix typing

* Larger chunks to avoid random blosc errors

With smaller chunks it raises "Blosc_FIlter Error: blosc_filter: Buffer is uncompressible." one out of three times.

* use decorator to skip old netCDF4 versions

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove stale version-property

* fix whats-new.rst

* fix requires-decorator

* fix for asserts of other tests that use test data

* Apply suggestions from code review

* Update xarray/tests/__init__.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update xarray/tests/test_backends.py

---------

Co-authored-by: garciam <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ryan Abernathey <[email protected]>
Co-authored-by: Kai Mühlbauer <[email protected]>
Co-authored-by: Kai Mühlbauer <[email protected]>
  • Loading branch information
7 people authored Dec 21, 2023
1 parent c35d6b6 commit a04900d
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 17 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ New Features

- :py:meth:`xr.cov` and :py:meth:`xr.corr` now support using weights (:issue:`8527`, :pull:`7392`).
By `Llorenç Lledó <https://github.com/lluritu>`_.
- Accept the compression arguments new in netCDF 1.6.0 in the netCDF4 backend.
See `netCDF4 documentation <https://unidata.github.io/netcdf4-python/#efficient-compression-of-netcdf-variables>`_ for details.
By `Markel García-Díez <https://github.com/markelg>`_. (:issue:`6929`, :pull:`7551`) Note that some
new compression filters need plugins to be installed, which may not be available in all netCDF distributions.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
25 changes: 17 additions & 8 deletions xarray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ def _extract_nc4_variable_encoding(
"_FillValue",
"dtype",
"compression",
"significant_digits",
"quantize_mode",
"blosc_shuffle",
"szip_coding",
"szip_pixels_per_block",
"endian",
}
if lsd_okay:
valid_encodings.add("least_significant_digit")
Expand Down Expand Up @@ -497,20 +503,23 @@ def prepare_variable(
if name in self.ds.variables:
nc4_var = self.ds.variables[name]
else:
nc4_var = self.ds.createVariable(
default_args = dict(
varname=name,
datatype=datatype,
dimensions=variable.dims,
zlib=encoding.get("zlib", False),
complevel=encoding.get("complevel", 4),
shuffle=encoding.get("shuffle", True),
fletcher32=encoding.get("fletcher32", False),
contiguous=encoding.get("contiguous", False),
chunksizes=encoding.get("chunksizes"),
zlib=False,
complevel=4,
shuffle=True,
fletcher32=False,
contiguous=False,
chunksizes=None,
endian="native",
least_significant_digit=encoding.get("least_significant_digit"),
least_significant_digit=None,
fill_value=fill_value,
)
default_args.update(encoding)
default_args.pop("_FillValue", None)
nc4_var = self.ds.createVariable(**default_args)

nc4_var.setncatts(attrs)

Expand Down
32 changes: 25 additions & 7 deletions xarray/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import importlib
import platform
import string
import warnings
from contextlib import contextmanager, nullcontext
from unittest import mock # noqa: F401
Expand Down Expand Up @@ -112,6 +113,10 @@ def _importorskip(
not has_h5netcdf_ros3[0], reason="requires h5netcdf 1.3.0"
)

has_netCDF4_1_6_2_or_above, requires_netCDF4_1_6_2_or_above = _importorskip(
"netCDF4", "1.6.2"
)

# change some global options for tests
set_options(warn_for_unclosed_files=True)

Expand Down Expand Up @@ -262,28 +267,41 @@ def assert_allclose(a, b, check_default_indexes=True, **kwargs):
xarray.testing._assert_internal_invariants(b, check_default_indexes)


def create_test_data(seed: int | None = None, add_attrs: bool = True) -> Dataset:
_DEFAULT_TEST_DIM_SIZES = (8, 9, 10)


def create_test_data(
seed: int | None = None,
add_attrs: bool = True,
dim_sizes: tuple[int, int, int] = _DEFAULT_TEST_DIM_SIZES,
) -> Dataset:
rs = np.random.RandomState(seed)
_vars = {
"var1": ["dim1", "dim2"],
"var2": ["dim1", "dim2"],
"var3": ["dim3", "dim1"],
}
_dims = {"dim1": 8, "dim2": 9, "dim3": 10}
_dims = {"dim1": dim_sizes[0], "dim2": dim_sizes[1], "dim3": dim_sizes[2]}

obj = Dataset()
obj["dim2"] = ("dim2", 0.5 * np.arange(_dims["dim2"]))
obj["dim3"] = ("dim3", list("abcdefghij"))
if _dims["dim3"] > 26:
raise RuntimeError(
f'Not enough letters for filling this dimension size ({_dims["dim3"]})'
)
obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]]))
obj["time"] = ("time", pd.date_range("2000-01-01", periods=20))
for v, dims in sorted(_vars.items()):
data = rs.normal(size=tuple(_dims[d] for d in dims))
obj[v] = (dims, data)
if add_attrs:
obj[v].attrs = {"foo": "variable"}
obj.coords["numbers"] = (
"dim3",
np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64"),
)

if dim_sizes == _DEFAULT_TEST_DIM_SIZES:
numbers_values = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3], dtype="int64")
else:
numbers_values = np.random.randint(0, 3, _dims["dim3"], dtype="int64")
obj.coords["numbers"] = ("dim3", numbers_values)
obj.encoding = {"foo": "bar"}
assert all(obj.data.flags.writeable for obj in obj.variables.values())
return obj
Expand Down
73 changes: 71 additions & 2 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
requires_h5netcdf_ros3,
requires_iris,
requires_netCDF4,
requires_netCDF4_1_6_2_or_above,
requires_pydap,
requires_pynio,
requires_scipy,
Expand Down Expand Up @@ -1486,7 +1487,7 @@ def test_dump_and_open_encodings(self) -> None:
assert ds.variables["time"].getncattr("units") == units
assert_array_equal(ds.variables["time"], np.arange(10) + 4)

def test_compression_encoding(self) -> None:
def test_compression_encoding_legacy(self) -> None:
data = create_test_data()
data["var2"].encoding.update(
{
Expand Down Expand Up @@ -1767,6 +1768,74 @@ def test_setncattr_string(self) -> None:
assert_array_equal(one_element_list_of_strings, totest.attrs["bar"])
assert one_string == totest.attrs["baz"]

@pytest.mark.parametrize(
"compression",
[
None,
"zlib",
"szip",
"zstd",
"blosc_lz",
"blosc_lz4",
"blosc_lz4hc",
"blosc_zlib",
"blosc_zstd",
],
)
@requires_netCDF4_1_6_2_or_above
@pytest.mark.xfail(ON_WINDOWS, reason="new compression not yet implemented")
def test_compression_encoding(self, compression: str | None) -> None:
data = create_test_data(dim_sizes=(20, 80, 10))
encoding_params: dict[str, Any] = dict(compression=compression, blosc_shuffle=1)
data["var2"].encoding.update(encoding_params)
data["var2"].encoding.update(
{
"chunksizes": (20, 40),
"original_shape": data.var2.shape,
"blosc_shuffle": 1,
"fletcher32": False,
}
)
with self.roundtrip(data) as actual:
expected_encoding = data["var2"].encoding.copy()
# compression does not appear in the retrieved encoding, which differs
# from the input encoding. shuffle also changes. Here we modify the
# expected encoding to account for this.
compression = expected_encoding.pop("compression")
blosc_shuffle = expected_encoding.pop("blosc_shuffle")
if compression is not None:
if "blosc" in compression and blosc_shuffle:
expected_encoding["blosc"] = {
"compressor": compression,
"shuffle": blosc_shuffle,
}
expected_encoding["shuffle"] = False
elif compression == "szip":
expected_encoding["szip"] = {
"coding": "nn",
"pixels_per_block": 8,
}
expected_encoding["shuffle"] = False
else:
# This will set a key like zlib=true which is what appears in
# the encoding when we read it.
expected_encoding[compression] = True
if compression == "zstd":
expected_encoding["shuffle"] = False
else:
expected_encoding["shuffle"] = False

actual_encoding = actual["var2"].encoding
assert expected_encoding.items() <= actual_encoding.items()
if (
encoding_params["compression"] is not None
and "blosc" not in encoding_params["compression"]
):
# regression test for #156
expected = data.isel(dim1=0)
with self.roundtrip(expected) as actual:
assert_equal(expected, actual)

@pytest.mark.skip(reason="https://github.com/Unidata/netcdf4-python/issues/1195")
def test_refresh_from_disk(self) -> None:
super().test_refresh_from_disk()
Expand Down Expand Up @@ -4518,7 +4587,7 @@ def test_extract_nc4_variable_encoding(self) -> None:
assert {} == encoding

@requires_netCDF4
def test_extract_nc4_variable_encoding_netcdf4(self, monkeypatch):
def test_extract_nc4_variable_encoding_netcdf4(self):
# New netCDF4 1.6.0 compression argument.
var = xr.Variable(("x",), [1, 2, 3], {}, {"compression": "szlib"})
_extract_nc4_variable_encoding(var, backend="netCDF4", raise_on_invalid=True)
Expand Down

0 comments on commit a04900d

Please sign in to comment.