From ebc79bdddb2ab63bc8708496f93d55309dc5a7c4 Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Tue, 29 Apr 2025 13:11:09 -0500 Subject: [PATCH 1/7] Adds chunk key encoding to kwargs passed to zarr Accept chunk_key_encoding for Datasets and DataArrays Add test Add documentation for chunk key encoding --- doc/internals/zarr-encoding-spec.rst | 44 ++++++++++++++++++++++++++++ xarray/backends/zarr.py | 1 + xarray/tests/test_backends.py | 33 +++++++++++++++++++++ 3 files changed, 78 insertions(+) diff --git a/doc/internals/zarr-encoding-spec.rst b/doc/internals/zarr-encoding-spec.rst index 958dad166e1..9431d2bc945 100644 --- a/doc/internals/zarr-encoding-spec.rst +++ b/doc/internals/zarr-encoding-spec.rst @@ -73,3 +73,47 @@ re-open it directly with Zarr: import shutil shutil.rmtree("rasm.zarr") + +Chunk Key Encoding +----------------- + +When writing data to Zarr stores, Xarray supports customizing how chunk keys are encoded +through the ``chunk_key_encoding`` parameter in the variable's encoding dictionary. This +is particularly useful when working with Zarr V2 arrays and you need to control the +dimension separator in chunk keys. + +For example, to specify a custom separator for chunk keys: + +.. ipython:: python + :okwarning: + + import xarray as xr + import numpy as np + from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding + + # Create a custom chunk key encoding with "/" as separator + enc = V2ChunkKeyEncoding(separator="/").to_dict() + + # Create and write a dataset with custom chunk key encoding + arr = np.ones((42, 100)) + ds = xr.DataArray(arr, name="var1").to_dataset() + ds.to_zarr("example.zarr", zarr_format=2, mode="w", + encoding={"var1": {"chunks": (42, 50), + "chunk_key_encoding": enc}}) + +The ``chunk_key_encoding`` option accepts a dictionary that specifies the encoding +configuration. For Zarr V2 arrays, you can use the ``V2ChunkKeyEncoding`` class from +``zarr.core.chunk_key_encodings`` to generate this configuration. 
This is particularly +useful when you need to ensure compatibility with specific Zarr V2 storage layouts or +when working with tools that expect a particular chunk key format. + +.. note:: + The ``chunk_key_encoding`` option is only relevant when writing to Zarr stores. + When reading Zarr arrays, Xarray automatically detects and uses the appropriate + chunk key encoding based on the store's format and configuration. + +.. ipython:: python + :suppress: + + import shutil + shutil.rmtree("example.zarr") diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 1a46346dda7..b11646d711a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -464,6 +464,7 @@ def extract_zarr_variable_encoding( "serializer", "cache_metadata", "write_empty_chunks", + "chunk_key_encoding", } if zarr_format == 3: valid_encodings.add("fill_value") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1d9c90b37b1..3498fa4adeb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3661,6 +3661,39 @@ def create_zarr_target(self): else: yield {} + def test_chunk_key_encoding(self) -> None: + from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding + + # Create a dataset with a variable name containing a period + data = np.ones((4, 4)) + original = Dataset({"var1": (("x", "y"), data)}) + + # Set up chunk key encoding with slash separator + encoding = { + "var1": { + "chunk_key_encoding": V2ChunkKeyEncoding(separator="/").to_dict(), + "chunks": (2, 2), + } + } + + # Write to store with custom encoding + with self.create_zarr_target() as store: + original.to_zarr(store, encoding=encoding) + + # Verify the chunk keys in store use the slash separator + if not has_zarr_v3: + chunk_keys = [k for k in store.keys() if k.startswith("var1/")] + assert len(chunk_keys) > 0 + for key in chunk_keys: + assert "/" in key + assert "." 
not in key.split("/")[1:] # No dots in chunk coordinates + + # Read back and verify data + with xr.open_zarr(store) as actual: + assert_identical(original, actual) + # Verify chunks are preserved + assert actual["var1"].encoding["chunks"] == (2, 2) + @requires_zarr @pytest.mark.skipif( From 9f680007bdd1d92ec5cb3e7c95b32f010438c7ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 18:18:06 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/internals/zarr-encoding-spec.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/internals/zarr-encoding-spec.rst b/doc/internals/zarr-encoding-spec.rst index 9431d2bc945..62fc8c0a8d2 100644 --- a/doc/internals/zarr-encoding-spec.rst +++ b/doc/internals/zarr-encoding-spec.rst @@ -97,9 +97,12 @@ For example, to specify a custom separator for chunk keys: # Create and write a dataset with custom chunk key encoding arr = np.ones((42, 100)) ds = xr.DataArray(arr, name="var1").to_dataset() - ds.to_zarr("example.zarr", zarr_format=2, mode="w", - encoding={"var1": {"chunks": (42, 50), - "chunk_key_encoding": enc}}) + ds.to_zarr( + "example.zarr", + zarr_format=2, + mode="w", + encoding={"var1": {"chunks": (42, 50), "chunk_key_encoding": enc}}, + ) The ``chunk_key_encoding`` option accepts a dictionary that specifies the encoding configuration. For Zarr V2 arrays, you can use the ``V2ChunkKeyEncoding`` class from @@ -116,4 +119,5 @@ when working with tools that expect a particular chunk key format. 
:suppress: import shutil + shutil.rmtree("example.zarr") From 27c52cf50ecfcf74b8ab4ccb3e12f667243ccb5e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 29 Apr 2025 18:19:29 +0000 Subject: [PATCH 3/7] Update whats-new --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 868ce6005c0..a7fbb12e105 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -76,6 +76,8 @@ Bug fixes By `Mathias Hauser <https://github.com/mathause>`_. - Variables with no temporal dimension are left untouched by :py:meth:`~xarray.Dataset.convert_calendar`. (:issue:`10266`, :pull:`10268`) By `Pascal Bourgault <https://github.com/aulemahal>`_. +- Enable `chunk_key_encoding` in :py:meth:`~xarray.Dataset.to_zarr` for Zarr v2 Datasets (:pull:`10274`) + By `BrianMichell <https://github.com/BrianMichell>`_. Documentation ~~~~~~~~~~~~~ From 3249fb91c4e87d3f7501583b4c556f7ef757df76 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 29 Apr 2025 18:25:56 +0000 Subject: [PATCH 4/7] Fixing pre-commit --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a7fbb12e105..9e7f4ebf1b1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -76,7 +76,7 @@ Bug fixes By `Mathias Hauser <https://github.com/mathause>`_. - Variables with no temporal dimension are left untouched by :py:meth:`~xarray.Dataset.convert_calendar`. (:issue:`10266`, :pull:`10268`) By `Pascal Bourgault <https://github.com/aulemahal>`_. -- Enable `chunk_key_encoding` in :py:meth:`~xarray.Dataset.to_zarr` for Zarr v2 Datasets (:pull:`10274`) +- Enable ``chunk_key_encoding`` in :py:meth:`~xarray.Dataset.to_zarr` for Zarr v2 Datasets (:pull:`10274`) By `BrianMichell <https://github.com/BrianMichell>`_. 
Documentation From bcc0e5a3be193a0add45042861d5cba87f5eac2d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 29 Apr 2025 21:01:40 +0000 Subject: [PATCH 5/7] Add fallback for zarr<3 --- xarray/tests/test_backends.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3498fa4adeb..21709672cdc 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3662,7 +3662,12 @@ def create_zarr_target(self): yield {} def test_chunk_key_encoding(self) -> None: - from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding + try: + from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding + encoding = V2ChunkKeyEncoding(separator="/").to_dict() + except ImportError: + # Fallback for zarr<3 + encoding = {'name': 'v2', 'configuration': {'separator': '/'}} # Create a dataset with a variable name containing a period data = np.ones((4, 4)) @@ -3671,7 +3676,7 @@ def test_chunk_key_encoding(self) -> None: # Set up chunk key encoding with slash separator encoding = { "var1": { - "chunk_key_encoding": V2ChunkKeyEncoding(separator="/").to_dict(), + "chunk_key_encoding": encoding, "chunks": (2, 2), } } From 471061cea0cd1135e18b1218ebce557119e1fe4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Apr 2025 21:02:24 +0000 Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 21709672cdc..56a9e412fac 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3664,10 +3664,11 @@ def create_zarr_target(self): def test_chunk_key_encoding(self) -> None: try: from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding + encoding = 
V2ChunkKeyEncoding(separator="/").to_dict() except ImportError: # Fallback for zarr<3 - encoding = {'name': 'v2', 'configuration': {'separator': '/'}} + encoding = {"name": "v2", "configuration": {"separator": "/"}} # Create a dataset with a variable name containing a period data = np.ones((4, 4)) From 1ac47b9b04036b521654fc9cb2fe8a553e1b2eac Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 1 May 2025 15:24:57 +0000 Subject: [PATCH 7/7] Fix docs build warning --- doc/internals/zarr-encoding-spec.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/internals/zarr-encoding-spec.rst b/doc/internals/zarr-encoding-spec.rst index 62fc8c0a8d2..83f07929b03 100644 --- a/doc/internals/zarr-encoding-spec.rst +++ b/doc/internals/zarr-encoding-spec.rst @@ -75,7 +75,7 @@ re-open it directly with Zarr: shutil.rmtree("rasm.zarr") Chunk Key Encoding ------------------ +------------------ When writing data to Zarr stores, Xarray supports customizing how chunk keys are encoded through the ``chunk_key_encoding`` parameter in the variable's encoding dictionary. This