Skip to content

Numcodecs in v3 #3037

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
40 changes: 31 additions & 9 deletions src/zarr/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from importlib.metadata import entry_points as get_entry_points
from typing import TYPE_CHECKING, Any, Generic, TypeVar

import numcodecs

from zarr.core.config import BadConfigError, config
from zarr.core.dtype import data_type_registry

Expand Down Expand Up @@ -177,6 +179,20 @@
return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type]


def numcodec_to_zarr3_codec(codec: numcodecs.abc.Codec) -> Codec:
    """
    Build the zarr v3 counterpart of a numcodecs codec.

    The codec's configuration is read through ``get_config()``. Its ``"id"``
    entry is removed and used to form the registry key ``numcodecs.<id>``;
    the remaining entries are passed as the ``"configuration"`` of the dict
    handed to the registered class's ``from_dict``.

    Parameters
    ----------
    codec : numcodecs.abc.Codec
        The codec instance to convert.

    Returns
    -------
    Codec
        The object returned by ``from_dict`` of the class registered under
        ``numcodecs.<id>``.

    Raises
    ------
    ValueError
        If the configuration has no ``"id"`` entry (or the entry is ``None``),
        or if the registry lookup returns ``None``.
    """
    configuration = codec.get_config()

    # The id is not part of the v3 "configuration" payload, so take it out here.
    codec_id = configuration.pop("id", None)
    if codec_id is None:
        raise ValueError(f"Codec configuration does not contain 'id': {configuration}")

    registry_key = f"numcodecs.{codec_id}"
    v3_codec_cls = get_codec_class(registry_key)
    if v3_codec_cls is None:
        raise ValueError(f"Codec class for '{registry_key}' not found.")

    v3_metadata = {"name": registry_key, "configuration": configuration}
    return v3_codec_cls.from_dict(v3_metadata)

Check warning on line 193 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L186-L193

Added lines #L186 - L193 were not covered by tests


def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec:
"""
Normalize the input to a ``BytesBytesCodec`` instance.
Expand All @@ -185,15 +201,17 @@
"""
from zarr.abc.codec import BytesBytesCodec

if isinstance(data, dict):
if isinstance(data, numcodecs.abc.Codec):
result = numcodec_to_zarr3_codec(data)

Check warning on line 205 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L205

Added line #L205 was not covered by tests
elif isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, BytesBytesCodec):
msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead."
raise TypeError(msg)
else:
if not isinstance(data, BytesBytesCodec):
raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.")
result = data
if not isinstance(result, BytesBytesCodec):
raise TypeError(f"Expected a BytesBytesCodec. Got {type(result)} instead.")

Check warning on line 214 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L214

Added line #L214 was not covered by tests
return result


Expand All @@ -205,15 +223,17 @@
"""
from zarr.abc.codec import ArrayBytesCodec

if isinstance(data, dict):
if isinstance(data, numcodecs.abc.Codec):
result = numcodec_to_zarr3_codec(data)

Check warning on line 227 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L227

Added line #L227 was not covered by tests
elif isinstance(data, dict):
result = _resolve_codec(data)
if not isinstance(result, ArrayBytesCodec):
msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead."
raise TypeError(msg)
else:
if not isinstance(data, ArrayBytesCodec):
raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.")
result = data
if not isinstance(result, ArrayBytesCodec):
raise TypeError(f"Expected a ArrayBytesCodec. Got {type(result)} instead.")

Check warning on line 236 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L236

Added line #L236 was not covered by tests
return result


Expand All @@ -225,15 +245,17 @@
"""
from zarr.abc.codec import ArrayArrayCodec

if isinstance(data, dict):
if isinstance(data, numcodecs.abc.Codec):
result = numcodec_to_zarr3_codec(data)
elif isinstance(data, dict):

Check warning on line 250 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L248-L250

Added lines #L248 - L250 were not covered by tests
result = _resolve_codec(data)
if not isinstance(result, ArrayArrayCodec):
msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead."
raise TypeError(msg)
else:
if not isinstance(data, ArrayArrayCodec):
raise TypeError(f"Expected a ArrayArrayCodec. Got {type(data)} instead.")
result = data
if not isinstance(result, ArrayArrayCodec):
raise TypeError(f"Expected a ArrayArrayCodec. Got {type(result)} instead.")

Check warning on line 258 in src/zarr/registry.py

View check run for this annotation

Codecov / codecov/patch

src/zarr/registry.py#L257-L258

Added lines #L257 - L258 were not covered by tests
return result


Expand Down
75 changes: 75 additions & 0 deletions tests/test_codecs/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import numcodecs
import numcodecs.zarr3
import numpy as np
import pytest

import zarr
import zarr.api
import zarr.api.asynchronous
from zarr import Array, AsyncArray, config
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
from zarr.codecs import (
BytesCodec,
GzipCodec,
Expand All @@ -23,7 +26,9 @@
from zarr.storage import StorePath

if TYPE_CHECKING:
from zarr.abc.codec import Codec
from zarr.abc.store import Store
from zarr.core.array import CompressorsLike, FiltersLike, SerializerLike
from zarr.core.buffer.core import NDArrayLikeOrScalar
from zarr.core.common import ChunkCoords, MemoryOrder

Expand Down Expand Up @@ -413,3 +418,73 @@ async def test_resize(store: Store) -> None:
assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None
assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is None
assert await store.get(f"{path}/1.1", prototype=default_buffer_prototype()) is None


@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
@pytest.mark.parametrize(
    ("codec_v2", "expected_v3_cls"),
    [
        (numcodecs.BZ2(), numcodecs.zarr3.BZ2),
        (numcodecs.CRC32(), numcodecs.zarr3.CRC32),
        (numcodecs.CRC32C(), numcodecs.zarr3.CRC32C),
        (numcodecs.LZ4(), numcodecs.zarr3.LZ4),
        (numcodecs.LZMA(), numcodecs.zarr3.LZMA),
        # (numcodecs.ZFPY(), numcodecs.zarr3.ZFPY), AttributeError: module 'numcodecs' has no attribute 'ZFPY'
        (numcodecs.Adler32(), numcodecs.zarr3.Adler32),
        (
            numcodecs.AsType(encode_dtype=np.float64, decode_dtype=np.float32),
            numcodecs.zarr3.AsType,
        ),
        (numcodecs.BitRound(keepbits=10), numcodecs.zarr3.BitRound),
        (numcodecs.Blosc(), numcodecs.zarr3.Blosc),
        (numcodecs.Delta(dtype=np.float64), numcodecs.zarr3.Delta),
        (
            numcodecs.FixedScaleOffset(offset=1000, scale=10, dtype="f8", astype="u1"),
            numcodecs.zarr3.FixedScaleOffset,
        ),
        (numcodecs.Fletcher32(), numcodecs.zarr3.Fletcher32),
        (numcodecs.GZip(), numcodecs.zarr3.GZip),
        (numcodecs.JenkinsLookup3(), numcodecs.zarr3.JenkinsLookup3),
        # (numcodecs.PCodec(), numcodecs.zarr3.PCodec), AttributeError: module 'numcodecs' has no attribute 'PCodec'
        (numcodecs.PackBits(), numcodecs.zarr3.PackBits),
        (numcodecs.Quantize(digits=1, dtype="f8"), numcodecs.zarr3.Quantize),
        (numcodecs.Shuffle(), numcodecs.zarr3.Shuffle),
        (numcodecs.Zlib(), numcodecs.zarr3.Zlib),
        (numcodecs.Zstd(), numcodecs.zarr3.Zstd),
    ],
)
def test_numcodecs_in_v3(
    store: Store, codec_v2: numcodecs.abc.Codec, expected_v3_cls: type[Codec]
) -> None:
    """
    Check that a numcodecs codec converts to the expected ``numcodecs.zarr3``
    class and that the original codec is accepted by ``zarr.create_array``.

    The test first calls ``zarr.registry.numcodec_to_zarr3_codec`` directly and
    compares the class, name and configuration of the result. It then passes
    the *unconverted* codec to ``zarr.create_array`` in the slot (filters,
    serializer or compressors) that matches the kind of the converted codec.
    """
    import zarr.registry

    result_v3 = zarr.registry.numcodec_to_zarr3_codec(codec_v2)

    assert result_v3.__class__ == expected_v3_cls

    # Serialize once and inspect both fields of the same dict.
    result_v3_dict = result_v3.to_dict()
    assert result_v3_dict["name"] == f"numcodecs.{codec_v2.codec_id}"
    codec_v2_config = codec_v2.get_config()
    codec_v2_config.pop("id")
    assert result_v3_dict["configuration"] == codec_v2_config

    # Route the original codec to the argument matching the converted codec's kind.
    filters: FiltersLike = "auto"
    serializer: SerializerLike = "auto"
    compressors: CompressorsLike = "auto"
    if isinstance(result_v3, ArrayArrayCodec):
        filters = [codec_v2]
    elif isinstance(result_v3, ArrayBytesCodec):
        serializer = codec_v2
    elif isinstance(result_v3, BytesBytesCodec):
        compressors = [codec_v2]
    else:
        raise TypeError(f"unsupported type: {result_v3.__class__}")

    zarr.create_array(
        store,
        shape=(64,),
        chunks=(64,),
        # np.bool_ rather than np.bool: the np.bool alias does not exist on
        # NumPy 1.24-1.26, while np.bool_ is available on every NumPy version.
        dtype=np.bool_,
        fill_value=False,
        filters=filters,
        compressors=compressors,
        serializer=serializer,
    )
Loading