Merge pull request #95 from d-v-b/fix/no-empty-codecs

d-v-b · web-flow · commit e790176620b0 · 2025-08-14T13:15:22.000+02:00
Disallow empty codecs, use a sane default in auto_codecs, and allow codecs to be specified as strings
diff --git a/src/pydantic_zarr/core.py b/src/pydantic_zarr/core.py
@@ -6,6 +6,7 @@
     Any,
     Literal,
     TypeAlias,
+    TypeVar,
     overload,
 )
 
@@ -24,6 +25,8 @@
 
 AccessMode: TypeAlias = Literal["w", "w+", "r", "a"]
 
+T = TypeVar("T")
+
 
 @overload
 def tuplify_json(obj: Mapping) -> Mapping: ...
@@ -133,3 +136,12 @@ def maybe_node(
         return get_node(spath.store, spath.path, zarr_format=zarr_format)
     except FileNotFoundError:
         return None
+
+
+def ensure_multiple(data: Sequence[T]) -> Sequence[T]:
+    """
+    Ensure that the sequence contains at least one element; raise ValueError if it is empty.
+    """
+    if len(data) < 1:
+        raise ValueError("Invalid length. Expected 1 or more, got 0.")
+    return data
diff --git a/src/pydantic_zarr/v3.py b/src/pydantic_zarr/v3.py
@@ -30,6 +30,7 @@
     IncEx,
     StrictBase,
     ensure_key_no_path,
+    ensure_multiple,
     maybe_node,
     model_like,
     tuplify_json,
@@ -94,6 +95,10 @@ class AnyNamedConfig(NamedConfig[str, Mapping[str, object]]):
     """
 
 
+CodecLike = str | AnyNamedConfig
+"""A type modelling the permissible declarations for codecs"""
+
+
 class RegularChunkingConfig(TypedDict):
     chunk_shape: tuple[int, ...]
 
@@ -160,7 +165,9 @@ def parse_dtype_v3(dtype: npt.DTypeLike | Mapping[str, object]) -> Mapping[str,
                 raise ValueError(f"Unsupported dtype: {dtype}")
 
 
-DtypeStr = Annotated[str, BeforeValidator(parse_dtype_v3)]
+DTypeStr = Annotated[str, BeforeValidator(parse_dtype_v3)]
+DTypeLike = DTypeStr | AnyNamedConfig
+CodecTuple = Annotated[tuple[CodecLike, ...], BeforeValidator(ensure_multiple)]
 
 
 class ArraySpec(NodeSpec, Generic[TAttr]):
@@ -196,11 +203,11 @@ class ArraySpec(NodeSpec, Generic[TAttr]):
     node_type: Literal["array"] = "array"
     attributes: TAttr = cast(TAttr, {})
     shape: tuple[int, ...]
-    data_type: DtypeStr | AnyNamedConfig
+    data_type: DTypeLike
     chunk_grid: RegularChunking  # todo: validate this against shape
     chunk_key_encoding: DefaultChunkKeyEncoding  # todo: validate this against shape
     fill_value: FillValue  # todo: validate this against the data type
-    codecs: tuple[AnyNamedConfig, ...]
+    codecs: CodecTuple
     storage_transformers: tuple[AnyNamedConfig, ...] = ()
     dimension_names: tuple[str | None, ...] | None = None  # todo: validate this against shape
 
@@ -252,7 +259,7 @@ def from_array(
         chunk_grid: Literal["auto"] | AnyNamedConfig = "auto",
         chunk_key_encoding: Literal["auto"] | AnyNamedConfig = "auto",
         fill_value: Literal["auto"] | FillValue = "auto",
-        codecs: Literal["auto"] | Sequence[AnyNamedConfig] = "auto",
+        codecs: Literal["auto"] | Sequence[CodecLike] = "auto",
         storage_transformers: Literal["auto"] | Sequence[AnyNamedConfig] = "auto",
         dimension_names: Literal["auto"] | Sequence[str | None] = "auto",
     ) -> Self:
@@ -293,11 +300,11 @@ def from_array(
         else:
             fill_value_actual = fill_value
 
-        codecs_actual: Sequence[AnyNamedConfig]
+        codecs_actual: tuple[CodecLike, ...]
         if codecs == "auto":
             codecs_actual = auto_codecs(array)
         else:
-            codecs_actual = codecs
+            codecs_actual = tuple(codecs)
         storage_transformers_actual: Sequence[AnyNamedConfig]
         if storage_transformers == "auto":
             storage_transformers_actual = auto_storage_transformers(array)
@@ -1017,10 +1024,14 @@ def auto_fill_value(data: object) -> FillValue:
     raise ValueError("Cannot determine default data type for object without shape attribute.")
 
 
-def auto_codecs(data: object) -> tuple[AnyNamedConfig, ...]:
+def auto_codecs(data: object) -> tuple[CodecLike, ...]:
+    """
+    Automatically create a tuple of codecs from an arbitrary Python object.
+    """
     if hasattr(data, "codecs"):
+        # todo: type check
         return tuple(data.codecs)
-    return ()
+    return ({"name": "bytes"},)
 
 
 def auto_storage_transformers(data: object) -> tuple[AnyNamedConfig, ...]:
diff --git a/tests/test_pydantic_zarr/test_v3.py b/tests/test_pydantic_zarr/test_v3.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pytest
 import zarr
+from pydantic import ValidationError
 
 from pydantic_zarr.core import tuplify_json
 from pydantic_zarr.v3 import (
@@ -18,6 +19,7 @@
     NamedConfig,
     RegularChunking,
     RegularChunkingConfig,
+    auto_codecs,
 )
 
 from .conftest import DTYPE_EXAMPLES_V3, DTypeExample
@@ -44,7 +46,9 @@ def test_serialize_deserialize() -> None:
 
 
 def test_from_array() -> None:
-    array_spec = ArraySpec.from_array(np.arange(10))
+    array = np.arange(10)
+    array_spec = ArraySpec.from_array(array)
+
     assert array_spec == ArraySpec(
         zarr_format=3,
         node_type="array",
@@ -58,10 +62,32 @@ def test_from_array() -> None:
             name="default", configuration=DefaultChunkKeyEncodingConfig(separator="/")
         ),
         fill_value=0,
-        codecs=(),
+        codecs=auto_codecs(array),
         storage_transformers=(),
         dimension_names=None,
     )
+    # check that we can write this array to zarr
+    # TODO: fix type of the store argument in to_zarr
+    array_spec.to_zarr(store={}, path="")  # type: ignore[arg-type]
+
+
+def test_arrayspec_no_empty_codecs() -> None:
+    """
+    Ensure that it is not possible to create an ArraySpec with no codecs
+    """
+
+    with pytest.raises(
+        ValidationError, match="Value error, Invalid length. Expected 1 or more, got 0."
+    ):
+        ArraySpec(
+            shape=(1,),
+            data_type="uint8",
+            codecs=[],
+            attributes={},
+            fill_value=0,
+            chunk_grid={"name": "regular", "configuration": {"chunk_shape": (1,)}},
+            chunk_key_encoding={"name": "default", "configuration": {"separator": "/"}},
+        )
 
 
 @pytest.mark.filterwarnings("ignore:The dtype:UserWarning")