diff --git a/changes/3417.bugfix.md b/changes/3417.bugfix.md new file mode 100644 index 0000000000..be5b44f3e9 --- /dev/null +++ b/changes/3417.bugfix.md @@ -0,0 +1,4 @@ +Fixed `BytesCodec.from_dict` so that `BytesCodec` instances roundtrip to / from +their dict representation. `BytesCodec.from_dict` now interprets a missing +`endian` configuration as `endian=None` (matching what `BytesCodec.to_dict` +emits), instead of falling back to the system's native byte order. diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 1c67c65e98..240c077627 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -62,6 +62,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: data, "bytes", require_configuration=False ) configuration_parsed = configuration_parsed or {} + configuration_parsed.setdefault("endian", None) return cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: diff --git a/tests/test_codecs/test_bytes.py b/tests/test_codecs/test_bytes.py index 25c786a405..03dd0b40c6 100644 --- a/tests/test_codecs/test_bytes.py +++ b/tests/test_codecs/test_bytes.py @@ -5,10 +5,14 @@ import enum import sys import warnings -from typing import Any, cast +from typing import TYPE_CHECKING, Any, Literal, cast +import numpy as np import pytest +import zarr +from tests.conftest import Expect, ExpectFail +from zarr.abc.codec import SupportsSyncCodec from zarr.codecs.bytes import ( ENDIAN, BytesCodec, @@ -16,9 +20,83 @@ EndianLiteral, ) from zarr.core.array_spec import ArrayConfig, ArraySpec -from zarr.core.buffer import default_buffer_prototype +from zarr.core.buffer import NDBuffer, default_buffer_prototype +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.npy.int import Int8, Int32 from zarr.core.dtype.npy.structured import Struct +from zarr.storage import StorePath + +from .test_codecs import _AsyncArrayProxy + +if TYPE_CHECKING: + from zarr.abc.store import Store + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +@pytest.mark.parametrize("input_dtype", [">u2", "u2", " None: + """ + The `bytes` codec stores multi-byte data in the byte order configured on the + codec, regardless of the input array's byte order, and reads it back to the + original values. The input-dtype/store-endian cross-product exercises the + encode-side byteswap (input byte order != store byte order) and the no-op + case alike. Compression is disabled so the stored chunk is the codec's raw + output and its byte layout can be asserted directly. + """ + data = np.arange(0, 256, dtype=input_dtype).reshape((16, 16)) + path = "endian" + spath = StorePath(store, path) + a = await zarr.api.asynchronous.create_array( + spath, + shape=data.shape, + chunks=(16, 16), + dtype="uint16", + fill_value=0, + compressors=None, + serializer=BytesCodec(endian=store_endian), + ) + + await _AsyncArrayProxy(a)[:, :].set(data) + + # The stored chunk is laid out in the byte order configured on the codec. + stored = await store.get(f"{path}/c/0/0", prototype=default_buffer_prototype()) + assert stored is not None + expected_dtype = ">u2" if store_endian == "big" else " None: + assert isinstance(BytesCodec(), SupportsSyncCodec) + + +def test_bytes_codec_sync_roundtrip() -> None: + codec = BytesCodec() + arr = np.arange(100, dtype="float64") + zdtype = get_data_type_from_native_dtype(arr.dtype) + spec = ArraySpec( + shape=arr.shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) + + codec = codec.evolve_from_array_spec(spec) + + encoded = codec._encode_sync(nd_buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + np.testing.assert_array_equal(arr, decoded.as_numpy_array()) @pytest.mark.parametrize("endian", ENDIAN) @@ -46,6 +124,43 @@ def test_bytes_codec_json_roundtrip(endian: EndianLiteral) -> None: assert restored == codec +# to_dict and from_dict are inverses over this (endian setting, wire dict) mapping: +# to_dict turns the endian setting into the dict; from_dict recovers it. +_ENDIAN_DICT_CASES: list[Expect[EndianLiteral | None, dict[str, Any]]] = [ + Expect( + input="little", + output={"name": "bytes", "configuration": {"endian": "little"}}, + id="little", + ), + Expect( + input="big", + output={"name": "bytes", "configuration": {"endian": "big"}}, + id="big", + ), + Expect(input=None, output={"name": "bytes"}, id="missing"), +] + + +@pytest.mark.parametrize("case", _ENDIAN_DICT_CASES, ids=lambda c: c.id) +def test_to_dict(case: Expect[EndianLiteral | None, dict[str, Any]]) -> None: + assert BytesCodec(endian=case.input).to_dict() == case.output + + +@pytest.mark.parametrize("case", _ENDIAN_DICT_CASES, ids=lambda c: c.id) +def test_from_dict(case: Expect[EndianLiteral | None, dict[str, Any]]) -> None: + assert BytesCodec.from_dict(case.output).endian == case.input + + +@pytest.mark.parametrize("endian", ["little", "big", pytest.param(None, id="missing")]) +def test_roundtrip(endian: EndianLiteral | None) -> None: + codec = BytesCodec(endian=endian) + + encoded = codec.to_dict() + roundtripped = BytesCodec.from_dict(encoded) + + assert codec == roundtripped + + @pytest.mark.parametrize( ("member", "expected"), [("little", "little"), ("big", "big")], @@ -105,14 +220,25 @@ def test_bytes_codec_init_with_deprecated_class_member() -> None: assert codec.endian == "little" -def test_bytes_codec_rejects_unknown_endian() -> None: +@pytest.mark.parametrize( + "case", + [ + ExpectFail( + input="north", + exception=ValueError, + id="unknown-string", + msg="endian must be one of", + ), + ], + ids=lambda c: c.id, +) +def test_bytes_codec_rejects_unknown_endian(case: ExpectFail[Any]) -> None: """ - `BytesCodec.__init__` raises `ValueError` when given a string outside + `BytesCodec.__init__` raises `ValueError` when given a value outside `ENDIAN`, and the error message names the offending parameter. """ - kwargs: dict[str, Any] = {"endian": "north"} - with pytest.raises(ValueError, match="endian must be one of"): - BytesCodec(**kwargs) + with case.raises(): + BytesCodec(endian=case.input) def test_endian_attribute_error_for_unknown_member() -> None: diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py deleted file mode 100644 index c505cee828..0000000000 --- a/tests/test_codecs/test_endian.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import Literal - -import numpy as np -import pytest - -import zarr -from zarr.abc.codec import SupportsSyncCodec -from zarr.abc.store import Store -from zarr.codecs import BytesCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec -from zarr.core.buffer import NDBuffer, default_buffer_prototype -from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.storage import StorePath - -from .test_codecs import _AsyncArrayProxy - - -@pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") -@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) -@pytest.mark.parametrize("endian", ["big", "little"]) -async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: - data = np.arange(0, 256, dtype="uint16").reshape((16, 16)) - path = "endian" - spath = StorePath(store, path) - a = await zarr.api.asynchronous.create_array( - spath, - shape=data.shape, - chunks=(16, 16), - dtype=data.dtype, - fill_value=0, - chunk_key_encoding={"name": "v2", "separator": "."}, - serializer=BytesCodec(endian=endian), - ) - - await _AsyncArrayProxy(a)[:, :].set(data) - readback_data = await _AsyncArrayProxy(a)[:, :].get() - assert np.array_equal(data, readback_data) - - -def test_bytes_codec_supports_sync() -> None: - assert isinstance(BytesCodec(), SupportsSyncCodec) - - -def test_bytes_codec_sync_roundtrip() -> None: - codec = BytesCodec() - arr = np.arange(100, dtype="float64") - zdtype = get_data_type_from_native_dtype(arr.dtype) - spec = ArraySpec( - shape=arr.shape, - dtype=zdtype, - fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) - - codec = codec.evolve_from_array_spec(spec) - - encoded = codec._encode_sync(nd_buf, spec) - assert encoded is not None - decoded = codec._decode_sync(encoded, spec) - np.testing.assert_array_equal(arr, decoded.as_numpy_array()) - - -@pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") -@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) -@pytest.mark.parametrize("dtype_input_endian", [">u2", "u2", " None: - data = np.arange(0, 256, dtype=dtype_input_endian).reshape((16, 16)) - path = "endian" - spath = StorePath(store, path) - a = await zarr.api.asynchronous.create_array( - spath, - shape=data.shape, - chunks=(16, 16), - dtype="uint16", - fill_value=0, - chunk_key_encoding={"name": "v2", "separator": "."}, - serializer=BytesCodec(endian=dtype_store_endian), - ) - - await _AsyncArrayProxy(a)[:, :].set(data) - readback_data = await _AsyncArrayProxy(a)[:, :].get() - assert np.array_equal(data, readback_data)