diff --git a/.github/workflows/hypothesis.yaml b/.github/workflows/hypothesis.yaml index d15b5b1405..cfffbe26aa 100644 --- a/.github/workflows/hypothesis.yaml +++ b/.github/workflows/hypothesis.yaml @@ -41,6 +41,7 @@ jobs: steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: + fetch-depth: 0 # grab all tags so hatch-vcs derives real versions for zarr-python and the in-tree zarr-metadata persist-credentials: false - name: Set HYPOTHESIS_PROFILE based on trigger env: diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index 43436de947..8996e0a026 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -63,9 +63,119 @@ jobs: ls ls dist + # --------------------------------------------------------------------------- + # Pre-publish gate: confirm zarr-metadata's required floor is on PyPI. + # + # zarr-python and zarr-metadata co-develop in this monorepo. During local + # development zarr-metadata is resolved from packages/zarr-metadata/ via the + # uv workspace (see [tool.uv.sources] in pyproject.toml). The wheel we are + # about to publish, however, only carries a version-range requirement + # (e.g. `zarr-metadata>=0.1.1,<0.2`); end users will resolve that against + # PyPI. + # + # The failure mode this job catches: a zarr-python PR added code that + # depends on a zarr-metadata feature that has been merged into + # packages/zarr-metadata/ but not yet released to PyPI. CI passed because + # the workspace override resolved to the in-tree copy, but a user installing + # the resulting zarr-python wheel would get a published zarr-metadata that + # lacks the feature, and zarr-python would fail at import or first use. + # + # The mitigation here is a presence check on PyPI: extract the floor of + # zarr-python's zarr-metadata requirement from the wheel's METADATA file, + # and refuse to upload if that exact version is not yet on PyPI. 
This is
+  # analogous to what `cargo publish` does automatically against crates.io,
+  # but expressed as a CI step because twine has no built-in equivalent.
+  #
+  # When you bump zarr-metadata to a new version that zarr-python depends on,
+  # the required release order is:
+  #   1. release zarr-metadata to PyPI;
+  #   2. bump the floor in zarr-python's [project.dependencies];
+  #   3. release zarr-python.
+  # This job will fail at step 3 if step 1 was skipped.
+  # ---------------------------------------------------------------------------
+  verify_pypi_dependency:
+    name: Verify zarr-metadata floor is on PyPI
+    needs: [build_artifacts]
+    runs-on: ubuntu-latest
+    # Run only on actual releases. Pull-request and push-to-main runs go
+    # through CI without this gate, since their wheels are never uploaded.
+    if: github.event_name == 'release'
+    steps:
+      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: releases
+          path: dist
+
+      - name: Check zarr-metadata floor is published on PyPI
+        run: |
+          # The wheel's METADATA file lives at zarr-*.dist-info/METADATA inside
+          # the wheel. `unzip -p` writes a file's contents to stdout without
+          # extracting; the glob matches whichever dist-info dir is inside.
+          metadata="$(unzip -p dist/zarr-*.whl '*.dist-info/METADATA')"
+
+          # Pick the Requires-Dist line for zarr-metadata. The wheel may have
+          # several Requires-Dist lines for different extras; we want the one
+          # that applies unconditionally (no `; extra == "..."` marker).
+          # Match `Requires-Dist: zarr-metadata` followed by anything that
+          # ends a project name in PEP 508: a version operator (<, >, =, !,
+          # ~), whitespace, `[` (extras), `;` (markers), `(` (legacy
+          # parenthesized version), or end-of-line. The character class
+          # excludes letters/digits/underscore/hyphen, so a hypothetical
+          # `zarr-metadata-ext` dep would not match.
+          req_line="$(printf '%s' "$metadata" \
+            | grep -E '^Requires-Dist: zarr-metadata([^A-Za-z0-9_-]|$)' \
+            | grep -v 'extra ==' \
+            || true)"
+
+          if [ -z "$req_line" ]; then
+            echo "::error::Could not find an unconditional Requires-Dist line for zarr-metadata in the built wheel."
+            echo "Wheel METADATA Requires-Dist lines:"
+            printf '%s' "$metadata" | grep '^Requires-Dist:' || true
+            exit 1
+          fi
+          echo "Requires-Dist line: $req_line"
+
+          # Extract the floor: the version after the first `>=` in the
+          # comma-separated PEP 440 specifier set (e.g. `>=0.1.1, <0.2`).
+          # `${req_line%%;*}` drops any environment marker first, so a `>=`
+          # inside a marker is never mistaken for the floor; the match stops
+          # at `,`, `)` or whitespace, so `(<0.2,>=0.1.1)` yields `0.1.1`.
+          floor="$(printf '%s' "${req_line%%;*}" \
+            | grep -oE '>=[[:space:]]*[^,)[:space:]]+' \
+            | sed 's/^>=[[:space:]]*//; s/[[:space:]]*$//' \
+            | head -1)"
+
+          if [ -z "$floor" ]; then
+            echo '::error::Could not extract a >= floor from:' "$req_line"
+            echo "zarr-python's zarr-metadata requirement must include a >= bound so this gate has something to check."
+            exit 1
+          fi
+          echo "zarr-metadata floor: $floor"
+
+          # PyPI's JSON API returns 200 if the named version exists and 404
+          # if it doesn't. -s silences progress; --retry 3 retries transient
+          # failures; -o /dev/null discards the body; -w '%{http_code}' prints
+          # just the status (000 = no response). Any non-200 fails the gate.
+          status="$(curl -s --retry 3 -o /dev/null -w '%{http_code}' \
+            "https://pypi.org/pypi/zarr-metadata/${floor}/json")"
+
+          if [ "$status" != "200" ]; then
+            echo "::error::zarr-metadata ${floor} is not available on PyPI (HTTP ${status})."
+            echo ""
+            echo "The wheel about to be uploaded declares it requires zarr-metadata ${floor} or later,"
+            echo "but no such release exists on PyPI. Publish zarr-metadata ${floor} first, then"
+            echo "re-run this release workflow."
+            exit 1
+          fi
+          echo "OK: zarr-metadata ${floor} is on PyPI; safe to upload zarr-python."
+ upload_pypi: name: Upload to PyPI - needs: [build_artifacts, test_dist_pypi] + # Depend on the new gate so the upload step does not run if the floor + # is missing from PyPI. The gate and this job share the same release-only + # `if:`, so PR / push runs skip both. GitHub Actions skips a job whose `needs:` + # entry was skipped or failed unless its `if:` uses a status check such as always(). + needs: [build_artifacts, test_dist_pypi, verify_pypi_dependency] runs-on: ubuntu-latest if: github.event_name == 'release' environment: diff --git a/.github/workflows/zarr-metadata.yml b/.github/workflows/zarr-metadata.yml index 3081abf094..01f017939b 100644 --- a/.github/workflows/zarr-metadata.yml +++ b/.github/workflows/zarr-metadata.yml @@ -20,6 +20,13 @@ concurrency: cancel-in-progress: true jobs: + # zarr-metadata CI installs zarr-metadata standalone, not as a uv + # workspace member. The workspace at the repo root forces uv to honor + # `requires-python = ">=3.12"` from zarr-python's pyproject.toml, which + # blocks Python 3.11 even though zarr-metadata itself supports 3.11+. + # Using `uv venv` + `uv pip install` from a tmp directory bypasses + # workspace resolution and tests zarr-metadata the way downstream users + # actually install it: as a standalone package from PyPI. test: name: pytest py=${{ matrix.python-version }} runs-on: ubuntu-latest @@ -39,12 +46,18 @@ jobs: uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 with: enable-cache: true - - name: Set up Python ${{ matrix.python-version }} - run: uv python install ${{ matrix.python-version }} - - name: Sync test dependency group - run: uv sync --group test --python ${{ matrix.python-version }} + - name: Create standalone Python ${{ matrix.python-version }} venv + # Place the venv outside the workspace tree so uv doesn't try + # to resolve workspace-wide requirements. 
+ run: uv venv "$RUNNER_TEMP/zm-venv" --python ${{ matrix.python-version }} --seed + - name: Install zarr-metadata and test deps + run: | + uv pip install \ + --python "$RUNNER_TEMP/zm-venv/bin/python" \ + --group pyproject.toml:test \ + . - name: Run pytest - run: uv run --group test pytest tests + run: '"$RUNNER_TEMP/zm-venv/bin/python" -m pytest tests' ruff: name: ruff @@ -77,12 +90,16 @@ jobs: uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 with: enable-cache: true - - name: Set up Python - run: uv python install 3.11 - - name: Sync test dependency group - run: uv sync --group test --python 3.11 + - name: Create standalone Python 3.11 venv + run: uv venv "$RUNNER_TEMP/zm-venv" --python 3.11 --seed + - name: Install zarr-metadata and test deps + run: | + uv pip install \ + --python "$RUNNER_TEMP/zm-venv/bin/python" \ + --group pyproject.toml:test \ + . - name: Run pyright - run: uv run --group test --with pyright pyright src + run: uvx --python "$RUNNER_TEMP/zm-venv/bin/python" pyright src zarr-metadata-complete: name: zarr-metadata complete diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fb2e8c3c6f..70e9af31db 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,6 +57,17 @@ repos: entry: "\\.(lstrip|rstrip)\\([\"'][^\"']{2,}[\"']\\)" types: [python] files: ^(src|tests)/ + - id: check-min-deps-floor + name: check min_deps zarr-metadata pin matches the project floor + # language: python (not system) so pre-commit provisions an + # interpreter; the script is stdlib-only, so no extra deps are + # needed. Avoids assuming a bare `python` is on PATH. + language: python + entry: python ci/check_min_deps_floor.py + # Run whenever pyproject.toml changes; pass_filenames is False because + # the script reads the file directly rather than processing argv. 
+ pass_filenames: false + files: ^pyproject\.toml$ - repo: https://github.com/zizmorcore/zizmor-pre-commit rev: v1.25.2 hooks: diff --git a/changes/3961.feature.md b/changes/3961.feature.md new file mode 100644 index 0000000000..00e42cfc7b --- /dev/null +++ b/changes/3961.feature.md @@ -0,0 +1,3 @@ +``zarr-python`` now depends on the [``zarr-metadata``](https://pypi.org/project/zarr-metadata/) package, which provides spec-defined TypedDicts and literal types for Zarr v2 and v3 metadata documents. Several internal types previously defined in ``zarr-python`` are now aliases that re-export their canonical definitions from ``zarr-metadata``: ``zarr.codecs.blosc.BloscShuffleLiteral``, ``zarr.codecs.blosc.BloscCnameLiteral``, ``zarr.codecs.blosc.BloscConfigV3``, ``zarr.codecs.blosc.BloscJSON_V3``, ``zarr.codecs.cast_value.RoundingMode``, ``zarr.codecs.cast_value.OutOfRangeMode``, ``zarr.core.metadata.v2.ArrayV2MetadataDict``, ``zarr.core.metadata.v3.AllowedExtraField``, and ``zarr.core.metadata.v3.ArrayMetadataJSON_V3``. + +The version requirement (``zarr-metadata>=0.3.0,<0.4``) caps the minor version (the breaking-change boundary while ``zarr-metadata`` is on 0.x) so a future breaking change in ``zarr-metadata`` cannot silently break installed ``zarr-python``. During local development, ``zarr-metadata`` is resolved from the in-tree copy under ``packages/zarr-metadata/`` via a uv workspace; see [the contributing guide](https://zarr.readthedocs.io/en/stable/contributing.html) for details. diff --git a/changes/3961.misc.md b/changes/3961.misc.md new file mode 100644 index 0000000000..0180c88d39 --- /dev/null +++ b/changes/3961.misc.md @@ -0,0 +1,13 @@ +`Struct.to_json` now emits the `configuration.fields` array as a tuple rather +than a list. The serialized JSON is unchanged (a JSON array is produced either +way), but the in-memory dict returned by `to_json(zarr_format=3)` now holds a +tuple, matching the `tuple[StructField, ...]` shape that `zarr-metadata` models +for this field. 
+ +Internal: zarr-python now sources its codec, dtype, and chunk-grid name +constants and the `Endianness`, `BloscShuffle`, `BloscCname`, sharding +`IndexLocation`, and `DateTimeUnit` literal types from `zarr-metadata`'s +top-level exports rather than re-defining them. The historical zarr-python +names (e.g. `zarr.codecs.bytes.EndianLiteral`, +`zarr.codecs.sharding.IndexLocation`) are retained as re-exports, so existing +imports keep working. No user-facing behavior changes. diff --git a/ci/check_min_deps_floor.py b/ci/check_min_deps_floor.py new file mode 100644 index 0000000000..461e1d0e47 --- /dev/null +++ b/ci/check_min_deps_floor.py @@ -0,0 +1,111 @@ +""" +Enforce the invariant: `min_deps` pins zarr-metadata to the floor of +zarr-python's declared zarr-metadata range. + +zarr-python declares `zarr-metadata>=X.Y.Z,<...>` in `[project.dependencies]`. +The `min_deps` hatch env tests against the *minimum* supported deps, so it +must pin zarr-metadata to exactly that floor (e.g. `zarr-metadata==X.Y.Z`). +Without this script the two declarations can drift silently — the project's +floor could rise without `min_deps` noticing, and `min_deps` would no longer +verify what its name claims. + +Run: + python ci/check_min_deps_floor.py + +Exits 0 if floors agree; non-zero with a clear message if not. +""" + +from __future__ import annotations + +import re +import sys +import tomllib +from pathlib import Path + +ROOT = Path(__file__).parent.parent.resolve() +PYPROJECT = ROOT / "pyproject.toml" + +# Match `>=X.Y.Z` (with or without surrounding whitespace) inside a PEP 440 +# version specifier set. Captures just the version number. +_FLOOR_RE = re.compile(r">=\s*([^,\s]+)") +# Match `==X.Y.Z` likewise. Captures the version number. +_PIN_RE = re.compile(r"==\s*([^,\s]+)") + + +def find_zarr_metadata_floor(deps: list[str]) -> str: + """Return the >= floor of zarr-metadata declared in `deps`. + + `deps` is a list of PEP 508 strings, e.g. as found in + `[project.dependencies]`. 
Raises if zarr-metadata is not present, or + if its specifier set has no `>=` bound. + """ + for dep in deps: + # Project name is everything up to the first non-name character. + # Quick split: package name terminates at the first occurrence of a + # version operator, whitespace, `[`, `;`, or `(`. + name = re.split(r"[<>=!~\s\[;(]", dep, maxsplit=1)[0].strip() + if name == "zarr-metadata": + match = _FLOOR_RE.search(dep) + if not match: + raise SystemExit( + f"zarr-metadata dependency has no `>=` floor: {dep!r}\n" + "Floor verification requires an explicit lower bound." + ) + return match.group(1) + raise SystemExit( + "zarr-metadata not found in [project.dependencies]. " + "This script assumes zarr-python depends on zarr-metadata." + ) + + +def find_zarr_metadata_pin(deps: list[str]) -> str: + """Return the `==` pin of zarr-metadata declared in `deps`. + + `deps` is a list of PEP 508 strings, e.g. as found in + `[tool.hatch.envs.min_deps.extra-dependencies]`. Raises if + zarr-metadata is not present, or if its specifier is not a `==` pin. + """ + for dep in deps: + name = re.split(r"[<>=!~\s\[;(]", dep, maxsplit=1)[0].strip() + if name == "zarr-metadata": + match = _PIN_RE.search(dep) + if not match: + raise SystemExit( + f"min_deps zarr-metadata entry is not an `==` pin: {dep!r}\n" + "The min_deps env must pin zarr-metadata exactly to the floor." + ) + return match.group(1) + raise SystemExit( + "zarr-metadata not found in [tool.hatch.envs.min_deps.extra-dependencies].\n" + "Add `'zarr-metadata=='` to keep min_deps testing the declared floor." 
+ ) + + +def main() -> int: + data = tomllib.loads(PYPROJECT.read_text()) + + project_deps = data["project"]["dependencies"] + floor = find_zarr_metadata_floor(project_deps) + + min_deps_extra = data["tool"]["hatch"]["envs"]["min_deps"]["extra-dependencies"] + pin = find_zarr_metadata_pin(min_deps_extra) + + if floor != pin: + print( + f"floor / min_deps pin mismatch for zarr-metadata:\n" + f" [project.dependencies] floor: >={floor}\n" + f" [tool.hatch.envs.min_deps] pin: =={pin}\n" + f"\n" + f"These must agree. Either update the floor in " + f"[project.dependencies] or the pin in min_deps so both name " + f"the same zarr-metadata version.", + file=sys.stderr, + ) + return 1 + + print(f"OK: zarr-metadata floor {floor} matches min_deps pin {pin}.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/contributing.md b/docs/contributing.md index 750f7c7a65..922e653f16 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -93,6 +93,44 @@ To verify that your development environment is working, you can run the unit tes hatch env run --env test.py3.12-optional run ``` +#### The zarr-metadata package and the workspace + +zarr-python depends on [`zarr-metadata`](https://pypi.org/project/zarr-metadata/), a small package of TypedDicts and literals describing the JSON shape of Zarr v2 and v3 metadata documents. Both packages live in this repository: + +- zarr-python: the project root. +- zarr-metadata: [`packages/zarr-metadata/`](https://github.com/zarr-developers/zarr-python/tree/main/packages/zarr-metadata) — its own `pyproject.toml`, source tree, and tests. + +This is configured as a workspace in two places, because the project supports both [`uv`](https://docs.astral.sh/uv/) and [`hatch`](https://hatch.pypa.io/) as front-ends. 
+ +**uv workspace declaration** (consumed by `uv sync`, `uv run`, and anything reading uv's project metadata): + +```toml +[tool.uv.workspace] +members = ["packages/zarr-metadata"] + +[tool.uv.sources] +zarr-metadata = { workspace = true } +``` + +**Hatch workspace declaration** (consumed by `hatch env run`, including the CI test matrix in `test.yml`): + +```toml +[tool.hatch.envs.test] +workspace.members = ["packages/zarr-metadata"] +``` + +Both mechanisms point at the same in-tree path. They have to be declared separately because uv and hatch don't share configuration. The `dev` env, the `test` matrix, the inherited `gputest` and `upstream` envs all use the in-tree source. The `min_deps` env explicitly opts out (`workspace.members = []`) so it tests against the minimum supported zarr-metadata from PyPI — the floor of the version range in `[project.dependencies]`. + +What this means in practice: + +- **During local development** (whether you invoke `uv run pytest` or `hatch env run --env test.py3.12-optional run`), zarr-python resolves `zarr-metadata` from the in-tree source under `packages/zarr-metadata/`. Changes you make there are immediately visible to zarr-python without reinstalling. +- **In the published wheel**, only the `[project.dependencies]` version requirement (`zarr-metadata>=0.3.0,<0.4`) is carried. The workspace declarations are development-only configuration. Users installing zarr-python from PyPI get the published zarr-metadata wheel. +- **In CI**, the primary test matrix (`test.yml`) runs `hatch env run` against the in-tree zarr-metadata. A change in `packages/zarr-metadata/` that breaks zarr-python surfaces immediately, before zarr-metadata is released to PyPI. The `min_deps` job additionally exercises the published floor on every PR, so a change in zarr-python that *requires* an unreleased zarr-metadata feature also gets caught. + +If you change zarr-metadata, also run zarr-python's test suite. 
The workspace setup makes this transparent — your usual `uv run pytest` or `hatch env run` picks up the in-tree source automatically. + +When releasing a new zarr-metadata version that contains a breaking change, also bump zarr-python's version cap on zarr-metadata (currently `<0.4`) in the same release cycle. See [Releasing zarr-python when zarr-metadata has changed](#releasing-zarr-python-when-zarr-metadata-has-changed) below for the full procedure. + ### Creating a branch Before you do any new work or submit a pull request, please open an issue on GitHub to report the bug or propose the feature you'd like to add. @@ -421,6 +459,32 @@ We aim to either **promote** or **remove** experimental features within **6 mont Features in `zarr.experimental` carry no stability guarantees. They may be changed or removed in any release, including patch releases. If you depend on an experimental feature, pin your `zarr-python` version accordingly. +## Release procedure + +Open an issue on GitHub announcing the release using the release checklist template: +[https://github.com/zarr-developers/zarr-python/issues/new?template=release-checklist.md](https://github.com/zarr-developers/zarr-python/issues/new?template=release-checklist.md). The release checklist includes all steps necessary for the release. + +### Preparing a release + +Releases are prepared using the ["Prepare release notes"](https://github.com/zarr-developers/zarr-python/actions/workflows/prepare_release.yml) workflow. To run it: + +1. Go to the [workflow page](https://github.com/zarr-developers/zarr-python/actions/workflows/prepare_release.yml) and click "Run workflow". +2. Enter the release version (e.g. `3.2.0`) and the target branch (defaults to `main`). +3. The workflow will run `towncrier build` to render the changelog, remove consumed fragments from `changes/`, and open a pull request on the `release/vX.Y.Z` branch (e.g. `release/v3.2.0`). +4. 
The release PR is automatically labeled `run-downstream`, which triggers the [downstream test workflow](https://github.com/zarr-developers/zarr-python/actions/workflows/downstream.yml) to run Xarray and numcodecs integration tests against the release branch. +5. Review the rendered changelog in `docs/release-notes.md` and verify downstream tests pass before merging. + +### Releasing zarr-python when zarr-metadata has changed + +zarr-python depends on the [`zarr-metadata`](https://pypi.org/project/zarr-metadata/) package, which is developed in the same monorepo (see [The zarr-metadata package and the workspace](#the-zarr-metadata-package-and-the-workspace) above). When a zarr-python release depends on a zarr-metadata change that has not yet been published to PyPI, the release must follow this order: + +1. **Bump zarr-metadata's version** in `packages/zarr-metadata/pyproject.toml` and `packages/zarr-metadata/src/zarr_metadata/__init__.py` (the version literal). Use semver: bump the minor for breaking type changes, the patch for additive changes. +2. **Release zarr-metadata to PyPI.** Tag and publish from `packages/zarr-metadata/`. +3. **Bump zarr-python's floor** on zarr-metadata in `[project.dependencies]` (e.g. `zarr-metadata>=0.2.0,<0.3` → `zarr-metadata>=0.3.0,<0.4`), and bump the `zarr-metadata==` pin in `[tool.hatch.envs.min_deps]` to the same version (the `check-min-deps-floor` pre-commit hook fails if the two disagree). Update `[tool.uv.workspace]` and `[tool.uv.sources]` only if necessary. +4. **Release zarr-python.** + +If steps 1 and 2 are skipped (or step 3's bumped floor names a version that does not yet exist on PyPI), the `verify_pypi_dependency` job in [`releases.yml`](https://github.com/zarr-developers/zarr-python/blob/main/.github/workflows/releases.yml) will fail before the upload step runs. This gate exists because the wheel ships only a version-range requirement; pip resolves that against PyPI on the user's machine, and there is no built-in equivalent of `cargo publish`'s automatic check that the declared dependency is actually available in the registry. 
+ ## Benchmarks Zarr uses [pytest-benchmark](https://pytest-benchmark.readthedocs.io/en/latest/) for running diff --git a/pyproject.toml b/pyproject.toml index 493b18822a..2b42aa3fca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ 'google-crc32c>=1.5', 'typing_extensions>=4.14', 'donfig>=0.8', + 'zarr-metadata>=0.3.0,<0.4', ] dynamic = [ @@ -158,6 +159,15 @@ omit = [ "src/zarr/testing/conftest.py", # only for downstream projects ] +# When developing zarr-python locally, resolve zarr-metadata from the in-tree +# package under packages/zarr-metadata/. The `[project.dependencies]` version +# requirement is what propagates to consumers installing from PyPI. +[tool.uv.workspace] +members = ["packages/zarr-metadata"] + +[tool.uv.sources] +zarr-metadata = { workspace = true } + [tool.hatch] version.source = "vcs" # Only consider zarr-python's own `v*` tags when deriving the version. Without @@ -172,9 +182,18 @@ hooks.vcs.version-file = "src/zarr/_version.py" [tool.hatch.envs.dev] dependency-groups = ["dev"] +# Resolve zarr-metadata from the in-tree workspace member, not PyPI. See +# `[tool.uv.sources]` above for the equivalent for `uv run` invocations. +workspace.members = ["packages/zarr-metadata"] [tool.hatch.envs.test] dependency-groups = ["test"] +# Resolve zarr-metadata from the in-tree workspace member, not PyPI, so CI +# in `test.yml` exercises the integration between the two packages on every +# PR. Envs that inherit via `template = "test"` (gputest, upstream) pick +# this up automatically; min_deps overrides it (see below) to test against +# the published floor. +workspace.members = ["packages/zarr-metadata"] [tool.hatch.envs.test.env-vars] @@ -253,6 +272,15 @@ PIP_EXTRA_INDEX_URL = "https://pypi.org/simple/" PIP_PRE = "1" [tool.hatch.envs.min_deps] +# Use pip rather than the inherited uv installer. This env must resolve +# zarr-metadata from PyPI (the published floor pinned below), not the in-tree +# workspace member. 
uv would honor the root `[tool.uv.sources] zarr-metadata = +# { workspace = true }` and substitute the workspace copy (whose hatch-vcs dev +# version can never equal the `==` floor), producing "No solution found" — see +# https://github.com/pypa/hatch/issues/1639. pip ignores `[tool.uv.sources]` +# entirely, so the `zarr-metadata==` pin below resolves against the +# published wheel and keeps the "minimum supported deps" guarantee honest. +installer = "pip" description = """Test environment for minimum supported dependencies See Spec 0000 for details and drop schedule: https://scientific-python.org/specs/spec-0000/ @@ -271,6 +299,10 @@ extra-dependencies = [ 'typing_extensions==4.14.*', 'donfig==0.8.*', 'obstore==0.5.*', + # Pin to the floor of zarr-python's declared zarr-metadata range. Must + # match the >= bound in [project.dependencies] above; the + # `check_min_deps_floor.py` pre-commit hook enforces this invariant. + 'zarr-metadata==0.3.0', ] [tool.hatch.envs.default] diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py index 9a1b47b351..a9b0fadc4e 100644 --- a/src/zarr/codecs/__init__.py +++ b/src/zarr/codecs/__init__.py @@ -1,5 +1,17 @@ from __future__ import annotations +from zarr_metadata import ( + BLOSC_CODEC_NAME, + BYTES_CODEC_NAME, + CAST_VALUE_CODEC_NAME, + CRC32C_CODEC_NAME, + GZIP_CODEC_NAME, + SCALE_OFFSET_CODEC_NAME, + SHARDING_INDEXED_CODEC_NAME, + TRANSPOSE_CODEC_NAME, + ZSTD_CODEC_NAME, +) + from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle from zarr.codecs.bytes import BytesCodec, Endian from zarr.codecs.cast_value import CastValue @@ -54,20 +66,20 @@ "ZstdCodec", ] -register_codec("blosc", BloscCodec) -register_codec("cast_value", CastValue) -register_codec("bytes", BytesCodec) +register_codec(BLOSC_CODEC_NAME, BloscCodec) +register_codec(CAST_VALUE_CODEC_NAME, CastValue) +register_codec(BYTES_CODEC_NAME, BytesCodec) # compatibility with earlier versions of ZEP1 register_codec("endian", BytesCodec) 
-register_codec("crc32c", Crc32cCodec) -register_codec("gzip", GzipCodec) -register_codec("scale_offset", ScaleOffset) -register_codec("sharding_indexed", ShardingCodec) -register_codec("zstd", ZstdCodec) +register_codec(CRC32C_CODEC_NAME, Crc32cCodec) +register_codec(GZIP_CODEC_NAME, GzipCodec) +register_codec(SCALE_OFFSET_CODEC_NAME, ScaleOffset) +register_codec(SHARDING_INDEXED_CODEC_NAME, ShardingCodec) +register_codec(ZSTD_CODEC_NAME, ZstdCodec) register_codec("vlen-utf8", VLenUTF8Codec) register_codec("vlen-bytes", VLenBytesCodec) -register_codec("transpose", TransposeCodec) +register_codec(TRANSPOSE_CODEC_NAME, TransposeCodec) # Register all the codecs formerly contained in numcodecs.zarr3 diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index 087de716fc..f10114553c 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -3,16 +3,24 @@ import asyncio from dataclasses import dataclass, field, replace from functools import cached_property -from typing import TYPE_CHECKING, ClassVar, Final, Literal, NotRequired, TypedDict +from typing import TYPE_CHECKING, ClassVar, Literal, NotRequired, TypedDict import numcodecs +import zarr_metadata from numcodecs.blosc import Blosc from packaging.version import Version +from zarr_metadata import BLOSC_CODEC_NAME +from zarr_metadata.v3.codec.blosc import ( + BloscCodecConfiguration as _BloscCodecConfiguration, +) +from zarr_metadata.v3.codec.blosc import ( + BloscCodecObject as _BloscCodecObject, +) from zarr.abc.codec import BytesBytesCodec from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, NamedRequiredConfig, parse_named_configuration +from zarr.core.common import JSON, parse_named_configuration from zarr.core.dtype.common import HasItemSize if TYPE_CHECKING: @@ -21,19 +29,24 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer 
-BloscShuffleLiteral = Literal["noshuffle", "shuffle", "bitshuffle"] +# Re-exported under zarr-python's historical names; canonical definitions live +# in `zarr_metadata`. Plain assignments (not `import as`) so these remain +# explicitly importable from this module. +BloscShuffleLiteral = zarr_metadata.BloscShuffle """The shuffle values permitted for the blosc codec""" -BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") +BLOSC_SHUFFLE = zarr_metadata.BLOSC_SHUFFLE -BloscCnameLiteral = Literal["lz4", "lz4hc", "blosclz", "snappy", "zlib", "zstd"] +BloscCnameLiteral = zarr_metadata.BloscCName """The codec identifiers used in the blosc codec""" -BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "snappy", "zlib", "zstd") +BLOSC_CNAME = zarr_metadata.BLOSC_CNAME class BloscConfigV2(TypedDict): - """Configuration for the V2 Blosc codec""" + """Configuration for the V2 Blosc codec. + + v2 codec shapes predate zarr-metadata, which models only v3 codecs.""" cname: BloscCnameLiteral clevel: int @@ -42,20 +55,8 @@ class BloscConfigV2(TypedDict): typesize: NotRequired[int] -class BloscConfigV3(TypedDict): - """Configuration for the V3 Blosc codec""" - - cname: BloscCnameLiteral - clevel: int - shuffle: BloscShuffleLiteral - blocksize: int - typesize: int - - -class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): - """ - The JSON form of the Blosc codec in Zarr V3. 
- """ +BloscConfigV3 = _BloscCodecConfiguration +BloscJSON_V3 = _BloscCodecObject class BloscShuffle(metaclass=_DeprecatedStrEnumMeta): @@ -264,12 +265,12 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "blosc") + _, configuration_parsed = parse_named_configuration(data, BLOSC_CODEC_NAME) return cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: result: BloscJSON_V3 = { - "name": "blosc", + "name": BLOSC_CODEC_NAME, "configuration": { "typesize": self.typesize, "cname": self.cname, diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 240c077627..684820fe14 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -3,7 +3,10 @@ import sys import warnings from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, ClassVar, Final, Literal +from typing import TYPE_CHECKING, ClassVar, Final + +import zarr_metadata +from zarr_metadata import BYTES_CODEC_NAME from zarr.abc.codec import ArrayBytesCodec from zarr.codecs._deprecated_enum import _coerce_enum_input, _DeprecatedStrEnumMeta @@ -17,11 +20,13 @@ from zarr.core.array_spec import ArraySpec - -EndianLiteral = Literal["little", "big"] +# Re-exported under zarr-python's historical names; canonical definitions live +# in `zarr_metadata`. Plain assignments (not `import as`) so these remain +# explicitly importable from this module. 
+EndianLiteral = zarr_metadata.Endianness """Byte order of multi-byte numeric data.""" -ENDIAN: Final = ("little", "big") +ENDIAN: Final = zarr_metadata.ENDIANNESS class Endian(metaclass=_DeprecatedStrEnumMeta): @@ -59,7 +64,7 @@ def __init__(self, *, endian: Endian | EndianLiteral | None = sys.byteorder) -> @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: _, configuration_parsed = parse_named_configuration( - data, "bytes", require_configuration=False + data, BYTES_CODEC_NAME, require_configuration=False ) configuration_parsed = configuration_parsed or {} configuration_parsed.setdefault("endian", None) @@ -67,9 +72,9 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: if self.endian is None: - return {"name": "bytes"} + return {"name": BYTES_CODEC_NAME} else: - return {"name": "bytes", "configuration": {"endian": self.endian}} + return {"name": BYTES_CODEC_NAME, "configuration": {"endian": self.endian}} def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if isinstance(array_spec.dtype, Struct): diff --git a/src/zarr/codecs/cast_value.py b/src/zarr/codecs/cast_value.py index eb8a4de248..dad7dd90a9 100644 --- a/src/zarr/codecs/cast_value.py +++ b/src/zarr/codecs/cast_value.py @@ -12,9 +12,10 @@ from collections.abc import Mapping from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, Final, Literal, TypedDict, cast +from typing import TYPE_CHECKING, Final, TypedDict, cast import numpy as np +from zarr_metadata import CAST_VALUE_CODEC_NAME from zarr.abc.codec import ArrayArrayCodec from zarr.core.common import JSON, parse_named_configuration @@ -23,6 +24,13 @@ if TYPE_CHECKING: from typing import NotRequired, Self + from zarr_metadata.v3.codec.cast_value import ( + CastOutOfRangeMode as OutOfRangeMode, + ) + from zarr_metadata.v3.codec.cast_value import ( + CastRoundingMode as RoundingMode, + ) + from zarr.core.array_spec import ArraySpec from zarr.core.buffer import NDBuffer 
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -33,17 +41,6 @@ class ScalarMapJSON(TypedDict): decode: NotRequired[list[tuple[object, object]]] -RoundingMode = Literal[ - "nearest-even", - "towards-zero", - "towards-positive", - "towards-negative", - "nearest-away", -] - -OutOfRangeMode = Literal["clamp", "wrap"] - - class ScalarMap(TypedDict, total=False): """ The normalized, in-memory form of a scalar map. @@ -230,7 +227,7 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: _, configuration_parsed = parse_named_configuration( - data, "cast_value", require_configuration=True + data, CAST_VALUE_CODEC_NAME, require_configuration=True ) return cls(**configuration_parsed) # type: ignore[arg-type] @@ -241,12 +238,18 @@ def to_dict(self) -> dict[str, JSON]: if self.out_of_range is not None: config["out_of_range"] = self.out_of_range if self.scalar_map is not None: - json_map: dict[str, list[tuple[object, object]]] = {} + # Emit ScalarMap entries as a tuple of 2-tuples. JSON Arrays are + # typed fixed-length containers at the spec level; the + # in-memory canonical shape is `tuple[tuple[object, object], ...]` + # to match `zarr_metadata.v3.codec.cast_value.ScalarMap`. 
+ json_map: dict[str, tuple[tuple[object, object], ...]] = {} for direction in ("encode", "decode"): if direction in self.scalar_map: - json_map[direction] = [(k, v) for k, v in self.scalar_map[direction].items()] + json_map[direction] = tuple( + (k, v) for k, v in self.scalar_map[direction].items() + ) config["scalar_map"] = cast("JSON", json_map) - return {"name": "cast_value", "configuration": config} + return {"name": CAST_VALUE_CODEC_NAME, "configuration": config} def validate( self, diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index ebe2ac8f7a..c2d6e8b37c 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -6,6 +6,7 @@ import google_crc32c import numpy as np import typing_extensions +from zarr_metadata import CRC32C_CODEC_NAME from zarr.abc.codec import BytesBytesCodec from zarr.core.common import JSON, parse_named_configuration @@ -25,11 +26,11 @@ class Crc32cCodec(BytesBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - parse_named_configuration(data, "crc32c", require_configuration=False) + parse_named_configuration(data, CRC32C_CODEC_NAME, require_configuration=False) return cls() def to_dict(self) -> dict[str, JSON]: - return {"name": "crc32c"} + return {"name": CRC32C_CODEC_NAME} def _decode_sync( self, diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index b8591748f7..66e1aa0d03 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING from numcodecs.gzip import GZip +from zarr_metadata import GZIP_CODEC_NAME from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper @@ -43,11 +44,11 @@ def __init__(self, *, level: int = 5) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "gzip") + _, configuration_parsed = parse_named_configuration(data, GZIP_CODEC_NAME) return 
cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: - return {"name": "gzip", "configuration": {"level": self.level}} + return {"name": GZIP_CODEC_NAME, "configuration": {"level": self.level}} @cached_property def _gzip_codec(self) -> GZip: diff --git a/src/zarr/codecs/scale_offset.py b/src/zarr/codecs/scale_offset.py index c96e177c6b..bfa887407a 100644 --- a/src/zarr/codecs/scale_offset.py +++ b/src/zarr/codecs/scale_offset.py @@ -5,6 +5,7 @@ import numpy as np import numpy.typing as npt +from zarr_metadata import SCALE_OFFSET_CODEC_NAME from zarr.abc.codec import ArrayArrayCodec from zarr.core.common import JSON, parse_named_configuration @@ -327,20 +328,20 @@ def __init__(self, *, offset: object = 0, scale: object = 1) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: _, configuration_parsed = parse_named_configuration( - data, "scale_offset", require_configuration=False + data, SCALE_OFFSET_CODEC_NAME, require_configuration=False ) configuration_parsed = configuration_parsed or {} return cls(**configuration_parsed) def to_dict(self) -> dict[str, JSON]: if self.offset == 0 and self.scale == 1: - return {"name": "scale_offset"} + return {"name": SCALE_OFFSET_CODEC_NAME} config: dict[str, JSON] = {} if self.offset != 0: config["offset"] = self.offset if self.scale != 1: config["scale"] = self.scale - return {"name": "scale_offset", "configuration": config} + return {"name": SCALE_OFFSET_CODEC_NAME, "configuration": config} def validate( self, diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 332aab3351..7bd917e0e5 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -7,6 +7,8 @@ import numpy as np import numpy.typing as npt +import zarr_metadata +from zarr_metadata import SHARDING_INDEXED_CODEC_NAME from zarr.abc.codec import ( ArrayBytesCodec, @@ -73,10 +75,13 @@ ShardMutableMapping = MutableMapping[tuple[int, ...], Buffer | None] -IndexLocation = 
Literal["start", "end"] +# Re-exported under zarr-python's historical names; canonical definitions live +# in `zarr_metadata`. Plain assignments (not `import as`) so these remain +# explicitly importable from this module. +IndexLocation = zarr_metadata.ShardingIndexLocation """Position of the shard index within the encoded shard.""" -INDEX_LOCATION: Final = ("start", "end") +INDEX_LOCATION: Final = zarr_metadata.SHARDING_INDEX_LOCATION class ShardingCodecIndexLocation(metaclass=_DeprecatedStrEnumMeta): @@ -384,7 +389,7 @@ def __setstate__(self, state: dict[str, Any]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") + _, configuration_parsed = parse_named_configuration(data, SHARDING_INDEXED_CODEC_NAME) return cls(**configuration_parsed) # type: ignore[arg-type] @property @@ -393,7 +398,7 @@ def codec_pipeline(self) -> CodecPipeline: def to_dict(self) -> dict[str, JSON]: return { - "name": "sharding_indexed", + "name": SHARDING_INDEXED_CODEC_NAME, "configuration": { "chunk_shape": self.chunk_shape, "codecs": tuple(s.to_dict() for s in self.codecs), diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index 5756fba2b4..098155710f 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, cast import numpy as np +from zarr_metadata import TRANSPOSE_CODEC_NAME from zarr.abc.codec import ArrayArrayCodec from zarr.core.array_spec import ArraySpec @@ -41,11 +42,11 @@ def __init__(self, *, order: Iterable[int]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "transpose") + _, configuration_parsed = parse_named_configuration(data, TRANSPOSE_CODEC_NAME) return cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: - return {"name": "transpose", "configuration": 
{"order": tuple(self.order)}} + return {"name": TRANSPOSE_CODEC_NAME, "configuration": {"order": tuple(self.order)}} def validate( self, diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index f93c25a3c7..198bfa47bb 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -8,6 +8,7 @@ import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version +from zarr_metadata import ZSTD_CODEC_NAME from zarr.abc.codec import BytesBytesCodec from zarr.core.buffer.cpu import as_numpy_array_wrapper @@ -60,11 +61,14 @@ def __init__(self, *, level: int = 0, checksum: bool = False) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "zstd") + _, configuration_parsed = parse_named_configuration(data, ZSTD_CODEC_NAME) return cls(**configuration_parsed) # type: ignore[arg-type] def to_dict(self) -> dict[str, JSON]: - return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + return { + "name": ZSTD_CODEC_NAME, + "configuration": {"level": self.level, "checksum": self.checksum}, + } @cached_property def _zstd_codec(self) -> Zstd: diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py index 76d763d267..179c2bee29 100644 --- a/src/zarr/core/dtype/common.py +++ b/src/zarr/core/dtype/common.py @@ -11,13 +11,17 @@ TypeGuard, ) +import zarr_metadata from typing_extensions import ReadOnly from zarr.core.common import NamedConfig from zarr.errors import UnstableSpecificationWarning -EndiannessStr = Literal["little", "big"] -ENDIANNESS_STR: Final = "little", "big" +# Re-exported under zarr-python's historical names; canonical definitions live +# in `zarr_metadata`. Plain assignments (not `import as`) so these remain +# explicitly importable from this module. 
+EndiannessStr = zarr_metadata.Endianness +ENDIANNESS_STR: Final = zarr_metadata.ENDIANNESS SpecialFloatStrings = Literal["NaN", "Infinity", "-Infinity"] SPECIAL_FLOAT_STRINGS: Final = ("NaN", "Infinity", "-Infinity") diff --git a/src/zarr/core/dtype/npy/common.py b/src/zarr/core/dtype/npy/common.py index f413f5f678..9382d40693 100644 --- a/src/zarr/core/dtype/npy/common.py +++ b/src/zarr/core/dtype/npy/common.py @@ -18,6 +18,7 @@ ) import numpy as np +import zarr_metadata from zarr.core.dtype.common import ( ENDIANNESS_STR, @@ -33,26 +34,11 @@ IntLike = SupportsInt | SupportsIndex | bytes | str FloatLike = SupportsIndex | SupportsFloat | bytes | str ComplexLike = SupportsFloat | SupportsIndex | SupportsComplex | bytes | str | None -DateTimeUnit = Literal[ - "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", "generic" -] -DATETIME_UNIT: Final = ( - "Y", - "M", - "W", - "D", - "h", - "m", - "s", - "ms", - "us", - "μs", - "ns", - "ps", - "fs", - "as", - "generic", -) +# Re-exported under zarr-python's historical names; canonical definitions live +# in `zarr_metadata`. Plain assignments (not `import as`) so these remain +# explicitly importable from this module. +DateTimeUnit = zarr_metadata.NumpyTimeUnit +DATETIME_UNIT: Final = zarr_metadata.NUMPY_TIME_UNIT IntishFloat = NewType("IntishFloat", float) """A type for floats that represent integers, like 1.0 (but not 1.1).""" diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py index b865998e52..1016f1dfcd 100644 --- a/src/zarr/core/dtype/npy/structured.py +++ b/src/zarr/core/dtype/npy/structured.py @@ -590,10 +590,14 @@ def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructJSON_V3: return {"name": fields_v2, "object_codec_id": None} elif zarr_format == 3: v3_unstable_dtype_warning(self) - fields_v3 = [ + # `fields` is emitted as a tuple, not a list: a JSON array is a + # typed fixed-length container, which `tuple` models faithfully. 
+ # This matches zarr-metadata's `StructConfiguration.fields` type. + # `json.dumps` serializes tuple and list identically. + fields_v3 = tuple( {"name": f_name, "data_type": f_dtype.to_json(zarr_format=zarr_format)} for f_name, f_dtype in self.fields - ] + ) return cast( "StructJSON_V3", {"name": self._zarr_v3_name, "configuration": {"fields": fields_v3}}, diff --git a/src/zarr/core/dtype/npy/time.py b/src/zarr/core/dtype/npy/time.py index 4efa0be7bb..d0c5eaa9c6 100644 --- a/src/zarr/core/dtype/npy/time.py +++ b/src/zarr/core/dtype/npy/time.py @@ -16,6 +16,8 @@ import numpy as np from typing_extensions import ReadOnly +from zarr_metadata import NUMPY_TIME_UNIT as DATETIME_UNIT +from zarr_metadata import NumpyTimeUnit as DateTimeUnit from zarr.core.common import NamedRequiredConfig from zarr.core.dtype.common import ( @@ -26,8 +28,6 @@ check_dtype_spec_v2, ) from zarr.core.dtype.npy.common import ( - DATETIME_UNIT, - DateTimeUnit, check_json_int, endianness_to_numpy_str, get_endianness_from_numpy_dtype, diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 52eaa3e144..e265096c81 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -2,6 +2,7 @@ import asyncio import itertools +import json import logging import unicodedata import warnings @@ -73,6 +74,8 @@ ) from typing import Any + from zarr_metadata.v2 import ConsolidatedMetadataV2 + from zarr.core.array_spec import ArrayConfigLike from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike @@ -434,6 +437,12 @@ def to_dict(self) -> dict[str, Any]: else: # Leave consolidated metadata unset if it's None result.pop("consolidated_metadata") + # `node_type` is a v3-only field. v2 group metadata (.zgroup) has + # only `zarr_format`; attributes live in a sibling .zattrs file. + # The dataclass carries `node_type` for in-memory use; strip it + # from the serialized v2 form. 
+ if self.zarr_format == 2: + result.pop("node_type", None) return result @@ -624,8 +633,12 @@ def _from_bytes_v2( group_metadata: dict[str, Any] = {**zgroup, "attributes": zattrs} if consolidated_metadata_bytes is not None: - v2_consolidated_doc = buffer_to_json_object(consolidated_metadata_bytes) - v2_consolidated_metadata = cast("dict[str, Any]", v2_consolidated_doc["metadata"]) + # The parsed file has the shape of `ConsolidatedMetadataV2` from + # zarr-metadata (keys like `foo/.zarray`, `foo/.zgroup`, + # `foo/.zattrs`). Mutate it below to strip and reorganize + # entries, so convert to a mutable `dict` after parsing. + parsed: ConsolidatedMetadataV2 = json.loads(consolidated_metadata_bytes.to_bytes()) + v2_consolidated_metadata = dict(parsed["metadata"]) # We already read zattrs and zgroup. Should we ignore these? v2_consolidated_metadata.pop(".zattrs", None) v2_consolidated_metadata.pop(".zgroup", None) diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index ac32521239..f58e701d04 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -4,7 +4,7 @@ import warnings from collections.abc import Iterable, Sequence from functools import cached_property -from typing import TYPE_CHECKING, Any, TypedDict, cast +from typing import TYPE_CHECKING, Any, cast from zarr.abc.metadata import Metadata from zarr.abc.numcodec import Numcodec, _is_numcodec @@ -29,6 +29,7 @@ from dataclasses import dataclass, field, fields, replace import numpy as np +from zarr_metadata.v2.array import ArrayMetadataV2 as _ArrayMetadataV2 from zarr.core._json import json_to_buffer from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -43,18 +44,10 @@ from zarr.core.config import config, parse_indexing_order from zarr.core.metadata.common import parse_attributes - -class ArrayV2MetadataDict(TypedDict): - """ - A typed dictionary model for Zarr format 2 metadata. 
- """ - - zarr_format: Literal[2] - attributes: dict[str, JSON] - - # Union of acceptable types for v2 compressors type CompressorLikev2 = dict[str, JSON] | Numcodec | None +# Re-export the v2 array metadata JSON shape under zarr-python's historical name. +ArrayV2MetadataDict = _ArrayMetadataV2 @dataclass(frozen=True, kw_only=True) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 9eaccc5076..95affafb1a 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -3,9 +3,16 @@ import json from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, field, replace -from typing import TYPE_CHECKING, Any, Final, Literal, NotRequired, TypeGuard, cast +from typing import TYPE_CHECKING, Any, Final, Literal, TypeGuard, cast from typing_extensions import TypedDict +from zarr_metadata import ( + RECTILINEAR_CHUNK_GRID_NAME, + REGULAR_CHUNK_GRID_NAME, + RectilinearChunkGridName, + RegularChunkGridName, +) +from zarr_metadata.v3.array import ArrayMetadataV3, ExtensionFieldV3 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.metadata import Metadata @@ -140,14 +147,12 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]: ) -class AllowedExtraField(TypedDict, extra_items=JSON): # type: ignore[call-arg] - """ - This class models allowed extra fields in array metadata. - They must have ``must_understand`` set to ``False``, and may contain - arbitrary additional JSON data. - """ +AllowedExtraField = ExtensionFieldV3 +"""Alias for `zarr_metadata.v3.array.ExtensionFieldV3`. 
- must_understand: Literal[False] +`must_understand` is typed as `bool` to match the spec (extension authors that +*understand* a field may produce `True`); the runtime guard +`check_allowed_extra_field` enforces that zarr-python only accepts `False`.""" def check_allowed_extra_field(data: object) -> TypeGuard[AllowedExtraField]: @@ -192,10 +197,10 @@ class RectilinearChunkGridMetadataConfig(TypedDict): RegularChunkGridMetadataJSON = NamedRequiredConfig[ - Literal["regular"], RegularChunkGridMetadataConfig + RegularChunkGridName, RegularChunkGridMetadataConfig ] RectilinearChunkGridMetadataJSON = NamedRequiredConfig[ - Literal["rectilinear"], RectilinearChunkGridMetadataConfig + RectilinearChunkGridName, RectilinearChunkGridMetadataConfig ] @@ -260,13 +265,13 @@ def ndim(self) -> int: def to_dict(self) -> RegularChunkGridMetadataJSON: # type: ignore[override] return { - "name": "regular", + "name": REGULAR_CHUNK_GRID_NAME, "configuration": {"chunk_shape": self.chunk_shape}, } @classmethod def from_dict(cls, data: RegularChunkGridMetadataJSON) -> Self: # type: ignore[override] - parse_named_configuration(data, "regular") # validate name + parse_named_configuration(data, REGULAR_CHUNK_GRID_NAME) # validate name configuration = data["configuration"] return cls(chunk_shape=_parse_chunk_shape(configuration["chunk_shape"])) @@ -316,7 +321,7 @@ def to_dict(self) -> RectilinearChunkGridMetadataJSON: # type: ignore[override] else: serialized_dims.append(list(dim_spec)) return { - "name": "rectilinear", + "name": RECTILINEAR_CHUNK_GRID_NAME, "configuration": { "kind": "inline", "chunk_shapes": tuple(serialized_dims), @@ -349,7 +354,7 @@ def update_shape( @classmethod def from_dict(cls, data: RectilinearChunkGridMetadataJSON) -> Self: # type: ignore[override] - parse_named_configuration(data, "rectilinear") # validate name + parse_named_configuration(data, RECTILINEAR_CHUNK_GRID_NAME) # validate name configuration = data["configuration"] 
validate_rectilinear_kind(configuration.get("kind")) raw_shapes = configuration["chunk_shapes"] @@ -413,32 +418,20 @@ def parse_chunk_grid( return data name, _ = parse_named_configuration(data) - if name == "regular": + if name == REGULAR_CHUNK_GRID_NAME: return RegularChunkGridMetadata.from_dict(data) # type: ignore[arg-type] - if name == "rectilinear": + if name == RECTILINEAR_CHUNK_GRID_NAME: return RectilinearChunkGridMetadata.from_dict(data) # type: ignore[arg-type] raise ValueError(f"Unknown chunk grid name: {name!r}") -class ArrayMetadataJSON_V3(TypedDict, extra_items=AllowedExtraField): # type: ignore[call-arg] - """ - A typed dictionary model for zarr v3 array metadata. +ArrayMetadataJSON_V3 = ArrayMetadataV3 +"""Alias for `zarr_metadata.v3.array.ArrayMetadataV3`, the TypedDict modeling +the v3 array metadata document. - Extra keys are permitted if they conform to ``AllowedExtraField`` - (i.e. they are mappings with ``must_understand: false``). - """ - - zarr_format: Literal[3] - node_type: Literal["array"] - data_type: str | NamedConfig[str, Mapping[str, JSON]] - shape: tuple[int, ...] - chunk_grid: str | NamedConfig[str, Mapping[str, JSON]] - chunk_key_encoding: str | NamedConfig[str, Mapping[str, JSON]] - fill_value: JSON - codecs: tuple[str | NamedConfig[str, Mapping[str, JSON]], ...] - attributes: NotRequired[Mapping[str, JSON]] - storage_transformers: NotRequired[tuple[str | NamedConfig[str, Mapping[str, JSON]], ...]] - dimension_names: NotRequired[tuple[str | None, ...]] +Used throughout zarr-python under this name to avoid visual collision with +the `ArrayV3Metadata` dataclass — the two differ only in word order. Extra +keys are permitted on this dict if they conform to `ExtensionFieldV3`.""" """ @@ -665,6 +658,12 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: ) def to_dict(self) -> dict[str, JSON]: + """Serialize as a JSON-shaped dict matching `ArrayMetadataJSON_V3`. 
+ + Return type is `dict[str, JSON]` rather than `ArrayMetadataJSON_V3` so + the result composes with other zarr-python metadata serialisation + paths that traffic in `dict[str, JSON]` (notably consolidated metadata). + """ out_dict = super().to_dict() extra_fields = out_dict.pop("extra_fields") out_dict = out_dict | extra_fields # type: ignore[operator] diff --git a/tests/test_array.py b/tests/test_array.py index 0d6d2d5906..f3ca0ed70f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -7,7 +7,7 @@ import re import sys from itertools import accumulate, starmap -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, cast from unittest import mock import numcodecs @@ -85,7 +85,11 @@ from .test_dtype.conftest import zdtype_examples if TYPE_CHECKING: + from zarr_metadata import ArrayMetadataV2 + from zarr_metadata.v3.codec.bytes import BytesCodecMetadata + from zarr.abc.codec import CodecJSON_V3 + from zarr.core.metadata import ArrayMetadataJSON_V3 @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) @@ -325,47 +329,47 @@ def test_serializable_sync_array(store: LocalStore, zarr_format: ZarrFormat) -> @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("zarr_format", [2, 3, "invalid"]) -def test_storage_transformers(store: MemoryStore, zarr_format: ZarrFormat | str) -> None: +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_storage_transformers(store: MemoryStore, zarr_format: ZarrFormat) -> None: """ - Test that providing an actual storage transformer produces a warning and otherwise passes through + storage_transformers is a v3-only field; passing a populated one to v3 + array construction raises, while v2 (where the field has no spec + meaning) is unaffected. 
""" - metadata_dict: dict[str, JSON] if zarr_format == 3: - metadata_dict = { + v3_metadata: ArrayMetadataJSON_V3 = { "zarr_format": 3, "node_type": "array", "shape": (10,), "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, "data_type": "uint8", "chunk_key_encoding": {"name": "v2", "configuration": {"separator": "/"}}, - "codecs": (BytesCodec().to_dict(),), + "codecs": (BytesCodec().to_dict(),), # type: ignore[typeddict-item] "fill_value": 0, - "storage_transformers": ({"test": "should_raise"}), + # Deliberately invalid: the test asserts that any non-empty + # storage_transformers value triggers the "not supported" + # error path, regardless of its inner shape. + "storage_transformers": ({"test": "should_raise"},), # type: ignore[typeddict-item] } + match = "Arrays with storage transformers are not supported in zarr-python at this time." + with pytest.raises(ValueError, match=match): + # cast: from_dict accepts the wider `dict[str, JSON]`. + Array.from_dict(StorePath(store), data=cast("dict[str, JSON]", v3_metadata)) else: - metadata_dict = { - "zarr_format": zarr_format, + # Plain v2 array metadata; no v3-only fields (no codecs, + # storage_transformers, chunk_grid, etc.). + v2_metadata: ArrayMetadataV2 = { + "zarr_format": 2, "shape": (10,), "chunks": (1,), "dtype": "|u1", "dimension_separator": ".", - "codecs": (BytesCodec().to_dict(),), + "compressor": None, "fill_value": 0, "order": "C", - "storage_transformers": ({"test": "should_raise"}), + "filters": None, } - if zarr_format == 3: - match = "Arrays with storage transformers are not supported in zarr-python at this time." - with pytest.raises(ValueError, match=match): - Array.from_dict(StorePath(store), data=metadata_dict) - elif zarr_format == 2: - # no warning - Array.from_dict(StorePath(store), data=metadata_dict) - else: - match = f"Invalid zarr_format: {zarr_format}. 
Expected 2 or 3" - with pytest.raises(ValueError, match=match): - Array.from_dict(StorePath(store), data=metadata_dict) + Array.from_dict(StorePath(store), data=cast("dict[str, JSON]", v2_metadata)) @pytest.mark.parametrize("test_cls", [AnyArray, AnyAsyncArray]) @@ -1885,7 +1889,7 @@ def test_roundtrip_numcodecs() -> None: dimension_names=["lat", "lon"], ) - BYTES_CODEC = {"name": "bytes", "configuration": {"endian": "little"}} + BYTES_CODEC: BytesCodecMetadata = {"name": "bytes", "configuration": {"endian": "little"}} # Read in the array again and check compressor config root = zarr.open_group(store) metadata = root["test"].metadata.to_dict() diff --git a/tests/test_codecs/test_cast_value.py b/tests/test_codecs/test_cast_value.py index c43edb76e8..d682234ace 100644 --- a/tests/test_codecs/test_cast_value.py +++ b/tests/test_codecs/test_cast_value.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING, Any, cast import numpy as np import pytest @@ -9,6 +9,11 @@ from tests.conftest import Expect, ExpectFail from zarr.codecs.cast_value import CastValue +if TYPE_CHECKING: + from zarr_metadata.v3.codec.cast_value import CastValueCodecObject + + from zarr.core.common import JSON + try: import cast_value_rs # noqa: F401 @@ -26,14 +31,25 @@ # --------------------------------------------------------------------------- +_CAST_VALUE_MINIMAL: CastValueCodecObject = { + "name": "cast_value", + "configuration": {"data_type": "uint8"}, +} +_CAST_VALUE_FULL: CastValueCodecObject = { + "name": "cast_value", + "configuration": { + "data_type": "uint8", + "rounding": "towards-zero", + "out_of_range": "clamp", + "scalar_map": {"encode": (("NaN", 0),)}, + }, +} + + @pytest.mark.parametrize( "case", [ - Expect( - input=CastValue(data_type="uint8"), - output={"name": "cast_value", "configuration": {"data_type": "uint8"}}, - id="minimal", - ), + Expect(input=CastValue(data_type="uint8"), output=_CAST_VALUE_MINIMAL, id="minimal"), 
Expect( input=CastValue( data_type="uint8", @@ -41,51 +57,53 @@ out_of_range="clamp", scalar_map={"encode": [("NaN", 0)]}, ), - output={ - "name": "cast_value", - "configuration": { - "data_type": "uint8", - "rounding": "towards-zero", - "out_of_range": "clamp", - "scalar_map": {"encode": [("NaN", 0)]}, - }, - }, + output=_CAST_VALUE_FULL, id="full", ), ], ids=lambda c: c.id, ) -def test_to_dict(case: Expect[CastValue, dict[str, Any]]) -> None: +def test_to_dict(case: Expect[CastValue, CastValueCodecObject]) -> None: """to_dict produces the expected JSON structure.""" assert case.input.to_dict() == case.output +_CAST_VALUE_FROM_DICT_DEFAULTS: CastValueCodecObject = { + "name": "cast_value", + "configuration": {"data_type": "float32"}, +} +_CAST_VALUE_FROM_DICT_EXPLICIT: CastValueCodecObject = { + "name": "cast_value", + "configuration": { + "data_type": "int16", + "rounding": "towards-zero", + "out_of_range": "clamp", + }, +} + + @pytest.mark.parametrize( "case", [ Expect( - input={"name": "cast_value", "configuration": {"data_type": "float32"}}, + input=_CAST_VALUE_FROM_DICT_DEFAULTS, output=("float32", "nearest-even", None), id="defaults", ), Expect( - input={ - "name": "cast_value", - "configuration": { - "data_type": "int16", - "rounding": "towards-zero", - "out_of_range": "clamp", - }, - }, + input=_CAST_VALUE_FROM_DICT_EXPLICIT, output=("int16", "towards-zero", "clamp"), id="explicit", ), ], ids=lambda c: c.id, ) -def test_from_dict(case: Expect[dict[str, Any], tuple[str, str, str | None]]) -> None: +def test_from_dict( + case: Expect[CastValueCodecObject, tuple[str, str, str | None]], +) -> None: """from_dict deserializes configuration with correct values and defaults.""" - codec = CastValue.from_dict(case.input) + # cast: from_dict accepts the wider `dict[str, JSON]`. 
+ codec = CastValue.from_dict(cast("dict[str, JSON]", case.input)) dtype_name, rounding, out_of_range = case.output assert codec.dtype.to_native_dtype() == np.dtype(dtype_name) assert codec.rounding == rounding diff --git a/tests/test_codecs/test_scale_offset.py b/tests/test_codecs/test_scale_offset.py index 513caf463a..89df06a80c 100644 --- a/tests/test_codecs/test_scale_offset.py +++ b/tests/test_codecs/test_scale_offset.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING, Any, cast import numpy as np import pytest @@ -16,53 +16,67 @@ from zarr.core.buffer.core import default_buffer_prototype from zarr.storage._memory import MemoryStore +if TYPE_CHECKING: + from zarr_metadata.v3.codec.scale_offset import ScaleOffsetCodecObject + + from zarr.core.common import JSON + # --------------------------------------------------------------------------- # Serialization # --------------------------------------------------------------------------- +_SCALE_OFFSET_DEFAULT: ScaleOffsetCodecObject = {"name": "scale_offset"} +_SCALE_OFFSET_OFFSET_ONLY: ScaleOffsetCodecObject = { + "name": "scale_offset", + "configuration": {"offset": 5}, +} +_SCALE_OFFSET_SCALE_ONLY: ScaleOffsetCodecObject = { + "name": "scale_offset", + "configuration": {"scale": 0.1}, +} +_SCALE_OFFSET_BOTH: ScaleOffsetCodecObject = { + "name": "scale_offset", + "configuration": {"offset": 5, "scale": 0.1}, +} + + @pytest.mark.parametrize( "case", [ - Expect(input=ScaleOffset(), output={"name": "scale_offset"}, id="default"), - Expect( - input=ScaleOffset(offset=5), - output={"name": "scale_offset", "configuration": {"offset": 5}}, - id="offset-only", - ), - Expect( - input=ScaleOffset(scale=0.1), - output={"name": "scale_offset", "configuration": {"scale": 0.1}}, - id="scale-only", - ), - Expect( - input=ScaleOffset(offset=5, scale=0.1), - output={"name": "scale_offset", "configuration": {"offset": 5, "scale": 0.1}}, - id="both", - ), + 
Expect(input=ScaleOffset(), output=_SCALE_OFFSET_DEFAULT, id="default"), + Expect(input=ScaleOffset(offset=5), output=_SCALE_OFFSET_OFFSET_ONLY, id="offset-only"), + Expect(input=ScaleOffset(scale=0.1), output=_SCALE_OFFSET_SCALE_ONLY, id="scale-only"), + Expect(input=ScaleOffset(offset=5, scale=0.1), output=_SCALE_OFFSET_BOTH, id="both"), ], ids=lambda c: c.id, ) -def test_to_dict(case: Expect[ScaleOffset, dict[str, Any]]) -> None: +def test_to_dict(case: Expect[ScaleOffset, ScaleOffsetCodecObject]) -> None: """to_dict produces the expected JSON structure.""" assert case.input.to_dict() == case.output +_SCALE_OFFSET_FROM_DICT_NO_CONFIG: ScaleOffsetCodecObject = {"name": "scale_offset"} +_SCALE_OFFSET_FROM_DICT_WITH_CONFIG: ScaleOffsetCodecObject = { + "name": "scale_offset", + "configuration": {"offset": 3, "scale": 2}, +} + + @pytest.mark.parametrize( "case", [ - Expect(input={"name": "scale_offset"}, output=(0, 1), id="no-config"), - Expect( - input={"name": "scale_offset", "configuration": {"offset": 3, "scale": 2}}, - output=(3, 2), - id="with-config", - ), + Expect(input=_SCALE_OFFSET_FROM_DICT_NO_CONFIG, output=(0, 1), id="no-config"), + Expect(input=_SCALE_OFFSET_FROM_DICT_WITH_CONFIG, output=(3, 2), id="with-config"), ], ids=lambda c: c.id, ) -def test_from_dict(case: Expect[dict[str, Any], tuple[int | float, int | float]]) -> None: +def test_from_dict( + case: Expect[ScaleOffsetCodecObject, tuple[int | float, int | float]], +) -> None: """from_dict deserializes configuration with correct values and defaults.""" - codec = ScaleOffset.from_dict(case.input) + # cast: from_dict accepts the wider `dict[str, JSON]`. 
+ codec = ScaleOffset.from_dict(cast("dict[str, JSON]", case.input)) expected_offset, expected_scale = case.output assert codec.offset == expected_offset assert codec.scale == expected_scale diff --git a/tests/test_dtype/test_npy/test_bool.py b/tests/test_dtype/test_npy/test_bool.py index da30214b3b..ff48b26189 100644 --- a/tests/test_dtype/test_npy/test_bool.py +++ b/tests/test_dtype/test_npy/test_bool.py @@ -1,10 +1,15 @@ from __future__ import annotations +from typing import TYPE_CHECKING, ClassVar + import numpy as np from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.bool import Bool +if TYPE_CHECKING: + from zarr_metadata.v3.data_type.bool import BoolDataTypeName + class TestBool(BaseTestZDType): test_cls = Bool @@ -16,7 +21,7 @@ class TestBool(BaseTestZDType): np.dtype(np.uint16), ) valid_json_v2 = ({"name": "|b1", "object_codec_id": None},) - valid_json_v3 = ("bool",) + valid_json_v3: ClassVar[tuple[BoolDataTypeName, ...]] = ("bool",) invalid_json_v2 = ( "|b1", "bool", diff --git a/tests/test_dtype/test_npy/test_complex.py b/tests/test_dtype/test_npy/test_complex.py index b4ce42be58..dda60585f8 100644 --- a/tests/test_dtype/test_npy/test_complex.py +++ b/tests/test_dtype/test_npy/test_complex.py @@ -1,12 +1,17 @@ from __future__ import annotations import math +from typing import TYPE_CHECKING, ClassVar import numpy as np from tests.test_dtype.test_wrapper import BaseTestZDType from zarr.core.dtype.npy.complex import Complex64, Complex128 +if TYPE_CHECKING: + from zarr_metadata.v3.data_type.complex64 import Complex64DataTypeName + from zarr_metadata.v3.data_type.complex128 import Complex128DataTypeName + class _BaseTestFloat(BaseTestZDType): def scalar_equals(self, scalar1: object, scalar2: object) -> bool: @@ -27,7 +32,7 @@ class TestComplex64(_BaseTestFloat): {"name": ">c8", "object_codec_id": None}, {"name": "c16", "object_codec_id": None}, {"name": " bool: @@ -36,7 +43,7 @@ class TestFloat16(_BaseTestFloat): {"name": ">f2", 
"object_codec_id": None}, {"name": "f4", "object_codec_id": None}, {"name": "f8", "object_codec_id": None}, {"name": "i1", "int8", @@ -51,7 +63,7 @@ class TestInt16(BaseTestZDType): {"name": ">i2", "object_codec_id": None}, {"name": "i4", "object_codec_id": None}, {"name": "i8", "object_codec_id": None}, {"name": "u2", "object_codec_id": None}, {"name": "u4", "object_codec_id": None}, {"name": "u8", "object_codec_id": None}, {"name": "i4"], ["field2", ">f8"]], "object_codec_id": None}, {"name": [["field1", ">i8"], ["field2", ">i4"]], "object_codec_id": None}, ) - valid_json_v3 = ( + # `StructConfiguration.fields` is a `tuple[StructField, ...]` (a JSON array + # is a typed fixed-length container), and `Struct.to_json` emits a tuple to + # match, so the field entries are written as tuples here. + valid_json_v3: ClassVar[tuple[StructMetadata, ...]] = ( { "name": "struct", "configuration": { - "fields": [ + "fields": ( {"name": "field1", "data_type": "int32"}, {"name": "field2", "data_type": "float64"}, - ] + ) }, }, { "name": "struct", "configuration": { - "fields": [ + "fields": ( { "name": "field1", "data_type": { @@ -62,7 +68,7 @@ class TestStruct(BaseTestZDType): "configuration": {"length_bytes": 32}, }, }, - ] + ) }, }, ) diff --git a/tests/test_dtype/test_npy/test_time.py b/tests/test_dtype/test_npy/test_time.py index 67ba3bd130..14b6000999 100644 --- a/tests/test_dtype/test_npy/test_time.py +++ b/tests/test_dtype/test_npy/test_time.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from typing import get_args +from typing import TYPE_CHECKING, ClassVar, get_args import numpy as np import pytest @@ -10,6 +10,10 @@ from zarr.core.dtype.npy.common import DateTimeUnit from zarr.core.dtype.npy.time import DateTime64, TimeDelta64, datetime_from_int +if TYPE_CHECKING: + from zarr_metadata.v3.data_type.numpy_datetime64 import NumpyDatetime64 + from zarr_metadata.v3.data_type.numpy_timedelta64 import NumpyTimedelta64 + class _TestTimeBase(BaseTestZDType): 
def json_scalar_equals(self, scalar1: object, scalar2: object) -> bool: @@ -40,7 +44,7 @@ class TestDateTime64(_TestTimeBase): {"name": " None: """ Test that we can create an AsyncGroup from a dict diff --git a/tests/test_metadata/conftest.py b/tests/test_metadata/conftest.py index 24f2417fce..2a0765b9a4 100644 --- a/tests/test_metadata/conftest.py +++ b/tests/test_metadata/conftest.py @@ -5,7 +5,10 @@ from zarr.codecs.bytes import BytesCodec if TYPE_CHECKING: - from zarr.core.metadata.v3 import ArrayMetadataJSON_V3 + from zarr_metadata.v3.chunk_grid.regular import RegularChunkGridMetadata + from zarr_metadata.v3.chunk_key_encoding.default import DefaultChunkKeyEncodingMetadata + + from zarr.core.metadata import ArrayMetadataJSON_V3 def minimal_metadata_dict_v3( @@ -23,13 +26,29 @@ def minimal_metadata_dict_v3( **overrides Override any of the standard metadata fields. """ + # Bind chunk-grid and chunk-key-encoding subdicts to their precise + # zarr-metadata types so structural shape errors surface here rather + # than downstream. + chunk_grid: RegularChunkGridMetadata = { + "name": "regular", + "configuration": {"chunk_shape": (4, 4)}, + } + chunk_key_encoding: DefaultChunkKeyEncodingMetadata = { + "name": "default", + "configuration": {"separator": "/"}, + } d: ArrayMetadataJSON_V3 = { "zarr_format": 3, "node_type": "array", "shape": (4, 4), "data_type": "uint8", - "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (4, 4)}}, - "chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}}, + # mypy does not recognize structural subtyping between TypedDicts, + # so `RegularChunkGridMetadata` is not seen as assignable to the + # outer `str | NamedConfig` field type even though it is. The + # bound variables above are correct; suppress the spurious + # `typeddict-item` rejections here. 
+ "chunk_grid": chunk_grid, # type: ignore[typeddict-item] + "chunk_key_encoding": chunk_key_encoding, # type: ignore[typeddict-item] "fill_value": 0, "codecs": (BytesCodec().to_dict(),), # type: ignore[typeddict-item] "attributes": {}, diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 3596d2bcaa..724c471134 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -1,7 +1,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import numpy as np import pytest @@ -26,6 +26,10 @@ from zarr.storage import StorePath if TYPE_CHECKING: + from zarr_metadata.v2 import ConsolidatedMetadataV2, ZAttrsMetadata, ZGroupMetadata + from zarr_metadata.v3.array import ArrayMetadataV3Partial + from zarr_metadata.v3.group import GroupMetadataV3 + from zarr.abc.store import Store from zarr.core.common import JSON, ZarrFormat @@ -63,14 +67,22 @@ async def test_getitem_consolidated_empty_leaf_group( # # field on the leaf group nodes. if zarr_format == 2: - zmetadata: dict[str, JSON] = { + # Bind each value to a typed variable so the outer TypedDict's + # value-union (ZArrayMetadata | ZGroupMetadata | ZAttrsMetadata) + # resolves unambiguously to the correct arm — inline literals + # do not narrow because mypy can't structurally disambiguate + # `{}` between `ZAttrsMetadata` (Mapping[str, object]) and an + # empty TypedDict variant. 
+ empty_attrs: ZAttrsMetadata = {} + empty_group: ZGroupMetadata = {"zarr_format": 2} + zmetadata: ConsolidatedMetadataV2 = { "metadata": { - ".zattrs": {}, - ".zgroup": {"zarr_format": 2}, - "raw/.zattrs": {}, - "raw/.zgroup": {"zarr_format": 2}, - "raw/varm/.zattrs": {}, - "raw/varm/.zgroup": {"zarr_format": 2}, + ".zattrs": empty_attrs, + ".zgroup": empty_group, + "raw/.zattrs": empty_attrs, + "raw/.zgroup": empty_group, + "raw/varm/.zattrs": empty_attrs, + "raw/varm/.zgroup": empty_group, }, "zarr_consolidated_format": 1, } @@ -83,7 +95,15 @@ async def test_getitem_consolidated_empty_leaf_group( ) else: - zmetadata = { + # The v3 shape is a group metadata document with an inline + # `consolidated_metadata` extension field; not a + # `ConsolidatedMetadataV2` shape, so use a separately-named + # variable. + # Complete v3 group document with an inline `consolidated_metadata` + # extension field. mypy does not honor PEP 728 `extra_items=`, so + # the extension key needs a `typeddict-unknown-key` suppression even + # though `GroupMetadataV3` permits conforming extension fields. + zarr_json: GroupMetadataV3 = { # type: ignore[typeddict-unknown-key] "attributes": {}, "zarr_format": 3, "consolidated_metadata": { @@ -105,7 +125,7 @@ async def test_getitem_consolidated_empty_leaf_group( "node_type": "group", } await memory_store.set( - "zarr.json", cpu.Buffer.from_bytes(json.dumps(zmetadata).encode()) + "zarr.json", cpu.Buffer.from_bytes(json.dumps(zarr_json).encode()) ) group = await zarr.api.asynchronous.open_consolidated( @@ -143,7 +163,10 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: await consolidate_metadata(memory_store_with_hierarchy) group2 = await AsyncGroup.open(memory_store_with_hierarchy) - array_metadata: dict[str, JSON] = { + # Partial v3 array document: `shape` and `chunk_grid` are intentionally + # omitted and supplied per-array via spread below. 
`ArrayMetadataV3Partial` + # is the `total=False` form that types exactly this kind of fragment. + array_metadata: ArrayMetadataV3Partial = { "attributes": {}, "chunk_key_encoding": { "configuration": {"separator": "/"}, @@ -173,7 +196,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "configuration": {"chunk_shape": (1, 2, 3)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "lat": ArrayV3Metadata.from_dict( @@ -183,7 +206,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "configuration": {"chunk_shape": (1,)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "lon": ArrayV3Metadata.from_dict( @@ -193,7 +216,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "configuration": {"chunk_shape": (2,)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "time": ArrayV3Metadata.from_dict( @@ -203,7 +226,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: "configuration": {"chunk_shape": (3,)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "child": GroupMetadata( @@ -212,7 +235,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: metadata={ "array": ArrayV3Metadata.from_dict( { - **array_metadata, + **array_metadata, # type: ignore[dict-item] "attributes": {"key": "child"}, "shape": (4, 4), "chunk_grid": { @@ -234,7 +257,7 @@ async def test_consolidated(self, memory_store_with_hierarchy: Store) -> None: ), "array": ArrayV3Metadata.from_dict( { - **array_metadata, + **array_metadata, # type: ignore[dict-item] "attributes": {"key": "grandchild"}, "shape": (4, 4), "chunk_grid": { @@ -294,7 +317,9 @@ def test_consolidated_sync(self, memory_store: Store) -> None: zarr.api.synchronous.consolidate_metadata(memory_store) group2 = 
zarr.Group.open(memory_store) - array_metadata: dict[str, JSON] = { + # Partial v3 array document (see `test_consolidated`): `shape` + # and `chunk_grid` are supplied per-array via the spreads below. + array_metadata: ArrayMetadataV3Partial = { "attributes": {}, "chunk_key_encoding": { "configuration": {"separator": "/"}, @@ -324,7 +349,7 @@ def test_consolidated_sync(self, memory_store: Store) -> None: "configuration": {"chunk_shape": (1, 2, 3)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "lat": ArrayV3Metadata.from_dict( @@ -334,7 +359,7 @@ def test_consolidated_sync(self, memory_store: Store) -> None: "configuration": {"chunk_shape": (1,)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "lon": ArrayV3Metadata.from_dict( @@ -344,7 +369,7 @@ def test_consolidated_sync(self, memory_store: Store) -> None: "configuration": {"chunk_shape": (2,)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "time": ArrayV3Metadata.from_dict( @@ -354,7 +379,7 @@ def test_consolidated_sync(self, memory_store: Store) -> None: "configuration": {"chunk_shape": (3,)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), }, @@ -411,7 +436,9 @@ def test_consolidated_metadata_from_dict(self) -> None: ConsolidatedMetadata.from_dict(data) def test_flatten(self) -> None: - array_metadata: dict[str, Any] = { + # Partial v3 array document (see `test_consolidated`): `shape` + # and `chunk_grid` are supplied per-array via the spreads below.
+ array_metadata: ArrayMetadataV3Partial = { "attributes": {}, "chunk_key_encoding": { "configuration": {"separator": "/"}, @@ -436,7 +463,7 @@ def test_flatten(self) -> None: "configuration": {"chunk_shape": (1, 2, 3)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "lat": ArrayV3Metadata.from_dict( @@ -446,7 +473,7 @@ def test_flatten(self) -> None: "configuration": {"chunk_shape": (1,)}, "name": "regular", }, - **array_metadata, + **array_metadata, # type: ignore[dict-item] } ), "child": GroupMetadata( @@ -455,7 +482,7 @@ def test_flatten(self) -> None: metadata={ "array": ArrayV3Metadata.from_dict( { - **array_metadata, + **array_metadata, # type: ignore[dict-item] "attributes": {"key": "child"}, "shape": (4, 4), "chunk_grid": { @@ -470,7 +497,7 @@ def test_flatten(self) -> None: metadata={ "array": ArrayV3Metadata.from_dict( { - **array_metadata, + **array_metadata, # type: ignore[dict-item] "attributes": {"key": "grandchild"}, "shape": (4, 4), "chunk_grid": { diff --git a/tests/test_metadata/test_v2.py b/tests/test_metadata/test_v2.py index d1a1ca00b4..3d4f9168fa 100644 --- a/tests/test_metadata/test_v2.py +++ b/tests/test_metadata/test_v2.py @@ -21,6 +21,8 @@ from pathlib import Path from typing import Any + from zarr_metadata import ConsolidatedMetadataV2 + from zarr.abc.codec import Codec from zarr.core.common import JSON @@ -107,7 +109,7 @@ class TestConsolidated: async def v2_consolidated_metadata( self, memory_store: zarr.storage.MemoryStore ) -> zarr.storage.MemoryStore: - zmetadata: dict[str, JSON] = { + zmetadata: ConsolidatedMetadataV2 = { "metadata": { ".zattrs": { "Conventions": "COARDS", @@ -170,19 +172,19 @@ async def v2_consolidated_metadata( await store.set(".zmetadata", cpu.Buffer.from_bytes(json.dumps(zmetadata).encode())) await store.set( "air/.zarray", - cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["air/.zarray"]).encode()), # type: ignore[index, call-overload] + 
cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["air/.zarray"]).encode()), ) await store.set( "air/.zattrs", - cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["air/.zattrs"]).encode()), # type: ignore[index, call-overload] + cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["air/.zattrs"]).encode()), ) await store.set( "time/.zarray", - cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["time/.zarray"]).encode()), # type: ignore[index, call-overload] + cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["time/.zarray"]).encode()), ) await store.set( "time/.zattrs", - cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["time/.zattrs"]).encode()), # type: ignore[index, call-overload] + cpu.Buffer.from_bytes(json.dumps(zmetadata["metadata"]["time/.zattrs"]).encode()), ) # and a nested group for fun @@ -195,13 +197,13 @@ async def v2_consolidated_metadata( await store.set( "nested/array/.zarray", cpu.Buffer.from_bytes( - json.dumps(zmetadata["metadata"]["nested/array/.zarray"]).encode() # type: ignore[index, call-overload] + json.dumps(zmetadata["metadata"]["nested/array/.zarray"]).encode() ), ) await store.set( "nested/array/.zattrs", cpu.Buffer.from_bytes( - json.dumps(zmetadata["metadata"]["nested/array/.zattrs"]).encode() # type: ignore[index, call-overload] + json.dumps(zmetadata["metadata"]["nested/array/.zattrs"]).encode() ), ) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index d1e156e500..7ba309995c 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -34,6 +34,9 @@ if TYPE_CHECKING: from typing import Any + from zarr_metadata import GroupMetadataV3 + from zarr_metadata.v3.codec.bytes import BytesCodecObject + # --------------------------------------------------------------------------- # Parsing helpers @@ -178,8 +181,13 @@ def test_array_metadata_keys_matches_typeddict() -> None: # --------------------------------------------------------------------------- # Codecs 
after evolution for single-byte (uint8) and multi-byte (float64) types. +# The uint8 case omits `configuration`; floor-pinned zarr-metadata 0.1.1 +# marks that field as required, so the annotation is dropped until the +# relaxed shape ships. _UINT8_CODECS = ({"name": "bytes"},) -_FLOAT64_CODECS = ({"name": "bytes", "configuration": {"endian": "little"}},) +_FLOAT64_CODECS: tuple[BytesCodecObject, ...] = ( + {"name": "bytes", "configuration": {"endian": "little"}}, +) @pytest.mark.parametrize( @@ -448,7 +456,11 @@ def test_group_metadata_to_dict_consolidated(attributes: dict[str, Any] | None) ): group = consolidate_metadata(store) - assert group.metadata.to_dict() == { + # `consolidated_metadata` is an `ExtensionFieldV3` (extra key allowed + # on `GroupMetadataV3` via PEP 728 extra_items=ExtensionFieldV3). mypy + # doesn't honor PEP 728 yet and reports `typeddict-unknown-key`; the + # annotation is correct, so the error code is ignored at the literal. + expected: GroupMetadataV3 = { # type: ignore[typeddict-unknown-key] "zarr_format": 3, "node_type": "group", "attributes": attributes or {}, @@ -469,3 +481,4 @@ def test_group_metadata_to_dict_consolidated(attributes: dict[str, Any] | None) }, }, } + assert group.metadata.to_dict() == expected diff --git a/uv.lock b/uv.lock index adc71bae62..61669dd3cf 100644 --- a/uv.lock +++ b/uv.lock @@ -6,6 +6,12 @@ resolution-markers = [ "python_full_version < '3.15'", ] +[manifest] +members = [ + "zarr", + "zarr-metadata", +] + [[package]] name = "aiobotocore" version = "3.7.0" @@ -3952,6 +3958,7 @@ dependencies = [ { name = "numpy" }, { name = "packaging" }, { name = "typing-extensions" }, + { name = "zarr-metadata" }, ] [package.optional-dependencies] @@ -4077,6 +4084,7 @@ requires-dist = [ { name = "typer", marker = "extra == 'cli'" }, { name = "typing-extensions", specifier = ">=4.14" }, { name = "universal-pathlib", marker = "extra == 'optional'" }, + { name = "zarr-metadata", editable = "packages/zarr-metadata" }, ] 
provides-extras = ["cast-value-rs", "cli", "gpu", "optional", "remote"] @@ -4168,3 +4176,25 @@ test = [ { name = "tomlkit", specifier = "==0.15.0" }, { name = "uv", specifier = "==0.11.20" }, ] + +[[package]] +name = "zarr-metadata" +source = { editable = "packages/zarr-metadata" } +dependencies = [ + { name = "typing-extensions" }, +] + +[package.dev-dependencies] +test = [ + { name = "pydantic" }, + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [{ name = "typing-extensions", specifier = ">=4.13" }] + +[package.metadata.requires-dev] +test = [ + { name = "pydantic", specifier = ">=2" }, + { name = "pytest" }, +]