Skip to content

Commit de8f5b1

Browse files
committed
codecs: reject unknown configuration keys unless they are marked with must_understand=False
1 parent 0c82437 commit de8f5b1

File tree

9 files changed

+121
-13
lines changed

9 files changed

+121
-13
lines changed

src/zarr/codecs/blosc.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dataclasses import dataclass, replace
55
from enum import Enum
66
from functools import cached_property
7-
from typing import TYPE_CHECKING
7+
from typing import TYPE_CHECKING, Any
88

99
import numcodecs
1010
from numcodecs.blosc import Blosc
@@ -101,7 +101,14 @@ def __init__(
101101
clevel: int = 5,
102102
shuffle: BloscShuffle | str | None = None,
103103
blocksize: int = 0,
104+
**kwargs: dict[str, Any],
104105
) -> None:
106+
if not all(
107+
isinstance(value, dict) and value.get("must_understand") is False
108+
for value in kwargs.values()
109+
):
110+
raise ValueError(f"The `blosc` codec got an unexpected configuration: {kwargs}")
111+
105112
typesize_parsed = parse_typesize(typesize) if typesize is not None else None
106113
cname_parsed = parse_enum(cname, BloscCname)
107114
clevel_parsed = parse_clevel(clevel)

src/zarr/codecs/bytes.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import sys
44
from dataclasses import dataclass, replace
55
from enum import Enum
6-
from typing import TYPE_CHECKING
6+
from typing import TYPE_CHECKING, Any
77

88
import numpy as np
99

@@ -36,9 +36,18 @@ class BytesCodec(ArrayBytesCodec):
3636

3737
endian: Endian | None
3838

39-
def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None:
39+
def __init__(
    self,
    *,
    endian: Endian | str | None = default_system_endian,
    **kwargs: dict[str, Any],
) -> None:
    """Create a ``bytes`` codec.

    Parameters
    ----------
    endian
        Byte order; ``None`` is stored as-is, anything else is normalised
        through ``parse_enum(endian, Endian)``.
    **kwargs
        Extra configuration keys. Each one is tolerated only when its value is
        a dict whose ``"must_understand"`` entry is exactly ``False``.

    Raises
    ------
    ValueError
        If any extra configuration key is not marked ``must_understand=False``.
    """
    # Reject the whole configuration as soon as one extra key is not explicitly optional.
    for extra in kwargs.values():
        optional = isinstance(extra, dict) and extra.get("must_understand") is False
        if not optional:
            raise ValueError(f"The `bytes` codec got an unexpected configuration: {kwargs}")

    if endian is None:
        endian_parsed = None
    else:
        endian_parsed = parse_enum(endian, Endian)

    # Assigned via object.__setattr__ rather than a plain attribute assignment.
    object.__setattr__(self, "endian", endian_parsed)
4352

4453
@classmethod

src/zarr/codecs/crc32c_.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,14 @@ class Crc32cCodec(BytesBytesCodec):
2424

2525
@classmethod
def from_dict(cls, data: dict[str, JSON]) -> Self:
    """Build the ``crc32c`` codec from its metadata dictionary.

    ``data`` is handed to ``parse_named_configuration`` with the expected name
    ``"crc32c"`` and ``require_configuration=False``. The codec takes no
    settings of its own, so any configuration key present must be a dict whose
    ``"must_understand"`` entry is exactly ``False``.

    Raises
    ------
    ValueError
        If the configuration holds a key not marked ``must_understand=False``.
    """
    _, configuration = parse_named_configuration(data, "crc32c", require_configuration=False)

    # Nothing to validate when the configuration is absent or empty.
    if not configuration:
        return cls()

    for entry in configuration.values():
        optional = isinstance(entry, dict) and entry.get("must_understand") is False
        if not optional:
            raise ValueError(f"The `crc32c` codec got an unexpected configuration: {data}")

    return cls()
2936

3037
def to_dict(self) -> dict[str, JSON]:

src/zarr/codecs/gzip.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import asyncio
44
from dataclasses import dataclass
5-
from typing import TYPE_CHECKING
5+
from typing import TYPE_CHECKING, Any
66

77
from numcodecs.gzip import GZip
88

@@ -34,7 +34,13 @@ class GzipCodec(BytesBytesCodec):
3434

3535
level: int = 5
3636

37-
def __init__(self, *, level: int = 5) -> None:
37+
def __init__(self, *, level: int = 5, **kwargs: dict[str, Any]) -> None:
38+
if not all(
39+
isinstance(value, dict) and value.get("must_understand") is False
40+
for value in kwargs.values()
41+
):
42+
raise ValueError(f"The `gzip` codec got an unexpected configuration: {kwargs}")
43+
3844
level_parsed = parse_gzip_level(level)
3945

4046
object.__setattr__(self, "level", level_parsed)

src/zarr/codecs/sharding.py

+7
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,14 @@ def __init__(
343343
codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(),),
344344
index_codecs: Iterable[Codec | dict[str, JSON]] = (BytesCodec(), Crc32cCodec()),
345345
index_location: ShardingCodecIndexLocation | str = ShardingCodecIndexLocation.end,
346+
**kwargs: dict[str, Any],
346347
) -> None:
348+
if not all(
349+
isinstance(value, dict) and value.get("must_understand") is False
350+
for value in kwargs.values()
351+
):
352+
raise ValueError(f"The `sharding` codec got an unexpected configuration: {kwargs}")
353+
347354
chunk_shape_parsed = parse_shapelike(chunk_shape)
348355
codecs_parsed = parse_codecs(codecs)
349356
index_codecs_parsed = parse_codecs(index_codecs)

src/zarr/codecs/transpose.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,13 @@ class TransposeCodec(ArrayArrayCodec):
3232

3333
order: tuple[int, ...]
3434

35-
def __init__(self, *, order: ChunkCoordsLike) -> None:
35+
def __init__(self, *, order: ChunkCoordsLike, **kwargs: dict[str, Any]) -> None:
36+
if not all(
37+
isinstance(value, dict) and value.get("must_understand") is False
38+
for value in kwargs.values()
39+
):
40+
raise ValueError(f"The `transpose` codec got an unexpected configuration: {kwargs}")
41+
3642
order_parsed = parse_transpose_order(order)
3743

3844
object.__setattr__(self, "order", order_parsed)

src/zarr/codecs/vlen_utf8.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
from dataclasses import dataclass
4-
from typing import TYPE_CHECKING
4+
from typing import TYPE_CHECKING, Any
55
from warnings import warn
66

77
import numpy as np
@@ -26,7 +26,13 @@
2626

2727
@dataclass(frozen=True)
2828
class VLenUTF8Codec(ArrayBytesCodec):
29-
def __init__(self) -> None:
29+
def __init__(self, **kwargs: dict[str, Any]) -> None:
30+
if not all(
31+
isinstance(value, dict) and value.get("must_understand") is False
32+
for value in kwargs.values()
33+
):
34+
raise ValueError(f"The `vlen-utf8` codec got an unexpected configuration: {kwargs}")
35+
3036
warn(
3137
"The codec `vlen-utf8` is currently not part in the Zarr format 3 specification. It "
3238
"may not be supported by other zarr implementations and may change in the future.",
@@ -81,7 +87,13 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -
8187

8288
@dataclass(frozen=True)
8389
class VLenBytesCodec(ArrayBytesCodec):
84-
def __init__(self) -> None:
90+
def __init__(self, **kwargs: dict[str, Any]) -> None:
91+
if not all(
92+
isinstance(value, dict) and value.get("must_understand") is False
93+
for value in kwargs.values()
94+
):
95+
raise ValueError(f"The `vlen-bytes` codec got an unexpected configuration: {kwargs}")
96+
8597
warn(
8698
"The codec `vlen-bytes` is currently not part in the Zarr format 3 specification. It "
8799
"may not be supported by other zarr implementations and may change in the future.",

src/zarr/codecs/zstd.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import asyncio
44
from dataclasses import dataclass
55
from functools import cached_property
6-
from typing import TYPE_CHECKING
6+
from typing import TYPE_CHECKING, Any
77

88
import numcodecs
99
from numcodecs.zstd import Zstd
@@ -42,7 +42,13 @@ class ZstdCodec(BytesBytesCodec):
4242
level: int = 0
4343
checksum: bool = False
4444

45-
def __init__(self, *, level: int = 0, checksum: bool = False) -> None:
45+
def __init__(self, *, level: int = 0, checksum: bool = False, **kwargs: dict[str, Any]) -> None:
46+
if not all(
47+
isinstance(value, dict) and value.get("must_understand") is False
48+
for value in kwargs.values()
49+
):
50+
raise ValueError(f"The `zstd` codec got an unexpected configuration: {kwargs}")
51+
4652
# numcodecs 0.13.0 introduces the checksum attribute for the zstd codec
4753
_numcodecs_version = Version(numcodecs.__version__)
4854
if _numcodecs_version < Version("0.13.0"):

tests/test_metadata/test_v3.py

+48
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,54 @@ def test_fail_on_invalid_key() -> None:
448448
default_metadata_dict(codecs=[{"name": "bytes", "unknown": {}, "configuration": {}}])
449449
)
450450

451+
# accepts invalid key with must_understand=false
452+
ArrayV3Metadata.from_dict(
453+
default_metadata_dict(
454+
codecs=[{"name": "bytes", "configuration": {}, "unknown": {"must_understand": False}}]
455+
)
456+
)
457+
458+
459+
@pytest.mark.parametrize(
    "codecs",
    [
        [{"name": "bytes", "configuration": {}}],
        [{"name": "transpose", "configuration": {"order": (0,)}}, "bytes"],
        [
            "bytes",
            {
                "name": "blosc",
                "configuration": {
                    "cname": "lz4",
                    "clevel": 1,
                    "shuffle": "shuffle",
                    "typesize": 4,
                    "blocksize": 0,
                },
            },
        ],
        ["bytes", {"name": "gzip", "configuration": {"level": 1}}],
        ["bytes", {"name": "zstd", "configuration": {"level": 1}}],
        ["bytes", {"name": "crc32c", "configuration": {}}],
        [{"name": "sharding_indexed", "configuration": {"chunk_shape": (1,)}}],
        [{"name": "vlen-utf8", "configuration": {}}],
        [{"name": "vlen-bytes", "configuration": {}}],
    ],
)
def test_codecs_fail_on_invalid_key(codecs) -> None:
    """Check how each codec reacts to an unknown configuration key.

    For every parametrized codec pipeline the test checks three things:

    1. the pipeline as given is accepted by ``ArrayV3Metadata.from_dict``;
    2. adding ``"unknown": "value"`` to each dict-form codec's configuration
       raises ``ValueError``;
    3. adding ``"unknown": {"must_understand": False}`` instead is accepted.
    """

    def with_unknown(value):
        # Build fresh codec dicts instead of mutating the parametrized ones:
        # pytest passes parameter values without copying them, so in-place
        # edits would leak into any later use of the same parameter objects.
        # Codecs given in string shorthand have no configuration to extend.
        return [
            codec
            if isinstance(codec, str)
            else {**codec, "configuration": {**codec["configuration"], "unknown": value}}
            for codec in codecs
        ]

    # The unmodified configuration must be accepted.
    ArrayV3Metadata.from_dict(default_metadata_dict(codecs=codecs))

    # An unknown key with an arbitrary value must be rejected.
    rejected = with_unknown("value")
    with pytest.raises(ValueError):
        ArrayV3Metadata.from_dict(default_metadata_dict(codecs=rejected))

    # accepts invalid key with must_understand=false
    tolerated = with_unknown({"must_understand": False})
    ArrayV3Metadata.from_dict(default_metadata_dict(codecs=tolerated))
498+
451499

452500
def test_specify_codecs_with_strings() -> None:
453501
expected = ArrayV3Metadata.from_dict(

0 commit comments

Comments
 (0)