Skip to content

Commit b703e08

Browse files
committed
Run tests using Zarr Python v3
1 parent 1846566 commit b703e08

File tree

8 files changed

+71
-25
lines changed

8 files changed

+71
-25
lines changed

.github/workflows/build.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,28 @@ jobs:
4141
uses: codecov/codecov-action@v3
4242
with:
4343
token: ${{ secrets.CODECOV_TOKEN }}
44+
45+
test-zarr-version:
46+
name: Test Zarr Python v3
47+
# Scheduled runs only on the origin org
48+
if: (github.event_name == 'schedule' && github.repository_owner == 'sgkit-dev') || (github.event_name != 'schedule')
49+
runs-on: ubuntu-latest
50+
strategy:
51+
matrix:
52+
zarr: ["==3.0.0b1"]
53+
steps:
54+
- uses: actions/checkout@v4
55+
- uses: actions/setup-python@v5
56+
with:
57+
python-version: '3.11'
58+
- name: Install dependencies
59+
run: |
60+
python -m pip install --upgrade pip
61+
pip install -r requirements.txt -r requirements-dev.txt
62+
- name: Install zarr${{ matrix.zarr }}
63+
run: |
64+
python -m pip install --pre 'zarr${{ matrix.zarr }}'
65+
python -m pip uninstall -y bio2zarr # TODO: remove when bio2zarr supports Zarr Python 3
66+
- name: Run tests
67+
run: |
68+
pytest

sgkit/io/bgen/bgen_reader.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import dask
1919
import dask.array as da
2020
import dask.dataframe as dd
21+
import numcodecs
2122
import numpy as np
2223
import pandas as pd
2324
import xarray as xr
@@ -348,7 +349,7 @@ def encode_variables(
348349
ds: Dataset,
349350
chunk_length: int,
350351
chunk_width: int,
351-
compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
352+
compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
352353
probability_dtype: Optional[Any] = "uint8",
353354
) -> Dict[Hashable, Dict[str, Any]]:
354355
encoding = {}
@@ -424,7 +425,7 @@ def rechunk_bgen(
424425
*,
425426
chunk_length: int = 10_000,
426427
chunk_width: int = 1_000,
427-
compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
428+
compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
428429
probability_dtype: Optional[DType] = "uint8",
429430
max_mem: str = "4GB",
430431
pack: bool = True,
@@ -538,7 +539,7 @@ def bgen_to_zarr(
538539
chunk_length: int = 10_000,
539540
chunk_width: int = 1_000,
540541
temp_chunk_length: int = 100,
541-
compressor: Optional[Any] = zarr.Blosc(cname="zstd", clevel=7, shuffle=2),
542+
compressor: Optional[Any] = numcodecs.Blosc(cname="zstd", clevel=7, shuffle=2),
542543
probability_dtype: Optional[DType] = "uint8",
543544
max_mem: str = "4GB",
544545
pack: bool = True,

sgkit/io/dataset.py

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
1-
from pathlib import Path
21
from typing import Any, Dict, MutableMapping, Optional, Union
32

4-
import fsspec
53
import numcodecs
64
import xarray as xr
75
from xarray import Dataset
86

97
from sgkit.typing import PathType
8+
from sgkit.utils import has_keyword
109

1110

1211
def save_dataset(
1312
ds: Dataset,
1413
store: Union[PathType, MutableMapping[str, bytes]],
1514
storage_options: Optional[Dict[str, str]] = None,
1615
auto_rechunk: Optional[bool] = None,
16+
zarr_format: int = 2,
1717
**kwargs: Any,
1818
) -> None:
1919
"""Save a dataset to Zarr storage.
@@ -35,11 +35,6 @@ def save_dataset(
3535
kwargs
3636
Additional arguments to pass to :meth:`xarray.Dataset.to_zarr`.
3737
"""
38-
if isinstance(store, str):
39-
storage_options = storage_options or {}
40-
store = fsspec.get_mapper(store, **storage_options)
41-
elif isinstance(store, Path):
42-
store = str(store)
4338
if auto_rechunk is None:
4439
auto_rechunk = False
4540
for v in ds:
@@ -71,7 +66,9 @@ def save_dataset(
7166

7267
# Catch unequal chunking errors to provide a more helpful error message
7368
try:
74-
ds.to_zarr(store, **kwargs)
69+
if has_keyword(ds.to_zarr, "zarr_format"): # from xarray v2024.10.0
70+
kwargs["zarr_format"] = zarr_format
71+
ds.to_zarr(store, storage_options=storage_options, **kwargs)
7572
except ValueError as e:
7673
if "Zarr requires uniform chunk sizes" in str(
7774
e
@@ -109,12 +106,7 @@ def load_dataset(
109106
Dataset
110107
The dataset loaded from the Zarr store or file system.
111108
"""
112-
if isinstance(store, str):
113-
storage_options = storage_options or {}
114-
store = fsspec.get_mapper(store, **storage_options)
115-
elif isinstance(store, Path):
116-
store = str(store)
117-
ds: Dataset = xr.open_zarr(store, concat_characters=False, **kwargs) # type: ignore[no-untyped-call]
109+
ds: Dataset = xr.open_zarr(store, storage_options=storage_options, concat_characters=False, **kwargs) # type: ignore[no-untyped-call]
118110
for v in ds:
119111
# Workaround for https://github.com/pydata/xarray/issues/4386
120112
if v.endswith("_mask"): # type: ignore

sgkit/tests/io/bgen/test_bgen_reader.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
import numpy.testing as npt
66
import pytest
77
import xarray as xr
8+
import zarr
9+
from packaging.version import Version
10+
11+
pytestmark = pytest.mark.skipif(
12+
Version(zarr.__version__).major >= 3, reason="Rechunking fails for Zarr Python 3"
13+
)
814

915
from sgkit.io.bgen.bgen_reader import (
1016
GT_DATA_VARS,

sgkit/tests/io/test_dataset.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import pytest
44
import xarray as xr
5+
import zarr
6+
from packaging.version import Version
57
from xarray import Dataset
68

79
from sgkit import load_dataset, save_dataset
@@ -54,7 +56,10 @@ def test_save_unequal_chunks_error():
5456
n_variant=10, n_sample=10, n_ploidy=10, n_allele=10, n_contig=10
5557
)
5658
# Normal zarr errors shouldn't be caught
57-
with pytest.raises(ValueError, match="path '' contains an array"):
59+
with pytest.raises(
60+
(FileExistsError, ValueError),
61+
match="(path '' contains an array|Store already exists)",
62+
):
5863
save_dataset(ds, {".zarray": ""})
5964

6065
# Make the dataset have unequal chunk sizes across all dimensions
@@ -74,6 +79,9 @@ def test_save_unequal_chunks_error():
7479
save_dataset(ds, {})
7580

7681

82+
@pytest.mark.skipif(
83+
Version(zarr.__version__).major >= 3, reason="Fails for Zarr Python 3"
84+
)
7785
def test_save_auto_rechunk():
7886
# Make all dimensions the same size for ease of testing
7987
ds = simulate_genotype_call_dataset(

sgkit/tests/test_association.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,14 @@
66
import pandas as pd
77
import pytest
88
import xarray as xr
9-
import zarr
109
from pandas import DataFrame
1110
from xarray import Dataset
1211

12+
try:
13+
from zarr.storage import ZipStore # v3
14+
except ImportError: # pragma: no cover
15+
from zarr import ZipStore
16+
1317
import sgkit.distarray as da
1418
from sgkit.stats.association import (
1519
gwas_linear_regression,
@@ -313,12 +317,10 @@ def test_regenie_loco_regression(ndarray_type: str, covariate: bool) -> None:
313317

314318
for ds_name in datasets:
315319
# Load simulated data
316-
genotypes_store = zarr.ZipStore(
320+
genotypes_store = ZipStore(
317321
str(ds_dir / ds_name / "genotypes.zarr.zip"), mode="r"
318322
)
319-
glow_store = zarr.ZipStore(
320-
str(ds_dir / ds_name / glow_offsets_filename), mode="r"
321-
)
323+
glow_store = ZipStore(str(ds_dir / ds_name / glow_offsets_filename), mode="r")
322324

323325
ds = xr.open_zarr(genotypes_store, consolidated=False)
324326
glow_loco_predictions = xr.open_zarr(glow_store, consolidated=False)

sgkit/tests/test_regenie.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
import pytest
1010
import xarray as xr
1111
import yaml
12-
import zarr
1312
from dask.array import Array
1413
from hypothesis import given, settings
1514
from hypothesis import strategies as st
@@ -18,6 +17,11 @@
1817
from pandas import DataFrame
1918
from xarray import Dataset
2019

20+
try:
21+
from zarr.storage import ZipStore # v3
22+
except ImportError: # pragma: no cover
23+
from zarr import ZipStore
24+
2125
from sgkit.stats.association import LinearRegressionResult, linear_regression
2226
from sgkit.stats.regenie import (
2327
index_array_blocks,
@@ -258,7 +262,7 @@ def check_simulation_result(
258262
result_dir = datadir / "result" / run["name"]
259263

260264
# Load simulated data
261-
with zarr.ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
265+
with ZipStore(str(dataset_dir / "genotypes.zarr.zip"), mode="r") as store:
262266
ds = xr.open_zarr(store, consolidated=False)
263267
df_covariate = load_covariates(dataset_dir)
264268
df_trait = load_traits(dataset_dir)

sgkit/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import inspect
12
import warnings
23
from itertools import product
34
from typing import Any, Callable, Hashable, List, Mapping, Optional, Set, Tuple, Union
@@ -425,3 +426,10 @@ def smallest_numpy_int_dtype(value: int) -> Optional[DType]:
425426
if np.iinfo(dtype).min <= value <= np.iinfo(dtype).max:
426427
return dtype
427428
raise OverflowError(f"Value {value} cannot be stored in np.int64")
429+
430+
431+
def has_keyword(func: Callable[..., Any], keyword: str) -> bool:
    """Check whether a callable declares a parameter with the given name.

    This is used as a feature probe, so that an optional keyword argument
    is only passed to a third-party function when the installed version
    of that function actually declares it.

    Parameters
    ----------
    func
        The callable whose signature is inspected.
    keyword
        The parameter name to look for.

    Returns
    -------
    True if ``keyword`` is the name of a parameter in the signature of
    ``func``. False if it is not, or if no signature can be obtained for
    ``func``. A catch-all ``**kwargs`` parameter does not count as
    declaring ``keyword``; only an exact name match does.
    """
    try:
        parameters = inspect.signature(func).parameters
    except (TypeError, ValueError):  # pragma: no cover
        # inspect.signature raises TypeError for unsupported objects and
        # ValueError when no signature can be provided (e.g. some builtins).
        # Either way the keyword cannot be confirmed, so report it as absent.
        return False
    return keyword in parameters

0 commit comments

Comments
 (0)