Skip to content

Commit c963f5e

Browse files
committed
Don't allow chunking in ploidy dimension
1 parent 9764d6c commit c963f5e

File tree

3 files changed

+19 -4 lines changed

sgkit/stats/aggregation.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ def count_call_alleles(
     variables.validate(ds, {call_genotype: variables.call_genotype_spec})
     n_alleles = ds.sizes["alleles"]
     G = da.asarray(ds[call_genotype])
+    if G.numblocks[2] > 1:
+        raise ValueError(
+            f"Variable {call_genotype} must have only a single chunk in the ploidy dimension. "
+            "Consider rechunking to change the size of chunks."
+        )
     shape = (G.chunks[0], G.chunks[1], n_alleles)
     # use numpy array to avoid dask task dependencies between chunks
     N = np.empty(n_alleles, dtype=np.uint8)

sgkit/tests/test_aggregation.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,10 @@ def test_count_variant_alleles__chunked(using):
     calls = rs.randint(0, 1, size=(50, 10, 2))
     ds = get_dataset(calls)
     ac1 = count_variant_alleles(ds, using=using)
-    # Coerce from numpy to multiple chunks in all dimensions
-    ds["call_genotype"] = ds["call_genotype"].chunk(chunks=(5, 5, 1))
+    # Coerce from numpy to multiple chunks in all non-core dimensions
+    ds["call_genotype"] = ds["call_genotype"].chunk(
+        chunks={"variants": 5, "samples": 5}
+    )
     ac2 = count_variant_alleles(ds, using=using)
     assert isinstance(ac2["variant_allele_count"].data, da.Array)
     xr.testing.assert_equal(ac1, ac2)
@@ -273,6 +275,14 @@ def test_count_call_alleles__chunked():
     assert hasattr(ac2["call_allele_count"].data, "chunks")
     xr.testing.assert_equal(ac1, ac2)
 
+    # Multiple chunks in core dimension should fail
+    ds["call_genotype"] = ds["call_genotype"].chunk(chunks={"ploidy": 1})
+    with pytest.raises(
+        ValueError,
+        match="Variable call_genotype must have only a single chunk in the ploidy dimension",
+    ):
+        count_call_alleles(ds)
+
 
 def test_count_cohort_alleles__multi_variant_multi_sample():
     ds = get_dataset(

sgkit/tests/test_popgen.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -533,7 +533,7 @@ def test_Garud_h__raise_on_no_windows():
 
 
 @pytest.mark.filterwarnings("ignore::RuntimeWarning")
-@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (2, 2))])
+@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (4))])
 def test_observed_heterozygosity(chunks):
     ds = simulate_genotype_call_dataset(
         n_variant=4,
@@ -599,7 +599,7 @@ def test_observed_heterozygosity(chunks):
 
 
 @pytest.mark.filterwarnings("ignore::RuntimeWarning")
-@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (2, 2))])
+@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (4,))])
 @pytest.mark.parametrize(
     "cohorts,expectation",
     [

0 commit comments

Comments
 (0)