Skip to content

Commit 310a5a8

Browse files
committed
feat(clickhouse): enable support for working window functions
1 parent c1b72ba commit 310a5a8

File tree

3 files changed

+116
-37
lines changed

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
version: "3.4"
22
services:
33
clickhouse:
4-
image: clickhouse/clickhouse-server:22-alpine
4+
image: clickhouse/clickhouse-server:22.6.6.16-alpine
55
ports:
66
- 8123:8123
77
- 9000:9000

ibis/backends/clickhouse/registry.py

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import ibis.expr.operations as ops
77
import ibis.expr.types as ir
88
import ibis.util as util
9-
from ibis.backends.base.sql.registry import binary_infix
9+
from ibis.backends.base.sql.registry import binary_infix, window
1010
from ibis.backends.clickhouse.datatypes import serialize
1111
from ibis.backends.clickhouse.identifiers import quote_identifier
1212

@@ -685,6 +685,13 @@ def _struct_field(translator, expr):
685685
return f"{translator.translate(op.arg)}.`{op.field}`"
686686

687687

688+
def _nth_value(translator, expr):
689+
op = expr.op()
690+
arg = translator.translate(op.arg)
691+
nth = translator.translate(op.nth)
692+
return f"nth_value({arg}, ({nth}) + 1)"
693+
694+
688695
# TODO: clickhouse uses different string functions
689696
# for ascii and utf-8 encodings,
690697

@@ -848,6 +855,17 @@ def _struct_field(translator, expr):
848855
ops.Clip: _clip,
849856
ops.StructField: _struct_field,
850857
ops.StructColumn: _struct_column,
858+
ops.Window: window.window,
859+
ops.RowNumber: lambda *args: 'row_number()',
860+
ops.DenseRank: lambda *args: 'dense_rank()',
861+
ops.MinRank: lambda *args: 'rank()',
862+
ops.Lag: window.shift_like('lagInFrame'),
863+
ops.Lead: window.shift_like('leadInFrame'),
864+
ops.FirstValue: _unary('first_value'),
865+
ops.LastValue: _unary('last_value'),
866+
ops.NthValue: _nth_value,
867+
ops.Window: window.window,
868+
ops.NTile: window.ntile,
851869
}
852870

853871

@@ -896,28 +914,13 @@ def _day_of_week_index(translator, expr):
896914

897915

898916
_unsupported_ops_list = [
899-
ops.Window,
900917
ops.DecimalPrecision,
901918
ops.DecimalScale,
902919
ops.BaseConvert,
903-
ops.CumeDist,
904-
ops.CumulativeSum,
905-
ops.CumulativeMin,
906-
ops.CumulativeMax,
907-
ops.CumulativeMean,
908-
ops.CumulativeAny,
909-
ops.CumulativeAll,
910920
ops.IdenticalTo,
911-
ops.RowNumber,
912-
ops.DenseRank,
913-
ops.MinRank,
921+
ops.CumeDist,
914922
ops.PercentRank,
915-
ops.FirstValue,
916-
ops.LastValue,
917-
ops.NthValue,
918-
ops.Lag,
919-
ops.Lead,
920-
ops.NTile,
923+
ops.ReductionVectorizedUDF,
921924
]
922925
_unsupported_ops = {k: _raise_error for k in _unsupported_ops_list}
923926

ibis/backends/tests/test_window.py

Lines changed: 94 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,28 @@ def calc_zscore(s):
3030
lambda t, win: t.float_col.lead().over(win),
3131
lambda t: t.float_col.shift(-1),
3232
id='lead',
33+
marks=pytest.mark.broken(
34+
["clickhouse"],
35+
reason="upstream is broken; returns all nulls",
36+
),
3337
),
3438
param(
3539
lambda t, win: t.id.rank().over(win),
3640
lambda t: t.id.rank(method='min').astype('int64') - 1,
3741
id='rank',
42+
marks=pytest.mark.broken(
43+
["clickhouse"],
44+
reason="upstream is broken",
45+
),
3846
),
3947
param(
4048
lambda t, win: t.id.dense_rank().over(win),
4149
lambda t: t.id.rank(method='dense').astype('int64') - 1,
4250
id='dense_rank',
51+
marks=pytest.mark.broken(
52+
["clickhouse"],
53+
reason="upstream is broken",
54+
),
4355
),
4456
param(
4557
lambda t, win: t.id.percent_rank().over(win),
@@ -52,12 +64,13 @@ def calc_zscore(s):
5264
)
5365
).reset_index(drop=True, level=[0]),
5466
id='percent_rank',
67+
marks=pytest.mark.notyet(["clickhouse"]),
5568
),
5669
param(
5770
lambda t, win: t.id.cume_dist().over(win),
5871
lambda t: t.id.rank(method='min') / t.id.transform(len),
5972
id='cume_dist',
60-
marks=pytest.mark.notimpl(["pyspark"]),
73+
marks=pytest.mark.notimpl(["clickhouse", "pyspark"]),
6174
),
6275
param(
6376
lambda t, win: t.float_col.ntile(buckets=7).over(win),
@@ -99,7 +112,13 @@ def calc_zscore(s):
99112
lambda _, win: ibis.row_number().over(win),
100113
lambda t: t.cumcount(),
101114
id='row_number',
102-
marks=pytest.mark.notimpl(["pandas"]),
115+
marks=[
116+
pytest.mark.notimpl(["pandas"]),
117+
pytest.mark.broken(
118+
["clickhouse"],
119+
reason="upstream implementation cannot handle subtraction",
120+
),
121+
],
103122
),
104123
param(
105124
lambda t, win: t.double_col.cumsum().over(win),
@@ -143,7 +162,14 @@ def calc_zscore(s):
143162
),
144163
id='cumnotany',
145164
marks=pytest.mark.notyet(
146-
("duckdb", 'impala', 'postgres', 'mysql', 'sqlite'),
165+
(
166+
"clickhouse",
167+
"duckdb",
168+
'impala',
169+
'postgres',
170+
'mysql',
171+
'sqlite',
172+
),
147173
reason="notany() over window not supported",
148174
),
149175
),
@@ -167,7 +193,14 @@ def calc_zscore(s):
167193
),
168194
id='cumnotall',
169195
marks=pytest.mark.notyet(
170-
("duckdb", 'impala', 'postgres', 'mysql', 'sqlite'),
196+
(
197+
"clickhouse",
198+
"duckdb",
199+
'impala',
200+
'postgres',
201+
'mysql',
202+
'sqlite',
203+
),
171204
reason="notall() over window not supported",
172205
),
173206
),
@@ -204,7 +237,7 @@ def calc_zscore(s):
204237
),
205238
],
206239
)
207-
@pytest.mark.notimpl(["clickhouse", "dask", "datafusion"])
240+
@pytest.mark.notimpl(["dask", "datafusion"])
208241
def test_grouped_bounded_expanding_window(
209242
backend, alltypes, df, result_fn, expected_fn
210243
):
@@ -244,14 +277,21 @@ def test_grouped_bounded_expanding_window(
244277
id='mean_udf',
245278
marks=[
246279
pytest.mark.notimpl(
247-
["duckdb", "impala", "mysql", "postgres", "sqlite"]
280+
[
281+
"clickhouse",
282+
"duckdb",
283+
"impala",
284+
"mysql",
285+
"postgres",
286+
"sqlite",
287+
]
248288
)
249289
],
250290
),
251291
],
252292
)
253293
# Some backends do not support non-grouped window specs
254-
@pytest.mark.notimpl(["clickhouse", "dask", "datafusion"])
294+
@pytest.mark.notimpl(["dask", "datafusion"])
255295
def test_ungrouped_bounded_expanding_window(
256296
backend, alltypes, df, result_fn, expected_fn
257297
):
@@ -271,7 +311,7 @@ def test_ungrouped_bounded_expanding_window(
271311
backend.assert_series_equal(left, right)
272312

273313

274-
@pytest.mark.notimpl(["clickhouse", "dask", "datafusion", "pandas"])
314+
@pytest.mark.notimpl(["dask", "datafusion", "pandas"])
275315
def test_grouped_bounded_following_window(backend, alltypes, df):
276316
window = ibis.window(
277317
preceding=0,
@@ -326,7 +366,7 @@ def test_grouped_bounded_following_window(backend, alltypes, df):
326366
),
327367
],
328368
)
329-
@pytest.mark.notimpl(["clickhouse", "dask", "datafusion"])
369+
@pytest.mark.notimpl(["dask", "datafusion"])
330370
def test_grouped_bounded_preceding_window(backend, alltypes, df, window_fn):
331371
window = window_fn(alltypes)
332372

@@ -363,7 +403,15 @@ def test_grouped_bounded_preceding_window(backend, alltypes, df, window_fn):
363403
lambda gb: (gb.double_col.transform('mean')),
364404
id='mean_udf',
365405
marks=pytest.mark.notimpl(
366-
["dask", "duckdb", "impala", "mysql", "postgres", "sqlite"]
406+
[
407+
"clickhouse",
408+
"dask",
409+
"duckdb",
410+
"impala",
411+
"mysql",
412+
"postgres",
413+
"sqlite",
414+
]
367415
),
368416
),
369417
],
@@ -377,7 +425,7 @@ def test_grouped_bounded_preceding_window(backend, alltypes, df, window_fn):
377425
param(False, id='unordered'),
378426
],
379427
)
380-
@pytest.mark.notimpl(["clickhouse", "datafusion"])
428+
@pytest.mark.notimpl(["datafusion"])
381429
def test_grouped_unbounded_window(
382430
backend, alltypes, df, result_fn, expected_fn, ordered
383431
):
@@ -417,7 +465,12 @@ def test_grouped_unbounded_window(
417465
lambda df: pd.Series([df.double_col.mean()] * len(df.double_col)),
418466
True,
419467
id='ordered-mean',
420-
marks=pytest.mark.notimpl(["dask", "impala", "pandas"]),
468+
marks=[
469+
pytest.mark.notimpl(["dask", "impala", "pandas"]),
470+
pytest.mark.broken(
471+
["clickhouse"], reason="upstream appears broken"
472+
),
473+
],
421474
),
422475
param(
423476
lambda t, win: t.double_col.mean().over(win),
@@ -432,6 +485,7 @@ def test_grouped_unbounded_window(
432485
id='ordered-mean_udf',
433486
marks=pytest.mark.notimpl(
434487
[
488+
"clickhouse",
435489
"dask",
436490
"duckdb",
437491
"impala",
@@ -448,7 +502,14 @@ def test_grouped_unbounded_window(
448502
False,
449503
id='unordered-mean_udf',
450504
marks=pytest.mark.notimpl(
451-
["duckdb", "impala", "mysql", "postgres", "sqlite"]
505+
[
506+
"clickhouse",
507+
"duckdb",
508+
"impala",
509+
"mysql",
510+
"postgres",
511+
"sqlite",
512+
]
452513
),
453514
),
454515
# Analytic ops
@@ -471,14 +532,16 @@ def test_grouped_unbounded_window(
471532
lambda df: df.float_col.shift(-1),
472533
True,
473534
id='ordered-lead',
474-
marks=pytest.mark.notimpl(["dask"]),
535+
marks=pytest.mark.notimpl(["clickhouse", "dask"]),
475536
),
476537
param(
477538
lambda t, win: t.float_col.lead().over(win),
478539
lambda df: df.float_col.shift(-1),
479540
False,
480541
id='unordered-lead',
481-
marks=pytest.mark.notimpl(["dask", "mysql", "pyspark"]),
542+
marks=pytest.mark.notimpl(
543+
["clickhouse", "dask", "mysql", "pyspark"]
544+
),
482545
),
483546
param(
484547
lambda t, win: calc_zscore(t.double_col).over(win),
@@ -487,6 +550,7 @@ def test_grouped_unbounded_window(
487550
id='ordered-zscore_udf',
488551
marks=pytest.mark.notimpl(
489552
[
553+
"clickhouse",
490554
"dask",
491555
"duckdb",
492556
"impala",
@@ -504,13 +568,21 @@ def test_grouped_unbounded_window(
504568
False,
505569
id='unordered-zscore_udf',
506570
marks=pytest.mark.notimpl(
507-
["duckdb", "impala", "mysql", "postgres", "pyspark", "sqlite"]
571+
[
572+
"clickhouse",
573+
"duckdb",
574+
"impala",
575+
"mysql",
576+
"postgres",
577+
"pyspark",
578+
"sqlite",
579+
]
508580
),
509581
),
510582
],
511583
)
512584
# Some backends do not support non-grouped window specs
513-
@pytest.mark.notimpl(["clickhouse", "datafusion"])
585+
@pytest.mark.notimpl(["datafusion"])
514586
def test_ungrouped_unbounded_window(
515587
backend, alltypes, df, con, result_fn, expected_fn, ordered
516588
):
@@ -541,7 +613,11 @@ def test_ungrouped_unbounded_window(
541613
backend.assert_series_equal(left, right)
542614

543615

544-
@pytest.mark.notimpl(["clickhouse", "dask", "datafusion", "impala", "pandas"])
616+
@pytest.mark.notimpl(["dask", "datafusion", "impala", "pandas"])
617+
@pytest.mark.notyet(
618+
["clickhouse"],
619+
reason="RANGE OFFSET frame for 'DB::ColumnNullable' ORDER BY column is not implemented", # noqa: E501
620+
)
545621
def test_grouped_bounded_range_window(backend, alltypes, df):
546622
# Explanation of the range window spec below:
547623
#

0 commit comments

Comments
 (0)