Skip to content

Commit db512a8

Browse files
hikitani authored and cpcloud committed
feat: support of arrays and tuples for clickhouse
add support array index op
add method for converting of dtype to typename
add support array concat op
update array index op
it uses translate of exprs
fix format
add support array repeat op
add support array slice op
add DateTime64 to ch dtypes
update test_array_index for check negative ids
add support for neg slice ids
use f-string in array repeat op
refactor to_ibis method
style: oneline expr for array repeat
Co-authored-by: Phillip Cloud <[email protected]>
test: remove xfail on array slice and index ops
removed for:
* dask
* pandas
* postgres
* pyspark
fix: add negative index support for postgres
style: remove unused module
1 parent 2ba540d commit db512a8

File tree

10 files changed

+234
-179
lines changed

10 files changed

+234
-179
lines changed

ibis/backends/clickhouse/client.py

Lines changed: 38 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -28,23 +28,29 @@
2828
'FixedString': dt.String,
2929
'Date': dt.Date,
3030
'DateTime': dt.Timestamp,
31+
'DateTime64': dt.Timestamp,
32+
'Array': dt.Array,
3133
}
3234
_ibis_dtypes = {v: k for k, v in _clickhouse_dtypes.items()}
3335
_ibis_dtypes[dt.String] = 'String'
36+
_ibis_dtypes[dt.Timestamp] = 'DateTime'
3437

3538

3639
class ClickhouseDataType:
3740

38-
__slots__ = 'typename', 'nullable'
41+
__slots__ = 'typename', 'base_typename', 'nullable'
3942

4043
def __init__(self, typename, nullable=False):
4144
m = base_typename_re.match(typename)
42-
base_typename = m.groups()[0]
43-
if base_typename not in _clickhouse_dtypes:
45+
self.base_typename = m.groups()[0]
46+
if self.base_typename not in _clickhouse_dtypes:
4447
raise com.UnsupportedBackendType(typename)
45-
self.typename = base_typename
48+
self.typename = self.base_typename
4649
self.nullable = nullable
4750

51+
if self.base_typename == 'Array':
52+
self.typename = typename
53+
4854
def __str__(self):
4955
if self.nullable:
5056
return f'Nullable({self.typename})'
@@ -63,11 +69,37 @@ def parse(cls, spec):
6369
return cls(spec)
6470

6571
def to_ibis(self):
66-
return _clickhouse_dtypes[self.typename](nullable=self.nullable)
72+
if self.base_typename != 'Array':
73+
return _clickhouse_dtypes[self.typename](nullable=self.nullable)
74+
75+
sub_type = ClickhouseDataType(
76+
self.get_subname(self.typename)
77+
).to_ibis()
78+
return dt.Array(value_type=sub_type)
79+
80+
@staticmethod
def get_subname(name: str) -> str:
    """Return the text between the first ``(`` and the last ``)`` of *name*.

    For a type name such as ``Array(Array(Int8))`` this yields
    ``Array(Int8)``. An empty string is returned when *name* has no opening
    or no closing bracket.
    """
    start = name.find('(')
    end = name.rfind(')')

    # str.find / str.rfind return -1 when the bracket is absent.
    if start == -1 or end == -1:
        return ''

    return name[start + 1 : end]
90+
91+
@staticmethod
def get_typename_from_ibis_dtype(dtype):
    """Return the ClickHouse type name for an ibis *dtype*.

    Non-array dtypes are looked up in ``_ibis_dtypes`` by their class.
    Array dtypes are rendered as ``Array(<inner>)``, where ``<inner>`` is
    obtained by recursing on ``dtype.value_type``, so nested arrays produce
    nested ``Array(...)`` names.
    """
    if isinstance(dtype, dt.Array):
        inner = ClickhouseDataType.get_typename_from_ibis_dtype(
            dtype.value_type
        )
        return f'Array({inner})'

    return _ibis_dtypes[type(dtype)]
6799

68100
@classmethod
69101
def from_ibis(cls, dtype, nullable=None):
70-
typename = _ibis_dtypes[type(dtype)]
102+
typename = ClickhouseDataType.get_typename_from_ibis_dtype(dtype)
71103
if nullable is None:
72104
nullable = dtype.nullable
73105
return cls(typename, nullable=nullable)

ibis/backends/clickhouse/registry.py

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,58 @@ def formatter(translator, expr):
7474
return formatter
7575

7676

77+
def _array_index_op(translator, expr):
    """Translate an array-index expression into an ``arrayElement`` call.

    The first two arguments of the op are the array and the index. The array
    is translated as-is and the index is translated and parenthesized.
    """
    op = expr.op()
    arr = op.args[0]
    idx = op.args[1]

    arr_sql = translator.translate(arr)
    idx_sql = _parenthesize(translator, idx)

    # Non-negative indexes are shifted by one (arrayElement counts from 1);
    # negative indexes are passed through unchanged.
    position = f'if({idx_sql} >= 0, {idx_sql} + 1, {idx_sql})'

    return f'arrayElement({arr_sql}, {position})'
89+
90+
91+
def _array_repeat_op(translator, expr):
    """Translate an array-repeat expression into a ClickHouse subquery.

    The generated SQL selects the array once per row from ``system.numbers``
    limited to ``times`` rows, gathers those rows with ``groupArray`` and
    flattens the result with ``arrayFlatten``.
    """
    arr, times = expr.op().args

    arr_sql = _parenthesize(translator, arr)
    times_sql = _parenthesize(translator, times)

    # Inner query: one copy of the array per row, `times` rows in total.
    repeated_rows = (
        f'(select {arr_sql} as arr from system.numbers limit {times_sql})'
    )
    return f'(select arrayFlatten(groupArray(arr)) from {repeated_rows})'
101+
102+
103+
def _array_slice_op(translator, expr):
    """Translate an array-slice expression into an ``arraySlice`` call.

    The op's arguments are ``(arg, start, stop)``. ``start`` is converted to
    the offset expected by ``arraySlice`` (non-negative values are shifted by
    one, negative values are passed through). When ``stop`` is ``None`` the
    two-argument form is emitted; otherwise a length expression is derived
    from ``start`` and ``stop``.
    """
    arg, start, stop = expr.op().args

    start_ = _parenthesize(translator, start)
    arg_ = translator.translate(arg)

    start_correct_ = f'if({start_} < 0, {start_}, {start_} + 1)'

    if stop is None:
        return f'arraySlice({arg_}, {start_correct_})'

    stop_ = _parenthesize(translator, stop)

    # Absolute position of a negative start. `length` is used because
    # `arrayCount` without a lambda counts the non-zero elements rather than
    # all elements, which miscounts arrays that contain zeros.
    neg_start_ = f'(length({arg_}) + {start_})'

    def span_from(begin):
        # Number of elements between `begin` and `stop`, never negative.
        # NOTE(review): the literal is spelled `-0` as in the original
        # expression; kept unchanged.
        return f'greatest(-0, {stop_} - {begin})'

    # A negative stop is handed to arraySlice directly as the length
    # argument; otherwise the length is the clamped distance from the
    # (possibly negative) start to stop.
    length_ = (
        f'if({stop_} < 0, {stop_}, '
        f'if({start_} < 0, {span_from(neg_start_)}, {span_from(start_)}))'
    )

    return f'arraySlice({arg_}, {start_correct_}, {length_})'
127+
128+
77129
def _agg(func):
78130
def formatter(translator, expr):
79131
return _aggregate(translator, func, *expr.op().args)
@@ -644,6 +696,10 @@ def _group_concat(translator, expr):
644696
ops.ExistsSubquery: _exists_subquery,
645697
ops.NotExistsSubquery: _exists_subquery,
646698
ops.ArrayLength: _unary('length'),
699+
ops.ArrayIndex: _array_index_op,
700+
ops.ArrayConcat: _fixed_arity('arrayConcat', 2),
701+
ops.ArrayRepeat: _array_repeat_op,
702+
ops.ArraySlice: _array_slice_op,
647703
}
648704

649705

ibis/backends/clickhouse/tests/test_operators.py

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -273,3 +273,65 @@ def test_search_case(con, alltypes, translate):
273273
END"""
274274
assert translate(expr) == expected
275275
assert len(con.execute(expr))
276+
277+
278+
@pytest.mark.parametrize(
    'arr',
    [
        [1, 2, 3],
        ['qw', 'wq', '1'],
        [1.2, 0.3, 0.4],
        [[1], [1, 2], [1, 2, 3]],
    ],
)
@pytest.mark.parametrize(
    'ids',
    [
        lambda arr: range(len(arr)),
        lambda arr: range(-len(arr), 0),
    ],
)
def test_array_index(con, arr, ids):
    """Indexing an array literal matches Python list indexing.

    ``ids`` yields either every non-negative or every negative valid index
    for ``arr``.
    """
    expr = L(arr)
    for i in ids(arr):
        el = con.execute(expr[i])
        assert el == arr[i]
300+
301+
302+
@pytest.mark.parametrize(
    'arrays',
    [
        ([1], [2]),
        ([1], [1, 2]),
        ([1, 2], [1]),
        ([1, 2], [3, 4]),
        ([1, 2], [3, 4], [5, 6]),
    ],
)
def test_array_concat(con, arrays):
    """Concatenating array literals matches Python list concatenation."""
    expected = sum(arrays, [])

    # Start from an empty array cast to Array(int8) and append each literal.
    expr = L([]).cast(dt.Array(dt.int8))
    for arr in arrays:
        expr += L(arr)

    assert con.execute(expr) == expected
319+
320+
321+
@pytest.mark.parametrize(
    ('arr', 'times'),
    [([1], 1), ([1], 2), ([1], 3), ([1, 2], 1), ([1, 2], 2), ([1, 2], 3)],
)
def test_array_repeat(con, arr, times):
    """Repeating an array literal matches Python list repetition."""
    expr = L(arr)
    expected = arr * times

    assert con.execute(expr * times) == expected
330+
331+
332+
@pytest.mark.parametrize('arr', [[], [1], [1, 2, 3, 4, 5, 6]])
@pytest.mark.parametrize('start', [None, 0, 1, 2, -1, -3])
@pytest.mark.parametrize('stop', [None, 0, 1, 3, -2, -4])
def test_array_slice(con, arr, start, stop):
    """Slicing an array literal matches Python list slicing."""
    expr = L(arr)
    assert con.execute(expr[start:stop]) == arr[start:stop]

ibis/backends/clickhouse/tests/test_types.py

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,9 @@
1+
import pytest
12
from pkg_resources import parse_version
23

4+
import ibis.expr.datatypes as dt
5+
from ibis.backends.clickhouse.client import ClickhouseDataType
6+
37

48
def test_column_types(alltypes):
59
df = alltypes.execute()
@@ -23,3 +27,38 @@ def test_columns_types_with_additional_argument(con):
2327
assert df.fixedstring_col.dtype.name == 'object'
2428
if parse_version(con.version).base_version >= '1.1.54337':
2529
assert df.datetime_col.dtype.name == 'datetime64[ns]'
30+
31+
32+
@pytest.mark.parametrize(
    ('ch_type', 'ibis_type'),
    [
        ('Array(Int8)', dt.Array(dt.Int8(nullable=False))),
        ('Array(Int16)', dt.Array(dt.Int16(nullable=False))),
        ('Array(Int32)', dt.Array(dt.Int32(nullable=False))),
        ('Array(Int64)', dt.Array(dt.Int64(nullable=False))),
        ('Array(UInt8)', dt.Array(dt.UInt8(nullable=False))),
        ('Array(UInt16)', dt.Array(dt.UInt16(nullable=False))),
        ('Array(UInt32)', dt.Array(dt.UInt32(nullable=False))),
        ('Array(UInt64)', dt.Array(dt.UInt64(nullable=False))),
        ('Array(Float32)', dt.Array(dt.Float32(nullable=False))),
        ('Array(Float64)', dt.Array(dt.Float64(nullable=False))),
        ('Array(String)', dt.Array(dt.String(nullable=False))),
        ('Array(FixedString(32))', dt.Array(dt.String(nullable=False))),
        ('Array(Date)', dt.Array(dt.Date(nullable=False))),
        ('Array(DateTime)', dt.Array(dt.Timestamp(nullable=False))),
        ('Array(DateTime64)', dt.Array(dt.Timestamp(nullable=False))),
        ('Array(Nothing)', dt.Array(dt.Null(nullable=False))),
        ('Array(Null)', dt.Array(dt.Null(nullable=False))),
        ('Array(Array(Int8))', dt.Array(dt.Array(dt.Int8(nullable=False)))),
        (
            'Array(Array(Array(Int8)))',
            dt.Array(dt.Array(dt.Array(dt.Int8(nullable=False)))),
        ),
        (
            'Array(Array(Array(Array(Int8))))',
            dt.Array(dt.Array(dt.Array(dt.Array(dt.Int8(nullable=False))))),
        ),
    ],
)
def test_array_type(ch_type, ibis_type):
    """A ClickHouse ``Array(...)`` type name converts to the ibis dtype."""
    assert ClickhouseDataType(ch_type).to_ibis() == ibis_type

ibis/backends/dask/tests/execution/test_arrays.py

Lines changed: 6 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
import pandas as pd
66
import pytest
77
from dask.dataframe.utils import tm
8-
from pytest import param
98

109
import ibis
1110
from ibis.common.exceptions import IbisTypeError
@@ -103,29 +102,9 @@ def test_array_collect_scalar(client):
103102
(None, 3),
104103
(None, None),
105104
(3, None),
106-
# negative slices are not supported
107-
# TODO: uncomment once test as a whole is not xfailed
108-
# param(
109-
# -3,
110-
# None,
111-
# marks=pytest.mark.xfail(
112-
# raises=ValueError, reason='Negative slicing not supported'
113-
# ),
114-
# ),
115-
# param(
116-
# None,
117-
# -3,
118-
# marks=pytest.mark.xfail(
119-
# raises=ValueError, reason='Negative slicing not supported'
120-
# ),
121-
# ),
122-
# param(
123-
# -3,
124-
# -1,
125-
# marks=pytest.mark.xfail(
126-
# raises=ValueError, reason='Negative slicing not supported'
127-
# ),
128-
# ),
105+
(-3, None),
106+
(None, -3),
107+
(-3, -1),
129108
],
130109
)
131110
def test_array_slice(t, df, start, stop):
@@ -146,28 +125,9 @@ def test_array_slice(t, df, start, stop):
146125
(None, 3),
147126
(None, None),
148127
(3, None),
149-
# negative slices are not supported
150-
param(
151-
-3,
152-
None,
153-
marks=pytest.mark.xfail(
154-
raises=ValueError, reason='Negative slicing not supported'
155-
),
156-
),
157-
param(
158-
None,
159-
-3,
160-
marks=pytest.mark.xfail(
161-
raises=ValueError, reason='Negative slicing not supported'
162-
),
163-
),
164-
param(
165-
-3,
166-
-1,
167-
marks=pytest.mark.xfail(
168-
raises=ValueError, reason='Negative slicing not supported'
169-
),
170-
),
128+
(-3, None),
129+
(None, -3),
130+
(-3, -1),
171131
],
172132
)
173133
def test_array_slice_scalar(client, start, stop):

0 commit comments

Comments
 (0)