Skip to content

Commit 9b7a377

Browse files
NickCrews authored and cpcloud committed
perf(duckdb): reduce branching factor for ArrayDistinct
If I do something like `some_really_complex_array_expression.unique()`, the SQL for `some_really_complex_array_expression` used to be duplicated 4 times in the compiled query. Now it is duplicated only 3 times. This matters for performance because DuckDB does not always seem to deduplicate repeated subexpressions, and can actually redo the computation once for each time a subexpression appears. See duckdb/duckdb#14649. So this change can reduce the computation time to roughly 3/4 of what it was. I want to go through our other compilation steps and make similar optimizations wherever possible.
1 parent a28ceb1 commit 9b7a377

File tree

2 files changed

+10
-17
lines changed

2 files changed

+10
-17
lines changed

ibis/backends/sql/compilers/duckdb.py

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -143,15 +143,17 @@ def visit_StructColumn(self, op, *, names, values):
143143
)
144144

145145
def visit_ArrayDistinct(self, op, *, arg):
146+
return self._array_distinct(arg)
147+
148+
def _array_distinct(self, arg: sge.Expression) -> sge.Expression:
149+
x = sg.to_identifier("x")
150+
is_null = sge.Lambda(this=x.is_(NULL), expressions=[x])
151+
contains_null = self.f.list_bool_or(self.f.list_apply(arg, is_null))
146152
return self.if_(
147153
arg.is_(NULL),
148154
NULL,
149155
self.f.list_distinct(arg)
150-
+ self.if_(
151-
self.f.list_count(arg) < self.f.len(arg),
152-
self.f.array(NULL),
153-
self.f.array(),
154-
),
156+
+ self.if_(contains_null, self.f.array(NULL), self.f.array()),
155157
)
156158

157159
def visit_ArrayPosition(self, op, *, arg, other):
@@ -223,16 +225,7 @@ def visit_ArrayRemove(self, op, *, arg, other):
223225

224226
def visit_ArrayUnion(self, op, *, left, right):
225227
arg = self.f.list_concat(left, right)
226-
return self.if_(
227-
arg.is_(NULL),
228-
NULL,
229-
self.f.list_distinct(arg)
230-
+ self.if_(
231-
self.f.list_count(arg) < self.f.len(arg),
232-
self.f.array(NULL),
233-
self.f.array(),
234-
),
235-
)
228+
return self._array_distinct(arg)
236229

237230
def visit_ArrayZip(self, op, *, arg):
238231
i = sg.to_identifier("i")

ibis/backends/tests/test_array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -779,8 +779,8 @@ def test_array_remove(con, input, expected):
779779
("input", "expected"),
780780
[
781781
param(
782-
{"a": [[1, 3, 3], [], [42, 42], [], [None], None]},
783-
[{3, 1}, set(), {42}, set(), {None}, None],
782+
{"a": [[1, 3, 3], [1, 3, None, 3], [42, 42], [], [None], None]},
783+
[{3, 1}, {1, 3, None}, {42}, set(), {None}, None],
784784
id="null",
785785
marks=[
786786
pytest.mark.notyet(

0 commit comments

Comments
 (0)