Skip to content

Commit 8a747fb

Browse files
jcrist authored and cpcloud committed
feat: implement dropna for SQL backends
1 parent 0a2f315 commit 8a747fb

File tree

7 files changed

+51
-37
lines changed

7 files changed

+51
-37
lines changed

ibis/backends/base/sql/compiler/select_builder.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
from __future__ import annotations
22

3+
import functools
4+
import operator
35
from typing import NamedTuple
46

57
import toolz
68

9+
import ibis
710
import ibis.common.exceptions as com
811
import ibis.expr.analysis as L
912
import ibis.expr.operations as ops
@@ -442,6 +445,28 @@ def _collect_Distinct(self, expr, toplevel=False):
442445

443446
self._collect(expr.op().table, toplevel=toplevel)
444447

448+
def _collect_DropNa(self, expr, toplevel=False):
449+
if toplevel:
450+
op = expr.op()
451+
if op.subset is None:
452+
columns = [op.table[c] for c in op.table.columns]
453+
else:
454+
columns = op.subset
455+
if columns:
456+
filters = [
457+
functools.reduce(
458+
operator.and_ if op.how == "any" else operator.or_,
459+
[c.notnull() for c in columns],
460+
)
461+
]
462+
elif op.how == "all":
463+
filters = [ibis.literal(False)]
464+
else:
465+
filters = []
466+
self.table_set = op.table
467+
self.select_set = [op.table]
468+
self.filters = filters
469+
445470
def _collect_Limit(self, expr, toplevel=False):
446471
if not toplevel:
447472
return

ibis/backends/dask/execution/generic.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -117,7 +117,10 @@
117117
execute_difference_dataframe_dataframe,
118118
)
119119
],
120-
ops.DropNa: [((dd.DataFrame, tuple), execute_node_dropna_dataframe)],
120+
ops.DropNa: [
121+
((dd.DataFrame, tuple), execute_node_dropna_dataframe),
122+
((dd.DataFrame, type(None)), execute_node_dropna_dataframe),
123+
],
121124
ops.FillNa: [
122125
((dd.DataFrame, simple_types), execute_node_fillna_dataframe_scalar),
123126
((dd.DataFrame,), execute_node_fillna_dataframe_dict),

ibis/backends/pandas/execution/generic.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1144,9 +1144,11 @@ def execute_node_log_number_number(op, value, base, **kwargs):
11441144
return math.log(value, base)
11451145

11461146

1147+
@execute_node.register(ops.DropNa, pd.DataFrame, type(None))
11471148
@execute_node.register(ops.DropNa, pd.DataFrame, tuple)
11481149
def execute_node_dropna_dataframe(op, df, subset, **kwargs):
1149-
subset = [col.get_name() for col in subset] if subset else None
1150+
if subset is not None:
1151+
subset = [col.get_name() for col in subset]
11501152
return df.dropna(how=op.how, subset=subset)
11511153

11521154

ibis/backends/pyspark/compiler.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1986,7 +1986,9 @@ def compile_not_null(t, expr, scope, timecontext, **kwargs):
19861986
def compile_dropna_table(t, expr, scope, timecontext, **kwargs):
19871987
op = expr.op()
19881988
table = t.translate(op.table, scope, timecontext, **kwargs)
1989-
subset = [col.get_name() for col in op.subset] if op.subset else None
1989+
subset = op.subset
1990+
if subset is not None:
1991+
subset = [col.get_name() for col in subset]
19901992
return table.dropna(how=op.how, subset=subset)
19911993

19921994

ibis/backends/tests/test_generic.py

Lines changed: 13 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@
1111

1212
import ibis
1313
import ibis.common.exceptions as com
14-
import ibis.util as util
1514
from ibis import _
1615
from ibis import literal as L
1716

@@ -385,45 +384,29 @@ def test_mutate_rename(alltypes):
385384
assert list(result.columns) == ["bool_col", "string_col", "dupe_col"]
386385

387386

387+
@pytest.mark.parametrize('how', ['any', 'all'])
388388
@pytest.mark.parametrize(
389-
('how', 'subset'),
390-
[
391-
('any', None),
392-
('any', []),
393-
('any', ['int_col', 'na_col']),
394-
('all', None),
395-
('all', ['int_col', 'na_col']),
396-
('all', 'none_col'),
397-
],
398-
)
399-
@pytest.mark.notimpl(
400-
[
401-
"clickhouse",
402-
"datafusion",
403-
"impala",
404-
"mysql",
405-
"postgres",
406-
"sqlite",
407-
]
389+
'subset', [None, [], 'col_1', ['col_1', 'col_2'], ['col_1', 'col_3']]
408390
)
409-
@pytest.mark.notyet(["duckdb"], reason="non-finite value support")
391+
@pytest.mark.notimpl(["datafusion"])
410392
def test_dropna_table(backend, alltypes, how, subset):
411-
table = alltypes.mutate(na_col=np.nan)
412-
table = table.mutate(none_col=None)
413-
table = table.mutate(none_col=table['none_col'].cast('float64'))
393+
is_two = alltypes.int_col == 2
394+
is_four = alltypes.int_col == 4
395+
396+
table = alltypes.mutate(
397+
col_1=is_two.ifelse(ibis.NA, alltypes.float_col),
398+
col_2=is_four.ifelse(ibis.NA, alltypes.float_col),
399+
col_3=(is_two | is_four).ifelse(ibis.NA, alltypes.float_col),
400+
).select("col_1", "col_2", "col_3")
401+
414402
table_pandas = table.execute()
415403

416404
result = table.dropna(subset, how).execute().reset_index(drop=True)
417-
subset = util.promote_list(subset) if subset else table_pandas.columns
418405
expected = table_pandas.dropna(how=how, subset=subset).reset_index(
419406
drop=True
420407
)
421408

422-
# check_dtype is False here because there are dtype diffs between
423-
# Pyspark and Pandas on Java 8 - the 'bool_col' of an empty DataFrame
424-
# is type object in Pyspark, and type bool in Pandas. This diff does
425-
# not exist in Java 11.
426-
backend.assert_frame_equal(result, expected, check_dtype=False)
409+
backend.assert_frame_equal(result, expected)
427410

428411

429412
def test_select_sort_sort(alltypes):

ibis/expr/operations/relations.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -794,7 +794,7 @@ class DropNa(TableNode, sch.HasSchema):
794794

795795
table = rlz.table
796796
how = rlz.isin({'any', 'all'})
797-
subset = rlz.optional(rlz.tuple_of(rlz.column_from("table")), default=())
797+
subset = rlz.optional(rlz.tuple_of(rlz.column_from("table")), default=None)
798798

799799
@property
800800
def schema(self):

ibis/expr/types/relations.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -874,9 +874,8 @@ def dropna(
874874
"""
875875
from ibis.expr import operations as ops
876876

877-
if subset is None:
878-
subset = []
879-
subset = util.promote_list(subset)
877+
if subset is not None:
878+
subset = util.promote_list(subset)
880879
return ops.DropNa(self, how, subset).to_expr()
881880

882881
def fillna(

0 commit comments

Comments
 (0)