Skip to content

Commit 8dcfb8d

Browse files
cpcloud and kszucs
authored and committed
feat(api): upcast pandas DataFrames to memtables in rlz.table rule
1 parent c4254f6 commit 8dcfb8d

File tree

10 files changed

+75
-9
lines changed

10 files changed

+75
-9
lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -687,8 +687,15 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
687687
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
688688
# in theory we could use pandas dataframes, but when using dataframes
689689
# with pyarrow datatypes later reads of this data segfault
690+
schema = op.schema
691+
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
692+
raise exc.IbisTypeError(
693+
"DuckDB cannot yet reliably handle `null` typed columns; "
694+
f"got null typed columns: {null_columns}"
695+
)
696+
690697
name = op.name
691-
table = op.data.to_pyarrow()
698+
table = op.data.to_pyarrow(schema)
692699
with self.begin() as con:
693700
con.connection.register(name, table)
694701

ibis/backends/duckdb/tests/test_datatypes.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from packaging.version import parse as vparse
44
from pytest import param
55

6+
import ibis.common.exceptions as exc
67
import ibis.expr.datatypes as dt
78
from ibis.backends.duckdb.datatypes import parse
89

@@ -95,3 +96,18 @@ def test_cast_uints(uint_type, snapshot):
9596
snapshot.assert_match(
9697
str(ibis.to_sql(t.a.cast(uint_type), dialect="duckdb")), "out.sql"
9798
)
99+
100+
101+
def test_null_dtype():
102+
import ibis
103+
104+
con = ibis.connect("duckdb://:memory:")
105+
106+
t = ibis.memtable({"a": [None, None]})
107+
assert t.schema() == ibis.schema(dict(a="null"))
108+
109+
with pytest.raises(
110+
exc.IbisTypeError,
111+
match="DuckDB cannot yet reliably handle `null` typed columns",
112+
):
113+
con.execute(t)

ibis/backends/pandas/client.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -228,10 +228,12 @@ def __repr__(self) -> str:
228228
def to_frame(self) -> pd.DataFrame:
229229
return self._df
230230

231-
def to_pyarrow(self) -> pa.Table:
231+
def to_pyarrow(self, schema: sch.Schema) -> pa.Table:
232232
import pyarrow as pa
233233

234-
return pa.Table.from_pandas(self._df)
234+
from ibis.backends.pyarrow.datatypes import ibis_to_pyarrow_schema
235+
236+
return pa.Table.from_pandas(self._df, schema=ibis_to_pyarrow_schema(schema))
235237

236238

237239
class PandasInMemoryTable(ops.InMemoryTable):

ibis/backends/pandas/tests/test_datatypes.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -208,7 +208,7 @@ def test_pandas_dtype(pandas_dtype, ibis_dtype):
208208
# mixed
209209
(pd.Series([b'1', '2', 3.0]), dt.binary),
210210
# empty
211-
(pd.Series([], dtype='object'), dt.binary),
211+
(pd.Series([], dtype='object'), dt.null),
212212
(pd.Series([], dtype="string"), dt.string),
213213
],
214214
)

ibis/backends/pyarrow/datatypes.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,8 @@
2626
dt.Boolean: pa.bool_(),
2727
dt.Timestamp: pa.timestamp('ns'),
2828
dt.Date: pa.date64(),
29+
dt.JSON: pa.string(),
30+
dt.Null: pa.null(),
2931
}
3032

3133

ibis/backends/tests/test_generic.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
4444

4545

4646
@pytest.mark.broken(["duckdb", "impala", "bigquery"], 'assert nan is None')
47-
@pytest.mark.notimpl(["datafusion"], raises=NotImplementedError)
47+
@pytest.mark.notimpl(["datafusion"])
4848
def test_null_literal(con, backend):
4949
expr = ibis.null()
5050
result = con.execute(expr)

ibis/backends/tests/test_join.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,9 @@
66
from packaging.version import parse as vparse
77
from pytest import param
88

9+
import ibis.common.exceptions as exc
10+
import ibis.expr.schema as sch
11+
912

1013
def _pandas_semi_join(left, right, on, **_):
1114
assert len(on) == 1, str(on)
@@ -194,3 +197,32 @@ def test_semi_join_topk(batting, awards_players):
194197
left = batting.semi_join(batting.year.topk(5), "year").select("year", "RBI")
195198
expr = left.join(awards_players, left.year == awards_players.yearID)
196199
assert not expr.limit(5).execute().empty
200+
201+
202+
@pytest.mark.notimpl(["dask", "datafusion", "druid", "pandas"])
203+
@pytest.mark.broken(
204+
["duckdb"],
205+
raises=exc.IbisTypeError,
206+
reason="DuckDB as of 0.7.1 occasionally segfaults when there are `null`-typed columns present",
207+
)
208+
def test_join_with_pandas(batting, awards_players):
209+
batting_filt = batting[lambda t: t.yearID < 1900]
210+
awards_players_filt = awards_players[lambda t: t.yearID < 1900].execute()
211+
assert isinstance(awards_players_filt, pd.DataFrame)
212+
expr = batting_filt.join(awards_players_filt, "yearID")
213+
df = expr.execute()
214+
assert df.yearID.nunique() == 7
215+
216+
217+
@pytest.mark.notimpl(["dask", "datafusion", "pandas"])
218+
def test_join_with_pandas_non_null_typed_columns(batting, awards_players):
219+
batting_filt = batting[lambda t: t.yearID < 1900][["yearID"]]
220+
awards_players_filt = awards_players[lambda t: t.yearID < 1900][
221+
["yearID"]
222+
].execute()
223+
# ensure that none of the columns have type null
224+
assert sch.infer(awards_players_filt) == sch.Schema(dict(yearID="int"))
225+
assert isinstance(awards_players_filt, pd.DataFrame)
226+
expr = batting_filt.join(awards_players_filt, "yearID")
227+
df = expr.execute()
228+
assert df.yearID.nunique() == 7

ibis/expr/datatypes/value.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ def _infer_object_array_dtype(x):
195195
'timedelta': dt.interval,
196196
'time': dt.time,
197197
'period': dt.binary,
198-
'empty': dt.binary,
198+
'empty': dt.null,
199199
'unicode': dt.string,
200200
}[classifier]
201201

ibis/expr/rules.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -466,8 +466,14 @@ def table(arg, schema=None, **kwargs):
466466
it must be of the specified type. The table may have extra columns not
467467
specified in the schema.
468468
"""
469+
import pandas as pd
470+
471+
import ibis
469472
import ibis.expr.operations as ops
470473

474+
if isinstance(arg, pd.DataFrame):
475+
arg = ibis.memtable(arg).op()
476+
471477
if not isinstance(arg, ops.TableNode):
472478
raise com.IbisTypeError(
473479
f'Argument is not a table; got type {type(arg).__name__}'

ibis/util.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@
3636
import pyarrow as pa
3737

3838
import ibis.expr.operations as ops
39+
import ibis.expr.schema as sch
3940

4041
Graph = Mapping[ops.Node, Sequence[ops.Node]]
4142

@@ -493,11 +494,11 @@ class ToFrame(abc.ABC):
493494

494495
@abc.abstractmethod
495496
def to_frame(self) -> pd.DataFrame: # pragma: no cover
496-
...
497+
"""Convert this input to a pandas DataFrame."""
497498

498499
@abc.abstractmethod
499-
def to_pyarrow(self) -> pa.Table: # pragma: no cover
500-
...
500+
def to_pyarrow(self, schema: sch.Schema) -> pa.Table: # pragma: no cover
501+
"""Convert this input to a PyArrow Table."""
501502

502503

503504
def backend_entry_points() -> list[importlib.metadata.EntryPoint]:

0 commit comments

Comments
 (0)