Skip to content

Commit 8dcfb8d

Browse files
cpcloud and kszucs
authored and committed
feat(api): upcast pandas DataFrames to memtables in rlz.table rule
1 parent c4254f6 commit 8dcfb8d

File tree

10 files changed

+75
-9
lines changed

10 files changed

+75
-9
lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -687,8 +687,15 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
687687
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
688688
# in theory we could use pandas dataframes, but when using dataframes
689689
# with pyarrow datatypes later reads of this data segfault
690+
schema = op.schema
691+
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
692+
raise exc.IbisTypeError(
693+
"DuckDB cannot yet reliably handle `null` typed columns; "
694+
f"got null typed columns: {null_columns}"
695+
)
696+
690697
name = op.name
691-
table = op.data.to_pyarrow()
698+
table = op.data.to_pyarrow(schema)
692699
with self.begin() as con:
693700
con.connection.register(name, table)
694701

ibis/backends/duckdb/tests/test_datatypes.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from packaging.version import parse as vparse
44
from pytest import param
55

6+
import ibis.common.exceptions as exc
67
import ibis.expr.datatypes as dt
78
from ibis.backends.duckdb.datatypes import parse
89

@@ -95,3 +96,18 @@ def test_cast_uints(uint_type, snapshot):
9596
snapshot.assert_match(
9697
str(ibis.to_sql(t.a.cast(uint_type), dialect="duckdb")), "out.sql"
9798
)
99+
100+
101+
def test_null_dtype():
102+
import ibis
103+
104+
con = ibis.connect("duckdb://:memory:")
105+
106+
t = ibis.memtable({"a": [None, None]})
107+
assert t.schema() == ibis.schema(dict(a="null"))
108+
109+
with pytest.raises(
110+
exc.IbisTypeError,
111+
match="DuckDB cannot yet reliably handle `null` typed columns",
112+
):
113+
con.execute(t)

ibis/backends/pandas/client.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -228,10 +228,12 @@ def __repr__(self) -> str:
228228
def to_frame(self) -> pd.DataFrame:
229229
return self._df
230230

231-
def to_pyarrow(self) -> pa.Table:
231+
def to_pyarrow(self, schema: sch.Schema) -> pa.Table:
232232
import pyarrow as pa
233233

234-
return pa.Table.from_pandas(self._df)
234+
from ibis.backends.pyarrow.datatypes import ibis_to_pyarrow_schema
235+
236+
return pa.Table.from_pandas(self._df, schema=ibis_to_pyarrow_schema(schema))
235237

236238

237239
class PandasInMemoryTable(ops.InMemoryTable):

ibis/backends/pandas/tests/test_datatypes.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -208,7 +208,7 @@ def test_pandas_dtype(pandas_dtype, ibis_dtype):
208208
# mixed
209209
(pd.Series([b'1', '2', 3.0]), dt.binary),
210210
# empty
211-
(pd.Series([], dtype='object'), dt.binary),
211+
(pd.Series([], dtype='object'), dt.null),
212212
(pd.Series([], dtype="string"), dt.string),
213213
],
214214
)

ibis/backends/pyarrow/datatypes.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,8 @@
2626
dt.Boolean: pa.bool_(),
2727
dt.Timestamp: pa.timestamp('ns'),
2828
dt.Date: pa.date64(),
29+
dt.JSON: pa.string(),
30+
dt.Null: pa.null(),
2931
}
3032

3133

ibis/backends/tests/test_generic.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
4444

4545

4646
@pytest.mark.broken(["duckdb", "impala", "bigquery"], 'assert nan is None')
47-
@pytest.mark.notimpl(["datafusion"], raises=NotImplementedError)
47+
@pytest.mark.notimpl(["datafusion"])
4848
def test_null_literal(con, backend):
4949
expr = ibis.null()
5050
result = con.execute(expr)

ibis/backends/tests/test_join.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,9 @@
66
from packaging.version import parse as vparse
77
from pytest import param
88

9+
import ibis.common.exceptions as exc
10+
import ibis.expr.schema as sch
11+
912

1013
def _pandas_semi_join(left, right, on, **_):
1114
assert len(on) == 1, str(on)
@@ -194,3 +197,32 @@ def test_semi_join_topk(batting, awards_players):
194197
left = batting.semi_join(batting.year.topk(5), "year").select("year", "RBI")
195198
expr = left.join(awards_players, left.year == awards_players.yearID)
196199
assert not expr.limit(5).execute().empty
200+
201+
202+
@pytest.mark.notimpl(["dask", "datafusion", "druid", "pandas"])
203+
@pytest.mark.broken(
204+
["duckdb"],
205+
raises=exc.IbisTypeError,
206+
reason="DuckDB as of 0.7.1 occasionally segfaults when there are `null`-typed columns present",
207+
)
208+
def test_join_with_pandas(batting, awards_players):
209+
batting_filt = batting[lambda t: t.yearID < 1900]
210+
awards_players_filt = awards_players[lambda t: t.yearID < 1900].execute()
211+
assert isinstance(awards_players_filt, pd.DataFrame)
212+
expr = batting_filt.join(awards_players_filt, "yearID")
213+
df = expr.execute()
214+
assert df.yearID.nunique() == 7
215+
216+
217+
@pytest.mark.notimpl(["dask", "datafusion", "pandas"])
218+
def test_join_with_pandas_non_null_typed_columns(batting, awards_players):
219+
batting_filt = batting[lambda t: t.yearID < 1900][["yearID"]]
220+
awards_players_filt = awards_players[lambda t: t.yearID < 1900][
221+
["yearID"]
222+
].execute()
223+
# ensure that none of the columns have type null
224+
assert sch.infer(awards_players_filt) == sch.Schema(dict(yearID="int"))
225+
assert isinstance(awards_players_filt, pd.DataFrame)
226+
expr = batting_filt.join(awards_players_filt, "yearID")
227+
df = expr.execute()
228+
assert df.yearID.nunique() == 7

ibis/expr/datatypes/value.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ def _infer_object_array_dtype(x):
195195
'timedelta': dt.interval,
196196
'time': dt.time,
197197
'period': dt.binary,
198-
'empty': dt.binary,
198+
'empty': dt.null,
199199
'unicode': dt.string,
200200
}[classifier]
201201

ibis/expr/rules.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -466,8 +466,14 @@ def table(arg, schema=None, **kwargs):
466466
it must be of the specified type. The table may have extra columns not
467467
specified in the schema.
468468
"""
469+
import pandas as pd
470+
471+
import ibis
469472
import ibis.expr.operations as ops
470473

474+
if isinstance(arg, pd.DataFrame):
475+
arg = ibis.memtable(arg).op()
476+
471477
if not isinstance(arg, ops.TableNode):
472478
raise com.IbisTypeError(
473479
f'Argument is not a table; got type {type(arg).__name__}'

ibis/util.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@
3636
import pyarrow as pa
3737

3838
import ibis.expr.operations as ops
39+
import ibis.expr.schema as sch
3940

4041
Graph = Mapping[ops.Node, Sequence[ops.Node]]
4142

@@ -493,11 +494,11 @@ class ToFrame(abc.ABC):
493494

494495
@abc.abstractmethod
495496
def to_frame(self) -> pd.DataFrame: # pragma: no cover
496-
...
497+
"""Convert this input to a pandas DataFrame."""
497498

498499
@abc.abstractmethod
499-
def to_pyarrow(self) -> pa.Table: # pragma: no cover
500-
...
500+
def to_pyarrow(self, schema: sch.Schema) -> pa.Table: # pragma: no cover
501+
"""Convert this input to a PyArrow Table."""
501502

502503

503504
def backend_entry_points() -> list[importlib.metadata.EntryPoint]:

0 commit comments

Comments
 (0)