Skip to content

Commit d6a2f09

Browse files
gforsyth authored and cpcloud committed
refactor(duckdb): use pyarrow for all memtable registration
1 parent 12058f2 commit d6a2f09

File tree

2 files changed: +10 −24 lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 2 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,6 @@
2727
from ibis.backends.base.sqlglot import C, F
2828
from ibis.backends.duckdb.compiler import DuckDBSQLCompiler
2929
from ibis.backends.duckdb.datatypes import DuckDBType
30-
from ibis.expr.operations.relations import PandasDataFrameProxy
3130
from ibis.expr.operations.udf import InputType
3231
from ibis.formats.pandas import PandasData
3332

@@ -1171,10 +1170,6 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
11711170
yield name, ibis_type
11721171

11731172
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
1174-
# in theory we could use pandas dataframes, but when using dataframes
1175-
# with pyarrow datatypes later reads of this data segfault
1176-
import pandas as pd
1177-
11781173
schema = op.schema
11791174
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
11801175
raise exc.IbisTypeError(
@@ -1184,32 +1179,15 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
11841179

11851180
# only register if we haven't already done so
11861181
if (name := op.name) not in self.list_tables():
1187-
if isinstance(data := op.data, PandasDataFrameProxy):
1188-
table = data.to_frame()
1189-
1190-
# convert to object string dtypes because duckdb is either
1191-
# 1. extremely slow to register DataFrames with not-pyarrow
1192-
# string dtypes
1193-
# 2. broken for string[pyarrow] dtypes (segfault)
1194-
if conversions := {
1195-
colname: "str"
1196-
for colname, col in table.items()
1197-
if isinstance(col.dtype, pd.StringDtype)
1198-
}:
1199-
table = table.astype(conversions)
1200-
else:
1201-
table = data.to_pyarrow(schema)
1182+
table = op.data.to_pyarrow(schema)
12021183

12031184
# register creates a transaction, and we can't nest transactions so
12041185
# we create a function to encapsulate the whole shebang
12051186
def _register(name, table):
12061187
with self.begin() as con:
12071188
con.connection.register(name, table)
12081189

1209-
try:
1210-
_register(name, table)
1211-
except duckdb.NotImplementedException:
1212-
_register(name, data.to_pyarrow(schema))
1190+
_register(name, table)
12131191

12141192
def _get_temp_view_definition(
12151193
self, name: str, definition: sa.sql.compiler.Compiled

ibis/backends/duckdb/tests/test_register.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -367,3 +367,11 @@ def test_register_filesystem_gcs(con):
367367
)
368368

369369
assert band_members.count().to_pyarrow()
370+
371+
372+
def test_memtable_null_column_parquet_dtype_roundtrip(con, tmp_path):
373+
before = ibis.memtable({"a": [None, None, None]}, schema={"a": "string"})
374+
before.to_parquet(tmp_path / "tmp.parquet")
375+
after = ibis.read_parquet(tmp_path / "tmp.parquet")
376+
377+
assert before.a.type() == after.a.type()

0 commit comments

Comments (0)