Skip to content

Commit 428d1a3

Browse files
gforsyth and cpcloud authored
feat(memtable): add pyarrow.dataset support (#10206)
Co-authored-by: Phillip Cloud <[email protected]>
1 parent 021df72 commit 428d1a3

File tree

5 files changed

+71
-8
lines changed

5 files changed

+71
-8
lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1609,7 +1609,15 @@ def _in_memory_table_exists(self, name: str) -> bool:
16091609
return True
16101610

16111611
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
    """Register an in-memory table's backing data with the DuckDB connection.

    Prefers a zero-copy pyarrow Dataset handle when the data proxy
    supports it, so DuckDB can push filters/projections into the
    dataset; otherwise falls back to materializing a pyarrow Table.
    """
    data, schema = op.data, op.schema
    try:
        # EAFP: only dataset-backed proxies define to_pyarrow_dataset
        registree = data.to_pyarrow_dataset(schema)
    except AttributeError:
        registree = data.to_pyarrow(schema)
    self.con.register(op.name, registree)
16131621

16141622
def _finalize_memtable(self, name: str) -> None:
16151623
# if we don't aggressively unregister tables duckdb will keep a

ibis/backends/tests/test_client.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -933,21 +933,22 @@ def test_self_join_memory_table(backend, con, monkeypatch):
933933
[
934934
"bigquery",
935935
"clickhouse",
936-
"duckdb",
937936
"exasol",
938937
"impala",
939938
"mssql",
940939
"mysql",
941940
"oracle",
942-
"polars",
943941
"postgres",
944942
"pyspark",
945943
"risingwave",
946944
"snowflake",
947945
"sqlite",
948946
"trino",
949-
]
947+
],
948+
raises=com.UnsupportedOperationError,
949+
reason="we don't materialize datasets to avoid perf footguns",
950950
),
951+
pytest.mark.notimpl(["polars"], raises=NotImplementedError),
951952
],
952953
id="pyarrow dataset",
953954
),

ibis/expr/api.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import pandas as pd
5151
import polars as pl
5252
import pyarrow as pa
53+
import pyarrow.dataset as ds
5354

5455
from ibis.expr.schema import SchemaLike
5556

@@ -480,6 +481,23 @@ def _memtable_from_pyarrow_table(
480481
).to_expr()
481482

482483

484+
@_memtable.register("pyarrow.dataset.Dataset")
def _memtable_from_pyarrow_dataset(
    data: ds.Dataset,
    *,
    name: str | None = None,
    schema: SchemaLike | None = None,
    columns: Iterable[str] | None = None,
):
    """Build a memtable expression over a pyarrow Dataset without materializing it.

    The dataset is wrapped in a ``PyArrowDatasetProxy`` so that only
    backends capable of dataset pushdown can consume it.

    NOTE(review): the ``schema`` and ``columns`` keywords are accepted for
    signature parity with the other ``_memtable`` registrations but are
    ignored here — the schema always comes from ``data.schema``. Confirm
    this is intentional upstream.
    """
    from ibis.formats.pyarrow import PyArrowDatasetProxy

    if name is None:
        name = util.gen_name("pyarrow_memtable")
    node = ops.InMemoryTable(
        name=name,
        schema=Schema.from_pyarrow(data.schema),
        data=PyArrowDatasetProxy(data),
    )
    return node.to_expr()
499+
500+
483501
@_memtable.register("polars.LazyFrame")
def _memtable_from_polars_lazyframe(data: pl.LazyFrame, **kwargs):
    # Eagerly collect the lazy frame, then reuse the DataFrame path.
    # NOTE(review): this materializes the full result in memory — callers
    # with very large lazy pipelines pay that cost here.
    return _memtable_from_polars_dataframe(data.collect(), **kwargs)

ibis/formats/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -239,9 +239,6 @@ def __repr__(self) -> str:
239239
data_repr = indent(repr(self.obj), spaces=2)
240240
return f"{self.__class__.__name__}:\n{data_repr}"
241241

242-
def __len__(self) -> int:
243-
return len(self.obj)
244-
245242
@abstractmethod
246243
def to_frame(self) -> pd.DataFrame: # pragma: no cover
247244
"""Convert this input to a pandas DataFrame."""

ibis/formats/pyarrow.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,18 @@
55
import pyarrow as pa
66
import pyarrow_hotfix # noqa: F401
77

8+
import ibis.common.exceptions as com
89
import ibis.expr.datatypes as dt
910
from ibis.expr.schema import Schema
1011
from ibis.formats import DataMapper, SchemaMapper, TableProxy, TypeMapper
12+
from ibis.util import V
1113

1214
if TYPE_CHECKING:
1315
from collections.abc import Sequence
1416

17+
import pandas as pd
1518
import polars as pl
19+
import pyarrow.dataset as ds
1620

1721

1822
_from_pyarrow_types = {
@@ -327,7 +331,7 @@ def convert_table(cls, table: pa.Table, schema: Schema) -> pa.Table:
327331
return table
328332

329333

330-
class PyArrowTableProxy(TableProxy):
334+
class PyArrowTableProxy(TableProxy[V]):
331335
def to_frame(self):
332336
return self.obj.to_pandas()
333337

@@ -341,3 +345,38 @@ def to_polars(self, schema: Schema) -> pl.DataFrame:
341345

342346
df = pl.from_arrow(self.obj)
343347
return PolarsData.convert_table(df, schema)
348+
349+
350+
class PyArrowDatasetProxy(TableProxy[V]):
    """TableProxy over a pyarrow Dataset that refuses to materialize.

    Only ``to_pyarrow_dataset`` succeeds; the pandas/pyarrow-Table/polars
    conversions raise ``UnsupportedOperationError`` because they would
    load the entire (possibly huge) dataset into local memory.
    """

    ERROR_MESSAGE = """\
You are trying to use a PyArrow Dataset with a backend that will require
materializing the entire dataset in local memory.

If you would like to materialize this dataset, please construct the memtable
directly by running `ibis.memtable(my_dataset.to_table())`."""

    __slots__ = ("obj",)
    # the wrapped pyarrow dataset
    obj: V

    def __init__(self, obj: V) -> None:
        self.obj = obj

    # pyarrow datasets are hashable, so we override the hash from TableProxy
    def __hash__(self):
        return hash(self.obj)

    def to_frame(self) -> pd.DataFrame:
        """Refuse pandas conversion: would materialize the whole dataset."""
        raise com.UnsupportedOperationError(self.ERROR_MESSAGE)

    def to_pyarrow(self, schema: Schema) -> pa.Table:
        """Refuse Table conversion: would materialize the whole dataset."""
        raise com.UnsupportedOperationError(self.ERROR_MESSAGE)

    def to_pyarrow_dataset(self, schema: Schema) -> ds.Dataset:
        """Return the dataset object itself.

        Use with backends that can perform pushdowns into dataset objects.
        """
        return self.obj

    # fix: return annotation was `pa.Table`; the polars conversion slot
    # returns a `pl.DataFrame` (cf. PyArrowTableProxy.to_polars)
    def to_polars(self, schema: Schema) -> pl.DataFrame:
        """Refuse polars conversion: would materialize the whole dataset."""
        raise com.UnsupportedOperationError(self.ERROR_MESSAGE)

0 commit comments

Comments
 (0)