Skip to content

Commit b57be01

Browse files
authored
feat(datafusion): enable reading multiple paths in read_csv (#10317)
1 parent 224bfc3 commit b57be01

File tree

3 files changed

+25
-6
lines changed

3 files changed

+25
-6
lines changed

ibis/backends/datafusion/__init__.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from ibis.common.dispatch import lazy_singledispatch
2929
from ibis.expr.operations.udf import InputType
3030
from ibis.formats.pyarrow import PyArrowSchema, PyArrowType
31-
from ibis.util import deprecated, gen_name, normalize_filename
31+
from ibis.util import deprecated, gen_name, normalize_filename, normalize_filenames
3232

3333
try:
3434
from datafusion import ExecutionContext as SessionContext
@@ -414,13 +414,16 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
414414
self.con.from_arrow(op.data.to_pyarrow(op.schema), op.name)
415415

416416
def read_csv(
417-
self, path: str | Path, table_name: str | None = None, **kwargs: Any
417+
self,
418+
source_list: str | Path | list[str | Path] | tuple[str | Path],
419+
table_name: str | None = None,
420+
**kwargs: Any,
418421
) -> ir.Table:
419422
"""Register a CSV file as a table in the current database.
420423
421424
Parameters
422425
----------
423-
path
426+
source_list
424427
The data source. A string or Path to the CSV file.
425428
table_name
426429
An optional name to use for the created table. This defaults to
@@ -434,9 +437,9 @@ def read_csv(
434437
The just-registered table
435438
436439
"""
437-
path = normalize_filename(path)
440+
path = normalize_filenames(source_list)
438441
table_name = table_name or gen_name("read_csv")
439-
# Our other backends support overwriting views / tables when reregistering
442+
# Our other backends support overwriting views / tables when re-registering
440443
self.con.deregister_table(table_name)
441444
self.con.register_csv(table_name, path, **kwargs)
442445
return self.table(table_name)

ibis/backends/datafusion/tests/test_register.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
import pathlib
4+
35
import pandas as pd
46
import pyarrow as pa
57
import pytest
@@ -17,6 +19,20 @@ def test_read_csv(conn, data_dir):
1719
assert t.count().execute()
1820

1921

22+
@pytest.mark.parametrize(
23+
"function",
24+
[pathlib.Path, str],
25+
)
26+
def test_read_csv_path_list(conn, data_dir, function):
27+
path = data_dir / "csv" / "functional_alltypes.csv"
28+
29+
t = conn.read_csv(path, table_name="alltypes1")
30+
t2 = conn.read_csv([function(path), function(path)], table_name="alltypes2")
31+
32+
assert t2.schema() == t.schema()
33+
assert t2.count().execute() == 2 * t.count().execute()
34+
35+
2036
def test_read_parquet(conn, data_dir):
2137
t = conn.read_parquet(data_dir / "parquet" / "functional_alltypes.parquet")
2238
assert t.count().execute()

ibis/backends/tests/test_signatures.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def _scrape_methods(modules, params):
107107
"read_csv": pytest.param(
108108
BaseBackend,
109109
"read_csv",
110-
marks=pytest.mark.notyet(["duckdb", "flink", "pyspark"]),
110+
marks=pytest.mark.notyet(["duckdb", "flink", "pyspark", "datafusion"]),
111111
),
112112
"read_delta": pytest.param(
113113
BaseBackend,

0 commit comments

Comments
 (0)