Skip to content

Commit e67132c

Browse files
cpcloud authored and gforsyth committed
feat(api): add ibis.read top level API function
1 parent fc617e2 commit e67132c

File tree

4 files changed

+146
-63
lines changed

4 files changed

+146
-63
lines changed

docs/api/expressions/top_level.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ These methods and objects are available directly in the `ibis` module.
3131
::: ibis.to_sql
3232
::: ibis.random
3333
::: ibis.range_window
34+
::: ibis.read
3435
::: ibis.row_number
3536
::: ibis.schema
3637
::: ibis.struct

ibis/backends/duckdb/__init__.py

Lines changed: 76 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@
3434
_gen_table_names = (f"registered_table{i:d}" for i in itertools.count())
3535

3636

37-
def _name_from_path(path: Path) -> str:
38-
base, *_ = path.name.partition(os.extsep)
39-
return base.replace("-", "_")
37+
def _name_from_path(path: str) -> str:
    """Build a table name from ``path`` by turning every ``.`` into ``_``.

    See https://github.com/duckdb/duckdb/issues/5203 for the upstream issue
    this works around.
    """
    return "_".join(path.split("."))
4040

4141

4242
def _name_from_dataset(dataset: pa.dataset.FileSystemDataset) -> str:
@@ -47,49 +47,67 @@ def _quote(name: str):
4747
return _dialect.identifier_preparer.quote(name)
4848

4949

50-
@_generate_view_code.register(r"parquet://(?P<path>.+)", priority=10)
51-
def _parquet(_, path, table_name=None, **kwargs):
52-
path = Path(path).absolute()
53-
table_name = table_name or _name_from_path(path)
54-
quoted_table_name = _quote(table_name)
55-
args = [f"'{str(path)}'"]
56-
if kwargs:
57-
args.extend([f"{k}={v}" for k, v in kwargs.items()])
50+
def _get_scheme(scheme):
    """Normalize a URL scheme prefix.

    ``None`` and ``"file://"`` both collapse to the empty string; any other
    value is handed back unchanged.
    """
    return "" if scheme in (None, "file://") else scheme
54+
55+
56+
def _format_kwargs(kwargs):
    """Render keyword arguments as ``name=value`` fragments for a SQL call.

    Parameters
    ----------
    kwargs
        Mapping of option name to option value.

    Returns
    -------
    generator of str
        One ``name=value`` fragment per item, in mapping order. String
        values are emitted as single-quoted SQL literals; every other value
        is emitted using its ``repr``.
    """

    def _render(value):
        if isinstance(value, str):
            # Double embedded single quotes so a value such as "'" stays a
            # well-formed SQL string literal instead of ending it early.
            escaped = value.replace("'", "''")
            return f"'{escaped}'"
        return repr(value)

    return (f"{name}={_render(value)}" for name, value in kwargs.items())
6360

6461

65-
@_generate_view_code.register(r"csv(?:\.gz)?://(?P<path>.+)", priority=10)
66-
def _csv(_, path, table_name=None, **kwargs):
67-
path = Path(path).absolute()
68-
table_name = table_name or _name_from_path(path)
62+
@_generate_view_code.register(r"parquet://(?P<path>.+)", priority=13)
def _parquet(_, path, table_name=None, scheme=None, **kwargs):
    """Generate the SQL that exposes a Parquet source as a view.

    Parameters
    ----------
    _
        The full matched source string (unused).
    path
        Path portion of the source. When there is no remote scheme it is
        made absolute.
    table_name
        Name for the view; derived from ``path`` when falsy.
    scheme
        URL scheme prefix, normalized through ``_get_scheme``.
    kwargs
        Extra options forwarded to ``read_parquet``.

    Returns
    -------
    tuple
        ``(sql, table_name, extensions)`` where ``extensions`` lists the
        extension names the caller should load before running ``sql``.
    """
    scheme = _get_scheme(scheme)
    if not scheme:
        path = os.path.abspath(path)
    if not table_name:
        table_name = _name_from_path(path)
    quoted_table_name = _quote(table_name)
    args = [f"'{scheme}{path}'", *_format_kwargs(kwargs)]
    code = f"""\
CREATE OR REPLACE VIEW {quoted_table_name} AS
SELECT * FROM read_parquet({', '.join(args)})"""
    # The previous one-liner `["parquet"] + ["httpfs"] if scheme else []`
    # parsed as `(["parquet"] + ["httpfs"]) if scheme else []`, so local
    # files requested no extensions at all. Build the list explicitly:
    # "parquet" always, "httpfs" only for sources with a remote scheme.
    extensions = ["parquet"]
    if scheme:
        extensions.append("httpfs")
    return code, table_name, extensions
75+
76+
77+
@_generate_view_code.register(r"(c|t)sv://(?P<path>.+)", priority=13)
def _csv(_, path, table_name=None, scheme=None, **kwargs):
    """Generate the SQL that exposes a CSV/TSV source as a view.

    Returns ``(sql, table_name, extensions)``; ``extensions`` is
    ``["httpfs"]`` when the source has a non-file scheme and empty otherwise.
    """
    scheme = _get_scheme(scheme)
    is_remote = bool(scheme)
    if not is_remote:
        path = os.path.abspath(path)
    table_name = table_name or _name_from_path(path)
    view = _quote(table_name)

    # auto_detect and columns collide, so auto_detect defaults to True only
    # when no explicit columns were supplied; an explicit auto_detect wins.
    auto_detect = kwargs.pop("auto_detect", "columns" not in kwargs)
    call_args = ", ".join(
        [
            f"'{scheme}{path}'",
            f"auto_detect={auto_detect}",
            *_format_kwargs(kwargs),
        ]
    )
    sql = (
        f"CREATE OR REPLACE VIEW {view} AS\n"
        f"SELECT * FROM read_csv({call_args})"
    )
    extensions = ["httpfs"] if is_remote else []
    return sql, table_name, extensions
96+
97+
98+
@_generate_view_code.register(
    r"(?P<scheme>(?:file|https?)://)?(?P<path>.+?\.((?:c|t)sv|txt)(?:\.gz)?)",
    priority=12,
)
def _csv_file_or_url(_, path, table_name=None, **kwargs):
    """Route a bare path or file/http(s) URL with a CSV-like extension to `_csv`.

    The pattern accepts ``.csv``, ``.tsv`` and ``.txt`` names, optionally
    followed by ``.gz``. Remaining keyword arguments are forwarded as-is.
    """
    csv_uri = f"csv://{path}"
    return _csv(csv_uri, path=path, table_name=table_name, **kwargs)
86104

87105

88-
@_generate_view_code.register(r"(?:file://)?(?P<path>.+)", priority=9)
89-
def _file(_, path, table_name=None, **kwargs):
90-
num_sep_chars = len(os.extsep)
91-
extension = "".join(Path(path).suffixes)[num_sep_chars:]
92-
return _generate_view_code(f"{extension}://{path}", table_name=table_name, **kwargs)
106+
@_generate_view_code.register(
    r"(?P<scheme>(?:file|https?)://)?(?P<path>.+?\.parquet)", priority=12
)
def _parquet_file_or_url(_, path, table_name=None, **kwargs):
    """Route a bare path or file/http(s) URL ending in ``.parquet`` to `_parquet`.

    Remaining keyword arguments are forwarded as-is.
    """
    parquet_uri = f"parquet://{path}"
    return _parquet(parquet_uri, path=path, table_name=table_name, **kwargs)
93111

94112

95113
@_generate_view_code.register(r"s3://.+", priority=10)
@@ -100,17 +118,16 @@ def _s3(full_path, table_name=None):
100118
dataset = ds.dataset(full_path)
101119
table_name = table_name or _name_from_dataset(dataset)
102120
quoted_table_name = _quote(table_name)
103-
return quoted_table_name, dataset
121+
return quoted_table_name, dataset, ()
104122

105123

106124
@_generate_view_code.register(r".+", priority=1)
def _default(path, **kwargs):
    """Reject a source string by raising ``ValueError``.

    Registered for any non-empty string at priority 1. The error message
    names the offending ``path`` and lists the accepted prefixes and file
    extensions.
    """
    message = f"""Unrecognized file type or extension: {path}.

Valid prefixes are parquet://, csv://, tsv://, s3://, or file://
Supported file extensions are parquet, csv, tsv, txt, csv.gz, tsv.gz, and txt.gz
"""
    raise ValueError(message)
116133

@@ -180,6 +197,15 @@ def do_connect(
180197
)
181198
)
182199
self._meta = sa.MetaData(bind=self.con)
200+
self._extensions = set()
201+
202+
def _load_extensions(self, extensions):
    """Install and load each extension not already recorded on this backend.

    For every name in ``extensions`` that is missing from
    ``self._extensions``, a connection is opened, ``INSTALL`` and ``LOAD``
    are executed for that name, and the name is then remembered so later
    calls skip it.
    """
    for name in extensions:
        if name in self._extensions:
            continue
        with self.con.connect() as connection:
            connection.execute(f"INSTALL '{name}'")
            connection.execute(f"LOAD '{name}'")
        self._extensions.add(name)
183209

184210
def register(
185211
self,
@@ -210,7 +236,10 @@ def register(
210236
The just-registered table
211237
"""
212238
if isinstance(source, str) and source.startswith("s3://"):
213-
table_name, dataset = _generate_view_code(source, table_name=table_name)
239+
table_name, dataset, extensions_required = _generate_view_code(
240+
source, table_name=table_name
241+
)
242+
self._load_extensions(extensions_required)
214243
# We don't create a view since DuckDB special cases Arrow Datasets
215244
# so if we also create a view we end up with both a "lazy table"
216245
# and a view with the same name
@@ -221,9 +250,10 @@ def register(
221250
# explicitly.
222251
cursor.cursor.c.register(table_name, dataset)
223252
elif isinstance(source, (str, Path)):
224-
sql, table_name = _generate_view_code(
253+
sql, table_name, extensions_required = _generate_view_code(
225254
str(source), table_name=table_name, **kwargs
226255
)
256+
self._load_extensions(extensions_required)
227257
self.con.execute(sql)
228258
else:
229259
if table_name is None:

ibis/backends/duckdb/tests/test_register.py

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import csv
33
import gzip
44
import os
5+
import re
56
from pathlib import Path
67

78
import pytest
@@ -32,9 +33,9 @@ def gzip_csv(data_directory, tmp_path):
3233

3334

3435
@pytest.mark.parametrize(
35-
"fname, in_table_name, out_table_name",
36+
("fname", "in_table_name", "out_table_name"),
3637
[
37-
param("diamonds.csv", None, "diamonds", id="default"),
38+
param("diamonds.csv", None, None, id="default"),
3839
param("csv://diamonds.csv", "Diamonds", "Diamonds", id="csv_name"),
3940
param(
4041
"file://diamonds.csv",
@@ -55,14 +56,17 @@ def test_register_csv(
5556
data_directory, fname, in_table_name, out_table_name, ext, gzip_csv
5657
):
5758
con = ibis.duckdb.connect()
58-
if ext is not None:
59+
if ext:
5960
fname = gzip_csv
6061
with pushd(data_directory):
61-
con.register(fname, table_name=in_table_name)
62+
table = con.register(fname, table_name=in_table_name)
6263

63-
assert out_table_name in con.list_tables()
64+
if out_table_name is not None:
65+
out_table_name += (os.extsep * bool(ext) + (ext or "")) * (
66+
in_table_name is None
67+
)
68+
assert out_table_name in con.list_tables()
6469

65-
table = con.table(out_table_name)
6670
assert table.count().execute()
6771

6872

@@ -73,18 +77,17 @@ def test_register_with_dotted_name(data_directory, tmp_path):
7377
f.parent.mkdir()
7478
data = data_directory.joinpath("diamonds.csv").read_bytes()
7579
f.write_bytes(data)
76-
con.register(str(f.absolute()))
77-
table = con.table("diamonds")
80+
table = con.register(str(f.absolute()))
7881
assert table.count().execute()
7982

8083

8184
@pytest.mark.parametrize(
82-
"fname, in_table_name, out_table_name",
85+
("fname", "in_table_name", "out_table_name"),
8386
[
84-
(
87+
pytest.param(
8588
"parquet://functional_alltypes.parquet",
8689
None,
87-
"functional_alltypes",
90+
"functional_alltypes_parquet",
8891
),
8992
("functional_alltypes.parquet", "funk_all", "funk_all"),
9093
("parquet://functional_alltypes.parq", "funk_all", "funk_all"),
@@ -103,11 +106,10 @@ def test_register_parquet(
103106

104107
con = ibis.duckdb.connect()
105108
with pushd(tmp_path):
106-
con.register(f"parquet://{fname.name}", table_name=in_table_name)
109+
table = con.register(f"parquet://{fname.name}", table_name=in_table_name)
107110

108-
assert out_table_name in con.list_tables()
111+
assert any(out_table_name in t for t in con.list_tables())
109112

110-
table = con.table(out_table_name)
111113
assert table.count().execute()
112114

113115

@@ -137,10 +139,10 @@ def test_register_pyarrow_tables():
137139

138140
@pytest.mark.parametrize(
139141
"kwargs, expected_snippet",
140-
[({}, "AUTO_DETECT=True"), ({"COLUMNS": {"foo": "int8"}}, "AUTO_DETECT=False")],
142+
[({}, "auto_detect=True"), ({"columns": {"foo": "int8"}}, "auto_detect=False")],
141143
)
142144
def test_csv_register_kwargs(kwargs, expected_snippet):
143-
view_str, _ = _generate_view_code("bork.csv", **kwargs)
145+
view_str, _, _ = _generate_view_code("bork.csv", **kwargs)
144146
assert expected_snippet in view_str
145147

146148

@@ -167,3 +169,21 @@ def test_csv_reregister_schema(tmp_path):
167169
foo_table = con.register(foo, SAMPLE_SIZE=2)
168170
exp_schema = ibis.schema(dict(cola="int32", colb="int32", colc="int32"))
169171
assert foo_table.schema() == exp_schema
172+
173+
174+
def test_read_csv(data_directory):
    """`ibis.read` on a CSV path yields a table with a truthy row count."""
    csv_path = data_directory / "functional_alltypes.csv"
    table = ibis.read(csv_path)
    assert table.count().execute()
177+
178+
179+
def test_read_parquet(data_directory):
    """`ibis.read` on a Parquet path yields a table with a truthy row count."""
    parquet_path = data_directory / "functional_alltypes.parquet"
    table = ibis.read(parquet_path)
    assert table.count().execute()
182+
183+
184+
@pytest.mark.parametrize("basename", ["functional_alltypes.*", "df.xlsx"])
def test_read_invalid(data_directory, basename):
    """`ibis.read` raises ``ValueError`` naming the path for unsupported names."""
    bad_path = data_directory / basename
    # The path is escaped because it is interpolated into a regex pattern.
    expected = f"^Unrecognized file type or extension: {re.escape(str(bad_path))}"
    with pytest.raises(ValueError, match=expected):
        ibis.read(bad_path)

ibis/expr/api.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import functools
77
import itertools
88
import operator
9-
from typing import Iterable, Literal, Mapping, Sequence
9+
from pathlib import Path
10+
from typing import Any, Iterable, Literal, Mapping, Sequence
1011
from typing import Tuple as _Tuple
1112
from typing import TypeVar
1213
from typing import Union as _Union
@@ -198,6 +199,7 @@
198199
'pi',
199200
'random',
200201
'range_window',
202+
'read',
201203
'row_number',
202204
'rows_with_max_lookback',
203205
'schema',
@@ -871,6 +873,36 @@ def row_number() -> ir.IntegerColumn:
871873
return ops.RowNumber().to_expr()
872874

873875

876+
def read(path: str | Path, **kwargs: Any) -> ir.Table:
    """Lazily load the data source at `path` through the default backend.

    Parameters
    ----------
    path
        A filesystem path or URL. Supports CSV, TSV, and Parquet files.
    kwargs
        DuckDB-specific keyword arguments for the file type.

        * CSV/TSV: https://duckdb.org/docs/data/csv#parameters.
        * Parquet: https://duckdb.org/docs/data/parquet

    Returns
    -------
    ir.Table
        Table expression representing a file

    Examples
    --------
    >>> batting = ibis.read("ci/ibis-testing-data/batting.csv")
    >>> diamonds = ibis.read("ci/ibis-testing-data/parquet/diamonds/diamonds.parquet")
    >>> ft = ibis.read("parquet://ci/ibis-testing-data/parquet/functional_alltypes/*")
    """
    # Imported inside the function, as in the original.
    from ibis.config import _default_backend

    backend = _default_backend()
    # `register` is given a plain string, so Path inputs are converted here.
    source = str(path)
    return backend.register(source, **kwargs)
904+
905+
874906
e = ops.E().to_expr()
875907

876908
pi = ops.Pi().to_expr()

0 commit comments

Comments
 (0)