Skip to content

Commit 4ccc6fc

Browse files
gforsyth authored and cpcloud committed
feat(duckdb): add register method to duckdb backend to load parquet and csv files
1 parent 4501f3a commit 4ccc6fc

File tree

2 files changed

+79
-0
lines changed

2 files changed

+79
-0
lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,51 @@ def do_connect(
6767
)
6868
self._meta = sa.MetaData(bind=self.con)
6969

70+
def register(
    self,
    file_name: str | Path,
    table_name: str | None = None,
) -> None:
    """Register an external file (csv or parquet) as a view in the current
    connection database.

    The file type is taken from an explicit ``csv://`` or ``parquet://``
    prefix when one is given; otherwise it is inferred from the file
    extension. Compressed csv files (e.g. ``data.csv.gz``) and file names
    containing extra dots (e.g. ``data.2020.csv``) are recognised.

    Parameters
    ----------
    file_name
        Name of the parquet or CSV file, optionally prefixed with
        ``csv://`` or ``parquet://``. Relative paths are resolved against
        the current working directory.
    table_name
        Name for the created view. Defaults to the file name with its
        extension removed and every character that is not alphanumeric or
        an underscore replaced by an underscore.

    Raises
    ------
    TypeError
        If the file is neither a csv nor a parquet file.
    """
    file_name = Path(file_name)
    parts = file_name.parts
    # pathlib splits "csv://x.csv" into ("csv:", "x.csv"), so a leading
    # component ending in ":" is the file-type prefix. `parts` is empty for
    # an empty path, hence the guard before indexing.
    if parts and parts[0].endswith(":"):
        prefix, *fname = parts
    else:
        prefix, fname = "file:", parts

    file_name = Path(*fname).absolute()

    # Use prefix for file_type. If omitted, infer from the file extension by
    # looking at each suffix individually, so that extra dots in the name
    # ("data.2020.csv") or a compression suffix ("data.csv.gz") do not hide
    # the real type.
    suffixes = [s.lstrip(".").lower() for s in file_name.suffixes]
    if prefix != "file:":
        file_type = prefix.strip(":")
    elif suffixes[-1:] == ["parquet"]:
        file_type = "parquet"
    elif "csv" in suffixes:
        file_type = "csv"
    else:
        file_type = ""

    if file_type == "parquet":
        reader = "read_parquet"
    elif file_type.startswith("csv"):
        reader = "read_csv_auto"
    else:
        raise TypeError(
            "Only csv and parquet files can be registered with DuckDB."
        )

    if not table_name:
        stem = file_name.stem
        # For compressed csv ("x.csv.gz") `stem` still carries ".csv".
        if stem.lower().endswith(".csv"):
            stem = stem[: -len(".csv")]
        # The name is interpolated unquoted into the statement, so keep only
        # characters that are safe in an unquoted identifier.
        table_name = "".join(
            ch if ch.isalnum() or ch == "_" else "_" for ch in stem
        )
        if table_name[:1].isdigit():
            table_name = f"_{table_name}"

    # Double any single quote so the path stays one SQL string literal.
    quoted_path = str(file_name).replace("'", "''")
    view = f"""
    CREATE VIEW {table_name} as SELECT * from
    {reader}('{quoted_path}')
    """

    self.con.execute(view)
114+
70115
def fetch_from_cursor(
71116
self,
72117
cursor: duckdb.DuckDBPyConnection,
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import contextlib
2+
import os
3+
4+
import pytest
5+
6+
import ibis
7+
8+
9+
@contextlib.contextmanager
def pushd(new_dir):
    """Temporarily change the current working directory to *new_dir*.

    The previous working directory is restored when the ``with`` block
    exits, including when the block raises an exception.
    """
    previous_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        # Restore even on error so a failure inside the block does not leave
        # the process in the wrong directory.
        os.chdir(previous_dir)
15+
16+
17+
@pytest.mark.parametrize(
    "fname, in_table_name, out_table_name",
    [
        ("diamonds.csv", None, "diamonds"),
        ("csv://diamonds.csv", "Diamonds", "Diamonds"),
        ("parquet://batting.parquet", None, "batting"),
        ("batting.parquet", "baseball", "baseball"),
    ],
)
def test_register_file(data_directory, fname, in_table_name, out_table_name):
    """Check that registering a csv/parquet file exposes a non-empty table.

    Each case registers ``fname`` (with or without a type prefix and with or
    without an explicit table name) and expects ``out_table_name`` to be
    listed by the connection and to contain at least one row.
    """
    connection = ibis.duckdb.connect()

    # File names in the parameters are relative, so register them from
    # inside the data directory.
    with pushd(data_directory):
        connection.register(fname, table_name=in_table_name)

    known_tables = connection.list_tables()
    assert out_table_name in known_tables

    row_count = connection.table(out_table_name).count().execute()
    assert row_count > 0

0 commit comments

Comments
 (0)