Skip to content

Commit 65e65c1

Browse files
cpcloud authored and gforsyth committed
feat(duckdb): add read_json function for consuming newline-delimited JSON files
1 parent 0d319ca commit 65e65c1

File tree

3 files changed

+94
-3
lines changed

3 files changed

+94
-3
lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,12 @@
2626
from ibis.backends.duckdb.compiler import DuckDBSQLCompiler
2727
from ibis.backends.duckdb.datatypes import parse
2828

29-
# counters for in-memory, parquet, and csv reads
29+
# counters for in-memory, parquet, csv, and json reads
3030
# used if no table name is specified
3131
pd_n = itertools.count(0)
3232
pa_n = itertools.count(0)
3333
csv_n = itertools.count(0)
34+
json_n = itertools.count(0)
3435

3536

3637
def normalize_filenames(source_list):
@@ -209,6 +210,55 @@ def _register_failure(self):
209210
f"please call one of {msg} directly"
210211
)
211212

213+
def read_json(
    self,
    source_list: str | list[str] | tuple[str],
    table_name: str | None = None,
    **kwargs,
) -> ir.Table:
    """Read newline-delimited JSON into an ibis table.

    Parameters
    ----------
    source_list
        File or list of files
    table_name
        Optional table name; a counter-based name is generated when omitted
    kwargs
        Additional keyword arguments passed to DuckDB's `read_json_objects` function

    Returns
    -------
    Table
        An ibis table expression
    """
    # Fall back to a generated name driven by the module-level counter.
    table_name = table_name or f"ibis_read_json_{next(json_n)}"
    sources = normalize_filenames(source_list)

    # Table-valued expression yielding one row per raw JSON object.
    raw_objects = (
        sa.func.read_json_objects(
            sa.func.list_value(*sources), _format_kwargs(kwargs)
        )
        .table_valued("raw")
        .render_derived()
    )

    # Sample a single row to infer the schema; assumes the first row is
    # representative of the remaining data.
    structure_query = sa.select(sa.func.json_structure(raw_objects.c.raw)).limit(1)

    with self.begin() as con:
        structure = con.execute(structure_query).scalar()
        # Expand the transformed struct column into top-level columns.
        transformed = sa.select(sa.literal_column("s.*")).select_from(
            sa.select(
                sa.func.json_transform(raw_objects.c.raw, structure).label("s")
            ).subquery()
        )
        con.execute(_create_view(sa.table(table_name), transformed, or_replace=True))

    return self.table(table_name)
261+
212262
def read_csv(
213263
self,
214264
source_list: str | list[str] | tuple[str],

ibis/backends/duckdb/tests/test_register.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,19 @@ def test_read_parquet(data_directory):
1818
assert t.count().execute()
1919

2020

21+
def test_read_json(data_directory, tmp_path):
    """Round-trip parquet data through newline-delimited JSON and compare row counts."""
    parquet_table = ibis.read_parquet(data_directory / "functional_alltypes.parquet")

    # Write the same rows out as newline-delimited JSON.
    json_path = tmp_path / "ft.json"
    json_path.write_text(
        parquet_table.execute().to_json(orient="records", lines=True)
    )

    json_table = ibis.read_json(json_path)

    # Both representations must agree on a non-zero row count.
    expected_rows = parquet_table.count().execute()
    assert expected_rows
    assert json_table.count().execute() == expected_rows
33+
2134
def test_temp_directory(tmp_path):
2235
query = sa.text("SELECT value FROM duckdb_settings() WHERE name = 'temp_directory'")
2336

ibis/expr/api.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
trailing_window,
4848
window,
4949
)
50+
from ibis.util import experimental
5051

5152
if TYPE_CHECKING:
5253
import pandas as pd
@@ -142,6 +143,7 @@
142143
'random',
143144
'range_window',
144145
'read_csv',
146+
'read_json',
145147
'read_parquet',
146148
'row_number',
147149
'rows_with_max_lookback',
@@ -820,7 +822,7 @@ def row_number() -> ir.IntegerColumn:
820822
return ops.RowNumber().to_expr()
821823

822824

823-
def read_csv(sources: str | Path, **kwargs: Any) -> ir.Table:
825+
def read_csv(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
824826
"""Lazily load a CSV or set of CSVs.
825827
826828
Parameters
@@ -847,7 +849,33 @@ def read_csv(sources: str | Path, **kwargs: Any) -> ir.Table:
847849
return con.read_csv(sources, **kwargs)
848850

849851

850-
def read_parquet(sources: str | Path, **kwargs: Any) -> ir.Table:
852+
@experimental
def read_json(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
    """Lazily load newline-delimited JSON data.

    Parameters
    ----------
    sources
        A filesystem path or URL or list of same.
    kwargs
        DuckDB-specific keyword arguments for the file type.

    Returns
    -------
    ir.Table
        Table expression representing a file

    Examples
    --------
    >>> t = ibis.read_json("data.json")
    """
    # Deferred import avoids a circular dependency at module load time.
    from ibis.config import _default_backend

    backend = _default_backend()
    return backend.read_json(sources, **kwargs)
876+
877+
878+
def read_parquet(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
851879
"""Lazily load a parquet file or set of parquet files.
852880
853881
Parameters

0 commit comments

Comments (0)