Skip to content

Commit 65e65c1

Browse files
cpcloud authored and gforsyth committed
feat(duckdb): add read_json function for consuming newline-delimited JSON files
1 parent 0d319ca commit 65e65c1

File tree

3 files changed

+94
-3
lines changed

3 files changed

+94
-3
lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,12 @@
2626
from ibis.backends.duckdb.compiler import DuckDBSQLCompiler
2727
from ibis.backends.duckdb.datatypes import parse
2828

29-
# counters for in-memory, parquet, and csv reads
29+
# counters for in-memory, parquet, csv, and json reads
3030
# used if no table name is specified
3131
pd_n = itertools.count(0)
3232
pa_n = itertools.count(0)
3333
csv_n = itertools.count(0)
34+
json_n = itertools.count(0)
3435

3536

3637
def normalize_filenames(source_list):
@@ -209,6 +210,55 @@ def _register_failure(self):
209210
f"please call one of {msg} directly"
210211
)
211212

213+
def read_json(
    self,
    source_list: str | list[str] | tuple[str],
    table_name: str | None = None,
    **kwargs,
) -> ir.Table:
    """Read newline-delimited JSON into an ibis table.

    Parameters
    ----------
    source_list
        File or list of files
    table_name
        Optional table name; a counter-based name is generated when omitted
    kwargs
        Additional keyword arguments passed to DuckDB's `read_json_objects` function

    Returns
    -------
    Table
        An ibis table expression
    """
    # Fall back to a generated name driven by the module-level counter.
    table_name = table_name or f"ibis_read_json_{next(json_n)}"
    sources = normalize_filenames(source_list)

    # Table-valued expression yielding one row per raw JSON object.
    raw_objects = (
        sa.func.read_json_objects(
            sa.func.list_value(*sources), _format_kwargs(kwargs)
        )
        .table_valued("raw")
        .render_derived()
    )

    # Sample a single row to infer the schema; assumes the first row is
    # representative of the remaining data.
    structure_query = sa.select(sa.func.json_structure(raw_objects.c.raw)).limit(1)

    with self.begin() as con:
        structure = con.execute(structure_query).scalar()
        # Expand the transformed struct column into top-level columns.
        transformed = sa.select(sa.literal_column("s.*")).select_from(
            sa.select(
                sa.func.json_transform(raw_objects.c.raw, structure).label("s")
            ).subquery()
        )
        con.execute(_create_view(sa.table(table_name), transformed, or_replace=True))

    return self.table(table_name)
261+
212262
def read_csv(
213263
self,
214264
source_list: str | list[str] | tuple[str],

ibis/backends/duckdb/tests/test_register.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,19 @@ def test_read_parquet(data_directory):
1818
assert t.count().execute()
1919

2020

21+
def test_read_json(data_directory, tmp_path):
    """Round-trip parquet data through newline-delimited JSON and compare row counts."""
    parquet_table = ibis.read_parquet(data_directory / "functional_alltypes.parquet")

    # Write the same rows out as newline-delimited JSON.
    json_path = tmp_path / "ft.json"
    json_path.write_text(
        parquet_table.execute().to_json(orient="records", lines=True)
    )

    json_table = ibis.read_json(json_path)

    # Both representations must agree on a non-zero row count.
    expected_rows = parquet_table.count().execute()
    assert expected_rows
    assert json_table.count().execute() == expected_rows
33+
2134
def test_temp_directory(tmp_path):
2235
query = sa.text("SELECT value FROM duckdb_settings() WHERE name = 'temp_directory'")
2336

ibis/expr/api.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
trailing_window,
4848
window,
4949
)
50+
from ibis.util import experimental
5051

5152
if TYPE_CHECKING:
5253
import pandas as pd
@@ -142,6 +143,7 @@
142143
'random',
143144
'range_window',
144145
'read_csv',
146+
'read_json',
145147
'read_parquet',
146148
'row_number',
147149
'rows_with_max_lookback',
@@ -820,7 +822,7 @@ def row_number() -> ir.IntegerColumn:
820822
return ops.RowNumber().to_expr()
821823

822824

823-
def read_csv(sources: str | Path, **kwargs: Any) -> ir.Table:
825+
def read_csv(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
824826
"""Lazily load a CSV or set of CSVs.
825827
826828
Parameters
@@ -847,7 +849,33 @@ def read_csv(sources: str | Path, **kwargs: Any) -> ir.Table:
847849
return con.read_csv(sources, **kwargs)
848850

849851

850-
def read_parquet(sources: str | Path, **kwargs: Any) -> ir.Table:
852+
@experimental
def read_json(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
    """Lazily load newline-delimited JSON data.

    Parameters
    ----------
    sources
        A filesystem path or URL or list of same.
    kwargs
        DuckDB-specific keyword arguments for the file type.

    Returns
    -------
    ir.Table
        Table expression representing a file

    Examples
    --------
    >>> t = ibis.read_json("data.json")
    """
    # Deferred import avoids a circular dependency at module load time.
    from ibis.config import _default_backend

    backend = _default_backend()
    return backend.read_json(sources, **kwargs)
876+
877+
878+
def read_parquet(sources: str | Path | Sequence[str | Path], **kwargs: Any) -> ir.Table:
851879
"""Lazily load a parquet file or set of parquet files.
852880
853881
Parameters

0 commit comments

Comments (0)