Skip to content

Commit 2ce891f

Browse files
feat: Support inlining small list, struct, json data (#1589)
1 parent 40c55a0 commit 2ce891f

File tree

7 files changed

+107
-36
lines changed

7 files changed

+107
-36
lines changed

bigframes/core/local_data.py

+8
Original file line numberDiff line numberDiff line change
@@ -71,5 +71,13 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
7171
if pa.types.is_large_string(type):
7272
# simple string type can handle the largest strings needed
7373
return pa.string()
74+
if pa.types.is_null(type):
75+
# the null type is not allowed; bigframes defaults to float64 instead
76+
return pa.float64()
77+
if pa.types.is_list(type):
78+
new_field_t = arrow_type_replacements(type.value_type)
79+
if new_field_t != type.value_type:
80+
return pa.list_(new_field_t)
81+
return type
7482
else:
7583
return type

bigframes/session/__init__.py

+4-15
Original file line numberDiff line numberDiff line change
@@ -108,20 +108,9 @@
108108

109109
logger = logging.getLogger(__name__)
110110

111-
# Excludes geography and nested (array, struct) datatypes
112-
INLINABLE_DTYPES: Sequence[bigframes.dtypes.Dtype] = (
113-
pandas.BooleanDtype(),
114-
pandas.Float64Dtype(),
115-
pandas.Int64Dtype(),
116-
pandas.StringDtype(storage="pyarrow"),
117-
pandas.ArrowDtype(pa.binary()),
118-
pandas.ArrowDtype(pa.date32()),
119-
pandas.ArrowDtype(pa.time64("us")),
120-
pandas.ArrowDtype(pa.timestamp("us")),
121-
pandas.ArrowDtype(pa.timestamp("us", tz="UTC")),
122-
pandas.ArrowDtype(pa.decimal128(38, 9)),
123-
pandas.ArrowDtype(pa.decimal256(76, 38)),
124-
pandas.ArrowDtype(pa.duration("us")),
111+
NON_INLINABLE_DTYPES: Sequence[bigframes.dtypes.Dtype] = (
112+
# Currently excluded because it does not have an Arrow type
113+
bigframes.dtypes.GEO_DTYPE,
125114
)
126115

127116

@@ -852,7 +841,7 @@ def _read_pandas_inline(
852841
# Make sure all types are inlinable to avoid escaping errors.
853842
inline_types = inline_df._block.expr.schema.dtypes
854843
noninlinable_types = [
855-
dtype for dtype in inline_types if dtype not in INLINABLE_DTYPES
844+
dtype for dtype in inline_types if dtype in NON_INLINABLE_DTYPES
856845
]
857846
if len(noninlinable_types) != 0:
858847
raise ValueError(

tests/system/small/test_series.py

+14
Original file line numberDiff line numberDiff line change
@@ -4342,6 +4342,20 @@ def test_series_explode_w_aggregate():
43424342
assert s.explode().sum() == pd_s.explode().sum()
43434343

43444344

4345+
@skip_legacy_pandas
4346+
def test_series_construct_empty_array():
4347+
s = bigframes.pandas.Series([[]])
4348+
expected = pd.Series(
4349+
[[]],
4350+
dtype=pd.ArrowDtype(pa.list_(pa.float64())),
4351+
index=pd.Index([0], dtype=pd.Int64Dtype()),
4352+
)
4353+
pd.testing.assert_series_equal(
4354+
expected,
4355+
s.to_pandas(),
4356+
)
4357+
4358+
43454359
@pytest.mark.parametrize(
43464360
("data"),
43474361
[

tests/system/small/test_session.py

+57-5
Original file line numberDiff line numberDiff line change
@@ -969,13 +969,11 @@ def test_read_pandas_json_index(session, write_engine):
969969
@pytest.mark.parametrize(
970970
("write_engine"),
971971
[
972-
pytest.param("default"),
973972
pytest.param("bigquery_load"),
974973
pytest.param("bigquery_streaming"),
975-
pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)),
976974
],
977975
)
978-
def test_read_pandas_w_nested_json(session, write_engine):
976+
def test_read_pandas_w_nested_json_fails(session, write_engine):
979977
data = [
980978
[{"json_field": "1"}],
981979
[{"json_field": None}],
@@ -995,16 +993,44 @@ def test_read_pandas_w_nested_json(session, write_engine):
995993
session.read_pandas(pd_s, write_engine=write_engine)
996994

997995

996+
@utils.skip_legacy_pandas
998997
@pytest.mark.parametrize(
999998
("write_engine"),
1000999
[
10011000
pytest.param("default"),
1001+
pytest.param("bigquery_inline"),
1002+
],
1003+
)
1004+
def test_read_pandas_inline_w_nested_json(session, write_engine):
1005+
data = [
1006+
[{"json_field": "1"}],
1007+
[{"json_field": None}],
1008+
[{"json_field": '["1","3","5"]'}],
1009+
[{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
1010+
]
1011+
pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())])))
1012+
pd_s = pd.Series(
1013+
arrays.ArrowExtensionArray(pa_array), # type: ignore
1014+
dtype=pd.ArrowDtype(
1015+
pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)]))
1016+
),
1017+
)
1018+
bq_s = (
1019+
session.read_pandas(pd_s, write_engine=write_engine)
1020+
.to_pandas()
1021+
.reset_index(drop=True)
1022+
)
1023+
pd.testing.assert_series_equal(bq_s, pd_s)
1024+
1025+
1026+
@pytest.mark.parametrize(
1027+
("write_engine"),
1028+
[
10021029
pytest.param("bigquery_load"),
10031030
pytest.param("bigquery_streaming"),
1004-
pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)),
10051031
],
10061032
)
1007-
def test_read_pandas_w_nested_json_index(session, write_engine):
1033+
def test_read_pandas_inline_w_nested_json_index_fails(session, write_engine):
10081034
data = [
10091035
[{"json_field": "1"}],
10101036
[{"json_field": None}],
@@ -1026,6 +1052,32 @@ def test_read_pandas_w_nested_json_index(session, write_engine):
10261052
session.read_pandas(pd_idx, write_engine=write_engine)
10271053

10281054

1055+
@utils.skip_legacy_pandas
1056+
@pytest.mark.parametrize(
1057+
("write_engine"),
1058+
[
1059+
pytest.param("default"),
1060+
pytest.param("bigquery_inline"),
1061+
],
1062+
)
1063+
def test_read_pandas_w_nested_json_index(session, write_engine):
1064+
data = [
1065+
[{"json_field": "1"}],
1066+
[{"json_field": None}],
1067+
[{"json_field": '["1","3","5"]'}],
1068+
[{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
1069+
]
1070+
pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())])))
1071+
pd_idx: pd.Index = pd.Index(
1072+
arrays.ArrowExtensionArray(pa_array), # type: ignore
1073+
dtype=pd.ArrowDtype(
1074+
pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)]))
1075+
),
1076+
)
1077+
bq_idx = session.read_pandas(pd_idx, write_engine=write_engine).to_pandas()
1078+
pd.testing.assert_index_equal(bq_idx, pd_idx)
1079+
1080+
10291081
@utils.skip_legacy_pandas
10301082
@pytest.mark.parametrize(
10311083
("write_engine",),

tests/unit/session/test_io_pandas.py

+15
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import pandas.testing
2525
import pyarrow # type: ignore
2626
import pytest
27+
import shapely # type: ignore
2728

2829
import bigframes.core.schema
2930
import bigframes.features
@@ -503,3 +504,17 @@ def test_read_pandas_with_bigframes_dataframe():
503504
ValueError, match=re.escape("read_pandas() expects a pandas.DataFrame")
504505
):
505506
session.read_pandas(df)
507+
508+
509+
def test_read_pandas_inline_w_noninlineable_type_raises_error():
510+
session = resources.create_bigquery_session()
511+
data = [
512+
shapely.Point(1, 1),
513+
shapely.Point(2, 1),
514+
shapely.Point(1, 2),
515+
]
516+
s = pandas.Series(data, dtype=geopandas.array.GeometryDtype())
517+
with pytest.raises(
518+
ValueError, match="Could not (convert|inline) with a BigQuery type:"
519+
):
520+
session.read_pandas(s, write_engine="bigquery_inline")

tests/unit/session/test_session.py

-15
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@
2121

2222
import google.api_core.exceptions
2323
import google.cloud.bigquery
24-
import google.cloud.bigquery.table
2524
import pandas as pd
26-
import pyarrow as pa
2725
import pytest
2826

2927
import bigframes
@@ -478,16 +476,3 @@ def test_read_pandas_inline_w_interval_type_raises_error():
478476
df = pd.DataFrame(pd.arrays.IntervalArray.from_breaks([0, 10, 20, 30, 40, 50]))
479477
with pytest.raises(ValueError, match="Could not convert with a BigQuery type: "):
480478
session.read_pandas(df, write_engine="bigquery_inline")
481-
482-
483-
def test_read_pandas_inline_w_noninlineable_type_raises_error():
484-
session = resources.create_bigquery_session()
485-
data = [
486-
[1, 2, 3],
487-
[4, 5],
488-
None,
489-
[6, 7, 8, 9],
490-
]
491-
s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
492-
with pytest.raises(ValueError, match="Could not inline with a BigQuery type:"):
493-
session.read_pandas(s, write_engine="bigquery_inline")

third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,10 @@ def visit_Literal(self, op, *, value, dtype):
706706
else return the result of the previous step.
707707
"""
708708
if value is None:
709+
if dtype.is_array():
710+
# hack: BigQuery arrays are effectively semi-nullable, but we treat them as non-nullable for simplicity
711+
# instead, use empty array as missing value sentinel
712+
return self.cast(self.f.array(), dtype)
709713
if dtype.nullable:
710714
return NULL if dtype.is_null() else self.cast(NULL, dtype)
711715
raise ibis_exceptions.UnsupportedOperationError(
@@ -763,15 +767,17 @@ def visit_DefaultLiteral(self, op, *, value, dtype):
763767
elif dtype.is_date():
764768
return self.f.datefromparts(value.year, value.month, value.day)
765769
elif dtype.is_array():
770+
# array type is ambiguous if no elements
766771
value_type = dtype.value_type
767-
return self.f.array(
772+
values = self.f.array(
768773
*(
769774
self.visit_Literal(
770775
ops.Literal(v, value_type), value=v, dtype=value_type
771776
)
772777
for v in value
773778
)
774779
)
780+
return values if len(value) > 0 else self.cast(values, dtype)
775781
elif dtype.is_map():
776782
key_type = dtype.key_type
777783
keys = self.f.array(
@@ -804,6 +810,8 @@ def visit_DefaultLiteral(self, op, *, value, dtype):
804810
return sge.Struct.from_arg_list(items)
805811
elif dtype.is_uuid():
806812
return self.cast(str(value), dtype)
813+
elif dtype.is_json():
814+
return sge.ParseJSON(this=sge.convert(str(value)))
807815
elif dtype.is_geospatial():
808816
args = [value.wkt]
809817
if (srid := dtype.srid) is not None:

0 commit comments

Comments
 (0)