Skip to content

Commit 2ce891f

Browse files
feat: Support inlining small list, struct, json data (#1589)
1 parent 40c55a0 commit 2ce891f

File tree

7 files changed

+107
-36
lines changed

7 files changed

+107
-36
lines changed

bigframes/core/local_data.py

+8
Original file line numberDiff line numberDiff line change
@@ -71,5 +71,13 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
7171
if pa.types.is_large_string(type):
7272
# simple string type can handle the largest strings needed
7373
return pa.string()
74+
if pa.types.is_null(type):
75+
# the null type is not allowed; bigframes defaults to float64 instead
76+
return pa.float64()
77+
if pa.types.is_list(type):
78+
new_field_t = arrow_type_replacements(type.value_type)
79+
if new_field_t != type.value_type:
80+
return pa.list_(new_field_t)
81+
return type
7482
else:
7583
return type

bigframes/session/__init__.py

+4-15
Original file line numberDiff line numberDiff line change
@@ -108,20 +108,9 @@
108108

109109
logger = logging.getLogger(__name__)
110110

111-
# Excludes geography and nested (array, struct) datatypes
112-
INLINABLE_DTYPES: Sequence[bigframes.dtypes.Dtype] = (
113-
pandas.BooleanDtype(),
114-
pandas.Float64Dtype(),
115-
pandas.Int64Dtype(),
116-
pandas.StringDtype(storage="pyarrow"),
117-
pandas.ArrowDtype(pa.binary()),
118-
pandas.ArrowDtype(pa.date32()),
119-
pandas.ArrowDtype(pa.time64("us")),
120-
pandas.ArrowDtype(pa.timestamp("us")),
121-
pandas.ArrowDtype(pa.timestamp("us", tz="UTC")),
122-
pandas.ArrowDtype(pa.decimal128(38, 9)),
123-
pandas.ArrowDtype(pa.decimal256(76, 38)),
124-
pandas.ArrowDtype(pa.duration("us")),
111+
NON_INLINABLE_DTYPES: Sequence[bigframes.dtypes.Dtype] = (
112+
# Currently excluded because it does not have an Arrow type
113+
bigframes.dtypes.GEO_DTYPE,
125114
)
126115

127116

@@ -852,7 +841,7 @@ def _read_pandas_inline(
852841
# Make sure all types are inlinable to avoid escaping errors.
853842
inline_types = inline_df._block.expr.schema.dtypes
854843
noninlinable_types = [
855-
dtype for dtype in inline_types if dtype not in INLINABLE_DTYPES
844+
dtype for dtype in inline_types if dtype in NON_INLINABLE_DTYPES
856845
]
857846
if len(noninlinable_types) != 0:
858847
raise ValueError(

tests/system/small/test_series.py

+14
Original file line numberDiff line numberDiff line change
@@ -4342,6 +4342,20 @@ def test_series_explode_w_aggregate():
43424342
assert s.explode().sum() == pd_s.explode().sum()
43434343

43444344

4345+
@skip_legacy_pandas
4346+
def test_series_construct_empty_array():
4347+
s = bigframes.pandas.Series([[]])
4348+
expected = pd.Series(
4349+
[[]],
4350+
dtype=pd.ArrowDtype(pa.list_(pa.float64())),
4351+
index=pd.Index([0], dtype=pd.Int64Dtype()),
4352+
)
4353+
pd.testing.assert_series_equal(
4354+
expected,
4355+
s.to_pandas(),
4356+
)
4357+
4358+
43454359
@pytest.mark.parametrize(
43464360
("data"),
43474361
[

tests/system/small/test_session.py

+57-5
Original file line numberDiff line numberDiff line change
@@ -969,13 +969,11 @@ def test_read_pandas_json_index(session, write_engine):
969969
@pytest.mark.parametrize(
970970
("write_engine"),
971971
[
972-
pytest.param("default"),
973972
pytest.param("bigquery_load"),
974973
pytest.param("bigquery_streaming"),
975-
pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)),
976974
],
977975
)
978-
def test_read_pandas_w_nested_json(session, write_engine):
976+
def test_read_pandas_w_nested_json_fails(session, write_engine):
979977
data = [
980978
[{"json_field": "1"}],
981979
[{"json_field": None}],
@@ -995,16 +993,44 @@ def test_read_pandas_w_nested_json(session, write_engine):
995993
session.read_pandas(pd_s, write_engine=write_engine)
996994

997995

996+
@utils.skip_legacy_pandas
998997
@pytest.mark.parametrize(
999998
("write_engine"),
1000999
[
10011000
pytest.param("default"),
1001+
pytest.param("bigquery_inline"),
1002+
],
1003+
)
1004+
def test_read_pandas_inline_w_nested_json(session, write_engine):
1005+
data = [
1006+
[{"json_field": "1"}],
1007+
[{"json_field": None}],
1008+
[{"json_field": '["1","3","5"]'}],
1009+
[{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
1010+
]
1011+
pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())])))
1012+
pd_s = pd.Series(
1013+
arrays.ArrowExtensionArray(pa_array), # type: ignore
1014+
dtype=pd.ArrowDtype(
1015+
pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)]))
1016+
),
1017+
)
1018+
bq_s = (
1019+
session.read_pandas(pd_s, write_engine=write_engine)
1020+
.to_pandas()
1021+
.reset_index(drop=True)
1022+
)
1023+
pd.testing.assert_series_equal(bq_s, pd_s)
1024+
1025+
1026+
@pytest.mark.parametrize(
1027+
("write_engine"),
1028+
[
10021029
pytest.param("bigquery_load"),
10031030
pytest.param("bigquery_streaming"),
1004-
pytest.param("bigquery_inline", marks=pytest.mark.xfail(raises=ValueError)),
10051031
],
10061032
)
1007-
def test_read_pandas_w_nested_json_index(session, write_engine):
1033+
def test_read_pandas_inline_w_nested_json_index_fails(session, write_engine):
10081034
data = [
10091035
[{"json_field": "1"}],
10101036
[{"json_field": None}],
@@ -1026,6 +1052,32 @@ def test_read_pandas_w_nested_json_index(session, write_engine):
10261052
session.read_pandas(pd_idx, write_engine=write_engine)
10271053

10281054

1055+
@utils.skip_legacy_pandas
1056+
@pytest.mark.parametrize(
1057+
("write_engine"),
1058+
[
1059+
pytest.param("default"),
1060+
pytest.param("bigquery_inline"),
1061+
],
1062+
)
1063+
def test_read_pandas_w_nested_json_index(session, write_engine):
1064+
data = [
1065+
[{"json_field": "1"}],
1066+
[{"json_field": None}],
1067+
[{"json_field": '["1","3","5"]'}],
1068+
[{"json_field": '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}'}],
1069+
]
1070+
pa_array = pa.array(data, type=pa.list_(pa.struct([("name", pa.string())])))
1071+
pd_idx: pd.Index = pd.Index(
1072+
arrays.ArrowExtensionArray(pa_array), # type: ignore
1073+
dtype=pd.ArrowDtype(
1074+
pa.list_(pa.struct([("name", bigframes.dtypes.JSON_ARROW_TYPE)]))
1075+
),
1076+
)
1077+
bq_idx = session.read_pandas(pd_idx, write_engine=write_engine).to_pandas()
1078+
pd.testing.assert_index_equal(bq_idx, pd_idx)
1079+
1080+
10291081
@utils.skip_legacy_pandas
10301082
@pytest.mark.parametrize(
10311083
("write_engine",),

tests/unit/session/test_io_pandas.py

+15
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import pandas.testing
2525
import pyarrow # type: ignore
2626
import pytest
27+
import shapely # type: ignore
2728

2829
import bigframes.core.schema
2930
import bigframes.features
@@ -503,3 +504,17 @@ def test_read_pandas_with_bigframes_dataframe():
503504
ValueError, match=re.escape("read_pandas() expects a pandas.DataFrame")
504505
):
505506
session.read_pandas(df)
507+
508+
509+
def test_read_pandas_inline_w_noninlineable_type_raises_error():
510+
session = resources.create_bigquery_session()
511+
data = [
512+
shapely.Point(1, 1),
513+
shapely.Point(2, 1),
514+
shapely.Point(1, 2),
515+
]
516+
s = pandas.Series(data, dtype=geopandas.array.GeometryDtype())
517+
with pytest.raises(
518+
ValueError, match="Could not (convert|inline) with a BigQuery type:"
519+
):
520+
session.read_pandas(s, write_engine="bigquery_inline")

tests/unit/session/test_session.py

-15
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,7 @@
2121

2222
import google.api_core.exceptions
2323
import google.cloud.bigquery
24-
import google.cloud.bigquery.table
2524
import pandas as pd
26-
import pyarrow as pa
2725
import pytest
2826

2927
import bigframes
@@ -478,16 +476,3 @@ def test_read_pandas_inline_w_interval_type_raises_error():
478476
df = pd.DataFrame(pd.arrays.IntervalArray.from_breaks([0, 10, 20, 30, 40, 50]))
479477
with pytest.raises(ValueError, match="Could not convert with a BigQuery type: "):
480478
session.read_pandas(df, write_engine="bigquery_inline")
481-
482-
483-
def test_read_pandas_inline_w_noninlineable_type_raises_error():
484-
session = resources.create_bigquery_session()
485-
data = [
486-
[1, 2, 3],
487-
[4, 5],
488-
None,
489-
[6, 7, 8, 9],
490-
]
491-
s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
492-
with pytest.raises(ValueError, match="Could not inline with a BigQuery type:"):
493-
session.read_pandas(s, write_engine="bigquery_inline")

third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,10 @@ def visit_Literal(self, op, *, value, dtype):
706706
else return the result of the previous step.
707707
"""
708708
if value is None:
709+
if dtype.is_array():
710+
# hack: BigQuery arrays are effectively semi-nullable, but we treat them as non-nullable for simplicity
711+
# instead, use empty array as missing value sentinel
712+
return self.cast(self.f.array(), dtype)
709713
if dtype.nullable:
710714
return NULL if dtype.is_null() else self.cast(NULL, dtype)
711715
raise ibis_exceptions.UnsupportedOperationError(
@@ -763,15 +767,17 @@ def visit_DefaultLiteral(self, op, *, value, dtype):
763767
elif dtype.is_date():
764768
return self.f.datefromparts(value.year, value.month, value.day)
765769
elif dtype.is_array():
770+
# array type is ambiguous if no elements
766771
value_type = dtype.value_type
767-
return self.f.array(
772+
values = self.f.array(
768773
*(
769774
self.visit_Literal(
770775
ops.Literal(v, value_type), value=v, dtype=value_type
771776
)
772777
for v in value
773778
)
774779
)
780+
return values if len(value) > 0 else self.cast(values, dtype)
775781
elif dtype.is_map():
776782
key_type = dtype.key_type
777783
keys = self.f.array(
@@ -804,6 +810,8 @@ def visit_DefaultLiteral(self, op, *, value, dtype):
804810
return sge.Struct.from_arg_list(items)
805811
elif dtype.is_uuid():
806812
return self.cast(str(value), dtype)
813+
elif dtype.is_json():
814+
return sge.ParseJSON(this=sge.convert(str(value)))
807815
elif dtype.is_geospatial():
808816
args = [value.wkt]
809817
if (srid := dtype.srid) is not None:

0 commit comments

Comments
 (0)