Commit 75ea8f5

feat: Support inlining any small data instead of uploading
1 parent 45c9d9f commit 75ea8f5
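
Summary (from the diff below): the "default" write engine now chooses between inlining and a load job purely by in-memory size, so any pandas DataFrame at or under MAX_INLINE_DF_BYTES is inlined regardless of dtype. The INLINABLE_DTYPES allowlist check and the try/except fallback from inline to load job are removed, local data flows through pyarrow via ArrayValue.from_pyarrow, and the vendored ibis SQL compiler gains JSON literal support so JSON values can be inlined as well.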

5 files changed: +13 -65 lines

bigframes/core/array_value.py (+1)
@@ -60,6 +60,7 @@ class ArrayValue:
 
     @classmethod
     def from_pyarrow(cls, arrow_table: pa.Table, session: Session):
+        # TODO: we might need to adapt some unruly types, and even cast after in bigquery
        adapted_table = local_data.adapt_pa_table(arrow_table)
        schema = local_data.arrow_schema_to_bigframes(adapted_table.schema)
 
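For context, a minimal sketch of how this entry point is reached. It assumes an already-created bigframes Session named `session`, and uses the `core.ArrayValue` spelling the diff itself uses; `adapt_pa_table` is internal and is not called directly.

import pyarrow as pa

import bigframes.core as core

# Hypothetical local data; from_pyarrow adapts the table's types to
# bigframes-compatible ones before deriving the schema.
arrow_table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
array_value = core.ArrayValue.from_pyarrow(arrow_table, session=session)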

bigframes/core/blocks.py (+1 -1)
@@ -176,7 +176,6 @@ def from_local(
         *,
         cache_transpose: bool = True,
     ) -> Block:
-        # Assumes caller has already converted datatypes to bigframes ones.
         pd_data = data
         column_labels = pd_data.columns
         index_labels = list(pd_data.index.names)
@@ -187,6 +186,7 @@ def from_local(
 
         pd_data = pd_data.set_axis(column_ids, axis=1)
         pd_data = pd_data.reset_index(names=index_ids)
+        # TODO: We need to
         as_pyarrow = pa.Table.from_pandas(pd_data, preserve_index=False)
         array_value = core.ArrayValue.from_pyarrow(as_pyarrow, session=session)
         block = cls(
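As a standalone illustration of the conversion step above (plain pandas and pyarrow, no bigframes required): the index levels are first materialized as ordinary columns, so the pandas index itself can be dropped during the pyarrow conversion.

import pandas as pd
import pyarrow as pa

pd_data = pd.DataFrame({"x": [1, 2]}, index=pd.Index(["a", "b"], name="key"))
# Mirror from_local: move the index into regular columns first...
pd_data = pd_data.reset_index(names=["key"])
# ...then convert without the (now redundant) pandas index.
as_pyarrow = pa.Table.from_pandas(pd_data, preserve_index=False)
print(as_pyarrow.column_names)  # ['key', 'x']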

bigframes/session/__init__.py (+8 -38)
@@ -794,13 +794,12 @@ def _read_pandas(
         )
 
         if write_engine == "default":
-            try:
-                inline_df = self._read_pandas_inline(pandas_dataframe)
-                return inline_df
-            except ValueError:
-                pass
-            return self._read_pandas_load_job(pandas_dataframe, api_name)
-        elif write_engine == "bigquery_inline":
+            is_df_large = (
+                pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES
+            )
+            write_engine = "bigquery_load" if is_df_large else "bigquery_inline"
+
+        if write_engine == "bigquery_inline":
             return self._read_pandas_inline(pandas_dataframe)
         elif write_engine == "bigquery_load":
             return self._read_pandas_load_job(pandas_dataframe, api_name)
@@ -814,37 +813,8 @@ def _read_pandas_inline(
     ) -> dataframe.DataFrame:
         import bigframes.dataframe as dataframe
 
-        memory_usage = pandas_dataframe.memory_usage(deep=True).sum()
-        if memory_usage > MAX_INLINE_DF_BYTES:
-            raise ValueError(
-                f"DataFrame size ({memory_usage} bytes) exceeds the maximum allowed "
-                f"for inline data ({MAX_INLINE_DF_BYTES} bytes)."
-            )
-
-        try:
-            local_block = blocks.Block.from_local(pandas_dataframe, self)
-            inline_df = dataframe.DataFrame(local_block)
-        except (
-            pa.ArrowInvalid,  # Thrown by arrow for unsupported types, such as geo.
-            pa.ArrowTypeError,  # Thrown by arrow for types without mapping (geo).
-            ValueError,  # Thrown by ibis for some unhandled types
-            TypeError,  # Not all types handleable by local code path
-        ) as exc:
-            raise ValueError(
-                f"Could not convert with a BigQuery type: `{exc}`. "
-            ) from exc
-
-        # Make sure all types are inlinable to avoid escaping errors.
-        inline_types = inline_df._block.expr.schema.dtypes
-        noninlinable_types = [
-            dtype for dtype in inline_types if dtype not in INLINABLE_DTYPES
-        ]
-        if len(noninlinable_types) != 0:
-            raise ValueError(
-                f"Could not inline with a BigQuery type: `{noninlinable_types}`. "
-                f"{constants.FEEDBACK_LINK}"
-            )
-
+        local_block = blocks.Block.from_local(pandas_dataframe, self)
+        inline_df = dataframe.DataFrame(local_block)
         return inline_df
 
     def _read_pandas_load_job(
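The net effect on engine selection can be sketched in isolation. The 5,000-byte threshold below is an illustrative stand-in: the real MAX_INLINE_DF_BYTES constant lives in bigframes.session and its value is not shown in this diff.

import pandas as pd

MAX_INLINE_DF_BYTES = 5000  # stand-in value, not the real constant

def choose_write_engine(pandas_dataframe: pd.DataFrame) -> str:
    # Same rule as the new "default" branch: size decides, not dtype.
    is_df_large = pandas_dataframe.memory_usage(deep=True).sum() > MAX_INLINE_DF_BYTES
    return "bigquery_load" if is_df_large else "bigquery_inline"

print(choose_write_engine(pd.DataFrame({"x": [1, 2, 3]})))       # bigquery_inline
print(choose_write_engine(pd.DataFrame({"x": range(100_000)})))  # bigquery_load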

tests/unit/session/test_session.py (+1 -26)
@@ -23,7 +23,6 @@
 import google.cloud.bigquery
 import google.cloud.bigquery.table
 import pandas as pd
-import pyarrow as pa
 import pytest
 
 import bigframes
@@ -462,32 +461,8 @@ def today(cls):
     resources.create_bigquery_session()
 
 
-@mock.patch("bigframes.session.MAX_INLINE_DF_BYTES", 1)
-def test_read_pandas_inline_exceeds_limit_raises_error():
-    session = resources.create_bigquery_session()
-    pd_df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
-    with pytest.raises(
-        ValueError,
-        match=r"DataFrame size \(.* bytes\) exceeds the maximum allowed for inline data \(1 bytes\)\.",
-    ):
-        session.read_pandas(pd_df, write_engine="bigquery_inline")
-
-
 def test_read_pandas_inline_w_interval_type_raises_error():
     session = resources.create_bigquery_session()
     df = pd.DataFrame(pd.arrays.IntervalArray.from_breaks([0, 10, 20, 30, 40, 50]))
-    with pytest.raises(ValueError, match="Could not convert with a BigQuery type: "):
+    with pytest.raises(TypeError):
         session.read_pandas(df, write_engine="bigquery_inline")
-
-
-def test_read_pandas_inline_w_noninlineable_type_raises_error():
-    session = resources.create_bigquery_session()
-    data = [
-        [1, 2, 3],
-        [4, 5],
-        None,
-        [6, 7, 8, 9],
-    ]
-    s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64())))
-    with pytest.raises(ValueError, match="Could not inline with a BigQuery type:"):
-        session.read_pandas(s, write_engine="bigquery_inline")
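The loosened assertion still pins the failure down: with the ValueError wrapper gone, the conversion error from the local path propagates as-is, and pyarrow's exception types subclass the matching builtins. A quick check, assuming the interval failure indeed surfaces from pyarrow as the old except clause suggested:

import pyarrow as pa

# pyarrow's errors inherit from the corresponding builtins, so
# pytest.raises(TypeError) also catches a raw pa.ArrowTypeError.
print(issubclass(pa.ArrowTypeError, TypeError))  # True
print(issubclass(pa.ArrowInvalid, ValueError))   # True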

third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py (+2)
@@ -804,6 +804,8 @@ def visit_DefaultLiteral(self, op, *, value, dtype):
             return sge.Struct.from_arg_list(items)
         elif dtype.is_uuid():
             return self.cast(str(value), dtype)
+        elif dtype.is_json():
+            return sge.ParseJSON(this=sge.convert(str(value)))
         elif dtype.is_geospatial():
             args = [value.wkt]
             if (srid := dtype.srid) is not None:
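What the new branch emits can be previewed with sqlglot directly. A sketch: `sge` is sqlglot.expressions in this compiler, and the rendered spelling may vary across sqlglot versions and dialects.

import sqlglot.expressions as sge

# Build the same node the compiler returns for a JSON literal...
expr = sge.ParseJSON(this=sge.convert('{"a": 1}'))
# ...and render it; BigQuery spells this PARSE_JSON('...').
print(expr.sql(dialect="bigquery"))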
