
Commit c71a061

perf: inline small Series and DataFrames in query text
This prevents unnecessary load and query jobs. Towards internal issue 296474170.
1 parent 3adc1b3 commit c71a061

File tree: 9 files changed (+178 −48 lines)
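For context, a minimal sketch (not part of the commit) of the idea behind inlining: a small pandas DataFrame wrapped in `ibis.memtable` compiles to literal SQL text, so no load job or intermediate table is needed. The `ibis.to_sql` call and the example columns are illustrative only and assume the BigQuery dialect from `ibis-framework[bigquery] >=6.2.0` is installed.

```python
import ibis
import pandas as pd

# A small local frame; below the inline-size limit it never needs a load job.
small_df = pd.DataFrame({"col_0": [1, 2, 3], "col_1": ["a", "b", "c"]})
memtable = ibis.memtable(small_df)

# The rows are embedded directly in the generated query text (illustrative;
# assumes the "bigquery" dialect is registered by the installed extra).
print(ibis.to_sql(memtable, dialect="bigquery"))
```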

bigframes/core/__init__.py

+51-11
@@ -144,21 +144,56 @@ def mem_expr_from_pandas(
         """
         Builds an in-memory only (SQL only) expr from a pandas dataframe.

-        Caution: If session is None, only a subset of expr functionality will be available (null Session is usually not supported).
+        Caution: If session is None, only a subset of expr functionality will
+        be available (null Session is usually not supported).
         """
-        # must set non-null column labels. these are not the user-facing labels
-        pd_df = pd_df.set_axis(
-            [column or bigframes.core.guid.generate_guid() for column in pd_df.columns],
-            axis="columns",
-        )
+        # We can't include any hidden columns in the ArrayValue constructor, so
+        # grab the column names before we add the hidden ordering column.
+        column_names = [str(column) for column in pd_df.columns]
+        # Make sure column names are all strings.
+        pd_df = pd_df.set_axis(column_names, axis="columns")
         pd_df = pd_df.assign(**{ORDER_ID_COLUMN: range(len(pd_df))})
+
         # ibis memtable cannot handle NA, must convert to None
         pd_df = pd_df.astype("object")  # type: ignore
         pd_df = pd_df.where(pandas.notnull(pd_df), None)
+
+        # NULL type isn't valid in BigQuery, so retry with an explicit schema in these cases.
         keys_memtable = ibis.memtable(pd_df)
+        schema = keys_memtable.schema()
+        new_schema = []
+        for column_index, column in enumerate(schema):
+            if column == ORDER_ID_COLUMN:
+                new_type = ibis_dtypes.int64
+            else:
+                column_type = schema[column]
+                # The autodetected type might not be one we can support, such
+                # as NULL type for empty rows, so convert to a type we do
+                # support.
+                new_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(
+                    bigframes.dtypes.ibis_dtype_to_bigframes_dtype(column_type)
+                )
+            # TODO(swast): Ibis memtable doesn't use backticks in struct
+            # field names, so spaces and other characters aren't allowed in
+            # the memtable context. Blocked by
+            # https://github.com/ibis-project/ibis/issues/7187
+            column = f"col_{column_index}"
+            new_schema.append((column, new_type))
+
+        # must set non-null column labels. these are not the user-facing labels
+        pd_df = pd_df.set_axis(
+            [column for column, _ in new_schema],
+            axis="columns",
+        )
+        keys_memtable = ibis.memtable(pd_df, schema=ibis.schema(new_schema))
+
         return cls(
             session,  # type: ignore # Session cannot normally be none, see "caution" above
             keys_memtable,
+            columns=[
+                keys_memtable[f"col_{column_index}"].name(column)
+                for column_index, column in enumerate(column_names)
+            ],
             ordering=ExpressionOrdering(
                 ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
                 total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
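The explicit-schema retry above exists because ibis infers a NULL-like type for columns with no non-null values, and BigQuery cannot materialize a NULL-typed column. A small sketch of the failure mode and the fix, assuming `ibis.memtable` accepts an explicit `ibis.schema`:

```python
import ibis
import pandas

# An all-None column: type inference has nothing to go on.
empty_ish = pandas.DataFrame({"a": [None, None]}, dtype="object")
print(ibis.memtable(empty_ish).schema())  # expected: a null-like type for "a"

# Supplying an explicit schema pins a concrete, BigQuery-friendly type instead.
fixed = ibis.memtable(empty_ish, schema=ibis.schema([("a", "string")]))
print(fixed.schema())
```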
@@ -426,11 +461,16 @@ def shape(self) -> typing.Tuple[int, int]:
         width = len(self.columns)
         count_expr = self._to_ibis_expr(ordering_mode="unordered").count()
         sql = self._session.ibis_client.compile(count_expr)
-        row_iterator, _ = self._session._start_query(
-            sql=sql,
-            max_results=1,
-        )
-        length = next(row_iterator)[0]
+
+        # Support in-memory engines for hermetic unit tests.
+        if not isinstance(sql, str):
+            length = self._session.ibis_client.execute(count_expr)
+        else:
+            row_iterator, _ = self._session._start_query(
+                sql=sql,
+                max_results=1,
+            )
+            length = next(row_iterator)[0]
         return (length, width)

     def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue:
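The `isinstance(sql, str)` check works because in-memory ibis backends do not compile to a SQL string. A sketch of that hermetic path, assuming the ibis pandas backend (which the new unit tests appear to rely on via `resources.create_pandas_session`):

```python
import ibis
import pandas

# An in-memory backend backed by plain DataFrames; nothing touches BigQuery.
client = ibis.pandas.connect({"t": pandas.DataFrame({"x": [1, 2, 3]})})
count_expr = client.table("t").count()

compiled = client.compile(count_expr)
if not isinstance(compiled, str):
    # In-memory engine: execute the expression directly.
    length = client.execute(count_expr)
else:
    length = None  # the real code would issue a BigQuery query here
print(length)
```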

bigframes/core/blocks.py

+27-28
@@ -44,6 +44,7 @@
 import bigframes.dtypes
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
+import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common

 # Type constraint for wherever column labels are used
 Label = typing.Hashable
@@ -1522,37 +1523,35 @@ def _is_monotonic(
     return result


-def block_from_local(data, session=None, use_index=True) -> Block:
-    # TODO(tbergeron): Handle duplicate column labels
+def block_from_local(data, session=None) -> Block:
     pd_data = pd.DataFrame(data)
+    columns = pd_data.columns

-    column_labels = list(pd_data.columns)
-    if not all((label is None) or isinstance(label, str) for label in column_labels):
-        raise NotImplementedError(
-            f"Only string column labels supported. {constants.FEEDBACK_LINK}"
-        )
+    # Make a flattened version to treat as a table.
+    if len(pd_data.columns.names) > 1:
+        pd_data.columns = columns.to_flat_index()

-    if use_index:
-        if pd_data.index.nlevels > 1:
-            raise NotImplementedError(
-                f"multi-indices not supported. {constants.FEEDBACK_LINK}"
-            )
-        index_label = pd_data.index.name
-
-        index_id = guid.generate_guid()
-        pd_data = pd_data.reset_index(names=index_id)
-        keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session)
-        return Block(
-            keys_expr,
-            column_labels=column_labels,
-            index_columns=[index_id],
-            index_labels=[index_label],
-        )
-    else:
-        keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session)
-        keys_expr, offsets_id = keys_expr.promote_offsets()
-        # Constructor will create default range index
-        return Block(keys_expr, index_columns=[offsets_id], column_labels=column_labels)
+    index_labels = list(pd_data.index.names)
+    # The ArrayValue layer doesn't know about indexes, so make sure indexes
+    # are real columns with unique IDs.
+    pd_data = pd_data.reset_index(
+        names=[f"level_{level}" for level in range(len(index_labels))]
+    )
+    pd_data = pd_data.set_axis(
+        vendored_pandas_io_common.dedup_names(
+            pd_data.columns, is_potential_multiindex=False
+        ),
+        axis="columns",
+    )
+    index_ids = pd_data.columns[: len(index_labels)]
+
+    keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session)
+    return Block(
+        keys_expr,
+        column_labels=columns,
+        index_columns=index_ids,
+        index_labels=index_labels,
+    )


 def _align_block_to_schema(
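The rewritten `block_from_local` normalizes the local frame with pandas alone before handing it to `ArrayValue`: MultiIndex columns are flattened, every index level becomes a real `level_N` column, and names are deduplicated. A pandas-only sketch of the first two steps (the dedup step uses the vendored `pandas.io.common.dedup_names` helper and is omitted here):

```python
import pandas

df = pandas.DataFrame(
    [[0, 1], [2, 3]],
    columns=pandas.MultiIndex.from_tuples([("a", 1), ("a", 2)]),
    index=pandas.MultiIndex.from_tuples([(1, 1), (1, 2)]),
)

# Flatten MultiIndex columns into tuples so the frame is a plain table.
if len(df.columns.names) > 1:
    df.columns = df.columns.to_flat_index()

# Turn every index level into a real column with a predictable ID.
index_labels = list(df.index.names)
df = df.reset_index(names=[f"level_{level}" for level in range(len(index_labels))])

print(df.columns.tolist())  # ['level_0', 'level_1', ('a', 1), ('a', 2)]
```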

bigframes/dataframe.py

+1-3
@@ -65,9 +65,7 @@

 # BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type.
 # TODO(tbergeron): Convert to bytes-based limit
-# TODO(swast): Address issues with string escaping and empty tables before
-# re-enabling inline data (ibis.memtable) feature.
-MAX_INLINE_DF_SIZE = -1
+MAX_INLINE_DF_SIZE = 5000

 LevelType = typing.Union[str, int]
 LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]
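Re-enabling the limit means the DataFrame constructor can again choose between inlining and a load job based on size. A hypothetical sketch of such a gate (the constructor logic itself is not shown in this diff):

```python
MAX_INLINE_DF_SIZE = 5000  # value from this commit


def should_inline(pd_df) -> bool:
    # Hypothetical helper: small frames go into the query text, larger ones
    # fall back to a BigQuery load job.
    return len(pd_df) <= MAX_INLINE_DF_SIZE
```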

bigframes/dtypes.py

+3
@@ -155,6 +155,9 @@ def ibis_dtype_to_bigframes_dtype(

     if ibis_dtype in IBIS_TO_BIGFRAMES:
         return IBIS_TO_BIGFRAMES[ibis_dtype]
+    elif isinstance(ibis_dtype, ibis_dtypes.Null):
+        # Fallback to STRING for NULL values for most flexibility in SQL.
+        return IBIS_TO_BIGFRAMES[ibis_dtypes.string]
     else:
         raise ValueError(
             f"Unexpected Ibis data type {ibis_dtype}. {constants.FEEDBACK_LINK}"

bigframes/operations/base.py

+1-3
@@ -30,9 +30,7 @@

 # BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type.
 # TODO(tbergeron): Convert to bytes-based limit
-# TODO(swast): Address issues with string escaping and empty tables before
-# re-enabling inline data (ibis.memtable) feature.
-MAX_INLINE_SERIES_SIZE = -1
+MAX_INLINE_SERIES_SIZE = 5000


 class SeriesMethods:

setup.py

+2-2
@@ -44,12 +44,12 @@
     "google-cloud-resource-manager >=1.10.3",
     "google-cloud-storage >=2.0.0",
     # TODO: Relax upper bound once we have fixed `system_prerelease` tests.
-    "ibis-framework[bigquery] >=6.0.0,<=6.1.0",
+    "ibis-framework[bigquery] >=6.2.0,<7.0.0dev",
     "pandas >=1.5.0",
     "pydata-google-auth >=1.8.2",
     "requests >=2.27.1",
     "scikit-learn >=1.2.2",
-    "sqlalchemy >=1.4,<3.0",
+    "sqlalchemy >=1.4,<3.0dev",
     "ipywidgets >=7.7.1",
     "humanize >= 4.6.0",
 ]

testing/constraints-3.9.txt

+1-1
@@ -45,7 +45,7 @@ greenlet==2.0.2
 grpc-google-iam-v1==0.12.6
 grpcio==1.53.0
 grpcio-status==1.48.2
-ibis-framework==6.0.0
+ibis-framework==6.2.0
 humanize==4.6.0
 identify==2.5.22
 idna==3.4

tests/unit/core/__init__.py

+13
@@ -0,0 +1,13 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

tests/unit/core/test_blocks.py

+79
@@ -0,0 +1,79 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas
+import pandas.testing
+import pytest
+
+import bigframes.core.blocks as blocks
+
+from .. import resources
+
+
+@pytest.mark.parametrize(
+    ("data",),
+    (
+        pytest.param(
+            {"test 1": [1, 2, 3], "test 2": [0.25, 0.5, 0.75]},
+            id="dict_spaces_in_column_names",
+        ),
+        pytest.param(
+            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
+            id="nested_list",
+        ),
+        pytest.param(
+            pandas.concat(
+                [
+                    pandas.Series([1, 2, 3], name="some col"),
+                    pandas.Series([2, 3, 4], name="some col"),
+                ],
+                axis="columns",
+            ),
+            id="duplicate_column_names",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {"test": [1, 2, 3]},
+                index=pandas.Index(["a", "b", "c"], name="string index"),
+            ),
+            id="string_index",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]],
+                columns=pandas.MultiIndex.from_tuples(
+                    [(1, 1), (1, 2), (0, 0), (0, 1)],
+                    names=["some level", "another level"],
+                ),
+            ),
+            id="multiindex_columns",
+        ),
+        pytest.param(
+            pandas.DataFrame(
+                {"test": [1, 2, 3]},
+                index=pandas.MultiIndex.from_tuples([(1, 1), (1, 2), (0, 0)]),
+            ),
+            id="multiindex_rows",
+        ),
+    ),
+)
+def test_block_from_local(data):
+    expected = pandas.DataFrame(data)
+    session = resources.create_pandas_session({})
+
+    block = blocks.block_from_local(data, session=session)
+
+    pandas.testing.assert_index_equal(block.column_labels, expected.columns)
+    assert tuple(block.index_labels) == tuple(expected.index.names)
+    assert block.shape == expected.shape
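The test depends on a `resources.create_pandas_session` helper that is not part of this commit. A hypothetical sketch of what such a helper could look like, pairing a mocked `Session` with the in-memory ibis pandas backend so the test stays hermetic:

```python
from unittest import mock

import ibis

import bigframes.session


def create_pandas_session(tables: dict) -> bigframes.session.Session:
    # Hypothetical helper (not from this commit): a Session double whose
    # ibis_client is an in-memory pandas backend, so no BigQuery calls occur.
    session = mock.create_autospec(bigframes.session.Session, instance=True)
    session.ibis_client = ibis.pandas.connect(tables)
    return session
```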
