Skip to content

Commit 0fbcfe5

Browse files
perf: Automatically squash internal projection nodes and use internal schema system.
1 parent e8e66cf commit 0fbcfe5

File tree

6 files changed

+28
-44
lines changed

6 files changed

+28
-44
lines changed

bigframes/core/__init__.py

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,7 @@ def session(self) -> Session:
106106

107107
@functools.cached_property
108108
def schema(self) -> schemata.ArraySchema:
109-
# TODO: switch to use self.node.schema
110-
return self._compiled_schema
109+
return self.node.schema
111110

112111
@functools.cached_property
113112
def _compiled_schema(self) -> schemata.ArraySchema:
@@ -118,18 +117,6 @@ def _compiled_schema(self) -> schemata.ArraySchema:
118117
)
119118
return schemata.ArraySchema(items)
120119

121-
def validate_schema(self):
122-
tree_derived = self.node.schema
123-
ibis_derived = self._compiled_schema
124-
if tree_derived.names != ibis_derived.names:
125-
raise ValueError(
126-
f"Unexpected names internal {tree_derived.names} vs compiled {ibis_derived.names}"
127-
)
128-
if tree_derived.dtypes != ibis_derived.dtypes:
129-
raise ValueError(
130-
f"Unexpected types internal {tree_derived.dtypes} vs compiled {ibis_derived.dtypes}"
131-
)
132-
133120
def _try_evaluate_local(self):
134121
"""Use only for unit testing paths - not fully featured. Will throw exception if fails."""
135122
import ibis
@@ -196,7 +183,7 @@ def project_to_id(self, expression: ex.Expression, output_id: str):
196183
child=self.node,
197184
assignments=tuple(exprs),
198185
)
199-
)
186+
).rewrite_projection()
200187

201188
def assign(self, source_id: str, destination_id: str) -> ArrayValue:
202189
if destination_id in self.column_ids: # Mutate case
@@ -221,7 +208,7 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue:
221208
child=self.node,
222209
assignments=tuple(exprs),
223210
)
224-
)
211+
).rewrite_projection()
225212

226213
def assign_constant(
227214
self,
@@ -251,7 +238,7 @@ def assign_constant(
251238
child=self.node,
252239
assignments=tuple(exprs),
253240
)
254-
)
241+
).rewrite_projection()
255242

256243
def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
257244
selections = ((ex.free_var(col_id), col_id) for col_id in column_ids)
@@ -260,7 +247,7 @@ def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue:
260247
child=self.node,
261248
assignments=tuple(selections),
262249
)
263-
)
250+
).rewrite_projection()
264251

265252
def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
266253
new_projection = (
@@ -273,7 +260,7 @@ def drop_columns(self, columns: Iterable[str]) -> ArrayValue:
273260
child=self.node,
274261
assignments=tuple(new_projection),
275262
)
276-
)
263+
).rewrite_projection()
277264

278265
def aggregate(
279266
self,
@@ -404,3 +391,7 @@ def _uniform_sampling(self, fraction: float) -> ArrayValue:
404391
The row numbers of result is non-deterministic, avoid to use.
405392
"""
406393
return ArrayValue(nodes.RandomSampleNode(self.node, fraction))
394+
395+
def rewrite_projection(self) -> ArrayValue:
396+
rewritten = bigframes.core.rewrite.maybe_squash_projection(self.node)
397+
return ArrayValue(rewritten)

bigframes/core/compile/compiled.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,18 +1228,24 @@ def _bake_ordering(self) -> OrderedIR:
12281228
)
12291229
new_baked_cols.append(baked_column)
12301230
new_expr = OrderingExpression(
1231-
ex.free_var(baked_column.name), expr.direction, expr.na_last
1231+
ex.free_var(baked_column.get_name()), expr.direction, expr.na_last
12321232
)
12331233
new_exprs.append(new_expr)
1234-
else:
1234+
elif isinstance(expr.scalar_expression, ex.UnboundVariableExpression):
12351235
new_exprs.append(expr)
1236+
new_baked_cols.append(self._ibis_bindings[expr.scalar_expression.id])
12361237

1237-
ordering = self._ordering.with_ordering_columns(new_exprs)
1238+
new_ordering = ExpressionOrdering(
1239+
tuple(new_exprs),
1240+
self._ordering.integer_encoding,
1241+
self._ordering.string_encoding,
1242+
self._ordering.total_ordering_columns,
1243+
)
12381244
return OrderedIR(
12391245
self._table,
12401246
columns=self.columns,
1241-
hidden_ordering_columns=[*self._hidden_ordering_columns, *new_baked_cols],
1242-
ordering=ordering,
1247+
hidden_ordering_columns=tuple(new_baked_cols),
1248+
ordering=new_ordering,
12431249
predicates=self._predicates,
12441250
)
12451251

bigframes/core/rewrite.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,13 @@ def expand(self) -> nodes.BigFrameNode:
170170
return nodes.ProjectionNode(child=root, assignments=self.columns)
171171

172172

173+
def maybe_squash_projection(node: nodes.BigFrameNode) -> nodes.BigFrameNode:
174+
squashed = SquashedSelect.from_node(node)
175+
if squashed.root not in node.child_nodes:
176+
return squashed.expand()
177+
return node
178+
179+
173180
def maybe_rewrite_join(join_node: nodes.JoinNode) -> nodes.BigFrameNode:
174181
left_side = SquashedSelect.from_node(join_node.left_child)
175182
right_side = SquashedSelect.from_node(join_node.right_child)

bigframes/dataframe.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
from __future__ import annotations
1818

1919
import datetime
20-
import os
2120
import re
2221
import sys
2322
import textwrap
@@ -174,11 +173,6 @@ def __init__(
174173
self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
175174
self._query_job: Optional[bigquery.QueryJob] = None
176175

177-
# Runs strict validations to ensure internal type predictions and ibis are completely in sync
178-
# Do not execute these validations outside of testing suite.
179-
if "PYTEST_CURRENT_TEST" in os.environ:
180-
self._block.expr.validate_schema()
181-
182176
def __dir__(self):
183177
return dir(type(self)) + [
184178
label

bigframes/series.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
import functools
2020
import itertools
2121
import numbers
22-
import os
2322
import textwrap
2423
import typing
2524
from typing import Any, Literal, Mapping, Optional, Tuple, Union
@@ -72,11 +71,6 @@ def __init__(self, *args, **kwargs):
7271
self._query_job: Optional[bigquery.QueryJob] = None
7372
super().__init__(*args, **kwargs)
7473

75-
# Runs strict validations to ensure internal type predictions and ibis are completely in sync
76-
# Do not execute these validations outside of testing suite.
77-
if "PYTEST_CURRENT_TEST" in os.environ:
78-
self._block.expr.validate_schema()
79-
8074
@property
8175
def dt(self) -> dt.DatetimeMethods:
8276
return dt.DatetimeMethods(self._block)

tests/system/small/test_dataframe.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
import io
1616
import operator
17-
import sys
1817
import tempfile
1918
import typing
2019
from typing import Tuple
@@ -4034,13 +4033,6 @@ def test_df_dot_operator_series(
40344033
)
40354034

40364035

4037-
# TODO(tswast): We may be able to re-enable this test after we break large
4038-
# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427
4039-
@pytest.mark.skipif(
4040-
sys.version_info >= (3, 12),
4041-
# See: https://github.com/python/cpython/issues/112282
4042-
reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
4043-
)
40444036
def test_recursion_limit(scalars_df_index):
40454037
scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]]
40464038
for i in range(400):

0 commit comments

Comments
 (0)