Skip to content

perf: Simplify sum aggregate SQL text #1395

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions bigframes/core/compile/aggregate_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,7 @@ def _(
) -> ibis_types.NumericValue:
# Will be null if all inputs are null. Pandas defaults to zero sum though.
bq_sum = _apply_window_if_present(column.sum(), window)
- return (
- ibis_api.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore
- )
+ return bq_sum.fillna(ibis_types.literal(0))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Much more readable, thanks! Curious if it gives some SQL simplification and/or performance gains too?



@compile_unary_agg.register
Expand Down
5 changes: 0 additions & 5 deletions bigframes/core/compile/compiled.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,6 @@ def aggregate(
self,
aggregations: typing.Sequence[tuple[ex.Aggregation, str]],
by_column_ids: typing.Sequence[ex.DerefOp] = (),
- dropna: bool = True,
order_by: typing.Sequence[OrderingExpression] = (),
) -> UnorderedIR:
"""
Expand All @@ -230,10 +229,6 @@ def aggregate(
for aggregate, col_out in aggregations
}
if by_column_ids:
- if dropna:
- table = table.filter(
- [table[ref.id.sql].notnull() for ref in by_column_ids]
- )
result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate(
**stats
)
Expand Down
9 changes: 7 additions & 2 deletions bigframes/core/compile/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import google.cloud.bigquery
import pandas as pd

- from bigframes import dtypes
+ from bigframes import dtypes, operations
from bigframes.core import utils
import bigframes.core.compile.compiled as compiled
import bigframes.core.compile.concat as concat_impl
Expand Down Expand Up @@ -278,8 +278,13 @@ def compile_rowcount(self, node: nodes.RowCountNode):
def compile_aggregate(self, node: nodes.AggregateNode):
aggs = tuple((agg, id.sql) for agg, id in node.aggregations)
result = self.compile_node(node.child).aggregate(
- aggs, node.by_column_ids, node.dropna, order_by=node.order_by
+ aggs, node.by_column_ids, order_by=node.order_by
)
+ # TODO: Remove dropna field and use filter node instead
+ if node.dropna:
+ for key in node.by_column_ids:
+ if node.child.field_by_id[key.id].nullable:
+ result = result.filter(operations.notnull_op.as_expr(key))
return result

@_compile_node.register
Expand Down