From 3c7698df31847663b5ca6e8f595b237fc7b5ec8f Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 23 Oct 2024 20:01:32 +0000 Subject: [PATCH 01/10] perf: improve series.unique performance and replace drop_duplicates in tpch test. --- bigframes/series.py | 8 ++++++-- third_party/bigframes_vendored/tpch/queries/q20.py | 7 ++----- third_party/bigframes_vendored/tpch/queries/q22.py | 4 +--- third_party/bigframes_vendored/tpch/queries/q4.py | 5 +---- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 1a913f18d7..7625e98043 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1601,9 +1601,13 @@ def drop_duplicates(self, *, keep: str = "first") -> Series: block = block_ops.drop_duplicates(self._block, (self._value_column,), keep) return Series(block) - @validations.requires_ordering() def unique(self) -> Series: - return self.drop_duplicates() + block, results = self._block.aggregate( + [self._value_column], + [(self._value_column, agg_ops.AnyValueOp())], + dropna=False, + ) + return Series(block.select_columns(results)) def duplicated(self, keep: str = "first") -> Series: if keep is not False: diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py index 26651a31c4..5136fb6727 100644 --- a/third_party/bigframes_vendored/tpch/queries/q20.py +++ b/third_party/bigframes_vendored/tpch/queries/q20.py @@ -46,7 +46,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): if not session._strictly_ordered: filtered_parts = filtered_parts[["P_PARTKEY"]].sort_values(by=["P_PARTKEY"]) - filtered_parts = filtered_parts[["P_PARTKEY"]].drop_duplicates() + filtered_parts = filtered_parts["P_PARTKEY"].unique().to_frame() joined_parts = filtered_parts.merge( partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY" ) @@ -56,10 +56,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) final_filtered = final_join[final_join["PS_AVAILQTY"] > final_join["SUM_QUANTITY"]] - final_filtered = final_filtered[["PS_SUPPKEY"]] - if not session._strictly_ordered: - final_filtered = final_filtered.sort_values(by="PS_SUPPKEY") - final_filtered = final_filtered.drop_duplicates() + final_filtered = final_filtered["PS_SUPPKEY"].unique().to_frame() final_result = final_filtered.merge(q3, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") final_result = final_result[["S_NAME", "S_ADDRESS"]].sort_values(by="S_NAME") diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py index 137a7d5c36..07d9c9e19b 100644 --- a/third_party/bigframes_vendored/tpch/queries/q22.py +++ b/third_party/bigframes_vendored/tpch/queries/q22.py @@ -22,9 +22,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): (customer["CNTRYCODE"].isin(country_codes)) & (customer["C_ACCTBAL"] > 0) ]["C_ACCTBAL"].mean() - if not session._strictly_ordered: - orders = orders.sort_values(by="O_CUSTKEY") - orders_unique = orders.drop_duplicates(subset=["O_CUSTKEY"]) + orders_unique = orders["O_CUSTKEY"].unique().to_frame() matched_customers = customer.merge( orders_unique, left_on="C_CUSTKEY", right_on="O_CUSTKEY" diff --git a/third_party/bigframes_vendored/tpch/queries/q4.py b/third_party/bigframes_vendored/tpch/queries/q4.py index b89f70845f..d149a71f71 100644 --- a/third_party/bigframes_vendored/tpch/queries/q4.py +++ b/third_party/bigframes_vendored/tpch/queries/q4.py @@ -26,10 +26,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): jn = jn[(jn["O_ORDERDATE"] >= var1) & (jn["O_ORDERDATE"] < var2)] jn = jn[jn["L_COMMITDATE"] < jn["L_RECEIPTDATE"]] - if not session._strictly_ordered: - jn = jn.sort_values(by=["O_ORDERPRIORITY", "L_ORDERKEY"]) - - jn = jn.drop_duplicates(subset=["O_ORDERPRIORITY", "L_ORDERKEY"]) + jn = jn.groupby(["O_ORDERPRIORITY", "L_ORDERKEY"], as_index=False).agg("size") gb = jn.groupby("O_ORDERPRIORITY", as_index=False) agg = gb.agg(ORDER_COUNT=bpd.NamedAgg(column="L_ORDERKEY", aggfunc="count")) From 78b59ef7ea2de8a8488a4ade5457750311b9e49b Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 23 Oct 2024 20:02:51 +0000 Subject: [PATCH 02/10] change variable name --- bigframes/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 7625e98043..375ff96372 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1602,12 +1602,12 @@ def drop_duplicates(self, *, keep: str = "first") -> Series: return Series(block) def unique(self) -> Series: - block, results = self._block.aggregate( + block, result = self._block.aggregate( [self._value_column], [(self._value_column, agg_ops.AnyValueOp())], dropna=False, ) - return Series(block.select_columns(results)) + return Series(block.select_columns(result)) def duplicated(self, keep: str = "first") -> Series: if keep is not False: From 27b0f4c5fe6ff59369a805820d1340e80838d898 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 23 Oct 2024 21:03:11 +0000 Subject: [PATCH 03/10] update docstring and index. --- bigframes/series.py | 2 +- third_party/bigframes_vendored/pandas/core/series.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 375ff96372..9443a6f7dc 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1607,7 +1607,7 @@ def unique(self) -> Series: [(self._value_column, agg_ops.AnyValueOp())], dropna=False, ) - return Series(block.select_columns(result)) + return Series(block.select_columns(result).reset_index()) def duplicated(self, keep: str = "first") -> Series: if keep is not False: diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a6363e3285..c488403cce 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -649,8 +649,8 @@ def unique(self) -> Series: """ Return unique values of Series object. - Uniques are returned in order of appearance. Hash table-based unique, - therefore does NOT sort. + Uniques are returned in order of value. To maintain unique values + in order of appearance, use `drop_duplicates()` instead. **Examples:** @@ -665,8 +665,8 @@ def unique(self) -> Series: 3 3 Name: A, dtype: Int64 >>> s.unique() - 0 2 - 1 1 + 0 1 + 1 2 2 3 Name: A, dtype: Int64 From 24912c35012bde064a94a0c56f22f4e25c4d0a05 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 23 Oct 2024 21:58:19 +0000 Subject: [PATCH 04/10] update flag to control behavior. --- bigframes/series.py | 4 +++- .../bigframes_vendored/pandas/core/series.py | 20 ++++++++++++++++--- .../bigframes_vendored/tpch/queries/q20.py | 4 ++-- .../bigframes_vendored/tpch/queries/q22.py | 2 +- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/bigframes/series.py b/bigframes/series.py index 9443a6f7dc..62783f7288 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1601,7 +1601,9 @@ def drop_duplicates(self, *, keep: str = "first") -> Series: block = block_ops.drop_duplicates(self._block, (self._value_column,), keep) return Series(block) - def unique(self) -> Series: + def unique(self, keep_order=True) -> Series: + if keep_order: + return self.drop_duplicates() block, result = self._block.aggregate( [self._value_column], [(self._value_column, agg_ops.AnyValueOp())], diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c488403cce..c66adc3585 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -645,18 +645,24 @@ def nunique(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def unique(self) -> Series: + def unique(self, keep_order=True) -> Series: """ Return unique values of Series object. - Uniques are returned in order of value. To maintain unique values - in order of appearance, use `drop_duplicates()` instead. + By default, uniques are returned in order of appearance. Hash table-based unique, + therefore does NOT sort. + + Args: + keep_order (bool, default True): + If True, preserves the order of the first appearance of each unique value. + If False, returns the elements in ascending order, which can be faster. **Examples:** >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None + Example with order preservation: Slower, but keeps order >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s 0 2 @@ -664,12 +670,20 @@ def unique(self) -> Series: 2 3 3 3 Name: A, dtype: Int64 + + Example without order preservation: Faster, but loses original order >>> s.unique() 0 1 1 2 2 3 Name: A, dtype: Int64 + >>> s.unique(sort=True) + 0 1 + 1 2 + 2 3 + Name: A, dtype: Int64 + Returns: Series: The unique values returned as a Series. """ diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py index 5136fb6727..fded5f5c97 100644 --- a/third_party/bigframes_vendored/tpch/queries/q20.py +++ b/third_party/bigframes_vendored/tpch/queries/q20.py @@ -46,7 +46,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): if not session._strictly_ordered: filtered_parts = filtered_parts[["P_PARTKEY"]].sort_values(by=["P_PARTKEY"]) - filtered_parts = filtered_parts["P_PARTKEY"].unique().to_frame() + filtered_parts = filtered_parts["P_PARTKEY"].unique(keep_order=False).to_frame() joined_parts = filtered_parts.merge( partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY" ) @@ -56,7 +56,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) final_filtered = final_join[final_join["PS_AVAILQTY"] > final_join["SUM_QUANTITY"]] - final_filtered = final_filtered["PS_SUPPKEY"].unique().to_frame() + final_filtered = final_filtered["PS_SUPPKEY"].unique(keep_order=False).to_frame() final_result = final_filtered.merge(q3, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") final_result = final_result[["S_NAME", "S_ADDRESS"]].sort_values(by="S_NAME") diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py index 07d9c9e19b..48850de589 100644 --- a/third_party/bigframes_vendored/tpch/queries/q22.py +++ b/third_party/bigframes_vendored/tpch/queries/q22.py @@ -22,7 +22,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): (customer["CNTRYCODE"].isin(country_codes)) & (customer["C_ACCTBAL"] > 0) ]["C_ACCTBAL"].mean() - orders_unique = orders["O_CUSTKEY"].unique().to_frame() + orders_unique = orders["O_CUSTKEY"].unique(keep_order=False).to_frame() matched_customers = customer.merge( orders_unique, left_on="C_CUSTKEY", right_on="O_CUSTKEY" From bb0332af6000b980f040fa4881a80db5fa3df5e3 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 23 Oct 2024 22:08:49 +0000 Subject: [PATCH 05/10] update examples --- third_party/bigframes_vendored/pandas/core/series.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c66adc3585..2ce45a173f 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -662,7 +662,7 @@ def unique(self, keep_order=True) -> Series: >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - Example with order preservation: Slower, but keeps order + >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s 0 2 @@ -671,14 +671,16 @@ def unique(self, keep_order=True) -> Series: 3 3 Name: A, dtype: Int64 - Example without order preservation: Faster, but loses original order + Example with order preservation: Slower, but keeps order >>> s.unique() - 0 1 - 1 2 + 0 2 + 1 1 2 3 Name: A, dtype: Int64 - >>> s.unique(sort=True) + + Example without order preservation: Faster, but loses original order + >>> s.unique(keep_order=False) 0 1 1 2 2 3 From be29b54d69a968224846085812d4e4ef6812341e Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 23 Oct 2024 22:09:24 +0000 Subject: [PATCH 06/10] delete empty line --- third_party/bigframes_vendored/pandas/core/series.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 2ce45a173f..13ddff00a8 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -662,7 +662,6 @@ def unique(self, keep_order=True) -> Series: >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series([2, 1, 3, 3], name='A') >>> s 0 2 @@ -678,7 +677,6 @@ def unique(self, keep_order=True) -> Series: 2 3 Name: A, dtype: Int64 - Example without order preservation: Faster, but loses original order >>> s.unique(keep_order=False) 0 1 From 0c0163eac8a2c2e214a63132763450511a6c1248 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 23 Oct 2024 22:12:52 +0000 Subject: [PATCH 07/10] keep order error --- bigframes/series.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/series.py b/bigframes/series.py index 62783f7288..65363f5e14 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1603,6 +1603,7 @@ def drop_duplicates(self, *, keep: str = "first") -> Series: def unique(self, keep_order=True) -> Series: if keep_order: + validations.enforce_ordered(self, "unique(keep_order != False)") return self.drop_duplicates() block, result = self._block.aggregate( [self._value_column], From c316b21df70ed5e5a64ba377c72193d8cc8aff73 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 25 Oct 2024 19:11:09 +0000 Subject: [PATCH 08/10] reduce query_count --- third_party/bigframes_vendored/tpch/queries/q10.py | 2 -- third_party/bigframes_vendored/tpch/queries/q11.py | 10 +++++++--- third_party/bigframes_vendored/tpch/queries/q14.py | 11 ++++++----- third_party/bigframes_vendored/tpch/queries/q15.py | 9 +++++++-- third_party/bigframes_vendored/tpch/queries/q17.py | 4 ++-- third_party/bigframes_vendored/tpch/queries/q22.py | 13 +++++++++---- 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q10.py b/third_party/bigframes_vendored/tpch/queries/q10.py index 75a8f2de7f..3c6cde6236 100644 --- a/third_party/bigframes_vendored/tpch/queries/q10.py +++ b/third_party/bigframes_vendored/tpch/queries/q10.py @@ -28,8 +28,6 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): var1 = date(1993, 10, 1) var2 = date(1994, 1, 1) - q_final = customer.merge - q_final = ( customer.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY") .merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY") diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py index 484a7c0001..51e4510a82 100644 --- a/third_party/bigframes_vendored/tpch/queries/q11.py +++ b/third_party/bigframes_vendored/tpch/queries/q11.py @@ -30,10 +30,14 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): grouped["VALUE"] = grouped["VALUE"].round(2) - total_value = (filtered_df["PS_SUPPLYCOST"] * filtered_df["PS_AVAILQTY"]).sum() - threshold = total_value * 0.0001 + total_value = ( + (filtered_df["PS_SUPPLYCOST"] * filtered_df["PS_AVAILQTY"]).to_frame().sum() + ) + threshold = (total_value * 0.0001).rename("THRESHOLD") + + grouped = grouped.merge(threshold, how="cross") - result_df = grouped[grouped["VALUE"] > threshold] + result_df = grouped[grouped["VALUE"] > grouped["THRESHOLD"]] result_df = result_df.sort_values(by="VALUE", ascending=False) diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py index 27f3d9e224..3b7808c33b 100644 --- a/third_party/bigframes_vendored/tpch/queries/q14.py +++ b/third_party/bigframes_vendored/tpch/queries/q14.py @@ -26,9 +26,10 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"]) ) * filtered["P_TYPE"].str.contains("PROMO").astype("Int64") - total_revenue = (filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"])).sum() - promo_revenue = filtered["CONDI_REVENUE"].sum() - - promo_revenue_percent = 100.00 * promo_revenue / total_revenue + total_revenue = ( + (filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"])).to_frame().sum() + ) + promo_revenue = filtered[["CONDI_REVENUE"]].sum() - _ = round(promo_revenue_percent, 2) + promo_revenue_percent = (100.00 * promo_revenue / total_revenue).round(2) + promo_revenue_percent.to_frame().to_gbq() diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py index 042adbda8b..adf37f9892 100644 --- a/third_party/bigframes_vendored/tpch/queries/q15.py +++ b/third_party/bigframes_vendored/tpch/queries/q15.py @@ -36,8 +36,13 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): supplier, grouped_revenue, left_on="S_SUPPKEY", right_on="SUPPLIER_NO" ) - max_revenue = joined_data["TOTAL_REVENUE"].max() - max_revenue_suppliers = joined_data[joined_data["TOTAL_REVENUE"] == max_revenue] + max_revenue = joined_data[["TOTAL_REVENUE"]].max().rename("MAX_REVENUE") + + joined_data = joined_data.merge(max_revenue, how="cross") + + max_revenue_suppliers = joined_data[ + joined_data["TOTAL_REVENUE"] == joined_data["MAX_REVENUE"] + ] max_revenue_suppliers["TOTAL_REVENUE"] = max_revenue_suppliers[ "TOTAL_REVENUE" diff --git a/third_party/bigframes_vendored/tpch/queries/q17.py b/third_party/bigframes_vendored/tpch/queries/q17.py index 0bd1c44315..56289d57ad 100644 --- a/third_party/bigframes_vendored/tpch/queries/q17.py +++ b/third_party/bigframes_vendored/tpch/queries/q17.py @@ -33,8 +33,8 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): q_final = q_final[q_final["L_QUANTITY"] < q_final["AVG_QUANTITY"]] - q_final = bpd.DataFrame( - {"AVG_YEARLY": [(q_final["L_EXTENDEDPRICE"].sum() / 7.0).round(2)]} + q_final = ( + (q_final[["L_EXTENDEDPRICE"]].sum() / 7.0).round(2).to_frame(name="AVG_YEARLY") ) q_final.to_gbq() diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py index 48850de589..bc648ef392 100644 --- a/third_party/bigframes_vendored/tpch/queries/q22.py +++ b/third_party/bigframes_vendored/tpch/queries/q22.py @@ -18,9 +18,13 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): customer["CNTRYCODE"] = customer["C_PHONE"].str.slice(0, 2) - avg_acctbal = customer[ - (customer["CNTRYCODE"].isin(country_codes)) & (customer["C_ACCTBAL"] > 0) - ]["C_ACCTBAL"].mean() + avg_acctbal = ( + customer[ + (customer["CNTRYCODE"].isin(country_codes)) & (customer["C_ACCTBAL"] > 0) + ][["C_ACCTBAL"]] + .mean() + .rename("AVG_ACCTBAL") + ) orders_unique = orders["O_CUSTKEY"].unique(keep_order=False).to_frame() @@ -33,10 +37,11 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): matched_customers[["C_CUSTKEY", "IS_IN_ORDERS"]], on="C_CUSTKEY", how="left" ) customer["IS_IN_ORDERS"] = customer["IS_IN_ORDERS"].fillna(False) + customer = customer.merge(avg_acctbal, how="cross") filtered_customers = customer[ (customer["CNTRYCODE"].isin(country_codes)) - & (customer["C_ACCTBAL"] > avg_acctbal) + & (customer["C_ACCTBAL"] > customer["AVG_ACCTBAL"]) & (~customer["IS_IN_ORDERS"]) ] From 283c9d7fa759d2bb78999a0afe76ac64d096d1fd Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Mon, 28 Oct 2024 21:23:50 +0000 Subject: [PATCH 09/10] improve q16 --- third_party/bigframes_vendored/tpch/queries/q16.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py index 1bd2795c42..9acf0635d4 100644 --- a/third_party/bigframes_vendored/tpch/queries/q16.py +++ b/third_party/bigframes_vendored/tpch/queries/q16.py @@ -20,16 +20,22 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): var1 = "Brand#45" - supplier = supplier[ - supplier["S_COMMENT"].str.contains("Customer.*Complaints", regex=True) - ]["S_SUPPKEY"] + supplier = ( + supplier[ + supplier["S_COMMENT"].str.contains("Customer.*Complaints", regex=True) + ]["S_SUPPKEY"] + .unique(keep_order=False) + .to_frame() + ) q_filtered = part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY") q_filtered = q_filtered[q_filtered["P_BRAND"] != var1] q_filtered = q_filtered[~q_filtered["P_TYPE"].str.contains("MEDIUM POLISHED")] q_filtered = q_filtered[q_filtered["P_SIZE"].isin([49, 14, 23, 45, 19, 3, 36, 9])] - final_df = q_filtered[~q_filtered["PS_SUPPKEY"].isin(supplier)] + final_df = q_filtered.merge( + supplier, left_on=["PS_SUPPKEY"], right_on=["S_SUPPKEY"] + ) grouped = final_df.groupby(["P_BRAND", "P_TYPE", "P_SIZE"], as_index=False) result = grouped.agg( From a34a270c5586945b70272aea53fa41986d11f259 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 29 Oct 2024 02:53:56 +0000 Subject: [PATCH 10/10] benchmark updates --- .../bigframes_vendored/tpch/queries/q10.py | 28 +++++++++++-------- .../bigframes_vendored/tpch/queries/q11.py | 4 ++- .../bigframes_vendored/tpch/queries/q14.py | 11 ++++---- .../bigframes_vendored/tpch/queries/q16.py | 2 +- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q10.py b/third_party/bigframes_vendored/tpch/queries/q10.py index 3c6cde6236..1650e9ca34 100644 --- a/third_party/bigframes_vendored/tpch/queries/q10.py +++ b/third_party/bigframes_vendored/tpch/queries/q10.py @@ -59,15 +59,21 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): as_index=False, ).agg(REVENUE=bpd.NamedAgg(column="INTERMEDIATE_REVENUE", aggfunc="sum")) - q_final[ - [ - "C_CUSTKEY", - "C_NAME", - "REVENUE", - "C_ACCTBAL", - "N_NAME", - "C_ADDRESS", - "C_PHONE", - "C_COMMENT", + q_final = ( + q_final[ + [ + "C_CUSTKEY", + "C_NAME", + "REVENUE", + "C_ACCTBAL", + "N_NAME", + "C_ADDRESS", + "C_PHONE", + "C_COMMENT", + ] ] - ].sort_values(by="REVENUE", ascending=False).head(20).to_gbq() + .sort_values(by="REVENUE", ascending=False) + .head(20) + ) + + q_final.to_gbq() diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py index 51e4510a82..385393f781 100644 --- a/third_party/bigframes_vendored/tpch/queries/q11.py +++ b/third_party/bigframes_vendored/tpch/queries/q11.py @@ -37,7 +37,9 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): grouped = grouped.merge(threshold, how="cross") - result_df = grouped[grouped["VALUE"] > grouped["THRESHOLD"]] + result_df = grouped[grouped["VALUE"] > grouped["THRESHOLD"]].drop( + columns="THRESHOLD" + ) result_df = result_df.sort_values(by="VALUE", ascending=False) diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py index 3b7808c33b..27f3d9e224 100644 --- a/third_party/bigframes_vendored/tpch/queries/q14.py +++ b/third_party/bigframes_vendored/tpch/queries/q14.py @@ -26,10 +26,9 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"]) ) * filtered["P_TYPE"].str.contains("PROMO").astype("Int64") - total_revenue = ( - (filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"])).to_frame().sum() - ) - promo_revenue = filtered[["CONDI_REVENUE"]].sum() + total_revenue = (filtered["L_EXTENDEDPRICE"] * (1 - filtered["L_DISCOUNT"])).sum() + promo_revenue = filtered["CONDI_REVENUE"].sum() + + promo_revenue_percent = 100.00 * promo_revenue / total_revenue - promo_revenue_percent = (100.00 * promo_revenue / total_revenue).round(2) - promo_revenue_percent.to_frame().to_gbq() + _ = round(promo_revenue_percent, 2) diff --git a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py index 9acf0635d4..79f42ec42c 100644 --- a/third_party/bigframes_vendored/tpch/queries/q16.py +++ b/third_party/bigframes_vendored/tpch/queries/q16.py @@ -22,7 +22,7 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): supplier = ( supplier[ - supplier["S_COMMENT"].str.contains("Customer.*Complaints", regex=True) + ~supplier["S_COMMENT"].str.contains("Customer.*Complaints", regex=True) ]["S_SUPPKEY"] .unique(keep_order=False) .to_frame()