
Commit c8a8384

Merge branch 'main' into b296390934-index-return-types
2 parents: bb24078 + 3392800

12 files changed: +149 / -129 lines

CHANGELOG.md

Lines changed: 37 additions & 0 deletions
@@ -4,6 +4,43 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [1.28.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.27.0...v1.28.0) (2024-12-11)
+
+
+### Features
+
+* (Series | DataFrame).plot.bar ([#1152](https://github.com/googleapis/python-bigquery-dataframes/issues/1152)) ([0fae2e0](https://github.com/googleapis/python-bigquery-dataframes/commit/0fae2e0291ec8d22341b5b543e8f1b384f83cd3c))
+* `bigframes.bigquery.vector_search` supports `use_brute_force` and `fraction_lists_to_search` parameters ([#1158](https://github.com/googleapis/python-bigquery-dataframes/issues/1158)) ([131edc3](https://github.com/googleapis/python-bigquery-dataframes/commit/131edc3d79f46d35a25422f0db7f150e63e8f561))
+* Add `ARIMAPlus.predict_explain()` to generate forecasts with explanation columns ([#1177](https://github.com/googleapis/python-bigquery-dataframes/issues/1177)) ([05f8b4d](https://github.com/googleapis/python-bigquery-dataframes/commit/05f8b4d2b2b5f624097228e65a3c42364fc40d36))
+* Add client_endpoints_override to bq options ([#1167](https://github.com/googleapis/python-bigquery-dataframes/issues/1167)) ([be74b99](https://github.com/googleapis/python-bigquery-dataframes/commit/be74b99977cfbd513def5b7e439de6b7706c0712))
+* Add support for temporal types in dataframe's describe() method ([#1189](https://github.com/googleapis/python-bigquery-dataframes/issues/1189)) ([2d564a6](https://github.com/googleapis/python-bigquery-dataframes/commit/2d564a6a9925b69c7e9a15b532fb66ad68c3e264))
+* Allow join-free alignment of analytic expressions ([#1168](https://github.com/googleapis/python-bigquery-dataframes/issues/1168)) ([daef4f0](https://github.com/googleapis/python-bigquery-dataframes/commit/daef4f0c7c5ff2d0a4e9a6ffefeb81f43780ac8b))
+* Series.isin supports bigframes.Series arg ([#1195](https://github.com/googleapis/python-bigquery-dataframes/issues/1195)) ([0d8a16b](https://github.com/googleapis/python-bigquery-dataframes/commit/0d8a16ba77a66dce544d0a7cf411fca0adc2a694))
+* Update llm.TextEmbeddingGenerator to 005 ([#1186](https://github.com/googleapis/python-bigquery-dataframes/issues/1186)) ([3072d38](https://github.com/googleapis/python-bigquery-dataframes/commit/3072d382c6ff57bdb37d7e080c794c67dbf6e701))
+
+
+### Bug Fixes
+
+* Fix error loading local dataframes into bigquery ([#1165](https://github.com/googleapis/python-bigquery-dataframes/issues/1165)) ([5b355ef](https://github.com/googleapis/python-bigquery-dataframes/commit/5b355efde122ed76b1cff39900ab8f94f5a13a30))
+* Fix null index join with 'on' arg ([#1153](https://github.com/googleapis/python-bigquery-dataframes/issues/1153)) ([9015c33](https://github.com/googleapis/python-bigquery-dataframes/commit/9015c33e73675ebb2299487dce3295732ea0527e))
+* Fix series.isin using local path always ([#1202](https://github.com/googleapis/python-bigquery-dataframes/issues/1202)) ([a44eafd](https://github.com/googleapis/python-bigquery-dataframes/commit/a44eafdd95eb1b994dc82411640b61fd0a78a492))
+
+
+### Performance Improvements
+
+* Update df.corr, df.cov to be used with more than 30 columns case. ([#1161](https://github.com/googleapis/python-bigquery-dataframes/issues/1161)) ([9dcf1aa](https://github.com/googleapis/python-bigquery-dataframes/commit/9dcf1aa918919704dcf4d12b05935b22fb502fc6))
+
+
+### Documentation
+
+* Add a code sample using `bpd.options.bigquery.ordering_mode = "partial"` ([#909](https://github.com/googleapis/python-bigquery-dataframes/issues/909)) ([f80d705](https://github.com/googleapis/python-bigquery-dataframes/commit/f80d70503b80559a0b1fe64434383aa3e028bf9b))
+* Add snippet for creating boosted tree model ([#1142](https://github.com/googleapis/python-bigquery-dataframes/issues/1142)) ([a972668](https://github.com/googleapis/python-bigquery-dataframes/commit/a972668833a454fb18e6cb148697165edd46e8cc))
+* Add snippet for evaluating a boosted tree model ([#1154](https://github.com/googleapis/python-bigquery-dataframes/issues/1154)) ([9d8970a](https://github.com/googleapis/python-bigquery-dataframes/commit/9d8970ac1f18b2520a061ac743e767ca8593cc8c))
+* Add snippet for predicting classifications using a boosted tree model ([#1156](https://github.com/googleapis/python-bigquery-dataframes/issues/1156)) ([e7b83f1](https://github.com/googleapis/python-bigquery-dataframes/commit/e7b83f166ef56e631120050103c2f43f454fce44))
+* Add third party `pandas.Index methods` and docstrings ([#1171](https://github.com/googleapis/python-bigquery-dataframes/issues/1171)) ([a970294](https://github.com/googleapis/python-bigquery-dataframes/commit/a9702945286fbe500ade4d0f0c14cc60a8aa00eb))
+* Fix Bigframes.Pandas.General_Function missing docs ([#1164](https://github.com/googleapis/python-bigquery-dataframes/issues/1164)) ([de923d0](https://github.com/googleapis/python-bigquery-dataframes/commit/de923d01b904b96cc51dfd526b6a412f28ff10c4))
+* Update `bigframes.pandas.Index` docstrings ([#1144](https://github.com/googleapis/python-bigquery-dataframes/issues/1144)) ([557ab8d](https://github.com/googleapis/python-bigquery-dataframes/commit/557ab8df526fcf743af0a609ec7ec636b00d0c0b))
+
 ## [1.27.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.26.0...v1.27.0) (2024-11-16)
 
 
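For orientation, a minimal sketch of one of the new entry points from the Features list above, `(Series | DataFrame).plot.bar`. It assumes a configured BigQuery session, illustrative data, and that the accessor mirrors pandas' `plot.bar` signature and returns a matplotlib Axes; it is not taken from the release notes.

```python
import bigframes.pandas as bpd

df = bpd.DataFrame({"fruit": ["apple", "pear", "plum"], "sales": [10, 25, 7]})

# Assumed to behave like pandas' DataFrame.plot.bar: fetches the (small)
# result locally and draws a matplotlib bar chart.
ax = df.plot.bar(x="fruit", y="sales")
ax.figure.savefig("sales.png")
```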

bigframes/core/blocks.py

Lines changed: 2 additions & 4 deletions
@@ -2025,7 +2025,7 @@ def isin(self, other: Block):
         assert len(other.value_columns) == 1
         unique_other_values = other.expr.select_columns(
             [other.value_columns[0]]
-        ).aggregate((), by_column_ids=(other.value_columns[0],))
+        ).aggregate((), by_column_ids=(other.value_columns[0],), dropna=False)
         block = self
         # for each original column, join with other
         for i in range(len(self.value_columns)):
@@ -2039,9 +2039,7 @@ def _isin_inner(self: Block, col: str, unique_values: core.ArrayValue) -> Block:
    expr, (l_map, r_map) = self._expr.relational_join(
        unique_values, ((col, unique_values.column_ids[0]),), type="left"
    )
-    expr, matches = expr.project_to_id(
-        ops.eq_op.as_expr(ex.const(True), r_map[const])
-    )
+    expr, matches = expr.project_to_id(ops.notnull_op.as_expr(r_map[const]))

    new_index_cols = tuple(l_map[idx_col] for idx_col in self.index_columns)
    new_value_cols = tuple(
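The change above swaps an equality-against-True projection for a null check on the joined right-hand key. A rough illustration of that membership-by-left-join pattern, sketched with plain pandas rather than the internal Block/ArrayValue APIs (bigframes runs the equivalent over BigQuery):

```python
import pandas as pd

left = pd.DataFrame({"x": [1, 2, 3, 4]})
other = pd.Series([2, 4, 4], name="x")

# Deduplicate the right side (the real code now keeps NULL groups via dropna=False).
right = pd.DataFrame({"x_r": other.drop_duplicates()})

joined = left.merge(right, left_on="x", right_on="x_r", how="left")

# After a left join, a row "is in" the other set exactly when the joined
# right-hand key is non-null -- the notnull projection in the diff.
matches = joined["x_r"].notna()
print(matches.tolist())  # [False, True, False, True]
```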

bigframes/dataframe.py

Lines changed: 50 additions & 13 deletions
@@ -517,6 +517,17 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
         )
         return DataFrame(self._block.select_columns(selected_columns))
 
+    def _select_exact_dtypes(
+        self, dtypes: Sequence[bigframes.dtypes.Dtype]
+    ) -> DataFrame:
+        """Selects columns without considering inheritance relationships."""
+        columns = [
+            col_id
+            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
+            if dtype in dtypes
+        ]
+        return DataFrame(self._block.select_columns(columns))
+
     def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]):
         self._query_job = query_job
 
@@ -2437,13 +2448,9 @@ def agg(
         aggregations = [agg_ops.lookup_agg_func(f) for f in func]
 
         for dtype, agg in itertools.product(self.dtypes, aggregations):
-            if not bigframes.operations.aggregations.is_agg_op_supported(
-                dtype, agg
-            ):
-                raise NotImplementedError(
-                    f"Type {dtype} does not support aggregation {agg}. "
-                    f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
-                )
+            agg.output_type(
+                dtype
+            )  # Raises exception if the agg does not support the dtype.
 
         return DataFrame(
             self._block.summarize(
@@ -2512,7 +2519,10 @@ def melt(
 
     def describe(self, include: None | Literal["all"] = None) -> DataFrame:
         if include is None:
-            numeric_df = self._drop_non_numeric(permissive=False)
+            numeric_df = self._select_exact_dtypes(
+                bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+                + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
+            )
             if len(numeric_df.columns) == 0:
                 # Describe eligible non-numeric columns
                 return self._describe_non_numeric()
@@ -2540,9 +2550,11 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
             raise ValueError(f"Unsupported include type: {include}")
 
     def _describe_numeric(self) -> DataFrame:
-        return typing.cast(
+        number_df_result = typing.cast(
             DataFrame,
-            self._drop_non_numeric(permissive=False).agg(
+            self._select_exact_dtypes(
+                bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+            ).agg(
                 [
                     "count",
                     "mean",
@@ -2555,16 +2567,41 @@ def _describe_numeric(self) -> DataFrame:
                 ]
             ),
         )
+        temporal_df_result = typing.cast(
+            DataFrame,
+            self._select_exact_dtypes(
+                bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
+            ).agg(["count"]),
+        )
+
+        if len(number_df_result.columns) == 0:
+            return temporal_df_result
+        elif len(temporal_df_result.columns) == 0:
+            return number_df_result
+        else:
+            import bigframes.core.reshape.api as rs
+
+            original_columns = self._select_exact_dtypes(
+                bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+                + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
+            ).columns
+
+            # Use reindex after join to preserve the original column order.
+            return rs.concat(
+                [number_df_result, temporal_df_result],
+                axis=1,
+            )._reindex_columns(original_columns)
 
     def _describe_non_numeric(self) -> DataFrame:
         return typing.cast(
             DataFrame,
-            self.select_dtypes(
-                include={
+            self._select_exact_dtypes(
+                [
                     bigframes.dtypes.STRING_DTYPE,
                     bigframes.dtypes.BOOL_DTYPE,
                     bigframes.dtypes.BYTES_DTYPE,
-                }
+                    bigframes.dtypes.TIME_DTYPE,
+                ]
             ).agg(["count", "nunique"]),
         )
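Taken together, these hunks mean `DataFrame.describe()` no longer drops date/datetime/timestamp columns: numeric columns keep the full summary, while temporal columns contribute a "count" row. A hedged usage sketch, assuming a configured BigQuery session and illustrative data:

```python
import datetime
import bigframes.pandas as bpd

df = bpd.DataFrame(
    {
        "value": [1.5, 2.0, 3.5],
        "event_time": [  # expected to land as a datetime dtype
            datetime.datetime(2024, 1, 1, 12, 0),
            datetime.datetime(2024, 6, 1, 12, 0),
            datetime.datetime(2024, 12, 1, 12, 0),
        ],
    }
)

# "value" gets count/mean/std/min/quartiles/max; "event_time" now shows up
# with a "count" entry instead of being silently excluded.
print(df.describe().to_pandas())
```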

bigframes/dtypes.py

Lines changed: 13 additions & 4 deletions
@@ -18,7 +18,7 @@
 import datetime
 import decimal
 import typing
-from typing import Dict, Literal, Union
+from typing import Dict, List, Literal, Union
 
 import bigframes_vendored.constants as constants
 import geopandas as gpd  # type: ignore
@@ -211,7 +211,7 @@ class SimpleDtypeInfo:
 
 # Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
 # Pandas is inconsistent, so two definitions are provided, each used in different contexts
-NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
+NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: List[Dtype] = [
     FLOAT_DTYPE,
     INT_DTYPE,
 ]
@@ -222,7 +222,16 @@ class SimpleDtypeInfo:
 ]
 
 
-## dtype predicates - use these to maintain consistency
+# Temporal types that are considered as "numeric" by Pandas
+TEMPORAL_NUMERIC_BIGFRAMES_TYPES: List[Dtype] = [
+    DATE_DTYPE,
+    TIMESTAMP_DTYPE,
+    DATETIME_DTYPE,
+]
+TEMPORAL_BIGFRAMES_TYPES = TEMPORAL_NUMERIC_BIGFRAMES_TYPES + [TIME_DTYPE]
+
+
+# dtype predicates - use these to maintain consistency
 def is_datetime_like(type_: ExpressionType) -> bool:
     return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE)
 
@@ -630,7 +639,7 @@ def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool
         return True  # None can be coerced to any supported type
     else:
         return (source_type == STRING_DTYPE) and (
-            target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE)
+            target_type in TEMPORAL_BIGFRAMES_TYPES
        )
 
 
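A small sanity-check sketch of the new groupings; the constant names come from this diff, and the assertions are illustrative rather than exhaustive:

```python
import bigframes.dtypes as dtypes

# DATE/DATETIME/TIMESTAMP count as "numeric-ish" for describe(); TIME is temporal only.
assert dtypes.DATE_DTYPE in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
assert dtypes.TIME_DTYPE in dtypes.TEMPORAL_BIGFRAMES_TYPES
assert dtypes.TIME_DTYPE not in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES

# can_coerce() now routes STRING -> temporal checks through the shared list.
assert dtypes.can_coerce(dtypes.STRING_DTYPE, dtypes.TIMESTAMP_DTYPE)
```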

bigframes/operations/aggregations.py

Lines changed: 0 additions & 11 deletions
@@ -579,14 +579,3 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
         return _AGGREGATIONS_LOOKUP[key]
     else:
         raise ValueError(f"Unrecognize aggregate function: {key}")
-
-
-def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
-    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
-        return True
-
-    if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE):
-        return isinstance(op, (CountOp, NuniqueOp))
-
-    # For all other types, support no aggregation
-    return False
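With this helper gone, dtype support is checked by asking the op itself, as the `dataframe.py` hunk above shows: `agg.output_type(dtype)` raises when the combination is unsupported. A hedged sketch of that pattern; the `"mean"` lookup key and the exact exception type are assumptions:

```python
import bigframes.dtypes as dtypes
import bigframes.operations.aggregations as agg_ops

mean_op = agg_ops.lookup_agg_func("mean")  # assumed registry key
mean_op.output_type(dtypes.FLOAT_DTYPE)  # supported: returns the result dtype

try:
    mean_op.output_type(dtypes.STRING_DTYPE)  # unsupported combination
except Exception as exc:
    print(f"mean over STRING rejected: {exc}")
```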

bigframes/series.py

Lines changed: 1 addition & 1 deletion
@@ -719,7 +719,7 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
 
     def isin(self, values) -> "Series" | None:
         if isinstance(values, (Series,)):
-            self._block.isin(values._block)
+            return Series(self._block.isin(values._block))
         if not _is_list_like(values):
             raise TypeError(
                 "only list-like objects are allowed to be passed to "

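A hedged usage sketch of the fix: passing another BigQuery DataFrames Series to `isin` now returns a boolean Series instead of silently returning None (assumes a configured session; data is illustrative):

```python
import bigframes.pandas as bpd

s = bpd.Series([1, 2, 3, 4])
allowed = bpd.Series([2, 4, 4])

mask = s.isin(allowed)  # previously evaluated the block but returned None
print(s[mask].to_pandas())  # keeps only the values present in `allowed`
```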
bigframes/version.py

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.27.0"
+__version__ = "1.28.0"

tests/system/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -1358,4 +1358,4 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent
         # backend flakiness.
         #
         # Let's stop further clean up and leave it to later.
-        traceback.print_exception(exc)
+        traceback.print_exception(type(exc), exc, None)
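The one-argument `traceback.print_exception(exc)` form only exists on Python 3.10+; the three-argument `(type, value, tb)` form used in the fix also works on older interpreters (presumably the motivation here), and passing `None` for the traceback prints just the exception. A quick stand-alone illustration:

```python
import traceback

try:
    raise ValueError("cleanup failed")
except ValueError as exc:
    # Works on Python < 3.10 as well; prints "ValueError: cleanup failed"
    # without stack frames because the traceback argument is None.
    traceback.print_exception(type(exc), exc, None)
```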

tests/system/load/test_llm.py

Lines changed: 6 additions & 2 deletions
@@ -39,8 +39,12 @@ def llm_remote_text_df(session, llm_remote_text_pandas_df):
 
 
 @pytest.mark.flaky(retries=2)
-def test_llm_gemini_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_df):
-    model = llm.GeminiTextGenerator(model_name="gemini-pro", max_iterations=1)
+def test_llm_gemini_configure_fit(
+    session, llm_fine_tune_df_default_index, llm_remote_text_df
+):
+    model = llm.GeminiTextGenerator(
+        session=session, model_name="gemini-pro", max_iterations=1
+    )
 
     X_train = llm_fine_tune_df_default_index[["prompt"]]
     y_train = llm_fine_tune_df_default_index[["label"]]

tests/system/small/test_dataframe.py

Lines changed: 37 additions & 8 deletions
@@ -2671,11 +2671,11 @@ def test_dataframe_agg_int_multi_string(scalars_dfs):
 
 
 @skip_legacy_pandas
-def test_df_describe(scalars_dfs):
+def test_df_describe_non_temporal(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
-    # pyarrows time columns fail in pandas
+    # excluding temporal columns here because BigFrames cannot perform percentiles operations on them
     unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
-    bf_result = scalars_df.describe().to_pandas()
+    bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas()
 
     modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns)
     pd_result = modified_pd_df.describe()
@@ -2709,12 +2709,14 @@ def test_df_describe(scalars_dfs):
 def test_df_describe_non_numeric(scalars_dfs, include):
     scalars_df, scalars_pandas_df = scalars_dfs
 
-    non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
+    # Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is
+    # considered numerical in Pandas
+    target_columns = ["string_col", "bytes_col", "bool_col", "time_col"]
 
-    modified_bf = scalars_df[non_numeric_columns]
+    modified_bf = scalars_df[target_columns]
     bf_result = modified_bf.describe(include=include).to_pandas()
 
-    modified_pd_df = scalars_pandas_df[non_numeric_columns]
+    modified_pd_df = scalars_pandas_df[target_columns]
     pd_result = modified_pd_df.describe(include=include)
 
     # Reindex results with the specified keys and their order, because
@@ -2726,8 +2728,35 @@ def test_df_describe_non_numeric(scalars_dfs, include):
     ).rename(index={"unique": "nunique"})
 
     pd.testing.assert_frame_equal(
-        pd_result[non_numeric_columns].astype("Int64"),
-        bf_result[non_numeric_columns],
+        pd_result.astype("Int64"),
+        bf_result,
+        check_index_type=False,
+    )
+
+
+@skip_legacy_pandas
+def test_df_describe_temporal(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
+
+    modified_bf = scalars_df[temporal_columns]
+    bf_result = modified_bf.describe(include="all").to_pandas()
+
+    modified_pd_df = scalars_pandas_df[temporal_columns]
+    pd_result = modified_pd_df.describe(include="all")
+
+    # Reindex results with the specified keys and their order, because
+    # the relative order is not important.
+    bf_result = bf_result.reindex(["count", "nunique"])
+    pd_result = pd_result.reindex(
+        ["count", "unique"]
+        # BF counter part of "unique" is called "nunique"
+    ).rename(index={"unique": "nunique"})
+
+    pd.testing.assert_frame_equal(
+        pd_result.astype("Float64"),
+        bf_result.astype("Float64"),
         check_index_type=False,
     )
 
