feat: allow pandas.cut 'labels' parameter to accept a list of string (#1549)

chelsea-lin · web-flow · commit af842b174de7 · 2025-03-28T21:17:48.000Z
diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py
@@ -366,7 +366,13 @@ def _(
         for this_bin in range(op.bins):
             if op.labels is False:
                 value = compile_ibis_types.literal_to_ibis_scalar(
-                    this_bin, force_dtype=pd.Int64Dtype()
+                    this_bin,
+                    force_dtype=pd.Int64Dtype(),
+                )
+            elif isinstance(op.labels, typing.Iterable):
+                value = compile_ibis_types.literal_to_ibis_scalar(
+                    list(op.labels)[this_bin],
+                    force_dtype=pd.StringDtype(storage="pyarrow"),
                 )
             else:
                 left_adj = adj if this_bin == 0 and op.right else 0
@@ -402,7 +408,13 @@ def _(
 
             if op.labels is False:
                 value = compile_ibis_types.literal_to_ibis_scalar(
-                    this_bin, force_dtype=pd.Int64Dtype()
+                    this_bin,
+                    force_dtype=pd.Int64Dtype(),
+                )
+            elif isinstance(op.labels, typing.Iterable):
+                value = compile_ibis_types.literal_to_ibis_scalar(
+                    list(op.labels)[this_bin],
+                    force_dtype=pd.StringDtype(storage="pyarrow"),
                 )
             else:
                 if op.right:
diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py
@@ -20,6 +20,7 @@
 import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile
 import pandas as pd
 
+import bigframes.constants
 import bigframes.core.expression as ex
 import bigframes.core.ordering as order
 import bigframes.core.utils as utils
@@ -41,17 +42,37 @@ def cut(
     right: typing.Optional[bool] = True,
     labels: typing.Union[typing.Iterable[str], bool, None] = None,
 ) -> bigframes.series.Series:
-    if labels is not None and labels is not False:
+    if (
+        labels is not None
+        and labels is not False
+        and not isinstance(labels, typing.Iterable)
+    ):
+        raise ValueError(
+            "Bin labels must either be False, None or passed in as a list-like argument"
+        )
+    if (
+        isinstance(labels, typing.Iterable)
+        and len(list(labels)) > 0
+        and not isinstance(list(labels)[0], str)
+    ):
         raise NotImplementedError(
-            "The 'labels' parameter must be either False or None. "
-            "Please provide a valid value for 'labels'."
+            "When using an iterable for labels, only iterables of strings are supported "
+            f"but found {type(list(labels)[0])}. {constants.FEEDBACK_LINK}"
         )
+
     if x.size == 0:
         raise ValueError("Cannot cut empty array.")
 
     if isinstance(bins, int):
         if bins <= 0:
             raise ValueError("`bins` should be a positive integer.")
+        if isinstance(labels, typing.Iterable):
+            labels = tuple(labels)
+            if len(labels) != bins:
+                raise ValueError(
+                    f"Bin labels({len(labels)}) must be same as the value of bins({bins})"
+                )
+
         op = agg_ops.CutOp(bins, right=right, labels=labels)
         return x._apply_window_op(op, window_spec=window_specs.unbound())
     elif isinstance(bins, typing.Iterable):
@@ -64,9 +85,6 @@ def cut(
         elif len(list(bins)) == 0:
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple()
-            # To maintain consistency with pandas' behavior
-            right = True
-            labels = None
         elif isinstance(list(bins)[0], tuple):
             as_index = pd.IntervalIndex.from_tuples(list(bins))
             bins = tuple(bins)
@@ -88,8 +106,17 @@ def cut(
             raise ValueError("`bins` iterable should contain tuples or numerics.")
 
         if as_index.is_overlapping:
-            raise ValueError("Overlapping IntervalIndex is not accepted.")
-        elif len(as_index) == 0:
+            raise ValueError("Overlapping IntervalIndex is not accepted.")  # TODO: test
+
+        if isinstance(labels, typing.Iterable):
+            labels = tuple(labels)
+            if len(labels) != len(as_index):
+                raise ValueError(
+                    f"Bin labels({len(labels)}) must be same as the number of bin edges"
+                    f"({len(as_index)})"
+                )
+
+        if len(as_index) == 0:
             dtype = agg_ops.CutOp(bins, right=right, labels=labels).output_type()
             return bigframes.series.Series(
                 [pd.NA] * len(x),
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
@@ -340,7 +340,7 @@ class CutOp(UnaryWindowOp):
     # TODO: Unintuitive, refactor into multiple ops?
     bins: typing.Union[int, Iterable]
     right: Optional[bool]
-    labels: Optional[bool]
+    labels: typing.Union[bool, Iterable[str], None]
 
     @property
     def skips_nulls(self):
@@ -349,6 +349,8 @@ def skips_nulls(self):
     def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
         if self.labels is False:
             return dtypes.INT_DTYPE
+        elif isinstance(self.labels, Iterable):
+            return dtypes.STRING_DTYPE
         else:
             # Assumption: buckets use same numeric type
             if isinstance(self.bins, int):
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
@@ -404,6 +404,14 @@ def _convert_pandas_category(pd_s: pd.Series):
             f"Input must be a pandas Series with categorical data: {pd_s.dtype}"
         )
 
+    if pd.api.types.is_object_dtype(pd_s.cat.categories.dtype):
+        return pd_s.astype(pd.StringDtype(storage="pyarrow"))
+
+    if not isinstance(pd_s.cat.categories.dtype, pd.IntervalDtype):
+        raise ValueError(
+            f"Must be a IntervalDtype with categorical data: {pd_s.cat.categories.dtype}"
+        )
+
     if pd_s.cat.categories.dtype.closed == "left":  # type: ignore
         left_key = "left_inclusive"
         right_key = "right_exclusive"
@@ -465,6 +473,17 @@ def test_cut_by_int_bins(scalars_dfs, labels, right):
     pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
 
 
+def test_cut_by_int_bins_w_labels(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    labels = ["A", "B", "C", "D", "E"]
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=labels)
+    bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=labels)
+
+    pd_result = _convert_pandas_category(pd_result)
+    pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
+
+
 @pytest.mark.parametrize(
     ("breaks", "right", "labels"),
     [
@@ -494,7 +513,7 @@ def test_cut_by_int_bins(scalars_dfs, labels, right):
         ),
     ],
 )
-def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels):
+def test_cut_by_numeric_breaks(scalars_dfs, breaks, right, labels):
     scalars_df, scalars_pandas_df = scalars_dfs
 
     pd_result = pd.cut(
@@ -508,6 +527,18 @@ def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels):
     pd.testing.assert_series_equal(bf_result, pd_result_converted)
 
 
+def test_cut_by_numeric_breaks_w_labels(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bins = [0, 5, 10, 15, 20]
+    labels = ["A", "B", "C", "D"]
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels)
+    bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels)
+
+    pd_result = _convert_pandas_category(pd_result)
+    pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
+
+
 @pytest.mark.parametrize(
     ("bins", "right", "labels"),
     [
@@ -534,7 +565,7 @@ def test_cut_numeric_breaks(scalars_dfs, breaks, right, labels):
         ),
     ],
 )
-def test_cut_with_interval(scalars_dfs, bins, right, labels):
+def test_cut_by_interval_bins(scalars_dfs, bins, right, labels):
     scalars_df, scalars_pandas_df = scalars_dfs
     bf_result = bpd.cut(
         scalars_df["int64_too"], bins, labels=labels, right=right
@@ -548,22 +579,30 @@ def test_cut_with_interval(scalars_dfs, bins, right, labels):
     pd.testing.assert_series_equal(bf_result, pd_result_converted)
 
 
+def test_cut_by_interval_bins_w_labels(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bins = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)])
+    labels = ["A", "B", "C", "D", "E"]
+    pd_result = pd.cut(scalars_pandas_df["float64_col"], bins, labels=labels)
+    bf_result = bpd.cut(scalars_df["float64_col"], bins, labels=labels)
+
+    pd_result = _convert_pandas_category(pd_result)
+    pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result)
+
+
 @pytest.mark.parametrize(
-    "bins",
+    ("bins", "labels"),
     [
-        pytest.param([], id="empty_breaks"),
-        pytest.param(
-            [1], id="single_int_breaks", marks=pytest.mark.skip(reason="b/404338651")
-        ),
-        pytest.param(pd.IntervalIndex.from_tuples([]), id="empty_interval_index"),
+        pytest.param([], None, id="empty_breaks"),
+        pytest.param([1], False, id="single_int_breaks"),
+        pytest.param(pd.IntervalIndex.from_tuples([]), None, id="empty_interval_index"),
     ],
 )
-def test_cut_by_edge_cases_bins(scalars_dfs, bins):
+def test_cut_by_edge_cases_bins(scalars_dfs, bins, labels):
     scalars_df, scalars_pandas_df = scalars_dfs
-    bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas()
-    if isinstance(bins, list):
-        bins = pd.IntervalIndex.from_tuples(bins)
-    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False)
+    bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=labels).to_pandas()
+    pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=labels)
 
     pd_result_converted = _convert_pandas_category(pd_result)
     pd.testing.assert_series_equal(bf_result, pd_result_converted)
diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py
@@ -91,13 +91,48 @@ def test_method_matches_session(method_name: str):
     assert pandas_signature.return_annotation == session_signature.return_annotation
 
 
-def test_cut_raises_with_labels():
+@pytest.mark.parametrize(
+    ("bins", "labels", "error_message"),
+    [
+        pytest.param(
+            5,
+            True,
+            "Bin labels must either be False, None or passed in as a list-like argument",
+            id="true",
+        ),
+        pytest.param(
+            5,
+            1.5,
+            "Bin labels must either be False, None or passed in as a list-like argument",
+            id="invalid_types",
+        ),
+        pytest.param(
+            2,
+            ["A"],
+            "must be same as the value of bins",
+            id="int_bins_mismatch",
+        ),
+        pytest.param(
+            [1, 2, 3],
+            ["A"],
+            "must be same as the number of bin edges",
+            id="iterator_bins_mismatch",
+        ),
+    ],
+)
+def test_cut_raises_with_invalid_labels(bins: int, labels, error_message: str):
+    mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
+    with pytest.raises(ValueError, match=error_message):
+        bigframes.pandas.cut(mock_series, bins, labels=labels)
+
+
+def test_cut_raises_with_unsupported_labels():
+    mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
+    labels = [1, 2]
     with pytest.raises(
-        NotImplementedError,
-        match="The 'labels' parameter must be either False or None.",
+        NotImplementedError, match=r".*only iterables of strings are supported.*"
     ):
-        mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
-        bigframes.pandas.cut(mock_series, 4, labels=["a", "b", "c", "d"])
+        bigframes.pandas.cut(mock_series, 2, labels=labels)  # type: ignore
 
 
 @pytest.mark.parametrize(
@@ -111,11 +146,21 @@ def test_cut_raises_with_labels():
             "`bins` iterable should contain tuples or numerics",
             id="iterable_w_wrong_type",
         ),
+        pytest.param(
+            [10, 3],
+            "left side of interval must be <= right side",
+            id="decreased_breaks",
+        ),
+        pytest.param(
+            [(1, 10), (2, 25)],
+            "Overlapping IntervalIndex is not accepted.",
+            id="overlapping_intervals",
+        ),
     ],
 )
 def test_cut_raises_with_invalid_bins(bins: int, error_message: str):
+    mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
     with pytest.raises(ValueError, match=error_message):
-        mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True)
         bigframes.pandas.cut(mock_series, bins, labels=False)
 
 
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
@@ -31,8 +31,6 @@ def cut(
     age ranges. Supports binning into an equal number of bins, or a
     pre-specified array of bins.
 
-    ``labels=False`` implies you just want the bins back.
-
     **Examples:**
 
         >>> import bigframes.pandas as bpd
@@ -55,7 +53,16 @@ def cut(
             3     {'left_exclusive': 7.5, 'right_inclusive': 10.0}
             dtype: struct<left_exclusive: double, right_inclusive: double>[pyarrow]
 
-    Cut with an integer (equal-width bins) and labels=False:
+    Cut with the same bins, but assign them specific labels:
+
+        >>> bpd.cut(s, bins=3, labels=["bad", "medium", "good"])
+        0    bad
+        1    bad
+        2    medium
+        3    good
+        dtype: string
+
+    `labels=False` implies you want the bins back.
 
         >>> bpd.cut(s, bins=4, labels=False)
         0    0
@@ -67,7 +74,6 @@ def cut(
     Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex:
 
         >>> import pandas as pd
-
         >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)])
         >>> bpd.cut(s, bins=interval_index)
         0                                            <NA>
@@ -107,7 +113,7 @@ def cut(
         dtype: struct<left_inclusive: int64, right_exclusive: int64>[pyarrow]
 
     Args:
-        x (Series):
+        x (bigframes.pandas.Series):
             The input Series to be binned. Must be 1-dimensional.
         bins (int, pd.IntervalIndex, Iterable):
             The criteria to bin by.
@@ -127,10 +133,11 @@ def cut(
             ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
             indicate (1,2], (2,3], (3,4]. This argument is ignored when
             `bins` is an IntervalIndex.
-        labels (default None):
+        labels (bool, Iterable, default None):
             Specifies the labels for the returned bins. Must be the same length as
             the resulting bins. If False, returns only integer indicators of the
-            bins. This affects the type of the output container.
+            bins. This affects the type of the output container. This argument is
+            ignored when `bins` is an IntervalIndex. If True, raises an error.
 
     Returns:
         bigframes.pandas.Series: