From 81f498e15666e554aa962ef96ea7de6a4b163b4b Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 5 Nov 2024 20:11:24 +0000 Subject: [PATCH 1/4] fix: dataframe fillna with string scalar. --- bigframes/dataframe.py | 4 ++-- tests/system/small/test_dataframe.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f803b66ab6..88f82a7e25 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -734,7 +734,7 @@ def _apply_binop( how: str = "outer", reverse: bool = False, ): - if isinstance(other, (float, int, bool)): + if isinstance(other, (float, int, bool, str)): return self._apply_scalar_binop(other, op, reverse=reverse) elif isinstance(other, DataFrame): return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) @@ -752,7 +752,7 @@ def _apply_binop( ) def _apply_scalar_binop( - self, other: float | int, op: ops.BinaryOp, reverse: bool = False + self, other: float | int | bool | str, op: ops.BinaryOp, reverse: bool = False ) -> DataFrame: if reverse: expr = op.as_expr( diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 1fb12d3f82..4ee87c4c54 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1020,13 +1020,19 @@ def test_df_interpolate(scalars_dfs): ) -def test_df_fillna(scalars_dfs): +@pytest.mark.parametrize( + "col, fill_value", + [ + (["int64_col", "float64_col"], 3), + (["string_col"], "A"), + ], +) +def test_df_fillna(scalars_dfs, col, fill_value): scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df[["int64_col", "float64_col"]].fillna(3) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df[["int64_col", "float64_col"]].fillna(3) + bf_result = scalars_df[col].fillna(fill_value).to_pandas() + pd_result = scalars_pandas_df[col].fillna(fill_value) - pandas.testing.assert_frame_equal(bf_result, pd_result) + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) def test_df_replace_scalar_scalar(scalars_dfs): From 85c4254e9f6f0ccd8ee2ec0f42e4833e7d899858 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 6 Nov 2024 19:40:41 +0000 Subject: [PATCH 2/4] update type supports --- bigframes/dataframe.py | 2 +- bigframes/dtypes.py | 18 ++++++++++++++++++ tests/system/small/test_dataframe.py | 3 +++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 88f82a7e25..f9fc0dbe96 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -734,7 +734,7 @@ def _apply_binop( how: str = "outer", reverse: bool = False, ): - if isinstance(other, (float, int, bool, str)): + if isinstance(other, bigframes.dtypes.LOCAL_SCALAR_TYPES): return self._apply_scalar_binop(other, op, reverse=reverse) elif isinstance(other, DataFrame): return self._apply_dataframe_binop(other, op, how=how, reverse=reverse) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index bc5b89b779..98f59f7032 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -59,6 +59,24 @@ # Used when storing Null expressions DEFAULT_DTYPE = FLOAT_DTYPE +LOCAL_SCALAR_TYPES = ( + bool, + np.bool_, + int, + np.integer, + float, + np.floating, + decimal.Decimal, + str, + np.str_, + bytes, + np.bytes_, + datetime.datetime, + pd.Timestamp, + datetime.date, + datetime.time, +) + # Will have a few dtype variants: simple(eg. int, string, bool), complex (eg. list, struct), and virtual (eg. micro intervals, categorical) @dataclass(frozen=True) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 4ee87c4c54..9e45b4f915 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import decimal import io import operator import sys @@ -1025,6 +1026,8 @@ def test_df_interpolate(scalars_dfs): [ (["int64_col", "float64_col"], 3), (["string_col"], "A"), + (["datetime_col"], pd.Timestamp("2023-01-01")), + (["numeric_col"], decimal.Decimal("3.14")), ], ) def test_df_fillna(scalars_dfs, col, fill_value): From 557aa66f6131202edbc5d075eb0f61dad34a1909 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 6 Nov 2024 21:19:42 +0000 Subject: [PATCH 3/4] remove case that pandas has issue --- tests/system/small/test_dataframe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9e45b4f915..02b771a1bd 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import decimal import io import operator import sys @@ -1027,7 +1026,6 @@ def test_df_interpolate(scalars_dfs): (["int64_col", "float64_col"], 3), (["string_col"], "A"), (["datetime_col"], pd.Timestamp("2023-01-01")), - (["numeric_col"], decimal.Decimal("3.14")), ], ) def test_df_fillna(scalars_dfs, col, fill_value): From 2e27dff6834435acccd2733d215aab5049cb3e18 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 12 Nov 2024 19:42:28 +0000 Subject: [PATCH 4/4] update annotation --- bigframes/dataframe.py | 5 ++++- bigframes/dtypes.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f9fc0dbe96..d5cac76e9b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -752,7 +752,10 @@ def _apply_binop( ) def _apply_scalar_binop( - self, other: float | int | bool | str, op: ops.BinaryOp, reverse: bool = False + self, + other: bigframes.dtypes.LOCAL_SCALAR_TYPE, + op: ops.BinaryOp, + reverse: bool = False, ) -> DataFrame: if reverse: expr = op.as_expr( diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 98f59f7032..c71531f9f3 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -59,7 +59,7 @@ # Used when storing Null expressions DEFAULT_DTYPE = FLOAT_DTYPE -LOCAL_SCALAR_TYPES = ( +LOCAL_SCALAR_TYPE = Union[ bool, np.bool_, int, @@ -75,7 +75,8 @@ pd.Timestamp, datetime.date, datetime.time, -) +] +LOCAL_SCALAR_TYPES = typing.get_args(LOCAL_SCALAR_TYPE) # Will have a few dtype variants: simple(eg. int, string, bool), complex (eg. list, struct), and virtual (eg. micro intervals, categorical)