Skip to content

Commit 7f22899

Browse files
committed
[SPARK-52570][PS] Enable divide-by-zero for numeric rmod with ANSI enabled
### What changes were proposed in this pull request?
Enable divide-by-zero for numeric rmod with ANSI enabled

### Why are the changes needed?
Part of https://issues.apache.org/jira/browse/SPARK-52169.

### Does this PR introduce _any_ user-facing change?
Yes.
```py
>>> ps.set_option("compute.fail_on_ansi_mode", False)
>>> ps.set_option("compute.ansi_mode_support", True)
>>> pdf = pd.DataFrame({"a": [0], "b": [False]})
>>> pdf.dtypes
a    int64
b     bool
dtype: object
>>> psdf = ps.from_pandas(pdf)
>>> 1 % psdf["a"]
0   NaN
Name: a, dtype: float64
>>> 1 % psdf["b"]
0   NaN
Name: b, dtype: float64
```

### How was this patch tested?
Unit tests.
```
(dev3.11) spark (bool_mod_new) % SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_boolean_ops
...
Tests passed in 4 seconds
```

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #51275 from xinrong-meng/bool_mod_new.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
1 parent 69c3eb9 commit 7f22899

File tree

4 files changed

+24
-37
lines changed

4 files changed

+24
-37
lines changed

python/pyspark/pandas/data_type_ops/boolean_ops.py

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import pandas as pd
2222
from pandas.api.types import CategoricalDtype
2323

24-
from pyspark.pandas.base import column_op, IndexOpsMixin, numpy_column_op
24+
from pyspark.pandas.base import column_op, IndexOpsMixin
2525
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
2626
from pyspark.pandas.data_type_ops.base import (
2727
DataTypeOps,
@@ -35,7 +35,6 @@
3535
_is_boolean_type,
3636
)
3737
from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, pandas_on_spark_type
38-
from pyspark.pandas.utils import is_ansi_mode_enabled
3938
from pyspark.sql import functions as F, Column as PySparkColumn
4039
from pyspark.sql.types import BooleanType, StringType
4140
from pyspark.errors import PySparkValueError
@@ -137,21 +136,13 @@ def mod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
137136
raise TypeError(
138137
"Modulo can not be applied to %s and the given type." % self.pretty_name
139138
)
140-
spark_session = left._internal.spark_frame.sparkSession
141-
142-
def safe_mod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
143-
if is_ansi_mode_enabled(spark_session):
144-
return F.when(F.lit(right_val == 0), F.lit(None)).otherwise(left_col % right_val)
145-
else:
146-
return left_col % right_val
147-
148139
if isinstance(right, numbers.Number):
149140
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
150-
return numpy_column_op(safe_mod)(left, right)
141+
return left % right
151142
else:
152143
assert isinstance(right, IndexOpsMixin)
153144
left = transform_boolean_operand_to_numeric(left, spark_type=right.spark.data_type)
154-
return numpy_column_op(safe_mod)(left, right)
145+
return left % right
155146

156147
def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
157148
_sanitize_list_like(right)
@@ -235,18 +226,7 @@ def rmod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
235226
_sanitize_list_like(right)
236227
if isinstance(right, numbers.Number) and not isinstance(right, bool):
237228
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
238-
spark_session = left._internal.spark_frame.sparkSession
239-
240-
if is_ansi_mode_enabled(spark_session):
241-
242-
def safe_rmod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
243-
return F.when(left_col != 0, F.pmod(F.lit(right_val), left_col)).otherwise(
244-
F.lit(None)
245-
)
246-
247-
return numpy_column_op(safe_rmod)(left, right)
248-
else:
249-
return right % left
229+
return right % left
250230
else:
251231
raise TypeError(
252232
"Modulo can not be applied to %s and the given type." % self.pretty_name

python/pyspark/pandas/data_type_ops/num_ops.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -160,12 +160,21 @@ def rmod(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
160160
_sanitize_list_like(right)
161161
if not isinstance(right, numbers.Number):
162162
raise TypeError("Modulo can not be applied to given types.")
163-
164-
def rmod(left: PySparkColumn, right: Any) -> PySparkColumn:
165-
return ((right % left) + left) % left
163+
spark_session = left._internal.spark_frame.sparkSession
166164

167165
right = transform_boolean_operand_to_numeric(right)
168-
return column_op(rmod)(left, right)
166+
167+
def safe_rmod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
    """Build the Spark expression for ``right_val % left_col`` (reflected modulo).

    Spark's ``%`` follows Java-style remainder semantics, so the result is
    re-normalized with ``((r % l) + l) % l`` to obtain Python-style modulo.

    Parameters
    ----------
    left_col : PySparkColumn
        Spark column of the series operand; it acts as the divisor.
    right_val : Any
        The scalar on the left-hand side of ``%``; it acts as the dividend.

    Returns
    -------
    PySparkColumn
        The modulo expression. When ANSI mode is enabled, rows whose divisor
        is zero yield null instead of evaluating the modulo.
    """
    if is_ansi_mode_enabled(spark_session):
        # Java-style modulo -> Python-style modulo.
        # Guard the zero divisor explicitly so those rows become null rather
        # than evaluating the modulo under ANSI mode.
        python_style_mod = ((F.lit(right_val) % left_col) + left_col) % left_col
        return F.when(left_col != 0, python_style_mod).otherwise(F.lit(None))

    # Use this helper's own arguments here. The enclosing scope's ``left`` is
    # the pandas-on-Spark operand, not a Spark Column, so ``right % left``
    # would dispatch back to the series-level operator instead of building a
    # Column expression.
    return ((right_val % left_col) + left_col) % left_col
176+
177+
return column_op(safe_rmod)(left, right)
169178

170179
def neg(self, operand: IndexOpsLike) -> IndexOpsLike:
171180
return operand._with_new_scol(-operand.spark.column, field=operand._internal.data_fields[0])

python/pyspark/pandas/tests/computation/test_binary_ops.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,11 +225,12 @@ def test_binary_operator_floordiv(self):
225225

226226
def test_binary_operator_mod(self):
227227
# Positive
228-
pdf = pd.DataFrame({"a": [3], "b": [2]})
228+
pdf = pd.DataFrame({"a": [3], "b": [2], "c": [0]})
229229
psdf = ps.from_pandas(pdf)
230230

231231
self.assert_eq(psdf["a"] % psdf["b"], pdf["a"] % pdf["b"])
232232
self.assert_eq(psdf["a"] % 0, pdf["a"] % 0)
233+
self.assert_eq(1 % psdf["c"], 1 % pdf["c"])
233234

234235
# Negative
235236
psdf = ps.DataFrame({"a": ["x"], "b": [1]})

python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -235,16 +235,13 @@ def test_rpow(self):
235235

236236
def test_rmod(self):
237237
psdf = self.psdf
238+
pdf = self.pdf
238239

239240
b_psser = psdf["bool"]
240-
# 1 % False is 0.0 in pandas
241-
self.assert_eq(pd.Series([0, 0, None], dtype=float, name="bool"), 1 % b_psser)
242-
# 0.1 / True is 0.1 in pandas
243-
self.assert_eq(
244-
pd.Series([0.10000000000000009, 0.10000000000000009, None], dtype=float, name="bool"),
245-
0.1 % b_psser,
246-
check_exact=False, # [0.1, 0.1, nan] for pandas-on-Spark
247-
)
241+
b_pser = pdf["bool"]
242+
self.assert_eq(1 % b_pser.astype(float), 1 % b_psser)
243+
# Allow float precision diff: pandas: 0.10000000000000009; pandas on spark: 0.1
244+
self.assert_eq(0.1 % b_pser, 0.1 % b_psser, almost=True)
248245
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % b_psser)
249246
self.assertRaises(TypeError, lambda: True % b_psser)
250247

0 commit comments

Comments
 (0)