This is an automated email from the ASF dual-hosted git repository.
ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 83639f598026 [SPARK-52356][PS] Enable divide-by-zero for boolean
mod/rmod with ANSI enabled
83639f598026 is described below
commit 83639f59802603434f96174757dce737dc230a87
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Jun 6 11:24:50 2025 -0700
[SPARK-52356][PS] Enable divide-by-zero for boolean mod/rmod with ANSI
enabled
### What changes were proposed in this pull request?
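Enable divide-by-zero handling for boolean `mod`/`rmod` when ANSI mode is enabled, so that a zero divisor yields a null (NaN) result, as it does with ANSI mode off.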
### Why are the changes needed?
Ensure pandas on Spark works well with ANSI mode on.
Part of https://issues.apache.org/jira/browse/SPARK-52169.
### Does this PR introduce _any_ user-facing change?
Yes. With ANSI mode on, boolean mod/rmod with a zero divisor now returns NaN, the same as with ANSI mode off, as shown below:
```
>>> ps.set_option("compute.fail_on_ansi_mode", False)
>>> pser = pd.Series([True, False])
>>> psser = ps.from_pandas(pser)
>>> ps.set_option("compute.ansi_mode_support", True)
>>> spark.conf.set("spark.sql.ansi.enabled", True)
>>> 1 % psser
0 0.0
1 NaN
dtype: float64
# Same as ANSI off
>>> spark.conf.set("spark.sql.ansi.enabled", False)
>>> 1 % psser
0 0.0
1 NaN
dtype: float64
```
### How was this patch tested?
Unit tests, and manual test runs with ANSI mode enabled:
```sh
(dev3.10) spark (divide_0_tests) % SPARK_ANSI_SQL_MODE=true
./python/run-tests --python-executables=python3.10 --testnames
"pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_mod"
Running PySpark tests. Output is in
/Users/xinrong.meng/spark/python/unit-tests.log
...
Finished test(python3.10):
pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_mod
(5s)
Tests passed in 5 seconds
(dev3.10) spark (divide_0_tests) % SPARK_ANSI_SQL_MODE=true
./python/run-tests --python-executables=python3.10 --testnames
"pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_rmod"
Running PySpark tests. Output is in
/Users/xinrong.meng/spark/python/unit-tests.log
...
Finished test(python3.10):
pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_rmod
(4s)
Tests passed in 4 seconds
```
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #51058 from xinrong-meng/bool_mod.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Takuya Ueshin <[email protected]>
---
python/pyspark/pandas/data_type_ops/boolean_ops.py | 28 ++++++++++++++++++----
.../pandas/tests/data_type_ops/test_boolean_ops.py | 3 +--
2 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py
b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index c91dcc913080..765ec6a94634 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -21,7 +21,7 @@ from typing import Any, Union
import pandas as pd
from pandas.api.types import CategoricalDtype
-from pyspark.pandas.base import column_op, IndexOpsMixin
+from pyspark.pandas.base import column_op, IndexOpsMixin, numpy_column_op
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
from pyspark.pandas.data_type_ops.base import (
DataTypeOps,
@@ -35,6 +35,7 @@ from pyspark.pandas.data_type_ops.base import (
_is_boolean_type,
)
from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes,
pandas_on_spark_type
+from pyspark.pandas.utils import is_ansi_mode_enabled
from pyspark.sql import functions as F, Column as PySparkColumn
from pyspark.sql.types import BooleanType, StringType
from pyspark.errors import PySparkValueError
@@ -136,13 +137,21 @@ class BooleanOps(DataTypeOps):
raise TypeError(
"Modulo can not be applied to %s and the given type." %
self.pretty_name
)
+ spark_session = left._internal.spark_frame.sparkSession
+
+ def safe_mod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
+ if is_ansi_mode_enabled(spark_session):
+ return F.when(F.lit(right_val == 0),
F.lit(None)).otherwise(left_col % right_val)
+ else:
+ return left_col % right_val
+
if isinstance(right, numbers.Number):
left = transform_boolean_operand_to_numeric(left,
spark_type=as_spark_type(type(right)))
- return left % right
+ return numpy_column_op(safe_mod)(left, right)
else:
assert isinstance(right, IndexOpsMixin)
left = transform_boolean_operand_to_numeric(left,
spark_type=right.spark.data_type)
- return left % right
+ return numpy_column_op(safe_mod)(left, right)
def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
@@ -226,7 +235,18 @@ class BooleanOps(DataTypeOps):
_sanitize_list_like(right)
if isinstance(right, numbers.Number) and not isinstance(right, bool):
left = transform_boolean_operand_to_numeric(left,
spark_type=as_spark_type(type(right)))
- return right % left
+ spark_session = left._internal.spark_frame.sparkSession
+
+ if is_ansi_mode_enabled(spark_session):
+
+ def safe_rmod(left_col: PySparkColumn, right_val: Any) ->
PySparkColumn:
+ return F.when(left_col != 0, F.pmod(F.lit(right_val),
left_col)).otherwise(
+ F.lit(None)
+ )
+
+ return numpy_column_op(safe_rmod)(left, right)
+ else:
+ return right % left
else:
raise TypeError(
"Modulo can not be applied to %s and the given type." %
self.pretty_name
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index d9bb68c6cf15..6969c527b5f7 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -137,7 +137,6 @@ class BooleanOpsTestsMixin:
for col in self.non_numeric_df_cols:
self.assertRaises(TypeError, lambda: b_psser // psdf[col])
- @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
def test_mod(self):
pdf, psdf = self.pdf, self.psdf
@@ -237,7 +236,6 @@ class BooleanOpsTestsMixin:
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) **
b_psser)
self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) **
b_psser)
- @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
def test_rmod(self):
psdf = self.psdf
@@ -248,6 +246,7 @@ class BooleanOpsTestsMixin:
self.assert_eq(
pd.Series([0.10000000000000009, 0.10000000000000009, None],
dtype=float, name="bool"),
0.1 % b_psser,
+ check_exact=False, # [0.1, 0.1, nan] for pandas-on-Spark
)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) %
b_psser)
self.assertRaises(TypeError, lambda: True % b_psser)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]