(spark) branch master updated: [SPARK-52356][PS] Enable divide-by-zero for boolean mod/rmod with ANSI enabled

ueshin Fri, 06 Jun 2025 11:25:33 -0700

This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 83639f598026 [SPARK-52356][PS] Enable divide-by-zero for boolean mod/rmod with ANSI enabled
83639f598026 is described below

commit 83639f59802603434f96174757dce737dc230a87
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Jun 6 11:24:50 2025 -0700

    [SPARK-52356][PS] Enable divide-by-zero for boolean mod/rmod with ANSI enabled
    
    ### What changes were proposed in this pull request?
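    Handle divide-by-zero in boolean `mod`/`rmod` when ANSI mode is enabled: a zero divisor now yields null (shown as NaN) rather than triggering the ANSI divide-by-zero error.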
    
    ### Why are the changes needed?
    Ensure pandas on Spark works well with ANSI mode on.
    Part of https://issues.apache.org/jira/browse/SPARK-52169.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. With ANSI mode on, boolean mod/rmod by zero now returns NaN, the same result as with ANSI mode off, as shown below:
    
    ```
    >>> ps.set_option("compute.fail_on_ansi_mode", False)
    >>> pser = pd.Series([True, False])
    >>> psser = ps.from_pandas(pser)
    
    >>> ps.set_option("compute.ansi_mode_support", True)
    >>> spark.conf.set("spark.sql.ansi.enabled", True)
    >>> 1 % psser
    0    0.0
    1    NaN
    dtype: float64
    
    # Same as ANSI off
    >>> spark.conf.set("spark.sql.ansi.enabled", False)
    >>> 1 % psser
    0    0.0
    1    NaN
    dtype: float64
    
    ```
    
    ### How was this patch tested?
    Unit tests, and
    
    ```sh
    (dev3.10) spark (divide_0_tests) % SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.10 --testnames "pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_mod"
    
    Running PySpark tests. Output is in /Users/xinrong.meng/spark/python/unit-tests.log
    ...
    Finished test(python3.10): pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_mod (5s)
    Tests passed in 5 seconds
    
    (dev3.10) spark (divide_0_tests) % SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.10 --testnames "pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_rmod"
    Running PySpark tests. Output is in /Users/xinrong.meng/spark/python/unit-tests.log
    ...
    Finished test(python3.10): pyspark.pandas.tests.data_type_ops.test_boolean_ops BooleanOpsTests.test_rmod (4s)
    Tests passed in 4 seconds
    ```
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #51058 from xinrong-meng/bool_mod.
    
    Authored-by: Xinrong Meng <[email protected]>
    Signed-off-by: Takuya Ueshin <[email protected]>
---
 python/pyspark/pandas/data_type_ops/boolean_ops.py | 28 ++++++++++++++++++----
 .../pandas/tests/data_type_ops/test_boolean_ops.py |  3 +--
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py 
b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index c91dcc913080..765ec6a94634 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -21,7 +21,7 @@ from typing import Any, Union
 import pandas as pd
 from pandas.api.types import CategoricalDtype
 
-from pyspark.pandas.base import column_op, IndexOpsMixin
+from pyspark.pandas.base import column_op, IndexOpsMixin, numpy_column_op
 from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
 from pyspark.pandas.data_type_ops.base import (
     DataTypeOps,
@@ -35,6 +35,7 @@ from pyspark.pandas.data_type_ops.base import (
     _is_boolean_type,
 )
 from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, 
pandas_on_spark_type
+from pyspark.pandas.utils import is_ansi_mode_enabled
 from pyspark.sql import functions as F, Column as PySparkColumn
 from pyspark.sql.types import BooleanType, StringType
 from pyspark.errors import PySparkValueError
@@ -136,13 +137,21 @@ class BooleanOps(DataTypeOps):
             raise TypeError(
                 "Modulo can not be applied to %s and the given type." % 
self.pretty_name
             )
+        spark_session = left._internal.spark_frame.sparkSession
+
+        def safe_mod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
+            if is_ansi_mode_enabled(spark_session):
+                return F.when(F.lit(right_val == 0), 
F.lit(None)).otherwise(left_col % right_val)
+            else:
+                return left_col % right_val
+
         if isinstance(right, numbers.Number):
             left = transform_boolean_operand_to_numeric(left, 
spark_type=as_spark_type(type(right)))
-            return left % right
+            return numpy_column_op(safe_mod)(left, right)
         else:
             assert isinstance(right, IndexOpsMixin)
             left = transform_boolean_operand_to_numeric(left, 
spark_type=right.spark.data_type)
-            return left % right
+            return numpy_column_op(safe_mod)(left, right)
 
     def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
         _sanitize_list_like(right)
@@ -226,7 +235,18 @@ class BooleanOps(DataTypeOps):
         _sanitize_list_like(right)
         if isinstance(right, numbers.Number) and not isinstance(right, bool):
             left = transform_boolean_operand_to_numeric(left, 
spark_type=as_spark_type(type(right)))
-            return right % left
+            spark_session = left._internal.spark_frame.sparkSession
+
+            if is_ansi_mode_enabled(spark_session):
+
+                def safe_rmod(left_col: PySparkColumn, right_val: Any) -> 
PySparkColumn:
+                    return F.when(left_col != 0, F.pmod(F.lit(right_val), 
left_col)).otherwise(
+                        F.lit(None)
+                    )
+
+                return numpy_column_op(safe_rmod)(left, right)
+            else:
+                return right % left
         else:
             raise TypeError(
                 "Modulo can not be applied to %s and the given type." % 
self.pretty_name
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py 
b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index d9bb68c6cf15..6969c527b5f7 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -137,7 +137,6 @@ class BooleanOpsTestsMixin:
         for col in self.non_numeric_df_cols:
             self.assertRaises(TypeError, lambda: b_psser // psdf[col])
 
-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_mod(self):
         pdf, psdf = self.pdf, self.psdf
 
@@ -237,7 +236,6 @@ class BooleanOpsTestsMixin:
         self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) ** 
b_psser)
         self.assertRaises(TypeError, lambda: datetime.datetime(1994, 1, 1) ** 
b_psser)
 
-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_rmod(self):
         psdf = self.psdf
 
@@ -248,6 +246,7 @@ class BooleanOpsTestsMixin:
         self.assert_eq(
             pd.Series([0.10000000000000009, 0.10000000000000009, None], 
dtype=float, name="bool"),
             0.1 % b_psser,
+            check_exact=False,  # [0.1, 0.1, nan] for pandas-on-Spark
         )
         self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % 
b_psser)
         self.assertRaises(TypeError, lambda: True % b_psser)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-52356][PS] Enable divide-by-zero for boolean mod/rmod with ANSI enabled

Reply via email to