This is an automated email from the ASF dual-hosted git repository. xinrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 6ab297b0fc7d [SPARK-52519][PS] Enable divide-by-zero for numeric floordiv with ANSI enabled 6ab297b0fc7d is described below commit 6ab297b0fc7d13da24154c958b946a34a0c552b7 Author: Xinrong Meng <xinr...@apache.org> AuthorDate: Fri Jun 20 13:38:40 2025 -0700 [SPARK-52519][PS] Enable divide-by-zero for numeric floordiv with ANSI enabled ### What changes were proposed in this pull request? Enable divide-by-zero for numeric floordiv with ANSI enabled ### Why are the changes needed? Ensure pandas on Spark works well with ANSI mode on. Part of https://issues.apache.org/jira/browse/SPARK-52169. ### Does this PR introduce _any_ user-facing change? Yes. ```py >>> spark.conf.get("spark.sql.ansi.enabled") 'true' >>> ps.set_option("compute.fail_on_ansi_mode", False) >>> ps.set_option("compute.ansi_mode_support", True) >>> ps.Series([1, 2]) // 0 0 inf 1 inf dtype: float64 >>> ps.Series([1, 2]) // ps.Series([0, 0]) 0 inf 1 inf dtype: float64 ``` ### How was this patch tested? Unit tests. ``` (dev3.10) spark (num_floordiv) % SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.10 --testnames "pyspark.pandas.tests.computation.test_binary_ops FrameBinaryOpsTests.test_binary_operator_floordiv" ... Tests passed in 6 seconds (dev3.10) spark (num_floordiv) % SPARK_ANSI_SQL_MODE=false ./python/run-tests --python-executables=python3.10 --testnames "pyspark.pandas.tests.computation.test_binary_ops FrameBinaryOpsTests.test_binary_operator_floordiv" ... Tests passed in 4 seconds ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51209 from xinrong-meng/num_floordiv. 
Lead-authored-by: Xinrong Meng <xinr...@apache.org> Co-authored-by: Xinrong Meng <xinrong.apa...@gmail.com> Signed-off-by: Xinrong Meng <xinr...@apache.org> --- python/pyspark/pandas/data_type_ops/num_ops.py | 24 +++++++++++++++++++--- .../pandas/tests/computation/test_binary_ops.py | 6 +++++- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py index 34d313af8232..06622ef71d88 100644 --- a/python/pyspark/pandas/data_type_ops/num_ops.py +++ b/python/pyspark/pandas/data_type_ops/num_ops.py @@ -16,7 +16,7 @@ # import numbers -from typing import Any, Union +from typing import Any, Union, Callable import numpy as np import pandas as pd @@ -271,13 +271,22 @@ class IntegralOps(NumericOps): _sanitize_list_like(right) if not is_valid_operand_for_numeric_arithmetic(right): raise TypeError("Floor division can not be applied to given types.") + spark_session = left._internal.spark_frame.sparkSession + use_try_divide = is_ansi_mode_enabled(spark_session) + + def fallback_div(x: PySparkColumn, y: PySparkColumn) -> PySparkColumn: + return x.__div__(y) + + safe_div: Callable[[PySparkColumn, PySparkColumn], PySparkColumn] = ( + F.try_divide if use_try_divide else fallback_div + ) def floordiv(left: PySparkColumn, right: Any) -> PySparkColumn: return F.when(F.lit(right is np.nan), np.nan).otherwise( F.when( F.lit(right != 0) | F.lit(right).isNull(), F.floor(left.__div__(right)), - ).otherwise(F.lit(np.inf).__div__(left)) + ).otherwise(safe_div(F.lit(np.inf), left)) ) right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type) @@ -369,6 +378,15 @@ class FractionalOps(NumericOps): _sanitize_list_like(right) if not is_valid_operand_for_numeric_arithmetic(right): raise TypeError("Floor division can not be applied to given types.") + spark_session = left._internal.spark_frame.sparkSession + use_try_divide = is_ansi_mode_enabled(spark_session) + + def fallback_div(x: PySparkColumn, y: PySparkColumn) -> PySparkColumn: + return x.__div__(y) + + safe_div: Callable[[PySparkColumn, PySparkColumn], PySparkColumn] = ( + F.try_divide if use_try_divide else fallback_div + ) def floordiv(left: PySparkColumn, right: Any) -> PySparkColumn: return F.when(F.lit(right is np.nan), np.nan).otherwise( @@ -377,7 +395,7 @@ class FractionalOps(NumericOps): F.floor(left.__div__(right)), ).otherwise( F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise( - F.lit(np.inf).__div__(left) + safe_div(F.lit(np.inf), left) ) ) ) diff --git a/python/pyspark/pandas/tests/computation/test_binary_ops.py b/python/pyspark/pandas/tests/computation/test_binary_ops.py index 3c9b7293d5d5..cda9958ad3de 100644 --- a/python/pyspark/pandas/tests/computation/test_binary_ops.py +++ b/python/pyspark/pandas/tests/computation/test_binary_ops.py @@ -208,7 +208,11 @@ class FrameBinaryOpsMixin: self.assertRaisesRegex(TypeError, ks_err_msg, lambda: 1 / psdf["a"]) def test_binary_operator_floordiv(self): - psdf = ps.DataFrame({"a": ["x"], "b": [1]}) + pdf = pd.DataFrame({"a": ["x"], "b": [1], "c": [1.0], "d": [0]}) + psdf = ps.from_pandas(pdf) + self.assert_eq(pdf["b"] // 0, psdf["b"] // 0) + self.assert_eq(pdf["c"] // 0, psdf["c"] // 0) + self.assert_eq(pdf["d"] // 0, psdf["d"] // 0) ks_err_msg = "Floor division can not be applied to strings" self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] // psdf["b"]) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org