This is an automated email from the ASF dual-hosted git repository.
ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5075ea6a85f3 [SPARK-54665][PS] Fix boolean vs string comparison to match pandas behavior
5075ea6a85f3 is described below
commit 5075ea6a85f3f1689766cf08a7d5b2ce500be1fb
Author: Devin Petersohn <[email protected]>
AuthorDate: Thu Mar 5 11:18:35 2026 -0800
[SPARK-54665][PS] Fix boolean vs string comparison to match pandas behavior
### What changes were proposed in this pull request?
Move the `_should_return_all_false` type-mismatch check outside the ANSI
mode guard in `DataTypeOps.eq` and `NumericOps.eq`, and add the same check
(returning all `True`) to `DataTypeOps.ne` and `NumericOps.ne`, so it runs
regardless of `spark.sql.ansi.enabled`.
### Why are the changes needed?
This is a bug: boolean vs. string comparison doesn't match pandas behavior
when ANSI mode is off.
### Does this PR introduce _any_ user-facing change?
Yes, it fixes the bug described above.
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
Co-authored-by: Claude Opus 4
Closes #54456 from devin-petersohn/devin/fix-bool-string-comparison.
Authored-by: Devin Petersohn <[email protected]>
Signed-off-by: Takuya Ueshin <[email protected]>
---
python/pyspark/pandas/data_type_ops/base.py | 23 +++++++++++++++------
python/pyspark/pandas/data_type_ops/num_ops.py | 24 ++++++++++++++--------
.../pandas/tests/data_type_ops/test_boolean_ops.py | 6 ++++++
3 files changed, 38 insertions(+), 15 deletions(-)
diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py
index fbd6b8a30a1e..21177847a312 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -17,7 +17,7 @@
import numbers
from abc import ABCMeta
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast
from itertools import chain
import numpy as np
@@ -53,7 +53,6 @@ from pyspark.pandas.typedef.typehints import (
handle_dtype_as_extension_dtype,
spark_type_to_pandas_dtype,
)
-from pyspark.pandas.utils import is_ansi_mode_enabled
if extension_dtypes_available:
from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype
@@ -424,9 +423,14 @@ class DataTypeOps(object, metaclass=ABCMeta):
raise TypeError(">= can not be applied to %s." % self.pretty_name)
def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
- if is_ansi_mode_enabled(left._internal.spark_frame.sparkSession):
- if _should_return_all_false(left, right):
- return left._with_new_scol(F.lit(False)).rename(None) # type: ignore[attr-defined]
+ from pyspark.pandas.base import IndexOpsMixin
+
+ if _should_return_all_false(left, right):
+ left_scol = left._with_new_scol(F.lit(False))
+ if isinstance(right, IndexOpsMixin):
+ return left_scol.rename(None) # type: ignore[attr-defined]
+ else:
+ return cast(SeriesOrIndex, left_scol)
if isinstance(right, (list, tuple)):
from pyspark.pandas.series import first_series, scol_for
@@ -521,10 +525,17 @@ class DataTypeOps(object, metaclass=ABCMeta):
return column_op(PySparkColumn.__eq__)(left, right)
def ne(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
- from pyspark.pandas.base import column_op
+ from pyspark.pandas.base import column_op, IndexOpsMixin
_sanitize_list_like(right)
+ if _should_return_all_false(left, right):
+ left_scol = left._with_new_scol(F.lit(True))
+ if isinstance(right, IndexOpsMixin):
+ return left_scol.rename(None) # type: ignore[attr-defined]
+ else:
+ return cast(SeriesOrIndex, left_scol)
+
return column_op(PySparkColumn.__ne__)(left, right)
def invert(self, operand: IndexOpsLike) -> IndexOpsLike:
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
index 17f17cd76d8a..8c3b9ab66bc3 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -276,16 +276,16 @@ class NumericOps(DataTypeOps):
if not isinstance(right, IndexOpsMixin) and is_list_like(right):
return super().eq(left, right)
else:
+ if _should_return_all_false(left, right):
+ left_scol = left._with_new_scol(F.lit(False))
+ if isinstance(right, IndexOpsMixin):
+ # When comparing with another Series/Index, drop the name
+ # to align with pandas behavior
+ return left_scol.rename(None) # type: ignore[attr-defined]
+ else:
+ # When comparing with scalar-like, keep the name of left operand
+ return cast(SeriesOrIndex, left_scol)
if is_ansi_mode_enabled(left._internal.spark_frame.sparkSession):
- if _should_return_all_false(left, right):
- left_scol = left._with_new_scol(F.lit(False))
- if isinstance(right, IndexOpsMixin):
- # When comparing with another Series/Index, drop the name
- # to align with pandas behavior
- return left_scol.rename(None) # type: ignore[attr-defined]
- else:
- # When comparing with scalar-like, keep the name of left operand
- return cast(SeriesOrIndex, left_scol)
if _is_boolean_type(right): # numeric vs. bool
right = transform_boolean_operand_to_numeric(
right, spark_type=left.spark.data_type
@@ -294,6 +294,12 @@ class NumericOps(DataTypeOps):
def ne(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
+ if _should_return_all_false(left, right):
+ left_scol = left._with_new_scol(F.lit(True))
+ if isinstance(right, IndexOpsMixin):
+ return left_scol.rename(None) # type: ignore[attr-defined]
+ else:
+ return cast(SeriesOrIndex, left_scol)
return pyspark_column_op("__ne__", left, right, fillna=True)
def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index f4b069426caa..9911b5dc4976 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -364,6 +364,9 @@ class BooleanOpsTestsMixin:
psser, other_psser = psdf["this"], psdf["that"]
self.assert_eq(pser == other_pser, psser == other_psser)
self.assert_eq(pser == pser, psser == psser)
+ # SPARK-54665: boolean vs string comparison should match pandas behavior
+ self.assert_eq(pser == "True", psser == "True")
+ self.assert_eq(pser == "False", psser == "False")
def test_ne(self):
pdf, psdf = self.bool_pdf, self.bool_psdf
@@ -371,6 +374,9 @@ class BooleanOpsTestsMixin:
psser, other_psser = psdf["this"], psdf["that"]
self.assert_eq(pser != other_pser, psser != other_psser)
self.assert_eq(pser != pser, psser != psser)
+ # SPARK-54665: boolean vs string comparison should match pandas behavior
+ self.assert_eq(pser != "True", psser != "True")
+ self.assert_eq(pser != "False", psser != "False")
def test_lt(self):
pdf, psdf = self.bool_pdf, self.bool_psdf
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]