This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 188cf470727a [SPARK-55625][PS] Fix StringOps to make `str` dtype work
properly
188cf470727a is described below
commit 188cf470727a7d657619de787671e1278d40a827
Author: Takuya Ueshin <[email protected]>
AuthorDate: Mon Feb 23 07:19:08 2026 +0900
[SPARK-55625][PS] Fix StringOps to make `str` dtype work properly
### What changes were proposed in this pull request?
Fix `StringOps` to make `str` dtype work properly.
### Why are the changes needed?
In pandas 3, the default dtype for string is now `str` or
`StringDtype(na_value=np.nan)`.
This is one of the extension dtypes, but it is actually handled as if it
were a non-extension dtype.
```py
>>> pser = pd.Series(["x", "y", "z", None])
>>> other_pser = pd.Series([None, "z", "y", "x"])
```
- pandas 2
```py
>>> pser
0 x
1 y
2 z
3 None
dtype: object
>>> other_pser
0 None
1 z
2 y
3 x
dtype: object
>>> pser == other_pser
0 False
1 False
2 False
3 False
dtype: bool
```
- pandas 3
```py
>>> pser
0 x
1 y
2 z
3 NaN
dtype: str
>>> other_pser
0 NaN
1 z
2 y
3 x
dtype: str
>>> pser == other_pser
0 False
1 False
2 False
3 False
dtype: bool
```
### Does this PR introduce _any_ user-facing change?
Yes, it will behave more like pandas 3.
### How was this patch tested?
The existing tests should pass.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54413 from ueshin/issues/SPARK-55625/string_ops.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/base.py | 4 ++--
python/pyspark/pandas/data_type_ops/base.py | 13 +++++++-----
python/pyspark/pandas/data_type_ops/boolean_ops.py | 16 +++++++++------
python/pyspark/pandas/data_type_ops/num_ops.py | 24 +++++++++++++---------
python/pyspark/pandas/data_type_ops/string_ops.py | 4 ++--
python/pyspark/pandas/internal.py | 4 ++--
python/pyspark/pandas/typedef/typehints.py | 19 +++++++++++++----
7 files changed, 53 insertions(+), 31 deletions(-)
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index cd6ae554cb02..a43fd7112cbd 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -40,7 +40,7 @@ from pyspark.pandas.internal import (
SPARK_DEFAULT_INDEX_NAME,
)
from pyspark.pandas.spark.accessors import SparkIndexOpsMethods
-from pyspark.pandas.typedef import extension_dtypes
+from pyspark.pandas.typedef.typehints import handle_dtype_as_extension_dtype
from pyspark.pandas.utils import (
ansi_mode_context,
combine_frames,
@@ -228,7 +228,7 @@ def column_op(f: Callable[..., Column]) -> Callable[...,
SeriesOrIndex]:
field = InternalField.from_struct_field(
self._internal.spark_frame.select(scol).schema[0],
use_extension_dtypes=any(
- isinstance(col.dtype, extension_dtypes) for col in [self]
+ cols
+ handle_dtype_as_extension_dtype(col.dtype) for col in
[self] + cols
),
)
diff --git a/python/pyspark/pandas/data_type_ops/base.py
b/python/pyspark/pandas/data_type_ops/base.py
index 7be658b79192..fbd6b8a30a1e 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -46,11 +46,11 @@ from pyspark.sql.types import (
UserDefinedType,
)
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
-from pyspark.pandas.typedef import extension_dtypes
from pyspark.pandas.typedef.typehints import (
extension_dtypes_available,
extension_float_dtypes_available,
extension_object_dtypes_available,
+ handle_dtype_as_extension_dtype,
spark_type_to_pandas_dtype,
)
from pyspark.pandas.utils import is_ansi_mode_enabled
@@ -173,7 +173,7 @@ def _as_categorical_type(
def _as_bool_type(index_ops: IndexOpsLike, dtype: Dtype) -> IndexOpsLike:
"""Cast `index_ops` to BooleanType Spark type, given `dtype`."""
spark_type = BooleanType()
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
null_value = (
@@ -194,7 +194,7 @@ def _as_string_type(
representing null Spark column. Note that `null_str` is for non-extension
dtypes only.
"""
spark_type = StringType()
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
casted = index_ops.spark.column.cast(spark_type)
@@ -254,7 +254,7 @@ def _is_extension_dtypes(object: Any) -> bool:
Extention dtype includes Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype,
BooleanDtype,
StringDtype, Float32Dtype and Float64Dtype.
"""
- return isinstance(getattr(object, "dtype", None), extension_dtypes)
+ return handle_dtype_as_extension_dtype(getattr(object, "dtype", None))
class DataTypeOps(object, metaclass=ABCMeta):
@@ -300,7 +300,10 @@ class DataTypeOps(object, metaclass=ABCMeta):
return object.__new__(IntegralOps)
elif isinstance(spark_type, StringType):
if extension_object_dtypes_available and isinstance(dtype,
StringDtype):
- return object.__new__(StringExtensionOps)
+ if handle_dtype_as_extension_dtype(dtype):
+ return object.__new__(StringExtensionOps)
+ else:
+ return object.__new__(StringOps)
else:
return object.__new__(StringOps)
elif isinstance(spark_type, BooleanType):
diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py
b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index 8e84fe4e3da5..05797735b1ce 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -36,7 +36,11 @@ from pyspark.pandas.data_type_ops.base import (
_is_valid_for_logical_operator,
_is_boolean_type,
)
-from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes,
pandas_on_spark_type
+from pyspark.pandas.typedef.typehints import (
+ as_spark_type,
+ handle_dtype_as_extension_dtype,
+ pandas_on_spark_type,
+)
from pyspark.pandas.utils import is_ansi_mode_enabled
from pyspark.sql import functions as F, Column as PySparkColumn
from pyspark.sql.types import BooleanType, StringType
@@ -243,7 +247,7 @@ class BooleanOps(DataTypeOps):
and right is None
):
raise TypeError("AND can not be applied to given types.")
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right.__and__(left)
else:
@@ -268,7 +272,7 @@ class BooleanOps(DataTypeOps):
and right is None
):
raise TypeError("XOR can not be applied to given types.")
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right ^ left
elif _is_valid_for_logical_operator(right):
@@ -295,7 +299,7 @@ class BooleanOps(DataTypeOps):
and right is None
):
raise TypeError("OR can not be applied to given types.")
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right.__or__(left)
else:
@@ -322,7 +326,7 @@ class BooleanOps(DataTypeOps):
elif isinstance(spark_type, BooleanType):
return _as_bool_type(index_ops, dtype)
elif isinstance(spark_type, StringType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = F.when(
index_ops.spark.column.isNotNull(),
F.when(index_ops.spark.column, "True").otherwise("False"),
@@ -342,7 +346,7 @@ class BooleanOps(DataTypeOps):
else:
is_ansi =
is_ansi_mode_enabled(index_ops._internal.spark_frame.sparkSession)
if is_ansi and get_option("compute.eager_check"):
- if is_integer_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer"
% self.pretty_name
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py
b/python/pyspark/pandas/data_type_ops/num_ops.py
index 6bc8efbc2554..17f17cd76d8a 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -46,7 +46,11 @@ from pyspark.pandas.data_type_ops.base import (
_is_boolean_type,
_should_return_all_false,
)
-from pyspark.pandas.typedef.typehints import extension_dtypes,
pandas_on_spark_type, as_spark_type
+from pyspark.pandas.typedef.typehints import (
+ as_spark_type,
+ handle_dtype_as_extension_dtype,
+ pandas_on_spark_type,
+)
from pyspark.pandas.utils import is_ansi_mode_enabled
from pyspark.sql import functions as F, Column as PySparkColumn
from pyspark.sql.types import (
@@ -318,7 +322,7 @@ class IntegralOps(NumericOps):
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right ^ left
elif _is_valid_for_logical_operator(right):
right_is_boolean = _is_boolean_type(right)
@@ -605,7 +609,7 @@ class FractionalOps(NumericOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
- if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if get_option("compute.eager_check") and index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
@@ -614,7 +618,7 @@ class FractionalOps(NumericOps):
if isinstance(dtype, CategoricalDtype):
return _as_categorical_type(index_ops, dtype, spark_type)
elif isinstance(spark_type, BooleanType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
scol = F.when(
@@ -666,7 +670,7 @@ class DecimalOps(FractionalOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
- if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if get_option("compute.eager_check") and index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
@@ -709,12 +713,12 @@ class IntegralExtensionOps(IntegralOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
if get_option("compute.eager_check"):
- if is_integer_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
)
- elif is_bool_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ elif is_bool_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to bool" %
self.pretty_name
@@ -738,12 +742,12 @@ class FractionalExtensionOps(FractionalOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
if get_option("compute.eager_check"):
- if is_integer_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
)
- elif is_bool_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ elif is_bool_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to bool" %
self.pretty_name
@@ -752,7 +756,7 @@ class FractionalExtensionOps(FractionalOps):
if isinstance(dtype, CategoricalDtype):
return _as_categorical_type(index_ops, dtype, spark_type)
elif isinstance(spark_type, BooleanType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
scol = F.when(
diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py
b/python/pyspark/pandas/data_type_ops/string_ops.py
index 6c8bc754ac96..e8462bf9f4f4 100644
--- a/python/pyspark/pandas/data_type_ops/string_ops.py
+++ b/python/pyspark/pandas/data_type_ops/string_ops.py
@@ -32,7 +32,7 @@ from pyspark.pandas.data_type_ops.base import (
_as_string_type,
_sanitize_list_like,
)
-from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.typedef import handle_dtype_as_extension_dtype,
pandas_on_spark_type
from pyspark.sql.types import BooleanType
@@ -124,7 +124,7 @@ class StringOps(DataTypeOps):
return _as_categorical_type(index_ops, dtype, spark_type)
if isinstance(spark_type, BooleanType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
scol = F.when(index_ops.spark.column.isNull(),
F.lit(False)).otherwise(
diff --git a/python/pyspark/pandas/internal.py
b/python/pyspark/pandas/internal.py
index cef8996d657b..aba46d41e200 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -50,7 +50,7 @@ from pyspark.pandas.data_type_ops.base import DataTypeOps
from pyspark.pandas.typedef import (
Dtype,
as_spark_type,
- extension_dtypes,
+ handle_dtype_as_extension_dtype,
infer_pd_series_spark_type,
spark_type_to_pandas_dtype,
)
@@ -162,7 +162,7 @@ class InternalField:
@property
def is_extension_dtype(self) -> bool:
"""Return whether the dtype for the field is an extension type or
not."""
- return isinstance(self.dtype, extension_dtypes)
+ return handle_dtype_as_extension_dtype(self.dtype)
def normalize_spark_type(self) -> "InternalField":
"""Return a new InternalField object with normalized Spark data
type."""
diff --git a/python/pyspark/pandas/typedef/typehints.py
b/python/pyspark/pandas/typedef/typehints.py
index 975147042aaf..99249906089e 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -274,10 +274,7 @@ def spark_type_to_pandas_dtype(
return BooleanDtype()
# StringType
elif isinstance(spark_type, types.StringType):
- if LooseVersion(pd.__version__) < "3.0.0":
- return StringDtype()
- else:
- return StringDtype(na_value=np.nan)
+ return StringDtype()
# FractionalType
if extension_float_dtypes_available:
@@ -286,6 +283,10 @@ def spark_type_to_pandas_dtype(
elif isinstance(spark_type, types.DoubleType):
return Float64Dtype()
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if extension_object_dtypes_available and isinstance(spark_type,
types.StringType):
+ return StringDtype(na_value=np.nan)
+
if isinstance(
spark_type,
(
@@ -318,6 +319,16 @@ def spark_type_to_pandas_dtype(
)
+def handle_dtype_as_extension_dtype(tpe: Dtype) -> bool:
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return isinstance(tpe, extension_dtypes)
+
+ if extension_object_dtypes_available:
+ if isinstance(tpe, StringDtype):
+ return tpe.na_value is pd.NA
+ return isinstance(tpe, extension_dtypes)
+
+
def pandas_on_spark_type(tpe: Union[str, type, Dtype]) -> Tuple[Dtype,
types.DataType]:
"""
Convert input into a pandas only dtype object or a numpy dtype object,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]