This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 188cf470727a [SPARK-55625][PS] Fix StringOps to make `str` dtype work
properly
188cf470727a is described below
commit 188cf470727a7d657619de787671e1278d40a827
Author: Takuya Ueshin <[email protected]>
AuthorDate: Mon Feb 23 07:19:08 2026 +0900
[SPARK-55625][PS] Fix StringOps to make `str` dtype work properly
### What changes were proposed in this pull request?
Fix `StringOps` to make `str` dtype work properly.
### Why are the changes needed?
In pandas 3, the default dtype for string is now `str` or
`StringDtype(na_value=np.nan)`.
This is one of the extension dtypes, but it is actually handled as if it
were a non-extension dtype.
```py
>>> pser = pd.Series(["x", "y", "z", None])
>>> other_pser = pd.Series([None, "z", "y", "x"])
```
- pandas 2
```py
>>> pser
0 x
1 y
2 z
3 None
dtype: object
>>> other_pser
0 None
1 z
2 y
3 x
dtype: object
>>> pser == other_pser
0 False
1 False
2 False
3 False
dtype: bool
```
- pandas 3
```py
>>> pser
0 x
1 y
2 z
3 NaN
dtype: str
>>> other_pser
0 NaN
1 z
2 y
3 x
dtype: str
>>> pser == other_pser
0 False
1 False
2 False
3 False
dtype: bool
```
### Does this PR introduce _any_ user-facing change?
Yes, it will behave more like pandas 3.
### How was this patch tested?
The existing tests should pass.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54413 from ueshin/issues/SPARK-55625/string_ops.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/base.py | 4 ++--
python/pyspark/pandas/data_type_ops/base.py | 13 +++++++-----
python/pyspark/pandas/data_type_ops/boolean_ops.py | 16 +++++++++------
python/pyspark/pandas/data_type_ops/num_ops.py | 24 +++++++++++++---------
python/pyspark/pandas/data_type_ops/string_ops.py | 4 ++--
python/pyspark/pandas/internal.py | 4 ++--
python/pyspark/pandas/typedef/typehints.py | 19 +++++++++++++----
7 files changed, 53 insertions(+), 31 deletions(-)
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index cd6ae554cb02..a43fd7112cbd 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -40,7 +40,7 @@ from pyspark.pandas.internal import (
SPARK_DEFAULT_INDEX_NAME,
)
from pyspark.pandas.spark.accessors import SparkIndexOpsMethods
-from pyspark.pandas.typedef import extension_dtypes
+from pyspark.pandas.typedef.typehints import handle_dtype_as_extension_dtype
from pyspark.pandas.utils import (
ansi_mode_context,
combine_frames,
@@ -228,7 +228,7 @@ def column_op(f: Callable[..., Column]) -> Callable[...,
SeriesOrIndex]:
field = InternalField.from_struct_field(
self._internal.spark_frame.select(scol).schema[0],
use_extension_dtypes=any(
- isinstance(col.dtype, extension_dtypes) for col in [self]
+ cols
+ handle_dtype_as_extension_dtype(col.dtype) for col in
[self] + cols
),
)
diff --git a/python/pyspark/pandas/data_type_ops/base.py
b/python/pyspark/pandas/data_type_ops/base.py
index 7be658b79192..fbd6b8a30a1e 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -46,11 +46,11 @@ from pyspark.sql.types import (
UserDefinedType,
)
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
-from pyspark.pandas.typedef import extension_dtypes
from pyspark.pandas.typedef.typehints import (
extension_dtypes_available,
extension_float_dtypes_available,
extension_object_dtypes_available,
+ handle_dtype_as_extension_dtype,
spark_type_to_pandas_dtype,
)
from pyspark.pandas.utils import is_ansi_mode_enabled
@@ -173,7 +173,7 @@ def _as_categorical_type(
def _as_bool_type(index_ops: IndexOpsLike, dtype: Dtype) -> IndexOpsLike:
"""Cast `index_ops` to BooleanType Spark type, given `dtype`."""
spark_type = BooleanType()
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
null_value = (
@@ -194,7 +194,7 @@ def _as_string_type(
representing null Spark column. Note that `null_str` is for non-extension
dtypes only.
"""
spark_type = StringType()
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
casted = index_ops.spark.column.cast(spark_type)
@@ -254,7 +254,7 @@ def _is_extension_dtypes(object: Any) -> bool:
Extention dtype includes Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype,
BooleanDtype,
StringDtype, Float32Dtype and Float64Dtype.
"""
- return isinstance(getattr(object, "dtype", None), extension_dtypes)
+ return handle_dtype_as_extension_dtype(getattr(object, "dtype", None))
class DataTypeOps(object, metaclass=ABCMeta):
@@ -300,7 +300,10 @@ class DataTypeOps(object, metaclass=ABCMeta):
return object.__new__(IntegralOps)
elif isinstance(spark_type, StringType):
if extension_object_dtypes_available and isinstance(dtype,
StringDtype):
- return object.__new__(StringExtensionOps)
+ if handle_dtype_as_extension_dtype(dtype):
+ return object.__new__(StringExtensionOps)
+ else:
+ return object.__new__(StringOps)
else:
return object.__new__(StringOps)
elif isinstance(spark_type, BooleanType):
diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py
b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index 8e84fe4e3da5..05797735b1ce 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -36,7 +36,11 @@ from pyspark.pandas.data_type_ops.base import (
_is_valid_for_logical_operator,
_is_boolean_type,
)
-from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes,
pandas_on_spark_type
+from pyspark.pandas.typedef.typehints import (
+ as_spark_type,
+ handle_dtype_as_extension_dtype,
+ pandas_on_spark_type,
+)
from pyspark.pandas.utils import is_ansi_mode_enabled
from pyspark.sql import functions as F, Column as PySparkColumn
from pyspark.sql.types import BooleanType, StringType
@@ -243,7 +247,7 @@ class BooleanOps(DataTypeOps):
and right is None
):
raise TypeError("AND can not be applied to given types.")
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right.__and__(left)
else:
@@ -268,7 +272,7 @@ class BooleanOps(DataTypeOps):
and right is None
):
raise TypeError("XOR can not be applied to given types.")
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right ^ left
elif _is_valid_for_logical_operator(right):
@@ -295,7 +299,7 @@ class BooleanOps(DataTypeOps):
and right is None
):
raise TypeError("OR can not be applied to given types.")
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right.__or__(left)
else:
@@ -322,7 +326,7 @@ class BooleanOps(DataTypeOps):
elif isinstance(spark_type, BooleanType):
return _as_bool_type(index_ops, dtype)
elif isinstance(spark_type, StringType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = F.when(
index_ops.spark.column.isNotNull(),
F.when(index_ops.spark.column, "True").otherwise("False"),
@@ -342,7 +346,7 @@ class BooleanOps(DataTypeOps):
else:
is_ansi =
is_ansi_mode_enabled(index_ops._internal.spark_frame.sparkSession)
if is_ansi and get_option("compute.eager_check"):
- if is_integer_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer"
% self.pretty_name
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py
b/python/pyspark/pandas/data_type_ops/num_ops.py
index 6bc8efbc2554..17f17cd76d8a 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -46,7 +46,11 @@ from pyspark.pandas.data_type_ops.base import (
_is_boolean_type,
_should_return_all_false,
)
-from pyspark.pandas.typedef.typehints import extension_dtypes,
pandas_on_spark_type, as_spark_type
+from pyspark.pandas.typedef.typehints import (
+ as_spark_type,
+ handle_dtype_as_extension_dtype,
+ pandas_on_spark_type,
+)
from pyspark.pandas.utils import is_ansi_mode_enabled
from pyspark.sql import functions as F, Column as PySparkColumn
from pyspark.sql.types import (
@@ -318,7 +322,7 @@ class IntegralOps(NumericOps):
def xor(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
- if isinstance(right, IndexOpsMixin) and isinstance(right.dtype,
extension_dtypes):
+ if isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype):
return right ^ left
elif _is_valid_for_logical_operator(right):
right_is_boolean = _is_boolean_type(right)
@@ -605,7 +609,7 @@ class FractionalOps(NumericOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
- if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if get_option("compute.eager_check") and index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
@@ -614,7 +618,7 @@ class FractionalOps(NumericOps):
if isinstance(dtype, CategoricalDtype):
return _as_categorical_type(index_ops, dtype, spark_type)
elif isinstance(spark_type, BooleanType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
scol = F.when(
@@ -666,7 +670,7 @@ class DecimalOps(FractionalOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
- if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if get_option("compute.eager_check") and index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
@@ -709,12 +713,12 @@ class IntegralExtensionOps(IntegralOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
if get_option("compute.eager_check"):
- if is_integer_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
)
- elif is_bool_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ elif is_bool_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to bool" %
self.pretty_name
@@ -738,12 +742,12 @@ class FractionalExtensionOps(FractionalOps):
def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype])
-> IndexOpsLike:
dtype, spark_type = pandas_on_spark_type(dtype)
if get_option("compute.eager_check"):
- if is_integer_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ if is_integer_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to integer" %
self.pretty_name
)
- elif is_bool_dtype(dtype) and not isinstance(dtype,
extension_dtypes):
+ elif is_bool_dtype(dtype) and not
handle_dtype_as_extension_dtype(dtype):
if index_ops.hasnans:
raise ValueError(
"Cannot convert %s with missing values to bool" %
self.pretty_name
@@ -752,7 +756,7 @@ class FractionalExtensionOps(FractionalOps):
if isinstance(dtype, CategoricalDtype):
return _as_categorical_type(index_ops, dtype, spark_type)
elif isinstance(spark_type, BooleanType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
scol = F.when(
diff --git a/python/pyspark/pandas/data_type_ops/string_ops.py
b/python/pyspark/pandas/data_type_ops/string_ops.py
index 6c8bc754ac96..e8462bf9f4f4 100644
--- a/python/pyspark/pandas/data_type_ops/string_ops.py
+++ b/python/pyspark/pandas/data_type_ops/string_ops.py
@@ -32,7 +32,7 @@ from pyspark.pandas.data_type_ops.base import (
_as_string_type,
_sanitize_list_like,
)
-from pyspark.pandas.typedef import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.typedef import handle_dtype_as_extension_dtype,
pandas_on_spark_type
from pyspark.sql.types import BooleanType
@@ -124,7 +124,7 @@ class StringOps(DataTypeOps):
return _as_categorical_type(index_ops, dtype, spark_type)
if isinstance(spark_type, BooleanType):
- if isinstance(dtype, extension_dtypes):
+ if handle_dtype_as_extension_dtype(dtype):
scol = index_ops.spark.column.cast(spark_type)
else:
scol = F.when(index_ops.spark.column.isNull(),
F.lit(False)).otherwise(
diff --git a/python/pyspark/pandas/internal.py
b/python/pyspark/pandas/internal.py
index cef8996d657b..aba46d41e200 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -50,7 +50,7 @@ from pyspark.pandas.data_type_ops.base import DataTypeOps
from pyspark.pandas.typedef import (
Dtype,
as_spark_type,
- extension_dtypes,
+ handle_dtype_as_extension_dtype,
infer_pd_series_spark_type,
spark_type_to_pandas_dtype,
)
@@ -162,7 +162,7 @@ class InternalField:
@property
def is_extension_dtype(self) -> bool:
"""Return whether the dtype for the field is an extension type or
not."""
- return isinstance(self.dtype, extension_dtypes)
+ return handle_dtype_as_extension_dtype(self.dtype)
def normalize_spark_type(self) -> "InternalField":
"""Return a new InternalField object with normalized Spark data
type."""
diff --git a/python/pyspark/pandas/typedef/typehints.py
b/python/pyspark/pandas/typedef/typehints.py
index 975147042aaf..99249906089e 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -274,10 +274,7 @@ def spark_type_to_pandas_dtype(
return BooleanDtype()
# StringType
elif isinstance(spark_type, types.StringType):
- if LooseVersion(pd.__version__) < "3.0.0":
- return StringDtype()
- else:
- return StringDtype(na_value=np.nan)
+ return StringDtype()
# FractionalType
if extension_float_dtypes_available:
@@ -286,6 +283,10 @@ def spark_type_to_pandas_dtype(
elif isinstance(spark_type, types.DoubleType):
return Float64Dtype()
+ if LooseVersion(pd.__version__) >= "3.0.0":
+ if extension_object_dtypes_available and isinstance(spark_type,
types.StringType):
+ return StringDtype(na_value=np.nan)
+
if isinstance(
spark_type,
(
@@ -318,6 +319,16 @@ def spark_type_to_pandas_dtype(
)
+def handle_dtype_as_extension_dtype(tpe: Dtype) -> bool:
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return isinstance(tpe, extension_dtypes)
+
+ if extension_object_dtypes_available:
+ if isinstance(tpe, StringDtype):
+ return tpe.na_value is pd.NA
+ return isinstance(tpe, extension_dtypes)
+
+
def pandas_on_spark_type(tpe: Union[str, type, Dtype]) -> Tuple[Dtype,
types.DataType]:
"""
Convert input into a pandas only dtype object or a numpy dtype object,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]