devin-petersohn commented on code in PR #55783:
URL: https://github.com/apache/spark/pull/55783#discussion_r3226918846
##########
python/pyspark/pandas/typedef/typehints.py:
##########
@@ -329,14 +374,31 @@ def spark_type_to_pandas_dtype(
def is_str_dtype(tpe: Dtype) -> bool:
if LooseVersion(pd.__version__) < "3.0.0":
return False
+ if extension_arrow_dtypes_available and isinstance(tpe, ArrowDtype):
+ pyarrow_type = tpe.pyarrow_dtype
+ if pa.types.is_string(pyarrow_type) or
pa.types.is_large_string(pyarrow_type):
+ return True
if extension_object_dtypes_available:
return isinstance(tpe, StringDtype) and tpe.na_value is np.nan
return False
+def is_pyarrow_backed_dtype(tpe: Dtype) -> bool:
+ if extension_arrow_dtypes_available and isinstance(tpe, ArrowDtype):
+ return True
+
+ if extension_object_dtypes_available and isinstance(tpe, StringDtype):
+ storage = getattr(tpe, "storage", None)
+ return isinstance(storage, str) and storage.startswith("pyarrow") and
tpe.na_value is pd.NA
Review Comment:
`StringDtype.storage` is part of the API, so the `getattr(tpe, "storage", None)` fallback and the `isinstance(storage, str)` check feel overly defensive.
##########
python/pyspark/pandas/typedef/typehints.py:
##########
@@ -246,6 +254,24 @@ def as_spark_type(
return types.FloatType()
elif isinstance(tpe, Float64Dtype) or (isinstance(tpe, str) and
tpe == "Float64"):
return types.DoubleType()
+ if extension_arrow_dtypes_available and isinstance(tpe, ArrowDtype):
+ pyarrow_type = tpe.pyarrow_dtype
+ if pa.types.is_string(pyarrow_type) or
pa.types.is_large_string(pyarrow_type):
Review Comment:
Can we just reuse the existing functionality linked below?
https://github.com/apache/spark/blob/1360e5c7fa38df1dac55476aba983aa6012b62c4/python/pyspark/sql/pandas/types.py#L347-L350
##########
python/pyspark/pandas/data_type_ops/base.py:
##########
@@ -531,7 +544,22 @@ def ne(self, left: IndexOpsLike, right: Any) ->
SeriesOrIndex:
_sanitize_list_like(right)
if _should_return_all_false(left, right):
- left_scol = left._with_new_scol(F.lit(True))
+ use_extension_dtypes = handle_dtype_as_extension_dtype(left.dtype)
or (
+ isinstance(right, IndexOpsMixin) and
handle_dtype_as_extension_dtype(right.dtype)
+ )
+ use_arrow_dtypes = is_pyarrow_backed_dtype(left.dtype) or (
+ isinstance(right, IndexOpsMixin) and
is_pyarrow_backed_dtype(right.dtype)
+ )
+ field = left._internal.data_fields[0].copy(
+ dtype=spark_type_to_pandas_dtype(
+ BooleanType(),
+ use_extension_dtypes=use_extension_dtypes,
+ use_arrow_dtypes=use_arrow_dtypes,
+ ),
+ spark_type=BooleanType(),
+ nullable=False,
+ )
+ left_scol = left._with_new_scol(F.lit(True), field=field)
Review Comment:
This looks identical to the code block above. It should be a helper if it's
this many lines, IMO.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]