ueshin commented on code in PR #41503:
URL: https://github.com/apache/spark/pull/41503#discussion_r1242956904
##########
python/pyspark/sql/pandas/serializers.py:
##########
@@ -190,7 +190,7 @@ def arrow_to_pandas(self, arrow_column,
struct_in_pandas="dict", ndarray_as_list
)
return converter(s)
- def _create_array(self, series, arrow_type, spark_type=None):
+ def _create_array(self, series, arrow_type, spark_type=None,
arrow_cast=False):
Review Comment:
I'm wondering whether this needs to take the argument at all: since it is a
method, `self` should already have `_arrow_cast`.
##########
python/pyspark/sql/pandas/serializers.py:
##########
@@ -234,18 +237,33 @@ def _create_array(self, series, arrow_type,
spark_type=None):
)
raise PySparkTypeError(error_msg % (series.dtype, series.name,
arrow_type)) from e
except ValueError as e:
- error_msg = (
- "Exception thrown when converting pandas.Series (%s) "
- "with name '%s' to Arrow Array (%s)."
- )
- if self._safecheck:
- error_msg = error_msg + (
- " It can be caused by overflows or other "
- "unsafe conversions warned by Arrow. Arrow safe type check
"
- "can be disabled by using SQL config "
- "`spark.sql.execution.pandas.convertToArrowArraySafely`."
+
+ def _raise_exception(e):
+ error_msg = (
+ "Exception thrown when converting pandas.Series (%s) "
+ "with name '%s' to Arrow Array (%s)."
)
- raise PySparkValueError(error_msg % (series.dtype, series.name,
arrow_type)) from e
+ if self._safecheck:
+ error_msg = error_msg + (
+ " It can be caused by overflows or other "
+ "unsafe conversions warned by Arrow. Arrow safe type
check "
+ "can be disabled by using SQL config "
+
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
+ )
+ raise PySparkValueError(error_msg % (series.dtype,
series.name, arrow_type)) from e
+
+ if arrow_cast:
+
+ def force_cast(array: pa.Array):
+ try:
+ return array.cast(target_type=arrow_type,
safe=self._safecheck)
+ except Exception as ee:
+ _raise_exception(ee)
+
+ array = pa.Array.from_pandas(series, mask=mask,
safe=self._safecheck)
+ return force_cast(array)
Review Comment:
I guess we can do this around line 232? I'm wondering what happens if we do
so:
```py
try:
    if self._arrow_cast:
        return pa.Array.from_pandas(
            series, mask=mask, safe=self._safecheck
        ).cast(target_type=arrow_type, safe=self._safecheck)
    else:
        return pa.Array.from_pandas(
            series, mask=mask, type=arrow_type, safe=self._safecheck
        )
except TypeError as e:
    ...
```
Will it introduce breaking changes?
##########
python/pyspark/sql/pandas/serializers.py:
##########
@@ -234,18 +237,33 @@ def _create_array(self, series, arrow_type,
spark_type=None):
)
raise PySparkTypeError(error_msg % (series.dtype, series.name,
arrow_type)) from e
except ValueError as e:
- error_msg = (
- "Exception thrown when converting pandas.Series (%s) "
- "with name '%s' to Arrow Array (%s)."
- )
- if self._safecheck:
- error_msg = error_msg + (
- " It can be caused by overflows or other "
- "unsafe conversions warned by Arrow. Arrow safe type check
"
- "can be disabled by using SQL config "
- "`spark.sql.execution.pandas.convertToArrowArraySafely`."
+
+ def _raise_exception(e):
+ error_msg = (
+ "Exception thrown when converting pandas.Series (%s) "
+ "with name '%s' to Arrow Array (%s)."
)
- raise PySparkValueError(error_msg % (series.dtype, series.name,
arrow_type)) from e
+ if self._safecheck:
+ error_msg = error_msg + (
+ " It can be caused by overflows or other "
+ "unsafe conversions warned by Arrow. Arrow safe type
check "
+ "can be disabled by using SQL config "
+
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
+ )
+ raise PySparkValueError(error_msg % (series.dtype,
series.name, arrow_type)) from e
+
+ if arrow_cast:
+
+ def force_cast(array: pa.Array):
+ try:
+ return array.cast(target_type=arrow_type,
safe=self._safecheck)
+ except Exception as ee:
+ _raise_exception(ee)
+
+ array = pa.Array.from_pandas(series, mask=mask,
safe=self._safecheck)
Review Comment:
Do we need `safe=self._safecheck` here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]