Yicong-Huang commented on code in PR #53992:
URL: https://github.com/apache/spark/pull/53992#discussion_r2734572280
##########
python/pyspark/sql/pandas/serializers.py:
##########
@@ -686,44 +704,50 @@ def _create_batch(self, series):
import pandas as pd
import pyarrow as pa
- # Make input conform to
- # [(series1, arrow_type1, spark_type1), (series2, arrow_type2, spark_type2), ...]
- if (
- not isinstance(series, (list, tuple))
- or (len(series) == 2 and isinstance(series[1], pa.DataType))
- or (
- len(series) == 3
- and isinstance(series[1], pa.DataType)
- and isinstance(series[2], DataType)
- )
+ # Normalize input to list of (data, spark_type) tuples
+ # Handle: single series, (series, type) tuple, or list of tuples
+ if not isinstance(series, (list, tuple)) or (
+ len(series) == 2 and isinstance(series[1], DataType)
):
series = [series]
- series = ((s, None) if not isinstance(s, (list, tuple)) else s for s
in series)
- series = ((s[0], s[1], None) if len(s) == 2 else s for s in series)
+ # Ensure each element is a (data, spark_type) tuple
+ series = [(s, None) if not isinstance(s, (list, tuple)) else s for s
in series]
arrs = []
- for s, arrow_type, spark_type in series:
- # Variants are represented in arrow as structs with additional metadata (checked by
- # is_variant). If the data type is Variant, return a VariantVal atomic type instead of
- # a dict of two binary values.
- if (
- self._struct_in_pandas == "dict"
- and arrow_type is not None
- and pa.types.is_struct(arrow_type)
- and not is_variant(arrow_type)
- ):
- # A pandas UDF should return pd.DataFrame when the return type is a struct type.
- # If it returns a pd.Series, it should throw an error.
- if not isinstance(s, pd.DataFrame):
+ for s, spark_type in series:
+ # DataFrame with StructType → struct array; otherwise → regular array
+ # Check if this is a struct type that needs DataFrame representation
+ is_struct_type = (
+ spark_type is not None
+ and isinstance(spark_type, StructType)
+ and not isinstance(spark_type, VariantType)
Review Comment:
Got it! Thanks for the explanation! Will change this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]