zhengruifeng commented on code in PR #53957:
URL: https://github.com/apache/spark/pull/53957#discussion_r2726249317
##########
python/pyspark/sql/conversion.py:
##########
@@ -1030,3 +1030,78 @@ def convert_legacy(
integer_object_nulls=True,
)
return converter(ser)
+
+ @classmethod
+ def create_converter(
+ cls,
+ timezone: str,
+ struct_in_pandas: str = "dict",
+ ndarray_as_list: bool = False,
+ df_for_struct: bool = False,
+ input_types: Optional[List] = None,
+ ) -> Callable[["pa.Array", int], Union["pd.Series", "pd.DataFrame"]]:
+ """
+ Create a column-level converter function for Arrow to Pandas
conversion.
+
+ Parameters
+ ----------
+ timezone : str
+ Timezone for timestamp conversion.
+ struct_in_pandas : str
+ How to represent struct in pandas ("dict", "row", etc.)
+ ndarray_as_list : bool
+ Whether to convert ndarray as list.
+ df_for_struct : bool
+ If True, convert struct columns to DataFrame instead of Series.
+ input_types : list, optional
+ Spark types for each column, used for precise type conversion.
+
+ Returns
+ -------
+ callable
+ Function (arrow_column, column_index) -> pd.Series | pd.DataFrame
+ """
+ from pyspark.sql.pandas.types import from_arrow_type, is_variant
+
+ def convert(arrow_column: "pa.Array", spark_type: Optional[DataType] =
None):
+ return cls.convert_legacy(
+ arrow_column,
+ spark_type or from_arrow_type(arrow_column.type),
Review Comment:
I believe we should make the callsite explicitly specify the
timezone/datatype/etc. from scratch.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]