This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2d0d317d749e [SPARK-55126][PYTHON] Remove unused timezone and
assign_cols_by_name from ArrowStreamArrowUDFSerializer
2d0d317d749e is described below
commit 2d0d317d749eca815b50c6306c911ec0d07c8b13
Author: Yicong-Huang <[email protected]>
AuthorDate: Thu Jan 22 16:27:15 2026 +0800
[SPARK-55126][PYTHON] Remove unused timezone and assign_cols_by_name from
ArrowStreamArrowUDFSerializer
### What changes were proposed in this pull request?
This PR removes unused `timezone` and `assign_cols_by_name` parameters from
`ArrowStreamArrowUDFSerializer` and related classes.
### Why are the changes needed?
`ArrowStreamArrowUDFSerializer` stores `timezone` and `assign_cols_by_name`
but never uses them. Arrow serializers operate directly on Arrow arrays without
pandas conversion, so these parameters are unnecessary.
This cleanup:
1. Removes dead code and unnecessary parameters
2. Simplifies the serializer API
### Does this PR introduce _any_ user-facing change?
No. These are internal implementation details.
### How was this patch tested?
Existing unit tests cover this change, as the functionality remains
unchanged.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #53904 from
Yicong-Huang/SPARK-55126/refactor/remove-unused-params-from-arrow-udf-serializer.
Authored-by: Yicong-Huang <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/pandas/serializers.py | 13 -------------
python/pyspark/worker.py | 9 ++-------
2 files changed, 2 insertions(+), 20 deletions(-)
diff --git a/python/pyspark/sql/pandas/serializers.py
b/python/pyspark/sql/pandas/serializers.py
index 3b4e2677933f..504dc64e52a1 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -727,15 +727,11 @@ class
ArrowStreamArrowUDFSerializer(ArrowStreamSerializer):
def __init__(
self,
- timezone,
safecheck,
- assign_cols_by_name,
arrow_cast,
):
super().__init__()
- self._timezone = timezone
self._safecheck = safecheck
- self._assign_cols_by_name = assign_cols_by_name
self._arrow_cast = arrow_cast
def _create_array(self, arr, arrow_type, arrow_cast):
@@ -794,8 +790,6 @@ class
ArrowBatchUDFSerializer(ArrowStreamArrowUDFSerializer):
Parameters
----------
- timezone : str
- A timezone to respect when handling timestamp values
safecheck : bool
If True, conversion from Arrow to Pandas checks for overflow/truncation
input_types : list
@@ -809,16 +803,13 @@ class
ArrowBatchUDFSerializer(ArrowStreamArrowUDFSerializer):
def __init__(
self,
- timezone,
safecheck,
input_types,
int_to_decimal_coercion_enabled,
binary_as_bytes,
):
super().__init__(
- timezone=timezone,
safecheck=safecheck,
- assign_cols_by_name=False,
arrow_cast=True,
)
self._input_types = input_types
@@ -1093,15 +1084,11 @@ class
GroupArrowUDFSerializer(ArrowStreamGroupUDFSerializer):
class ArrowStreamAggArrowUDFSerializer(ArrowStreamArrowUDFSerializer):
def __init__(
self,
- timezone,
safecheck,
- assign_cols_by_name,
arrow_cast,
):
super().__init__(
- timezone=timezone,
safecheck=safecheck,
- assign_cols_by_name=assign_cols_by_name,
arrow_cast=arrow_cast,
)
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index fcd1a106291c..0df7eaf709c8 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -2747,9 +2747,7 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf):
PythonEvalType.SQL_GROUPED_AGG_ARROW_ITER_UDF,
PythonEvalType.SQL_WINDOW_AGG_ARROW_UDF,
):
- ser = ArrowStreamAggArrowUDFSerializer(
- runner_conf.timezone, True, runner_conf.assign_cols_by_name,
True
- )
+ ser = ArrowStreamAggArrowUDFSerializer(safecheck=True,
arrow_cast=True)
elif eval_type in (
PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
PythonEvalType.SQL_GROUPED_AGG_PANDAS_ITER_UDF,
@@ -2822,9 +2820,7 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf):
PythonEvalType.SQL_SCALAR_ARROW_ITER_UDF,
):
# Arrow cast and safe check are always enabled
- ser = ArrowStreamArrowUDFSerializer(
- runner_conf.timezone, True, runner_conf.assign_cols_by_name,
True
- )
+ ser = ArrowStreamArrowUDFSerializer(safecheck=True,
arrow_cast=True)
elif (
eval_type == PythonEvalType.SQL_ARROW_BATCHED_UDF
and not runner_conf.use_legacy_pandas_udf_conversion
@@ -2833,7 +2829,6 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf):
f.dataType for f in
_parse_datatype_json_string(utf8_deserializer.loads(infile))
]
ser = ArrowBatchUDFSerializer(
- runner_conf.timezone,
runner_conf.safecheck,
input_types,
runner_conf.int_to_decimal_coercion_enabled,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]