This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 2d0d317d749e [SPARK-55126][PYTHON] Remove unused timezone and 
assign_cols_by_name from ArrowStreamArrowUDFSerializer
2d0d317d749e is described below

commit 2d0d317d749eca815b50c6306c911ec0d07c8b13
Author: Yicong-Huang <[email protected]>
AuthorDate: Thu Jan 22 16:27:15 2026 +0800

    [SPARK-55126][PYTHON] Remove unused timezone and assign_cols_by_name from ArrowStreamArrowUDFSerializer
    
    ### What changes were proposed in this pull request?
    
    This PR removes unused `timezone` and `assign_cols_by_name` parameters from `ArrowStreamArrowUDFSerializer` and related classes.
    
    ### Why are the changes needed?
    
    `ArrowStreamArrowUDFSerializer` stores `timezone` and `assign_cols_by_name` but never uses them. Arrow serializers operate directly on Arrow arrays without pandas conversion, so these parameters are unnecessary.
    
    This cleanup:
    1. Removes dead code and unnecessary parameters
    2. Simplifies the serializer API
    
    ### Does this PR introduce _any_ user-facing change?
    
    No. These are internal implementation details.
    
    ### How was this patch tested?
    
    Existing unit tests should cover this change as the functionality remains the same.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #53904 from Yicong-Huang/SPARK-55126/refactor/remove-unused-params-from-arrow-udf-serializer.
    
    Authored-by: Yicong-Huang <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 python/pyspark/sql/pandas/serializers.py | 13 -------------
 python/pyspark/worker.py                 |  9 ++-------
 2 files changed, 2 insertions(+), 20 deletions(-)

diff --git a/python/pyspark/sql/pandas/serializers.py 
b/python/pyspark/sql/pandas/serializers.py
index 3b4e2677933f..504dc64e52a1 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -727,15 +727,11 @@ class 
ArrowStreamArrowUDFSerializer(ArrowStreamSerializer):
 
     def __init__(
         self,
-        timezone,
         safecheck,
-        assign_cols_by_name,
         arrow_cast,
     ):
         super().__init__()
-        self._timezone = timezone
         self._safecheck = safecheck
-        self._assign_cols_by_name = assign_cols_by_name
         self._arrow_cast = arrow_cast
 
     def _create_array(self, arr, arrow_type, arrow_cast):
@@ -794,8 +790,6 @@ class 
ArrowBatchUDFSerializer(ArrowStreamArrowUDFSerializer):
 
     Parameters
     ----------
-    timezone : str
-        A timezone to respect when handling timestamp values
     safecheck : bool
         If True, conversion from Arrow to Pandas checks for overflow/truncation
     input_types : list
@@ -809,16 +803,13 @@ class 
ArrowBatchUDFSerializer(ArrowStreamArrowUDFSerializer):
 
     def __init__(
         self,
-        timezone,
         safecheck,
         input_types,
         int_to_decimal_coercion_enabled,
         binary_as_bytes,
     ):
         super().__init__(
-            timezone=timezone,
             safecheck=safecheck,
-            assign_cols_by_name=False,
             arrow_cast=True,
         )
         self._input_types = input_types
@@ -1093,15 +1084,11 @@ class 
GroupArrowUDFSerializer(ArrowStreamGroupUDFSerializer):
 class ArrowStreamAggArrowUDFSerializer(ArrowStreamArrowUDFSerializer):
     def __init__(
         self,
-        timezone,
         safecheck,
-        assign_cols_by_name,
         arrow_cast,
     ):
         super().__init__(
-            timezone=timezone,
             safecheck=safecheck,
-            assign_cols_by_name=assign_cols_by_name,
             arrow_cast=arrow_cast,
         )
 
diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py
index fcd1a106291c..0df7eaf709c8 100644
--- a/python/pyspark/worker.py
+++ b/python/pyspark/worker.py
@@ -2747,9 +2747,7 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf):
             PythonEvalType.SQL_GROUPED_AGG_ARROW_ITER_UDF,
             PythonEvalType.SQL_WINDOW_AGG_ARROW_UDF,
         ):
-            ser = ArrowStreamAggArrowUDFSerializer(
-                runner_conf.timezone, True, runner_conf.assign_cols_by_name, 
True
-            )
+            ser = ArrowStreamAggArrowUDFSerializer(safecheck=True, 
arrow_cast=True)
         elif eval_type in (
             PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
             PythonEvalType.SQL_GROUPED_AGG_PANDAS_ITER_UDF,
@@ -2822,9 +2820,7 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf):
             PythonEvalType.SQL_SCALAR_ARROW_ITER_UDF,
         ):
             # Arrow cast and safe check are always enabled
-            ser = ArrowStreamArrowUDFSerializer(
-                runner_conf.timezone, True, runner_conf.assign_cols_by_name, 
True
-            )
+            ser = ArrowStreamArrowUDFSerializer(safecheck=True, 
arrow_cast=True)
         elif (
             eval_type == PythonEvalType.SQL_ARROW_BATCHED_UDF
             and not runner_conf.use_legacy_pandas_udf_conversion
@@ -2833,7 +2829,6 @@ def read_udfs(pickleSer, infile, eval_type, runner_conf):
                 f.dataType for f in 
_parse_datatype_json_string(utf8_deserializer.loads(infile))
             ]
             ser = ArrowBatchUDFSerializer(
-                runner_conf.timezone,
                 runner_conf.safecheck,
                 input_types,
                 runner_conf.int_to_decimal_coercion_enabled,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to