This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7577ec34d80e [SPARK-55424][PYTHON] Explicitly pass the series name in `convert_numpy`
7577ec34d80e is described below
commit 7577ec34d80e1181de6c090b680d0d157d4c491a
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Feb 9 07:22:30 2026 +0900
[SPARK-55424][PYTHON] Explicitly pass the series name in `convert_numpy`
### What changes were proposed in this pull request?
Explicitly pass the series name in `convert_numpy`
### Why are the changes needed?
The Arrow array returned by `batch.column(idx)` carries a name, and this name is
used to rename the pandas series returned by `array.to_pandas()`.
In `convert_legacy`, the first step is `array.to_pandas()`, so the name is kept.
In `convert_numpy`, complex conversions happen before `to_pandas()`, and the name
is dropped by `pa.compute` functions. We should explicitly pass the series name
instead of relying on the internal Arrow attribute `array._name`.
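To make the behavior concrete, here is a minimal sketch (not part of the patch;
the exact behavior depends on the pyarrow version) showing that the name attached
by `batch.column(idx)` survives a direct `to_pandas()` but is lost once the array
goes through a `pa.compute` function:

```python
import pyarrow as pa
import pyarrow.compute as pc

batch = pa.RecordBatch.from_pydict({"value": [1, 2, 3]})
col = batch.column(0)

# The column carries its field name, and to_pandas() uses it
# as the series name.
print(col.to_pandas().name)  # value

# A pa.compute call returns a fresh array without that name,
# so the resulting series is unnamed.
print(pc.add(col, 1).to_pandas().name)  # None
```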
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
ci
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #54207 from zhengruifeng/pass_name.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/conversion.py | 54 +++++++++++++++++++++++-----------------
1 file changed, 31 insertions(+), 23 deletions(-)
diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py
index 11b3b195a9d1..cae03a2c01fd 100644
--- a/python/pyspark/sql/conversion.py
+++ b/python/pyspark/sql/conversion.py
@@ -20,12 +20,14 @@ import datetime
import decimal
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Sequence, Union, overload
+import pyspark
from pyspark.errors import PySparkValueError
from pyspark.sql.pandas.types import (
_dedup_names,
_deduplicate_field_names,
_create_converter_to_pandas,
to_arrow_schema,
+ from_arrow_schema,
)
from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
from pyspark.sql.types import (
@@ -140,16 +142,17 @@ class ArrowBatchTransformer:
"""
import pandas as pd
- import pyspark
- from pyspark.sql.pandas.types import from_arrow_type
-
if batch.num_columns == 0:
return [pd.Series([pyspark._NoValue] * batch.num_rows)]
+ if schema is None:
+ schema = from_arrow_schema(batch.schema)
+
return [
ArrowArrayToPandasConversion.convert(
batch.column(i),
- schema[i].dataType if schema is not None else from_arrow_type(batch.column(i).type),
+ schema[i].dataType,
+ ser_name=schema[i].name,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1177,9 +1180,10 @@ class ArrowArrayToPandasConversion:
@classmethod
def convert(
cls,
- arrow_column: Union["pa.Array", "pa.ChunkedArray"],
- target_type: DataType,
+ arr: Union["pa.Array", "pa.ChunkedArray"],
+ spark_type: DataType,
*,
+ ser_name: Optional[str] = None,
timezone: Optional[str] = None,
struct_in_pandas: str = "dict",
ndarray_as_list: bool = False,
@@ -1190,10 +1194,12 @@ class ArrowArrayToPandasConversion:
Parameters
----------
- arrow_column : pa.Array or pa.ChunkedArray
+ arr : pa.Array or pa.ChunkedArray
The Arrow column to convert.
- target_type : DataType
+ spark_type : DataType
The target Spark type for the column to be converted to.
+ ser_name : str
+ The name of returned pd.Series. If not set, will try to get it from arr._name.
timezone : str, optional
Timezone for timestamp conversion. Required if the data contains timestamp types.
struct_in_pandas : str, optional
@@ -1211,10 +1217,11 @@ class ArrowArrayToPandasConversion:
Converted pandas Series. If df_for_struct is True and the type is StructType,
returns a DataFrame with columns corresponding to struct fields.
"""
- if cls._prefer_convert_numpy(target_type, df_for_struct):
+ if cls._prefer_convert_numpy(spark_type, df_for_struct):
return cls.convert_numpy(
- arrow_column,
- target_type,
+ arr,
+ spark_type,
+ ser_name=ser_name,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1222,8 +1229,8 @@ class ArrowArrayToPandasConversion:
)
return cls.convert_legacy(
- arrow_column,
- target_type,
+ arr,
+ spark_type,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1359,6 +1366,7 @@ class ArrowArrayToPandasConversion:
arr: Union["pa.Array", "pa.ChunkedArray"],
spark_type: DataType,
*,
+ ser_name: Optional[str] = None,
timezone: Optional[str] = None,
struct_in_pandas: Optional[str] = None,
ndarray_as_list: bool = False,
@@ -1375,11 +1383,12 @@ class ArrowArrayToPandasConversion:
assert types.is_struct(arr.type)
assert len(spark_type.names) == len(arr.type.names), f"{spark_type} {arr.type} "
- pdf: pd.DataFrame = pd.concat(
+ return pd.concat(
[
cls.convert_numpy(
field_arr,
spark_type=field.dataType,
+ ser_name=field.name,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1389,15 +1398,14 @@ class ArrowArrayToPandasConversion:
],
axis=1,
)
- pdf.columns = spark_type.names # type: ignore[assignment]
- return pdf
- # Arrow array from batch.column(idx) contains name,
- # and this name will be used to rename the pandas series
- # returned by array.to_pandas().
- # Right now, the name is dropped in arrow conversions.
- # TODO: should make convert_numpy explicitly pass the expected series name.
- name = arr._name
+ if ser_name is None:
+ # Arrow array from batch.column(idx) contains name,
+ # and this name will be used to rename the pandas series
+ # returned by array.to_pandas().
+ # This name will be dropped after pa.compute functions.
+ ser_name = arr._name
+
arr = ArrowArrayConversion.preprocess_time(arr)
series: pd.Series
@@ -1491,4 +1499,4 @@ class ArrowArrayToPandasConversion:
else: # pragma: no cover
assert False, f"Need converter for {spark_type} but failed to find one."
- return series.rename(name)
+ return series.rename(ser_name)
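For illustration only, the pattern the patch adopts can be sketched outside of
PySpark. The helper below is hypothetical (it is not the actual `convert_numpy`);
it shows the caller passing the expected series name explicitly, with the private
`arr._name` attribute kept only as a fallback:

```python
import pyarrow as pa
import pyarrow.compute as pc

def convert_sketch(arr, *, ser_name=None):
    # Prefer the explicitly passed name; fall back to the private
    # pyarrow attribute only when the caller did not provide one.
    if ser_name is None:
        ser_name = getattr(arr, "_name", None)
    # Any pa.compute step drops the name, so rename at the end.
    converted = pc.cast(arr, pa.float64())
    return converted.to_pandas().rename(ser_name)

batch = pa.RecordBatch.from_pydict({"value": [1, 2, 3]})
print(convert_sketch(batch.column(0), ser_name="value").name)  # value
```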
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]