This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7577ec34d80e [SPARK-55424][PYTHON] Explicitly pass the series name in `convert_numpy`
7577ec34d80e is described below
commit 7577ec34d80e1181de6c090b680d0d157d4c491a
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Feb 9 07:22:30 2026 +0900
[SPARK-55424][PYTHON] Explicitly pass the series name in `convert_numpy`
### What changes were proposed in this pull request?
Explicitly pass the series name in `convert_numpy`
### Why are the changes needed?
The Arrow array returned by `batch.column(idx)` carries a name, and this name is
used to rename the pandas series returned by `array.to_pandas()`.
In `convert_legacy`, the first step is `array.to_pandas()`, so the name is kept.
In `convert_numpy`, complex conversions happen before `to_pandas()`, and the name
is dropped by `pa.compute` functions. We should explicitly pass the series name
instead of relying on the internal Arrow attribute `array._name`.
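To make the behavior concrete, here is a minimal sketch (not part of the patch;
the exact behavior depends on the pyarrow version) showing that the name attached
by `batch.column(idx)` survives a direct `to_pandas()` but is lost once the array
goes through a `pa.compute` function:

```python
import pyarrow as pa
import pyarrow.compute as pc

batch = pa.RecordBatch.from_pydict({"value": [1, 2, 3]})
col = batch.column(0)

# The column carries its field name, and to_pandas() uses it
# as the series name.
print(col.to_pandas().name)  # value

# A pa.compute call returns a fresh array without that name,
# so the resulting series is unnamed.
print(pc.add(col, 1).to_pandas().name)  # None
```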
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
ci
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #54207 from zhengruifeng/pass_name.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/conversion.py | 54 +++++++++++++++++++++++-----------------
1 file changed, 31 insertions(+), 23 deletions(-)
diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py
index 11b3b195a9d1..cae03a2c01fd 100644
--- a/python/pyspark/sql/conversion.py
+++ b/python/pyspark/sql/conversion.py
@@ -20,12 +20,14 @@ import datetime
import decimal
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Sequence, Union, overload
+import pyspark
from pyspark.errors import PySparkValueError
from pyspark.sql.pandas.types import (
_dedup_names,
_deduplicate_field_names,
_create_converter_to_pandas,
to_arrow_schema,
+ from_arrow_schema,
)
from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
from pyspark.sql.types import (
@@ -140,16 +142,17 @@ class ArrowBatchTransformer:
"""
import pandas as pd
- import pyspark
- from pyspark.sql.pandas.types import from_arrow_type
-
if batch.num_columns == 0:
return [pd.Series([pyspark._NoValue] * batch.num_rows)]
+ if schema is None:
+ schema = from_arrow_schema(batch.schema)
+
return [
ArrowArrayToPandasConversion.convert(
batch.column(i),
- schema[i].dataType if schema is not None else from_arrow_type(batch.column(i).type),
+ schema[i].dataType,
+ ser_name=schema[i].name,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1177,9 +1180,10 @@ class ArrowArrayToPandasConversion:
@classmethod
def convert(
cls,
- arrow_column: Union["pa.Array", "pa.ChunkedArray"],
- target_type: DataType,
+ arr: Union["pa.Array", "pa.ChunkedArray"],
+ spark_type: DataType,
*,
+ ser_name: Optional[str] = None,
timezone: Optional[str] = None,
struct_in_pandas: str = "dict",
ndarray_as_list: bool = False,
@@ -1190,10 +1194,12 @@ class ArrowArrayToPandasConversion:
Parameters
----------
- arrow_column : pa.Array or pa.ChunkedArray
+ arr : pa.Array or pa.ChunkedArray
The Arrow column to convert.
- target_type : DataType
+ spark_type : DataType
The target Spark type for the column to be converted to.
+ ser_name : str
+ The name of returned pd.Series. If not set, will try to get it from arr._name.
timezone : str, optional
Timezone for timestamp conversion. Required if the data contains timestamp types.
struct_in_pandas : str, optional
@@ -1211,10 +1217,11 @@ class ArrowArrayToPandasConversion:
Converted pandas Series. If df_for_struct is True and the type is StructType,
returns a DataFrame with columns corresponding to struct fields.
"""
- if cls._prefer_convert_numpy(target_type, df_for_struct):
+ if cls._prefer_convert_numpy(spark_type, df_for_struct):
return cls.convert_numpy(
- arrow_column,
- target_type,
+ arr,
+ spark_type,
+ ser_name=ser_name,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1222,8 +1229,8 @@ class ArrowArrayToPandasConversion:
)
return cls.convert_legacy(
- arrow_column,
- target_type,
+ arr,
+ spark_type,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1359,6 +1366,7 @@ class ArrowArrayToPandasConversion:
arr: Union["pa.Array", "pa.ChunkedArray"],
spark_type: DataType,
*,
+ ser_name: Optional[str] = None,
timezone: Optional[str] = None,
struct_in_pandas: Optional[str] = None,
ndarray_as_list: bool = False,
@@ -1375,11 +1383,12 @@ class ArrowArrayToPandasConversion:
assert types.is_struct(arr.type)
assert len(spark_type.names) == len(arr.type.names), f"{spark_type} {arr.type} "
- pdf: pd.DataFrame = pd.concat(
+ return pd.concat(
[
cls.convert_numpy(
field_arr,
spark_type=field.dataType,
+ ser_name=field.name,
timezone=timezone,
struct_in_pandas=struct_in_pandas,
ndarray_as_list=ndarray_as_list,
@@ -1389,15 +1398,14 @@ class ArrowArrayToPandasConversion:
],
axis=1,
)
- pdf.columns = spark_type.names # type: ignore[assignment]
- return pdf
- # Arrow array from batch.column(idx) contains name,
- # and this name will be used to rename the pandas series
- # returned by array.to_pandas().
- # Right now, the name is dropped in arrow conversions.
- # TODO: should make convert_numpy explicitly pass the expected series name.
- name = arr._name
+ if ser_name is None:
+ # Arrow array from batch.column(idx) contains name,
+ # and this name will be used to rename the pandas series
+ # returned by array.to_pandas().
+ # This name will be dropped after pa.compute functions.
+ ser_name = arr._name
+
arr = ArrowArrayConversion.preprocess_time(arr)
series: pd.Series
@@ -1491,4 +1499,4 @@ class ArrowArrayToPandasConversion:
else: # pragma: no cover
assert False, f"Need converter for {spark_type} but failed to find one."
- return series.rename(name)
+ return series.rename(ser_name)
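For illustration only, the pattern the patch adopts can be sketched outside of
PySpark. The helper below is hypothetical (it is not the actual `convert_numpy`);
it shows the caller passing the expected series name explicitly, with the private
`arr._name` attribute kept only as a fallback:

```python
import pyarrow as pa
import pyarrow.compute as pc

def convert_sketch(arr, *, ser_name=None):
    # Prefer the explicitly passed name; fall back to the private
    # pyarrow attribute only when the caller did not provide one.
    if ser_name is None:
        ser_name = getattr(arr, "_name", None)
    # Any pa.compute step drops the name, so rename at the end.
    converted = pc.cast(arr, pa.float64())
    return converted.to_pandas().rename(ser_name)

batch = pa.RecordBatch.from_pydict({"value": [1, 2, 3]})
print(convert_sketch(batch.column(0), ser_name="value").name)  # value
```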
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]