This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 33769b8b78d2 Revert "[SPARK-54182][SQL][PYTHON] Optimize non-arrow conversion of `df.toPandas`"
33769b8b78d2 is described below
commit 33769b8b78d211d345c49ac8674c6ac508ddb285
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Fri Jan 2 15:15:08 2026 +0900
Revert "[SPARK-54182][SQL][PYTHON] Optimize non-arrow conversion of
df.toPandas`"
Revert https://github.com/apache/spark/pull/52897 due to a perf regression when the number of columns is small.
Closes #53661 from zhengruifeng/revert_54182.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
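For context, a minimal standalone sketch (pandas only, no Spark; the toy
rows data is hypothetical) contrasting the two strategies: the reverted
PR transposed the collected rows and built one pd.Series per column before
concatenating, while the restored code builds the frame row-wise with a
single pd.DataFrame.from_records call. With few columns, the fixed
per-Series and concat overhead dominates, which matches the regression
cited above.

    import pandas as pd

    # Hypothetical stand-in for df.collect(): many rows, few columns.
    rows = [(i, f"name-{i}") for i in range(100_000)]
    columns = ["id", "name"]

    # Restored approach (this revert): build the frame row-wise in one call.
    pdf_rows = pd.DataFrame.from_records(rows, columns=columns)

    # Reverted approach (PR 52897, sketched): transpose to columns and build
    # one Series per column, then concat. Each Series and the concat carry
    # fixed overhead, so with a small column count the row-wise path wins.
    pdf_cols = pd.concat(
        [
            pd.Series(col, dtype=object, name=name)
            for col, name in zip(zip(*rows), columns)
        ],
        axis="columns",
    )

    assert pdf_rows.shape == pdf_cols.shape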
---
python/pyspark/sql/pandas/conversion.py | 29 ++++++++++++-----------------
1 file changed, 12 insertions(+), 17 deletions(-)
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index f5157d5a4e58..0e63bcef7d88 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -18,7 +18,6 @@ import sys
 from typing import (
     Any,
     Callable,
-    Iterator,
     List,
     Optional,
     Sequence,
@@ -292,20 +291,18 @@ class PandasConversionMixin:
 
         # Below is toPandas without Arrow optimization.
         rows = self.collect()
+        if len(rows) > 0:
+            pdf = pd.DataFrame.from_records(
+                rows, index=range(len(rows)), columns=self.columns  # type: ignore[arg-type]
+            )
+        else:
+            pdf = pd.DataFrame(columns=self.columns)
 
-        if len(self.columns) > 0:
+        if len(pdf.columns) > 0:
             timezone = sessionLocalTimeZone
             struct_in_pandas = pandasStructHandlingMode
 
-            # Extract columns from rows and apply converters
-            if len(rows) > 0:
-                # Use iterator to avoid materializing intermediate data structure
-                columns_data: Iterator[Any] = iter(zip(*rows))
-            else:
-                columns_data = iter([] for _ in self.schema.fields)
-
-            # Build DataFrame from columns
-            pdf = pd.concat(
+            return pd.concat(
                 [
                     _create_converter_to_pandas(
                         field.dataType,
@@ -316,15 +313,13 @@ class PandasConversionMixin:
                         ),
                         error_on_duplicated_field_names=False,
                         timestamp_utc_localized=False,
-                    )(pd.Series(col_data, dtype=object))
-                    for col_data, field in zip(columns_data, self.schema.fields)
+                    )(pser)
+                    for (_, pser), field in zip(pdf.items(), self.schema.fields)
                 ],
-                axis=1,
-                keys=self.columns,
+                axis="columns",
             )
-            return pdf
         else:
-            return pd.DataFrame(columns=[], index=range(len(rows)))
+            return pdf
 
     def toArrow(self) -> "pa.Table":
         from pyspark.sql.dataframe import DataFrame
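To exercise this non-Arrow path end to end (a usage sketch; assumes a
running SparkSession on a build containing this commit), disable the Arrow
optimization before calling toPandas():

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Force the non-Arrow fallback touched by this revert.
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

    pdf = spark.range(10).selectExpr("id", "id * 2 AS doubled").toPandas()
    print(pdf.head())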
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]