This is an automated email from the ASF dual-hosted git repository.
ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 08a2cb9213dd [SPARK-55296][PS][FOLLOW-UP] Fix setitem with Series
through iloc indexer with pandas 3
08a2cb9213dd is described below
commit 08a2cb9213dd7701c562af1f465cbf394b32a38e
Author: Takuya Ueshin <[email protected]>
AuthorDate: Thu Feb 26 11:56:35 2026 -0800
[SPARK-55296][PS][FOLLOW-UP] Fix setitem with Series through iloc indexer
with pandas 3
### What changes were proposed in this pull request?
This is another follow-up to apache/spark#54375.
Fixes `setitem` with a `Series` value through the `iloc` indexer with pandas 3.
Also fixes some tests that assign a `Series` through the `loc` indexer of a `DataFrame`.
### Why are the changes needed?
With pandas 3, setting a `Series` through the `iloc` indexer raises an `AnalysisException`:
```py
>>> from pyspark import pandas as ps
>>> psdf = ps.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
>>> psser = psdf["a"]
>>>
>>> psser.iloc[[0, 1, 2]] = -ps.DataFrame({"x": [100, 200, 300]})["x"]
Traceback (most recent call last):
...
pyspark.errors.exceptions.captured.AnalysisException:
[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter
with name `__temp_value_col__` cannot be resolved. Did you mean one of the
following? [`__natural_order__`, `__index_level_0__`, `a`,
`__distributed_sequence_column__`]. SQLSTATE: 42703;
...
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Updated the related tests; the existing tests should continue to pass.
### Was this patch authored or co-authored using generative AI tooling?
Codex (GPT-5.3-Codex)
Closes #54498 from ueshin/issues/SPARK-55296/setitem.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Takuya Ueshin <[email protected]>
---
python/pyspark/pandas/indexing.py | 17 +++++++++--------
.../tests/diff_frames_ops/test_setitem_frame.py | 20 ++++++++++++++------
2 files changed, 23 insertions(+), 14 deletions(-)
diff --git a/python/pyspark/pandas/indexing.py
b/python/pyspark/pandas/indexing.py
index 7f26f4ce7595..03d8054c7e2a 100644
--- a/python/pyspark/pandas/indexing.py
+++ b/python/pyspark/pandas/indexing.py
@@ -621,17 +621,18 @@ class LocIndexerLike(IndexerLike, metaclass=ABCMeta):
psdf[temp_value_col] = value
psdf =
psdf.sort_values(temp_natural_order).drop(columns=temp_natural_order)
- psser = psdf._psser_for(column_label)
if isinstance(key, Series):
- key = F.col(
-
"`{}`".format(psdf[temp_key_col]._internal.data_spark_column_names[0])
- )
+ key = psdf[temp_key_col].spark.column
if isinstance(value, Series):
- value = F.col(
-
"`{}`".format(psdf[temp_value_col]._internal.data_spark_column_names[0])
- )
+ value = psdf[temp_value_col].spark.column
+
+ if isinstance(self, iLocIndexer):
+ col_sel = psdf._internal.column_labels.index(column_label)
+ else:
+ col_sel = column_label
+ type(self)(psdf)[key, col_sel] = value
- type(self)(psser)[key] = value
+ psser = psdf._psser_for(column_label)
if self._psdf_or_psser.name is None:
psser = psser.rename()
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py
index 09428da14ee3..45eb8a3705f2 100644
--- a/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_setitem_frame.py
@@ -19,6 +19,7 @@ import pandas as pd
from pyspark import pandas as ps
from pyspark.pandas.config import set_option, reset_option
+from pyspark.loose_version import LooseVersion
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
@@ -51,8 +52,15 @@ class DiffFramesSetItemFrameMixin:
another_psdf = ps.DataFrame(pdf_orig)
- psdf.loc[["viper", "sidewinder"], ["shield"]] = -another_psdf.max_speed
- pdf.loc[["viper", "sidewinder"], ["shield"]] = -pdf.max_speed
+ if LooseVersion(pd.__version__) < "3.0.0":
+ shield_sel = ["shield"]
+ else:
+ # pandas 3 CoW can raise a shape-mismatch error for `loc[..., ["shield"]] = Series`
+ # when Series views are already referenced. Use scalar column selection instead.
+ shield_sel = "shield"
+
+ psdf.loc[["viper", "sidewinder"], shield_sel] = -another_psdf.max_speed
+ pdf.loc[["viper", "sidewinder"], shield_sel] = -pdf.max_speed
self.assert_eq(psdf, pdf)
self.assert_eq(psser1, pser1)
self.assert_eq(psser2, pser2)
@@ -63,8 +71,8 @@ class DiffFramesSetItemFrameMixin:
pser2 = pdf.shield
psser1 = psdf.max_speed
psser2 = psdf.shield
- psdf.loc[another_psdf.max_speed < 5, ["shield"]] = -psdf.max_speed
- pdf.loc[pdf.max_speed < 5, ["shield"]] = -pdf.max_speed
+ psdf.loc[another_psdf.max_speed < 5, shield_sel] = -psdf.max_speed
+ pdf.loc[pdf.max_speed < 5, shield_sel] = -pdf.max_speed
self.assert_eq(psdf, pdf)
self.assert_eq(psser1, pser1)
self.assert_eq(psser2, pser2)
@@ -75,8 +83,8 @@ class DiffFramesSetItemFrameMixin:
pser2 = pdf.shield
psser1 = psdf.max_speed
psser2 = psdf.shield
- psdf.loc[another_psdf.max_speed < 5, ["shield"]] =
-another_psdf.max_speed
- pdf.loc[pdf.max_speed < 5, ["shield"]] = -pdf.max_speed
+ psdf.loc[another_psdf.max_speed < 5, shield_sel] =
-another_psdf.max_speed
+ pdf.loc[pdf.max_speed < 5, shield_sel] = -pdf.max_speed
self.assert_eq(psdf, pdf)
self.assert_eq(psser1, pser1)
self.assert_eq(psser2, pser2)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]