This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 9e6b25718688 [SPARK-55700][PS] Fix handling integer keys on Series 
with non-integer index
9e6b25718688 is described below

commit 9e6b2571868812cc960f79ac5ced6eda6b89266f
Author: Takuya Ueshin <[email protected]>
AuthorDate: Thu Feb 26 11:57:25 2026 -0800

    [SPARK-55700][PS] Fix handling integer keys on Series with non-integer index
    
    ### What changes were proposed in this pull request?
    
    Fixes handling integer keys on Series with non-integer index.
    
    ### Why are the changes needed?
    
    With pandas 3, integer keys on a Series with a non-integer index are no 
longer treated as positional indexes.
    
    For example:
    
    ```py
    >>> dates = pd.date_range("20130101", periods=6)
    >>> pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, 
columns=list("ABCD"))
    ```
    
    - pandas 2
    
    ```py
    >>> pdf.A[4]
    <stdin>:1: FutureWarning: Series.__getitem__ treating keys as positions is 
deprecated. In a future version, integer keys will always be treated as labels 
(consistent with DataFrame behavior). To access a value by position, use 
`ser.iloc[pos]`
    np.float64(-1.2836101861392761)
    ```
    
    - pandas 3
    
    ```py
    >>> pdf.A[4]
    Traceback (most recent call last):
    ...
    KeyError: 4
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, it will behave more like pandas 3.
    
    ### How was this patch tested?
    
    Updated the related tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #54499 from ueshin/issues/SPARK-55700/treating_keys_as_positions.
    
    Authored-by: Takuya Ueshin <[email protected]>
    Signed-off-by: Takuya Ueshin <[email protected]>
---
 python/pyspark/pandas/series.py                    | 27 +++++++++++++---------
 .../pandas/tests/indexes/test_indexing_adv.py      |  9 ++++++--
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 1015ff4db4d9..f1fb0069fa74 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -7354,19 +7354,24 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         )
 
     def __getitem__(self, key: Any) -> Any:
-        if type(key) == int and not isinstance(self.index.spark.data_type, 
(IntegerType, LongType)):
-            warnings.warn(
-                "Series.__getitem__ treating keys as positions is deprecated. "
-                "In a future version, integer keys will always be treated as 
labels "
-                "(consistent with DataFrame behavior). "
-                "To access a value by position, use `ser.iloc[pos]`",
-                FutureWarning,
+        if LooseVersion(pd.__version__) < "3.0.0":
+            treating_keys_as_positions = type(key) == int and not isinstance(
+                self.index.spark.data_type, (IntegerType, LongType)
             )
+            if treating_keys_as_positions:
+                warnings.warn(
+                    "Series.__getitem__ treating keys as positions is 
deprecated. "
+                    "In a future version, integer keys will always be treated 
as labels "
+                    "(consistent with DataFrame behavior). "
+                    "To access a value by position, use `ser.iloc[pos]`",
+                    FutureWarning,
+                )
+        else:
+            treating_keys_as_positions = False
         try:
-            if (isinstance(key, slice) and any(type(n) == int for n in 
[key.start, key.stop])) or (
-                type(key) == int
-                and not isinstance(self.index.spark.data_type, (IntegerType, 
LongType))
-            ):
+            if (
+                isinstance(key, slice) and any(type(n) == int for n in 
[key.start, key.stop])
+            ) or treating_keys_as_positions:
                 # Seems like pandas Series always uses int as positional 
search when slicing
                 # with ints, searches based on index values when the value is 
int.
                 return self.iloc[key]
diff --git a/python/pyspark/pandas/tests/indexes/test_indexing_adv.py 
b/python/pyspark/pandas/tests/indexes/test_indexing_adv.py
index fdebdcbd0002..90c9d5dbb0ef 100644
--- a/python/pyspark/pandas/tests/indexes/test_indexing_adv.py
+++ b/python/pyspark/pandas/tests/indexes/test_indexing_adv.py
@@ -22,6 +22,7 @@ import numpy as np
 import pandas as pd
 
 from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
 from pyspark.pandas.exceptions import SparkPandasNotImplementedError
 from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
@@ -339,8 +340,12 @@ class IndexingAdvMixin:
         self.assert_eq(psdf[10:3], pdf[10:3], almost=True)
 
         # Index loc search
-        self.assert_eq(psdf.A[4], pdf.A[4])
-        self.assert_eq(psdf.A[3], pdf.A[3])
+        if LooseVersion(pd.__version__) < "3.0.0":
+            self.assert_eq(psdf.A[4], pdf.A[4])
+            self.assert_eq(psdf.A[3], pdf.A[3])
+        else:
+            with self.assertRaises(KeyError):
+                psdf.A[4]
 
         # Positional iloc search
         self.assert_eq(psdf.A[:4], pdf.A[:4], almost=True)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to