This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 7fca140e3b3a [SPARK-55648][PS] Handle an unexpected keyword argument error `groupby(axis)` with pandas 3
7fca140e3b3a is described below

commit 7fca140e3b3a7509ff68ed8e2d54db5190af3621
Author: Takuya Ueshin <[email protected]>
AuthorDate: Tue Feb 24 12:13:40 2026 -0800

    [SPARK-55648][PS] Handle an unexpected keyword argument error `groupby(axis)` with pandas 3
    
    ### What changes were proposed in this pull request?
    
    Handles an unexpected keyword argument error `groupby(axis)` with pandas 3.
    
    ### Why are the changes needed?
    
    The `axis` argument was removed from `groupby` in pandas 3.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, it will behave more like pandas 3.
    
    ### How was this patch tested?
    
    Updated the related tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #54436 from ueshin/issues/SPARK-55648/axis.
    
    Lead-authored-by: Takuya Ueshin <[email protected]>
    Co-authored-by: Takuya UESHIN <[email protected]>
    Signed-off-by: Takuya Ueshin <[email protected]>
---
 python/pyspark/pandas/frame.py                     |  2 +-
 python/pyspark/pandas/generic.py                   | 15 ++++--
 python/pyspark/pandas/series.py                    |  2 +-
 .../pyspark/pandas/tests/groupby/test_groupby.py   | 60 ++++++++++++----------
 4 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 5609f76cd719..aeb47709766c 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -13658,7 +13658,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
     def groupby(
         self,
         by: Union[Name, "Series", List[Union[Name, "Series"]]],
-        axis: Axis = 0,
+        axis: Union[Axis, _NoValueType] = _NoValue,
         as_index: bool = True,
         dropna: bool = True,
     ) -> "DataFrameGroupBy":
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index f73580444449..cfd566e20573 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -2397,7 +2397,7 @@ class Frame(object, metaclass=ABCMeta):
     def groupby(
         self: FrameLike,
         by: Union[Name, "Series", List[Union[Name, "Series"]]],
-        axis: Axis = 0,
+        axis: Union[Axis, _NoValueType] = _NoValue,
         as_index: bool = True,
         dropna: bool = True,
     ) -> "GroupBy[FrameLike]":
@@ -2518,9 +2518,16 @@ class Frame(object, metaclass=ABCMeta):
             raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by).__name__))
         if not len(new_by):
             raise ValueError("No group keys passed!")
-        axis = validate_axis(axis)
-        if axis != 0:
-            raise NotImplementedError('axis should be either 0 or "index" currently.')
+
+        if LooseVersion(pd.__version__) < "3.0.0":
+            if axis is _NoValue:
+                axis = 0
+            axis = validate_axis(axis)  # type: ignore[arg-type]
+            if axis != 0:
+                raise NotImplementedError('axis should be either 0 or "index" currently.')
+        else:
+            if axis is not _NoValue:
+                raise TypeError("The 'axis' keyword is not supported in pandas 3.0.0 and later.")
 
         return self._build_groupby(by=new_by, as_index=as_index, dropna=dropna)
 
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 72d49574423b..3f8a2e57792d 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -7195,7 +7195,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
     def groupby(
         self,
         by: Union[Name, "Series", List[Union[Name, "Series"]]],
-        axis: Axis = 0,
+        axis: Union[Axis, _NoValueType] = _NoValue,
         as_index: bool = True,
         dropna: bool = True,
     ) -> "SeriesGroupBy":
diff --git a/python/pyspark/pandas/tests/groupby/test_groupby.py b/python/pyspark/pandas/tests/groupby/test_groupby.py
index 92bb84a5c341..91ea9168c710 100644
--- a/python/pyspark/pandas/tests/groupby/test_groupby.py
+++ b/python/pyspark/pandas/tests/groupby/test_groupby.py
@@ -20,6 +20,7 @@ import numpy as np
 import pandas as pd
 
 from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
 from pyspark.pandas.groupby import is_multi_agg_with_relabel
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 
@@ -92,28 +93,29 @@ class GroupByTestsMixin:
             psdf.a.groupby(psdf.b).sum().sort_index(), pdf.a.groupby(pdf.b).sum().sort_index()
         )
 
-        for axis in [0, "index"]:
-            self.assert_eq(
-                psdf.groupby("a", axis=axis).a.sum().sort_index(),
-                pdf.groupby("a", axis=axis).a.sum().sort_index(),
-            )
-            self.assert_eq(
-                psdf.groupby("a", axis=axis)["a"].sum().sort_index(),
-                pdf.groupby("a", axis=axis)["a"].sum().sort_index(),
-            )
-            self.assert_eq(
-                psdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
-                pdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
-            )
-            self.assert_eq(
-                psdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
-                pdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
-            )
-
-            self.assert_eq(
-                psdf.a.groupby(psdf.b, axis=axis).sum().sort_index(),
-                pdf.a.groupby(pdf.b, axis=axis).sum().sort_index(),
-            )
+        if LooseVersion(pd.__version__) < "3.0.0":
+            for axis in [0, "index"]:
+                self.assert_eq(
+                    psdf.groupby("a", axis=axis).a.sum().sort_index(),
+                    pdf.groupby("a", axis=axis).a.sum().sort_index(),
+                )
+                self.assert_eq(
+                    psdf.groupby("a", axis=axis)["a"].sum().sort_index(),
+                    pdf.groupby("a", axis=axis)["a"].sum().sort_index(),
+                )
+                self.assert_eq(
+                    psdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
+                    pdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
+                )
+                self.assert_eq(
+                    psdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
+                    pdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
+                )
+
+                self.assert_eq(
+                    psdf.a.groupby(psdf.b, axis=axis).sum().sort_index(),
+                    pdf.a.groupby(pdf.b, axis=axis).sum().sort_index(),
+                )
 
         self.assertRaises(ValueError, lambda: psdf.groupby("a", as_index=False).a)
         self.assertRaises(ValueError, lambda: psdf.groupby("a", as_index=False)["a"])
@@ -124,10 +126,16 @@ class GroupByTestsMixin:
 
         self.assertRaises(TypeError, lambda: psdf.a.groupby(psdf.b, as_index=False))
 
-        self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", axis=1))
-        self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", axis="columns"))
-        self.assertRaises(ValueError, lambda: psdf.groupby("a", "b"))
-        self.assertRaises(TypeError, lambda: psdf.a.groupby(psdf.a, psdf.b))
+        if LooseVersion(pd.__version__) < "3.0.0":
+            self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", 
axis=1))
+            self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", axis="columns"))
+            self.assertRaises(ValueError, lambda: psdf.groupby("a", "b"))
+            self.assertRaises(TypeError, lambda: psdf.a.groupby(psdf.a, psdf.b))
+        else:
+            with self.assertRaises(TypeError):
+                psdf.groupby("a", axis=1)
+            with self.assertRaises(TypeError):
+                psdf.a.groupby(psdf.b, axis=1)
 
         # we can't use column name/names as a parameter `by` for `SeriesGroupBy`.
         self.assertRaises(KeyError, lambda: psdf.a.groupby(by="a"))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to