This is an automated email from the ASF dual-hosted git repository.
ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7fca140e3b3a [SPARK-55648][PS] Handle an unexpected keyword argument
error `groupby(axis)` with pandas 3
7fca140e3b3a is described below
commit 7fca140e3b3a7509ff68ed8e2d54db5190af3621
Author: Takuya Ueshin <[email protected]>
AuthorDate: Tue Feb 24 12:13:40 2026 -0800
[SPARK-55648][PS] Handle an unexpected keyword argument error
`groupby(axis)` with pandas 3
### What changes were proposed in this pull request?
Handles an unexpected keyword argument error `groupby(axis)` with pandas 3.
### Why are the changes needed?
The `axis` argument was removed from `groupby` in pandas 3.
### Does this PR introduce _any_ user-facing change?
Yes, it will behave more like pandas 3.
### How was this patch tested?
Updated the related tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54436 from ueshin/issues/SPARK-55648/axis.
Lead-authored-by: Takuya Ueshin <[email protected]>
Co-authored-by: Takuya UESHIN <[email protected]>
Signed-off-by: Takuya Ueshin <[email protected]>
---
python/pyspark/pandas/frame.py | 2 +-
python/pyspark/pandas/generic.py | 15 ++++--
python/pyspark/pandas/series.py | 2 +-
.../pyspark/pandas/tests/groupby/test_groupby.py | 60 ++++++++++++----------
4 files changed, 47 insertions(+), 32 deletions(-)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 5609f76cd719..aeb47709766c 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -13658,7 +13658,7 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
def groupby(
self,
by: Union[Name, "Series", List[Union[Name, "Series"]]],
- axis: Axis = 0,
+ axis: Union[Axis, _NoValueType] = _NoValue,
as_index: bool = True,
dropna: bool = True,
) -> "DataFrameGroupBy":
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index f73580444449..cfd566e20573 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -2397,7 +2397,7 @@ class Frame(object, metaclass=ABCMeta):
def groupby(
self: FrameLike,
by: Union[Name, "Series", List[Union[Name, "Series"]]],
- axis: Axis = 0,
+ axis: Union[Axis, _NoValueType] = _NoValue,
as_index: bool = True,
dropna: bool = True,
) -> "GroupBy[FrameLike]":
@@ -2518,9 +2518,16 @@ class Frame(object, metaclass=ABCMeta):
raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by).__name__))
if not len(new_by):
raise ValueError("No group keys passed!")
- axis = validate_axis(axis)
- if axis != 0:
-            raise NotImplementedError('axis should be either 0 or "index" currently.')
+
+ if LooseVersion(pd.__version__) < "3.0.0":
+ if axis is _NoValue:
+ axis = 0
+ axis = validate_axis(axis) # type: ignore[arg-type]
+ if axis != 0:
+                raise NotImplementedError('axis should be either 0 or "index" currently.')
+ else:
+ if axis is not _NoValue:
+                raise TypeError("The 'axis' keyword is not supported in pandas 3.0.0 and later.")
return self._build_groupby(by=new_by, as_index=as_index, dropna=dropna)
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 72d49574423b..3f8a2e57792d 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -7195,7 +7195,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
def groupby(
self,
by: Union[Name, "Series", List[Union[Name, "Series"]]],
- axis: Axis = 0,
+ axis: Union[Axis, _NoValueType] = _NoValue,
as_index: bool = True,
dropna: bool = True,
) -> "SeriesGroupBy":
diff --git a/python/pyspark/pandas/tests/groupby/test_groupby.py b/python/pyspark/pandas/tests/groupby/test_groupby.py
index 92bb84a5c341..91ea9168c710 100644
--- a/python/pyspark/pandas/tests/groupby/test_groupby.py
+++ b/python/pyspark/pandas/tests/groupby/test_groupby.py
@@ -20,6 +20,7 @@ import numpy as np
import pandas as pd
from pyspark import pandas as ps
+from pyspark.loose_version import LooseVersion
from pyspark.pandas.groupby import is_multi_agg_with_relabel
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
@@ -92,28 +93,29 @@ class GroupByTestsMixin:
psdf.a.groupby(psdf.b).sum().sort_index(),
pdf.a.groupby(pdf.b).sum().sort_index()
)
- for axis in [0, "index"]:
- self.assert_eq(
- psdf.groupby("a", axis=axis).a.sum().sort_index(),
- pdf.groupby("a", axis=axis).a.sum().sort_index(),
- )
- self.assert_eq(
- psdf.groupby("a", axis=axis)["a"].sum().sort_index(),
- pdf.groupby("a", axis=axis)["a"].sum().sort_index(),
- )
- self.assert_eq(
- psdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
- pdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
- )
- self.assert_eq(
- psdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
- pdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
- )
-
- self.assert_eq(
- psdf.a.groupby(psdf.b, axis=axis).sum().sort_index(),
- pdf.a.groupby(pdf.b, axis=axis).sum().sort_index(),
- )
+ if LooseVersion(pd.__version__) < "3.0.0":
+ for axis in [0, "index"]:
+ self.assert_eq(
+ psdf.groupby("a", axis=axis).a.sum().sort_index(),
+ pdf.groupby("a", axis=axis).a.sum().sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("a", axis=axis)["a"].sum().sort_index(),
+ pdf.groupby("a", axis=axis)["a"].sum().sort_index(),
+ )
+ self.assert_eq(
+ psdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
+ pdf.groupby("a", axis=axis)[["a"]].sum().sort_index(),
+ )
+ self.assert_eq(
+                    psdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
+ pdf.groupby("a", axis=axis)[["a", "c"]].sum().sort_index(),
+ )
+
+ self.assert_eq(
+ psdf.a.groupby(psdf.b, axis=axis).sum().sort_index(),
+ pdf.a.groupby(pdf.b, axis=axis).sum().sort_index(),
+ )
self.assertRaises(ValueError, lambda: psdf.groupby("a", as_index=False).a)
self.assertRaises(ValueError, lambda: psdf.groupby("a", as_index=False)["a"])
@@ -124,10 +126,16 @@ class GroupByTestsMixin:
self.assertRaises(TypeError, lambda: psdf.a.groupby(psdf.b, as_index=False))
-        self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", axis=1))
-        self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", axis="columns"))
- self.assertRaises(ValueError, lambda: psdf.groupby("a", "b"))
- self.assertRaises(TypeError, lambda: psdf.a.groupby(psdf.a, psdf.b))
+ if LooseVersion(pd.__version__) < "3.0.0":
+            self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", axis=1))
+            self.assertRaises(NotImplementedError, lambda: psdf.groupby("a", axis="columns"))
+ self.assertRaises(ValueError, lambda: psdf.groupby("a", "b"))
+            self.assertRaises(TypeError, lambda: psdf.a.groupby(psdf.a, psdf.b))
+ else:
+ with self.assertRaises(TypeError):
+ psdf.groupby("a", axis=1)
+ with self.assertRaises(TypeError):
+ psdf.a.groupby(psdf.b, axis=1)
# we can't use column name/names as a parameter `by` for `SeriesGroupBy`.
self.assertRaises(KeyError, lambda: psdf.a.groupby(by="a"))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]