This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8cbd520e24db [SPARK-55896][PS] Use numpy functions instead of builtins
8cbd520e24db is described below
commit 8cbd520e24dbf996a951aa120ba2d67f2521ebad
Author: Takuya Ueshin <[email protected]>
AuthorDate: Tue Mar 10 09:48:52 2026 -0700
[SPARK-55896][PS] Use numpy functions instead of builtins
### What changes were proposed in this pull request?
Uses `numpy` functions instead of builtins and fixes groupby-apply.
### Why are the changes needed?
In pandas 3, the given builtin functions won't implicitly be replaced with
the corresponding `numpy` functions anymore.
For example:
```py
>>> pdf = pd.DataFrame(
... {"d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], "v": [1.0, 2.0, 3.0, 4.0,
5.0, 6.0]}
... )
```
- pandas 2
```py
>>> pdf.groupby("d").apply(sum)
<stdin>:1: FutureWarning: The provided callable <built-in function sum> is
currently using np.sum. In a future version of pandas, the provided callable
will be used directly. To keep current behavior pass the string np.sum instead.
d v
d
1.0 3.0 6.0
2.0 6.0 15.0
```
- pandas 3
```py
>>> pdf.groupby("d").apply(sum)
Traceback (most recent call last):
...
TypeError: unsupported operand type(s) for +: 'int' and 'str'
```
### Does this PR introduce _any_ user-facing change?
Yes, it will behave more like pandas 3.
### How was this patch tested?
Updated the related tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54699 from ueshin/issues/SPARK-55896/builtin.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
python/pyspark/pandas/groupby.py | 7 ++--
.../pandas/tests/groupby/test_apply_func.py | 38 +++++++++++++++++-----
2 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 5e47f9840811..f23422b43a22 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -1988,8 +1988,11 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
if include_groups:
raise ValueError("include_groups=True is no longer allowed.")
- spec = inspect.getfullargspec(func)
- return_sig = spec.annotations.get("return", None)
+ try:
+ spec = inspect.getfullargspec(func)
+ return_sig = spec.annotations.get("return", None)
+ except TypeError:
+ return_sig = None
should_infer_schema = return_sig is None
should_retain_index = should_infer_schema
diff --git a/python/pyspark/pandas/tests/groupby/test_apply_func.py
b/python/pyspark/pandas/tests/groupby/test_apply_func.py
index 5716e574cb44..c8e2bf41b62e 100644
--- a/python/pyspark/pandas/tests/groupby/test_apply_func.py
+++ b/python/pyspark/pandas/tests/groupby/test_apply_func.py
@@ -345,14 +345,18 @@ class GroupbyApplyFuncMixin:
)
psdf = ps.from_pandas(pdf)
+ if LooseVersion(pd.__version__) < "3.0.0":
+ sum_f = sum
+ else:
+ sum_f = np.sum
+
self.assert_eq(
- psdf.groupby("d").apply(sum).sort_index(),
- pdf.groupby("d").apply(sum).sort_index()
+ psdf.groupby("d").apply(sum_f).sort_index(),
+ pdf.groupby("d").apply(sum_f).sort_index()
)
- with ps.option_context("compute.shortcut_limit", 1):
- self.assert_eq(
- psdf.groupby("d").apply(sum).sort_index(),
- pdf.groupby("d").apply(sum).sort_index()
- )
+ def test_apply_key_handling_without_shortcut(self):
+ with ps.option_context("compute.shortcut_limit", 0):
+ self.test_apply_key_handling()
def test_apply_with_side_effect(self):
pdf = pd.DataFrame(
@@ -370,6 +374,11 @@ class GroupbyApplyFuncMixin:
def _check_apply_with_side_effect(self, psdf, pdf, include_groups):
acc = ps.utils.default_session().sparkContext.accumulator(0)
+ if LooseVersion(pd.__version__) < "3.0.0":
+ sum_f = sum
+ else:
+ sum_f = np.sum
+
if include_groups:
def sum_with_acc_frame(x) -> ps.DataFrame[np.float64, np.float64]:
@@ -378,18 +387,25 @@ class GroupbyApplyFuncMixin:
return np.sum(x)
else:
+ if LooseVersion(pd.__version__) < "3.0.0":
+ ret_type = ps.DataFrame[np.float64]
+ else:
+ ret_type = np.float64
- def sum_with_acc_frame(x) -> ps.DataFrame[np.float64]:
+ def sum_with_acc_frame(x) -> ret_type:
nonlocal acc
acc += 1
return np.sum(x)
actual = psdf.groupby("d").apply(sum_with_acc_frame,
include_groups=include_groups)
- actual.columns = ["d", "v"] if include_groups else ["v"]
+ if LooseVersion(pd.__version__) < "3.0.0":
+ actual.columns = ["d", "v"] if include_groups else ["v"]
+ else:
+ actual = actual.rename()
self.assert_eq(
actual._to_pandas().sort_index(),
pdf.groupby("d")
- .apply(sum, include_groups=include_groups)
+ .apply(sum_f, include_groups=include_groups)
.sort_index()
.reset_index(drop=True),
)
@@ -406,12 +422,16 @@ class GroupbyApplyFuncMixin:
._to_pandas()
.sort_index(),
pdf.groupby("d")["v"]
- .apply(sum, include_groups=include_groups)
+ .apply(sum_f, include_groups=include_groups)
.sort_index()
.reset_index(drop=True),
)
self.assert_eq(acc.value, 4)
+ def test_apply_with_side_effect_without_shortcut(self):
+ with ps.option_context("compute.shortcut_limit", 0):
+ self.test_apply_with_side_effect()
+
def test_apply_return_series(self):
# SPARK-36907: Fix DataFrameGroupBy.apply without shortcut.
pdf = pd.DataFrame(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]