This is an automated email from the ASF dual-hosted git repository.
holden pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 161ed3d18dc3 [SPARK-46166][PS] Implementation of pandas.DataFrame.any
with axis=1
161ed3d18dc3 is described below
commit 161ed3d18dc346d3ad970b7a5997e42ea05b5206
Author: Devin Petersohn <[email protected]>
AuthorDate: Wed Nov 26 16:54:59 2025 -0800
[SPARK-46166][PS] Implementation of pandas.DataFrame.any with axis=1
### What changes were proposed in this pull request?
Add support for `axis=1` (and its alias `axis="columns"`) to `DataFrame.any` in pandas API on Spark, matching `pandas.DataFrame.any`.
### Why are the changes needed?
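To introduce `axis=1`: previously `DataFrame.any` raised `NotImplementedError` for any axis other than 0 / "index", so a row-wise reduction across columns was not possible.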
### Does this PR introduce _any_ user-facing change?
Yes, `any` will now support `axis=1`.
### How was this patch tested?
Ran locally, with new tests added to `python/pyspark/pandas/tests/computation/test_any_all.py`.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #53096 from devin-petersohn/devin/any_axis_46166.
Lead-authored-by: Devin Petersohn <[email protected]>
Co-authored-by: Devin Petersohn <[email protected]>
Signed-off-by: Holden Karau <[email protected]>
---
python/pyspark/pandas/frame.py | 59 ++++++++++++++++------
.../pandas/tests/computation/test_any_all.py | 45 +++++++++++++++--
2 files changed, 84 insertions(+), 20 deletions(-)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index af89d18a0ede..379d3698bc09 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -11196,28 +11196,55 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Series([], dtype: bool)
"""
axis = validate_axis(axis)
- if axis != 0:
- raise NotImplementedError('axis should be either 0 or "index"
currently.')
-
column_labels = self._internal.column_labels
if bool_only:
column_labels = self._bool_column_labels(column_labels)
if len(column_labels) == 0:
return ps.Series([], dtype=bool)
+ if axis == 0:
+ applied: List[PySparkColumn] = []
+ for label in column_labels:
+ scol = self._internal.spark_column_for(label)
+ if skipna:
+ # When skipna=True, nulls count as False
+ any_col = F.max(scol.cast("boolean"))
+ applied.append(F.when(any_col.isNull(),
False).otherwise(any_col))
+ else:
+ # When skipna=False, nulls count as True
+ any_col = F.max(scol.cast("boolean"))
+ applied.append(F.when(any_col.isNull(),
True).otherwise(any_col))
+ return self._result_aggregated(column_labels, applied)
+ elif axis == 1:
+ from pyspark.pandas.series import first_series
- applied: List[PySparkColumn] = []
- for label in column_labels:
- scol = self._internal.spark_column_for(label)
- if skipna:
- # When skipna=True, nulls count as False
- any_col = F.max(scol.cast("boolean"))
- applied.append(F.when(any_col.isNull(),
False).otherwise(any_col))
- else:
- # When skipna=False, nulls count as True
- any_col = F.max(scol.cast("boolean"))
- applied.append(F.when(any_col.isNull(),
True).otherwise(any_col))
-
- return self._result_aggregated(column_labels, applied)
+ sdf = self._internal.spark_frame.select(
+ *self._internal_frame.index_spark_columns,
+ F.greatest(
+ *[
+ F.coalesce(
+
self._internal.spark_column_for(label).cast("boolean"),
+ # When skipna=True, nulls count as False and vice
versa
+ F.lit(not skipna),
+ )
+ for label in column_labels
+ ],
+ F.lit(False), # Handle one-column DataFrame case
+ ).alias(SPARK_DEFAULT_SERIES_NAME),
+ )
+ return first_series(
+ DataFrame(
+ InternalFrame(
+ spark_frame=sdf,
+ index_spark_columns=self._internal.index_spark_columns,
+ index_names=self._internal.index_names,
+ index_fields=self._internal.index_fields,
+ column_labels=[None],
+ )
+ )
+ )
+ else:
+ # axis=None case - return single boolean value
+ raise NotImplementedError('axis should be 0, 1, "index", or
"columns" currently.')
def _bool_column_labels(self, column_labels: List[Label]) -> List[Label]:
"""
diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py
b/python/pyspark/pandas/tests/computation/test_any_all.py
index c381c96ead0e..37966f9e0bf1 100644
--- a/python/pyspark/pandas/tests/computation/test_any_all.py
+++ b/python/pyspark/pandas/tests/computation/test_any_all.py
@@ -135,6 +135,29 @@ class FrameAnyAllMixin:
self.assert_eq(psdf.any(bool_only=True), pdf.any(bool_only=True))
self.assert_eq(psdf.any(bool_only=False), pdf.any(bool_only=False))
+ # Test axis=1
+ self.assert_eq(psdf.any(axis=1), pdf.any(axis=1))
+ self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1,
bool_only=True))
+ self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1,
bool_only=False))
+
+ # Test axis='index'
+ self.assert_eq(psdf.any(axis="index"), pdf.any(axis="index"))
+ self.assert_eq(
+ psdf.any(axis="index", bool_only=True), pdf.any(axis="index",
bool_only=True)
+ )
+ self.assert_eq(
+ psdf.any(axis="index", bool_only=False), pdf.any(axis="index",
bool_only=False)
+ )
+
+ # Test axis='columns'
+ self.assert_eq(psdf.any(axis="columns"), pdf.any(axis="columns"))
+ self.assert_eq(
+ psdf.any(axis="columns", bool_only=True), pdf.any(axis="columns",
bool_only=True)
+ )
+ self.assert_eq(
+ psdf.any(axis="columns", bool_only=False), pdf.any(axis="columns",
bool_only=False)
+ )
+
columns.names = ["X", "Y"]
pdf.columns = columns
psdf.columns = columns
@@ -143,10 +166,10 @@ class FrameAnyAllMixin:
self.assert_eq(psdf.any(bool_only=True), pdf.any(bool_only=True))
self.assert_eq(psdf.any(bool_only=False), pdf.any(bool_only=False))
- with self.assertRaisesRegex(
- NotImplementedError, 'axis should be either 0 or "index"
currently.'
- ):
- psdf.any(axis=1)
+ # Test axis=1
+ self.assert_eq(psdf.any(axis=1), pdf.any(axis=1))
+ self.assert_eq(psdf.any(axis=1, bool_only=True), pdf.any(axis=1,
bool_only=True))
+ self.assert_eq(psdf.any(axis=1, bool_only=False), pdf.any(axis=1,
bool_only=False))
# Test skipna parameter
pdf = pd.DataFrame(
@@ -156,12 +179,16 @@ class FrameAnyAllMixin:
# bools and np.nan
self.assert_eq(psdf[["A", "B"]].any(skipna=False), pdf[["A",
"B"]].any(skipna=False))
+ self.assert_eq(
+ psdf[["A", "B"]].any(axis=1, skipna=False), pdf[["A",
"B"]].any(axis=1, skipna=False)
+ )
# bools and None
self.assert_eq(psdf[["A", "C"]].any(skipna=False), pdf[["A",
"C"]].any(skipna=False))
# bools, np.nan, and None
self.assert_eq(psdf[["B", "C"]].any(skipna=False), pdf[["B",
"C"]].any(skipna=False))
# np.nan, and None
self.assert_eq(psdf[["D"]].any(skipna=False),
pdf[["D"]].any(skipna=False))
+ self.assert_eq(psdf[["D"]].any(axis=1, skipna=False),
pdf[["D"]].any(axis=1, skipna=False))
# np.nan only
self.assert_eq(
@@ -169,6 +196,11 @@ class FrameAnyAllMixin:
pd.DataFrame([np.nan]).any(skipna=False),
almost=True,
)
+ self.assert_eq(
+ ps.DataFrame([np.nan]).any(axis=1, skipna=False),
+ pd.DataFrame([np.nan]).any(axis=1, skipna=False),
+ almost=True,
+ )
# None only
self.assert_eq(
@@ -176,6 +208,11 @@ class FrameAnyAllMixin:
pd.DataFrame([None]).any(skipna=True),
almost=True,
)
+ self.assert_eq(
+ ps.DataFrame([None]).any(axis=1, skipna=True),
+ pd.DataFrame([None]).any(axis=1, skipna=True),
+ almost=True,
+ )
class FrameAnyAllTests(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]