This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 8dfbb9a94a0 [SPARK-38863][PYTHON] Implement `skipna` parameter of
`DataFrame.all`
8dfbb9a94a0 is described below
commit 8dfbb9a94a0d84588ebd341c728159c19123874e
Author: Xinrong Meng <[email protected]>
AuthorDate: Wed Apr 13 12:21:36 2022 +0900
[SPARK-38863][PYTHON] Implement `skipna` parameter of `DataFrame.all`
### What changes were proposed in this pull request?
Implement the `skipna` parameter of `DataFrame.all`.
If `skipna` is False, numpy.NaN values are treated as True, while None values
are treated as False.
### Why are the changes needed?
To reach parity with pandas.
### Does this PR introduce _any_ user-facing change?
Yes. The `skipna` parameter of `DataFrame.all` is now supported.
```py
>>> psdf = ps.DataFrame({'A':[True, True], 'B':[np.nan, 1], 'C':[None, 'x']})
>>> psdf
      A    B     C
0  True  NaN  None
1  True  1.0     x
>>> psdf.all(skipna=False)
A     True
B     True
C    False
dtype: bool
>>> ps.DataFrame({'A':[np.nan], 'B':[None]}).all(skipna=False)
A     True
B    False
dtype: bool
```
### How was this patch tested?
Unit tests.
Closes #36104 from xinrong-databricks/frame.all.skipna.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/base.py | 6 ++++--
python/pyspark/pandas/frame.py | 30 ++++++++++++++++++++++++---
python/pyspark/pandas/tests/test_dataframe.py | 15 ++++++++++++++
3 files changed, 46 insertions(+), 5 deletions(-)
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index 2d2c79e7f47..99922074f4c 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -982,9 +982,11 @@ class IndexOpsMixin(object, metaclass=ABCMeta):
original column labels.
skipna : boolean, default True
- Exclude NA/null values. If an entire row/column is NA and skipna
is True,
+ Exclude NA values, such as None or numpy.NaN.
+ If an entire row/column is NA values and `skipna` is True,
then the result will be True, as for an empty row/column.
- If skipna is False, then NA are treated as True, because these are
not equal to zero.
+ If `skipna` is False, numpy.NaNs are treated as True because these
are
+ not equal to zero, Nones are treated as False.
Examples
--------
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 683ef9b4a88..5ac9e30f390 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -10164,8 +10164,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
)
)
- # TODO: axis, skipna, level and **kwargs should be implemented.
- def all(self, axis: Axis = 0, bool_only: Optional[bool] = None) ->
"Series":
+ # TODO: axis, level and **kwargs should be implemented.
+ def all(
+ self, axis: Axis = 0, bool_only: Optional[bool] = None, skipna: bool =
True
+ ) -> "Series":
"""
Return whether all elements are True.
@@ -10184,6 +10186,13 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
Include only boolean columns. If None, will attempt to use
everything,
then use only boolean data.
+ skipna : boolean, default True
+ Exclude NA values, such as None or numpy.NaN.
+ If an entire row/column is NA values and `skipna` is True,
+ then the result will be True, as for an empty row/column.
+ If `skipna` is False, numpy.NaNs are treated as True because these
are
+ not equal to zero, Nones are treated as False.
+
Returns
-------
Series
@@ -10212,6 +10221,13 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
col6 False
dtype: bool
+ Include NA values when set `skipna=False`.
+
+ >>> df[['col5', 'col6']].all(skipna=False)
+ col5 False
+ col6 False
+ dtype: bool
+
Include only boolean columns when set `bool_only=True`.
>>> df.all(bool_only=True)
@@ -10232,7 +10248,15 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
applied = []
for label in column_labels:
scol = self._internal.spark_column_for(label)
- all_col = F.min(F.coalesce(scol.cast("boolean"), SF.lit(True)))
+
+ if isinstance(self._internal.spark_type_for(label), NumericType)
or skipna:
+ # np.nan takes no effect to the result; None takes no effect
if `skipna`
+ all_col = F.min(F.coalesce(scol.cast("boolean"), SF.lit(True)))
+ else:
+ # Take None as False when not `skipna`
+ all_col = F.min(
+ F.when(scol.isNull(),
SF.lit(False)).otherwise(scol.cast("boolean"))
+ )
applied.append(F.when(all_col.isNull(), True).otherwise(all_col))
return self._result_aggregated(column_labels, applied)
diff --git a/python/pyspark/pandas/tests/test_dataframe.py
b/python/pyspark/pandas/tests/test_dataframe.py
index f3de2856d83..fa32b38d3c9 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -3957,6 +3957,21 @@ class DataFrameTest(ComparisonTestBase, SQLTestUtils):
):
psdf.all(axis=1)
+ # Test skipna
+ pdf = pd.DataFrame({"A": [True, True], "B": [1, np.nan], "C": [True,
None]})
+ pdf.name = "x"
+ psdf = ps.from_pandas(pdf)
+ self.assert_eq(psdf[["A", "B"]].all(skipna=False), pdf[["A",
"B"]].all(skipna=False))
+ self.assert_eq(psdf[["A", "C"]].all(skipna=False), pdf[["A",
"C"]].all(skipna=False))
+ self.assert_eq(psdf[["B", "C"]].all(skipna=False), pdf[["B",
"C"]].all(skipna=False))
+ self.assert_eq(psdf.all(skipna=False), pdf.all(skipna=False))
+ self.assert_eq(psdf.all(skipna=True), pdf.all(skipna=True))
+ self.assert_eq(psdf.all(), pdf.all())
+ self.assert_eq(
+ ps.DataFrame([np.nan]).all(skipna=False),
pd.DataFrame([np.nan]).all(skipna=False)
+ )
+ self.assert_eq(ps.DataFrame([None]).all(skipna=True),
pd.DataFrame([None]).all(skipna=True))
+
def test_any(self):
pdf = pd.DataFrame(
{
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]