This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 14fa704c8e7 [SPARK-38726][PYTHON] Support `how` parameter of
`MultiIndex.dropna`
14fa704c8e7 is described below
commit 14fa704c8e720aa244ac496e43e7a21248308787
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Apr 8 09:52:57 2022 +0900
[SPARK-38726][PYTHON] Support `how` parameter of `MultiIndex.dropna`
### What changes were proposed in this pull request?
Support the `how` parameter of `MultiIndex.dropna`, which specifies whether
an entry is dropped when any or all of its levels are NaN.
### Why are the changes needed?
To reach parity with pandas.
### Does this PR introduce _any_ user-facing change?
Yes. The `how` parameter of `MultiIndex.dropna` is now supported.
```py
>>> tuples = [(np.nan, 1.0), (2.0, 2.0), (np.nan, np.nan), (3.0, np.nan)]
>>> midx = ps.MultiIndex.from_tuples(tuples)
>>> midx
MultiIndex([(nan, 1.0),
(2.0, 2.0),
(nan, nan),
(3.0, nan)],
)
>>> midx.dropna(how="any")
MultiIndex([(2.0, 2.0)],
)
>>> midx.dropna(how="all")
MultiIndex([(nan, 1.0),
(2.0, 2.0),
(3.0, nan)],
)
```
### How was this patch tested?
Unit tests.
Closes #36028 from xinrong-databricks/index.dropna.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/indexes/base.py | 61 +++++++++++++-----------
python/pyspark/pandas/tests/indexes/test_base.py | 17 ++++++-
2 files changed, 49 insertions(+), 29 deletions(-)
diff --git a/python/pyspark/pandas/indexes/base.py
b/python/pyspark/pandas/indexes/base.py
index 450a083a983..fd1c2dff032 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -1142,10 +1142,20 @@ class Index(IndexOpsMixin):
"""
return kind == self.inferred_type
- def dropna(self) -> "Index":
+ def dropna(self, how: str = "any") -> "Index":
"""
Return Index or MultiIndex without NA/NaN values
+ Parameters
+ ----------
+ how : {'any', 'all'}, default 'any'
+ If the Index is a MultiIndex, drop the value when any or all levels
+ are NaN.
+
+ Returns
+ -------
+ Index or MultiIndex
+
Examples
--------
@@ -1163,35 +1173,30 @@ class Index(IndexOpsMixin):
Also support for MultiIndex
- >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
- ... [None, 'weight', 'length']],
- ... [[0, 1, 1, 1, 1, 1, 2, 2, 2],
- ... [0, 1, 1, 0, 1, 2, 1, 1, 2]])
- >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, None],
- ... index=midx)
- >>> s
- lama NaN 45.0
- cow weight 200.0
- weight 1.2
- NaN 30.0
- weight 250.0
- length 1.5
- falcon weight 320.0
- weight 1.0
- length NaN
- dtype: float64
-
- >>> s.index.dropna() # doctest: +SKIP
- MultiIndex([( 'cow', 'weight'),
- ( 'cow', 'weight'),
- ( 'cow', 'weight'),
- ( 'cow', 'length'),
- ('falcon', 'weight'),
- ('falcon', 'weight'),
- ('falcon', 'length')],
+
+ >>> tuples = [(np.nan, 1.0), (2.0, 2.0), (np.nan, np.nan), (3.0,
np.nan)]
+ >>> midx = ps.MultiIndex.from_tuples(tuples)
+ >>> midx # doctest: +SKIP
+ MultiIndex([(nan, 1.0),
+ (2.0, 2.0),
+ (nan, nan),
+ (3.0, nan)],
+ )
+
+ >>> midx.dropna() # doctest: +SKIP
+ MultiIndex([(2.0, 2.0)],
+ )
+
+ >>> midx.dropna(how="all") # doctest: +SKIP
+ MultiIndex([(nan, 1.0),
+ (2.0, 2.0),
+ (3.0, nan)],
)
"""
- sdf =
self._internal.spark_frame.select(self._internal.index_spark_columns).dropna()
+ if how not in ("any", "all"):
+ raise ValueError("invalid how option: %s" % how)
+
+ sdf =
self._internal.spark_frame.select(self._internal.index_spark_columns).dropna(how=how)
internal = InternalFrame(
spark_frame=sdf,
index_spark_columns=[
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py
b/python/pyspark/pandas/tests/indexes/test_base.py
index 28985a9b2f7..3e03bbc028c 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -384,12 +384,27 @@ class IndexesTest(ComparisonTestBase, TestUtils):
self.assert_eq(psmidx.drop_duplicates(keep=False),
pmidx.drop_duplicates(keep=False))
def test_dropna(self):
- pidx = pd.Index([np.nan, 2, 4, 1, np.nan, 3])
+ pidx = pd.Index([np.nan, 2, 4, 1, None, 3])
psidx = ps.from_pandas(pidx)
self.assert_eq(psidx.dropna(), pidx.dropna())
self.assert_eq((psidx + 1).dropna(), (pidx + 1).dropna())
+ self.assert_eq(psidx.dropna(how="any"), pidx.dropna(how="any"))
+ self.assert_eq(psidx.dropna(how="all"), pidx.dropna(how="all"))
+
+ pmidx = pd.MultiIndex.from_tuples(
+ [(np.nan, 1.0), (2.0, 2.0), (np.nan, None), (3.0, np.nan)]
+ )
+ psmidx = ps.from_pandas(pmidx)
+ self.assert_eq(psmidx.dropna(), pmidx.dropna())
+ self.assert_eq(psmidx.dropna(how="any"), pmidx.dropna(how="any"))
+ self.assert_eq(psmidx.dropna(how="all"), pmidx.dropna(how="all"))
+
+ invalid_how = "none"
+ with self.assertRaisesRegex(ValueError, "invalid how option: %s" %
invalid_how):
+ psmidx.dropna(invalid_how)
+
def test_index_symmetric_difference(self):
pidx1 = pd.Index([1, 2, 3, 4])
pidx2 = pd.Index([2, 3, 4, 5])
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]