[spark] branch master updated: [SPARK-38726][PYTHON] Support `how` parameter of `MultiIndex.dropna`

gurwls223 Thu, 07 Apr 2022 17:53:36 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 14fa704c8e7 [SPARK-38726][PYTHON] Support `how` parameter of 
`MultiIndex.dropna`
14fa704c8e7 is described below

commit 14fa704c8e720aa244ac496e43e7a21248308787
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Apr 8 09:52:57 2022 +0900

    [SPARK-38726][PYTHON] Support `how` parameter of `MultiIndex.dropna`
    
    ### What changes were proposed in this pull request?
    Support the `how` parameter of `MultiIndex.dropna`, which specifies whether a value is dropped when any or all of its levels are NaN.
    
    ### Why are the changes needed?
    To reach parity with pandas.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. The `how` parameter of `MultiIndex.dropna` is now supported:
    ```py
            >>> tuples = [(np.nan, 1.0), (2.0, 2.0), (np.nan, np.nan), (3.0, np.nan)]
            >>> midx = ps.MultiIndex.from_tuples(tuples)
            >>> midx
            MultiIndex([(nan, 1.0),
                        (2.0, 2.0),
                        (nan, nan),
                        (3.0, nan)],
                       )
    
            >>> midx.dropna(how="any")
            MultiIndex([(2.0, 2.0)],
                       )
    
            >>> midx.dropna(how="all")
            MultiIndex([(nan, 1.0),
                        (2.0, 2.0),
                        (3.0, nan)],
                       )
    ```
    
    ### How was this patch tested?
    Unit tests.
    
    Closes #36028 from xinrong-databricks/index.dropna.
    
    Authored-by: Xinrong Meng <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/pandas/indexes/base.py            | 61 +++++++++++++-----------
 python/pyspark/pandas/tests/indexes/test_base.py | 17 ++++++-
 2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/python/pyspark/pandas/indexes/base.py 
b/python/pyspark/pandas/indexes/base.py
index 450a083a983..fd1c2dff032 100644
--- a/python/pyspark/pandas/indexes/base.py
+++ b/python/pyspark/pandas/indexes/base.py
@@ -1142,10 +1142,20 @@ class Index(IndexOpsMixin):
         """
         return kind == self.inferred_type
 
-    def dropna(self) -> "Index":
+    def dropna(self, how: str = "any") -> "Index":
         """
         Return Index or MultiIndex without NA/NaN values
 
+        Parameters
+        ----------
+        how : {'any', 'all'}, default 'any'
+            If the Index is a MultiIndex, drop the value when any or all levels
+            are NaN.
+
+        Returns
+        -------
+        Index or MultiIndex
+
         Examples
         --------
 
@@ -1163,35 +1173,30 @@ class Index(IndexOpsMixin):
 
         Also support for MultiIndex
 
-        >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
-        ...                       [None, 'weight', 'length']],
-        ...                      [[0, 1, 1, 1, 1, 1, 2, 2, 2],
-        ...                       [0, 1, 1, 0, 1, 2, 1, 1, 2]])
-        >>> s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, None],
-        ...               index=midx)
-        >>> s
-        lama    NaN        45.0
-        cow     weight    200.0
-                weight      1.2
-                NaN        30.0
-                weight    250.0
-                length      1.5
-        falcon  weight    320.0
-                weight      1.0
-                length      NaN
-        dtype: float64
-
-        >>> s.index.dropna()  # doctest: +SKIP
-        MultiIndex([(   'cow', 'weight'),
-                    (   'cow', 'weight'),
-                    (   'cow', 'weight'),
-                    (   'cow', 'length'),
-                    ('falcon', 'weight'),
-                    ('falcon', 'weight'),
-                    ('falcon', 'length')],
+
+        >>> tuples = [(np.nan, 1.0), (2.0, 2.0), (np.nan, np.nan), (3.0, 
np.nan)]
+        >>> midx = ps.MultiIndex.from_tuples(tuples)
+        >>> midx  # doctest: +SKIP
+        MultiIndex([(nan, 1.0),
+                    (2.0, 2.0),
+                    (nan, nan),
+                    (3.0, nan)],
+                   )
+
+        >>> midx.dropna()  # doctest: +SKIP
+        MultiIndex([(2.0, 2.0)],
+                   )
+
+        >>> midx.dropna(how="all")  # doctest: +SKIP
+        MultiIndex([(nan, 1.0),
+                    (2.0, 2.0),
+                    (3.0, nan)],
                    )
         """
-        sdf = 
self._internal.spark_frame.select(self._internal.index_spark_columns).dropna()
+        if how not in ("any", "all"):
+            raise ValueError("invalid how option: %s" % how)
+
+        sdf = 
self._internal.spark_frame.select(self._internal.index_spark_columns).dropna(how=how)
         internal = InternalFrame(
             spark_frame=sdf,
             index_spark_columns=[
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py 
b/python/pyspark/pandas/tests/indexes/test_base.py
index 28985a9b2f7..3e03bbc028c 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -384,12 +384,27 @@ class IndexesTest(ComparisonTestBase, TestUtils):
         self.assert_eq(psmidx.drop_duplicates(keep=False), 
pmidx.drop_duplicates(keep=False))
 
     def test_dropna(self):
-        pidx = pd.Index([np.nan, 2, 4, 1, np.nan, 3])
+        pidx = pd.Index([np.nan, 2, 4, 1, None, 3])
         psidx = ps.from_pandas(pidx)
 
         self.assert_eq(psidx.dropna(), pidx.dropna())
         self.assert_eq((psidx + 1).dropna(), (pidx + 1).dropna())
 
+        self.assert_eq(psidx.dropna(how="any"), pidx.dropna(how="any"))
+        self.assert_eq(psidx.dropna(how="all"), pidx.dropna(how="all"))
+
+        pmidx = pd.MultiIndex.from_tuples(
+            [(np.nan, 1.0), (2.0, 2.0), (np.nan, None), (3.0, np.nan)]
+        )
+        psmidx = ps.from_pandas(pmidx)
+        self.assert_eq(psmidx.dropna(), pmidx.dropna())
+        self.assert_eq(psmidx.dropna(how="any"), pmidx.dropna(how="any"))
+        self.assert_eq(psmidx.dropna(how="all"), pmidx.dropna(how="all"))
+
+        invalid_how = "none"
+        with self.assertRaisesRegex(ValueError, "invalid how option: %s" % 
invalid_how):
+            psmidx.dropna(invalid_how)
+
     def test_index_symmetric_difference(self):
         pidx1 = pd.Index([1, 2, 3, 4])
         pidx2 = pd.Index([2, 3, 4, 5])


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: [SPARK-38726][PYTHON] Support `how` parameter of `MultiIndex.dropna`

Reply via email to