[spark] branch master updated: [SPARK-38903][PYTHON] Implement `ignore_index` of `Series.sort_values` and `Series.sort_index`

gurwls223 Mon, 18 Apr 2022 19:06:27 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new eadd5354c45 [SPARK-38903][PYTHON] Implement `ignore_index` of `Series.sort_values` and `Series.sort_index`
eadd5354c45 is described below

commit eadd5354c459114aa1d0fd0a6ca432a2d6249ae9
Author: Xinrong Meng <[email protected]>
AuthorDate: Tue Apr 19 11:06:11 2022 +0900

    [SPARK-38903][PYTHON] Implement `ignore_index` of `Series.sort_values` and `Series.sort_index`
    
    ### What changes were proposed in this pull request?
    Implement `ignore_index` of `Series.sort_values` and `Series.sort_index`
    
    ### Why are the changes needed?
    To reach parity with pandas API.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes. The `ignore_index` parameter of `Series.sort_values` and `Series.sort_index` is now supported.
    
    ```py
    >>> s = ps.Series([2, 1, 3], index=['b', 'c', 'a'])
    >>> s.sort_values(ignore_index=True)
    0    1
    1    2
    2    3
    dtype: int64
    >>> s.sort_index(ignore_index=True)
    0    3
    1    2
    2    1
    dtype: int64
    ```
    
    ### How was this patch tested?
    Unit tests.
    
    Closes #36186 from xinrong-databricks/series.ignore_index.
    
    Authored-by: Xinrong Meng <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/pandas/series.py            | 61 +++++++++++++++++++++++-------
 python/pyspark/pandas/tests/test_series.py | 16 +++++++-
 2 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index f4638fe22de..3ac2daa612a 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -2703,7 +2703,11 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         return first_series(DataFrame(internal))
 
     def sort_values(
-        self, ascending: bool = True, inplace: bool = False, na_position: str = "last"
+        self,
+        ascending: bool = True,
+        inplace: bool = False,
+        na_position: str = "last",
+        ignore_index: bool = False,
     ) -> Optional["Series"]:
         """
         Sort by the values.
@@ -2720,6 +2724,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
              if True, perform operation in-place
         na_position : {'first', 'last'}, default 'last'
              `first` puts NaNs at the beginning, `last` puts NaNs at the end
+        ignore_index : bool, default False
+             If True, the resulting axis will be labeled 0, 1, …, n - 1.
+
+             .. versionadded:: 3.4.0
 
         Returns
         -------
@@ -2756,6 +2764,16 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         0     NaN
         dtype: float64
 
+        Sort values in descending order, ignoring the index
+
+        >>> s.sort_values(ascending=False, ignore_index=True)
+        0    10.0
+        1     5.0
+        2     3.0
+        3     1.0
+        4     NaN
+        dtype: float64
+
         Sort values inplace
 
         >>> s.sort_values(ascending=False, inplace=True)
@@ -2802,10 +2820,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         )
 
         if inplace:
+            if ignore_index:
+                psdf.reset_index(drop=True, inplace=inplace)
             self._update_anchor(psdf)
             return None
         else:
-            return first_series(psdf)
+            return first_series(psdf.reset_index(drop=True)) if ignore_index else first_series(psdf)
 
     def sort_index(
         self,
@@ -2815,6 +2835,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         inplace: bool = False,
         kind: str = None,
         na_position: str = "last",
+        ignore_index: bool = False,
     ) -> Optional["Series"]:
         """
         Sort object by labels (along an axis)
@@ -2834,6 +2855,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         na_position : {‘first’, ‘last’}, default ‘last’
             first puts NaNs at the beginning, last puts NaNs at the end. Not 
implemented for
             MultiIndex.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+
+            .. versionadded:: 3.4.0
 
         Returns
         -------
@@ -2841,50 +2866,58 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
 
         Examples
         --------
-        >>> df = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan])
+        >>> s = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan])
 
-        >>> df.sort_index()
+        >>> s.sort_index()
         a      1.0
         b      2.0
         NaN    NaN
         dtype: float64
 
-        >>> df.sort_index(ascending=False)
+        >>> s.sort_index(ignore_index=True)
+        0    1.0
+        1    2.0
+        2    NaN
+        dtype: float64
+
+        >>> s.sort_index(ascending=False)
         b      2.0
         a      1.0
         NaN    NaN
         dtype: float64
 
-        >>> df.sort_index(na_position='first')
+        >>> s.sort_index(na_position='first')
         NaN    NaN
         a      1.0
         b      2.0
         dtype: float64
 
-        >>> df.sort_index(inplace=True)
-        >>> df
+        >>> s.sort_index(inplace=True)
+        >>> s
         a      1.0
         b      2.0
         NaN    NaN
         dtype: float64
 
-        >>> df = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1, 
0]], name='0')
+        Multi-index series.
+
+        >>> s = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1, 
0]], name='0')
 
-        >>> df.sort_index()
+        >>> s.sort_index()
         a  0    3
            1    2
         b  0    1
            1    0
         Name: 0, dtype: int64
 
-        >>> df.sort_index(level=1)  # doctest: +SKIP
+        >>> s.sort_index(level=1)  # doctest: +SKIP
         a  0    3
         b  0    1
         a  1    2
         b  1    0
         Name: 0, dtype: int64
 
-        >>> df.sort_index(level=[1, 0])
+        >>> s.sort_index(level=[1, 0])
         a  0    3
         b  0    1
         a  1    2
@@ -2897,10 +2930,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         )
 
         if inplace:
+            if ignore_index:
+                psdf.reset_index(drop=True, inplace=inplace)
             self._update_anchor(psdf)
             return None
         else:
-            return first_series(psdf)
+            return first_series(psdf.reset_index(drop=True)) if ignore_index else first_series(psdf)
 
     def swaplevel(
         self, i: Union[int, Name] = -2, j: Union[int, Name] = -1, copy: bool = 
True
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 68fed26324d..7f85e3f431e 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -978,13 +978,14 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         check(psdf.index.to_series(name=("x", "a")), 
pdf.index.to_series(name=("x", "a")))
 
     def test_sort_values(self):
-        pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]})
+        pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]}, index=np.random.rand(7))
         psdf = ps.from_pandas(pdf)
 
         pser = pdf.x
         psser = psdf.x
 
         self.assert_eq(psser.sort_values(), pser.sort_values())
+        self.assert_eq(psser.sort_values(ignore_index=True), pser.sort_values(ignore_index=True))
         self.assert_eq(psser.sort_values(ascending=False), 
pser.sort_values(ascending=False))
         self.assert_eq(
             psser.sort_values(na_position="first"), 
pser.sort_values(na_position="first")
@@ -998,6 +999,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         self.assert_eq(psser, pser.sort_values())
         self.assert_eq(psdf, pdf)
 
+        # pandas raises an exception when the Series is derived from DataFrame
+        psser.sort_values(inplace=True, ascending=False, ignore_index=True)
+        self.assert_eq(psser, pser.sort_values(ascending=False, ignore_index=True))
+        self.assert_eq(psdf, pdf)
+
         pser = pdf.x.copy()
         psser = psdf.x.copy()
 
@@ -1024,6 +1030,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         self.assert_eq(psser.sort_index(ascending=False), 
pser.sort_index(ascending=False))
         # Assert sorting NA indices first
         self.assert_eq(psser.sort_index(na_position="first"), 
pser.sort_index(na_position="first"))
+        # Assert ignoring index
+        self.assert_eq(psser.sort_index(ignore_index=True), pser.sort_index(ignore_index=True))
 
         # Assert sorting inplace
         # pandas sorts pdf.x by the index and update the column only
@@ -1032,6 +1040,12 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
         self.assert_eq(psser, pser.sort_index())
         self.assert_eq(psdf, pdf)
 
+        # pandas sorts pdf.x by the index and update the column only
+        # when the Series is derived from DataFrame.
+        psser.sort_index(inplace=True, ascending=False, ignore_index=True)
+        self.assert_eq(psser, pser.sort_index(ascending=False, ignore_index=True))
+        self.assert_eq(psdf, pdf)
+
         pser = pdf.x.copy()
         psser = psdf.x.copy()
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: [SPARK-38903][PYTHON] Implement `ignore_index` of `Series.sort_values` and `Series.sort_index`

Reply via email to