This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new eadd5354c45 [SPARK-38903][PYTHON] Implement `ignore_index` of `Series.sort_values` and `Series.sort_index` eadd5354c45 is described below commit eadd5354c459114aa1d0fd0a6ca432a2d6249ae9 Author: Xinrong Meng <xinrong.m...@databricks.com> AuthorDate: Tue Apr 19 11:06:11 2022 +0900 [SPARK-38903][PYTHON] Implement `ignore_index` of `Series.sort_values` and `Series.sort_index` ### What changes were proposed in this pull request? Implement `ignore_index` of `Series.sort_values` and `Series.sort_index` ### Why are the changes needed? To reach parity with the pandas API. ### Does this PR introduce _any_ user-facing change? Yes. The `ignore_index` parameter of `Series.sort_values` and `Series.sort_index` is now supported. ```py >>> s = ps.Series([2, 1, 3], index=['b', 'c', 'a']) >>> s.sort_values(ignore_index=True) 0 1 1 2 2 3 dtype: int64 >>> s.sort_index(ignore_index=True) 0 3 1 2 2 1 dtype: int64 ``` ### How was this patch tested? Unit tests. Closes #36186 from xinrong-databricks/series.ignore_index. Authored-by: Xinrong Meng <xinrong.m...@databricks.com> Signed-off-by: Hyukjin Kwon <gurwls...@apache.org> --- python/pyspark/pandas/series.py | 61 +++++++++++++++++++++++------- python/pyspark/pandas/tests/test_series.py | 16 +++++++- 2 files changed, 63 insertions(+), 14 deletions(-) diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py index f4638fe22de..3ac2daa612a 100644 --- a/python/pyspark/pandas/series.py +++ b/python/pyspark/pandas/series.py @@ -2703,7 +2703,11 @@ class Series(Frame, IndexOpsMixin, Generic[T]): return first_series(DataFrame(internal)) def sort_values( - self, ascending: bool = True, inplace: bool = False, na_position: str = "last" + self, + ascending: bool = True, + inplace: bool = False, + na_position: str = "last", + ignore_index: bool = False, ) -> Optional["Series"]: """ Sort by the values. 
@@ -2720,6 +2724,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]): if True, perform operation in-place na_position : {'first', 'last'}, default 'last' `first` puts NaNs at the beginning, `last` puts NaNs at the end + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 3.4.0 Returns ------- @@ -2756,6 +2764,16 @@ class Series(Frame, IndexOpsMixin, Generic[T]): 0 NaN dtype: float64 + Sort values descending order and ignoring index + + >>> s.sort_values(ascending=False, ignore_index=True) + 0 10.0 + 1 5.0 + 2 3.0 + 3 1.0 + 4 NaN + dtype: float64 + Sort values inplace >>> s.sort_values(ascending=False, inplace=True) @@ -2802,10 +2820,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]): ) if inplace: + if ignore_index: + psdf.reset_index(drop=True, inplace=inplace) self._update_anchor(psdf) return None else: - return first_series(psdf) + return first_series(psdf.reset_index(drop=True)) if ignore_index else first_series(psdf) def sort_index( self, @@ -2815,6 +2835,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]): inplace: bool = False, kind: str = None, na_position: str = "last", + ignore_index: bool = False, ) -> Optional["Series"]: """ Sort object by labels (along an axis) @@ -2834,6 +2855,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]): na_position : {‘first’, ‘last’}, default ‘last’ first puts NaNs at the beginning, last puts NaNs at the end. Not implemented for MultiIndex. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. 
versionadded:: 3.4.0 Returns ------- @@ -2841,50 +2866,58 @@ class Series(Frame, IndexOpsMixin, Generic[T]): Examples -------- - >>> df = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan]) + >>> s = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan]) - >>> df.sort_index() + >>> s.sort_index() a 1.0 b 2.0 NaN NaN dtype: float64 - >>> df.sort_index(ascending=False) + >>> s.sort_index(ignore_index=True) + 0 1.0 + 1 2.0 + 2 NaN + dtype: float64 + + >>> s.sort_index(ascending=False) b 2.0 a 1.0 NaN NaN dtype: float64 - >>> df.sort_index(na_position='first') + >>> s.sort_index(na_position='first') NaN NaN a 1.0 b 2.0 dtype: float64 - >>> df.sort_index(inplace=True) - >>> df + >>> s.sort_index(inplace=True) + >>> s a 1.0 b 2.0 NaN NaN dtype: float64 - >>> df = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]], name='0') + Multi-index series. + + >>> s = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1, 0]], name='0') - >>> df.sort_index() + >>> s.sort_index() a 0 3 1 2 b 0 1 1 0 Name: 0, dtype: int64 - >>> df.sort_index(level=1) # doctest: +SKIP + >>> s.sort_index(level=1) # doctest: +SKIP a 0 3 b 0 1 a 1 2 b 1 0 Name: 0, dtype: int64 - >>> df.sort_index(level=[1, 0]) + >>> s.sort_index(level=[1, 0]) a 0 3 b 0 1 a 1 2 @@ -2897,10 +2930,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]): ) if inplace: + if ignore_index: + psdf.reset_index(drop=True, inplace=inplace) self._update_anchor(psdf) return None else: - return first_series(psdf) + return first_series(psdf.reset_index(drop=True)) if ignore_index else first_series(psdf) def swaplevel( self, i: Union[int, Name] = -2, j: Union[int, Name] = -1, copy: bool = True diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 68fed26324d..7f85e3f431e 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -978,13 +978,14 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): 
check(psdf.index.to_series(name=("x", "a")), pdf.index.to_series(name=("x", "a"))) def test_sort_values(self): - pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]}) + pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]}, index=np.random.rand(7)) psdf = ps.from_pandas(pdf) pser = pdf.x psser = psdf.x self.assert_eq(psser.sort_values(), pser.sort_values()) + self.assert_eq(psser.sort_values(ignore_index=True), pser.sort_values(ignore_index=True)) self.assert_eq(psser.sort_values(ascending=False), pser.sort_values(ascending=False)) self.assert_eq( psser.sort_values(na_position="first"), pser.sort_values(na_position="first") @@ -998,6 +999,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psser, pser.sort_values()) self.assert_eq(psdf, pdf) + # pandas raises an exception when the Series is derived from DataFrame + psser.sort_values(inplace=True, ascending=False, ignore_index=True) + self.assert_eq(psser, pser.sort_values(ascending=False, ignore_index=True)) + self.assert_eq(psdf, pdf) + pser = pdf.x.copy() psser = psdf.x.copy() @@ -1024,6 +1030,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psser.sort_index(ascending=False), pser.sort_index(ascending=False)) # Assert sorting NA indices first self.assert_eq(psser.sort_index(na_position="first"), pser.sort_index(na_position="first")) + # Assert ignoring index + self.assert_eq(psser.sort_index(ignore_index=True), pser.sort_index(ignore_index=True)) # Assert sorting inplace # pandas sorts pdf.x by the index and update the column only @@ -1032,6 +1040,12 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): self.assert_eq(psser, pser.sort_index()) self.assert_eq(psdf, pdf) + # pandas sorts pdf.x by the index and update the column only + # when the Series is derived from DataFrame. 
+ psser.sort_index(inplace=True, ascending=False, ignore_index=True) + self.assert_eq(psser, pser.sort_index(ascending=False, ignore_index=True)) + self.assert_eq(psdf, pdf) + pser = pdf.x.copy() psser = psdf.x.copy() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org