This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new eadd5354c45 [SPARK-38903][PYTHON] Implement `ignore_index` of
`Series.sort_values` and `Series.sort_index`
eadd5354c45 is described below
commit eadd5354c459114aa1d0fd0a6ca432a2d6249ae9
Author: Xinrong Meng <[email protected]>
AuthorDate: Tue Apr 19 11:06:11 2022 +0900
[SPARK-38903][PYTHON] Implement `ignore_index` of `Series.sort_values` and
`Series.sort_index`
### What changes were proposed in this pull request?
Implement `ignore_index` of `Series.sort_values` and `Series.sort_index`
### Why are the changes needed?
To reach parity with the pandas API.
### Does this PR introduce _any_ user-facing change?
Yes. The `ignore_index` parameter of `Series.sort_values` and `Series.sort_index`
is now supported.
```py
>>> s = ps.Series([2, 1, 3], index=['b', 'c', 'a'])
>>> s.sort_values(ignore_index=True)
0 1
1 2
2 3
dtype: int64
>>> s.sort_index(ignore_index=True)
0 3
1 2
2 1
dtype: int64
```
### How was this patch tested?
Unit tests.
Closes #36186 from xinrong-databricks/series.ignore_index.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/series.py | 61 +++++++++++++++++++++++-------
python/pyspark/pandas/tests/test_series.py | 16 +++++++-
2 files changed, 63 insertions(+), 14 deletions(-)
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index f4638fe22de..3ac2daa612a 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -2703,7 +2703,11 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
return first_series(DataFrame(internal))
def sort_values(
- self, ascending: bool = True, inplace: bool = False, na_position: str
= "last"
+ self,
+ ascending: bool = True,
+ inplace: bool = False,
+ na_position: str = "last",
+ ignore_index: bool = False,
) -> Optional["Series"]:
"""
Sort by the values.
@@ -2720,6 +2724,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
if True, perform operation in-place
na_position : {'first', 'last'}, default 'last'
`first` puts NaNs at the beginning, `last` puts NaNs at the end
+ ignore_index : bool, default False
+ If True, the resulting axis will be labeled 0, 1, …, n - 1.
+
+ .. versionadded:: 3.4.0
Returns
-------
@@ -2756,6 +2764,16 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
0 NaN
dtype: float64
+ Sort values descending order and ignoring index
+
+ >>> s.sort_values(ascending=False, ignore_index=True)
+ 0 10.0
+ 1 5.0
+ 2 3.0
+ 3 1.0
+ 4 NaN
+ dtype: float64
+
Sort values inplace
>>> s.sort_values(ascending=False, inplace=True)
@@ -2802,10 +2820,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
)
if inplace:
+ if ignore_index:
+ psdf.reset_index(drop=True, inplace=inplace)
self._update_anchor(psdf)
return None
else:
- return first_series(psdf)
+ return first_series(psdf.reset_index(drop=True)) if ignore_index
else first_series(psdf)
def sort_index(
self,
@@ -2815,6 +2835,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
inplace: bool = False,
kind: str = None,
na_position: str = "last",
+ ignore_index: bool = False,
) -> Optional["Series"]:
"""
Sort object by labels (along an axis)
@@ -2834,6 +2855,10 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
na_position : {‘first’, ‘last’}, default ‘last’
first puts NaNs at the beginning, last puts NaNs at the end. Not
implemented for
MultiIndex.
+ ignore_index : bool, default False
+ If True, the resulting axis will be labeled 0, 1, …, n - 1.
+
+ .. versionadded:: 3.4.0
Returns
-------
@@ -2841,50 +2866,58 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Examples
--------
- >>> df = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan])
+ >>> s = ps.Series([2, 1, np.nan], index=['b', 'a', np.nan])
- >>> df.sort_index()
+ >>> s.sort_index()
a 1.0
b 2.0
NaN NaN
dtype: float64
- >>> df.sort_index(ascending=False)
+ >>> s.sort_index(ignore_index=True)
+ 0 1.0
+ 1 2.0
+ 2 NaN
+ dtype: float64
+
+ >>> s.sort_index(ascending=False)
b 2.0
a 1.0
NaN NaN
dtype: float64
- >>> df.sort_index(na_position='first')
+ >>> s.sort_index(na_position='first')
NaN NaN
a 1.0
b 2.0
dtype: float64
- >>> df.sort_index(inplace=True)
- >>> df
+ >>> s.sort_index(inplace=True)
+ >>> s
a 1.0
b 2.0
NaN NaN
dtype: float64
- >>> df = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1,
0]], name='0')
+ Multi-index series.
+
+ >>> s = ps.Series(range(4), index=[['b', 'b', 'a', 'a'], [1, 0, 1,
0]], name='0')
- >>> df.sort_index()
+ >>> s.sort_index()
a 0 3
1 2
b 0 1
1 0
Name: 0, dtype: int64
- >>> df.sort_index(level=1) # doctest: +SKIP
+ >>> s.sort_index(level=1) # doctest: +SKIP
a 0 3
b 0 1
a 1 2
b 1 0
Name: 0, dtype: int64
- >>> df.sort_index(level=[1, 0])
+ >>> s.sort_index(level=[1, 0])
a 0 3
b 0 1
a 1 2
@@ -2897,10 +2930,12 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
)
if inplace:
+ if ignore_index:
+ psdf.reset_index(drop=True, inplace=inplace)
self._update_anchor(psdf)
return None
else:
- return first_series(psdf)
+ return first_series(psdf.reset_index(drop=True)) if ignore_index
else first_series(psdf)
def swaplevel(
self, i: Union[int, Name] = -2, j: Union[int, Name] = -1, copy: bool =
True
diff --git a/python/pyspark/pandas/tests/test_series.py
b/python/pyspark/pandas/tests/test_series.py
index 68fed26324d..7f85e3f431e 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -978,13 +978,14 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
check(psdf.index.to_series(name=("x", "a")),
pdf.index.to_series(name=("x", "a")))
def test_sort_values(self):
- pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]})
+ pdf = pd.DataFrame({"x": [1, 2, 3, 4, 5, None, 7]},
index=np.random.rand(7))
psdf = ps.from_pandas(pdf)
pser = pdf.x
psser = psdf.x
self.assert_eq(psser.sort_values(), pser.sort_values())
+ self.assert_eq(psser.sort_values(ignore_index=True),
pser.sort_values(ignore_index=True))
self.assert_eq(psser.sort_values(ascending=False),
pser.sort_values(ascending=False))
self.assert_eq(
psser.sort_values(na_position="first"),
pser.sort_values(na_position="first")
@@ -998,6 +999,11 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(psser, pser.sort_values())
self.assert_eq(psdf, pdf)
+ # pandas raises an exception when the Series is derived from DataFrame
+ psser.sort_values(inplace=True, ascending=False, ignore_index=True)
+ self.assert_eq(psser, pser.sort_values(ascending=False,
ignore_index=True))
+ self.assert_eq(psdf, pdf)
+
pser = pdf.x.copy()
psser = psdf.x.copy()
@@ -1024,6 +1030,8 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(psser.sort_index(ascending=False),
pser.sort_index(ascending=False))
# Assert sorting NA indices first
self.assert_eq(psser.sort_index(na_position="first"),
pser.sort_index(na_position="first"))
+ # Assert ignoring index
+ self.assert_eq(psser.sort_index(ignore_index=True),
pser.sort_index(ignore_index=True))
# Assert sorting inplace
# pandas sorts pdf.x by the index and update the column only
@@ -1032,6 +1040,12 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(psser, pser.sort_index())
self.assert_eq(psdf, pdf)
+ # pandas sorts pdf.x by the index and update the column only
+ # when the Series is derived from DataFrame.
+ psser.sort_index(inplace=True, ascending=False, ignore_index=True)
+ self.assert_eq(psser, pser.sort_index(ascending=False,
ignore_index=True))
+ self.assert_eq(psdf, pdf)
+
pser = pdf.x.copy()
psser = psdf.x.copy()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]