This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f29f030d4b3 [SPARK-38938][PYTHON] Implement `inplace` and `columns`
parameters of `Series.drop`
f29f030d4b3 is described below
commit f29f030d4b38c4dc6ee49defe6ad1fb870708c46
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Apr 22 11:47:38 2022 +0900
[SPARK-38938][PYTHON] Implement `inplace` and `columns` parameters of
`Series.drop`
### What changes were proposed in this pull request?
Implement `inplace` and `columns` parameters of `Series.drop`.
### Why are the changes needed?
Increase pandas API coverage.
### Does this PR introduce _any_ user-facing change?
Yes. The `inplace` and `columns` parameters of `Series.drop` are now supported.
```py
>>> s = ps.Series(data=np.arange(3), index=['A', 'B', 'C'])
# 'columns' parameter: a no-op for a Series, which is returned unchanged
>>> s.drop(columns=['A'])
A 0
B 1
C 2
dtype: int64
# 'inplace' parameter: drops the labels from `s` itself and returns None
>>> s.drop(index=['B', 'C'], inplace=True)
>>> s
A 0
dtype: int64
```
### How was this patch tested?
Unit tests.
Closes #36215 from xinrong-databricks/series.drop.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../pandas_on_spark/supported_pandas_api.rst | 2 +-
python/pyspark/pandas/series.py | 54 ++++++++++++++++++----
python/pyspark/pandas/tests/test_series.py | 36 ++++++++++++---
3 files changed, 76 insertions(+), 16 deletions(-)
diff --git a/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst b/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
index 06d044d0190..a975d4ec8cc 100644
--- a/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
+++ b/python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst
@@ -805,7 +805,7 @@ Supported Series APIs
+---------------------------------+-------------------+-------------------------------------------+
| :func:`dot` | Y |
|
+---------------------------------+-------------------+-------------------------------------------+
-| :func:`drop` | P | ``columns``,
``inplace``, ``errors`` |
+| :func:`drop` | P | ``errors``
|
+---------------------------------+-------------------+-------------------------------------------+
| :func:`drop_duplicates` | Y |
|
+---------------------------------+-------------------+-------------------------------------------+
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 3ac2daa612a..ea3426d5a54 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -2365,7 +2365,9 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
self,
labels: Optional[Union[Name, List[Name]]] = None,
index: Optional[Union[Name, List[Name]]] = None,
+ columns: Optional[Union[Name, List[Name]]] = None,
level: Optional[int] = None,
+ inplace: bool = False,
) -> "Series":
"""
Return Series with specified index labels removed.
@@ -2377,10 +2379,18 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
----------
labels : single label or list-like
Index labels to drop.
- index : None
+ index : single label or list-like
Redundant for application on Series, but index can be used instead
of labels.
+ columns : single label or list-like
+ No change is made to the Series; use ‘index’ or ‘labels’ instead.
+
+ .. versionadded:: 3.4.0
level : int or level name, optional
For MultiIndex, level for which the labels will be removed.
+ inplace: bool, default False
+ If True, do operation inplace and return None
+
+ .. versionadded:: 3.4.0
Returns
-------
@@ -2424,6 +2434,21 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
A 0
dtype: int64
+ With 'columns', no change is made to the Series.
+
+ >>> s.drop(columns=['A'])
+ A 0
+ B 1
+ C 2
+ dtype: int64
+
+ With 'inplace=True', do operation inplace and return None.
+
+ >>> s.drop(index=['B', 'C'], inplace=True)
+ >>> s
+ A 0
+ dtype: int64
+
Also support for MultiIndex
>>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
@@ -2474,18 +2499,23 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
length 0.3
dtype: float64
"""
- return first_series(self._drop(labels=labels, index=index, level=level))
+ dropped = self._drop(
+ labels=labels, index=index, level=level, inplace=inplace, columns=columns
+ )
+ return None if dropped is None else first_series(dropped)
def _drop(
self,
labels: Optional[Union[Name, List[Name]]] = None,
index: Optional[Union[Name, List[Name]]] = None,
level: Optional[int] = None,
- ) -> DataFrame:
+ inplace: bool = False,
+ columns: Optional[Union[Name, List[Name]]] = None,
+ ) -> Optional[DataFrame]:
if labels is not None:
- if index is not None:
- raise ValueError("Cannot specify both 'labels' and 'index'")
- return self._drop(index=labels, level=level)
+ if columns is not None or index is not None:
+ raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
+ return self._drop(index=labels, level=level, inplace=inplace, columns=columns)
if index is not None:
internal = self._internal
if level is None:
@@ -2524,10 +2554,16 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
drop_index_scols.append(reduce(lambda x, y: x & y, index_scols))
cond = ~reduce(lambda x, y: x | y, drop_index_scols)
-
- return DataFrame(internal.with_filter(cond))
+ dropped_internal = internal.with_filter(cond)
+ if inplace:
+ self._update_anchor(DataFrame(dropped_internal))
+ return None
+ else:
+ return DataFrame(dropped_internal)
+ elif columns is not None:
+ return self._psdf
else:
- raise ValueError("Need to specify at least one of 'labels' or 'index'")
+ raise ValueError("Need to specify at least one of 'labels', 'index' or 'columns'")
def head(self, n: int = 5) -> "Series":
"""
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 7ae4e1da6d3..f39e1900dd3 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -1710,24 +1710,40 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
psser.aggregate(["min", max])
def test_drop(self):
- pser = pd.Series([10, 20, 15, 30, 45], name="x")
- psser = ps.Series(pser)
+ pdf = pd.DataFrame({"x": [10, 20, 15, 30, 45]})
+ psdf = ps.from_pandas(pdf)
+ pser, psser = pdf.x, psdf.x
self.assert_eq(psser.drop(1), pser.drop(1))
self.assert_eq(psser.drop([1, 4]), pser.drop([1, 4]))
+ self.assert_eq(psser.drop(columns=1), pser.drop(columns=1))
+ self.assert_eq(psser.drop(columns=[1, 4]), pser.drop(columns=[1, 4]))
- msg = "Need to specify at least one of 'labels' or 'index'"
+ msg = "Need to specify at least one of 'labels', 'index' or 'columns'"
with self.assertRaisesRegex(ValueError, msg):
psser.drop()
self.assertRaises(KeyError, lambda: psser.drop((0, 1)))
+ psser.drop([2, 3], inplace=True)
+ pser.drop([2, 3], inplace=True)
+ self.assert_eq(psser, pser)
+ self.assert_eq(psdf, pdf)
+
+ n_pser, n_psser = pser + 1, psser + 1
+ n_psser.drop([1, 4], inplace=True)
+ n_pser.drop([1, 4], inplace=True)
+ self.assert_eq(n_psser, n_pser)
+ self.assert_eq(psser, pser)
+
# For MultiIndex
midx = pd.MultiIndex(
[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
)
- pser = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
- psser = ps.from_pandas(pser)
+
+ pdf = pd.DataFrame({"x": [45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3]}, index=midx)
+ psdf = ps.from_pandas(pdf)
+ psser, pser = psdf.x, pdf.x
self.assert_eq(psser.drop("lama"), pser.drop("lama"))
self.assert_eq(psser.drop(labels="weight", level=1),
pser.drop(labels="weight", level=1))
@@ -1750,14 +1766,22 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
with self.assertRaisesRegex(ValueError, msg):
psser.drop(["lama", ["cow", "falcon"]])
- msg = "Cannot specify both 'labels' and 'index'"
+ msg = "Cannot specify both 'labels' and 'index'/'columns'"
with self.assertRaisesRegex(ValueError, msg):
psser.drop("lama", index="cow")
+ with self.assertRaisesRegex(ValueError, msg):
+ psser.drop("lama", columns="cow")
+
msg = r"'Key length \(2\) exceeds index depth \(3\)'"
with self.assertRaisesRegex(KeyError, msg):
psser.drop(("lama", "speed", "x"))
+ psser.drop({"lama": "speed"}, inplace=True)
+ pser.drop({"lama": "speed"}, inplace=True)
+ self.assert_eq(psser, pser)
+ self.assert_eq(psdf, pdf)
+
def test_pop(self):
midx = pd.MultiIndex(
[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]