This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 1d09e7b [SPARK-38704][PYTHON] Support string `inclusive` parameter of
`Series.between`
1d09e7b is described below
commit 1d09e7be3aedd43a0b8beb44f17b7e79b9e9d402
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Apr 1 13:38:15 2022 +0900
[SPARK-38704][PYTHON] Support string `inclusive` parameter of
`Series.between`
### What changes were proposed in this pull request?
Support string `inclusive` parameter of `Series.between`
### Why are the changes needed?
To reach parity with pandas.
### Does this PR introduce _any_ user-facing change?
Yes. The `inclusive` parameter now also accepts a string value, as shown below.
```py
>>> s = ps.Series([2, 0, 4, 8, np.nan])
With `inclusive` set to "both" boundary values are included:
>>> s.between(0, 4, inclusive="both")
0 True
1 True
2 True
3 False
4 False
dtype: bool
With `inclusive` set to "neither" boundary values are excluded:
>>> s.between(0, 4, inclusive="neither")
0 True
1 False
2 False
3 False
4 False
dtype: bool
With `inclusive` set to "right", only the right boundary value is
included:
>>> s.between(0, 4, inclusive="right")
0 True
1 False
2 True
3 False
4 False
dtype: bool
With `inclusive` set to "left", only the left boundary value is included:
>>> s.between(0, 4, inclusive="left")
0 True
1 True
2 False
3 False
4 False
dtype: bool
```
### How was this patch tested?
Unit tests.
Closes #36015 from xinrong-databricks/series.between.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/series.py | 61 +++++++++++++++++++++++++-----
python/pyspark/pandas/tests/test_series.py | 24 ++++++++++++
2 files changed, 76 insertions(+), 9 deletions(-)
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index a73ea1e..5c195da 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -22,6 +22,7 @@ import datetime
import re
import inspect
import sys
+import warnings
from collections.abc import Mapping
from functools import partial, reduce
from typing import (
@@ -853,7 +854,7 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
"""
return self.rfloordiv(other), self.rmod(other)
- def between(self, left: Any, right: Any, inclusive: bool = True) ->
"Series":
+ def between(self, left: Any, right: Any, inclusive: Union[bool, str] =
"both") -> "Series":
"""
Return boolean Series equivalent to left <= series <= right.
This function returns a boolean vector containing `True` wherever the
@@ -866,8 +867,9 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Left boundary.
right : scalar or list-like
Right boundary.
- inclusive : bool, default True
- Include boundaries.
+ inclusive : {"both", "neither", "left", "right"} or boolean. "both" by
default.
+ Include boundaries. Whether to set each bound as closed or open.
+ Booleans are deprecated in favour of `both` or `neither`.
Returns
-------
@@ -890,19 +892,39 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
Boundary values are included by default:
- >>> s.between(1, 4)
+ >>> s.between(0, 4)
0 True
- 1 False
+ 1 True
2 True
3 False
4 False
dtype: bool
- With `inclusive` set to ``False`` boundary values are excluded:
+ With `inclusive` set to "neither" boundary values are excluded:
+
+ >>> s.between(0, 4, inclusive="neither")
+ 0 True
+ 1 False
+ 2 False
+ 3 False
+ 4 False
+ dtype: bool
+
+ With `inclusive` set to "right" only right boundary value is included:
- >>> s.between(1, 4, inclusive=False)
+ >>> s.between(0, 4, inclusive="right")
0 True
1 False
+ 2 True
+ 3 False
+ 4 False
+ dtype: bool
+
+ With `inclusive` set to "left" only left boundary value is included:
+
+ >>> s.between(0, 4, inclusive="left")
+ 0 True
+ 1 True
2 False
3 False
4 False
@@ -918,12 +940,33 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
3 False
dtype: bool
"""
- if inclusive:
+ if inclusive is True or inclusive is False:
+ warnings.warn(
+ "Boolean inputs to the `inclusive` argument are deprecated in "
+ "favour of `both` or `neither`.",
+ FutureWarning,
+ )
+ if inclusive:
+ inclusive = "both"
+ else:
+ inclusive = "neither"
+
+ if inclusive == "both":
lmask = self >= left
rmask = self <= right
- else:
+ elif inclusive == "left":
+ lmask = self >= left
+ rmask = self < right
+ elif inclusive == "right":
+ lmask = self > left
+ rmask = self <= right
+ elif inclusive == "neither":
lmask = self > left
rmask = self < right
+ else:
+ raise ValueError(
+ "Inclusive has to be either string of 'both'," "'left',
'right', or 'neither'."
+ )
return lmask & rmask
diff --git a/python/pyspark/pandas/tests/test_series.py
b/python/pyspark/pandas/tests/test_series.py
index 3ed8866..dafb519 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -2970,6 +2970,30 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
self.assert_eq(pser.rpow(np.nan), psser.rpow(np.nan))
self.assert_eq(1 ** pser, 1 ** psser)
+ def test_between(self):
+ pser = pd.Series([np.nan, 1, 2, 3, 4])
+ psser = ps.from_pandas(pser)
+ self.assert_eq(psser.between(1, 4), pser.between(1, 4))
+ self.assert_eq(psser.between(1, 4, inclusive="both"), pser.between(1,
4, inclusive="both"))
+ self.assert_eq(
+ psser.between(1, 4, inclusive="neither"), pser.between(1, 4,
inclusive="neither")
+ )
+ self.assert_eq(psser.between(1, 4, inclusive="left"), pser.between(1,
4, inclusive="left"))
+ self.assert_eq(
+ psser.between(1, 4, inclusive="right"), pser.between(1, 4,
inclusive="right")
+ )
+ expected_err_msg = (
+ "Inclusive has to be either string of 'both'," "'left', 'right',
or 'neither'"
+ )
+ with self.assertRaisesRegex(ValueError, expected_err_msg):
+ psser.between(1, 4, inclusive="middle")
+
+ # Test for backward compatibility
+ self.assert_eq(psser.between(1, 4, inclusive=True), pser.between(1, 4,
inclusive=True))
+ self.assert_eq(psser.between(1, 4, inclusive=False), pser.between(1,
4, inclusive=False))
+ with self.assertWarns(FutureWarning):
+ psser.between(1, 4, inclusive=True)
+
def test_between_time(self):
idx = pd.date_range("2018-04-09", periods=4, freq="1D20min")
pser = pd.Series([1, 2, 3, 4], index=idx)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]