This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b4014eb13a1 [SPARK-40330][PS] Implement `Series.searchsorted`
b4014eb13a1 is described below
commit b4014eb13a1749bba5184acb92690817ae9c2115
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Sep 26 12:55:29 2022 +0900
[SPARK-40330][PS] Implement `Series.searchsorted`
### What changes were proposed in this pull request?
Implement `Series.searchsorted`
### Why are the changes needed?
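For API coverage: `Series.searchsorted` is available in pandas but was previously listed as an unsupported function in pandas-on-Spark.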
### Does this PR introduce _any_ user-facing change?
Yes, it adds the new API `Series.searchsorted`:
```
>>> ser = ps.Series([1, 2, 2, 3])
>>> ser.searchsorted(0)
0
>>> ser.searchsorted(1)
0
>>> ser.searchsorted(2)
1
>>> ser.searchsorted(5)
4
>>> ser.searchsorted(0, side="right")
0
>>> ser.searchsorted(1, side="right")
1
>>> ser.searchsorted(2, side="right")
3
>>> ser.searchsorted(5, side="right")
4
```
### How was this patch tested?
Added a unit test (`test_searchsorted` in `python/pyspark/pandas/tests/test_series.py`) and docstring examples.
Closes #37978 from zhengruifeng/ps_ser_searchordered.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../source/reference/pyspark.pandas/series.rst | 1 +
python/pyspark/pandas/missing/series.py | 1 -
python/pyspark/pandas/series.py | 74 ++++++++++++++++++++++
python/pyspark/pandas/tests/test_series.py | 27 ++++++++
4 files changed, 102 insertions(+), 1 deletion(-)
diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 5ed6df6b2a1..a0119593f96 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -201,6 +201,7 @@ Reindexing / Selection / Label manipulation
Series.reindex_like
Series.reset_index
Series.sample
+ Series.searchsorted
Series.swaplevel
Series.swapaxes
Series.take
diff --git a/python/pyspark/pandas/missing/series.py b/python/pyspark/pandas/missing/series.py
index 2b10dc34e58..74a544f0b94 100644
--- a/python/pyspark/pandas/missing/series.py
+++ b/python/pyspark/pandas/missing/series.py
@@ -39,7 +39,6 @@ class MissingPandasLikeSeries:
convert_dtypes = _unsupported_function("convert_dtypes")
infer_objects = _unsupported_function("infer_objects")
reorder_levels = _unsupported_function("reorder_levels")
- searchsorted = _unsupported_function("searchsorted")
set_axis = _unsupported_function("set_axis")
to_hdf = _unsupported_function("to_hdf")
to_period = _unsupported_function("to_period")
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 9a2513219e3..aadf8ce1fbe 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -6610,6 +6610,80 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
)
return DataFrame(internal)
+ # TODO(SPARK-40553): 1, support array-like 'value'; 2, add parameter 'sorter'
+ def searchsorted(self, value: Any, side: str = "left") -> int:
+ """
+ Find indices where elements should be inserted to maintain order.
+
+ Find the indices into a sorted Series self such that, if the corresponding elements
+ in value were inserted before the indices, the order of self would be preserved.
+
+ .. versionadded:: 3.4.0
+
+ Parameters
+ ----------
+ value : scalar
+ Values to insert into self.
+ side : {‘left’, ‘right’}, optional
+ If ‘left’, the index of the first suitable location found is given.
+ If ‘right’, return the last such index. If there is no suitable index,
+ return either 0 or N (where N is the length of self).
+
+ Returns
+ -------
+ int
+ insertion point
+
+ Notes
+ -----
+ The Series must be monotonically sorted, otherwise wrong locations will likely be returned.
+
+ Examples
+ --------
+ >>> ser = ps.Series([1, 2, 2, 3])
+ >>> ser.searchsorted(0)
+ 0
+ >>> ser.searchsorted(1)
+ 0
+ >>> ser.searchsorted(2)
+ 1
+ >>> ser.searchsorted(5)
+ 4
+ >>> ser.searchsorted(0, side="right")
+ 0
+ >>> ser.searchsorted(1, side="right")
+ 1
+ >>> ser.searchsorted(2, side="right")
+ 3
+ >>> ser.searchsorted(5, side="right")
+ 4
+ """
+ if side not in ["left", "right"]:
+ raise ValueError(f"Invalid side {side}")
+
+ sdf = self._internal.spark_frame
+ index_col_name = verify_temp_column_name(sdf,
"__search_sorted_index_col__")
+ value_col_name = verify_temp_column_name(sdf,
"__search_sorted_value_col__")
+ sdf = InternalFrame.attach_distributed_sequence_column(
+ sdf.select(self.spark.column.alias(value_col_name)), index_col_name
+ )
+
+ if side == "left":
+ results = sdf.select(
+ F.min(F.when(F.lit(value) <= F.col(value_col_name),
F.col(index_col_name))),
+ F.count(F.lit(0)),
+ ).take(1)
+ else:
+ results = sdf.select(
+ F.min(F.when(F.lit(value) < F.col(value_col_name),
F.col(index_col_name))),
+ F.count(F.lit(0)),
+ ).take(1)
+
+ if len(results) == 0:
+ return 0
+ else:
+ return results[0][1] if results[0][0] is None else results[0][0]
+
def align(
self,
other: Union[DataFrame, "Series"],
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index e36f4d9aa23..db56e7e12da 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -3074,6 +3074,33 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
psser.backfill(inplace=True)
self.assert_eq(expected, psser)
+ def test_searchsorted(self):
+ pser1 = pd.Series([1, 2, 2, 3])
+
+ index2 = pd.date_range("2018-04-09", periods=4, freq="2D")
+ pser2 = pd.Series([1, 2, 3, 4], index=index2)
+
+ index3 = pd.MultiIndex.from_tuples(
+ [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
+ )
+ pser3 = pd.Series([1.0, 2.0, 3.0], index=index3, name="name")
+
+ pser4 = pd.Series([])
+
+ for pser in [pser1, pser2, pser3, pser4]:
+ psser = ps.from_pandas(pser)
+ for value in [0.5, 1, 2, 3.0, 4, 5]:
+ for side in ["left", "right"]:
+ self.assert_eq(
+ pser.searchsorted(value, side=side),
+ psser.searchsorted(value, side=side),
+ )
+
+ with self.assertRaisesRegex(ValueError, "Invalid side"):
+ ps.from_pandas(pser1).searchsorted(1.1, side=[1, 2])
+ with self.assertRaisesRegex(ValueError, "Invalid side"):
+ ps.from_pandas(pser1).searchsorted(1.1, side="middle")
+
def test_align(self):
pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
psdf = ps.from_pandas(pdf)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]