[spark] branch master updated: [SPARK-40330][PS] Implement `Series.searchsorted`

gurwls223 Sun, 25 Sep 2022 20:55:50 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new b4014eb13a1 [SPARK-40330][PS] Implement `Series.searchsorted`
b4014eb13a1 is described below

commit b4014eb13a1749bba5184acb92690817ae9c2115
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Sep 26 12:55:29 2022 +0900

    [SPARK-40330][PS] Implement `Series.searchsorted`
    
    ### What changes were proposed in this pull request?
    Implement `Series.searchsorted`
    
    ### Why are the changes needed?
    for API coverage
    
    ### Does this PR introduce _any_ user-facing change?
    yes, new API
    ```
            >>> ser = ps.Series([1, 2, 2, 3])
            >>> ser.searchsorted(0)
            0
            >>> ser.searchsorted(1)
            0
            >>> ser.searchsorted(2)
            1
            >>> ser.searchsorted(5)
            4
            >>> ser.searchsorted(0, side="right")
            0
            >>> ser.searchsorted(1, side="right")
            1
            >>> ser.searchsorted(2, side="right")
            3
            >>> ser.searchsorted(5, side="right")
            4
    ```
    
    ### How was this patch tested?
    added testsuites
    
    Closes #37978 from zhengruifeng/ps_ser_searchordered.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 .../source/reference/pyspark.pandas/series.rst     |  1 +
 python/pyspark/pandas/missing/series.py            |  1 -
 python/pyspark/pandas/series.py                    | 74 ++++++++++++++++++++++
 python/pyspark/pandas/tests/test_series.py         | 27 ++++++++
 4 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst
index 5ed6df6b2a1..a0119593f96 100644
--- a/python/docs/source/reference/pyspark.pandas/series.rst
+++ b/python/docs/source/reference/pyspark.pandas/series.rst
@@ -201,6 +201,7 @@ Reindexing / Selection / Label manipulation
    Series.reindex_like
    Series.reset_index
    Series.sample
+   Series.searchsorted
    Series.swaplevel
    Series.swapaxes
    Series.take
diff --git a/python/pyspark/pandas/missing/series.py b/python/pyspark/pandas/missing/series.py
index 2b10dc34e58..74a544f0b94 100644
--- a/python/pyspark/pandas/missing/series.py
+++ b/python/pyspark/pandas/missing/series.py
@@ -39,7 +39,6 @@ class MissingPandasLikeSeries:
     convert_dtypes = _unsupported_function("convert_dtypes")
     infer_objects = _unsupported_function("infer_objects")
     reorder_levels = _unsupported_function("reorder_levels")
-    searchsorted = _unsupported_function("searchsorted")
     set_axis = _unsupported_function("set_axis")
     to_hdf = _unsupported_function("to_hdf")
     to_period = _unsupported_function("to_period")
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index 9a2513219e3..aadf8ce1fbe 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -6610,6 +6610,80 @@ class Series(Frame, IndexOpsMixin, Generic[T]):
         )
         return DataFrame(internal)
 
+    # TODO(SPARK-40553): 1, support array-like 'value'; 2, add parameter 'sorter'
+    def searchsorted(self, value: Any, side: str = "left") -> int:
+        """
+        Find indices where elements should be inserted to maintain order.
+
+        Find the indices into a sorted Series self such that, if the corresponding elements
+        in value were inserted before the indices, the order of self would be preserved.
+
+        .. versionadded:: 3.4.0
+
+        Parameters
+        ----------
+        value : scalar
+            Values to insert into self.
+        side : {‘left’, ‘right’}, optional
+            If ‘left’, the index of the first suitable location found is given.
+            If ‘right’, return the last such index. If there is no suitable index,
+            return either 0 or N (where N is the length of self).
+
+        Returns
+        -------
+        int
+            insertion point
+
+        Notes
+        -----
+        The Series must be monotonically sorted, otherwise wrong locations will likely be returned.
+
+        Examples
+        --------
+        >>> ser = ps.Series([1, 2, 2, 3])
+        >>> ser.searchsorted(0)
+        0
+        >>> ser.searchsorted(1)
+        0
+        >>> ser.searchsorted(2)
+        1
+        >>> ser.searchsorted(5)
+        4
+        >>> ser.searchsorted(0, side="right")
+        0
+        >>> ser.searchsorted(1, side="right")
+        1
+        >>> ser.searchsorted(2, side="right")
+        3
+        >>> ser.searchsorted(5, side="right")
+        4
+        """
+        if side not in ["left", "right"]:
+            raise ValueError(f"Invalid side {side}")
+
+        sdf = self._internal.spark_frame
+        index_col_name = verify_temp_column_name(sdf, "__search_sorted_index_col__")
+        value_col_name = verify_temp_column_name(sdf, "__search_sorted_value_col__")
+        sdf = InternalFrame.attach_distributed_sequence_column(
+            sdf.select(self.spark.column.alias(value_col_name)), index_col_name
+        )
+
+        if side == "left":
+            results = sdf.select(
+                F.min(F.when(F.lit(value) <= F.col(value_col_name), F.col(index_col_name))),
+                F.count(F.lit(0)),
+            ).take(1)
+        else:
+            results = sdf.select(
+                F.min(F.when(F.lit(value) < F.col(value_col_name), F.col(index_col_name))),
+                F.count(F.lit(0)),
+            ).take(1)
+
+        if len(results) == 0:
+            return 0
+        else:
+            return results[0][1] if results[0][0] is None else results[0][0]
+
     def align(
         self,
         other: Union[DataFrame, "Series"],
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index e36f4d9aa23..db56e7e12da 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -3074,6 +3074,33 @@ class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
             psser.backfill(inplace=True)
             self.assert_eq(expected, psser)
 
+    def test_searchsorted(self):
+        pser1 = pd.Series([1, 2, 2, 3])
+
+        index2 = pd.date_range("2018-04-09", periods=4, freq="2D")
+        pser2 = pd.Series([1, 2, 3, 4], index=index2)
+
+        index3 = pd.MultiIndex.from_tuples(
+            [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
+        )
+        pser3 = pd.Series([1.0, 2.0, 3.0], index=index3, name="name")
+
+        pser4 = pd.Series([])
+
+        for pser in [pser1, pser2, pser3, pser4]:
+            psser = ps.from_pandas(pser)
+            for value in [0.5, 1, 2, 3.0, 4, 5]:
+                for side in ["left", "right"]:
+                    self.assert_eq(
+                        pser.searchsorted(value, side=side),
+                        psser.searchsorted(value, side=side),
+                    )
+
+        with self.assertRaisesRegex(ValueError, "Invalid side"):
+            ps.from_pandas(pser1).searchsorted(1.1, side=[1, 2])
+        with self.assertRaisesRegex(ValueError, "Invalid side"):
+            ps.from_pandas(pser1).searchsorted(1.1, side="middle")
+
     def test_align(self):
         pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
         psdf = ps.from_pandas(pdf)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[spark] branch master updated: [SPARK-40330][PS] Implement `Series.searchsorted`

Reply via email to