This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 066a30ecacbc [SPARK-50231][PYTHON] Make function `instr` accept Column
`substring`
066a30ecacbc is described below
commit 066a30ecacbc6221331b04056d31b1e4721d1c59
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Nov 5 12:45:10 2024 -0800
[SPARK-50231][PYTHON] Make function `instr` accept Column `substring`
### What changes were proposed in this pull request?
Make function `instr` accept Column `substring`
### Why are the changes needed?
In Spark Connect, this function already accepts a Column `substring`.
### Does this PR introduce _any_ user-facing change?
Yes, new feature.
### How was this patch tested?
Added doctests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #48761 from zhengruifeng/py_instr.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/connect/functions/builtin.py | 2 +-
python/pyspark/sql/functions/builtin.py | 39 +++++++++++++++++-----
.../scala/org/apache/spark/sql/functions.scala | 15 ++++++++-
3 files changed, 45 insertions(+), 11 deletions(-)
diff --git a/python/pyspark/sql/connect/functions/builtin.py
b/python/pyspark/sql/connect/functions/builtin.py
index 6f3ce942eb17..0c1fd63de5c9 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -2482,7 +2482,7 @@ def format_string(format: str, *cols: "ColumnOrName") ->
Column:
format_string.__doc__ = pysparkfuncs.format_string.__doc__
-def instr(str: "ColumnOrName", substr: str) -> Column:
+def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column:
return _invoke_function("instr", _to_col(str), lit(substr))
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 1e5349fb1649..1ee5c357bd6e 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -12821,7 +12821,7 @@ def format_string(format: str, *cols: "ColumnOrName")
-> Column:
@_try_remote_functions
-def instr(str: "ColumnOrName", substr: str) -> Column:
+def instr(str: "ColumnOrName", substr: Union[Column, str]) -> Column:
"""
Locate the position of the first occurrence of substr column in the given
string.
Returns null if either of the arguments are null.
@@ -12838,11 +12838,14 @@ def instr(str: "ColumnOrName", substr: str) -> Column:
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- substr : str
+ substr : :class:`~pyspark.sql.Column` or literal string
substring to look for.
+ .. versionchanged:: 4.0.0
+ `substr` now accepts column.
+
Returns
-------
:class:`~pyspark.sql.Column`
@@ -12850,13 +12853,31 @@ def instr(str: "ColumnOrName", substr: str) -> Column:
Examples
--------
- >>> df = spark.createDataFrame([('abcd',)], ['s',])
- >>> df.select(instr(df.s, 'b').alias('s')).collect()
- [Row(s=2)]
- """
- from pyspark.sql.classic.column import _to_java_column
+ Example 1: Using a literal string as the 'substring'
- return _invoke_function("instr", _to_java_column(str),
_enum_to_value(substr))
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",])
+ >>> df.select("*", sf.instr(df.s, "b")).show()
+ +----+-----------+
+ | s|instr(s, b)|
+ +----+-----------+
+ |abcd| 2|
+ | xyz| 0|
+ +----+-----------+
+
+ Example 2: Using a Column 'substring'
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([("abcd",), ("xyz",)], ["s",])
+ >>> df.select("*", sf.instr("s", sf.lit("abc").substr(0, 2))).show()
+ +----+---------------------------+
+ | s|instr(s, substr(abc, 0, 2))|
+ +----+---------------------------+
+ |abcd| 1|
+ | xyz| 0|
+ +----+---------------------------+
+ """
+ return _invoke_function_over_columns("instr", str, lit(substr))
@_try_remote_functions
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
index 8c49952bc31e..35453603de5d 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
@@ -3995,7 +3995,20 @@ object functions {
* @group string_funcs
* @since 1.5.0
*/
- def instr(str: Column, substring: String): Column = Column.fn("instr", str,
lit(substring))
+ def instr(str: Column, substring: String): Column = instr(str,
lit(substring))
+
+ /**
+ * Locate the position of the first occurrence of substr column in the given
string. Returns
+ * null if either of the arguments are null.
+ *
+ * @note
+ * The position is not zero based, but 1 based index. Returns 0 if substr
could not be found
+ * in str.
+ *
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def instr(str: Column, substring: Column): Column = Column.fn("instr", str,
substring)
/**
* Computes the character length of a given string or number of bytes of a
binary string. The
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]