This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b613b228892d [SPARK-50703][PYTHON] Refine the docstring of
regexp_replace, regexp_substr, and regexp_instr
b613b228892d is described below
commit b613b228892d7c450c91112f14d1ace85cbf4c0b
Author: Evan Wu <[email protected]>
AuthorDate: Mon Jan 6 10:16:09 2025 +0800
[SPARK-50703][PYTHON] Refine the docstring of regexp_replace,
regexp_substr, and regexp_instr
### What changes were proposed in this pull request?
Refined docstring for `regexp_replace`, `regexp_substr`, and `regexp_instr`.
### Why are the changes needed?
to improve docs and test coverage
### Does this PR introduce _any_ user-facing change?
doc-only changes
### How was this patch tested?
doctests
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #49348 from drexler-sky/docstring2.
Authored-by: Evan Wu <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 149 ++++++++++++++++++++++++++------
1 file changed, 121 insertions(+), 28 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 5d557c7277a3..0ff9b17d68a2 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -15535,43 +15535,102 @@ def regexp_replace(
Examples
--------
- >>> df = spark.createDataFrame([("100-200", r"(\d+)", "--")], ["str",
"pattern", "replacement"])
- >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()
- [Row(d='-----')]
- >>> df.select(regexp_replace("str", col("pattern"),
col("replacement")).alias('d')).collect()
- [Row(d='-----')]
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame(
+ ... [("100-200", r"(\d+)", "--")],
+ ... ["str", "pattern", "replacement"]
+ ... )
+
+ Example 1: Replaces all the substrings in the `str` column name that
+ match the regex pattern `(\d+)` (one or more digits) with the replacement
+ string "--".
+
+ >>> df.select('*', sf.regexp_replace('str', r'(\d+)', '--')).show()
+ +-------+-------+-----------+---------------------------------+
+ | str|pattern|replacement|regexp_replace(str, (\d+), --, 1)|
+ +-------+-------+-----------+---------------------------------+
+ |100-200| (\d+)| --| -----|
+ +-------+-------+-----------+---------------------------------+
+
+ Example 2: Replaces all the substrings in the `str` Column that match
+ the regex pattern in the `pattern` Column with the string in the
`replacement`
+ column.
+
+ >>> df.select('*', \
+ ... sf.regexp_replace(sf.col("str"), sf.col("pattern"),
sf.col("replacement")) \
+ ... ).show()
+ +-------+-------+-----------+--------------------------------------------+
+ | str|pattern|replacement|regexp_replace(str, pattern, replacement, 1)|
+ +-------+-------+-----------+--------------------------------------------+
+ |100-200| (\d+)| --| -----|
+ +-------+-------+-----------+--------------------------------------------+
"""
return _invoke_function_over_columns("regexp_replace", string,
lit(pattern), lit(replacement))
@_try_remote_functions
def regexp_substr(str: "ColumnOrName", regexp: "ColumnOrName") -> Column:
- r"""Returns the substring that matches the Java regex `regexp` within the
string `str`.
+ r"""Returns the first substring that matches the Java regex `regexp`
within the string `str`.
If the regular expression is not found, the result is null.
.. versionadded:: 3.5.0
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- regexp : :class:`~pyspark.sql.Column` or str
+ regexp : :class:`~pyspark.sql.Column` or column name
regex pattern to apply.
Returns
-------
:class:`~pyspark.sql.Column`
- the substring that matches a Java regex within the string `str`.
+ the first substring that matches a Java regex within the string `str`.
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
- >>> df.select(regexp_substr('str', lit(r'\d+')).alias('d')).collect()
- [Row(d='1')]
- >>> df.select(regexp_substr('str', lit(r'mmm')).alias('d')).collect()
- [Row(d=None)]
- >>> df.select(regexp_substr("str", col("regexp")).alias('d')).collect()
- [Row(d='1')]
+
+ Example 1: Returns the first substring in the `str` column name that
+ matches the regex pattern `(\d+)` (one or more digits).
+
+ >>> df.select('*', sf.regexp_substr('str', sf.lit(r'\d+'))).show()
+ +---------+------+-----------------------+
+ | str|regexp|regexp_substr(str, \d+)|
+ +---------+------+-----------------------+
+ |1a 2b 14m| \d+| 1|
+ +---------+------+-----------------------+
+
+ Example 2: Returns the first substring in the `str` column name that
+    matches the regex pattern `(mmm)` (three consecutive 'm' characters).
+
+ >>> df.select('*', sf.regexp_substr('str', sf.lit(r'mmm'))).show()
+ +---------+------+-----------------------+
+ | str|regexp|regexp_substr(str, mmm)|
+ +---------+------+-----------------------+
+ |1a 2b 14m| \d+| NULL|
+ +---------+------+-----------------------+
+
+ Example 3: Returns the first substring in the `str` column name that
+ matches the regex pattern in `regexp` Column.
+
+ >>> df.select('*', sf.regexp_substr("str", sf.col("regexp"))).show()
+ +---------+------+--------------------------+
+ | str|regexp|regexp_substr(str, regexp)|
+ +---------+------+--------------------------+
+ |1a 2b 14m| \d+| 1|
+ +---------+------+--------------------------+
+
+ Example 4: Returns the first substring in the `str` Column that
+ matches the regex pattern in `regexp` column name.
+
+ >>> df.select('*', sf.regexp_substr(sf.col("str"), "regexp")).show()
+ +---------+------+--------------------------+
+ | str|regexp|regexp_substr(str, regexp)|
+ +---------+------+--------------------------+
+ |1a 2b 14m| \d+| 1|
+ +---------+------+--------------------------+
"""
return _invoke_function_over_columns("regexp_substr", str, regexp)
@@ -15580,36 +15639,70 @@ def regexp_substr(str: "ColumnOrName", regexp:
"ColumnOrName") -> Column:
def regexp_instr(
str: "ColumnOrName", regexp: "ColumnOrName", idx: Optional[Union[int,
Column]] = None
) -> Column:
- r"""Extract all strings in the `str` that match the Java regex `regexp`
+    r"""Returns the position of the first substring in the `str` that matches
the Java regex `regexp`
and corresponding to the regex group index.
.. versionadded:: 3.5.0
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- regexp : :class:`~pyspark.sql.Column` or str
+ regexp : :class:`~pyspark.sql.Column` or column name
regex pattern to apply.
- idx : int, optional
+ idx : :class:`~pyspark.sql.Column` or int, optional
matched group id.
Returns
-------
:class:`~pyspark.sql.Column`
- all strings in the `str` that match a Java regex and corresponding to
the regex group index.
+        the position of the first substring in the `str` that matches a Java
regex and corresponding
+ to the regex group index.
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("1a 2b 14m", r"\d+(a|b|m)")], ["str",
"regexp"])
- >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)')).alias('d')).collect()
- [Row(d=1)]
- >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'),
1).alias('d')).collect()
- [Row(d=1)]
- >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'),
2).alias('d')).collect()
- [Row(d=1)]
- >>> df.select(regexp_instr('str', col("regexp")).alias('d')).collect()
- [Row(d=1)]
+
+ Example 1: Returns the position of the first substring in the `str` column
name that
+    matches the regex pattern `(\d+(a|b|m))` (one or more digits followed by
'a', 'b', or 'm').
+
+ >>> df.select('*', sf.regexp_instr('str', sf.lit(r'\d+(a|b|m)'))).show()
+ +---------+----------+--------------------------------+
+ | str| regexp|regexp_instr(str, \d+(a|b|m), 0)|
+ +---------+----------+--------------------------------+
+ |1a 2b 14m|\d+(a|b|m)| 1|
+ +---------+----------+--------------------------------+
+
+ Example 2: Returns the position of the first substring in the `str` column
name that
+    matches the regex pattern `(\d+(a|b|m))` (one or more digits followed by
'a', 'b', or 'm'), with the matched group index set to 1.
+
+ >>> df.select('*', sf.regexp_instr('str', sf.lit(r'\d+(a|b|m)'),
sf.lit(1))).show()
+ +---------+----------+--------------------------------+
+ | str| regexp|regexp_instr(str, \d+(a|b|m), 1)|
+ +---------+----------+--------------------------------+
+ |1a 2b 14m|\d+(a|b|m)| 1|
+ +---------+----------+--------------------------------+
+
+ Example 3: Returns the position of the first substring in the `str` column
name that
+    matches the regex pattern in the `regexp` Column.
+
+ >>> df.select('*', sf.regexp_instr('str', sf.col("regexp"))).show()
+ +---------+----------+----------------------------+
+ | str| regexp|regexp_instr(str, regexp, 0)|
+ +---------+----------+----------------------------+
+ |1a 2b 14m|\d+(a|b|m)| 1|
+ +---------+----------+----------------------------+
+
+ Example 4: Returns the position of the first substring in the `str` Column
that
+    matches the regex pattern in the `regexp` column name.
+
+ >>> df.select('*', sf.regexp_instr(sf.col("str"), "regexp")).show()
+ +---------+----------+----------------------------+
+ | str| regexp|regexp_instr(str, regexp, 0)|
+ +---------+----------+----------------------------+
+ |1a 2b 14m|\d+(a|b|m)| 1|
+ +---------+----------+----------------------------+
"""
if idx is None:
return _invoke_function_over_columns("regexp_instr", str, regexp)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]