This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 721a41794e11 [SPARK-50702][PYTHON] Refine the docstring of
regexp_count, regexp_extract and regexp_extract_all
721a41794e11 is described below
commit 721a41794e113c4372598ceb5e8176f73c911764
Author: Evan Wu <[email protected]>
AuthorDate: Thu Jan 2 12:12:23 2025 +0800
[SPARK-50702][PYTHON] Refine the docstring of regexp_count, regexp_extract
and regexp_extract_all
### What changes were proposed in this pull request?
Refine the docstring for `regexp_count`, `regexp_extract` and
`regexp_extract_all`.
### Why are the changes needed?
to improve docs and test coverage
### Does this PR introduce _any_ user-facing change?
doc-only changes
### How was this patch tested?
new doctests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49338 from drexler-sky/docstring.
Authored-by: Evan Wu <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 124 +++++++++++++++++++++++++-------
1 file changed, 98 insertions(+), 26 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 7b14598a0ef4..5d557c7277a3 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -15329,9 +15329,9 @@ def regexp_count(str: "ColumnOrName", regexp: "ColumnOrName") -> Column:
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- regexp : :class:`~pyspark.sql.Column` or str
+ regexp : :class:`~pyspark.sql.Column` or column name
regex pattern to apply.
Returns
@@ -15341,13 +15341,35 @@ def regexp_count(str: "ColumnOrName", regexp: "ColumnOrName") -> Column:
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])
- >>> df.select(regexp_count('str', lit(r'\d+')).alias('d')).collect()
- [Row(d=3)]
- >>> df.select(regexp_count('str', lit(r'mmm')).alias('d')).collect()
- [Row(d=0)]
- >>> df.select(regexp_count("str", col("regexp")).alias('d')).collect()
- [Row(d=3)]
+ >>> df.select('*', sf.regexp_count('str', sf.lit(r'\d+'))).show()
+ +---------+------+----------------------+
+ | str|regexp|regexp_count(str, \d+)|
+ +---------+------+----------------------+
+ |1a 2b 14m| \d+| 3|
+ +---------+------+----------------------+
+
+ >>> df.select('*', sf.regexp_count('str', sf.lit(r'mmm'))).show()
+ +---------+------+----------------------+
+ | str|regexp|regexp_count(str, mmm)|
+ +---------+------+----------------------+
+ |1a 2b 14m| \d+| 0|
+ +---------+------+----------------------+
+
+ >>> df.select('*', sf.regexp_count("str", sf.col("regexp"))).show()
+ +---------+------+-------------------------+
+ | str|regexp|regexp_count(str, regexp)|
+ +---------+------+-------------------------+
+ |1a 2b 14m| \d+| 3|
+ +---------+------+-------------------------+
+
+ >>> df.select('*', sf.regexp_count(sf.col('str'), "regexp")).show()
+ +---------+------+-------------------------+
+ | str|regexp|regexp_count(str, regexp)|
+ +---------+------+-------------------------+
+ |1a 2b 14m| \d+| 3|
+ +---------+------+-------------------------+
"""
return _invoke_function_over_columns("regexp_count", str, regexp)
@@ -15364,7 +15386,7 @@ def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column:
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
pattern : str
regex pattern to apply.
@@ -15376,17 +15398,36 @@ def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column:
:class:`~pyspark.sql.Column`
matched value specified by `idx` group id.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.regexp_extract_all`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([('100-200',)], ['str'])
- >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()
- [Row(d='100')]
+ >>> df.select('*', sf.regexp_extract('str', r'(\d+)-(\d+)', 1)).show()
+ +-------+-----------------------------------+
+ | str|regexp_extract(str, (\d+)-(\d+), 1)|
+ +-------+-----------------------------------+
+ |100-200| 100|
+ +-------+-----------------------------------+
+
>>> df = spark.createDataFrame([('foo',)], ['str'])
- >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()
- [Row(d='')]
+ >>> df.select('*', sf.regexp_extract('str', r'(\d+)', 1)).show()
+ +---+-----------------------------+
+ |str|regexp_extract(str, (\d+), 1)|
+ +---+-----------------------------+
+ |foo| |
+ +---+-----------------------------+
+
>>> df = spark.createDataFrame([('aaaac',)], ['str'])
- >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()
- [Row(d='')]
+ >>> df.select('*', sf.regexp_extract(sf.col('str'), '(a+)(b)?(c)', 2)).show()
+ +-----+-----------------------------------+
+ | str|regexp_extract(str, (a+)(b)?(c), 2)|
+ +-----+-----------------------------------+
+ |aaaac| |
+ +-----+-----------------------------------+
"""
from pyspark.sql.classic.column import _to_java_column
@@ -15406,11 +15447,11 @@ def regexp_extract_all(
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- regexp : :class:`~pyspark.sql.Column` or str
+ regexp : :class:`~pyspark.sql.Column` or column name
regex pattern to apply.
- idx : int, optional
+ idx : :class:`~pyspark.sql.Column` or int, optional
matched group id.
Returns
@@ -15418,17 +15459,48 @@ def regexp_extract_all(
:class:`~pyspark.sql.Column`
all strings in the `str` that match a Java regex and corresponding to
the regex group index.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.regexp_extract`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("100-200, 300-400", r"(\d+)-(\d+)")], ["str", "regexp"])
- >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)')).alias('d')).collect()
- [Row(d=['100', '300'])]
- >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 1).alias('d')).collect()
- [Row(d=['100', '300'])]
- >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 2).alias('d')).collect()
- [Row(d=['200', '400'])]
- >>> df.select(regexp_extract_all('str', col("regexp")).alias('d')).collect()
- [Row(d=['100', '300'])]
+ >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'))).show()
+ +----------------+-----------+---------------------------------------+
+ | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 1)|
+ +----------------+-----------+---------------------------------------+
+ |100-200, 300-400|(\d+)-(\d+)| [100, 300]|
+ +----------------+-----------+---------------------------------------+
+
+ >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'), sf.lit(1))).show()
+ +----------------+-----------+---------------------------------------+
+ | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 1)|
+ +----------------+-----------+---------------------------------------+
+ |100-200, 300-400|(\d+)-(\d+)| [100, 300]|
+ +----------------+-----------+---------------------------------------+
+
+ >>> df.select('*', sf.regexp_extract_all('str', sf.lit(r'(\d+)-(\d+)'), 2)).show()
+ +----------------+-----------+---------------------------------------+
+ | str| regexp|regexp_extract_all(str, (\d+)-(\d+), 2)|
+ +----------------+-----------+---------------------------------------+
+ |100-200, 300-400|(\d+)-(\d+)| [200, 400]|
+ +----------------+-----------+---------------------------------------+
+
+ >>> df.select('*', sf.regexp_extract_all('str', sf.col("regexp"))).show()
+ +----------------+-----------+----------------------------------+
+ | str| regexp|regexp_extract_all(str, regexp, 1)|
+ +----------------+-----------+----------------------------------+
+ |100-200, 300-400|(\d+)-(\d+)| [100, 300]|
+ +----------------+-----------+----------------------------------+
+
+ >>> df.select('*', sf.regexp_extract_all(sf.col('str'), "regexp")).show()
+ +----------------+-----------+----------------------------------+
+ | str| regexp|regexp_extract_all(str, regexp, 1)|
+ +----------------+-----------+----------------------------------+
+ |100-200, 300-400|(\d+)-(\d+)| [100, 300]|
+ +----------------+-----------+----------------------------------+
"""
if idx is None:
return _invoke_function_over_columns("regexp_extract_all", str, regexp)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]