This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0e45e2179962 [SPARK-50486][PYTHON][DOCS] Refine the docstring for
string functions - part 2
0e45e2179962 is described below
commit 0e45e217996248a093cfcb892407bb523f885c08
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Dec 4 13:13:12 2024 +0900
[SPARK-50486][PYTHON][DOCS] Refine the docstring for string functions -
part 2
### What changes were proposed in this pull request?
Refine the docstring for string functions
### Why are the changes needed?
to improve the docs and test coverage
### Does this PR introduce _any_ user-facing change?
doc-only changes
### How was this patch tested?
new doctests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49049 from zhengruifeng/py_doc_13.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 491 ++++++++++++++++++++++----------
1 file changed, 340 insertions(+), 151 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 13b2e12bca33..391bc3db7a86 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -7343,6 +7343,12 @@ def rand(seed: Optional[int] = None) -> Column:
:class:`~pyspark.sql.Column`
A column of random values.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.randn`
+ :meth:`pyspark.sql.functions.randstr`
+ :meth:`pyspark.sql.functions.uniform`
+
Examples
--------
Example 1: Generate a random column without a seed
@@ -7396,6 +7402,12 @@ def randn(seed: Optional[int] = None) -> Column:
:class:`~pyspark.sql.Column`
A column of random values.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.rand`
+ :meth:`pyspark.sql.functions.randstr`
+ :meth:`pyspark.sql.functions.uniform`
+
Examples
--------
Example 1: Generate a random column without a seed
@@ -14015,6 +14027,13 @@ def instr(str: "ColumnOrName", substr: Union[Column,
str]) -> Column:
:class:`~pyspark.sql.Column`
location of the first occurrence of the substring as integer.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.locate`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.functions.substring_index`
+
Examples
--------
Example 1: Using a literal string as the 'substring'
@@ -14166,6 +14185,11 @@ def sentences(
:class:`~pyspark.sql.Column`
arrays of split sentences.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.split`
+ :meth:`pyspark.sql.functions.split_part`
+
Examples
--------
>>> from pyspark.sql import functions as sf
@@ -14221,15 +14245,15 @@ def substring(
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- pos : :class:`~pyspark.sql.Column` or str or int
+ pos : :class:`~pyspark.sql.Column` or column name or int
starting position in str.
.. versionchanged:: 4.0.0
`pos` now accepts column and column name.
- len : :class:`~pyspark.sql.Column` or str or int
+ len : :class:`~pyspark.sql.Column` or column name or int
length of chars.
.. versionchanged:: 4.0.0
@@ -14240,6 +14264,14 @@ def substring(
:class:`~pyspark.sql.Column`
substring of given value.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.locate`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring_index`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
Example 1: Using literal integers as arguments
@@ -14318,9 +14350,9 @@ def substring_index(str: "ColumnOrName", delim: str,
count: int) -> Column:
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- delim : str
+ delim : literal string
delimiter of values.
count : int
number of occurrences.
@@ -14330,13 +14362,31 @@ def substring_index(str: "ColumnOrName", delim: str,
count: int) -> Column:
:class:`~pyspark.sql.Column`
substring of given value.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.locate`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([('a.b.c.d',)], ['s'])
- >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()
- [Row(s='a.b')]
- >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
- [Row(s='b.c.d')]
+ >>> df.select('*', sf.substring_index(df.s, '.', 2)).show()
+ +-------+------------------------+
+ | s|substring_index(s, ., 2)|
+ +-------+------------------------+
+ |a.b.c.d| a.b|
+ +-------+------------------------+
+
+ >>> df.select('*', sf.substring_index('s', '.', -3)).show()
+ +-------+-------------------------+
+ | s|substring_index(s, ., -3)|
+ +-------+-------------------------+
+ |a.b.c.d| b.c.d|
+ +-------+-------------------------+
"""
from pyspark.sql.classic.column import _to_java_column
@@ -14358,16 +14408,15 @@ def levenshtein(
Parameters
----------
- left : :class:`~pyspark.sql.Column` or str
+ left : :class:`~pyspark.sql.Column` or column name
first column value.
- right : :class:`~pyspark.sql.Column` or str
+ right : :class:`~pyspark.sql.Column` or column name
second column value.
threshold : int, optional
if set when the levenshtein distance of the two given strings
less than or equal to a given threshold then return result distance,
or -1
- .. versionchanged: 3.5.0
- Added ``threshold`` argument.
+ .. versionadded:: 3.5.0
Returns
-------
@@ -14376,11 +14425,21 @@ def levenshtein(
Examples
--------
- >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
- >>> df0.select(levenshtein('l', 'r').alias('d')).collect()
- [Row(d=3)]
- >>> df0.select(levenshtein('l', 'r', 2).alias('d')).collect()
- [Row(d=-1)]
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
+ >>> df.select('*', sf.levenshtein('l', 'r')).show()
+ +------+-------+-----------------+
+ | l| r|levenshtein(l, r)|
+ +------+-------+-----------------+
+ |kitten|sitting| 3|
+ +------+-------+-----------------+
+
+ >>> df.select('*', sf.levenshtein(df.l, df.r, 2)).show()
+ +------+-------+--------------------+
+ | l| r|levenshtein(l, r, 2)|
+ +------+-------+--------------------+
+ |kitten|sitting| -1|
+ +------+-------+--------------------+
"""
from pyspark.sql.classic.column import _to_java_column
@@ -14404,9 +14463,9 @@ def locate(substr: str, str: "ColumnOrName", pos: int =
1) -> Column:
Parameters
----------
- substr : str
+ substr : literal string
a string
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
a Column of :class:`pyspark.sql.types.StringType`
pos : int, optional
start position (zero based)
@@ -14421,11 +14480,31 @@ def locate(substr: str, str: "ColumnOrName", pos: int
= 1) -> Column:
The position is not zero based, but 1 based index. Returns 0 if substr
could not be found in str.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.functions.substring_index`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([('abcd',)], ['s',])
- >>> df.select(locate('b', df.s, 1).alias('s')).collect()
- [Row(s=2)]
+ >>> df.select('*', sf.locate('b', 's', 1)).show()
+ +----+---------------+
+ | s|locate(b, s, 1)|
+ +----+---------------+
+ |abcd| 2|
+ +----+---------------+
+
+ >>> df.select('*', sf.locate('b', df.s, 3)).show()
+ +----+---------------+
+ | s|locate(b, s, 3)|
+ +----+---------------+
+ |abcd| 0|
+ +----+---------------+
"""
from pyspark.sql.classic.column import _to_java_column
@@ -14469,6 +14548,10 @@ def lpad(
:class:`~pyspark.sql.Column`
left padded result.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.rpad`
+
Examples
--------
Example 1: Pad with a literal string
@@ -14535,6 +14618,10 @@ def rpad(
:class:`~pyspark.sql.Column`
right padded result.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.lpad`
+
Examples
--------
Example 1: Pad with a literal string
@@ -14578,9 +14665,9 @@ def repeat(col: "ColumnOrName", n:
Union["ColumnOrName", int]) -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
- n : :class:`~pyspark.sql.Column` or str or int
+ n : :class:`~pyspark.sql.Column` or column name or int
number of times to repeat value.
.. versionchanged:: 4.0.0
@@ -14593,35 +14680,35 @@ def repeat(col: "ColumnOrName", n:
Union["ColumnOrName", int]) -> Column:
Examples
--------
- >>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [('ab',)], ['s',]
- ... ).select(sf.repeat("s", 3)).show()
- +------------+
- |repeat(s, 3)|
- +------------+
- | ababab|
- +------------+
+ Example 1: Repeat with a constant number of times
>>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [('ab',)], ['s',]
- ... ).select(sf.repeat("s", sf.lit(4))).show()
- +------------+
- |repeat(s, 4)|
- +------------+
- | abababab|
- +------------+
+ >>> df = spark.createDataFrame([('ab',)], ['s',])
+ >>> df.select("*", sf.repeat("s", 3)).show()
+ +---+------------+
+ | s|repeat(s, 3)|
+ +---+------------+
+ | ab| ababab|
+ +---+------------+
+
+ >>> df.select("*", sf.repeat(df.s, sf.lit(4))).show()
+ +---+------------+
+ | s|repeat(s, 4)|
+ +---+------------+
+ | ab| abababab|
+ +---+------------+
+
+ Example 2: Repeat with a column containing different number of times
>>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [('ab', 5,)], ['s', 't']
- ... ).select(sf.repeat("s", 't')).show()
- +------------+
- |repeat(s, t)|
- +------------+
- | ababababab|
- +------------+
+ >>> df = spark.createDataFrame([('ab', 5,), ('abc', 6,)], ['s', 't'])
+ >>> df.select("*", sf.repeat("s", "t")).show()
+ +---+---+------------------+
+ | s| t| repeat(s, t)|
+ +---+---+------------------+
+ | ab| 5| ababababab|
+ |abc| 6|abcabcabcabcabcabc|
+ +---+---+------------------+
"""
n = _enum_to_value(n)
n = lit(n) if isinstance(n, int) else n
@@ -14644,12 +14731,18 @@ def split(
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
a string expression to split
- pattern : :class:`~pyspark.sql.Column` or str
+ pattern : :class:`~pyspark.sql.Column` or literal string
a string representing a regular expression. The regex string should be
a Java regular expression.
- limit : :class:`~pyspark.sql.Column` or str or int
+
+ .. versionchanged:: 4.0.0
+ `pattern` now accepts column. Does not accept column name since
string type remains
+ accepted as a regular expression representation, for backwards
compatibility.
+ In addition to int, `limit` now accepts column and column name.
+
+ limit : :class:`~pyspark.sql.Column` or column name or int
an integer which controls the number of times `pattern` is applied.
* ``limit > 0``: The resulting array's length will not be more than
`limit`, and the
@@ -14661,61 +14754,66 @@ def split(
.. versionchanged:: 3.0
`split` now takes an optional `limit` field. If not provided,
default limit value is -1.
- .. versionchanged:: 4.0.0
- `pattern` now accepts column. Does not accept column name since
string type remain
- accepted as a regular expression representation, for backwards
compatibility.
- In addition to int, `limit` now accepts column and column name.
-
Returns
-------
:class:`~pyspark.sql.Column`
array of separated strings.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.sentences`
+ :meth:`pyspark.sql.functions.split_part`
+
Examples
--------
- >>> import pyspark.sql.functions as sf
- >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
- >>> df.select(sf.split(df.s, '[ABC]', 2).alias('s')).show()
- +-----------------+
- | s|
- +-----------------+
- |[one, twoBthreeC]|
- +-----------------+
+ Example 1: Split with a constant pattern
>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
- >>> df.select(sf.split(df.s, '[ABC]', -1).alias('s')).show()
- +-------------------+
- | s|
- +-------------------+
- |[one, two, three, ]|
- +-------------------+
-
- >>> import pyspark.sql.functions as sf
- >>> df = spark.createDataFrame(
- ... [('oneAtwoBthreeC', '[ABC]'), ('1A2B3C', '[1-9]+'), ('aa2bb3cc4',
'[1-9]+')],
- ... ['s', 'pattern']
- ... )
- >>> df.select(sf.split(df.s, df.pattern).alias('s')).show()
- +-------------------+
- | s|
- +-------------------+
- |[one, two, three, ]|
- | [, A, B, C]|
- | [aa, bb, cc, ]|
- +-------------------+
+ >>> df.select('*', sf.split(df.s, '[ABC]')).show()
+ +--------------+-------------------+
+ | s|split(s, [ABC], -1)|
+ +--------------+-------------------+
+ |oneAtwoBthreeC|[one, two, three, ]|
+ +--------------+-------------------+
+
+ >>> df.select('*', sf.split(df.s, '[ABC]', 2)).show()
+ +--------------+------------------+
+ | s|split(s, [ABC], 2)|
+ +--------------+------------------+
+ |oneAtwoBthreeC| [one, twoBthreeC]|
+ +--------------+------------------+
+
+ >>> df.select('*', sf.split('s', '[ABC]', -2)).show()
+ +--------------+-------------------+
+ | s|split(s, [ABC], -2)|
+ +--------------+-------------------+
+ |oneAtwoBthreeC|[one, two, three, ]|
+ +--------------+-------------------+
+
+ Example 2: Split with a column containing different patterns and limits
>>> import pyspark.sql.functions as sf
- >>> df = spark.createDataFrame(
- ... [('oneAtwoBthreeC', '[ABC]', 2), ('1A2B3C', '[1-9]+', -1)],
- ... ['s', 'pattern', 'expected_parts']
- ... )
- >>> df.select(sf.split(df.s, df.pattern,
df.expected_parts).alias('s')).show()
+ >>> df = spark.createDataFrame([
+ ... ('oneAtwoBthreeC', '[ABC]', 2),
+ ... ('1A2B3C', '[1-9]+', 1),
+ ... ('aa2bb3cc4', '[1-9]+', -1)], ['s', 'p', 'l'])
+ >>> df.select('*', sf.split(df.s, df.p)).show()
+ +--------------+------+---+-------------------+
+ | s| p| l| split(s, p, -1)|
+ +--------------+------+---+-------------------+
+ |oneAtwoBthreeC| [ABC]| 2|[one, two, three, ]|
+ | 1A2B3C|[1-9]+| 1| [, A, B, C]|
+ | aa2bb3cc4|[1-9]+| -1| [aa, bb, cc, ]|
+ +--------------+------+---+-------------------+
+
+ >>> df.select(sf.split('s', df.p, 'l')).show()
+-----------------+
- | s|
+ | split(s, p, l)|
+-----------------+
|[one, twoBthreeC]|
- | [, A, B, C]|
+ | [1A2B3C]|
+ | [aa, bb, cc, ]|
+-----------------+
"""
limit = _enum_to_value(limit)
@@ -14880,16 +14978,29 @@ def randstr(length: Union[Column, int], seed:
Optional[Union[Column, int]] = Non
:class:`~pyspark.sql.Column`
The generated random string with the specified length.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.rand`
+ :meth:`pyspark.sql.functions.randn`
+
Examples
--------
- >>> spark.createDataFrame([('3',)], ['a']) \\
- ... .select(randstr(lit(5), lit(0)).alias('result')) \\
- ... .selectExpr("length(result) > 0").show()
- +--------------------+
- |(length(result) > 0)|
- +--------------------+
- | true|
- +--------------------+
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(0, 10, 1, 1).select(sf.randstr(16, 3)).show()
+ +----------------+
+ | randstr(16, 3)|
+ +----------------+
+ |nurJIpH4cmmMnsCG|
+ |fl9YtT5m01trZtIt|
+ |PD19rAgscTHS7qQZ|
+ |2CuAICF5UJOruVv4|
+ |kNZEs8nDpJEoz3Rl|
+ |OXiU0KN5eaXfjXFs|
+ |qfnTM1BZAHtN0gBV|
+ |1p8XiSKwg33KnRPK|
+ |od5y5MucayQq1bKK|
+ |tklYPmKmc5sIppWM|
+ +----------------+
"""
length = _enum_to_value(length)
length = lit(length)
@@ -15137,7 +15248,7 @@ def initcap(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
Returns
@@ -15147,8 +15258,14 @@ def initcap(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('ab cd',)],
['a']).select(initcap("a").alias('v')).collect()
- [Row(v='Ab Cd')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('ab cd',)], ['a'])
+ >>> df.select("*", sf.initcap("a")).show()
+ +-----+----------+
+ | a|initcap(a)|
+ +-----+----------+
+ |ab cd| Ab Cd|
+ +-----+----------+
"""
return _invoke_function_over_columns("initcap", col)
@@ -15165,7 +15282,7 @@ def soundex(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
Returns
@@ -15175,9 +15292,15 @@ def soundex(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
- >>> df.select(soundex(df.name).alias("soundex")).collect()
- [Row(soundex='P362'), Row(soundex='U612')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ["s"])
+ >>> df.select("*", sf.soundex("s")).show()
+ +-------+----------+
+ | s|soundex(s)|
+ +-------+----------+
+ | Peters| P362|
+ |Uhrbach| U612|
+ +-------+----------+
"""
return _invoke_function_over_columns("soundex", col)
@@ -15193,7 +15316,7 @@ def bin(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
Returns
@@ -15203,9 +15326,22 @@ def bin(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.createDataFrame([2,5], "INT")
- >>> df.select(bin(df.value).alias('c')).collect()
- [Row(c='10'), Row(c='101')]
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(10).select("*", sf.bin("id")).show()
+ +---+-------+
+ | id|bin(id)|
+ +---+-------+
+ | 0| 0|
+ | 1| 1|
+ | 2| 10|
+ | 3| 11|
+ | 4| 100|
+ | 5| 101|
+ | 6| 110|
+ | 7| 111|
+ | 8| 1000|
+ | 9| 1001|
+ +---+-------+
"""
return _invoke_function_over_columns("bin", col)
@@ -15223,9 +15359,13 @@ def hex(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.unhex`
+
Returns
-------
:class:`~pyspark.sql.Column`
@@ -15233,8 +15373,14 @@ def hex(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'),
hex('b')).collect()
- [Row(hex(a)='414243', hex(b)='3')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('ABC', 3)], ['a', 'b'])
+ >>> df.select('*', sf.hex('a'), sf.hex(df.b)).show()
+ +---+---+------+------+
+ | a| b|hex(a)|hex(b)|
+ +---+---+------+------+
+ |ABC| 3|414243| 3|
+ +---+---+------+------+
"""
return _invoke_function_over_columns("hex", col)
@@ -15251,9 +15397,13 @@ def unhex(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.hex`
+
Returns
-------
:class:`~pyspark.sql.Column`
@@ -15261,8 +15411,14 @@ def unhex(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('414243',)],
['a']).select(unhex('a')).collect()
- [Row(unhex(a)=bytearray(b'ABC'))]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('414243',)], ['a'])
+ >>> df.select('*', sf.unhex('a')).show()
+ +------+----------+
+ | a| unhex(a)|
+ +------+----------+
+ |414243|[41 42 43]|
+ +------+----------+
"""
return _invoke_function_over_columns("unhex", col)
@@ -15297,14 +15453,22 @@ def uniform(
Examples
--------
- >>> spark.createDataFrame([('3',)], ['a']) \\
- ... .select(uniform(lit(0), lit(10), lit(0)).alias('result')) \\
- ... .selectExpr("result < 15").show()
- +-------------+
- |(result < 15)|
- +-------------+
- | true|
- +-------------+
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(0, 10, 1, 1).select(sf.uniform(5, 105, 3)).show()
+ +------------------+
+ |uniform(5, 105, 3)|
+ +------------------+
+ | 30|
+ | 71|
+ | 99|
+ | 77|
+ | 16|
+ | 25|
+ | 89|
+ | 80|
+ | 51|
+ | 83|
+ +------------------+
"""
min = _enum_to_value(min)
min = lit(min)
@@ -15667,18 +15831,35 @@ def split_part(src: "ColumnOrName", delimiter:
"ColumnOrName", partNum: "ColumnO
Parameters
----------
- src : :class:`~pyspark.sql.Column` or str
+ src : :class:`~pyspark.sql.Column` or column name
A column of string to be split.
- delimiter : :class:`~pyspark.sql.Column` or str
+ delimiter : :class:`~pyspark.sql.Column` or column name
A column of string, the delimiter used for split.
- partNum : :class:`~pyspark.sql.Column` or str
+ partNum : :class:`~pyspark.sql.Column` or column name
A column of string, requested part of the split (1-based).
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.sentences`
+ :meth:`pyspark.sql.functions.split`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"])
- >>> df.select(split_part(df.a, df.b, df.c).alias('r')).collect()
- [Row(r='13')]
+ >>> df.select("*", sf.split_part("a", "b", "c")).show()
+ +--------+---+---+-------------------+
+ | a| b| c|split_part(a, b, c)|
+ +--------+---+---+-------------------+
+ |11.12.13| .| 3| 13|
+ +--------+---+---+-------------------+
+
+ >>> df.select("*", sf.split_part(df.a, df.b, sf.lit(-2))).show()
+ +--------+---+---+--------------------+
+ | a| b| c|split_part(a, b, -2)|
+ +--------+---+---+--------------------+
+ |11.12.13| .| 3| 12|
+ +--------+---+---+--------------------+
"""
return _invoke_function_over_columns("split_part", src, delimiter, partNum)
@@ -15695,34 +15876,42 @@ def substr(
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
A column of string.
- pos : :class:`~pyspark.sql.Column` or str
+ pos : :class:`~pyspark.sql.Column` or column name
A column of string, the substring of `str` that starts at `pos`.
- len : :class:`~pyspark.sql.Column` or str, optional
+ len : :class:`~pyspark.sql.Column` or column name, optional
A column of string, the substring of `str` is of length `len`.
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ substring of given value.
+
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.functions.substring_index`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
- >>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
- ... ).select(sf.substr("a", "b", "c")).show()
- +---------------+
- |substr(a, b, c)|
- +---------------+
- | k|
- +---------------+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([("Spark SQL", 5, 1,)], ["a", "b", "c"])
+ >>> df.select("*", sf.substr("a", "b", "c")).show()
+ +---------+---+---+---------------+
+ | a| b| c|substr(a, b, c)|
+ +---------+---+---+---------------+
+ |Spark SQL| 5| 1| k|
+ +---------+---+---+---------------+
- >>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
- ... ).select(sf.substr("a", "b")).show()
- +------------------------+
- |substr(a, b, 2147483647)|
- +------------------------+
- | k SQL|
- +------------------------+
+ >>> df.select("*", sf.substr(df.a, df.b)).show()
+ +---------+---+---+------------------------+
+ | a| b| c|substr(a, b, 2147483647)|
+ +---------+---+---+------------------------+
+ |Spark SQL| 5| 1| k SQL|
+ +---------+---+---+------------------------+
"""
if len is not None:
return _invoke_function_over_columns("substr", str, pos, len)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]