This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0e45e2179962 [SPARK-50486][PYTHON][DOCS] Refine the docstring for
string functions - part 2
0e45e2179962 is described below
commit 0e45e217996248a093cfcb892407bb523f885c08
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Dec 4 13:13:12 2024 +0900
[SPARK-50486][PYTHON][DOCS] Refine the docstring for string functions -
part 2
### What changes were proposed in this pull request?
Refine the docstring for string functions
### Why are the changes needed?
to improve the docs and test coverage
### Does this PR introduce _any_ user-facing change?
doc-only changes
### How was this patch tested?
new doctests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #49049 from zhengruifeng/py_doc_13.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 491 ++++++++++++++++++++++----------
1 file changed, 340 insertions(+), 151 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 13b2e12bca33..391bc3db7a86 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -7343,6 +7343,12 @@ def rand(seed: Optional[int] = None) -> Column:
:class:`~pyspark.sql.Column`
A column of random values.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.randn`
+ :meth:`pyspark.sql.functions.randstr`
+ :meth:`pyspark.sql.functions.uniform`
+
Examples
--------
Example 1: Generate a random column without a seed
@@ -7396,6 +7402,12 @@ def randn(seed: Optional[int] = None) -> Column:
:class:`~pyspark.sql.Column`
A column of random values.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.rand`
+ :meth:`pyspark.sql.functions.randstr`
+ :meth:`pyspark.sql.functions.uniform`
+
Examples
--------
Example 1: Generate a random column without a seed
@@ -14015,6 +14027,13 @@ def instr(str: "ColumnOrName", substr: Union[Column,
str]) -> Column:
:class:`~pyspark.sql.Column`
location of the first occurrence of the substring as integer.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.locate`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.functions.substring_index`
+
Examples
--------
Example 1: Using a literal string as the 'substring'
@@ -14166,6 +14185,11 @@ def sentences(
:class:`~pyspark.sql.Column`
arrays of split sentences.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.split`
+ :meth:`pyspark.sql.functions.split_part`
+
Examples
--------
>>> from pyspark.sql import functions as sf
@@ -14221,15 +14245,15 @@ def substring(
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- pos : :class:`~pyspark.sql.Column` or str or int
+ pos : :class:`~pyspark.sql.Column` or column name or int
starting position in str.
.. versionchanged:: 4.0.0
`pos` now accepts column and column name.
- len : :class:`~pyspark.sql.Column` or str or int
+ len : :class:`~pyspark.sql.Column` or column name or int
length of chars.
.. versionchanged:: 4.0.0
@@ -14240,6 +14264,14 @@ def substring(
:class:`~pyspark.sql.Column`
substring of given value.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.locate`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring_index`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
Example 1: Using literal integers as arguments
@@ -14318,9 +14350,9 @@ def substring_index(str: "ColumnOrName", delim: str,
count: int) -> Column:
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
target column to work on.
- delim : str
+ delim : literal string
delimiter of values.
count : int
number of occurrences.
@@ -14330,13 +14362,31 @@ def substring_index(str: "ColumnOrName", delim: str,
count: int) -> Column:
:class:`~pyspark.sql.Column`
substring of given value.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.locate`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([('a.b.c.d',)], ['s'])
- >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()
- [Row(s='a.b')]
- >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
- [Row(s='b.c.d')]
+ >>> df.select('*', sf.substring_index(df.s, '.', 2)).show()
+ +-------+------------------------+
+ | s|substring_index(s, ., 2)|
+ +-------+------------------------+
+ |a.b.c.d| a.b|
+ +-------+------------------------+
+
+ >>> df.select('*', sf.substring_index('s', '.', -3)).show()
+ +-------+-------------------------+
+ | s|substring_index(s, ., -3)|
+ +-------+-------------------------+
+ |a.b.c.d| b.c.d|
+ +-------+-------------------------+
"""
from pyspark.sql.classic.column import _to_java_column
@@ -14358,16 +14408,15 @@ def levenshtein(
Parameters
----------
- left : :class:`~pyspark.sql.Column` or str
+ left : :class:`~pyspark.sql.Column` or column name
first column value.
- right : :class:`~pyspark.sql.Column` or str
+ right : :class:`~pyspark.sql.Column` or column name
second column value.
threshold : int, optional
if set when the levenshtein distance of the two given strings
less than or equal to a given threshold then return result distance,
or -1
- .. versionchanged: 3.5.0
- Added ``threshold`` argument.
+ .. versionadded:: 3.5.0
Returns
-------
@@ -14376,11 +14425,21 @@ def levenshtein(
Examples
--------
- >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
- >>> df0.select(levenshtein('l', 'r').alias('d')).collect()
- [Row(d=3)]
- >>> df0.select(levenshtein('l', 'r', 2).alias('d')).collect()
- [Row(d=-1)]
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])
+ >>> df.select('*', sf.levenshtein('l', 'r')).show()
+ +------+-------+-----------------+
+ | l| r|levenshtein(l, r)|
+ +------+-------+-----------------+
+ |kitten|sitting| 3|
+ +------+-------+-----------------+
+
+ >>> df.select('*', sf.levenshtein(df.l, df.r, 2)).show()
+ +------+-------+--------------------+
+ | l| r|levenshtein(l, r, 2)|
+ +------+-------+--------------------+
+ |kitten|sitting| -1|
+ +------+-------+--------------------+
"""
from pyspark.sql.classic.column import _to_java_column
@@ -14404,9 +14463,9 @@ def locate(substr: str, str: "ColumnOrName", pos: int =
1) -> Column:
Parameters
----------
- substr : str
+ substr : literal string
a string
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
a Column of :class:`pyspark.sql.types.StringType`
pos : int, optional
start position (zero based)
@@ -14421,11 +14480,31 @@ def locate(substr: str, str: "ColumnOrName", pos: int
= 1) -> Column:
The position is not zero based, but 1 based index. Returns 0 if substr
could not be found in str.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.substr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.functions.substring_index`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([('abcd',)], ['s',])
- >>> df.select(locate('b', df.s, 1).alias('s')).collect()
- [Row(s=2)]
+ >>> df.select('*', sf.locate('b', 's', 1)).show()
+ +----+---------------+
+ | s|locate(b, s, 1)|
+ +----+---------------+
+ |abcd| 2|
+ +----+---------------+
+
+ >>> df.select('*', sf.locate('b', df.s, 3)).show()
+ +----+---------------+
+ | s|locate(b, s, 3)|
+ +----+---------------+
+ |abcd| 0|
+ +----+---------------+
"""
from pyspark.sql.classic.column import _to_java_column
@@ -14469,6 +14548,10 @@ def lpad(
:class:`~pyspark.sql.Column`
left padded result.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.rpad`
+
Examples
--------
Example 1: Pad with a literal string
@@ -14535,6 +14618,10 @@ def rpad(
:class:`~pyspark.sql.Column`
right padded result.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.lpad`
+
Examples
--------
Example 1: Pad with a literal string
@@ -14578,9 +14665,9 @@ def repeat(col: "ColumnOrName", n:
Union["ColumnOrName", int]) -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
- n : :class:`~pyspark.sql.Column` or str or int
+ n : :class:`~pyspark.sql.Column` or column name or int
number of times to repeat value.
.. versionchanged:: 4.0.0
@@ -14593,35 +14680,35 @@ def repeat(col: "ColumnOrName", n:
Union["ColumnOrName", int]) -> Column:
Examples
--------
- >>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [('ab',)], ['s',]
- ... ).select(sf.repeat("s", 3)).show()
- +------------+
- |repeat(s, 3)|
- +------------+
- | ababab|
- +------------+
+ Example 1: Repeat with a constant number of times
>>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [('ab',)], ['s',]
- ... ).select(sf.repeat("s", sf.lit(4))).show()
- +------------+
- |repeat(s, 4)|
- +------------+
- | abababab|
- +------------+
+ >>> df = spark.createDataFrame([('ab',)], ['s',])
+ >>> df.select("*", sf.repeat("s", 3)).show()
+ +---+------------+
+ | s|repeat(s, 3)|
+ +---+------------+
+ | ab| ababab|
+ +---+------------+
+
+ >>> df.select("*", sf.repeat(df.s, sf.lit(4))).show()
+ +---+------------+
+ | s|repeat(s, 4)|
+ +---+------------+
+ | ab| abababab|
+ +---+------------+
+
+ Example 2: Repeat with a column containing different number of times
>>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [('ab', 5,)], ['s', 't']
- ... ).select(sf.repeat("s", 't')).show()
- +------------+
- |repeat(s, t)|
- +------------+
- | ababababab|
- +------------+
+ >>> df = spark.createDataFrame([('ab', 5,), ('abc', 6,)], ['s', 't'])
+ >>> df.select("*", sf.repeat("s", "t")).show()
+ +---+---+------------------+
+ | s| t| repeat(s, t)|
+ +---+---+------------------+
+ | ab| 5| ababababab|
+ |abc| 6|abcabcabcabcabcabc|
+ +---+---+------------------+
"""
n = _enum_to_value(n)
n = lit(n) if isinstance(n, int) else n
@@ -14644,12 +14731,18 @@ def split(
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
a string expression to split
- pattern : :class:`~pyspark.sql.Column` or str
+ pattern : :class:`~pyspark.sql.Column` or literal string
a string representing a regular expression. The regex string should be
a Java regular expression.
- limit : :class:`~pyspark.sql.Column` or str or int
+
+ .. versionchanged:: 4.0.0
+ `pattern` now accepts column. Does not accept column name since
string type remains
+ accepted as a regular expression representation, for backwards
compatibility.
+ In addition to int, `limit` now accepts column and column name.
+
+ limit : :class:`~pyspark.sql.Column` or column name or int
an integer which controls the number of times `pattern` is applied.
* ``limit > 0``: The resulting array's length will not be more than
`limit`, and the
@@ -14661,61 +14754,66 @@ def split(
.. versionchanged:: 3.0
`split` now takes an optional `limit` field. If not provided,
default limit value is -1.
- .. versionchanged:: 4.0.0
- `pattern` now accepts column. Does not accept column name since
string type remain
- accepted as a regular expression representation, for backwards
compatibility.
- In addition to int, `limit` now accepts column and column name.
-
Returns
-------
:class:`~pyspark.sql.Column`
array of separated strings.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.sentences`
+ :meth:`pyspark.sql.functions.split_part`
+
Examples
--------
- >>> import pyspark.sql.functions as sf
- >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
- >>> df.select(sf.split(df.s, '[ABC]', 2).alias('s')).show()
- +-----------------+
- | s|
- +-----------------+
- |[one, twoBthreeC]|
- +-----------------+
+ Example 1: Split with a constant pattern
>>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])
- >>> df.select(sf.split(df.s, '[ABC]', -1).alias('s')).show()
- +-------------------+
- | s|
- +-------------------+
- |[one, two, three, ]|
- +-------------------+
-
- >>> import pyspark.sql.functions as sf
- >>> df = spark.createDataFrame(
- ... [('oneAtwoBthreeC', '[ABC]'), ('1A2B3C', '[1-9]+'), ('aa2bb3cc4',
'[1-9]+')],
- ... ['s', 'pattern']
- ... )
- >>> df.select(sf.split(df.s, df.pattern).alias('s')).show()
- +-------------------+
- | s|
- +-------------------+
- |[one, two, three, ]|
- | [, A, B, C]|
- | [aa, bb, cc, ]|
- +-------------------+
+ >>> df.select('*', sf.split(df.s, '[ABC]')).show()
+ +--------------+-------------------+
+ | s|split(s, [ABC], -1)|
+ +--------------+-------------------+
+ |oneAtwoBthreeC|[one, two, three, ]|
+ +--------------+-------------------+
+
+ >>> df.select('*', sf.split(df.s, '[ABC]', 2)).show()
+ +--------------+------------------+
+ | s|split(s, [ABC], 2)|
+ +--------------+------------------+
+ |oneAtwoBthreeC| [one, twoBthreeC]|
+ +--------------+------------------+
+
+ >>> df.select('*', sf.split('s', '[ABC]', -2)).show()
+ +--------------+-------------------+
+ | s|split(s, [ABC], -2)|
+ +--------------+-------------------+
+ |oneAtwoBthreeC|[one, two, three, ]|
+ +--------------+-------------------+
+
+ Example 2: Split with a column containing different patterns and limits
>>> import pyspark.sql.functions as sf
- >>> df = spark.createDataFrame(
- ... [('oneAtwoBthreeC', '[ABC]', 2), ('1A2B3C', '[1-9]+', -1)],
- ... ['s', 'pattern', 'expected_parts']
- ... )
- >>> df.select(sf.split(df.s, df.pattern,
df.expected_parts).alias('s')).show()
+ >>> df = spark.createDataFrame([
+ ... ('oneAtwoBthreeC', '[ABC]', 2),
+ ... ('1A2B3C', '[1-9]+', 1),
+ ... ('aa2bb3cc4', '[1-9]+', -1)], ['s', 'p', 'l'])
+ >>> df.select('*', sf.split(df.s, df.p)).show()
+ +--------------+------+---+-------------------+
+ | s| p| l| split(s, p, -1)|
+ +--------------+------+---+-------------------+
+ |oneAtwoBthreeC| [ABC]| 2|[one, two, three, ]|
+ | 1A2B3C|[1-9]+| 1| [, A, B, C]|
+ | aa2bb3cc4|[1-9]+| -1| [aa, bb, cc, ]|
+ +--------------+------+---+-------------------+
+
+ >>> df.select(sf.split('s', df.p, 'l')).show()
+-----------------+
- | s|
+ | split(s, p, l)|
+-----------------+
|[one, twoBthreeC]|
- | [, A, B, C]|
+ | [1A2B3C]|
+ | [aa, bb, cc, ]|
+-----------------+
"""
limit = _enum_to_value(limit)
@@ -14880,16 +14978,29 @@ def randstr(length: Union[Column, int], seed:
Optional[Union[Column, int]] = Non
:class:`~pyspark.sql.Column`
The generated random string with the specified length.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.rand`
+ :meth:`pyspark.sql.functions.randn`
+
Examples
--------
- >>> spark.createDataFrame([('3',)], ['a']) \\
- ... .select(randstr(lit(5), lit(0)).alias('result')) \\
- ... .selectExpr("length(result) > 0").show()
- +--------------------+
- |(length(result) > 0)|
- +--------------------+
- | true|
- +--------------------+
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(0, 10, 1, 1).select(sf.randstr(16, 3)).show()
+ +----------------+
+ | randstr(16, 3)|
+ +----------------+
+ |nurJIpH4cmmMnsCG|
+ |fl9YtT5m01trZtIt|
+ |PD19rAgscTHS7qQZ|
+ |2CuAICF5UJOruVv4|
+ |kNZEs8nDpJEoz3Rl|
+ |OXiU0KN5eaXfjXFs|
+ |qfnTM1BZAHtN0gBV|
+ |1p8XiSKwg33KnRPK|
+ |od5y5MucayQq1bKK|
+ |tklYPmKmc5sIppWM|
+ +----------------+
"""
length = _enum_to_value(length)
length = lit(length)
@@ -15137,7 +15248,7 @@ def initcap(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
Returns
@@ -15147,8 +15258,14 @@ def initcap(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('ab cd',)],
['a']).select(initcap("a").alias('v')).collect()
- [Row(v='Ab Cd')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('ab cd',)], ['a'])
+ >>> df.select("*", sf.initcap("a")).show()
+ +-----+----------+
+ | a|initcap(a)|
+ +-----+----------+
+ |ab cd| Ab Cd|
+ +-----+----------+
"""
return _invoke_function_over_columns("initcap", col)
@@ -15165,7 +15282,7 @@ def soundex(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
Returns
@@ -15175,9 +15292,15 @@ def soundex(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name'])
- >>> df.select(soundex(df.name).alias("soundex")).collect()
- [Row(soundex='P362'), Row(soundex='U612')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ["s"])
+ >>> df.select("*", sf.soundex("s")).show()
+ +-------+----------+
+ | s|soundex(s)|
+ +-------+----------+
+ | Peters| P362|
+ |Uhrbach| U612|
+ +-------+----------+
"""
return _invoke_function_over_columns("soundex", col)
@@ -15193,7 +15316,7 @@ def bin(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
Returns
@@ -15203,9 +15326,22 @@ def bin(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.createDataFrame([2,5], "INT")
- >>> df.select(bin(df.value).alias('c')).collect()
- [Row(c='10'), Row(c='101')]
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(10).select("*", sf.bin("id")).show()
+ +---+-------+
+ | id|bin(id)|
+ +---+-------+
+ | 0| 0|
+ | 1| 1|
+ | 2| 10|
+ | 3| 11|
+ | 4| 100|
+ | 5| 101|
+ | 6| 110|
+ | 7| 111|
+ | 8| 1000|
+ | 9| 1001|
+ +---+-------+
"""
return _invoke_function_over_columns("bin", col)
@@ -15223,9 +15359,13 @@ def hex(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.unhex`
+
Returns
-------
:class:`~pyspark.sql.Column`
@@ -15233,8 +15373,14 @@ def hex(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'),
hex('b')).collect()
- [Row(hex(a)='414243', hex(b)='3')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('ABC', 3)], ['a', 'b'])
+ >>> df.select('*', sf.hex('a'), sf.hex(df.b)).show()
+ +---+---+------+------+
+ | a| b|hex(a)|hex(b)|
+ +---+---+------+------+
+ |ABC| 3|414243| 3|
+ +---+---+------+------+
"""
return _invoke_function_over_columns("hex", col)
@@ -15251,9 +15397,13 @@ def unhex(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to work on.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.hex`
+
Returns
-------
:class:`~pyspark.sql.Column`
@@ -15261,8 +15411,14 @@ def unhex(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('414243',)],
['a']).select(unhex('a')).collect()
- [Row(unhex(a)=bytearray(b'ABC'))]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('414243',)], ['a'])
+ >>> df.select('*', sf.unhex('a')).show()
+ +------+----------+
+ | a| unhex(a)|
+ +------+----------+
+ |414243|[41 42 43]|
+ +------+----------+
"""
return _invoke_function_over_columns("unhex", col)
@@ -15297,14 +15453,22 @@ def uniform(
Examples
--------
- >>> spark.createDataFrame([('3',)], ['a']) \\
- ... .select(uniform(lit(0), lit(10), lit(0)).alias('result')) \\
- ... .selectExpr("result < 15").show()
- +-------------+
- |(result < 15)|
- +-------------+
- | true|
- +-------------+
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(0, 10, 1, 1).select(sf.uniform(5, 105, 3)).show()
+ +------------------+
+ |uniform(5, 105, 3)|
+ +------------------+
+ | 30|
+ | 71|
+ | 99|
+ | 77|
+ | 16|
+ | 25|
+ | 89|
+ | 80|
+ | 51|
+ | 83|
+ +------------------+
"""
min = _enum_to_value(min)
min = lit(min)
@@ -15667,18 +15831,35 @@ def split_part(src: "ColumnOrName", delimiter:
"ColumnOrName", partNum: "ColumnO
Parameters
----------
- src : :class:`~pyspark.sql.Column` or str
+ src : :class:`~pyspark.sql.Column` or column name
A column of string to be split.
- delimiter : :class:`~pyspark.sql.Column` or str
+ delimiter : :class:`~pyspark.sql.Column` or column name
A column of string, the delimiter used for split.
- partNum : :class:`~pyspark.sql.Column` or str
+ partNum : :class:`~pyspark.sql.Column` or column name
A column of string, requested part of the split (1-based).
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.sentences`
+ :meth:`pyspark.sql.functions.split`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"])
- >>> df.select(split_part(df.a, df.b, df.c).alias('r')).collect()
- [Row(r='13')]
+ >>> df.select("*", sf.split_part("a", "b", "c")).show()
+ +--------+---+---+-------------------+
+ | a| b| c|split_part(a, b, c)|
+ +--------+---+---+-------------------+
+ |11.12.13| .| 3| 13|
+ +--------+---+---+-------------------+
+
+ >>> df.select("*", sf.split_part(df.a, df.b, sf.lit(-2))).show()
+ +--------+---+---+--------------------+
+ | a| b| c|split_part(a, b, -2)|
+ +--------+---+---+--------------------+
+ |11.12.13| .| 3| 12|
+ +--------+---+---+--------------------+
"""
return _invoke_function_over_columns("split_part", src, delimiter, partNum)
@@ -15695,34 +15876,42 @@ def substr(
Parameters
----------
- str : :class:`~pyspark.sql.Column` or str
+ str : :class:`~pyspark.sql.Column` or column name
A column of string.
- pos : :class:`~pyspark.sql.Column` or str
+ pos : :class:`~pyspark.sql.Column` or column name
A column of string, the substring of `str` that starts at `pos`.
- len : :class:`~pyspark.sql.Column` or str, optional
+ len : :class:`~pyspark.sql.Column` or column name, optional
A column of string, the substring of `str` is of length `len`.
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ substring of given value.
+
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.instr`
+ :meth:`pyspark.sql.functions.substring`
+ :meth:`pyspark.sql.functions.substring_index`
+ :meth:`pyspark.sql.Column.substr`
+
Examples
--------
- >>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
- ... ).select(sf.substr("a", "b", "c")).show()
- +---------------+
- |substr(a, b, c)|
- +---------------+
- | k|
- +---------------+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([("Spark SQL", 5, 1,)], ["a", "b", "c"])
+ >>> df.select("*", sf.substr("a", "b", "c")).show()
+ +---------+---+---+---------------+
+ | a| b| c|substr(a, b, c)|
+ +---------+---+---+---------------+
+ |Spark SQL| 5| 1| k|
+ +---------+---+---+---------------+
- >>> import pyspark.sql.functions as sf
- >>> spark.createDataFrame(
- ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]
- ... ).select(sf.substr("a", "b")).show()
- +------------------------+
- |substr(a, b, 2147483647)|
- +------------------------+
- | k SQL|
- +------------------------+
+ >>> df.select("*", sf.substr(df.a, df.b)).show()
+ +---------+---+---+------------------------+
+ | a| b| c|substr(a, b, 2147483647)|
+ +---------+---+---+------------------------+
+ |Spark SQL| 5| 1| k SQL|
+ +---------+---+---+------------------------+
"""
if len is not None:
return _invoke_function_over_columns("substr", str, pos, len)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]