This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new f7a264375890 [SPARK-50423][PYTHON][DOCS] Refine the docstring for misc and hash functions
f7a264375890 is described below
commit f7a264375890d8dc3a605457e4aff90f879be412
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Nov 26 18:08:52 2024 +0900
[SPARK-50423][PYTHON][DOCS] Refine the docstring for misc and hash functions
### What changes were proposed in this pull request?
Refine the docstring for misc and hash functions
### Why are the changes needed?
to improve docs and test coverage
### Does this PR introduce _any_ user-facing change?
doc-only changes
### How was this patch tested?
updated doctests
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #48969 from zhengruifeng/py_doc_10.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 551 +++++++++++++++++++++++---------
1 file changed, 398 insertions(+), 153 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 31f3e88c46c0..ac4f2fc9881b 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -12691,9 +12691,15 @@ def current_catalog() -> Column:
.. versionadded:: 3.5.0
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.current_database`
+ :meth:`pyspark.sql.functions.current_schema`
+
Examples
--------
- >>> spark.range(1).select(current_catalog()).show()
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(1).select(sf.current_catalog()).show()
+-----------------+
|current_catalog()|
+-----------------+
@@ -12709,9 +12715,15 @@ def current_database() -> Column:
.. versionadded:: 3.5.0
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.current_catalog`
+ :meth:`pyspark.sql.functions.current_schema`
+
Examples
--------
- >>> spark.range(1).select(current_database()).show()
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(1).select(sf.current_database()).show()
+----------------+
|current_schema()|
+----------------+
@@ -12727,6 +12739,11 @@ def current_schema() -> Column:
.. versionadded:: 3.5.0
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.current_catalog`
+ :meth:`pyspark.sql.functions.current_database`
+
Examples
--------
>>> import pyspark.sql.functions as sf
@@ -12746,9 +12763,15 @@ def current_user() -> Column:
.. versionadded:: 3.5.0
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.user`
+ :meth:`pyspark.sql.functions.session_user`
+
Examples
--------
- >>> spark.range(1).select(current_user()).show() # doctest: +SKIP
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(1).select(sf.current_user()).show() # doctest: +SKIP
+--------------+
|current_user()|
+--------------+
@@ -12764,6 +12787,11 @@ def user() -> Column:
.. versionadded:: 3.5.0
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.current_user`
+ :meth:`pyspark.sql.functions.session_user`
+
Examples
--------
>>> import pyspark.sql.functions as sf
@@ -12783,6 +12811,11 @@ def session_user() -> Column:
.. versionadded:: 4.0.0
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.user`
+ :meth:`pyspark.sql.functions.current_user`
+
Examples
--------
>>> import pyspark.sql.functions as sf
@@ -12799,7 +12832,7 @@ def session_user() -> Column:
@_try_remote_functions
def crc32(col: "ColumnOrName") -> Column:
"""
- Calculates the cyclic redundancy check value (CRC32) of a binary column and
+ Calculates the cyclic redundancy check value (CRC32) of a binary column and
returns the value as a bigint.
.. versionchanged:: 3.4.0
@@ -12807,7 +12840,7 @@ def crc32(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to compute on.
Returns
@@ -12819,8 +12852,14 @@ def crc32(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect()
- [Row(crc32=2743272264)]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('ABC',)], ['a'])
+ >>> df.select('*', sf.crc32('a')).show(truncate=False)
+ +---+----------+
+ |a |crc32(a) |
+ +---+----------+
+ |ABC|2743272264|
+ +---+----------+
"""
return _invoke_function_over_columns("crc32", col)
@@ -12836,7 +12875,7 @@ def md5(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to compute on.
Returns
@@ -12846,8 +12885,14 @@ def md5(col: "ColumnOrName") -> Column:
Examples
--------
- >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()
- [Row(hash='902fbdd2b1df0c4f70b4a5d23525e932')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('ABC',)], ['a'])
+ >>> df.select('*', sf.md5('a')).show(truncate=False)
+ +---+--------------------------------+
+ |a |md5(a) |
+ +---+--------------------------------+
+ |ABC|902fbdd2b1df0c4f70b4a5d23525e932|
+ +---+--------------------------------+
"""
return _invoke_function_over_columns("md5", col)
@@ -12863,7 +12908,7 @@ def sha1(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to compute on.
Returns
@@ -12871,10 +12916,21 @@ def sha1(col: "ColumnOrName") -> Column:
:class:`~pyspark.sql.Column`
the column for computed results.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.sha`
+ :meth:`pyspark.sql.functions.sha2`
+
Examples
--------
- >>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()
- [Row(hash='3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('ABC',)], ['a'])
+ >>> df.select('*', sf.sha1('a')).show(truncate=False)
+ +---+----------------------------------------+
+ |a |sha1(a) |
+ +---+----------------------------------------+
+ |ABC|3c01bdbb26f358bab27f267924aa2c9a03fcfdb8|
+ +---+----------------------------------------+
"""
return _invoke_function_over_columns("sha1", col)
@@ -12892,7 +12948,7 @@ def sha2(col: "ColumnOrName", numBits: int) -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
target column to compute on.
numBits : int
the desired bit length of the result, which must have a
@@ -12903,12 +12959,18 @@ def sha2(col: "ColumnOrName", numBits: int) -> Column:
:class:`~pyspark.sql.Column`
the column for computed results.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.sha`
+ :meth:`pyspark.sql.functions.sha1`
+
Examples
--------
- >>> df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"])
- >>> df.withColumn("sha2", sha2(df.name, 256)).show(truncate=False)
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([['Alice'], ['Bob']], ['name'])
+ >>> df.select('*', sf.sha2('name', 256)).show(truncate=False)
+-----+----------------------------------------------------------------+
- |name |sha2 |
+ |name |sha2(name, 256) |
+-----+----------------------------------------------------------------+
|Alice|3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043|
|Bob |cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961|
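The sha2 values above can be verified the same way; numBits selects the SHA-2 family member, and 256 pairs with hashlib.sha256. A sketch, assuming `spark`:

    import hashlib
    from pyspark.sql import functions as sf

    df = spark.createDataFrame([("Alice",)], ["name"])
    # valid numBits values are 224, 256, 384, 512, or 0 (equivalent to 256)
    assert df.select(sf.sha2("name", 256)).first()[0] == hashlib.sha256(b"Alice").hexdigest()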
@@ -12938,7 +13000,7 @@ def hash(*cols: "ColumnOrName") -> Column:
Parameters
----------
- cols : :class:`~pyspark.sql.Column` or str
+ cols : :class:`~pyspark.sql.Column` or column name
one or more columns to compute on.
Returns
@@ -12946,27 +13008,34 @@ def hash(*cols: "ColumnOrName") -> Column:
:class:`~pyspark.sql.Column`
hash value as int column.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.xxhash64`
+
Examples
--------
+ >>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2'])
+ >>> df.select('*', sf.hash('c1')).show()
+ +---+---+----------+
+ | c1| c2| hash(c1)|
+ +---+---+----------+
+ |ABC|DEF|-757602832|
+ +---+---+----------+
- Hash for one column
-
- >>> df.select(hash('c1').alias('hash')).show()
- +----------+
- | hash|
- +----------+
- |-757602832|
- +----------+
-
- Two or more columns
+ >>> df.select('*', sf.hash('c1', df.c2)).show()
+ +---+---+------------+
+ | c1| c2|hash(c1, c2)|
+ +---+---+------------+
+ |ABC|DEF| 599895104|
+ +---+---+------------+
- >>> df.select(hash('c1', 'c2').alias('hash')).show()
- +---------+
- | hash|
- +---------+
- |599895104|
- +---------+
+ >>> df.select('*', sf.hash('*')).show()
+ +---+---+------------+
+ | c1| c2|hash(c1, c2)|
+ +---+---+------------+
+ |ABC|DEF| 599895104|
+ +---+---+------------+
"""
return _invoke_function_over_seq_of_columns("hash", cols)
@@ -12983,7 +13052,7 @@ def xxhash64(*cols: "ColumnOrName") -> Column:
Parameters
----------
- cols : :class:`~pyspark.sql.Column` or str
+ cols : :class:`~pyspark.sql.Column` or column name
one or more columns to compute on.
Returns
@@ -12991,27 +13060,34 @@ def xxhash64(*cols: "ColumnOrName") -> Column:
:class:`~pyspark.sql.Column`
hash value as long column.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.hash`
+
Examples
--------
+ >>> import pyspark.sql.functions as sf
>>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2'])
-
- Hash for one column
-
- >>> df.select(xxhash64('c1').alias('hash')).show()
- +-------------------+
- | hash|
- +-------------------+
- |4105715581806190027|
- +-------------------+
-
- Two or more columns
-
- >>> df.select(xxhash64('c1', 'c2').alias('hash')).show()
- +-------------------+
- | hash|
- +-------------------+
- |3233247871021311208|
- +-------------------+
+ >>> df.select('*', sf.xxhash64('c1')).show()
+ +---+---+-------------------+
+ | c1| c2| xxhash64(c1)|
+ +---+---+-------------------+
+ |ABC|DEF|4105715581806190027|
+ +---+---+-------------------+
+
+ >>> df.select('*', sf.xxhash64('c1', df.c2)).show()
+ +---+---+-------------------+
+ | c1| c2| xxhash64(c1, c2)|
+ +---+---+-------------------+
+ |ABC|DEF|3233247871021311208|
+ +---+---+-------------------+
+
+ >>> df.select('*', sf.xxhash64('*')).show()
+ +---+---+-------------------+
+ | c1| c2| xxhash64(c1, c2)|
+ +---+---+-------------------+
+ |ABC|DEF|3233247871021311208|
+ +---+---+-------------------+
"""
return _invoke_function_over_seq_of_columns("xxhash64", cols)
@@ -13029,9 +13105,9 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
column name or column that represents the input column to test
- errMsg : :class:`~pyspark.sql.Column` or str, optional
+ errMsg : :class:`~pyspark.sql.Column` or literal string, optional
A Python string literal or column containing the error message
Returns
@@ -13039,16 +13115,36 @@ def assert_true(col: "ColumnOrName", errMsg: Optional[Union[Column, str]] = None
:class:`~pyspark.sql.Column`
`null` if the input column is `true` otherwise throws an error with specified message.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.raise_error`
+
Examples
--------
- >>> df = spark.createDataFrame([(0,1)], ['a', 'b'])
- >>> df.select(assert_true(df.a < df.b).alias('r')).collect()
- [Row(r=None)]
- >>> df.select(assert_true(df.a < df.b, df.a).alias('r')).collect()
- [Row(r=None)]
- >>> df.select(assert_true(df.a < df.b, 'error').alias('r')).collect()
- [Row(r=None)]
- >>> df.select(assert_true(df.a > df.b, 'My error msg').alias('r')).collect() # doctest: +SKIP
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([(0, 1)], ['a', 'b'])
+ >>> df.select('*', sf.assert_true(df.a < df.b)).show() # doctest: +SKIP
+ +------------------------------------------------------+
+ |assert_true((a < b), '(a#788L < b#789L)' is not true!)|
+ +------------------------------------------------------+
+ | NULL|
+ +------------------------------------------------------+
+
+ >>> df.select('*', sf.assert_true(df.a < df.b, df.a)).show()
+ +---+---+-----------------------+
+ | a| b|assert_true((a < b), a)|
+ +---+---+-----------------------+
+ | 0| 1| NULL|
+ +---+---+-----------------------+
+
+ >>> df.select('*', sf.assert_true(df.a < df.b, 'error')).show()
+ +---+---+---------------------------+
+ | a| b|assert_true((a < b), error)|
+ +---+---+---------------------------+
+ | 0| 1| NULL|
+ +---+---+---------------------------+
+
+ >>> df.select('*', sf.assert_true(df.a > df.b, 'My error msg')).show() # doctest: +SKIP
...
java.lang.RuntimeException: My error msg
...
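In practice assert_true acts as a data-quality gate: the job fails fast if any row violates the predicate, and returns NULLs otherwise. A sketch with illustrative column names, assuming `spark`:

    from pyspark.sql import functions as sf

    df = spark.createDataFrame([(1, 100), (2, 200)], ["id", "amount"])
    # an action is needed so the assertion actually evaluates; this one passes
    df.select(sf.assert_true(sf.col("amount") > 0, "amount must be positive")).collect()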
@@ -13076,7 +13172,7 @@ def raise_error(errMsg: Union[Column, str]) -> Column:
Parameters
----------
- errMsg : :class:`~pyspark.sql.Column` or str
+ errMsg : :class:`~pyspark.sql.Column` or literal string
A Python string literal or column containing the error message
Returns
@@ -13084,10 +13180,14 @@ def raise_error(errMsg: Union[Column, str]) -> Column:
:class:`~pyspark.sql.Column`
throws an error with specified message.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.assert_true`
+
Examples
--------
- >>> df = spark.range(1)
- >>> df.select(raise_error("My error message")).show() # doctest: +SKIP
+ >>> import pyspark.sql.functions as sf
+ >>> spark.range(1).select(sf.raise_error("My error message")).show() # doctest: +SKIP
...
java.lang.RuntimeException: My error message
...
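raise_error is typically paired with when()/otherwise() so the error fires only on bad rows; the NULL-typed raise_error branch coerces to the type of the other branch. A sketch, assuming `spark`:

    from pyspark.sql import functions as sf

    df = spark.createDataFrame([(1,), (2,)], ["x"])
    # raises only if some row has x < 0; otherwise passes x through unchanged
    df.select(
        sf.when(sf.col("x") < 0, sf.raise_error("negative x"))
          .otherwise(sf.col("x")).alias("x")
    ).show()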
@@ -24359,21 +24459,21 @@ def aes_encrypt(
Parameters
----------
- input : :class:`~pyspark.sql.Column` or str
+ input : :class:`~pyspark.sql.Column` or column name
The binary value to encrypt.
- key : :class:`~pyspark.sql.Column` or str
+ key : :class:`~pyspark.sql.Column` or column name
The passphrase to use to encrypt the data.
mode : :class:`~pyspark.sql.Column` or str, optional
Specifies which block cipher mode should be used to encrypt messages. Valid modes: ECB,
GCM, CBC.
- padding : :class:`~pyspark.sql.Column` or str, optional
+ padding : :class:`~pyspark.sql.Column` or column name, optional
Specifies how to pad messages whose length is not a multiple of the block size. Valid
values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS
for CBC.
- iv : :class:`~pyspark.sql.Column` or str, optional
+ iv : :class:`~pyspark.sql.Column` or column name, optional
Optional initialization vector. Only supported for CBC and GCM modes. Valid values: None or
"". 16-byte array for CBC mode. 12-byte array for GCM mode.
- aad : :class:`~pyspark.sql.Column` or str, optional
+ aad : :class:`~pyspark.sql.Column` or column name, optional
Optional additional authenticated data. Only supported for GCM mode. This can be any
free-form input and must be provided for both encryption and decryption.
@@ -24382,6 +24482,11 @@ def aes_encrypt(
:class:`~pyspark.sql.Column`
A new column that contains an encrypted value.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.aes_decrypt`
+ :meth:`pyspark.sql.functions.try_aes_decrypt`
+
Examples
--------
@@ -24394,7 +24499,7 @@ def aes_encrypt(
... ["input", "key", "mode", "padding", "iv", "aad"]
... )
>>> df.select(sf.base64(sf.aes_encrypt(
- ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex")), df.aad)
+ ... df.input, df.key, "mode", df.padding, sf.to_binary(df.iv, sf.lit("hex")), df.aad)
... )).show(truncate=False)
+-----------------------------------------------------------------------+
|base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), aad))|
@@ -24411,7 +24516,7 @@ def aes_encrypt(
... ["input", "key", "mode", "padding", "iv", "aad"]
... )
>>> df.select(sf.base64(sf.aes_encrypt(
- ... df.input, df.key, df.mode, df.padding, sf.to_binary(df.iv, sf.lit("hex")))
+ ... df.input, df.key, "mode", df.padding, sf.to_binary(df.iv, sf.lit("hex")))
... )).show(truncate=False)
+--------------------------------------------------------------------+
|base64(aes_encrypt(input, key, mode, padding, to_binary(iv, hex), ))|
@@ -24426,7 +24531,7 @@ def aes_encrypt(
... "Spark SQL", "1234567890abcdef", "ECB", "PKCS",)],
... ["input", "key", "mode", "padding"]
... )
- >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode, df.padding),
+ >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, "mode", df.padding),
... df.key, df.mode, df.padding
... ).cast("STRING")).show(truncate=False)
+---------------------------------------------------------------------------------------------+
@@ -24442,7 +24547,7 @@ def aes_encrypt(
... "Spark SQL", "0000111122223333", "ECB",)],
... ["input", "key", "mode"]
... )
- >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, df.mode),
+ >>> df.select(sf.aes_decrypt(sf.aes_encrypt(df.input, df.key, "mode"),
... df.key, df.mode
... ).cast("STRING")).show(truncate=False)
+---------------------------------------------------------------------------------------------+
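A compact round-trip check complements the examples above; with the default GCM mode, decrypting with the same key recovers the plaintext. A sketch, assuming `spark`:

    from pyspark.sql import functions as sf

    df = spark.createDataFrame([("Spark SQL", "1234567890abcdef")], ["input", "key"])
    # defaults are mode=GCM, padding=DEFAULT; the round trip yields "Spark SQL"
    df.select(
        sf.aes_decrypt(sf.aes_encrypt("input", "key"), "key").cast("STRING")
    ).show(truncate=False)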
@@ -24493,18 +24598,18 @@ def aes_decrypt(
Parameters
----------
- input : :class:`~pyspark.sql.Column` or str
+ input : :class:`~pyspark.sql.Column` or column name
The binary value to decrypt.
- key : :class:`~pyspark.sql.Column` or str
+ key : :class:`~pyspark.sql.Column` or column name
The passphrase to use to decrypt the data.
- mode : :class:`~pyspark.sql.Column` or str, optional
+ mode : :class:`~pyspark.sql.Column` or column name, optional
Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,
GCM, CBC.
- padding : :class:`~pyspark.sql.Column` or str, optional
+ padding : :class:`~pyspark.sql.Column` or column name, optional
Specifies how to pad messages whose length is not a multiple of the block size. Valid
values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS
for CBC.
- aad : :class:`~pyspark.sql.Column` or str, optional
+ aad : :class:`~pyspark.sql.Column` or column name, optional
Optional additional authenticated data. Only supported for GCM mode. This can be any
free-form input and must be provided for both encryption and decryption.
@@ -24513,6 +24618,11 @@ def aes_decrypt(
:class:`~pyspark.sql.Column`
A new column that contains a decrypted value.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.aes_encrypt`
+ :meth:`pyspark.sql.functions.try_aes_decrypt`
+
Examples
--------
@@ -24526,7 +24636,7 @@ def aes_decrypt(
... ["input", "key", "mode", "padding", "aad"]
... )
>>> df.select(sf.aes_decrypt(
- ... sf.unbase64(df.input), df.key, df.mode, df.padding, df.aad
+ ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad
... ).cast("STRING")).show(truncate=False)
+---------------------------------------------------------------------+
|CAST(aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)|
@@ -24543,7 +24653,7 @@ def aes_decrypt(
... ["input", "key", "mode", "padding"]
... )
>>> df.select(sf.aes_decrypt(
- ... sf.unbase64(df.input), df.key, df.mode, df.padding
+ ... sf.unbase64(df.input), df.key, "mode", df.padding
... ).cast("STRING")).show(truncate=False)
+------------------------------------------------------------------+
|CAST(aes_decrypt(unbase64(input), key, mode, padding, ) AS STRING)|
@@ -24560,7 +24670,7 @@ def aes_decrypt(
... ["input", "key", "mode", "padding"]
... )
>>> df.select(sf.aes_decrypt(
- ... sf.unbase64(df.input), df.key, df.mode
+ ... sf.unbase64(df.input), df.key, "mode"
... ).cast("STRING")).show(truncate=False)
+------------------------------------------------------------------+
|CAST(aes_decrypt(unbase64(input), key, mode, DEFAULT, ) AS STRING)|
@@ -24612,18 +24722,18 @@ def try_aes_decrypt(
Parameters
----------
- input : :class:`~pyspark.sql.Column` or str
+ input : :class:`~pyspark.sql.Column` or column name
The binary value to decrypt.
- key : :class:`~pyspark.sql.Column` or str
+ key : :class:`~pyspark.sql.Column` or column name
The passphrase to use to decrypt the data.
- mode : :class:`~pyspark.sql.Column` or str, optional
+ mode : :class:`~pyspark.sql.Column` or column name, optional
Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,
GCM, CBC.
- padding : :class:`~pyspark.sql.Column` or str, optional
+ padding : :class:`~pyspark.sql.Column` or column name, optional
Specifies how to pad messages whose length is not a multiple of the block size. Valid
values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS
for CBC.
- aad : :class:`~pyspark.sql.Column` or str, optional
+ aad : :class:`~pyspark.sql.Column` or column name, optional
Optional additional authenticated data. Only supported for GCM mode. This can be any
free-form input and must be provided for both encryption and decryption.
@@ -24632,6 +24742,11 @@ def try_aes_decrypt(
:class:`~pyspark.sql.Column`
A new column that contains a decrypted value or a NULL value.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.aes_encrypt`
+ :meth:`pyspark.sql.functions.aes_decrypt`
+
Examples
--------
@@ -24645,7 +24760,7 @@ def try_aes_decrypt(
... ["input", "key", "mode", "padding", "aad"]
... )
>>> df.select(sf.try_aes_decrypt(
- ... sf.unbase64(df.input), df.key, df.mode, df.padding, df.aad
+ ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad
... ).cast("STRING")).show(truncate=False)
+-------------------------------------------------------------------------+
|CAST(try_aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)|
@@ -24663,7 +24778,7 @@ def try_aes_decrypt(
... ["input", "key", "mode", "padding", "aad"]
... )
>>> df.select(sf.try_aes_decrypt(
- ... sf.unbase64(df.input), df.key, df.mode, df.padding, df.aad
+ ... sf.unbase64(df.input), df.key, "mode", df.padding, df.aad
... ).cast("STRING")).show(truncate=False)
+-------------------------------------------------------------------------+
|CAST(try_aes_decrypt(unbase64(input), key, mode, padding, aad) AS STRING)|
@@ -24680,7 +24795,7 @@ def try_aes_decrypt(
... ["input", "key", "mode", "padding"]
... )
>>> df.select(sf.try_aes_decrypt(
- ... sf.unbase64(df.input), df.key, df.mode, df.padding
+ ... sf.unbase64(df.input), df.key, "mode", df.padding
... ).cast("STRING")).show(truncate=False)
+----------------------------------------------------------------------+
|CAST(try_aes_decrypt(unbase64(input), key, mode, padding, ) AS STRING)|
@@ -24697,7 +24812,7 @@ def try_aes_decrypt(
... ["input", "key", "mode", "padding"]
... )
>>> df.select(sf.try_aes_decrypt(
- ... sf.unbase64(df.input), df.key, df.mode
+ ... sf.unbase64(df.input), df.key, "mode"
... ).cast("STRING")).show(truncate=False)
+----------------------------------------------------------------------+
|CAST(try_aes_decrypt(unbase64(input), key, mode, DEFAULT, ) AS STRING)|
@@ -24737,7 +24852,12 @@ def sha(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
+
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.sha1`
+ :meth:`pyspark.sql.functions.sha2`
Examples
--------
@@ -24827,18 +24947,28 @@ def reflect(*cols: "ColumnOrName") -> Column:
Parameters
----------
- cols : :class:`~pyspark.sql.Column` or str
- the first element should be a literal string for the class name,
- and the second element should be a literal string for the method name,
- and the remaining are input arguments to the Java method.
+ cols : :class:`~pyspark.sql.Column` or column name
+ the first element should be a Column representing literal string for the class name,
+ and the second element should be a Column representing literal string for the method name,
+ and the remaining are input arguments (Columns or column names) to the Java method.
+
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.java_method`
+ :meth:`pyspark.sql.functions.try_reflect`
Examples
--------
- >>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"])
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('a5cf6c42-0c85-418f-af6c-3e4e5b1328f2',)], ['a'])
>>> df.select(
- ... reflect(lit("java.util.UUID"), lit("fromString"), df.a).alias('r')
- ... ).collect()
- [Row(r='a5cf6c42-0c85-418f-af6c-3e4e5b1328f2')]
+ ... sf.reflect(sf.lit('java.util.UUID'), sf.lit('fromString'), 'a')
+ ... ).show(truncate=False)
+ +--------------------------------------+
+ |reflect(java.util.UUID, fromString, a)|
+ +--------------------------------------+
+ |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 |
+ +--------------------------------------+
"""
return _invoke_function_over_seq_of_columns("reflect", cols)
@@ -24852,13 +24982,20 @@ def java_method(*cols: "ColumnOrName") -> Column:
Parameters
----------
- cols : :class:`~pyspark.sql.Column` or str
- the first element should be a literal string for the class name,
- and the second element should be a literal string for the method name,
- and the remaining are input arguments to the Java method.
+ cols : :class:`~pyspark.sql.Column` or column name
+ the first element should be a Column representing literal string for the class name,
+ and the second element should be a Column representing literal string for the method name,
+ and the remaining are input arguments (Columns or column names) to the Java method.
+
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.reflect`
+ :meth:`pyspark.sql.functions.try_reflect`
Examples
--------
+ Example 1: Reflecting a method call with a column argument
+
>>> import pyspark.sql.functions as sf
>>> spark.range(1).select(
... sf.java_method(
@@ -24872,6 +25009,19 @@ def java_method(*cols: "ColumnOrName") -> Column:
+-----------------------------------------------------------------------------+
|a5cf6c42-0c85-418f-af6c-3e4e5b1328f2                                         |
+-----------------------------------------------------------------------------+
+
+ Example 2: Reflecting a method call with a column name argument
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.createDataFrame([('a5cf6c42-0c85-418f-af6c-3e4e5b1328f2',)], ['a'])
+ >>> df.select(
+ ... sf.java_method(sf.lit('java.util.UUID'), sf.lit('fromString'), 'a')
+ ... ).show(truncate=False)
+ +------------------------------------------+
+ |java_method(java.util.UUID, fromString, a)|
+ +------------------------------------------+
+ |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 |
+ +------------------------------------------+
"""
return _invoke_function_over_seq_of_columns("java_method", cols)
@@ -24887,10 +25037,15 @@ def try_reflect(*cols: "ColumnOrName") -> Column:
Parameters
----------
- cols : :class:`~pyspark.sql.Column` or str
- the first element should be a literal string for the class name,
- and the second element should be a literal string for the method name,
- and the remaining are input arguments to the Java method.
+ cols : :class:`~pyspark.sql.Column` or column name
+ the first element should be a Column representing literal string for the class name,
+ and the second element should be a Column representing literal string for the method name,
+ and the remaining are input arguments (Columns or column names) to the Java method.
+
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.reflect`
+ :meth:`pyspark.sql.functions.java_method`
Examples
--------
@@ -24899,25 +25054,24 @@ def try_reflect(*cols: "ColumnOrName") -> Column:
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"])
>>> df.select(
- ... sf.try_reflect(sf.lit("java.util.UUID"), sf.lit("fromString"),
df.a)
- ... ).show()
+ ... sf.try_reflect(sf.lit("java.util.UUID"), sf.lit("fromString"), "a")
+ ... ).show(truncate=False)
+------------------------------------------+
|try_reflect(java.util.UUID, fromString, a)|
+------------------------------------------+
- | a5cf6c42-0c85-418...|
+ |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 |
+------------------------------------------+
Example 2: Exception in the reflection call, resulting in null
>>> from pyspark.sql import functions as sf
- >>> df = spark.range(1)
- >>> df.select(
+ >>> spark.range(1).select(
... sf.try_reflect(sf.lit("scala.Predef"), sf.lit("require"),
sf.lit(False))
- ... ).show()
+ ... ).show(truncate=False)
+-----------------------------------------+
|try_reflect(scala.Predef, require, false)|
+-----------------------------------------+
- | NULL|
+ |NULL |
+-----------------------------------------+
"""
return _invoke_function_over_seq_of_columns("try_reflect", cols)
@@ -24933,12 +25087,12 @@ def version() -> Column:
Examples
--------
- >>> df = spark.range(1)
- >>> df.select(version()).show(truncate=False) # doctest: +SKIP
+ >>> from pyspark.sql import functions as sf
+ >>> spark.range(1).select(sf.version()).show(truncate=False) # doctest: +SKIP
+----------------------------------------------+
|version() |
+----------------------------------------------+
- |3.5.0 cafbea5b13623276517a9d716f75745eff91f616|
+ |4.0.0 4f8d1f575e99aeef8990c63a9614af0fc5479330|
+----------------------------------------------+
"""
return _invoke_function_over_columns("version")
@@ -24953,13 +25107,18 @@ def typeof(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
Examples
--------
- >>> df = spark.createDataFrame([(1,)], ["a"])
- >>> df.select(typeof(df.a).alias('r')).collect()
- [Row(r='bigint')]
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(True, 1, 1.0, 'xyz',)], ['a', 'b', 'c', 'd'])
+ >>> df.select(sf.typeof(df.a), sf.typeof(df.b), sf.typeof('c'), sf.typeof('d')).show()
+ +---------+---------+---------+---------+
+ |typeof(a)|typeof(b)|typeof(c)|typeof(d)|
+ +---------+---------+---------+---------+
+ | boolean| bigint| double| string|
+ +---------+---------+---------+---------+
"""
return _invoke_function_over_columns("typeof", col)
@@ -24974,20 +25133,48 @@ def stack(*cols: "ColumnOrName") -> Column:
Parameters
----------
- cols : :class:`~pyspark.sql.Column` or str
+ cols : :class:`~pyspark.sql.Column` or column name
the first element should be a literal int for the number of rows to be separated,
and the remaining are input elements to be separated.
Examples
--------
- >>> df = spark.createDataFrame([(1, 2, 3)], ["a", "b", "c"])
- >>> df.select(stack(lit(2), df.a, df.b, df.c)).show(truncate=False)
- +----+----+
- |col0|col1|
- +----+----+
- |1 |2 |
- |3 |NULL|
- +----+----+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c'])
+ >>> df.select('*', sf.stack(sf.lit(2), df.a, df.b, 'c')).show()
+ +---+---+---+----+----+
+ | a| b| c|col0|col1|
+ +---+---+---+----+----+
+ | 1| 2| 3| 1| 2|
+ | 1| 2| 3| 3|NULL|
+ +---+---+---+----+----+
+
+ >>> df.select('*', sf.stack(sf.lit(2), df.a, df.b, 'c').alias('x', 'y')).show()
+ +---+---+---+---+----+
+ | a| b| c| x| y|
+ +---+---+---+---+----+
+ | 1| 2| 3| 1| 2|
+ | 1| 2| 3| 3|NULL|
+ +---+---+---+---+----+
+
+ >>> df.select('*', sf.stack(sf.lit(3), df.a, df.b, 'c')).show()
+ +---+---+---+----+
+ | a| b| c|col0|
+ +---+---+---+----+
+ | 1| 2| 3| 1|
+ | 1| 2| 3| 2|
+ | 1| 2| 3| 3|
+ +---+---+---+----+
+
+ >>> df.select('*', sf.stack(sf.lit(4), df.a, df.b, 'c')).show()
+ +---+---+---+----+
+ | a| b| c|col0|
+ +---+---+---+----+
+ | 1| 2| 3| 1|
+ | 1| 2| 3| 2|
+ | 1| 2| 3| 3|
+ | 1| 2| 3|NULL|
+ +---+---+---+----+
"""
return _invoke_function_over_seq_of_columns("stack", cols)
@@ -25001,14 +25188,26 @@ def bitmap_bit_position(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
The input column.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.bitmap_bucket_number`
+ :meth:`pyspark.sql.functions.bitmap_construct_agg`
+ :meth:`pyspark.sql.functions.bitmap_count`
+ :meth:`pyspark.sql.functions.bitmap_or_agg`
+
Examples
--------
- >>> df = spark.createDataFrame([(123,)], ["a"])
- >>> df.select(bitmap_bit_position(df.a).alias("r")).collect()
- [Row(r=122)]
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(123,)], ['a'])
+ >>> df.select('*', sf.bitmap_bit_position('a')).show()
+ +---+----------------------+
+ | a|bitmap_bit_position(a)|
+ +---+----------------------+
+ |123| 122|
+ +---+----------------------+
"""
return _invoke_function_over_columns("bitmap_bit_position", col)
@@ -25022,14 +25221,26 @@ def bitmap_bucket_number(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
The input column.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.bitmap_bit_position`
+ :meth:`pyspark.sql.functions.bitmap_construct_agg`
+ :meth:`pyspark.sql.functions.bitmap_count`
+ :meth:`pyspark.sql.functions.bitmap_or_agg`
+
Examples
--------
- >>> df = spark.createDataFrame([(123,)], ["a"])
- >>> df.select(bitmap_bucket_number(df.a).alias("r")).collect()
- [Row(r=1)]
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(123,)], ['a'])
+ >>> df.select('*', sf.bitmap_bucket_number('a')).show()
+ +---+-----------------------+
+ | a|bitmap_bucket_number(a)|
+ +---+-----------------------+
+ |123| 1|
+ +---+-----------------------+
"""
return _invoke_function_over_columns("bitmap_bucket_number", col)
@@ -25044,16 +25255,28 @@ def bitmap_construct_agg(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
The input column will most likely be bitmap_bit_position().
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.bitmap_bit_position`
+ :meth:`pyspark.sql.functions.bitmap_bucket_number`
+ :meth:`pyspark.sql.functions.bitmap_count`
+ :meth:`pyspark.sql.functions.bitmap_or_agg`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([(1,),(2,),(3,)], ["a"])
- >>> df.select(substring(hex(
- ... bitmap_construct_agg(bitmap_bit_position(df.a))
- ... ), 0, 6).alias("r")).collect()
- [Row(r='070000')]
+ >>> df.select(
+ ... sf.bitmap_construct_agg(sf.bitmap_bit_position('a'))
+ ... ).show()
+ +--------------------------------------------+
+ |bitmap_construct_agg(bitmap_bit_position(a))|
+ +--------------------------------------------+
+ | [07 00 00 00 00 0...|
+ +--------------------------------------------+
"""
return _invoke_function_over_columns("bitmap_construct_agg", col)
@@ -25067,14 +25290,26 @@ def bitmap_count(col: "ColumnOrName") -> Column:
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
The input bitmap.
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.bitmap_bit_position`
+ :meth:`pyspark.sql.functions.bitmap_bucket_number`
+ :meth:`pyspark.sql.functions.bitmap_construct_agg`
+ :meth:`pyspark.sql.functions.bitmap_or_agg`
+
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("FFFF",)], ["a"])
- >>> df.select(bitmap_count(to_binary(df.a, lit("hex"))).alias('r')).collect()
- [Row(r=16)]
+ >>> df.select(sf.bitmap_count(sf.to_binary(df.a, sf.lit("hex")))).show()
+ +-------------------------------+
+ |bitmap_count(to_binary(a, hex))|
+ +-------------------------------+
+ | 16|
+ +-------------------------------+
"""
return _invoke_function_over_columns("bitmap_count", col)
@@ -25087,18 +25322,28 @@ def bitmap_or_agg(col: "ColumnOrName") -> Column:
.. versionadded:: 3.5.0
+ See Also
+ --------
+ :meth:`pyspark.sql.functions.bitmap_bit_position`
+ :meth:`pyspark.sql.functions.bitmap_bucket_number`
+ :meth:`pyspark.sql.functions.bitmap_construct_agg`
+ :meth:`pyspark.sql.functions.bitmap_count`
+
Parameters
----------
- col : :class:`~pyspark.sql.Column` or str
+ col : :class:`~pyspark.sql.Column` or column name
The input column should be bitmaps created from bitmap_construct_agg().
Examples
--------
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([("10",),("20",),("40",)], ["a"])
- >>> df.select(substring(hex(
- ... bitmap_or_agg(to_binary(df.a, lit("hex")))
- ... ), 0, 6).alias("r")).collect()
- [Row(r='700000')]
+ >>> df.select(sf.bitmap_or_agg(sf.to_binary(df.a, sf.lit("hex")))).show()
+ +--------------------------------+
+ |bitmap_or_agg(to_binary(a, hex))|
+ +--------------------------------+
+ | [70 00 00 00 00 0...|
+ +--------------------------------+
"""
return _invoke_function_over_columns("bitmap_or_agg", col)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]