This is an automated email from the ASF dual-hosted git repository. yangjie01 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new c7c43cf5b1e1 [SPARK-46533][PYTHON][DOCS] Refine docstring of `array_min/array_max/array_size/array_repeat` c7c43cf5b1e1 is described below commit c7c43cf5b1e162bece7c8975a05d62b5b0fc7e76 Author: yangjie01 <yangji...@baidu.com> AuthorDate: Fri Dec 29 15:02:26 2023 +0800 [SPARK-46533][PYTHON][DOCS] Refine docstring of `array_min/array_max/array_size/array_repeat` ### What changes were proposed in this pull request? This PR refines the docstrings of `array_min/array_max/array_size/array_repeat` and adds some new examples. ### Why are the changes needed? To improve PySpark documentation. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #44522 from LuciferYang/SPARK-46533. Authored-by: yangjie01 <yangji...@baidu.com> Signed-off-by: yangjie01 <yangji...@baidu.com> --- python/pyspark/sql/functions/builtin.py | 256 +++++++++++++++++++++++++++++--- 1 file changed, 235 insertions(+), 21 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 3b579f20333e..b86fb4692012 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -14769,7 +14769,7 @@ def size(col: "ColumnOrName") -> Column: @_try_remote_functions def array_min(col: "ColumnOrName") -> Column: """ - Collection function: returns the minimum value of the array. + Array function: returns the minimum value of the array. .. versionadded:: 2.4.0 @@ -14779,18 +14779,74 @@ def array_min(col: "ColumnOrName") -> Column: Parameters ---------- col : :class:`~pyspark.sql.Column` or str - name of column or expression + The name of the column or an expression that represents the array. Returns ------- :class:`~pyspark.sql.Column` - minimum value of array. + A new column that contains the minimum value of each array. 
Examples -------- + Example 1: Basic usage with integer array + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) - >>> df.select(array_min(df.data).alias('min')).collect() - [Row(min=1), Row(min=-1)] + >>> df.select(sf.array_min(df.data)).show() + +---------------+ + |array_min(data)| + +---------------+ + | 1| + | -1| + +---------------+ + + Example 2: Usage with string array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['apple', 'banana', 'cherry'],)], ['data']) + >>> df.select(sf.array_min(df.data)).show() + +---------------+ + |array_min(data)| + +---------------+ + | apple| + +---------------+ + + Example 3: Usage with mixed type array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['apple', 1, 'cherry'],)], ['data']) + >>> df.select(sf.array_min(df.data)).show() + +---------------+ + |array_min(data)| + +---------------+ + | 1| + +---------------+ + + Example 4: Usage with array of arrays + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([[2, 1], [3, 4]],)], ['data']) + >>> df.select(sf.array_min(df.data)).show() + +---------------+ + |array_min(data)| + +---------------+ + | [2, 1]| + +---------------+ + + Example 5: Usage with empty array + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... 
]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.array_min(df.data)).show() + +---------------+ + |array_min(data)| + +---------------+ + | NULL| + +---------------+ """ return _invoke_function_over_columns("array_min", col) @@ -14798,7 +14854,7 @@ def array_min(col: "ColumnOrName") -> Column: @_try_remote_functions def array_max(col: "ColumnOrName") -> Column: """ - Collection function: returns the maximum value of the array. + Array function: returns the maximum value of the array. .. versionadded:: 2.4.0 @@ -14808,18 +14864,74 @@ def array_max(col: "ColumnOrName") -> Column: Parameters ---------- col : :class:`~pyspark.sql.Column` or str - name of column or expression + The name of the column or an expression that represents the array. Returns ------- :class:`~pyspark.sql.Column` - maximum value of an array. + A new column that contains the maximum value of each array. Examples -------- + Example 1: Basic usage with integer array + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data']) - >>> df.select(array_max(df.data).alias('max')).collect() - [Row(max=3), Row(max=10)] + >>> df.select(sf.array_max(df.data)).show() + +---------------+ + |array_max(data)| + +---------------+ + | 3| + | 10| + +---------------+ + + Example 2: Usage with string array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['apple', 'banana', 'cherry'],)], ['data']) + >>> df.select(sf.array_max(df.data)).show() + +---------------+ + |array_max(data)| + +---------------+ + | cherry| + +---------------+ + + Example 3: Usage with mixed type array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['apple', 1, 'cherry'],)], ['data']) + >>> df.select(sf.array_max(df.data)).show() + +---------------+ + |array_max(data)| + +---------------+ + | cherry| + +---------------+ + + Example 4: Usage with array of arrays + + >>> from 
pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([[2, 1], [3, 4]],)], ['data']) + >>> df.select(sf.array_max(df.data)).show() + +---------------+ + |array_max(data)| + +---------------+ + | [3, 4]| + +---------------+ + + Example 5: Usage with empty array + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... ]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.array_max(df.data)).show() + +---------------+ + |array_max(data)| + +---------------+ + | NULL| + +---------------+ """ return _invoke_function_over_columns("array_max", col) @@ -14827,25 +14939,82 @@ def array_max(col: "ColumnOrName") -> Column: @_try_remote_functions def array_size(col: "ColumnOrName") -> Column: """ - Returns the total number of elements in the array. The function returns null for null input. + Array function: returns the total number of elements in the array. + The function returns null for null input. .. versionadded:: 3.5.0 Parameters ---------- col : :class:`~pyspark.sql.Column` or str - target column to compute on. + The name of the column or an expression that represents the array. Returns ------- :class:`~pyspark.sql.Column` - total number of elements in the array. + A new column that contains the size of each array. 
Examples -------- + Example 1: Basic usage with integer array + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([([2, 1, 3],), (None,)], ['data']) - >>> df.select(array_size(df.data).alias('r')).collect() - [Row(r=3), Row(r=None)] + >>> df.select(sf.array_size(df.data)).show() + +----------------+ + |array_size(data)| + +----------------+ + | 3| + | NULL| + +----------------+ + + Example 2: Usage with string array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['apple', 'banana', 'cherry'],)], ['data']) + >>> df.select(sf.array_size(df.data)).show() + +----------------+ + |array_size(data)| + +----------------+ + | 3| + +----------------+ + + Example 3: Usage with mixed type array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['apple', 1, 'cherry'],)], ['data']) + >>> df.select(sf.array_size(df.data)).show() + +----------------+ + |array_size(data)| + +----------------+ + | 3| + +----------------+ + + Example 4: Usage with array of arrays + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([[2, 1], [3, 4]],)], ['data']) + >>> df.select(sf.array_size(df.data)).show() + +----------------+ + |array_size(data)| + +----------------+ + | 2| + +----------------+ + + Example 5: Usage with empty array + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", ArrayType(IntegerType()), True) + ... 
]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.array_size(df.data)).show() + +----------------+ + |array_size(data)| + +----------------+ + | 0| + +----------------+ """ return _invoke_function_over_columns("array_size", col) @@ -15268,7 +15437,7 @@ def map_from_entries(col: "ColumnOrName") -> Column: @_try_remote_functions def array_repeat(col: "ColumnOrName", count: Union["ColumnOrName", int]) -> Column: """ - Collection function: creates an array containing a column repeated count times. + Array function: creates an array containing a column repeated count times. .. versionadded:: 2.4.0 @@ -15278,20 +15447,65 @@ def array_repeat(col: "ColumnOrName", count: Union["ColumnOrName", int]) -> Colu Parameters ---------- col : :class:`~pyspark.sql.Column` or str - column name or column that contains the element to be repeated + The name of the column or an expression that represents the element to be repeated. count : :class:`~pyspark.sql.Column` or str or int - column name, column, or int containing the number of times to repeat the first argument + The name of the column, an expression, + or an integer that represents the number of times to repeat the element. Returns ------- :class:`~pyspark.sql.Column` - an array of repeated elements. + A new column that contains an array of repeated elements. 
Examples -------- + Example 1: Usage with string + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('ab',)], ['data']) - >>> df.select(array_repeat(df.data, 3).alias('r')).collect() - [Row(r=['ab', 'ab', 'ab'])] + >>> df.select(sf.array_repeat(df.data, 3)).show() + +---------------------+ + |array_repeat(data, 3)| + +---------------------+ + | [ab, ab, ab]| + +---------------------+ + + Example 2: Usage with integer + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(3,)], ['data']) + >>> df.select(sf.array_repeat(df.data, 2)).show() + +---------------------+ + |array_repeat(data, 2)| + +---------------------+ + | [3, 3]| + +---------------------+ + + Example 3: Usage with array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(['apple', 'banana'],)], ['data']) + >>> df.select(sf.array_repeat(df.data, 2)).show(truncate=False) + +----------------------------------+ + |array_repeat(data, 2) | + +----------------------------------+ + |[[apple, banana], [apple, banana]]| + +----------------------------------+ + + Example 4: Usage with null + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import IntegerType, StructType, StructField + >>> schema = StructType([ + ... StructField("data", IntegerType(), True) + ... ]) + >>> df = spark.createDataFrame([(None, )], schema=schema) + >>> df.select(sf.array_repeat(df.data, 3)).show() + +---------------------+ + |array_repeat(data, 3)| + +---------------------+ + | [NULL, NULL, NULL]| + +---------------------+ """ count = lit(count) if isinstance(count, int) else count --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org