This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2e917fb91924 [SPARK-46978][PYTHON][DOCS] Refine docstring of
`sum_distinct/array_agg/count_if`
2e917fb91924 is described below
commit 2e917fb919244b421c5a2770403c0fd91336f65d
Author: yangjie01 <[email protected]>
AuthorDate: Mon Feb 5 11:58:25 2024 -0800
[SPARK-46978][PYTHON][DOCS] Refine docstring of
`sum_distinct/array_agg/count_if`
### What changes were proposed in this pull request?
This PR refines the docstrings of `sum_distinct/array_agg/count_if` and adds some
new examples.
### Why are the changes needed?
To improve PySpark documentation
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Passes GitHub Actions.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #45031 from LuciferYang/agg-functions.
Lead-authored-by: yangjie01 <[email protected]>
Co-authored-by: YangJie <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 134 +++++++++++++++++++++++++++++---
1 file changed, 123 insertions(+), 11 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index 0932ac1c2843..cb872fdb8180 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -1472,13 +1472,51 @@ def sum_distinct(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.createDataFrame([(None,), (1,), (1,), (2,)],
schema=["numbers"])
- >>> df.select(sum_distinct(col("numbers"))).show()
+ Example 1: Using sum_distinct function on a column with all distinct values
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(1,), (2,), (3,), (4,)], ["numbers"])
+ >>> df.select(sf.sum_distinct('numbers')).show()
+ +---------------------+
+ |sum(DISTINCT numbers)|
+ +---------------------+
+ | 10|
+ +---------------------+
+
+ Example 2: Using sum_distinct function on a column with no distinct values
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(1,), (1,), (1,), (1,)], ["numbers"])
+ >>> df.select(sf.sum_distinct('numbers')).show()
+ +---------------------+
+ |sum(DISTINCT numbers)|
+ +---------------------+
+ | 1|
+ +---------------------+
+
+ Example 3: Using sum_distinct function on a column with null and duplicate
values
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(None,), (1,), (1,), (2,)], ["numbers"])
+ >>> df.select(sf.sum_distinct('numbers')).show()
+---------------------+
|sum(DISTINCT numbers)|
+---------------------+
| 3|
+---------------------+
+
+ Example 4: Using sum_distinct function on a column with all None values
+
+ >>> from pyspark.sql import functions as sf
+ >>> from pyspark.sql.types import StructType, StructField, IntegerType
+ >>> schema = StructType([StructField("numbers", IntegerType(), True)])
+ >>> df = spark.createDataFrame([(None,), (None,), (None,), (None,)],
schema=schema)
+ >>> df.select(sf.sum_distinct('numbers')).show()
+ +---------------------+
+ |sum(DISTINCT numbers)|
+ +---------------------+
+ | NULL|
+ +---------------------+
"""
return _invoke_function_over_columns("sum_distinct", col)
@@ -4122,9 +4160,49 @@ def array_agg(col: "ColumnOrName") -> Column:
Examples
--------
+ Example 1: Using array_agg function on an int column
+
+ >>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([[1],[1],[2]], ["c"])
- >>> df.agg(array_agg('c').alias('r')).collect()
- [Row(r=[1, 1, 2])]
+ >>> df.agg(sf.sort_array(sf.array_agg('c'))).show()
+ +---------------------------------+
+ |sort_array(collect_list(c), true)|
+ +---------------------------------+
+ | [1, 1, 2]|
+ +---------------------------------+
+
+ Example 2: Using array_agg function on a string column
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([["apple"],["apple"],["banana"]], ["c"])
+ >>> df.agg(sf.sort_array(sf.array_agg('c'))).show(truncate=False)
+ +---------------------------------+
+ |sort_array(collect_list(c), true)|
+ +---------------------------------+
+ |[apple, apple, banana] |
+ +---------------------------------+
+
+ Example 3: Using array_agg function on a column with null values
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([[1],[None],[2]], ["c"])
+ >>> df.agg(sf.sort_array(sf.array_agg('c'))).show()
+ +---------------------------------+
+ |sort_array(collect_list(c), true)|
+ +---------------------------------+
+ | [1, 2]|
+ +---------------------------------+
+
+ Example 4: Using array_agg function on a column with different data types
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([[1],["apple"],[2]], ["c"])
+ >>> df.agg(sf.sort_array(sf.array_agg('c'))).show()
+ +---------------------------------+
+ |sort_array(collect_list(c), true)|
+ +---------------------------------+
+ | [1, 2, apple]|
+ +---------------------------------+
"""
return _invoke_function_over_columns("array_agg", col)
@@ -6809,7 +6887,8 @@ def last_value(col: "ColumnOrName", ignoreNulls:
Optional[Union[bool, Column]] =
@_try_remote_functions
def count_if(col: "ColumnOrName") -> Column:
- """Returns the number of `TRUE` values for the `col`.
+ """
+ Aggregate function: Returns the number of `TRUE` values for the `col`.
.. versionadded:: 3.5.0
@@ -6825,17 +6904,50 @@ def count_if(col: "ColumnOrName") -> Column:
Examples
--------
- >>> df = spark.createDataFrame([("a", 1),
- ... ("a", 2),
- ... ("a", 3),
- ... ("b", 8),
- ... ("b", 2)], ["c1", "c2"])
- >>> df.select(count_if(col('c2') % 2 == 0)).show()
+ Example 1: Counting the number of even numbers in a numeric column
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([("a", 1), ("a", 2), ("a", 3), ("b", 8),
("b", 2)], ["c1", "c2"])
+ >>> df.select(sf.count_if(sf.col('c2') % 2 == 0)).show()
+------------------------+
|count_if(((c2 % 2) = 0))|
+------------------------+
| 3|
+------------------------+
+
+ Example 2: Counting the number of rows where a string column starts with a
certain letter
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame(
+ ... [("apple",), ("banana",), ("cherry",), ("apple",), ("banana",)],
["fruit"])
+ >>> df.select(sf.count_if(sf.col('fruit').startswith('a'))).show()
+ +------------------------------+
+ |count_if(startswith(fruit, a))|
+ +------------------------------+
+ | 2|
+ +------------------------------+
+
+ Example 3: Counting the number of rows where a numeric column is greater
than a certain value
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["num"])
+ >>> df.select(sf.count_if(sf.col('num') > 3)).show()
+ +-------------------+
+ |count_if((num > 3))|
+ +-------------------+
+ | 2|
+ +-------------------+
+
+ Example 4: Counting the number of rows where a boolean column is True
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(True,), (False,), (True,), (False,),
(True,)], ["bool"])
+ >>> df.select(sf.count_if(sf.col('bool'))).show()
+ +--------------+
+ |count_if(bool)|
+ +--------------+
+ | 3|
+ +--------------+
"""
return _invoke_function_over_columns("count_if", col)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]