This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 464a3c19f51 [SPARK-45113][PYTHON][DOCS][FOLLOWUP] Add sorting to the
example of `collect_set/collect_list` to ensure stable results
464a3c19f51 is described below
commit 464a3c19f51081914a09d4394820da059cb2ee47
Author: yangjie01 <[email protected]>
AuthorDate: Tue Sep 19 08:48:03 2023 +0900
[SPARK-45113][PYTHON][DOCS][FOLLOWUP] Add sorting to the example of
`collect_set/collect_list` to ensure stable results
### What changes were proposed in this pull request?
This PR applies `sort_array` to the output of `collect_set` and
`collect_list` in their examples to ensure the displayed result is stable.
### Why are the changes needed?
When executing the examples of `collect_set` and `collect_list` with
different versions of Scala, the output may differ, causing the daily tests
on Scala 2.13 to fail:
- https://github.com/apache/spark/actions/runs/6209111340/job/16856005714
```
**********************************************************************
File "/__w/spark/spark/python/pyspark/sql/connect/functions.py", line 1030,
in pyspark.sql.connect.functions.collect_set
Failed example:
df.select(sf.collect_set('age')).show()
Expected:
+----------------+
|collect_set(age)|
+----------------+
| [5, 2]|
+----------------+
Got:
+----------------+
|collect_set(age)|
+----------------+
| [2, 5]|
+----------------+
<BLANKLINE>
**********************************************************************
1 of 9 in pyspark.sql.connect.functions.collect_set
***Test Failed*** 1 failures.
Had test failures in pyspark.sql.connect.functions with python3.9; see logs.
Error: running /__w/spark/spark/python/run-tests --modules=pyspark-connect
--parallelism=1 ; received return code 255
Error: Process completed with exit code 19.
```
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Pass GitHub Actions
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #42968 from LuciferYang/SPARK-45113-FOLLOWUP.
Lead-authored-by: yangjie01 <[email protected]>
Co-authored-by: YangJie <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/functions.py | 96 ++++++++++++++++++++---------------------
1 file changed, 48 insertions(+), 48 deletions(-)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 54bd330ebc0..5474873df7b 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -3671,39 +3671,39 @@ def collect_list(col: "ColumnOrName") -> Column:
Examples
--------
- Example 1: Collect values from a single column DataFrame
+ Example 1: Collect values from a DataFrame and sort the result in
ascending order
>>> from pyspark.sql import functions as sf
- >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
- >>> df.select(sf.collect_list('age')).show()
- +-----------------+
- |collect_list(age)|
- +-----------------+
- | [2, 5, 5]|
- +-----------------+
-
- Example 2: Collect values from a DataFrame with multiple columns
-
- >>> from pyspark.sql import functions as sf
- >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")],
("id", "name"))
- >>> df.groupBy("name").agg(sf.collect_list('id')).show()
- +----+----------------+
- |name|collect_list(id)|
- +----+----------------+
- |John| [1, 2]|
- | Ana| [3]|
- +----+----------------+
+ >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',))
+ >>>
df.select(sf.sort_array(sf.collect_list('value')).alias('sorted_list')).show()
+ +-----------+
+ |sorted_list|
+ +-----------+
+ | [1, 2, 2]|
+ +-----------+
- Example 3: Collect values from a DataFrame and sort the result
+ Example 2: Collect values from a DataFrame and sort the result in
descending order
>>> from pyspark.sql import functions as sf
- >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',))
- >>>
df.select(sf.array_sort(sf.collect_list('value')).alias('sorted_list')).show()
+ >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
+ >>> df.select(sf.sort_array(sf.collect_list('age'),
asc=False).alias('sorted_list')).show()
+-----------+
|sorted_list|
+-----------+
- | [1, 2, 2]|
+ | [5, 5, 2]|
+-----------+
+
+ Example 3: Collect values from a DataFrame with multiple columns and sort
the result
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")],
("id", "name"))
+ >>>
df.groupBy("name").agg(sf.sort_array(sf.collect_list('id')).alias('sorted_list')).show()
+ +----+-----------+
+ |name|sorted_list|
+ +----+-----------+
+ |John| [1, 2]|
+ | Ana| [3]|
+ +----+-----------+
"""
return _invoke_function_over_columns("collect_list", col)
@@ -3762,39 +3762,39 @@ def collect_set(col: "ColumnOrName") -> Column:
Examples
--------
- Example 1: Collect values from a single column DataFrame
+ Example 1: Collect values from a DataFrame and sort the result in
ascending order
>>> from pyspark.sql import functions as sf
- >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
- >>> df.select(sf.collect_set('age')).show()
- +----------------+
- |collect_set(age)|
- +----------------+
- | [5, 2]|
- +----------------+
-
- Example 2: Collect values from a DataFrame with multiple columns
-
- >>> from pyspark.sql import functions as sf
- >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")],
("id", "name"))
- >>> df.groupBy("name").agg(sf.collect_set('id')).show()
- +----+---------------+
- |name|collect_set(id)|
- +----+---------------+
- |John| [1, 2]|
- | Ana| [3]|
- +----+---------------+
+ >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',))
+ >>>
df.select(sf.sort_array(sf.collect_set('value')).alias('sorted_set')).show()
+ +----------+
+ |sorted_set|
+ +----------+
+ | [1, 2]|
+ +----------+
- Example 3: Collect values from a DataFrame and sort the result
+ Example 2: Collect values from a DataFrame and sort the result in
descending order
>>> from pyspark.sql import functions as sf
- >>> df = spark.createDataFrame([(1,), (2,), (2,)], ('value',))
- >>>
df.select(sf.array_sort(sf.collect_set('value')).alias('sorted_set')).show()
+ >>> df = spark.createDataFrame([(2,), (5,), (5,)], ('age',))
+ >>> df.select(sf.sort_array(sf.collect_set('age'),
asc=False).alias('sorted_set')).show()
+----------+
|sorted_set|
+----------+
- | [1, 2]|
+ | [5, 2]|
+----------+
+
+ Example 3: Collect values from a DataFrame with multiple columns and sort
the result
+
+ >>> from pyspark.sql import functions as sf
+ >>> df = spark.createDataFrame([(1, "John"), (2, "John"), (3, "Ana")],
("id", "name"))
+ >>>
df.groupBy("name").agg(sf.sort_array(sf.collect_set('id')).alias('sorted_set')).show()
+ +----+----------+
+ |name|sorted_set|
+ +----+----------+
+ |John| [1, 2]|
+ | Ana| [3]|
+ +----+----------+
"""
return _invoke_function_over_columns("collect_set", col)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]