This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 9f3e9500661 [SPARK-40010][PYTHON][DOCS] Make pyspark.sql.window examples self-contained
9f3e9500661 is described below
commit 9f3e950066142a8f588b88102d2ac92fdb45af98
Author: Qian.Sun <[email protected]>
AuthorDate: Wed Aug 10 11:43:37 2022 +0900
[SPARK-40010][PYTHON][DOCS] Make pyspark.sql.window examples self-contained
### What changes were proposed in this pull request?
This PR proposes to improve the examples in `pyspark.sql.window` by making
each example self-contained, with a brief explanation and a slightly more
realistic example.
### Why are the changes needed?
To make the documentation more readable and easier to copy and paste directly
into the PySpark shell.
### Does this PR introduce _any_ user-facing change?
Yes, it changes the documentation.
### How was this patch tested?
Manually ran each doctest.
Closes #37450 from dcoliversun/SPARK-40010.
Authored-by: Qian.Sun <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/window.py | 50 ++++++++++++++++++++++++++++++++++----------
1 file changed, 39 insertions(+), 11 deletions(-)
diff --git a/python/pyspark/sql/window.py b/python/pyspark/sql/window.py
index b8bc90f458c..f895e2010ce 100644
--- a/python/pyspark/sql/window.py
+++ b/python/pyspark/sql/window.py
@@ -128,11 +128,23 @@ class Window:
--------
>>> from pyspark.sql import Window
>>> from pyspark.sql import functions as func
- >>> from pyspark.sql import SQLContext
- >>> sc = SparkContext.getOrCreate()
- >>> sqlContext = SQLContext(sc)
- >>> tup = [(1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")]
- >>> df = sqlContext.createDataFrame(tup, ["id", "category"])
+ >>> df = spark.createDataFrame(
+ ...     [(1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")], ["id", "category"])
+ >>> df.show()
+ +---+--------+
+ | id|category|
+ +---+--------+
+ | 1| a|
+ | 1| a|
+ | 2| a|
+ | 1| b|
+ | 2| b|
+ | 3| b|
+ +---+--------+
+
+ Calculate sum of ``id`` in the range from currentRow to currentRow + 1
+ in partition ``category``
+
>>> window = Window.partitionBy("category").orderBy("id").rowsBetween(Window.currentRow, 1)
>>> df.withColumn("sum", func.sum("id").over(window)).sort("id", "category", "sum").show()
+---+--------+---+
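
For reference, the rowsBetween doctest above runs end-to-end as a plain
script. A minimal sketch, assuming a local SparkSession (the builder settings
here are illustrative; the data and window definition come straight from the
doctest), with the frame's expected output in the trailing comments:

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as func

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame(
        [(1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")], ["id", "category"])

    # rowsBetween defines a physical frame: each row is summed with the row
    # immediately after it within its category partition, ordered by id.
    window = Window.partitionBy("category").orderBy("id").rowsBetween(Window.currentRow, 1)
    df.withColumn("sum", func.sum("id").over(window)).sort("id", "category", "sum").show()
    # +---+--------+---+
    # | id|category|sum|
    # +---+--------+---+
    # |  1|       a|  2|
    # |  1|       a|  3|
    # |  1|       b|  3|
    # |  2|       a|  2|
    # |  2|       b|  5|
    # |  3|       b|  3|
    # +---+--------+---+
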
@@ -196,11 +208,23 @@ class Window:
--------
>>> from pyspark.sql import Window
>>> from pyspark.sql import functions as func
- >>> from pyspark.sql import SQLContext
- >>> sc = SparkContext.getOrCreate()
- >>> sqlContext = SQLContext(sc)
- >>> tup = [(1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")]
- >>> df = sqlContext.createDataFrame(tup, ["id", "category"])
+ >>> df = spark.createDataFrame(
+ ...     [(1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")], ["id", "category"])
+ >>> df.show()
+ +---+--------+
+ | id|category|
+ +---+--------+
+ | 1| a|
+ | 1| a|
+ | 2| a|
+ | 1| b|
+ | 2| b|
+ | 3| b|
+ +---+--------+
+
+ Calculate sum of ``id`` in the range from ``id`` of currentRow to ``id`` of currentRow + 1
+ in partition ``category``
+
>>> window = Window.partitionBy("category").orderBy("id").rangeBetween(Window.currentRow, 1)
>>> df.withColumn("sum", func.sum("id").over(window)).sort("id", "category").show()
+---+--------+---+
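
The rangeBetween variant defines the frame on the ordering value rather than
on row position: each row aggregates every row in its partition whose ``id``
lies in [id, id + 1], so peer rows with equal ``id`` all fall in each other's
frame. A sketch under the same assumed local session as above:

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as func

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame(
        [(1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")], ["id", "category"])

    # rangeBetween defines a logical frame: all rows whose id value lies
    # between the current id and id + 1, inclusive, within the partition.
    window = Window.partitionBy("category").orderBy("id").rangeBetween(Window.currentRow, 1)
    df.withColumn("sum", func.sum("id").over(window)).sort("id", "category").show()
    # +---+--------+---+
    # | id|category|sum|
    # +---+--------+---+
    # |  1|       a|  4|
    # |  1|       a|  4|
    # |  1|       b|  3|
    # |  2|       a|  2|
    # |  2|       b|  5|
    # |  3|       b|  3|
    # +---+--------+---+

Note how the two ``id = 1`` rows in partition ``a`` now both get 4
(1 + 1 + 2), whereas rowsBetween gave them 2 and 3.
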
@@ -329,13 +353,17 @@ class WindowSpec:
def _test() -> None:
import doctest
+ from pyspark.sql import SparkSession
import pyspark.sql.window
- SparkContext("local[4]", "PythonTest")
globs = pyspark.sql.window.__dict__.copy()
+ spark = SparkSession.builder.master("local[4]").appName("sql.window tests").getOrCreate()
+ globs["spark"] = spark
+
(failure_count, test_count) = doctest.testmod(
pyspark.sql.window, globs=globs,
optionflags=doctest.NORMALIZE_WHITESPACE
)
+ spark.stop()
if failure_count:
sys.exit(-1)
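
The _test() change above swaps the old SparkContext bootstrap for a
SparkSession and injects it into the doctest globals, so the examples can
refer to a bare ``spark`` variable exactly as they would in the PySpark
shell. A self-contained sketch of that runner pattern (the app name is taken
from the diff; the __main__ guard reflects how PySpark modules conventionally
expose it):

    import doctest
    import sys

    from pyspark.sql import SparkSession

    import pyspark.sql.window


    def _test() -> None:
        globs = pyspark.sql.window.__dict__.copy()
        # Provide a ready-made session so doctests can use `spark` directly.
        spark = SparkSession.builder.master("local[4]").appName("sql.window tests").getOrCreate()
        globs["spark"] = spark
        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.window, globs=globs, optionflags=doctest.NORMALIZE_WHITESPACE
        )
        spark.stop()
        if failure_count:
            sys.exit(-1)


    if __name__ == "__main__":
        _test()
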
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]