This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new b594c4edb383 [SPARK-47621][PYTHON][DOCS] Refine docstring of `try_sum`, `try_avg`, `avg`, `sum`, `mean` b594c4edb383 is described below commit b594c4edb38364139adc3934b14284d9ed9c7d46 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Thu Mar 28 20:24:16 2024 +0800 [SPARK-47621][PYTHON][DOCS] Refine docstring of `try_sum`, `try_avg`, `avg`, `sum`, `mean` ### What changes were proposed in this pull request? This PR refines the docstrings of `try_sum`, `try_avg`, `avg`, `sum`, and `mean` with more descriptive examples. ### Why are the changes needed? For better API reference documentation. ### Does this PR introduce _any_ user-facing change? Yes, it fixes user-facing documentation. ### How was this patch tested? Manually tested. GitHub Actions should verify the doctests. ### Was this patch authored or co-authored using generative AI tooling? No Closes #45745 from HyukjinKwon/SPARK-47621. Lead-authored-by: Hyukjin Kwon <gurwls...@apache.org> Co-authored-by: Hyukjin Kwon <gurwls...@gmail.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/sql/functions/builtin.py | 149 ++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 19 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 59167ad9e736..386d28cca0c0 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -528,15 +528,45 @@ def try_avg(col: "ColumnOrName") -> Column: Examples -------- + Example 1: Calculating the average age + >>> import pyspark.sql.functions as sf - >>> spark.createDataFrame( - ... [(1982, 15), (1990, 2)], ["birth", "age"] - ... 
).select(sf.try_avg("age")).show() + >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"]) + >>> df.select(sf.try_avg("age")).show() +------------+ |try_avg(age)| +------------+ | 8.5| +------------+ + + Example 2: Calculating the average age with None + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) + >>> df.select(sf.try_avg("age")).show() + +------------+ + |try_avg(age)| + +------------+ + | 3.0| + +------------+ + + Example 3: Overflow results in NULL when ANSI mode is on + + >>> from decimal import Decimal + >>> import pyspark.sql.functions as sf + >>> origin = spark.conf.get("spark.sql.ansi.enabled") + >>> spark.conf.set("spark.sql.ansi.enabled", "true") + >>> try: + ... df = spark.createDataFrame( + ... [(Decimal("1" * 38),), (Decimal(0),)], "number DECIMAL(38, 0)") + ... df.select(sf.try_avg(df.number)).show() + ... finally: + ... spark.conf.set("spark.sql.ansi.enabled", origin) + +---------------+ + |try_avg(number)| + +---------------+ + | NULL| + +---------------+ """ return _invoke_function_over_columns("try_avg", col) @@ -720,13 +750,55 @@ def try_sum(col: "ColumnOrName") -> Column: Examples -------- - >>> import pyspark.sql.functions as sf - >>> spark.range(10).select(sf.try_sum("id")).show() + Example 1: Calculating the sum of values in a column + + >>> from pyspark.sql import functions as sf + >>> df = spark.range(10) + >>> df.select(sf.try_sum(df["id"])).show() +-----------+ |try_sum(id)| +-----------+ | 45| +-----------+ + + Example 2: Using a plus expression together to calculate the sum + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1, 2), (3, 4)], ["A", "B"]) + >>> df.select(sf.try_sum(sf.col("A") + sf.col("B"))).show() + +----------------+ + |try_sum((A + B))| + +----------------+ + | 10| + +----------------+ + + Example 3: Calculating the summation of ages with None + + >>> import pyspark.sql.functions as 
sf + >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) + >>> df.select(sf.try_sum("age")).show() + +------------+ + |try_sum(age)| + +------------+ + | 6| + +------------+ + + Example 4: Overflow results in NULL when ANSI mode is on + + >>> from decimal import Decimal + >>> import pyspark.sql.functions as sf + >>> origin = spark.conf.get("spark.sql.ansi.enabled") + >>> spark.conf.set("spark.sql.ansi.enabled", "true") + >>> try: + ... df = spark.createDataFrame([(Decimal("1" * 38),)] * 10, "number DECIMAL(38, 0)") + ... df.select(sf.try_sum(df.number)).show() + ... finally: + ... spark.conf.set("spark.sql.ansi.enabled", origin) + +---------------+ + |try_sum(number)| + +---------------+ + | NULL| + +---------------+ """ return _invoke_function_over_columns("try_sum", col) @@ -1323,6 +1395,17 @@ def sum(col: "ColumnOrName") -> Column: +------------+ | 10| +------------+ + + Example 3: Calculating the summation of ages with None + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) + >>> df.select(sf.sum("age")).show() + +--------+ + |sum(age)| + +--------+ + | 6| + +--------+ """ return _invoke_function_over_columns("sum", col) @@ -1349,13 +1432,27 @@ def avg(col: "ColumnOrName") -> Column: Examples -------- - >>> df = spark.range(10) - >>> df.select(avg(col("id"))).show() - +-------+ - |avg(id)| - +-------+ - | 4.5| - +-------+ + Example 1: Calculating the average age + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"]) + >>> df.select(sf.avg("age")).show() + +--------+ + |avg(age)| + +--------+ + | 8.5| + +--------+ + + Example 2: Calculating the average age with None + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) + >>> df.select(sf.avg("age")).show() + +--------+ + |avg(age)| + +--------+ + | 3.0| + 
+--------+ """ return _invoke_function_over_columns("avg", col) @@ -1383,13 +1480,27 @@ def mean(col: "ColumnOrName") -> Column: Examples -------- - >>> df = spark.range(10) - >>> df.select(mean(df.id)).show() - +-------+ - |avg(id)| - +-------+ - | 4.5| - +-------+ + Example 1: Calculating the average age + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"]) + >>> df.select(sf.mean("age")).show() + +--------+ + |avg(age)| + +--------+ + | 8.5| + +--------+ + + Example 2: Calculating the average age with None + + >>> import pyspark.sql.functions as sf + >>> df = spark.createDataFrame([(1982, None), (1990, 2), (2000, 4)], ["birth", "age"]) + >>> df.select(sf.mean("age")).show() + +--------+ + |avg(age)| + +--------+ + | 3.0| + +--------+ """ return _invoke_function_over_columns("mean", col) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org