Github user icexelloss commented on a diff in the pull request:
https://github.com/apache/spark/pull/21887#discussion_r205943208
--- Diff: examples/src/main/python/sql/arrow.py ---
@@ -113,6 +113,42 @@ def substract_mean(pdf):
# $example off:grouped_map_pandas_udf$
+def grouped_agg_pandas_udf_example(spark):
+    # $example on:grouped_agg_pandas_udf$
+    from pyspark.sql.functions import pandas_udf, PandasUDFType
+    from pyspark.sql import Window
+
+    df = spark.createDataFrame(
+        [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
+        ("id", "v"))
+
+    @pandas_udf("double", PandasUDFType.GROUPED_AGG)
+    def mean_udf(v):
+        return v.mean()
+    df.groupby("id").agg(mean_udf(df['v'])).show()
+    # +---+-----------+
+    # | id|mean_udf(v)|
+    # +---+-----------+
+    # |  1|        1.5|
+    # |  2|        6.0|
+    # +---+-----------+
+
+    w = Window \
+        .partitionBy('id') \
+        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
+    df.withColumn('mean_v', mean_udf(df['v']).over(w)).show()
+    # +---+----+------+
+    # | id|   v|mean_v|
+    # +---+----+------+
+    # |  1| 1.0|   1.5|
+    # |  1| 2.0|   1.5|
+    # |  2| 3.0|   6.0|
+    # |  2| 5.0|   6.0|
+    # |  2|10.0|   6.0|
+    # +---+----+------+
+    # $example off:grouped_map_pandas_udf$
--- End diff --
Ah, good catch, my bad. Let me try building the docs.
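
In case it helps to sanity check the example outside of the doc build, here is a rough standalone driver to run the new function directly and compare its output against the commented tables above. This is only a sketch and not part of the diff; it assumes pyarrow and a local PySpark are installed and that it sits next to arrow.py so the import resolves.

```python
from pyspark.sql import SparkSession

# Rough standalone check, not part of the diff: run the new grouped agg
# example and eyeball its output against the commented expected tables.
# Assumes this file lives next to arrow.py so the import below works.
from arrow import grouped_agg_pandas_udf_example

if __name__ == "__main__":
    spark = SparkSession.builder \
        .appName("grouped_agg_pandas_udf_check") \
        .getOrCreate()
    grouped_agg_pandas_udf_example(spark)
    spark.stop()
```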