This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 1d2159f8881 [SPARK-40852][CONNECT][PYTHON][DOC][FOLLOWUP] Add document for `DataFrame.summary` 1d2159f8881 is described below commit 1d2159f888139e65c80db2003d521b0f684df83a Author: Jiaan Geng <belie...@163.com> AuthorDate: Wed Dec 7 18:49:21 2022 +0800 [SPARK-40852][CONNECT][PYTHON][DOC][FOLLOWUP] Add document for `DataFrame.summary` ### What changes were proposed in this pull request? This PR adds documentation for `DataFrame.summary`. ### Why are the changes needed? This PR adds documentation for `DataFrame.summary`. ### Does this PR introduce _any_ user-facing change? 'No'. `DataFrame.summary` is a new API. ### How was this patch tested? N/A Closes #38962 from beliefer/SPARK-40852_followup. Authored-by: Jiaan Geng <belie...@163.com> Signed-off-by: Ruifeng Zheng <ruife...@apache.org> --- python/pyspark/sql/connect/dataframe.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/python/pyspark/sql/connect/dataframe.py b/python/pyspark/sql/connect/dataframe.py index 1e1b5dbff21..f268dc431b8 100644 --- a/python/pyspark/sql/connect/dataframe.py +++ b/python/pyspark/sql/connect/dataframe.py @@ -1316,6 +1316,38 @@ class DataFrame(object): return DataFrameStatFunctions(self) def summary(self, *statistics: str) -> "DataFrame": + """Computes specified statistics for numeric and string columns. + + .. versionadded:: 3.4.0 + + Available statistics are: + count + mean + stddev + min + max + arbitrary approximate percentiles specified as a percentage (e.g. 75%) + count_distinct + approx_count_distinct + + Notes + ----- + If no statistics are given, this function computes 'count', 'mean', 'stddev', 'min', + 'approximate quartiles' (percentiles at 25%, 50%, and 75%), and 'max'. + This function is meant for exploratory data analysis, as we make no guarantee about the + backward compatibility of the schema of the resulting :class:`DataFrame`. 
If you want to + programmatically compute summary statistics, use the `agg` function instead. + + Parameters + ---------- + statistics : str, list, optional + Statistics from above list to be computed. + + Returns + ------- + :class:`DataFrame` + A new DataFrame that computes specified statistics for given DataFrame. + """ _statistics: List[str] = list(statistics) for s in _statistics: if not isinstance(s, str): --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org