[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/20695 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user jkbradley commented on a diff in the pull request: https://github.com/apache/spark/pull/20695#discussion_r181802586 --- Diff: python/pyspark/ml/stat.py --- @@ -195,6 +197,195 @@ def test(dataset, sampleCol, distName, *params): _jvm().PythonUtils.toSeq(params))) +class Summarizer(object): +""" +.. note:: Experimental + +Tools for vectorized statistics on MLlib Vectors. +The methods in this package provide various statistics for Vectors contained inside DataFrames. +This class lets users pick the statistics they would like to extract for a given column. + +>>> from pyspark.ml.stat import Summarizer +>>> from pyspark.sql import Row +>>> from pyspark.ml.linalg import Vectors +>>> summarizer = Summarizer.metrics("mean", "count") +>>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), +... Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() +>>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) ++---+ +|aggregate_metrics(features, weight)| ++---+ +|[[1.0,1.0,1.0], 1] | ++---+ + +>>> df.select(summarizer.summary(df.features)).show(truncate=False) +++ +|aggregate_metrics(features, 1.0)| +++ +|[[1.0,1.5,2.0], 2] | +++ + +>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.0,1.0] | ++--+ + +>>> df.select(Summarizer.mean(df.features)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.5,2.0] | ++--+ + + +.. 
versionadded:: 2.4.0 + +""" +@staticmethod +@since("2.4.0") +def mean(col, weightCol=None): +""" +return a column of mean summary +""" +return Summarizer._get_single_metric(col, weightCol, "mean") + +@staticmethod +@since("2.4.0") +def variance(col, weightCol=None): +""" +return a column of variance summary +""" +return Summarizer._get_single_metric(col, weightCol, "variance") + +@staticmethod +@since("2.4.0") +def count(col, weightCol=None): +""" +return a column of count summary +""" +return Summarizer._get_single_metric(col, weightCol, "count") + +@staticmethod +@since("2.4.0") +def numNonZeros(col, weightCol=None): +""" +return a column of numNonZero summary +""" +return Summarizer._get_single_metric(col, weightCol, "numNonZeros") + +@staticmethod +@since("2.4.0") +def max(col, weightCol=None): +""" +return a column of max summary +""" +return Summarizer._get_single_metric(col, weightCol, "max") + +@staticmethod +@since("2.4.0") +def min(col, weightCol=None): +""" +return a column of min summary +""" +return Summarizer._get_single_metric(col, weightCol, "min") + +@staticmethod +@since("2.4.0") +def normL1(col, weightCol=None): +""" +return a column of normL1 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL1") + +@staticmethod +@since("2.4.0") +def normL2(col, weightCol=None): +""" +return a column of normL2 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL2") + +@staticmethod +def _check_param(featuresCol, weightCol): +if weightCol is None: +weightCol = lit(1.0) +if not isinstance(featuresCol, Column) or not isinstance(weightCol, Column): +raise TypeError("featureCol and weightCol should be a Column") +return featuresCol, weightCol + +@staticmethod +def _get_single_metric(col, weightCol, metric): +col, weightCol = Summarizer._check_param(col, weightCol) +return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, +col._jc, weightCol._jc)) + +@staticmethod +
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user jkbradley commented on a diff in the pull request: https://github.com/apache/spark/pull/20695#discussion_r181263309 --- Diff: python/pyspark/ml/stat.py --- @@ -195,6 +197,185 @@ def test(dataset, sampleCol, distName, *params): _jvm().PythonUtils.toSeq(params))) +class Summarizer(object): +""" +.. note:: Experimental + +Tools for vectorized statistics on MLlib Vectors. +The methods in this package provide various statistics for Vectors contained inside DataFrames. +This class lets users pick the statistics they would like to extract for a given column. + +>>> from pyspark.ml.stat import Summarizer +>>> from pyspark.sql import Row +>>> from pyspark.ml.linalg import Vectors +>>> summarizer = Summarizer.metrics("mean", "count") +>>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), +... Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() +>>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) ++---+ +|aggregate_metrics(features, weight)| ++---+ +|[[1.0,1.0,1.0], 1] | ++---+ + +>>> df.select(summarizer.summary(df.features)).show(truncate=False) +++ +|aggregate_metrics(features, 1.0)| +++ +|[[1.0,1.5,2.0], 2] | +++ + +>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.0,1.0] | ++--+ + +>>> df.select(Summarizer.mean(df.features)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.5,2.0] | ++--+ + + +.. 
versionadded:: 2.4.0 + +""" +@staticmethod +@since("2.4.0") +def mean(col, weightCol=None): +""" +return a column of mean summary +""" +return Summarizer._get_single_metric(col, weightCol, "mean") + +@staticmethod +@since("2.4.0") +def variance(col, weightCol=None): +""" +return a column of variance summary +""" +return Summarizer._get_single_metric(col, weightCol, "variance") + +@staticmethod +@since("2.4.0") +def count(col, weightCol=None): +""" +return a column of count summary +""" +return Summarizer._get_single_metric(col, weightCol, "count") + +@staticmethod +@since("2.4.0") +def numNonZeros(col, weightCol=None): +""" +return a column of numNonZero summary +""" +return Summarizer._get_single_metric(col, weightCol, "numNonZeros") + +@staticmethod +@since("2.4.0") +def max(col, weightCol=None): +""" +return a column of max summary +""" +return Summarizer._get_single_metric(col, weightCol, "max") + +@staticmethod +@since("2.4.0") +def min(col, weightCol=None): +""" +return a column of min summary +""" +return Summarizer._get_single_metric(col, weightCol, "min") + +@staticmethod +@since("2.4.0") +def normL1(col, weightCol=None): +""" +return a column of normL1 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL1") + +@staticmethod +@since("2.4.0") +def normL2(col, weightCol=None): +""" +return a column of normL2 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL2") + +@staticmethod +def _check_param(featureCol, weightCol): +if weightCol is None: +weightCol = lit(1.0) +if not isinstance(featureCol, Column) or not isinstance(weightCol, Column): +raise TypeError("featureCol and weightCol should be a Column") +return featureCol, weightCol + +@staticmethod +def _get_single_metric(col, weightCol, metric): +col, weightCol = Summarizer._check_param(col, weightCol) +return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, +col._jc, weightCol._jc)) + +@staticmethod +
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user jkbradley commented on a diff in the pull request: https://github.com/apache/spark/pull/20695#discussion_r181259361 --- Diff: python/pyspark/ml/stat.py --- @@ -195,6 +197,185 @@ def test(dataset, sampleCol, distName, *params): _jvm().PythonUtils.toSeq(params))) +class Summarizer(object): +""" +.. note:: Experimental + +Tools for vectorized statistics on MLlib Vectors. +The methods in this package provide various statistics for Vectors contained inside DataFrames. +This class lets users pick the statistics they would like to extract for a given column. + +>>> from pyspark.ml.stat import Summarizer +>>> from pyspark.sql import Row +>>> from pyspark.ml.linalg import Vectors +>>> summarizer = Summarizer.metrics("mean", "count") +>>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), +... Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() +>>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) ++---+ +|aggregate_metrics(features, weight)| ++---+ +|[[1.0,1.0,1.0], 1] | ++---+ + +>>> df.select(summarizer.summary(df.features)).show(truncate=False) +++ +|aggregate_metrics(features, 1.0)| +++ +|[[1.0,1.5,2.0], 2] | +++ + +>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.0,1.0] | ++--+ + +>>> df.select(Summarizer.mean(df.features)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.5,2.0] | ++--+ + + +.. 
versionadded:: 2.4.0 + +""" +@staticmethod +@since("2.4.0") +def mean(col, weightCol=None): +""" +return a column of mean summary +""" +return Summarizer._get_single_metric(col, weightCol, "mean") + +@staticmethod +@since("2.4.0") +def variance(col, weightCol=None): +""" +return a column of variance summary +""" +return Summarizer._get_single_metric(col, weightCol, "variance") + +@staticmethod +@since("2.4.0") +def count(col, weightCol=None): +""" +return a column of count summary +""" +return Summarizer._get_single_metric(col, weightCol, "count") + +@staticmethod +@since("2.4.0") +def numNonZeros(col, weightCol=None): +""" +return a column of numNonZero summary +""" +return Summarizer._get_single_metric(col, weightCol, "numNonZeros") + +@staticmethod +@since("2.4.0") +def max(col, weightCol=None): +""" +return a column of max summary +""" +return Summarizer._get_single_metric(col, weightCol, "max") + +@staticmethod +@since("2.4.0") +def min(col, weightCol=None): +""" +return a column of min summary +""" +return Summarizer._get_single_metric(col, weightCol, "min") + +@staticmethod +@since("2.4.0") +def normL1(col, weightCol=None): +""" +return a column of normL1 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL1") + +@staticmethod +@since("2.4.0") +def normL2(col, weightCol=None): +""" +return a column of normL2 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL2") + +@staticmethod +def _check_param(featureCol, weightCol): +if weightCol is None: +weightCol = lit(1.0) +if not isinstance(featureCol, Column) or not isinstance(weightCol, Column): +raise TypeError("featureCol and weightCol should be a Column") +return featureCol, weightCol + +@staticmethod +def _get_single_metric(col, weightCol, metric): +col, weightCol = Summarizer._check_param(col, weightCol) +return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, +col._jc, weightCol._jc)) + +@staticmethod +
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user jkbradley commented on a diff in the pull request: https://github.com/apache/spark/pull/20695#discussion_r181259181 --- Diff: python/pyspark/ml/stat.py --- @@ -195,6 +197,185 @@ def test(dataset, sampleCol, distName, *params): _jvm().PythonUtils.toSeq(params))) +class Summarizer(object): +""" +.. note:: Experimental + +Tools for vectorized statistics on MLlib Vectors. +The methods in this package provide various statistics for Vectors contained inside DataFrames. +This class lets users pick the statistics they would like to extract for a given column. + +>>> from pyspark.ml.stat import Summarizer +>>> from pyspark.sql import Row +>>> from pyspark.ml.linalg import Vectors +>>> summarizer = Summarizer.metrics("mean", "count") +>>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), +... Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() +>>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) ++---+ +|aggregate_metrics(features, weight)| ++---+ +|[[1.0,1.0,1.0], 1] | ++---+ + +>>> df.select(summarizer.summary(df.features)).show(truncate=False) +++ +|aggregate_metrics(features, 1.0)| +++ +|[[1.0,1.5,2.0], 2] | +++ + +>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.0,1.0] | ++--+ + +>>> df.select(Summarizer.mean(df.features)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.5,2.0] | ++--+ + + +.. 
versionadded:: 2.4.0 + +""" +@staticmethod +@since("2.4.0") +def mean(col, weightCol=None): +""" +return a column of mean summary +""" +return Summarizer._get_single_metric(col, weightCol, "mean") + +@staticmethod +@since("2.4.0") +def variance(col, weightCol=None): +""" +return a column of variance summary +""" +return Summarizer._get_single_metric(col, weightCol, "variance") + +@staticmethod +@since("2.4.0") +def count(col, weightCol=None): +""" +return a column of count summary +""" +return Summarizer._get_single_metric(col, weightCol, "count") + +@staticmethod +@since("2.4.0") +def numNonZeros(col, weightCol=None): +""" +return a column of numNonZero summary +""" +return Summarizer._get_single_metric(col, weightCol, "numNonZeros") + +@staticmethod +@since("2.4.0") +def max(col, weightCol=None): +""" +return a column of max summary +""" +return Summarizer._get_single_metric(col, weightCol, "max") + +@staticmethod +@since("2.4.0") +def min(col, weightCol=None): +""" +return a column of min summary +""" +return Summarizer._get_single_metric(col, weightCol, "min") + +@staticmethod +@since("2.4.0") +def normL1(col, weightCol=None): +""" +return a column of normL1 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL1") + +@staticmethod +@since("2.4.0") +def normL2(col, weightCol=None): +""" +return a column of normL2 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL2") + +@staticmethod +def _check_param(featureCol, weightCol): +if weightCol is None: +weightCol = lit(1.0) +if not isinstance(featureCol, Column) or not isinstance(weightCol, Column): +raise TypeError("featureCol and weightCol should be a Column") +return featureCol, weightCol + +@staticmethod +def _get_single_metric(col, weightCol, metric): +col, weightCol = Summarizer._check_param(col, weightCol) +return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, +col._jc, weightCol._jc)) + +@staticmethod +
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user jkbradley commented on a diff in the pull request: https://github.com/apache/spark/pull/20695#discussion_r181259536 --- Diff: python/pyspark/ml/stat.py --- @@ -195,6 +197,185 @@ def test(dataset, sampleCol, distName, *params): _jvm().PythonUtils.toSeq(params))) +class Summarizer(object): +""" +.. note:: Experimental + +Tools for vectorized statistics on MLlib Vectors. +The methods in this package provide various statistics for Vectors contained inside DataFrames. +This class lets users pick the statistics they would like to extract for a given column. + +>>> from pyspark.ml.stat import Summarizer +>>> from pyspark.sql import Row +>>> from pyspark.ml.linalg import Vectors +>>> summarizer = Summarizer.metrics("mean", "count") +>>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), +... Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() +>>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) ++---+ +|aggregate_metrics(features, weight)| ++---+ +|[[1.0,1.0,1.0], 1] | ++---+ + +>>> df.select(summarizer.summary(df.features)).show(truncate=False) +++ +|aggregate_metrics(features, 1.0)| +++ +|[[1.0,1.5,2.0], 2] | +++ + +>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.0,1.0] | ++--+ + +>>> df.select(Summarizer.mean(df.features)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.5,2.0] | ++--+ + + +.. 
versionadded:: 2.4.0 + +""" +@staticmethod +@since("2.4.0") +def mean(col, weightCol=None): +""" +return a column of mean summary +""" +return Summarizer._get_single_metric(col, weightCol, "mean") + +@staticmethod +@since("2.4.0") +def variance(col, weightCol=None): +""" +return a column of variance summary +""" +return Summarizer._get_single_metric(col, weightCol, "variance") + +@staticmethod +@since("2.4.0") +def count(col, weightCol=None): +""" +return a column of count summary +""" +return Summarizer._get_single_metric(col, weightCol, "count") + +@staticmethod +@since("2.4.0") +def numNonZeros(col, weightCol=None): +""" +return a column of numNonZero summary +""" +return Summarizer._get_single_metric(col, weightCol, "numNonZeros") + +@staticmethod +@since("2.4.0") +def max(col, weightCol=None): +""" +return a column of max summary +""" +return Summarizer._get_single_metric(col, weightCol, "max") + +@staticmethod +@since("2.4.0") +def min(col, weightCol=None): +""" +return a column of min summary +""" +return Summarizer._get_single_metric(col, weightCol, "min") + +@staticmethod +@since("2.4.0") +def normL1(col, weightCol=None): +""" +return a column of normL1 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL1") + +@staticmethod +@since("2.4.0") +def normL2(col, weightCol=None): +""" +return a column of normL2 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL2") + +@staticmethod +def _check_param(featureCol, weightCol): +if weightCol is None: +weightCol = lit(1.0) +if not isinstance(featureCol, Column) or not isinstance(weightCol, Column): +raise TypeError("featureCol and weightCol should be a Column") +return featureCol, weightCol + +@staticmethod +def _get_single_metric(col, weightCol, metric): +col, weightCol = Summarizer._check_param(col, weightCol) +return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, +col._jc, weightCol._jc)) + +@staticmethod +
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user WeichenXu123 commented on a diff in the pull request: https://github.com/apache/spark/pull/20695#discussion_r176009765 --- Diff: python/pyspark/ml/stat.py --- @@ -132,6 +134,172 @@ def corr(dataset, column, method="pearson"): return _java2py(sc, javaCorrObj.corr(*args)) +class Summarizer(object): +""" +.. note:: Experimental + +Tools for vectorized statistics on MLlib Vectors. +The methods in this package provide various statistics for Vectors contained inside DataFrames. +This class lets users pick the statistics they would like to extract for a given column. + +>>> from pyspark.ml.stat import Summarizer +>>> from pyspark.sql import Row +>>> from pyspark.ml.linalg import Vectors +>>> summarizer = Summarizer.metrics("mean", "count") +>>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), +... Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() +>>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) ++---+ +|aggregate_metrics(features, weight)| ++---+ +|[[1.0,1.0,1.0], 1] | ++---+ + +>>> df.select(summarizer.summary(df.features)).show(truncate=False) +++ +|aggregate_metrics(features, 1.0)| +++ +|[[1.0,1.5,2.0], 2] | +++ + +>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.0,1.0] | ++--+ + +>>> df.select(Summarizer.mean(df.features)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.5,2.0] | ++--+ + + +.. 
versionadded:: 2.4.0 + +""" +def __init__(self, js): +self._js = js + +@staticmethod +@since("2.4.0") +def mean(col, weightCol=None): +""" +return a column of mean summary +""" +return Summarizer._get_single_metric(col, weightCol, "mean") + +@staticmethod +@since("2.4.0") +def variance(col, weightCol=None): +""" +return a column of variance summary +""" +return Summarizer._get_single_metric(col, weightCol, "variance") + +@staticmethod +@since("2.4.0") +def count(col, weightCol=None): +""" +return a column of count summary +""" +return Summarizer._get_single_metric(col, weightCol, "count") + +@staticmethod +@since("2.4.0") +def numNonZeros(col, weightCol=None): +""" +return a column of numNonZero summary +""" +return Summarizer._get_single_metric(col, weightCol, "numNonZeros") + +@staticmethod +@since("2.4.0") +def max(col, weightCol=None): +""" +return a column of max summary +""" +return Summarizer._get_single_metric(col, weightCol, "max") + +@staticmethod +@since("2.4.0") +def min(col, weightCol=None): +""" +return a column of min summary +""" +return Summarizer._get_single_metric(col, weightCol, "min") + +@staticmethod +@since("2.4.0") +def normL1(col, weightCol=None): +""" +return a column of normL1 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL1") + +@staticmethod +@since("2.4.0") +def normL2(col, weightCol=None): +""" +return a column of normL2 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL2") + +@staticmethod +def _check_param(featureCol, weightCol): +if weightCol is None: +weightCol = lit(1.0) +if not isinstance(featureCol, Column) or not isinstance(weightCol, Column): +raise TypeError("featureCol and weightCol should be a Column") +return featureCol, weightCol + +@staticmethod +def _get_single_metric(col, weightCol, metric): +col, weightCol = Summarizer._check_param(col, weightCol) +return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, +col._jc, weightCol._jc)) +
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
Github user MrBago commented on a diff in the pull request: https://github.com/apache/spark/pull/20695#discussion_r175971741 --- Diff: python/pyspark/ml/stat.py --- @@ -132,6 +134,172 @@ def corr(dataset, column, method="pearson"): return _java2py(sc, javaCorrObj.corr(*args)) +class Summarizer(object): +""" +.. note:: Experimental + +Tools for vectorized statistics on MLlib Vectors. +The methods in this package provide various statistics for Vectors contained inside DataFrames. +This class lets users pick the statistics they would like to extract for a given column. + +>>> from pyspark.ml.stat import Summarizer +>>> from pyspark.sql import Row +>>> from pyspark.ml.linalg import Vectors +>>> summarizer = Summarizer.metrics("mean", "count") +>>> df = sc.parallelize([Row(weight=1.0, features=Vectors.dense(1.0, 1.0, 1.0)), +... Row(weight=0.0, features=Vectors.dense(1.0, 2.0, 3.0))]).toDF() +>>> df.select(summarizer.summary(df.features, df.weight)).show(truncate=False) ++---+ +|aggregate_metrics(features, weight)| ++---+ +|[[1.0,1.0,1.0], 1] | ++---+ + +>>> df.select(summarizer.summary(df.features)).show(truncate=False) +++ +|aggregate_metrics(features, 1.0)| +++ +|[[1.0,1.5,2.0], 2] | +++ + +>>> df.select(Summarizer.mean(df.features, df.weight)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.0,1.0] | ++--+ + +>>> df.select(Summarizer.mean(df.features)).show(truncate=False) ++--+ +|mean(features)| ++--+ +|[1.0,1.5,2.0] | ++--+ + + +.. 
versionadded:: 2.4.0 + +""" +def __init__(self, js): +self._js = js + +@staticmethod +@since("2.4.0") +def mean(col, weightCol=None): +""" +return a column of mean summary +""" +return Summarizer._get_single_metric(col, weightCol, "mean") + +@staticmethod +@since("2.4.0") +def variance(col, weightCol=None): +""" +return a column of variance summary +""" +return Summarizer._get_single_metric(col, weightCol, "variance") + +@staticmethod +@since("2.4.0") +def count(col, weightCol=None): +""" +return a column of count summary +""" +return Summarizer._get_single_metric(col, weightCol, "count") + +@staticmethod +@since("2.4.0") +def numNonZeros(col, weightCol=None): +""" +return a column of numNonZero summary +""" +return Summarizer._get_single_metric(col, weightCol, "numNonZeros") + +@staticmethod +@since("2.4.0") +def max(col, weightCol=None): +""" +return a column of max summary +""" +return Summarizer._get_single_metric(col, weightCol, "max") + +@staticmethod +@since("2.4.0") +def min(col, weightCol=None): +""" +return a column of min summary +""" +return Summarizer._get_single_metric(col, weightCol, "min") + +@staticmethod +@since("2.4.0") +def normL1(col, weightCol=None): +""" +return a column of normL1 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL1") + +@staticmethod +@since("2.4.0") +def normL2(col, weightCol=None): +""" +return a column of normL2 summary +""" +return Summarizer._get_single_metric(col, weightCol, "normL2") + +@staticmethod +def _check_param(featureCol, weightCol): +if weightCol is None: +weightCol = lit(1.0) +if not isinstance(featureCol, Column) or not isinstance(weightCol, Column): +raise TypeError("featureCol and weightCol should be a Column") +return featureCol, weightCol + +@staticmethod +def _get_single_metric(col, weightCol, metric): +col, weightCol = Summarizer._check_param(col, weightCol) +return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric, +col._jc, weightCol._jc)) + +
[GitHub] spark pull request #20695: [SPARK-21741][ML][PySpark] Python API for DataFra...
GitHub user WeichenXu123 opened a pull request: https://github.com/apache/spark/pull/20695 [SPARK-21741][ML][PySpark] Python API for DataFrame-based multivariate summarizer ## What changes were proposed in this pull request? Python API for DataFrame-based multivariate summarizer. ## How was this patch tested? doctest added. You can merge this pull request into a Git repository by running: $ git pull https://github.com/WeichenXu123/spark py_summarizer Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/20695.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #20695 commit 488d45aa9cb9c4de96453d1f2c06f83db2b1ef77 Author: WeichenXu Date: 2018-02-28T14:17:12Z init pr --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org