spark git commit: [SPARK-6091] [MLLIB] Add MulticlassMetrics in PySpark/MLlib
Repository: spark Updated Branches: refs/heads/branch-1.4 e96fc8630 - fe46374f9 [SPARK-6091] [MLLIB] Add MulticlassMetrics in PySpark/MLlib https://issues.apache.org/jira/browse/SPARK-6091 Author: Yanbo Liang yblia...@gmail.com Closes #6011 from yanboliang/spark-6091 and squashes the following commits: bb3e4ba [Yanbo Liang] trigger jenkins 53c045d [Yanbo Liang] keep compatibility for python 2.6 972d5ac [Yanbo Liang] Add MulticlassMetrics in PySpark/MLlib (cherry picked from commit bf7e81a51cd81706570615cd67362c86602dec88) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe46374f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe46374f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe46374f Branch: refs/heads/branch-1.4 Commit: fe46374f9c785dfe2b15a003c996240965554cdf Parents: e96fc86 Author: Yanbo Liang yblia...@gmail.com Authored: Sun May 10 00:57:14 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Sun May 10 00:57:29 2015 -0700 -- .../mllib/evaluation/MulticlassMetrics.scala| 8 ++ python/pyspark/mllib/evaluation.py | 129 +++ 2 files changed, 137 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fe46374f/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 666362a..4628dc5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -23,6 +23,7 @@ import org.apache.spark.SparkContext._ import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame /** * 
::Experimental:: @@ -33,6 +34,13 @@ import org.apache.spark.rdd.RDD @Experimental class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { + /** + * An auxiliary constructor taking a DataFrame. + * @param predictionAndLabels a DataFrame with two double columns: prediction and label + */ + private[mllib] def this(predictionAndLabels: DataFrame) = +this(predictionAndLabels.map(r = (r.getDouble(0), r.getDouble(1 + private lazy val labelCountByClass: Map[Double, Long] = predictionAndLabels.values.countByValue() private lazy val labelCount: Long = labelCountByClass.values.sum private lazy val tpByClass: Map[Double, Int] = predictionAndLabels http://git-wip-us.apache.org/repos/asf/spark/blob/fe46374f/python/pyspark/mllib/evaluation.py -- diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 3e11df0..3691459 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -141,6 +141,135 @@ class RegressionMetrics(JavaModelWrapper): return self.call(r2) +class MulticlassMetrics(JavaModelWrapper): + +Evaluator for multiclass classification. + + predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), +... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) + metrics = MulticlassMetrics(predictionAndLabels) + metrics.falsePositiveRate(0.0) +0.2... + metrics.precision(1.0) +0.75... + metrics.recall(2.0) +1.0... + metrics.fMeasure(0.0, 2.0) +0.52... + metrics.precision() +0.66... + metrics.recall() +0.66... + metrics.weightedFalsePositiveRate +0.19... + metrics.weightedPrecision +0.68... + metrics.weightedRecall +0.66... + metrics.weightedFMeasure() +0.66... + metrics.weightedFMeasure(2.0) +0.65... + + +def __init__(self, predictionAndLabels): + +:param predictionAndLabels an RDD of (prediction, label) pairs. 
+ +sc = predictionAndLabels.ctx +sql_ctx = SQLContext(sc) +df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([ +StructField("prediction", DoubleType(), nullable=False), +StructField("label", DoubleType(), nullable=False)])) +java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics +java_model = java_class(df._jdf) +super(MulticlassMetrics, self).__init__(java_model) + +def truePositiveRate(self, label): + +Returns true positive rate for a given label (category). + +return
spark git commit: [SPARK-6091] [MLLIB] Add MulticlassMetrics in PySpark/MLlib
Repository: spark Updated Branches: refs/heads/master b13162b36 - bf7e81a51 [SPARK-6091] [MLLIB] Add MulticlassMetrics in PySpark/MLlib https://issues.apache.org/jira/browse/SPARK-6091 Author: Yanbo Liang yblia...@gmail.com Closes #6011 from yanboliang/spark-6091 and squashes the following commits: bb3e4ba [Yanbo Liang] trigger jenkins 53c045d [Yanbo Liang] keep compatibility for python 2.6 972d5ac [Yanbo Liang] Add MulticlassMetrics in PySpark/MLlib Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bf7e81a5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bf7e81a5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bf7e81a5 Branch: refs/heads/master Commit: bf7e81a51cd81706570615cd67362c86602dec88 Parents: b13162b Author: Yanbo Liang yblia...@gmail.com Authored: Sun May 10 00:57:14 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Sun May 10 00:57:14 2015 -0700 -- .../mllib/evaluation/MulticlassMetrics.scala| 8 ++ python/pyspark/mllib/evaluation.py | 129 +++ 2 files changed, 137 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bf7e81a5/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 666362a..4628dc5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -23,6 +23,7 @@ import org.apache.spark.SparkContext._ import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{Matrices, Matrix} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.DataFrame /** * ::Experimental:: @@ -33,6 +34,13 @@ import org.apache.spark.rdd.RDD @Experimental class MulticlassMetrics(predictionAndLabels: 
RDD[(Double, Double)]) { + /** + * An auxiliary constructor taking a DataFrame. + * @param predictionAndLabels a DataFrame with two double columns: prediction and label + */ + private[mllib] def this(predictionAndLabels: DataFrame) = +this(predictionAndLabels.map(r = (r.getDouble(0), r.getDouble(1 + private lazy val labelCountByClass: Map[Double, Long] = predictionAndLabels.values.countByValue() private lazy val labelCount: Long = labelCountByClass.values.sum private lazy val tpByClass: Map[Double, Int] = predictionAndLabels http://git-wip-us.apache.org/repos/asf/spark/blob/bf7e81a5/python/pyspark/mllib/evaluation.py -- diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index 3e11df0..3691459 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -141,6 +141,135 @@ class RegressionMetrics(JavaModelWrapper): return self.call(r2) +class MulticlassMetrics(JavaModelWrapper): + +Evaluator for multiclass classification. + + predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), +... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) + metrics = MulticlassMetrics(predictionAndLabels) + metrics.falsePositiveRate(0.0) +0.2... + metrics.precision(1.0) +0.75... + metrics.recall(2.0) +1.0... + metrics.fMeasure(0.0, 2.0) +0.52... + metrics.precision() +0.66... + metrics.recall() +0.66... + metrics.weightedFalsePositiveRate +0.19... + metrics.weightedPrecision +0.68... + metrics.weightedRecall +0.66... + metrics.weightedFMeasure() +0.66... + metrics.weightedFMeasure(2.0) +0.65... + + +def __init__(self, predictionAndLabels): + +:param predictionAndLabels an RDD of (prediction, label) pairs. 
+ +sc = predictionAndLabels.ctx +sql_ctx = SQLContext(sc) +df = sql_ctx.createDataFrame(predictionAndLabels, schema=StructType([ +StructField("prediction", DoubleType(), nullable=False), +StructField("label", DoubleType(), nullable=False)])) +java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics +java_model = java_class(df._jdf) +super(MulticlassMetrics, self).__init__(java_model) + +def truePositiveRate(self, label): + +Returns true positive rate for a given label (category). + +return self.call("truePositiveRate", label) + +def falsePositiveRate(self, label): + +Returns false positive rate for a given label