Repository: spark Updated Branches: refs/heads/master 486acda8c -> 0b5917000
[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator ## What changes were proposed in this pull request? Using `computeCost` for evaluating a model is a very poor approach. We should advise users to adopt the better approach that is available, i.e. using the `ClusteringEvaluator` to evaluate their models. The PR updates the examples for `BisectingKMeans` in order to do that. ## How was this patch tested? running examples Closes #22786 from mgaido91/SPARK-25764. Authored-by: Marco Gaido <[email protected]> Signed-off-by: DB Tsai <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b591700 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b591700 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b591700 Branch: refs/heads/master Commit: 0b59170001be1cc1198cfc1c0486ca34633e64d5 Parents: 486acda Author: Marco Gaido <[email protected]> Authored: Mon Nov 5 22:42:04 2018 +0000 Committer: DB Tsai <[email protected]> Committed: Mon Nov 5 22:42:04 2018 +0000 ---------------------------------------------------------------------- .../spark/examples/ml/JavaBisectingKMeansExample.java | 12 +++++++++--- .../src/main/python/ml/bisecting_k_means_example.py | 12 +++++++++--- .../spark/examples/ml/BisectingKMeansExample.scala | 12 +++++++++--- 3 files changed, 27 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 8c82aaa..f517dc3 100644 --- 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml; // $example on$ import org.apache.spark.ml.clustering.BisectingKMeans; import org.apache.spark.ml.clustering.BisectingKMeansModel; +import org.apache.spark.ml.evaluation.ClusteringEvaluator; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample { BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1); BisectingKMeansModel model = bkm.fit(dataset); - // Evaluate clustering. - double cost = model.computeCost(dataset); - System.out.println("Within Set Sum of Squared Errors = " + cost); + // Make predictions + Dataset<Row> predictions = model.transform(dataset); + + // Evaluate clustering by computing Silhouette score + ClusteringEvaluator evaluator = new ClusteringEvaluator(); + + double silhouette = evaluator.evaluate(predictions); + System.out.println("Silhouette with squared euclidean distance = " + silhouette); // Shows the result. 
System.out.println("Cluster Centers: "); http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/python/ml/bisecting_k_means_example.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 7842d20..82adb33 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -24,6 +24,7 @@ from __future__ import print_function # $example on$ from pyspark.ml.clustering import BisectingKMeans +from pyspark.ml.evaluation import ClusteringEvaluator # $example off$ from pyspark.sql import SparkSession @@ -41,9 +42,14 @@ if __name__ == "__main__": bkm = BisectingKMeans().setK(2).setSeed(1) model = bkm.fit(dataset) - # Evaluate clustering. - cost = model.computeCost(dataset) - print("Within Set Sum of Squared Errors = " + str(cost)) + # Make predictions + predictions = model.transform(dataset) + + # Evaluate clustering by computing Silhouette score + evaluator = ClusteringEvaluator() + + silhouette = evaluator.evaluate(predictions) + print("Silhouette with squared euclidean distance = " + str(silhouette)) # Shows the result. 
print("Cluster Centers: ") http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala index 5f8f2c9..14e13df 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala @@ -21,6 +21,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.BisectingKMeans +import org.apache.spark.ml.evaluation.ClusteringEvaluator // $example off$ import org.apache.spark.sql.SparkSession @@ -48,9 +49,14 @@ object BisectingKMeansExample { val bkm = new BisectingKMeans().setK(2).setSeed(1) val model = bkm.fit(dataset) - // Evaluate clustering. - val cost = model.computeCost(dataset) - println(s"Within Set Sum of Squared Errors = $cost") + // Make predictions + val predictions = model.transform(dataset) + + // Evaluate clustering by computing Silhouette score + val evaluator = new ClusteringEvaluator() + + val silhouette = evaluator.evaluate(predictions) + println(s"Silhouette with squared euclidean distance = $silhouette") // Shows the result. println("Cluster Centers: ") --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
