spark git commit: [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator
Repository: spark Updated Branches: refs/heads/master 486acda8c -> 0b5917000 [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator ## What changes were proposed in this pull request? Using `computeCost` for evaluating a model is a very poor approach. We should advise users to use a better approach that is available, i.e. using the `ClusteringEvaluator` to evaluate their models. The PR updates the examples for `BisectingKMeans` in order to do that. ## How was this patch tested? running examples Closes #22786 from mgaido91/SPARK-25764. Authored-by: Marco Gaido Signed-off-by: DB Tsai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b591700 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b591700 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b591700 Branch: refs/heads/master Commit: 0b59170001be1cc1198cfc1c0486ca34633e64d5 Parents: 486acda Author: Marco Gaido Authored: Mon Nov 5 22:42:04 2018 + Committer: DB Tsai Committed: Mon Nov 5 22:42:04 2018 + -- .../spark/examples/ml/JavaBisectingKMeansExample.java | 12 +--- .../src/main/python/ml/bisecting_k_means_example.py | 12 +--- .../spark/examples/ml/BisectingKMeansExample.scala | 12 +--- 3 files changed, 27 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 8c82aaa..f517dc3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml; // $example on$ import org.apache.spark.ml.clustering.BisectingKMeans; import 
org.apache.spark.ml.clustering.BisectingKMeansModel; +import org.apache.spark.ml.evaluation.ClusteringEvaluator; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample { BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1); BisectingKMeansModel model = bkm.fit(dataset); -// Evaluate clustering. -double cost = model.computeCost(dataset); -System.out.println("Within Set Sum of Squared Errors = " + cost); +// Make predictions +Dataset<Row> predictions = model.transform(dataset); + +// Evaluate clustering by computing Silhouette score +ClusteringEvaluator evaluator = new ClusteringEvaluator(); + +double silhouette = evaluator.evaluate(predictions); +System.out.println("Silhouette with squared euclidean distance = " + silhouette); // Shows the result. System.out.println("Cluster Centers: "); http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/python/ml/bisecting_k_means_example.py -- diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 7842d20..82adb33 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -24,6 +24,7 @@ from __future__ import print_function # $example on$ from pyspark.ml.clustering import BisectingKMeans +from pyspark.ml.evaluation import ClusteringEvaluator # $example off$ from pyspark.sql import SparkSession @@ -41,9 +42,14 @@ if __name__ == "__main__": bkm = BisectingKMeans().setK(2).setSeed(1) model = bkm.fit(dataset) -# Evaluate clustering. 
-cost = model.computeCost(dataset) -print("Within Set Sum of Squared Errors = " + str(cost)) +# Make predictions +predictions = model.transform(dataset) + +# Evaluate clustering by computing Silhouette score +evaluator = ClusteringEvaluator() + +silhouette = evaluator.evaluate(predictions) +print("Silhouette with squared euclidean distance = " + str(silhouette)) # Shows the result. print("Cluster Centers: ") http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala index
spark git commit: [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator
Repository: spark Updated Branches: refs/heads/branch-2.4 fd5b24726 -> 36307b1e4 [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator ## What changes were proposed in this pull request? The PR updates the examples for `BisectingKMeans` so that they don't use the deprecated method `computeCost` (see SPARK-25758). ## How was this patch tested? running examples Closes #22763 from mgaido91/SPARK-25764. Authored-by: Marco Gaido Signed-off-by: Wenchen Fan (cherry picked from commit d0ecff28545ac81f5ba7ac06957ced65b6e3ebcd) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/36307b1e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/36307b1e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/36307b1e Branch: refs/heads/branch-2.4 Commit: 36307b1e4b42ce22b07e7a3fc2679c4b5e7c34c8 Parents: fd5b247 Author: Marco Gaido Authored: Fri Oct 19 09:33:46 2018 +0800 Committer: Wenchen Fan Committed: Fri Oct 19 09:34:25 2018 +0800 -- .../spark/examples/ml/JavaBisectingKMeansExample.java | 12 +--- .../src/main/python/ml/bisecting_k_means_example.py | 12 +--- .../spark/examples/ml/BisectingKMeansExample.scala | 12 +--- 3 files changed, 27 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/36307b1e/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 8c82aaa..f517dc3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml; // $example on$ import org.apache.spark.ml.clustering.BisectingKMeans; import 
org.apache.spark.ml.clustering.BisectingKMeansModel; +import org.apache.spark.ml.evaluation.ClusteringEvaluator; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample { BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1); BisectingKMeansModel model = bkm.fit(dataset); -// Evaluate clustering. -double cost = model.computeCost(dataset); -System.out.println("Within Set Sum of Squared Errors = " + cost); +// Make predictions +Dataset<Row> predictions = model.transform(dataset); + +// Evaluate clustering by computing Silhouette score +ClusteringEvaluator evaluator = new ClusteringEvaluator(); + +double silhouette = evaluator.evaluate(predictions); +System.out.println("Silhouette with squared euclidean distance = " + silhouette); // Shows the result. System.out.println("Cluster Centers: "); http://git-wip-us.apache.org/repos/asf/spark/blob/36307b1e/examples/src/main/python/ml/bisecting_k_means_example.py -- diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 7842d20..82adb33 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -24,6 +24,7 @@ from __future__ import print_function # $example on$ from pyspark.ml.clustering import BisectingKMeans +from pyspark.ml.evaluation import ClusteringEvaluator # $example off$ from pyspark.sql import SparkSession @@ -41,9 +42,14 @@ if __name__ == "__main__": bkm = BisectingKMeans().setK(2).setSeed(1) model = bkm.fit(dataset) -# Evaluate clustering. 
-cost = model.computeCost(dataset) -print("Within Set Sum of Squared Errors = " + str(cost)) +# Make predictions +predictions = model.transform(dataset) + +# Evaluate clustering by computing Silhouette score +evaluator = ClusteringEvaluator() + +silhouette = evaluator.evaluate(predictions) +print("Silhouette with squared euclidean distance = " + str(silhouette)) # Shows the result. print("Cluster Centers: ") http://git-wip-us.apache.org/repos/asf/spark/blob/36307b1e/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala index 5f8f2c9..14e13df 100644 ---
spark git commit: [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator
Repository: spark Updated Branches: refs/heads/master f704ebe90 -> d0ecff285 [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator ## What changes were proposed in this pull request? The PR updates the examples for `BisectingKMeans` so that they don't use the deprecated method `computeCost` (see SPARK-25758). ## How was this patch tested? running examples Closes #22763 from mgaido91/SPARK-25764. Authored-by: Marco Gaido Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0ecff28 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0ecff28 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0ecff28 Branch: refs/heads/master Commit: d0ecff28545ac81f5ba7ac06957ced65b6e3ebcd Parents: f704ebe Author: Marco Gaido Authored: Fri Oct 19 09:33:46 2018 +0800 Committer: Wenchen Fan Committed: Fri Oct 19 09:33:46 2018 +0800 -- .../spark/examples/ml/JavaBisectingKMeansExample.java | 12 +--- .../src/main/python/ml/bisecting_k_means_example.py | 12 +--- .../spark/examples/ml/BisectingKMeansExample.scala | 12 +--- 3 files changed, 27 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecff28/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 8c82aaa..f517dc3 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml; // $example on$ import org.apache.spark.ml.clustering.BisectingKMeans; import org.apache.spark.ml.clustering.BisectingKMeansModel; +import org.apache.spark.ml.evaluation.ClusteringEvaluator; import 
org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample { BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1); BisectingKMeansModel model = bkm.fit(dataset); -// Evaluate clustering. -double cost = model.computeCost(dataset); -System.out.println("Within Set Sum of Squared Errors = " + cost); +// Make predictions +Dataset<Row> predictions = model.transform(dataset); + +// Evaluate clustering by computing Silhouette score +ClusteringEvaluator evaluator = new ClusteringEvaluator(); + +double silhouette = evaluator.evaluate(predictions); +System.out.println("Silhouette with squared euclidean distance = " + silhouette); // Shows the result. System.out.println("Cluster Centers: "); http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecff28/examples/src/main/python/ml/bisecting_k_means_example.py -- diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 7842d20..82adb33 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -24,6 +24,7 @@ from __future__ import print_function # $example on$ from pyspark.ml.clustering import BisectingKMeans +from pyspark.ml.evaluation import ClusteringEvaluator # $example off$ from pyspark.sql import SparkSession @@ -41,9 +42,14 @@ if __name__ == "__main__": bkm = BisectingKMeans().setK(2).setSeed(1) model = bkm.fit(dataset) -# Evaluate clustering. -cost = model.computeCost(dataset) -print("Within Set Sum of Squared Errors = " + str(cost)) +# Make predictions +predictions = model.transform(dataset) + +# Evaluate clustering by computing Silhouette score +evaluator = ClusteringEvaluator() + +silhouette = evaluator.evaluate(predictions) +print("Silhouette with squared euclidean distance = " + str(silhouette)) # Shows the result. 
print("Cluster Centers: ") http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecff28/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala index 5f8f2c9..14e13df 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala +++