spark git commit: [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator

2018-11-05 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/master 486acda8c -> 0b5917000


[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use 
ClusteringEvaluator

## What changes were proposed in this pull request?

Using `computeCost` to evaluate a model is a very poor approach. We should 
advise users to use the better approach that is available, i.e. using the 
`ClusteringEvaluator` to evaluate their models. This PR updates the examples 
for `BisectingKMeans` to do that.

## How was this patch tested?

running examples

Closes #22786 from mgaido91/SPARK-25764.

Authored-by: Marco Gaido 
Signed-off-by: DB Tsai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b591700
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b591700
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b591700

Branch: refs/heads/master
Commit: 0b59170001be1cc1198cfc1c0486ca34633e64d5
Parents: 486acda
Author: Marco Gaido 
Authored: Mon Nov 5 22:42:04 2018 +
Committer: DB Tsai 
Committed: Mon Nov 5 22:42:04 2018 +

--
 .../spark/examples/ml/JavaBisectingKMeansExample.java   | 12 +++++++++---
 .../src/main/python/ml/bisecting_k_means_example.py | 12 +++++++++---
 .../spark/examples/ml/BisectingKMeansExample.scala  | 12 +++++++++---
 3 files changed, 27 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 8c82aaa..f517dc3 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
 // $example on$
 import org.apache.spark.ml.clustering.BisectingKMeans;
 import org.apache.spark.ml.clustering.BisectingKMeansModel;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample {
 BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);
 BisectingKMeansModel model = bkm.fit(dataset);
 
-// Evaluate clustering.
-double cost = model.computeCost(dataset);
-System.out.println("Within Set Sum of Squared Errors = " + cost);
+// Make predictions
+Dataset<Row> predictions = model.transform(dataset);
+
+// Evaluate clustering by computing Silhouette score
+ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+double silhouette = evaluator.evaluate(predictions);
+System.out.println("Silhouette with squared euclidean distance = " + silhouette);
 
 // Shows the result.
 System.out.println("Cluster Centers: ");

http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/python/ml/bisecting_k_means_example.py
--
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py 
b/examples/src/main/python/ml/bisecting_k_means_example.py
index 7842d20..82adb33 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 # $example on$
 from pyspark.ml.clustering import BisectingKMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 from pyspark.sql import SparkSession
 
@@ -41,9 +42,14 @@ if __name__ == "__main__":
 bkm = BisectingKMeans().setK(2).setSeed(1)
 model = bkm.fit(dataset)
 
-# Evaluate clustering.
-cost = model.computeCost(dataset)
-print("Within Set Sum of Squared Errors = " + str(cost))
+# Make predictions
+predictions = model.transform(dataset)
+
+# Evaluate clustering by computing Silhouette score
+evaluator = ClusteringEvaluator()
+
+silhouette = evaluator.evaluate(predictions)
+print("Silhouette with squared euclidean distance = " + str(silhouette))
 
 # Shows the result.
 print("Cluster Centers: ")

http://git-wip-us.apache.org/repos/asf/spark/blob/0b591700/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
index 

spark git commit: [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator

2018-10-18 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/branch-2.4 fd5b24726 -> 36307b1e4


[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use 
ClusteringEvaluator

## What changes were proposed in this pull request?

The PR updates the examples for `BisectingKMeans` so that they don't use the 
deprecated method `computeCost` (see SPARK-25758).

## How was this patch tested?

running examples

Closes #22763 from mgaido91/SPARK-25764.

Authored-by: Marco Gaido 
Signed-off-by: Wenchen Fan 
(cherry picked from commit d0ecff28545ac81f5ba7ac06957ced65b6e3ebcd)
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/36307b1e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/36307b1e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/36307b1e

Branch: refs/heads/branch-2.4
Commit: 36307b1e4b42ce22b07e7a3fc2679c4b5e7c34c8
Parents: fd5b247
Author: Marco Gaido 
Authored: Fri Oct 19 09:33:46 2018 +0800
Committer: Wenchen Fan 
Committed: Fri Oct 19 09:34:25 2018 +0800

--
 .../spark/examples/ml/JavaBisectingKMeansExample.java   | 12 +++++++++---
 .../src/main/python/ml/bisecting_k_means_example.py | 12 +++++++++---
 .../spark/examples/ml/BisectingKMeansExample.scala  | 12 +++++++++---
 3 files changed, 27 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/36307b1e/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 8c82aaa..f517dc3 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
 // $example on$
 import org.apache.spark.ml.clustering.BisectingKMeans;
 import org.apache.spark.ml.clustering.BisectingKMeansModel;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample {
 BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);
 BisectingKMeansModel model = bkm.fit(dataset);
 
-// Evaluate clustering.
-double cost = model.computeCost(dataset);
-System.out.println("Within Set Sum of Squared Errors = " + cost);
+// Make predictions
+Dataset<Row> predictions = model.transform(dataset);
+
+// Evaluate clustering by computing Silhouette score
+ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+double silhouette = evaluator.evaluate(predictions);
+System.out.println("Silhouette with squared euclidean distance = " + silhouette);
 
 // Shows the result.
 System.out.println("Cluster Centers: ");

http://git-wip-us.apache.org/repos/asf/spark/blob/36307b1e/examples/src/main/python/ml/bisecting_k_means_example.py
--
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py 
b/examples/src/main/python/ml/bisecting_k_means_example.py
index 7842d20..82adb33 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 # $example on$
 from pyspark.ml.clustering import BisectingKMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 from pyspark.sql import SparkSession
 
@@ -41,9 +42,14 @@ if __name__ == "__main__":
 bkm = BisectingKMeans().setK(2).setSeed(1)
 model = bkm.fit(dataset)
 
-# Evaluate clustering.
-cost = model.computeCost(dataset)
-print("Within Set Sum of Squared Errors = " + str(cost))
+# Make predictions
+predictions = model.transform(dataset)
+
+# Evaluate clustering by computing Silhouette score
+evaluator = ClusteringEvaluator()
+
+silhouette = evaluator.evaluate(predictions)
+print("Silhouette with squared euclidean distance = " + str(silhouette))
 
 # Shows the result.
 print("Cluster Centers: ")

http://git-wip-us.apache.org/repos/asf/spark/blob/36307b1e/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
index 5f8f2c9..14e13df 100644
--- 

spark git commit: [SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use ClusteringEvaluator

2018-10-18 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/master f704ebe90 -> d0ecff285


[SPARK-25764][ML][EXAMPLES] Update BisectingKMeans example to use 
ClusteringEvaluator

## What changes were proposed in this pull request?

The PR updates the examples for `BisectingKMeans` so that they don't use the 
deprecated method `computeCost` (see SPARK-25758).

## How was this patch tested?

running examples

Closes #22763 from mgaido91/SPARK-25764.

Authored-by: Marco Gaido 
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0ecff28
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0ecff28
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0ecff28

Branch: refs/heads/master
Commit: d0ecff28545ac81f5ba7ac06957ced65b6e3ebcd
Parents: f704ebe
Author: Marco Gaido 
Authored: Fri Oct 19 09:33:46 2018 +0800
Committer: Wenchen Fan 
Committed: Fri Oct 19 09:33:46 2018 +0800

--
 .../spark/examples/ml/JavaBisectingKMeansExample.java   | 12 +++++++++---
 .../src/main/python/ml/bisecting_k_means_example.py | 12 +++++++++---
 .../spark/examples/ml/BisectingKMeansExample.scala  | 12 +++++++++---
 3 files changed, 27 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecff28/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 8c82aaa..f517dc3 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -20,6 +20,7 @@ package org.apache.spark.examples.ml;
 // $example on$
 import org.apache.spark.ml.clustering.BisectingKMeans;
 import org.apache.spark.ml.clustering.BisectingKMeansModel;
+import org.apache.spark.ml.evaluation.ClusteringEvaluator;
 import org.apache.spark.ml.linalg.Vector;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
@@ -50,9 +51,14 @@ public class JavaBisectingKMeansExample {
 BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);
 BisectingKMeansModel model = bkm.fit(dataset);
 
-// Evaluate clustering.
-double cost = model.computeCost(dataset);
-System.out.println("Within Set Sum of Squared Errors = " + cost);
+// Make predictions
+Dataset<Row> predictions = model.transform(dataset);
+
+// Evaluate clustering by computing Silhouette score
+ClusteringEvaluator evaluator = new ClusteringEvaluator();
+
+double silhouette = evaluator.evaluate(predictions);
+System.out.println("Silhouette with squared euclidean distance = " + silhouette);
 
 // Shows the result.
 System.out.println("Cluster Centers: ");

http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecff28/examples/src/main/python/ml/bisecting_k_means_example.py
--
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py 
b/examples/src/main/python/ml/bisecting_k_means_example.py
index 7842d20..82adb33 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 # $example on$
 from pyspark.ml.clustering import BisectingKMeans
+from pyspark.ml.evaluation import ClusteringEvaluator
 # $example off$
 from pyspark.sql import SparkSession
 
@@ -41,9 +42,14 @@ if __name__ == "__main__":
 bkm = BisectingKMeans().setK(2).setSeed(1)
 model = bkm.fit(dataset)
 
-# Evaluate clustering.
-cost = model.computeCost(dataset)
-print("Within Set Sum of Squared Errors = " + str(cost))
+# Make predictions
+predictions = model.transform(dataset)
+
+# Evaluate clustering by computing Silhouette score
+evaluator = ClusteringEvaluator()
+
+silhouette = evaluator.evaluate(predictions)
+print("Silhouette with squared euclidean distance = " + str(silhouette))
 
 # Shows the result.
 print("Cluster Centers: ")

http://git-wip-us.apache.org/repos/asf/spark/blob/d0ecff28/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
index 5f8f2c9..14e13df 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
+++