Repository: spark
Updated Branches:
  refs/heads/branch-1.4 cd55e9a51 -> 8b631038e


[SPARK-6612] [MLLIB] [PYSPARK] Python KMeans parity

The following items are added to Python kmeans:

KMeans - setEpsilon, setInitializationSteps
KMeansModel - computeCost, k

Author: Hrishikesh Subramonian <[email protected]>

Closes #5647 from FlytxtRnD/newPyKmeansAPI and squashes the following commits:

b9e451b [Hrishikesh Subramonian] set seed to fixed value in doc test
5fd3ced [Hrishikesh Subramonian] doc test corrections
20b3c68 [Hrishikesh Subramonian] python 3 fixes
4d4e695 [Hrishikesh Subramonian] added arguments in python tests
21eb84c [Hrishikesh Subramonian] Python Kmeans - setEpsilon, 
setInitializationSteps, k and computeCost added.

(cherry picked from commit 5995ada96b661546a80657f2c5ed20604593e4aa)
Signed-off-by: Xiangrui Meng <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8b631038
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8b631038
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8b631038

Branch: refs/heads/branch-1.4
Commit: 8b631038e3f2df6628eeff1dd924c1c65509f6f2
Parents: cd55e9a
Author: Hrishikesh Subramonian <[email protected]>
Authored: Tue May 5 07:57:39 2015 -0700
Committer: Xiangrui Meng <[email protected]>
Committed: Tue May 5 07:57:47 2015 -0700

----------------------------------------------------------------------
 examples/src/main/python/mllib/kmeans.py        |  1 +
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 15 +++++++++-
 python/pyspark/mllib/clustering.py              | 29 +++++++++++++++++---
 python/pyspark/mllib/tests.py                   |  9 ++++--
 4 files changed, 46 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8b631038/examples/src/main/python/mllib/kmeans.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/mllib/kmeans.py 
b/examples/src/main/python/mllib/kmeans.py
index f901a87..002fc75 100755
--- a/examples/src/main/python/mllib/kmeans.py
+++ b/examples/src/main/python/mllib/kmeans.py
@@ -43,4 +43,5 @@ if __name__ == "__main__":
     k = int(sys.argv[2])
     model = KMeans.train(data, k)
     print("Final centers: " + str(model.clusterCenters))
+    print("Total Cost: " + str(model.computeCost(data)))
     sc.stop()

http://git-wip-us.apache.org/repos/asf/spark/blob/8b631038/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 8e9a208..b086cec 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -291,12 +291,16 @@ private[python] class PythonMLLibAPI extends Serializable 
{
       maxIterations: Int,
       runs: Int,
       initializationMode: String,
-      seed: java.lang.Long): KMeansModel = {
+      seed: java.lang.Long,
+      initializationSteps: Int,
+      epsilon: Double): KMeansModel = {
     val kMeansAlg = new KMeans()
       .setK(k)
       .setMaxIterations(maxIterations)
       .setRuns(runs)
       .setInitializationMode(initializationMode)
+      .setInitializationSteps(initializationSteps)
+      .setEpsilon(epsilon)
 
     if (seed != null) kMeansAlg.setSeed(seed)
 
@@ -308,6 +312,15 @@ private[python] class PythonMLLibAPI extends Serializable {
   }
 
   /**
+   * Java stub for Python mllib KMeansModel.computeCost()
+   */
+  def computeCostKmeansModel(
+      data: JavaRDD[Vector],
+      centers: java.util.ArrayList[Vector]): Double = {
+    new KMeansModel(centers).computeCost(data)
+  }
+
+  /**
    * Java stub for Python mllib GaussianMixture.run()
    * Returns a list containing weights, mean and covariance of each mixture 
component.
    */

http://git-wip-us.apache.org/repos/asf/spark/blob/8b631038/python/pyspark/mllib/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/clustering.py 
b/python/pyspark/mllib/clustering.py
index abbb7cf..04e6715 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -40,11 +40,16 @@ class KMeansModel(Saveable, Loader):
 
     >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
     >>> model = KMeans.train(
-    ...     sc.parallelize(data), 2, maxIterations=10, runs=30, 
initializationMode="random")
+    ...     sc.parallelize(data), 2, maxIterations=10, runs=30, 
initializationMode="random",
+    ...                    seed=50, initializationSteps=5, epsilon=1e-4)
     >>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
     True
     >>> model.predict(array([8.0, 9.0])) == model.predict(array([9.0, 8.0]))
     True
+    >>> model.k
+    2
+    >>> model.computeCost(sc.parallelize(data))
+    2.0000000000000004
     >>> model = KMeans.train(sc.parallelize(data), 2)
     >>> sparse_data = [
     ...     SparseVector(3, {1: 1.0}),
@@ -52,7 +57,8 @@ class KMeansModel(Saveable, Loader):
     ...     SparseVector(3, {2: 1.0}),
     ...     SparseVector(3, {2: 1.1})
     ... ]
-    >>> model = KMeans.train(sc.parallelize(sparse_data), 2, 
initializationMode="k-means||")
+    >>> model = KMeans.train(sc.parallelize(sparse_data), 2, 
initializationMode="k-means||",
+    ...                                     seed=50, initializationSteps=5, 
epsilon=1e-4)
     >>> model.predict(array([0., 1., 0.])) == model.predict(array([0, 1.1, 
0.]))
     True
     >>> model.predict(array([0., 0., 1.])) == model.predict(array([0, 0, 1.1]))
@@ -83,6 +89,11 @@ class KMeansModel(Saveable, Loader):
         """Get the cluster centers, represented as a list of NumPy arrays."""
         return self.centers
 
+    @property
+    def k(self):
+        """Total number of clusters."""
+        return len(self.centers)
+
     def predict(self, x):
         """Find the cluster to which x belongs in this model."""
         best = 0
@@ -95,6 +106,15 @@ class KMeansModel(Saveable, Loader):
                 best_distance = distance
         return best
 
+    def computeCost(self, rdd):
+        """
+        Return the K-means cost (sum of squared distances of points to
+        their nearest center) for this model on the given data.
+        """
+        cost = callMLlibFunc("computeCostKmeansModel", 
rdd.map(_convert_to_vector),
+                             [_convert_to_vector(c) for c in self.centers])
+        return cost
+
     def save(self, sc, path):
         java_centers = _py2java(sc, [_convert_to_vector(c) for c in 
self.centers])
         java_model = 
sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
@@ -109,10 +129,11 @@ class KMeansModel(Saveable, Loader):
 class KMeans(object):
 
     @classmethod
-    def train(cls, rdd, k, maxIterations=100, runs=1, 
initializationMode="k-means||", seed=None):
+    def train(cls, rdd, k, maxIterations=100, runs=1, 
initializationMode="k-means||",
+              seed=None, initializationSteps=5, epsilon=1e-4):
         """Train a k-means clustering model."""
         model = callMLlibFunc("trainKMeansModel", rdd.map(_convert_to_vector), 
k, maxIterations,
-                              runs, initializationMode, seed)
+                              runs, initializationMode, seed, 
initializationSteps, epsilon)
         centers = callJavaFunc(rdd.context, model.clusterCenters)
         return KMeansModel([c.toArray() for c in centers])
 

http://git-wip-us.apache.org/repos/asf/spark/blob/8b631038/python/pyspark/mllib/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 1d9c6eb..d05cfe2 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -236,7 +236,8 @@ class ListTests(MLlibTestCase):
             [1.1, 0],
             [1.2, 0],
         ]
-        clusters = KMeans.train(self.sc.parallelize(data), 2, 
initializationMode="k-means||")
+        clusters = KMeans.train(self.sc.parallelize(data), 2, 
initializationMode="k-means||",
+                                initializationSteps=7, epsilon=1e-4)
         self.assertEquals(clusters.predict(data[0]), clusters.predict(data[1]))
         self.assertEquals(clusters.predict(data[2]), clusters.predict(data[3]))
 
@@ -246,9 +247,11 @@ class ListTests(MLlibTestCase):
         Y = range(0, 100, 10)
         data = [[x, y] for x, y in zip(X, Y)]
         clusters1 = KMeans.train(self.sc.parallelize(data),
-                                 3, initializationMode="k-means||", seed=42)
+                                 3, initializationMode="k-means||",
+                                 seed=42, initializationSteps=7, epsilon=1e-4)
         clusters2 = KMeans.train(self.sc.parallelize(data),
-                                 3, initializationMode="k-means||", seed=42)
+                                 3, initializationMode="k-means||",
+                                 seed=42, initializationSteps=7, epsilon=1e-4)
         centers1 = clusters1.centers
         centers2 = clusters2.centers
         for c1, c2 in zip(centers1, centers2):


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to