Repository: spark Updated Branches: refs/heads/master 1fcd5dcdd -> 935fe65ff
SPARK-1215 [MLLIB]: Clustering: Index out of bounds error (2) Added check to LocalKMeans.scala: kMeansPlusPlus initialization to handle case with fewer distinct data points than clusters k. Added two related unit tests to KMeansSuite. (Re-submitting PR after tangling commits in PR 1407 https://github.com/apache/spark/pull/1407 ) Author: Joseph K. Bradley <[email protected]> Closes #1468 from jkbradley/kmeans-fix and squashes the following commits: 4e9bd1e [Joseph K. Bradley] Updated PR per comments from mengxr 6c7a2ec [Joseph K. Bradley] Added check to LocalKMeans.scala: kMeansPlusPlus initialization to handle case with fewer distinct data points than clusters k. Added two related unit tests to KMeansSuite. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/935fe65f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/935fe65f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/935fe65f Branch: refs/heads/master Commit: 935fe65ff6559a0e3b481e7508fa14337b23020b Parents: 1fcd5dc Author: Joseph K. Bradley <[email protected]> Authored: Thu Jul 17 15:05:02 2014 -0700 Committer: Xiangrui Meng <[email protected]> Committed: Thu Jul 17 15:05:02 2014 -0700 ---------------------------------------------------------------------- .../spark/mllib/clustering/LocalKMeans.scala | 8 +++++- .../spark/mllib/clustering/KMeansSuite.scala | 26 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/935fe65f/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala index 2e3a4ce..f0722d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LocalKMeans.scala @@ -59,7 +59,13 @@ private[mllib] object LocalKMeans extends Logging { cumulativeScore += weights(j) * KMeans.pointCost(curCenters, points(j)) j += 1 } - centers(i) = points(j-1).toDense + if (j == 0) { + logWarning("kMeansPlusPlus initialization ran out of distinct points for centers." + + s" Using duplicate point for center k = $i.") + centers(i) = points(0).toDense + } else { + centers(i) = points(j - 1).toDense + } } // Run up to maxIterations iterations of Lloyd's algorithm http://git-wip-us.apache.org/repos/asf/spark/blob/935fe65f/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala index 560a4ad..76a3bdf 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala @@ -61,6 +61,32 @@ class KMeansSuite extends FunSuite with LocalSparkContext { assert(model.clusterCenters.head === center) } + test("no distinct points") { + val data = sc.parallelize( + Array( + Vectors.dense(1.0, 2.0, 3.0), + Vectors.dense(1.0, 2.0, 3.0), + Vectors.dense(1.0, 2.0, 3.0)), + 2) + val center = Vectors.dense(1.0, 2.0, 3.0) + + // Make sure code runs. + var model = KMeans.train(data, k=2, maxIterations=1) + assert(model.clusterCenters.size === 2) + } + + test("more clusters than points") { + val data = sc.parallelize( + Array( + Vectors.dense(1.0, 2.0, 3.0), + Vectors.dense(1.0, 3.0, 4.0)), + 2) + + // Make sure code runs. + var model = KMeans.train(data, k=3, maxIterations=1) + assert(model.clusterCenters.size === 3) + } + test("single cluster with big dataset") { val smallData = Array( Vectors.dense(1.0, 2.0, 6.0),
