Repository: spark Updated Branches: refs/heads/master 906383580 -> 9ac05225e
[SPARK-19319][SPARKR] SparkR Kmeans summary returns error when the cluster size doesn't equal to k ## What changes were proposed in this pull request When KMeans runs with initMode = "random" and some random seed, it is possible that the actual cluster size doesn't equal the configured `k`. In this case, summary(model) returns an error because the number of columns of the coefficient matrix doesn't equal k. Example: > col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0) > col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0) > col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0) > cols <- as.data.frame(cbind(col1, col2, col3)) > df <- createDataFrame(cols) > > model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10, initMode = > "random", seed = 22222, tol = 1E-5) > > summary(model2) Error in `colnames<-`(`*tmp*`, value = c("col1", "col2", "col3")) : length of 'dimnames' [2] not equal to array extent In addition: Warning message: In matrix(coefficients, ncol = k) : data length [9] is not a sub-multiple or multiple of the number of rows [2] Fix: Get the actual cluster size in the summary and use it to build the coefficient matrix. ## How was this patch tested? Add unit tests. Author: wm...@hotmail.com <wm...@hotmail.com> Closes #16666 from wangmiao1981/kmeans. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ac05225 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ac05225 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ac05225 Branch: refs/heads/master Commit: 9ac05225e870e41dc86cd6d61c7f0d111d172810 Parents: 9063835 Author: wm...@hotmail.com <wm...@hotmail.com> Authored: Tue Jan 31 21:16:37 2017 -0800 Committer: Felix Cheung <felixche...@apache.org> Committed: Tue Jan 31 21:16:37 2017 -0800 ---------------------------------------------------------------------- R/pkg/R/mllib_clustering.R | 16 ++++++++++------ R/pkg/inst/tests/testthat/test_mllib_clustering.R | 15 +++++++++++---- .../scala/org/apache/spark/ml/r/KMeansWrapper.scala | 2 ++ 3 files changed, 23 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/9ac05225/R/pkg/R/mllib_clustering.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R index 3b782ce..8823f90 100644 --- a/R/pkg/R/mllib_clustering.R +++ b/R/pkg/R/mllib_clustering.R @@ -375,10 +375,13 @@ setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula" #' @param object a fitted k-means model. #' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list includes the model's \code{k} (number of cluster centers), +#' The list includes the model's \code{k} (the configured number of cluster centers), #' \code{coefficients} (model cluster centers), -#' \code{size} (number of data points in each cluster), and \code{cluster} -#' (cluster centers of the transformed data). 
+#' \code{size} (number of data points in each cluster), \code{cluster} +#' (cluster centers of the transformed data), {is.loaded} (whether the model is loaded +#' from a saved file), and \code{clusterSize} +#' (the actual number of cluster centers. When using initMode = "random", +#' \code{clusterSize} may not equal to \code{k}). #' @rdname spark.kmeans #' @export #' @note summary(KMeansModel) since 2.0.0 @@ -390,16 +393,17 @@ setMethod("summary", signature(object = "KMeansModel"), coefficients <- callJMethod(jobj, "coefficients") k <- callJMethod(jobj, "k") size <- callJMethod(jobj, "size") - coefficients <- t(matrix(unlist(coefficients), ncol = k)) + clusterSize <- callJMethod(jobj, "clusterSize") + coefficients <- t(matrix(unlist(coefficients), ncol = clusterSize)) colnames(coefficients) <- unlist(features) - rownames(coefficients) <- 1:k + rownames(coefficients) <- 1:clusterSize cluster <- if (is.loaded) { NULL } else { dataFrame(callJMethod(jobj, "cluster")) } list(k = k, coefficients = coefficients, size = size, - cluster = cluster, is.loaded = is.loaded) + cluster = cluster, is.loaded = is.loaded, clusterSize = clusterSize) }) # Predicted values based on a k-means model http://git-wip-us.apache.org/repos/asf/spark/blob/9ac05225/R/pkg/inst/tests/testthat/test_mllib_clustering.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R index 28a6eeb..1661e98 100644 --- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R +++ b/R/pkg/inst/tests/testthat/test_mllib_clustering.R @@ -196,13 +196,20 @@ test_that("spark.kmeans", { model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10, initMode = "random", seed = 22222, tol = 1E-5) - fitted.model1 <- fitted(model1) - fitted.model2 <- fitted(model2) + summary.model1 <- summary(model1) + summary.model2 <- summary(model2) + cluster1 <- summary.model1$cluster + cluster2 <- 
summary.model2$cluster + clusterSize1 <- summary.model1$clusterSize + clusterSize2 <- summary.model2$clusterSize + # The predicted clusters are different - expect_equal(sort(collect(distinct(select(fitted.model1, "prediction")))$prediction), + expect_equal(sort(collect(distinct(select(cluster1, "prediction")))$prediction), c(0, 1, 2, 3)) - expect_equal(sort(collect(distinct(select(fitted.model2, "prediction")))$prediction), + expect_equal(sort(collect(distinct(select(cluster2, "prediction")))$prediction), c(0, 1, 2)) + expect_equal(clusterSize1, 4) + expect_equal(clusterSize2, 3) }) test_that("spark.lda with libsvm", { http://git-wip-us.apache.org/repos/asf/spark/blob/9ac05225/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala index a1fefd3..8d59686 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala @@ -43,6 +43,8 @@ private[r] class KMeansWrapper private ( lazy val cluster: DataFrame = kMeansModel.summary.cluster + lazy val clusterSize: Int = kMeansModel.clusterCenters.size + def fitted(method: String): DataFrame = { if (method == "centers") { kMeansModel.summary.predictions.drop(kMeansModel.getFeaturesCol) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org