Repository: spark Updated Branches: refs/heads/master 9c0501c5d -> a6e53a9c8
[SPARK-9225] [MLLIB] LDASuite needs unit tests for empty documents Add unit tests for running LDA with empty documents. Both EMLDAOptimizer and OnlineLDAOptimizer are tested. feynmanliang Author: Meihua Wu <meihu...@umich.edu> Closes #7620 from rotationsymmetry/SPARK-9225 and squashes the following commits: 3ed7c88 [Meihua Wu] Incorporate reviewer's further comments f9432e8 [Meihua Wu] Incorporate reviewer's comments 8e1b9ec [Meihua Wu] Merge remote-tracking branch 'upstream/master' into SPARK-9225 ad55665 [Meihua Wu] Add unit tests for running LDA with empty documents Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a6e53a9c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a6e53a9c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a6e53a9c Branch: refs/heads/master Commit: a6e53a9c8b24326d1b6dca7a0e36ce6c643daa77 Parents: 9c0501c Author: Meihua Wu <meihu...@umich.edu> Authored: Thu Jul 30 08:52:01 2015 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Thu Jul 30 08:52:01 2015 -0700 ---------------------------------------------------------------------- .../spark/mllib/clustering/LDASuite.scala | 40 ++++++++++++++++++++ 1 file changed, 40 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/a6e53a9c/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index b91c7ce..61d2edf 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -390,6 +390,46 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("EMLDAOptimizer with empty docs") { + val vocabSize = 6 + val emptyDocsArray = Array.fill(6)(Vectors.sparse(vocabSize, Array.empty, Array.empty)) + val emptyDocs = emptyDocsArray + .zipWithIndex.map { case (wordCounts, docId) => + (docId.toLong, wordCounts) + } + val distributedEmptyDocs = sc.parallelize(emptyDocs, 2) + + val op = new EMLDAOptimizer() + val lda = new LDA() + .setK(3) + .setMaxIterations(5) + .setSeed(12345) + .setOptimizer(op) + + val model = lda.run(distributedEmptyDocs) + assert(model.vocabSize === vocabSize) + } + + test("OnlineLDAOptimizer with empty docs") { + val vocabSize = 6 + val emptyDocsArray = Array.fill(6)(Vectors.sparse(vocabSize, Array.empty, Array.empty)) + val emptyDocs = emptyDocsArray + .zipWithIndex.map { case (wordCounts, docId) => + (docId.toLong, wordCounts) + } + val distributedEmptyDocs = sc.parallelize(emptyDocs, 2) + + val op = new OnlineLDAOptimizer() + val lda = new LDA() + .setK(3) + .setMaxIterations(5) + .setSeed(12345) + .setOptimizer(op) + + val model = lda.run(distributedEmptyDocs) + assert(model.vocabSize === vocabSize) + } + } private[clustering] object LDASuite { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org