Repository: spark Updated Branches: refs/heads/master 96534aa47 -> e24923267
[SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDAOptimizer ## What changes were proposed in this pull request? Removed duplicated generation of `ids` in OnlineLDAOptimizer. ## How was this patch tested? tested with existing unit tests. Author: Pravin Gadakh <[email protected]> Closes #12176 from pravingadakh/SPARK-14370. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2492326 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2492326 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2492326 Branch: refs/heads/master Commit: e24923267f79e7fc03180095fcbb28a91f998f5d Parents: 96534aa Author: Pravin Gadakh <[email protected]> Authored: Fri Apr 15 13:08:30 2016 +0100 Committer: Sean Owen <[email protected]> Committed: Fri Apr 15 13:08:30 2016 +0100 ---------------------------------------------------------------------- .../org/apache/spark/mllib/clustering/LDAModel.scala | 8 ++++---- .../apache/spark/mllib/clustering/LDAOptimizer.scala | 13 ++++++------- 2 files changed, 10 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/e2492326/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 27b4004..4913c02 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -303,7 +303,7 @@ class LocalLDAModel private[spark] ( documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) => val localElogbeta = ElogbetaBc.value var docBound = 0.0D - val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference( + val (gammad: BDV[Double], _, _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, exp(localElogbeta), brzAlpha, gammaShape, k) val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad) @@ -354,7 +354,7 @@ class LocalLDAModel private[spark] ( if (termCounts.numNonzeros == 0) { (id, Vectors.zeros(k)) } else { - val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( + val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, expElogbetaBc.value, docConcentrationBrz, @@ -377,7 +377,7 @@ class LocalLDAModel private[spark] ( if (termCounts.numNonzeros == 0) { Vectors.zeros(k) } else { - val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( + val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, expElogbetaBc.value, docConcentrationBrz, @@ -403,7 +403,7 @@ class LocalLDAModel private[spark] ( if (document.numNonzeros == 0) { Vectors.zeros(this.k) } else { - val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference( + val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference( document, expElogbeta, this.docConcentration.toBreeze, http://git-wip-us.apache.org/repos/asf/spark/blob/e2492326/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 6418f0d..1b3e2f6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -466,11 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { val stat = BDM.zeros[Double](k, vocabSize) var gammaPart = List[BDV[Double]]() nonEmptyDocs.foreach { case (_, termCounts: Vector) => - val ids: List[Int] = termCounts match { - case v: DenseVector => (0 until v.size).toList - case v: SparseVector => v.indices.toList - } - val (gammad, sstats) = OnlineLDAOptimizer.variationalTopicInference( + val (gammad, sstats, ids) = OnlineLDAOptimizer.variationalTopicInference( termCounts, expElogbetaBc.value, alpha, gammaShape, k) stat(::, ids) := stat(::, ids).toDenseMatrix + sstats gammaPart = gammad :: gammaPart @@ -563,13 +559,16 @@ private[clustering] object OnlineLDAOptimizer { * An optimization (Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001) * avoids explicit computation of variational parameter `phi`. * @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]] + * + * @return Returns a tuple of `gammad` - estimate of gamma, the topic distribution, `sstatsd` - + * statistics for updating lambda and `ids` - list of termCounts vector indices. */ private[clustering] def variationalTopicInference( termCounts: Vector, expElogbeta: BDM[Double], alpha: breeze.linalg.Vector[Double], gammaShape: Double, - k: Int): (BDV[Double], BDM[Double]) = { + k: Int): (BDV[Double], BDM[Double], List[Int]) = { val (ids: List[Int], cts: Array[Double]) = termCounts match { case v: DenseVector => ((0 until v.size).toList, v.values) case v: SparseVector => (v.indices.toList, v.values) @@ -596,6 +595,6 @@ private[clustering] object OnlineLDAOptimizer { } val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ phiNorm).asDenseMatrix - (gammad, sstatsd) + (gammad, sstatsd, ids) } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
