spark git commit: [SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDAOptimizer

srowen Fri, 15 Apr 2016 05:08:56 -0700

Repository: spark
Updated Branches:
  refs/heads/master 96534aa47 -> e24923267



[SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDAOptimizer

## What changes were proposed in this pull request?

Removed duplicated generation of `ids` in OnlineLDAOptimizer.

## How was this patch tested?

Tested with existing unit tests.

Author: Pravin Gadakh <[email protected]>

Closes #12176 from pravingadakh/SPARK-14370.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2492326
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2492326
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2492326

Branch: refs/heads/master
Commit: e24923267f79e7fc03180095fcbb28a91f998f5d
Parents: 96534aa
Author: Pravin Gadakh <[email protected]>
Authored: Fri Apr 15 13:08:30 2016 +0100
Committer: Sean Owen <[email protected]>
Committed: Fri Apr 15 13:08:30 2016 +0100

----------------------------------------------------------------------
 .../org/apache/spark/mllib/clustering/LDAModel.scala   |  8 ++++----
 .../apache/spark/mllib/clustering/LDAOptimizer.scala   | 13 ++++++-------
 2 files changed, 10 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/e2492326/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 27b4004..4913c02 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -303,7 +303,7 @@ class LocalLDAModel private[spark] (
       documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: 
Vector) =>
         val localElogbeta = ElogbetaBc.value
         var docBound = 0.0D
-        val (gammad: BDV[Double], _) = 
OnlineLDAOptimizer.variationalTopicInference(
+        val (gammad: BDV[Double], _, _) = 
OnlineLDAOptimizer.variationalTopicInference(
           termCounts, exp(localElogbeta), brzAlpha, gammaShape, k)
         val Elogthetad: BDV[Double] = LDAUtils.dirichletExpectation(gammad)
 
@@ -354,7 +354,7 @@ class LocalLDAModel private[spark] (
       if (termCounts.numNonzeros == 0) {
         (id, Vectors.zeros(k))
       } else {
-        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+        val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
           termCounts,
           expElogbetaBc.value,
           docConcentrationBrz,
@@ -377,7 +377,7 @@ class LocalLDAModel private[spark] (
       if (termCounts.numNonzeros == 0) {
         Vectors.zeros(k)
       } else {
-        val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+        val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
           termCounts,
           expElogbetaBc.value,
           docConcentrationBrz,
@@ -403,7 +403,7 @@ class LocalLDAModel private[spark] (
     if (document.numNonzeros == 0) {
       Vectors.zeros(this.k)
     } else {
-      val (gamma, _) = OnlineLDAOptimizer.variationalTopicInference(
+      val (gamma, _, _) = OnlineLDAOptimizer.variationalTopicInference(
         document,
         expElogbeta,
         this.docConcentration.toBreeze,

http://git-wip-us.apache.org/repos/asf/spark/blob/e2492326/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 6418f0d..1b3e2f6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -466,11 +466,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
       val stat = BDM.zeros[Double](k, vocabSize)
       var gammaPart = List[BDV[Double]]()
       nonEmptyDocs.foreach { case (_, termCounts: Vector) =>
-        val ids: List[Int] = termCounts match {
-          case v: DenseVector => (0 until v.size).toList
-          case v: SparseVector => v.indices.toList
-        }
-        val (gammad, sstats) = OnlineLDAOptimizer.variationalTopicInference(
+        val (gammad, sstats, ids) = 
OnlineLDAOptimizer.variationalTopicInference(
           termCounts, expElogbetaBc.value, alpha, gammaShape, k)
         stat(::, ids) := stat(::, ids).toDenseMatrix + sstats
         gammaPart = gammad :: gammaPart
@@ -563,13 +559,16 @@ private[clustering] object OnlineLDAOptimizer {
    * An optimization (Lee, Seung: Algorithms for non-negative matrix 
factorization, NIPS 2001)
    * avoids explicit computation of variational parameter `phi`.
    * @see [[http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.7566]]
+   *
+   * @return Returns a tuple of `gammad` - estimate of gamma, the topic 
distribution, `sstatsd` -
+   *         statistics for updating lambda and `ids` - list of termCounts 
vector indices.
    */
   private[clustering] def variationalTopicInference(
       termCounts: Vector,
       expElogbeta: BDM[Double],
       alpha: breeze.linalg.Vector[Double],
       gammaShape: Double,
-      k: Int): (BDV[Double], BDM[Double]) = {
+      k: Int): (BDV[Double], BDM[Double], List[Int]) = {
     val (ids: List[Int], cts: Array[Double]) = termCounts match {
       case v: DenseVector => ((0 until v.size).toList, v.values)
       case v: SparseVector => (v.indices.toList, v.values)
@@ -596,6 +595,6 @@ private[clustering] object OnlineLDAOptimizer {
     }
 
     val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector :/ 
phiNorm).asDenseMatrix
-    (gammad, sstatsd)
+    (gammad, sstatsd, ids)
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDAOptimizer

Reply via email to