Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/4419#discussion_r29473555
  
    --- Diff: mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala ---
    @@ -202,9 +202,241 @@ class EMLDAOptimizer extends LDAOptimizer{
         
         graph.vertices.filter(isTermVertex).values.fold(BDV.zeros[Double](numTopics))(_ += _)
       }
     
    -  private[clustering] override def getLDAModel(iterationTimes: Array[Double]): LDAModel = {
    +  override private[clustering] def getLDAModel(iterationTimes: Array[Double]): LDAModel = {
         require(graph != null, "graph is null, EMLDAOptimizer not initialized.")
         this.graphCheckpointer.deleteAllCheckpoints()
         new DistributedLDAModel(this, iterationTimes)
       }
     }
    +
    +
    +/**
    + * :: Experimental ::
    + *
    + * An online optimizer for LDA. The Optimizer implements the Online variational Bayes LDA
    + * algorithm, which processes a subset of the corpus on each iteration, and updates the term-topic
    + * distribution adaptively.
    + *
    + * Original Online LDA paper:
    + *   Hoffman, Blei and Bach, "Online Learning for Latent Dirichlet Allocation." NIPS, 2010.
    + */
    +@Experimental
    +class OnlineLDAOptimizer extends LDAOptimizer {
    +
    +  // LDA common parameters
    +  private var k: Int = 0
    +  private var corpusSize: Long = 0
    +  private var vocabSize: Int = 0
    +  private[clustering] var alpha: Double = 0
    +  private[clustering] var eta: Double = 0
    +  private var randomGenerator: java.util.Random = null
    +
    +  // Online LDA specific parameters
    +  private var tau_0: Double = 1024
    +  private var kappa: Double = 0.51
    +  private var miniBatchFraction: Double = 0.01
    +
    +  // internal data structure
    +  private var docs: RDD[(Long, Vector)] = null
    +  private[clustering] var lambda: BDM[Double] = null
    +
    +  // count of invocations of next(), which helps decide the weight for each iteration
    +  private var iteration: Int = 0
    +  private var gammaShape: Double = 100
    +
    +  /**
    +   * A (positive) learning parameter that downweights early iterations. Larger values make early
    +   * iterations count less.
    +   */
    +  def getTau_0: Double = this.tau_0
    +
    +  /**
    +   * A (positive) learning parameter that downweights early iterations. Larger values make early
    +   * iterations count less.
    +   * Default: 1024, following the original Online LDA paper.
    +   */
    +  def setTau_0(tau_0: Double): this.type = {
    +    require(tau_0 > 0, s"LDA tau_0 must be positive, but was set to $tau_0")
    +    this.tau_0 = tau_0
    +    this
    +  }
    +
    +  /**
    +   * Learning rate: exponential decay rate
    +   */
    +  def getKappa: Double = this.kappa
    +
    +  /**
    +   * Learning rate: exponential decay rate---should be between
    +   * (0.5, 1.0] to guarantee asymptotic convergence.
    +   * Default: 0.51, based on the original Online LDA paper.
    +   */
    +  def setKappa(kappa: Double): this.type = {
    +    require(kappa >= 0, s"Online LDA kappa must be nonnegative, but was set to $kappa")
    +    this.kappa = kappa
    +    this
    +  }
    +
    +  /**
    +   * Mini-batch fraction, which sets the fraction of documents sampled and used in each iteration
    +   */
    +  def getMiniBatchFraction: Double = this.miniBatchFraction
    +
    +  /**
    +   * Mini-batch fraction in (0, 1], which sets the fraction of documents sampled and used in
    +   * each iteration.
    +   * Default: 0.01, i.e., 1% of total documents
    +   */
    +  def setMiniBatchFraction(miniBatchFraction: Double): this.type = {
    +    require(miniBatchFraction > 0.0 && miniBatchFraction <= 1.0,
    +      s"Online LDA miniBatchFraction must be in range (0,1], but was set to $miniBatchFraction")
    +    this.miniBatchFraction = miniBatchFraction
    +    this
    +  }
    +
    +  /**
    +   * This function is currently for testing only. In the future, it can help support
    +   * training stop/resume.
    +   */
    +  private[clustering] def setLambda(lambda: BDM[Double]): this.type = {
    +    this.lambda = lambda
    +    this
    +  }
    +
    +  /**
    +   * Used to control the gamma distribution. Larger values produce values closer to 1.0.
    +   */
    +  private[clustering] def setGammaShape(shape: Double): this.type = {
    +    this.gammaShape = shape
    +    this
    +  }
    +
    +  override private[clustering] def initialize(docs: RDD[(Long, Vector)], lda: LDA):
    --- End diff --
    
    scala style: If this can't fit on 1 line (100 chars), then put 1 argument per line:
    ```
    override private[clustering] def initialize(
        docs: RDD[(Long, Vector)],
        lda: LDA): OnlineLDAOptimizer = {
    ```
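    
    For reviewers skimming the new optimizer, here is a rough usage sketch of how these learning
    parameters fit together, assuming LDA exposes a `setOptimizer(optimizer: LDAOptimizer)` method
    as in this PR; the corpus RDD and all parameter values below are purely illustrative:
    ```scala
    // Hedged sketch, not part of this diff: wiring OnlineLDAOptimizer into LDA.
    // `corpus` (an RDD of (docId, termCountVector)) and the parameter values are
    // illustrative; setter names follow the code added above.
    import org.apache.spark.mllib.clustering.{LDA, LDAModel, OnlineLDAOptimizer}
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.rdd.RDD

    def trainOnlineLDA(corpus: RDD[(Long, Vector)]): LDAModel = {
      val optimizer = new OnlineLDAOptimizer()
        .setTau_0(1024)              // downweights early iterations (default above)
        .setKappa(0.51)              // decay rate; (0.5, 1.0] for asymptotic convergence
        .setMiniBatchFraction(0.05)  // sample 5% of the documents per iteration

      // Per the Hoffman, Blei and Bach paper, iteration t is weighted by
      // rho_t = (tau_0 + t)^(-kappa), so larger tau_0 or kappa shrink early updates.
      new LDA()
        .setK(10)
        .setOptimizer(optimizer)
        .run(corpus)
    }
    ```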

