Github user witgo commented on the pull request:
https://github.com/apache/spark/pull/2376#issuecomment-55913971
I tried to reproduce it with a test case, but was unsuccessful.
However, code like the example below does exhibit the problem; I do not know why:
```scala
/**
 * Runs `totalIter` rounds of Gibbs sampling over `data`, starting from `initModel`.
 *
 * Each round broadcasts the current topic model, draws a topic for every term of
 * every document via `dropOneDistSampler`, persists the resulting corpus and rebuilds
 * the model with `collectTopicCounters`. Models produced in rounds after `burnInIter`
 * are merged into a separate accumulator model, which is divided by
 * `totalIter - burnInIter` before being returned.
 *
 * @param data       input documents; its SparkContext is used for broadcasting and for
 *                   the checkpoint-directory lookup
 * @param initModel  model used for the initial term assignment and as the source of
 *                   the topic count, term count, `alpha` and `beta`
 * @param totalIter  total number of sampling rounds; must be > 0 and > `burnInIter`
 * @param burnInIter number of leading rounds whose models are not merged; must be > 0
 * @return the accumulated model and the corpus produced by the last round
 * @throws IllegalArgumentException if any of the `require` checks below fails
 */
def runGibbsSampling(
data: RDD[Document], initModel: TopicModel,
totalIter: Int, burnInIter: Int
): (TopicModel, RDD[Document]) = {
require(totalIter > burnInIter, "totalIter is less than burnInIter")
require(totalIter > 0, "totalIter is less than 0")
require(burnInIter > 0, "burnInIter is less than 0")
// Dimensions and hyper-parameters are taken from the initial model; the term count
// is read from the first entry of topicTermCounts_.
val (numTopics, numTerms, alpha, beta) = (initModel.topicCounts_.size,
initModel.topicTermCounts_.head.size,
initModel.alpha, initModel.beta)
// Separate model instance that receives the post-burn-in models (see merge below).
val probModel = TopicModel(numTopics, numTerms, alpha, beta)
logInfo("Start initialization")
// Both are `var`s because they are replaced at the end of every iteration.
var (topicModel, corpus) = sampleTermAssignment(data, initModel)
for (iter <- 1 to totalIter) {
logInfo("Start Gibbs sampling (Iteration %d/%d)".format(iter,
totalIter))
val broadcastModel = data.context.broadcast(topicModel)
// Keep a handle on the corpus of the previous round so that it can be unpersisted
// once the new corpus has been used to rebuild the model.
val previousCorpus = corpus
corpus = corpus.mapPartitions { docs =>
// Created once per partition and reused for every document in it.
val rand = new Random
// NOTE(review): this local shadows the outer `topicModel` var. It is the broadcast
// value, and update() below is called on it directly; whether tasks running on the
// same executor share this instance should be confirmed.
val topicModel = broadcastModel.value
// Vector of length numTopics handed to dropOneDistSampler on every call.
val topicThisTerm = BDV.zeros[Double](numTopics)
docs.map { doc =>
val content = doc.content
val topics = doc.topics
val topicsDist = doc.topicsDist
for (i <- 0 until content.length) {
val term = content(i)
val topic = topics(i)
val chosenTopic = topicModel.dropOneDistSampler(topicsDist,
topicThisTerm,
rand, term, topic)
// Only when the drawn topic differs: move one count from the old topic to the new
// one in the document's assignment, the document's topic counts and the local model.
if (topic != chosenTopic) {
topics(i) = chosenTopic
topicsDist(topic) += -1
topicsDist(chosenTopic) += 1
topicModel.update(term, topic, -1)
topicModel.update(term, chosenTopic, 1)
}
}
// NOTE(review): the same `doc` instance that was read from the previous corpus is
// modified in place and emitted; confirm this is safe with the persisted parent RDD.
doc
}
}.setName(s"LDA-$iter").persist(StorageLevel.MEMORY_AND_DISK)
// Every 5th iteration, and only if a checkpoint directory is configured, mark the new
// corpus for checkpointing. This happens before collectTopicCounters is called on it.
if (iter % 5 == 0 && data.context.getCheckpointDir.isDefined) {
corpus.checkpoint()
}
// Rebuild the model from the freshly sampled corpus.
// (presumably this runs a job that materializes `corpus` -- TODO confirm)
topicModel = collectTopicCounters(corpus, numTerms, numTopics)
// Models from the first burnInIter rounds are not merged into the result.
if (iter > burnInIter) {
probModel.merge(topicModel)
}
// Release the previous round's cached corpus and this round's broadcast.
previousCorpus.unpersist()
broadcastModel.unpersist()
}
// Number of models merged into probModel; the accumulated counts are divided by it.
// (`:/=` is presumably Breeze's in-place element-wise division -- TODO confirm)
val burnIn = (totalIter - burnInIter).toDouble
probModel.topicCounts_ :/= burnIn
probModel.topicTermCounts_.foreach(_ :/= burnIn)
(probModel, corpus)
}
```
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]