Author: srowen
Date: Fri Sep 30 11:44:00 2011
New Revision: 1177616
URL: http://svn.apache.org/viewvc?rev=1177616&view=rev
Log:
MAHOUT-695 Compute number of terms rather than specify it
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
mahout/trunk/examples/bin/build-reuters.sh
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=1177616&r1=1177615&r2=1177616&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Fri Sep 30 11:44:00 2011
@@ -42,6 +42,7 @@ import org.apache.mahout.common.RandomUt
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.Vector;
@@ -63,7 +64,6 @@ import java.util.Random;
public final class LDADriver extends AbstractJob {
private static final String TOPIC_SMOOTHING_OPTION = "topicSmoothing";
- private static final String NUM_WORDS_OPTION = "numWords";
private static final String NUM_TOPICS_OPTION = "numTopics";
// TODO: sequential iteration is not yet correct.
// private static final String SEQUENTIAL_OPTION = "sequential";
@@ -146,9 +146,6 @@ public final class LDADriver extends Abs
addOutputOption();
addOption(DefaultOptionCreator.overwriteOption().create());
addOption(NUM_TOPICS_OPTION, "k", "The total number of topics in the corpus", true);
- addOption(NUM_WORDS_OPTION,
- "v",
- "The total number of words in the corpus (can be approximate, needs to exceed the actual value)");
addOption(TOPIC_SMOOTHING_OPTION, "a", "Topic smoothing parameter. Default is 50/numTopics.", "-1.0");
// addOption(SEQUENTIAL_OPTION, "seq", "Run sequentially (not Hadoop-based). Default is false.", "false");
addOption(DefaultOptionCreator.maxIterationsOption().withRequired(false).create());
@@ -164,7 +161,7 @@ public final class LDADriver extends Abs
}
int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
int numTopics = Integer.parseInt(getOption(NUM_TOPICS_OPTION));
- int numWords = Integer.parseInt(getOption(NUM_WORDS_OPTION));
+ int numWords = determineNumberOfWordsFromFirstVector();
double topicSmoothing = Double.parseDouble(getOption(TOPIC_SMOOTHING_OPTION));
if (topicSmoothing < 1) {
topicSmoothing = 50.0 / numTopics;
@@ -194,6 +191,29 @@ public final class LDADriver extends Abs
return lastPath;
}
+ /**
+ * Determine the number of words based on the size of the input vectors.
+ * Note: can't just check first part since it might have null vector. (this
+ * is a possible when seq2sparse is run over a small dataset with a large number
+ * of reducers)
+ */
+ private int determineNumberOfWordsFromFirstVector() throws IOException {
+ SequenceFileDirValueIterator<VectorWritable> it =
+ new SequenceFileDirValueIterator<VectorWritable>(getInputPath(), PathType.LIST, null, null, true, getConf());
+ try {
+ while (it.hasNext()) {
+ VectorWritable v = it.next();
+ if (v.get() != null) {
+ return v.get().size();
+ }
+ }
+ } finally {
+ Closeables.closeQuietly(it);
+ }
+ log.warn("can't determine number of words; no vectors in {}", getInputPath());
+ return 0;
+ }
+
private void run(Configuration conf,
Path input,
Path output,
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1177616&r1=1177615&r2=1177616&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Fri Sep 30 11:44:00 2011
@@ -99,7 +99,7 @@ if [ "x$clustertype" == "xkmeans" ]; the
-x 10 -k 20 -ow \
&& \
$MAHOUT clusterdump \
- -s ${WORK_DIR}/reuters-kmeans/clusters-10 \
+ -s ${WORK_DIR}/reuters-kmeans/clusters-*-final \
-d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
-dt sequencefile -b 100 -n 20
elif [ "x$clustertype" == "xlda" ]; then
@@ -110,7 +110,7 @@ elif [ "x$clustertype" == "xlda" ]; then
&& \
$MAHOUT lda \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
- -o ${WORK_DIR}/reuters-lda -k 20 -v 50000 -ow -x 20 \
+ -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
&& \
$MAHOUT ldatopics \
-i ${WORK_DIR}/reuters-lda/state-20 \