build-reuters.sh

srowen Fri, 30 Sep 2011 04:44:29 -0700

Author: srowen
Date: Fri Sep 30 11:44:00 2011
New Revision: 1177616

URL: http://svn.apache.org/viewvc?rev=1177616&view=rev
Log:
MAHOUT-695 Compute number of terms rather than specify it


Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
    mahout/trunk/examples/bin/build-reuters.sh

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=1177616&r1=1177615&r2=1177616&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java Fri Sep 30 11:44:00 2011
@@ -42,6 +42,7 @@ import org.apache.mahout.common.RandomUt
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
 import org.apache.mahout.math.DenseMatrix;
 import org.apache.mahout.math.Vector;
@@ -63,7 +64,6 @@ import java.util.Random;
 public final class LDADriver extends AbstractJob {
 
   private static final String TOPIC_SMOOTHING_OPTION = "topicSmoothing";
-  private static final String NUM_WORDS_OPTION = "numWords";
   private static final String NUM_TOPICS_OPTION = "numTopics";
   // TODO: sequential iteration is not yet correct.
   // private static final String SEQUENTIAL_OPTION = "sequential";
@@ -146,9 +146,6 @@ public final class LDADriver extends Abs
     addOutputOption();
     addOption(DefaultOptionCreator.overwriteOption().create());
     addOption(NUM_TOPICS_OPTION, "k", "The total number of topics in the corpus", true);
-    addOption(NUM_WORDS_OPTION,
-              "v",
-              "The total number of words in the corpus (can be approximate, needs to exceed the actual value)");
     addOption(TOPIC_SMOOTHING_OPTION, "a", "Topic smoothing parameter. Default is 50/numTopics.", "-1.0");
     // addOption(SEQUENTIAL_OPTION, "seq", "Run sequentially (not Hadoop-based).  Default is false.", "false");
     addOption(DefaultOptionCreator.maxIterationsOption().withRequired(false).create());
@@ -164,7 +161,7 @@ public final class LDADriver extends Abs
     }
     int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     int numTopics = Integer.parseInt(getOption(NUM_TOPICS_OPTION));
-    int numWords = Integer.parseInt(getOption(NUM_WORDS_OPTION));
+    int numWords = determineNumberOfWordsFromFirstVector();
     double topicSmoothing = Double.parseDouble(getOption(TOPIC_SMOOTHING_OPTION));
     if (topicSmoothing < 1) {
       topicSmoothing = 50.0 / numTopics;
@@ -194,6 +191,29 @@ public final class LDADriver extends Abs
     return lastPath;
   }
 
+  /**
+   * Determine the number of words based on the size of the input vectors.
+   * Note: can't just check first part since it might have null vector. (this
+   * is possible when seq2sparse is run over a small dataset with a large number
+   * of reducers)
+   */
+  private int determineNumberOfWordsFromFirstVector() throws IOException {
+    SequenceFileDirValueIterator<VectorWritable> it =
+        new SequenceFileDirValueIterator<VectorWritable>(getInputPath(), PathType.LIST, null, null, true, getConf());
+    try {
+      while (it.hasNext()) {
+        VectorWritable v = it.next();
+        if (v.get() != null) {
+          return v.get().size();
+        }
+      }
+    } finally {
+      Closeables.closeQuietly(it);
+    }
+    log.warn("can't determine number of words; no vectors in {}", getInputPath());
+    return 0;
+  }
+
   private void run(Configuration conf,
                           Path input,
                           Path output,

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1177616&r1=1177615&r2=1177616&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Fri Sep 30 11:44:00 2011
@@ -99,7 +99,7 @@ if [ "x$clustertype" == "xkmeans" ]; the
     -x 10 -k 20 -ow \
   && \
   $MAHOUT clusterdump \
-    -s ${WORK_DIR}/reuters-kmeans/clusters-10 \
+    -s ${WORK_DIR}/reuters-kmeans/clusters-*-final \
     -d ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
     -dt sequencefile -b 100 -n 20
 elif [ "x$clustertype" == "xlda" ]; then
@@ -110,7 +110,7 @@ elif [ "x$clustertype" == "xlda" ]; then
   && \
   $MAHOUT lda \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
-    -o ${WORK_DIR}/reuters-lda -k 20 -v 50000 -ow -x 20 \
+    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
   && \
   $MAHOUT ldatopics \
     -i ${WORK_DIR}/reuters-lda/state-20 \

svn commit: r1177616 - in /mahout/trunk: core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java examples/bin/build-reuters.sh

Reply via email to