Author: gsingers
Date: Wed Nov 23 14:25:26 2011
New Revision: 1205414

URL: http://svn.apache.org/viewvc?rev=1205414&view=rev
Log:
Add filtering to the vector dumper, update cluster-reuters to use named vectors

Modified:
    mahout/trunk/examples/bin/cluster-reuters.sh
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java

Modified: mahout/trunk/examples/bin/cluster-reuters.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-reuters.sh?rev=1205414&r1=1205413&r2=1205414&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-reuters.sh (original)
+++ mahout/trunk/examples/bin/cluster-reuters.sh Wed Nov 23 14:25:26 2011
@@ -93,7 +93,7 @@ fi
 if [ "x$clustertype" == "xkmeans" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 
--namedVector \
   && \
   $MAHOUT kmeans \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
@@ -110,7 +110,7 @@ if [ "x$clustertype" == "xkmeans" ]; the
 elif [ "x$clustertype" == "xfuzzykmeans" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 
--namedVector \
   && \
   $MAHOUT fkmeans \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
@@ -127,7 +127,7 @@ elif [ "x$clustertype" == "xlda" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
     -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
-    -wt tf -seq -nr 3 \
+    -wt tf -seq -nr 3 --namedVector \
   && \
   $MAHOUT lda \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
@@ -140,7 +140,7 @@ elif [ "x$clustertype" == "xlda" ]; then
 elif [ "x$clustertype" == "xdirichlet" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet  --maxDFPercent 85 \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet  --maxDFPercent 85 
--namedVector \
   && \
   $MAHOUT dirichlet \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/tfidf-vectors \
@@ -156,7 +156,7 @@ elif [ "x$clustertype" == "xdirichlet" ]
 elif [ "x$clustertype" == "xminhash" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \
-    -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash --maxDFPercent 85 \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash --maxDFPercent 85 
--namedVector \
   && \
   $MAHOUT org.apache.mahout.clustering.minhash.MinHashDriver \
     -i ${WORK_DIR}/reuters-out-seqdir-sparse-minhash/tfidf-vectors \

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1205414&r1=1205413&r2=1205414&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
 Wed Nov 23 14:25:26 2011
@@ -44,7 +44,9 @@ import org.slf4j.LoggerFactory;
 import java.io.File;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
+import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Set;
 
 /**
  * Can read in a {@link SequenceFile} of {@link Vector}s and dump
@@ -91,12 +93,16 @@ public final class VectorDumper {
     Option numItemsOpt = 
obuilder.withLongName("n").withRequired(false).withArgument(
             
abuilder.withName("numItems").withMinimum(1).withMaximum(1).create()).
             withDescription("Output at most <n> key value 
pairs").withShortName("n").create();
+    Option filtersOpt = 
obuilder.withLongName("filter").withRequired(false).withArgument(
+            
abuilder.withName("filter").withMinimum(1).withMaximum(100).create()).
+            withDescription("Only dump out those vectors whose name matches 
the filter.  Multiple items may be specified by repeating the 
argument.").withShortName("fi").create();
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
             .create();
 
     Group group = 
gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
             
dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
-            
printKeyOpt).withOption(sizeOpt).withOption(numItemsOpt).withOption(helpOpt).create();
+            
printKeyOpt).withOption(sizeOpt).withOption(numItemsOpt).withOption(filtersOpt)
+            .withOption(helpOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -129,6 +135,13 @@ public final class VectorDumper {
             throw new OptionException(dictTypeOpt);
           }
         }
+
+        Set<String> filters;
+        if (cmdLine.hasOption(filtersOpt)) {
+          filters = new HashSet<String>(cmdLine.getValues(filtersOpt));
+        } else {
+          filters = null;
+        }
         boolean useCSV = cmdLine.hasOption(csvOpt);
 
         boolean sizeOnly = cmdLine.hasOption(sizeOpt);
@@ -175,6 +188,10 @@ public final class VectorDumper {
             }
             VectorWritable vectorWritable = (VectorWritable) 
(transposeKeyValue ? keyWritable : valueWritable);
             Vector vector = vectorWritable.get();
+            if (filters != null && (vector instanceof NamedVector && 
filters.contains(((NamedVector)vector).getName()) == false)){
+              //we are filtering out this item, skip
+              continue;
+            }
             if (sizeOnly) {
               if (vector instanceof NamedVector) {
                 writer.write(((NamedVector) vector).getName());


Reply via email to