Author: gsingers
Date: Wed Nov 23 14:25:26 2011
New Revision: 1205414
URL: http://svn.apache.org/viewvc?rev=1205414&view=rev
Log:
Add filtering to the vector dumper, update cluster-reuters to use named vectors
Modified:
mahout/trunk/examples/bin/cluster-reuters.sh
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
Modified: mahout/trunk/examples/bin/cluster-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-reuters.sh?rev=1205414&r1=1205413&r2=1205414&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-reuters.sh (original)
+++ mahout/trunk/examples/bin/cluster-reuters.sh Wed Nov 23 14:25:26 2011
@@ -93,7 +93,7 @@ fi
if [ "x$clustertype" == "xkmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 --namedVector \
&& \
$MAHOUT kmeans \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
@@ -110,7 +110,7 @@ if [ "x$clustertype" == "xkmeans" ]; the
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 --namedVector \
&& \
$MAHOUT fkmeans \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
@@ -127,7 +127,7 @@ elif [ "x$clustertype" == "xlda" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
-o ${WORK_DIR}/reuters-out-seqdir-sparse-lda \
- -wt tf -seq -nr 3 \
+ -wt tf -seq -nr 3 --namedVector \
&& \
$MAHOUT lda \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tf-vectors \
@@ -140,7 +140,7 @@ elif [ "x$clustertype" == "xlda" ]; then
elif [ "x$clustertype" == "xdirichlet" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet --maxDFPercent 85 \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet --maxDFPercent 85 --namedVector \
&& \
$MAHOUT dirichlet \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/tfidf-vectors \
@@ -156,7 +156,7 @@ elif [ "x$clustertype" == "xdirichlet" ]
elif [ "x$clustertype" == "xminhash" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash --maxDFPercent 85 \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash --maxDFPercent 85 --namedVector \
&& \
$MAHOUT org.apache.mahout.clustering.minhash.MinHashDriver \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-minhash/tfidf-vectors \
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1205414&r1=1205413&r2=1205414&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Wed Nov 23 14:25:26 2011
@@ -44,7 +44,9 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.OutputStreamWriter;
import java.io.Writer;
+import java.util.HashSet;
import java.util.Iterator;
+import java.util.Set;
/**
* Can read in a {@link SequenceFile} of {@link Vector}s and dump
@@ -91,12 +93,16 @@ public final class VectorDumper {
Option numItemsOpt =
obuilder.withLongName("n").withRequired(false).withArgument(
abuilder.withName("numItems").withMinimum(1).withMaximum(1).create()).
withDescription("Output at most <n> key value
pairs").withShortName("n").create();
+ Option filtersOpt = obuilder.withLongName("filter").withRequired(false).withArgument(
+ abuilder.withName("filter").withMinimum(1).withMaximum(100).create()).
+ withDescription("Only dump out those vectors whose name matches the filter. Multiple items may be specified by repeating the argument.").withShortName("fi").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
.create();
Group group =
gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(
dictTypeOpt).withOption(dictOpt).withOption(csvOpt).withOption(vectorAsKeyOpt).withOption(
- printKeyOpt).withOption(sizeOpt).withOption(numItemsOpt).withOption(helpOpt).create();
+ printKeyOpt).withOption(sizeOpt).withOption(numItemsOpt).withOption(filtersOpt)
+ .withOption(helpOpt).create();
try {
Parser parser = new Parser();
@@ -129,6 +135,13 @@ public final class VectorDumper {
throw new OptionException(dictTypeOpt);
}
}
+
+ Set<String> filters;
+ if (cmdLine.hasOption(filtersOpt)) {
+ filters = new HashSet<String>(cmdLine.getValues(filtersOpt));
+ } else {
+ filters = null;
+ }
boolean useCSV = cmdLine.hasOption(csvOpt);
boolean sizeOnly = cmdLine.hasOption(sizeOpt);
@@ -175,6 +188,10 @@ public final class VectorDumper {
}
VectorWritable vectorWritable = (VectorWritable)
(transposeKeyValue ? keyWritable : valueWritable);
Vector vector = vectorWritable.get();
+ if (filters != null && (vector instanceof NamedVector && filters.contains(((NamedVector)vector).getName()) == false)){
+ //we are filtering out this item, skip
+ continue;
+ }
if (sizeOnly) {
if (vector instanceof NamedVector) {
writer.write(((NamedVector) vector).getName());