ap...

drew Fri, 24 Sep 2010 20:30:08 -0700

Author: drew
Date: Sat Sep 25 03:27:57 2010
New Revision: 1001130

URL: http://svn.apache.org/viewvc?rev=1001130&view=rev
Log:
MAHOUT-401: Use NamedVector in seq2sparse
Adds the -nv option to SparseVectorsFromSequenceFiles to create NamedVectors 
instead of RandomAccess or SequentialAccess vectors
Enhances DictionaryVectorizerTest to assert that the proper vector types are 
generated
Adds SparseVectorsFromSequenceFilesTest to validate the proper command-line 
option behavior and vector types.
Extracts random document generation code to RandomDocumentGenerator utility class



Added:
    mahout/trunk/utils/src/test/java/org/apache/mahout/text/
    
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
Modified:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
 Sat Sep 25 03:27:57 2010
@@ -60,6 +60,7 @@ public final class SparseVectorsFromSequ
     Option outputDirOpt = 
obuilder.withLongName("output").withRequired(true).withArgument(
       
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
       "The output directory").withShortName("o").create();
+    
     Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
       
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
       "(Optional) Minimum Support. Default Value: 
2").withShortName("s").create();
@@ -98,16 +99,23 @@ public final class SparseVectorsFromSequ
       
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
       "The norm to use, expressed as either a float or \"INF\" if you want to 
use the Infinite norm.  "
           + "Must be greater or equal to 0.  The default is not to 
normalize").withShortName("n").create();
+    
     Option maxNGramSizeOpt = 
obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
       abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
         .withDescription(
           "(Optional) The maximum size of ngrams to create"
               + " (2 = bigrams, 3 = trigrams, etc) Default 
Value:1").withShortName("ng").create();
+    
     Option sequentialAccessVectorOpt = 
obuilder.withLongName("sequentialAccessVector").withRequired(false)
         .withDescription(
           "(Optional) Whether output vectors should be 
SequentialAccessVectors. If set true else false")
         .withShortName("seq").create();
     
+    Option namedVectorOpt = 
obuilder.withLongName("namedVector").withRequired(false)
+    .withDescription(
+      "(Optional) Whether output vectors should be NamedVectors. If set true 
else false")
+    .withShortName("nv").create();
+    
     Option overwriteOutput = 
obuilder.withLongName("overwrite").withRequired(false).withDescription(
       "If set, overwrite the output directory").withShortName("ow").create();
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
@@ -117,7 +125,7 @@ public final class SparseVectorsFromSequ
         
.withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
         
.withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
         
.withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(
-          helpOpt).withOption(sequentialAccessVectorOpt).create();
+          
helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -220,14 +228,19 @@ public final class SparseVectorsFromSequ
         sequentialAccessOutput = true;
       }
       
+      boolean namedVectors = false;
+      if (cmdLine.hasOption(namedVectorOpt)) {
+        namedVectors = true;
+      }
+      
       Configuration conf = new Configuration();
       DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, 
outputDir, conf, minSupport, maxNGramSize,
-        minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
+        minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput, 
namedVectors);
       if (processIdf) {
         TFIDFConverter.processTfIdf(
           new Path(outputDir, 
DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
           outputDir, chunkSize, minDf, maxDFPercent, norm,
-          sequentialAccessOutput, reduceTasks);
+          sequentialAccessOutput, namedVectors, reduceTasks);
       }
     } catch (OptionException e) {
       log.error("Exception", e);

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
 Sat Sep 25 03:27:57 2010
@@ -40,6 +40,8 @@ public class PartialVectorMergeReducer e
 
   private boolean sequentialAccess;
 
+  private boolean namedVector;
+  
   @Override
   protected void reduce(WritableComparable<?> key, Iterable<VectorWritable> 
values, Context context) throws IOException,
       InterruptedException {
@@ -54,7 +56,12 @@ public class PartialVectorMergeReducer e
     if (sequentialAccess) {
       vector = new SequentialAccessSparseVector(vector);
     }
-    VectorWritable vectorWritable = new VectorWritable(new NamedVector(vector, 
key.toString()));
+    
+    if (namedVector) {
+      vector = new NamedVector(vector, key.toString());
+    }
+    
+    VectorWritable vectorWritable = new VectorWritable(vector);
     context.write(key, vectorWritable);
   }
 
@@ -65,6 +72,7 @@ public class PartialVectorMergeReducer e
     normPower = conf.getFloat(PartialVectorMerger.NORMALIZATION_POWER, 
PartialVectorMerger.NO_NORMALIZING);
     dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
     sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, 
false);
+    namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
   }
 
 }

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
 Sat Sep 25 03:27:57 2010
@@ -48,6 +48,8 @@ public final class PartialVectorMerger {
   public static final String DIMENSION = "vector.dimension";
 
   public static final String SEQUENTIAL_ACCESS = "vector.sequentialAccess";
+  
+  public static final String NAMED_VECTOR = "vector.named";
 
   /**
    * Cannot be initialized. Use the static functions
@@ -66,6 +68,11 @@ public final class PartialVectorMerger {
    *          output directory were the partial vectors have to be created
    * @param normPower
    *          The normalization value. Must be greater than or equal to 0 or 
equal to {@link #NO_NORMALIZING}
+   * @param dimension
+   * @param sequentialAccess
+   *          output vectors should be optimized for sequential access
+   * @param namedVectors
+   *          output vectors should be named, retaining key (doc id) as a label
    * @param numReducers 
    *          The number of reducers to spawn
    * @throws IOException
@@ -77,6 +84,7 @@ public final class PartialVectorMerger {
                                          float normPower,
                                          int dimension,
                                          boolean sequentialAccess,
+                                         boolean namedVector,
                                          int numReducers) throws IOException, 
InterruptedException, ClassNotFoundException {
     if (normPower != NO_NORMALIZING && normPower < 0) {
       throw new IllegalArgumentException("normPower must either be -1 or >= 
0");
@@ -87,6 +95,7 @@ public final class PartialVectorMerger {
     conf.set("io.serializations", 
"org.apache.hadoop.io.serializer.JavaSerialization,"
         + "org.apache.hadoop.io.serializer.WritableSerialization");
     conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
+    conf.setBoolean(NAMED_VECTOR, namedVector);
     conf.setInt(DIMENSION, dimension);
     conf.setFloat(NORMALIZATION_POWER, normPower);
 

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
 Sat Sep 25 03:27:57 2010
@@ -119,7 +119,8 @@ public final class DictionaryVectorizer 
                                                 float minLLRValue,
                                                 int numReducers,
                                                 int chunkSizeInMegabytes,
-                                                boolean sequentialAccess)
+                                                boolean sequentialAccess,
+                                                boolean namedVectors)
     throws IOException, InterruptedException, ClassNotFoundException {
     if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
       chunkSizeInMegabytes = MIN_CHUNKSIZE;
@@ -152,7 +153,7 @@ public final class DictionaryVectorizer 
       Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + 
partialVectorIndex++);
       partialVectorPaths.add(partialVectorOutputPath);
       makePartialVectors(input, maxNGramSize, dictionaryChunk, 
partialVectorOutputPath,
-        maxTermDimension[0], sequentialAccess, numReducers);
+        maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
     }
     
     Configuration conf = new Configuration();
@@ -161,7 +162,7 @@ public final class DictionaryVectorizer 
     Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
     if (dictionaryChunks.size() > 1) {
       PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, 
-1, maxTermDimension[0],
-        sequentialAccess, numReducers);
+        sequentialAccess, namedVectors, numReducers);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
       Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -242,6 +243,11 @@ public final class DictionaryVectorizer 
    *          location of the chunk of features and the id's
    * @param output
    *          output directory were the partial vectors have to be created
+   * @param dimension
+   * @param sequentialAccess
+   *          output vectors should be optimized for sequential access
+   * @param namedVectors
+   *          output vectors should be named, retaining key (doc id) as a label
    * @param numReducers 
    *          the desired number of reducer tasks
    * @throws IOException
@@ -254,6 +260,7 @@ public final class DictionaryVectorizer 
                                          Path output,
                                          int dimension,
                                          boolean sequentialAccess, 
+                                         boolean namedVectors,
                                          int numReducers) throws IOException, 
InterruptedException, ClassNotFoundException {
     
     Configuration conf = new Configuration();
@@ -262,6 +269,7 @@ public final class DictionaryVectorizer 
                                   + 
"org.apache.hadoop.io.serializer.WritableSerialization");
     conf.setInt(PartialVectorMerger.DIMENSION, dimension);
     conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
+    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
     conf.setInt(MAX_NGRAMS, maxNGramSize);   
     DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, 
conf);
     

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
 Sat Sep 25 03:27:57 2010
@@ -54,6 +54,8 @@ public class TFPartialVectorReducer exte
 
   private boolean sequentialAccess;
 
+  private boolean namedVector;
+  
   private int maxNGramSize = 1;
 
   @Override
@@ -94,9 +96,14 @@ public class TFPartialVectorReducer exte
     if (sequentialAccess) {
       vector = new SequentialAccessSparseVector(vector);
     }
+    
+    if (namedVector) {
+      vector = new NamedVector(vector, key.toString());
+    }
+    
     // if the vector has no nonZero entries (nothing in the dictionary), let's 
not waste space sending it to disk.
     if (vector.getNumNondefaultElements() > 0) {
-      VectorWritable vectorWritable = new VectorWritable(new 
NamedVector(vector, key.toString()));
+      VectorWritable vectorWritable = new VectorWritable(vector);
       context.write(key, vectorWritable);
     } else {
       context.getCounter("TFParticalVectorReducer", 
"emptyVectorCount").increment(1);
@@ -110,6 +117,7 @@ public class TFPartialVectorReducer exte
     try {
       dimension = conf.getInt(PartialVectorMerger.DIMENSION, 
Integer.MAX_VALUE);
       sequentialAccess = 
conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
+      namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
       maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, 
maxNGramSize);
       URI[] localFiles = DistributedCache.getCacheFiles(conf);
       if (localFiles == null || localFiles.length < 1) {

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
 Sat Sep 25 03:27:57 2010
@@ -120,6 +120,7 @@ public final class TFIDFConverter {
                                   int maxDFPercent,
                                   float normPower,
                                   boolean sequentialAccessOutput,
+                                  boolean namedVector,
                                   int numReducers) throws IOException, 
InterruptedException, ClassNotFoundException {
     if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
       chunkSizeInMegabytes = MIN_CHUNKSIZE;
@@ -156,7 +157,8 @@ public final class TFIDFConverter {
                          maxDFPercent,
                          dictionaryChunk,
                          partialVectorOutputPath,
-                         sequentialAccessOutput);
+                         sequentialAccessOutput,
+                         namedVector);
     }
 
     Configuration conf = new Configuration();
@@ -169,6 +171,7 @@ public final class TFIDFConverter {
                                               normPower,
                                               
datasetFeatures.getFirst()[0].intValue(),
                                               sequentialAccessOutput,
+                                              namedVector,
                                               numReducers);
       HadoopUtil.deletePaths(partialVectorPaths, fs);
     } else {
@@ -256,6 +259,10 @@ public final class TFIDFConverter {
    *          location of the chunk of features and the id's
    * @param output
    *          output directory were the partial vectors have to be created
+   * @param sequentialAccess
+   *          output vectors should be optimized for sequential access
+   * @param namedVectors
+   *          output vectors should be named, retaining key (doc id) as a label
    */
   private static void makePartialVectors(Path input,
                                          Long featureCount,
@@ -264,7 +271,8 @@ public final class TFIDFConverter {
                                          int maxDFPercent,
                                          Path dictionaryFilePath,
                                          Path output,
-                                         boolean sequentialAccess)
+                                         boolean sequentialAccess,
+                                         boolean namedVector)
     throws IOException, InterruptedException, ClassNotFoundException {
 
     Configuration conf = new Configuration();
@@ -276,6 +284,7 @@ public final class TFIDFConverter {
     conf.setInt(MIN_DF, minDf);
     conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
     conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
+    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
     DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, 
conf);
 
     Job job = new Job(conf);

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
 Sat Sep 25 03:27:57 2010
@@ -59,6 +59,8 @@ public class TFIDFPartialVectorReducer e
 
   private boolean sequentialAccess;
 
+  private boolean namedVector;
+  
   @Override
   protected void reduce(WritableComparable<?> key, Iterable<VectorWritable> 
values, Context context)
       throws IOException, InterruptedException {
@@ -86,7 +88,12 @@ public class TFIDFPartialVectorReducer e
     if (sequentialAccess) {
       vector = new SequentialAccessSparseVector(vector);
     }
-    VectorWritable vectorWritable = new VectorWritable(new NamedVector(vector, 
key.toString()));
+    
+    if (namedVector) {
+      vector = new NamedVector(vector, key.toString());
+    }
+    
+    VectorWritable vectorWritable = new VectorWritable(vector);
     context.write(key, vectorWritable);
   }
 
@@ -105,6 +112,7 @@ public class TFIDFPartialVectorReducer e
       minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
       maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
       sequentialAccess = 
conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
+      namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
 
       Path dictionaryFile = new Path(localFiles[0].getPath());
       FileSystem fs = dictionaryFile.getFileSystem(conf);

Added: 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java?rev=1001130&view=auto
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
 (added)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
 Sat Sep 25 03:27:57 2010
@@ -0,0 +1,90 @@
+package org.apache.mahout.text;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.utils.MahoutTestCase;
+import org.apache.mahout.utils.vectors.text.DictionaryVectorizerTest;
+import org.apache.mahout.utils.vectors.text.RandomDocumentGenerator;
+import org.junit.Before;
+import org.junit.Test;
+
+
+public class SparseVectorsFromSequenceFilesTest extends MahoutTestCase {
+  private static final int NUM_DOCS = 100;
+  
+  private Configuration conf;
+  private FileSystem fs;
+  private Path inputPath;
+  
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    conf = new Configuration();
+    fs = FileSystem.get(conf);
+
+    inputPath = getTestTempFilePath("documents/docs.file");
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath, 
Text.class, Text.class);
+
+    RandomDocumentGenerator gen = new RandomDocumentGenerator();
+    
+    for (int i = 0; i < NUM_DOCS; i++) {
+      writer.append(new Text("Document::ID::" + i), new 
Text(gen.getRandomDocument()));
+    }
+    writer.close();
+  }
+  
+  
+  @Test
+  public void testCreateTermFrequencyVectors() throws Exception {
+    runTest(false, false);
+  }
+
+  @Test
+  public void testCreateTermFrequencyVectorsNam() throws Exception {
+    runTest(false, true);
+  }
+  
+  @Test
+  public void testCreateTermFrequencyVectorsSeq() throws Exception {
+    runTest(true, false);
+  }
+  
+  @Test
+  public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
+    runTest(true, true);
+  }
+  
+  protected void runTest(boolean sequential, boolean named) throws Exception {
+    Path outputPath = getTestTempFilePath("output");
+
+    
+    List<String> argList = new LinkedList<String>();
+    argList.add("-i");
+    argList.add(inputPath.toString());
+    argList.add("-o");
+    argList.add(outputPath.toString());
+    
+    if (sequential) 
+      argList.add("-seq");
+    
+    if (named)
+      argList.add("-nv");
+    
+    String[] args = argList.toArray(new String[0]);
+    
+    SparseVectorsFromSequenceFiles.main(args);
+
+    Path tfVectors = new Path(outputPath, "tf-vectors");
+    Path tfidfVectors = new Path(outputPath, "tfidf-vectors");
+    
+    DictionaryVectorizerTest.validateVectors(fs, conf, NUM_DOCS, tfVectors, 
sequential, named);
+    DictionaryVectorizerTest.validateVectors(fs, conf, NUM_DOCS, tfidfVectors, 
sequential, named);
+  }  
+}

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
 Sat Sep 25 03:27:57 2010
@@ -19,15 +19,26 @@ package org.apache.mahout.utils.vectors.
 
 import java.util.Random;
 
+import junit.framework.TestCase;
+
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.text.DefaultAnalyzer;
 import org.apache.mahout.utils.MahoutTestCase;
+import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
+import org.apache.mahout.utils.vectors.text.DocumentProcessor;
 import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;
 import org.junit.Before;
 import org.junit.Test;
@@ -37,89 +48,122 @@ import org.junit.Test;
  */
 public final class DictionaryVectorizerTest extends MahoutTestCase {
 
-  private static final int AVG_DOCUMENT_LENGTH = 20;
-  private static final int AVG_SENTENCE_LENGTH = 8;
-  private static final int AVG_WORD_LENGTH = 6;
   private static final int NUM_DOCS = 100;
-  private static final String CHARSET = "abcdef";
-  private static final String DELIM = " .,?;:!\t\n\r";
-  private static final String ERRORSET = "`1234567890" + 
"-...@#$%^&*()_+[]{}'\"/<>|\\";
-
-  private final Random random = RandomUtils.getRandom();
+  
+  private Configuration conf;
   private FileSystem fs;
-
-  private char getRandomDelimiter() {
-    return DELIM.charAt(random.nextInt(DELIM.length()));
-  }
-
-  private String getRandomDocument() {
-    int length = (AVG_DOCUMENT_LENGTH >> 1) + 
random.nextInt(AVG_DOCUMENT_LENGTH);
-    StringBuilder sb = new StringBuilder(length * AVG_SENTENCE_LENGTH * 
AVG_WORD_LENGTH);
-    for (int i = 0; i < length; i++) {
-      sb.append(getRandomSentence());
-    }
-    return sb.toString();
-  }
-
-  private String getRandomSentence() {
-    int length = (AVG_SENTENCE_LENGTH >> 1) + 
random.nextInt(AVG_SENTENCE_LENGTH);
-    StringBuilder sb = new StringBuilder(length * AVG_WORD_LENGTH);
-    for (int i = 0; i < length; i++) {
-      sb.append(getRandomString()).append(' ');
-    }
-    sb.append(getRandomDelimiter());
-    return sb.toString();
-  }
-
-  private String getRandomString() {
-    int length = (AVG_WORD_LENGTH >> 1) + random.nextInt(AVG_WORD_LENGTH);
-    StringBuilder sb = new StringBuilder(length);
-    for (int i = 0; i < length; i++) {
-      sb.append(CHARSET.charAt(random.nextInt(CHARSET.length())));
-    }
-    if (random.nextInt(10) == 0) {
-      sb.append(ERRORSET.charAt(random.nextInt(ERRORSET.length())));
-    }
-    return sb.toString();
-  }
-
+  private Path inputPath;
+  
   @Override
   @Before
   public void setUp() throws Exception {
     super.setUp();
-    Configuration conf = new Configuration();
+    conf = new Configuration();
     fs = FileSystem.get(conf);
-  }
 
-  @Test
-  public void testCreateTermFrequencyVectors() throws Exception {
-    Configuration conf = new Configuration();
-    Path path = getTestTempFilePath("documents/docs.file");
-    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, 
Text.class, Text.class);
+    inputPath = getTestTempFilePath("documents/docs.file");
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath, 
Text.class, Text.class);
 
+    RandomDocumentGenerator gen = new RandomDocumentGenerator();
+    
     for (int i = 0; i < NUM_DOCS; i++) {
-      writer.append(new Text("Document::ID::" + i), new 
Text(getRandomDocument()));
+      writer.append(new Text("Document::ID::" + i), new 
Text(gen.getRandomDocument()));
     }
     writer.close();
+  }
+  
+  @Test
+  public void testCreateTermFrequencyVectors() throws Exception {
+    runTest(false, false);
+  }
+
+  @Test
+  public void testCreateTermFrequencyVectorsNam() throws Exception {
+    runTest(false, true);
+  }
+  
+  @Test
+  public void testCreateTermFrequencyVectorsSeq() throws Exception {
+    runTest(true, false);
+  }
+  
+  @Test
+  public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
+    runTest(true, true);
+  }
+  
+  public void runTest(boolean sequential, boolean named) throws Exception {
+    
     Class<? extends Analyzer> analyzer = DefaultAnalyzer.class;
-    DocumentProcessor.tokenizeDocuments(path, analyzer, 
getTestTempDirPath("output/tokenized-documents"));
-    
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
-                                                    
getTestTempDirPath("output/wordcount"),
+    
+    Path tokenizedDocuments = getTestTempDirPath("output/tokenized-documents");
+    Path wordCount = getTestTempDirPath("output/wordcount");
+    Path tfVectors = new Path(wordCount, "tf-vectors");
+    Path tfidf = getTestTempDirPath("output/tfidf");
+    Path tfidfVectors = new Path(tfidf, "tfidf-vectors");
+    
+    DocumentProcessor.tokenizeDocuments(inputPath, analyzer, 
tokenizedDocuments);
+    
+    DictionaryVectorizer.createTermFrequencyVectors(tokenizedDocuments,
+                                                    wordCount,
                                                     conf,
                                                     2,
                                                     1,
                                                     0.0f,
                                                     1,
                                                     100,
-                                                    false);
-    
TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/tf-vectors"),
-                                getTestTempDirPath("output/tfidf"),
+                                                    sequential,
+                                                    named);
+    
+    validateVectors(fs, conf, NUM_DOCS, tfVectors, sequential, named);
+    
+    TFIDFConverter.processTfIdf(tfVectors,
+                                tfidf,
                                 100,
                                 1,
                                 99,
                                 1.0f,
-                                false,
+                                sequential,
+                                named,
                                 1);
+    
+    
+    validateVectors(fs, conf, NUM_DOCS, tfidfVectors, sequential, named);
+  }
+  
+  public static void validateVectors(FileSystem fs, Configuration conf, int 
numDocs, Path vectorPath, boolean sequential, boolean named) throws Exception {
+    FileStatus[] stats = fs.listStatus(vectorPath, new PathFilter() {
+      @Override
+      public boolean accept(Path path) {
+        return path.getName().startsWith("part-");
+      }
+      
+    });
+
+    int count = 0;
+    Text key = new Text();
+    VectorWritable vw = new VectorWritable();
+    for (FileStatus s: stats) {
+      SequenceFile.Reader tfidfReader = new SequenceFile.Reader(fs, 
s.getPath(), conf);
+      while (tfidfReader.next(key, vw)) {
+        count++;
+        Vector v = vw.get();
+        if (named) {
+          TestCase.assertTrue("Expected NamedVector", v instanceof 
NamedVector);
+          v = ((NamedVector) v).getDelegate();
+        }
+        
+        if (sequential) {
+          TestCase.assertTrue("Expected SequentialAccessSparseVector", v 
instanceof SequentialAccessSparseVector);
+        }
+        else {
+          TestCase.assertTrue("Expected RandomAccessSparseVector", v 
instanceof RandomAccessSparseVector);
+        }
+        
+      }
+      tfidfReader.close();
+    }
 
+    TestCase.assertEquals("Expected " + numDocs + " documents", numDocs, 
count);
   }
 }

Added: 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java?rev=1001130&view=auto
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
 (added)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
 Sat Sep 25 03:27:57 2010
@@ -0,0 +1,52 @@
+package org.apache.mahout.utils.vectors.text;
+
+import java.util.Random;
+
+import org.apache.mahout.common.RandomUtils;
+
+public class RandomDocumentGenerator {
+  
+  private static final int AVG_DOCUMENT_LENGTH = 20;
+  private static final int AVG_SENTENCE_LENGTH = 8;
+  private static final int AVG_WORD_LENGTH = 6;
+  private static final String CHARSET = "abcdef";
+  private static final String DELIM = " .,?;:!\t\n\r";
+  private static final String ERRORSET = "`1234567890" + "-=~@#$%^&*()_+[]{}'\"/<>|\\";
+
+  private final Random random = RandomUtils.getRandom();
+  
+  private char getRandomDelimiter() {
+    return DELIM.charAt(random.nextInt(DELIM.length()));
+  }
+
+  public String getRandomDocument() {
+    int length = (AVG_DOCUMENT_LENGTH >> 1) + random.nextInt(AVG_DOCUMENT_LENGTH);
+    StringBuilder sb = new StringBuilder(length * AVG_SENTENCE_LENGTH * AVG_WORD_LENGTH);
+    for (int i = 0; i < length; i++) {
+      sb.append(getRandomSentence());
+    }
+    return sb.toString();
+  }
+
+  public String getRandomSentence() {
+    int length = (AVG_SENTENCE_LENGTH >> 1) + random.nextInt(AVG_SENTENCE_LENGTH);
+    StringBuilder sb = new StringBuilder(length * AVG_WORD_LENGTH);
+    for (int i = 0; i < length; i++) {
+      sb.append(getRandomString()).append(' ');
+    }
+    sb.append(getRandomDelimiter());
+    return sb.toString();
+  }
+
+  public String getRandomString() {
+    int length = (AVG_WORD_LENGTH >> 1) + random.nextInt(AVG_WORD_LENGTH);
+    StringBuilder sb = new StringBuilder(length);
+    for (int i = 0; i < length; i++) {
+      sb.append(CHARSET.charAt(random.nextInt(CHARSET.length())));
+    }
+    if (random.nextInt(10) == 0) {
+      sb.append(ERRORSET.charAt(random.nextInt(ERRORSET.length())));
+    }
+    return sb.toString();
+  }
+}

svn commit: r1001130 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/text/ main/java/org/apache/mahout/utils/vectors/common/ main/java/org/apache/mahout/utils/vectors/text/ main/java/org/apache/mahout/utils/vectors/text/term/ main/java/org/ap...

Reply via email to