[1/3] mahout git commit: extended TFPartialVectorReducer.java to handle multiple text blocks of one document

akm Mon, 30 Mar 2015 09:35:08 -0700

Repository: mahout
Updated Branches:
  refs/heads/master 4b1c13332 -> 91c1626df



extended TFPartialVectorReducer.java to handle multiple text blocks of one document


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/ccaec1b2
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/ccaec1b2
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/ccaec1b2

Branch: refs/heads/master
Commit: ccaec1b267d768e890695dd36c49a34d621e1e73
Parents: c9d978a
Author: wobu <[email protected]>
Authored: Fri Jul 25 13:01:10 2014 +0200
Committer: wobu <[email protected]>
Committed: Fri Jul 25 13:01:10 2014 +0200

----------------------------------------------------------------------
 .../vectorizer/term/TFPartialVectorReducer.java | 16 ++++--
 .../vectorizer/DictionaryVectorizerTest.java    | 60 +++++++++++++++++---
 2 files changed, 65 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/ccaec1b2/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
----------------------------------------------------------------------
diff --git 
a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
 
b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
index e8b24e1..53246ef 100644
--- 
a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
+++ 
b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java
@@ -17,6 +17,7 @@
 
 package org.apache.mahout.vectorizer.term;
 
+import com.google.common.collect.Lists;
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
@@ -44,6 +45,7 @@ import 
org.apache.mahout.vectorizer.common.PartialVectorMerger;
 import java.io.IOException;
 import java.net.URI;
 import java.util.Iterator;
+import java.util.List;
 
 /**
  * Converts a document in to a sparse vector
@@ -61,15 +63,21 @@ public class TFPartialVectorReducer extends Reducer<Text, 
StringTuple, Text, Vec
   protected void reduce(Text key, Iterable<StringTuple> values, Context 
context)
     throws IOException, InterruptedException {
     Iterator<StringTuple> it = values.iterator();
+
     if (!it.hasNext()) {
       return;
     }
-    StringTuple value = it.next();
 
-    Vector vector = new RandomAccessSparseVector(dimension, value.length()); 
// guess at initial size
+    List<String> value = Lists.newArrayList();
+
+    while (it.hasNext()) {
+      value.addAll(it.next().getEntries());
+    }
+
+    Vector vector = new RandomAccessSparseVector(dimension, value.size()); // 
guess at initial size
 
     if (maxNGramSize >= 2) {
-      ShingleFilter sf = new ShingleFilter(new 
IteratorTokenStream(value.getEntries().iterator()), maxNGramSize);
+      ShingleFilter sf = new ShingleFilter(new 
IteratorTokenStream(value.iterator()), maxNGramSize);
       sf.reset();
       try {
         do {
@@ -85,7 +93,7 @@ public class TFPartialVectorReducer extends Reducer<Text, 
StringTuple, Text, Vec
         Closeables.close(sf, true);
       }
     } else {
-      for (String term : value.getEntries()) {
+      for (String term : value) {
         if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram
           int termId = dictionary.get(term);
           vector.setQuick(termId, vector.getQuick(termId) + 1);

http://git-wip-us.apache.org/repos/asf/mahout/blob/ccaec1b2/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
----------------------------------------------------------------------
diff --git 
a/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
 
b/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
index edcc79b..835854f 100644
--- 
a/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
+++ 
b/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java
@@ -18,14 +18,17 @@
 package org.apache.mahout.vectorizer;
 
 import java.io.IOException;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 
-import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
 import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.lucene.analysis.Analyzer;
@@ -34,6 +37,7 @@ import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.iterator.sequencefile.PathFilters;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
 import 
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
 import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.RandomAccessSparseVector;
@@ -51,6 +55,7 @@ import org.junit.Test;
 public final class DictionaryVectorizerTest extends MahoutTestCase {
 
   private static final int NUM_DOCS = 100;
+  private static final String SECOND_TEXT_BLOCK_IDENTIFIER = "2NDBLOCK";
 
   private Path inputPath;
   
@@ -69,13 +74,18 @@ public final class DictionaryVectorizerTest extends 
MahoutTestCase {
       RandomDocumentGenerator gen = new RandomDocumentGenerator();
 
       for (int i = 0; i < NUM_DOCS; i++) {
-        writer.append(new Text("Document::ID::" + i), new 
Text(gen.getRandomDocument()));
+        writer.append(
+                new Text("Document::ID::" + i),
+                new Text(gen.getRandomDocument()));
+        writer.append(
+                new Text("Document::ID::" + i),
+                new Text(SECOND_TEXT_BLOCK_IDENTIFIER));
       }
     } finally {
       Closeables.close(writer, false);
     }
   }
-  
+
   @Test
   public void testCreateTermFrequencyVectors() throws Exception {
     runTest(false, false);
@@ -85,7 +95,7 @@ public final class DictionaryVectorizerTest extends 
MahoutTestCase {
   public void testCreateTermFrequencyVectorsNam() throws Exception {
     runTest(false, true);
   }
-  
+
   @Test
   public void testCreateTermFrequencyVectorsSeq() throws Exception {
     runTest(true, false);
@@ -106,7 +116,7 @@ public final class DictionaryVectorizerTest extends 
MahoutTestCase {
     Path tfVectors = new Path(wordCount, "tf-vectors");
     Path tfidf = getTestTempDirPath("output/tfidf");
     Path tfidfVectors = new Path(tfidf, "tfidf-vectors");
-    
+
     Configuration conf = getConfiguration();
     DocumentProcessor.tokenizeDocuments(inputPath, analyzer, 
tokenizedDocuments, conf);
     
@@ -123,7 +133,7 @@ public final class DictionaryVectorizerTest extends 
MahoutTestCase {
                                                     100,
                                                     sequential,
                                                     named);
-    
+
     validateVectors(conf, NUM_DOCS, tfVectors, sequential, named);
     
     Pair<Long[], List<Path>> docFrequenciesFeatures = 
TFIDFConverter.calculateDF(tfVectors, 
@@ -143,6 +153,10 @@ public final class DictionaryVectorizerTest extends 
MahoutTestCase {
     
     
     validateVectors(conf, NUM_DOCS, tfidfVectors, sequential, named);
+
+    Integer secondTextBlockIdentifierDimensionId = 
validateDictionary(wordCount, conf);
+
+    validateVectorContainingSecondTextBlock(conf, tfVectors, 
secondTextBlockIdentifierDimensionId);
   }
   
   public static void validateVectors(Configuration conf,
@@ -166,9 +180,41 @@ public final class DictionaryVectorizerTest extends 
MahoutTestCase {
       } else {
         assertTrue("Expected RandomAccessSparseVector", v instanceof 
RandomAccessSparseVector);
       }
+    }
+
+    assertEquals("Expected " + numDocs + " documents", numDocs, count);
+  }
+
+  private Integer validateDictionary(Path dictionaryDirectoryPath, 
Configuration conf) {
+    PathFilter dictionaryChunkPathFilter = new PathFilter() {
+      @Override
+      public boolean accept(Path path) {
+        String name = path.getName();
+        return name.startsWith("dictionary.file");
+      }
+    };
+
+    Map<String, Integer> dictionary = new HashMap<String, Integer>();
 
+    for (Pair<Text, IntWritable> value :
+            new SequenceFileDirIterable<Text, IntWritable>(
+                    dictionaryDirectoryPath, PathType.LIST, 
dictionaryChunkPathFilter, null, true, conf)) {
+      dictionary.put(value.getFirst().toString(), value.getSecond().get());
     }
 
-  assertEquals("Expected " + numDocs + " documents", numDocs, count);
+    Integer secondTextBlockIdentifierDimensionId = 
dictionary.get(SECOND_TEXT_BLOCK_IDENTIFIER.toLowerCase());
+
+    assertNotNull("Token '" + SECOND_TEXT_BLOCK_IDENTIFIER + "' must be in 
dictionary ", secondTextBlockIdentifierDimensionId);
+    assertTrue("Dictionary must contain more than just 1 element!", 
dictionary.size() > 1);
+
+    return secondTextBlockIdentifierDimensionId;
+  }
+
+  public static void validateVectorContainingSecondTextBlock(Configuration 
conf, Path vectorPath, int dimensionId) {
+    for (VectorWritable value :
+            new SequenceFileDirValueIterable<VectorWritable>(
+                    vectorPath, PathType.LIST, PathFilters.partFilter(), null, 
true, conf)) {
+      assertTrue("The vector must contain the second text block", 
value.get().get(dimensionId) > 0);
+    }
   }
 }

[1/3] mahout git commit: extended TFPartialVectorReducer.java to handle multiple text blocks of one document

Reply via email to