Repository: mahout Updated Branches: refs/heads/master 4b1c13332 -> 91c1626df
extended TFPartialVectorReducer.java to handle multiple text blocks of one document Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/ccaec1b2 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/ccaec1b2 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/ccaec1b2 Branch: refs/heads/master Commit: ccaec1b267d768e890695dd36c49a34d621e1e73 Parents: c9d978a Author: wobu <[email protected]> Authored: Fri Jul 25 13:01:10 2014 +0200 Committer: wobu <[email protected]> Committed: Fri Jul 25 13:01:10 2014 +0200 ---------------------------------------------------------------------- .../vectorizer/term/TFPartialVectorReducer.java | 16 ++++-- .../vectorizer/DictionaryVectorizerTest.java | 60 +++++++++++++++++--- 2 files changed, 65 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/ccaec1b2/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java ---------------------------------------------------------------------- diff --git a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java index e8b24e1..53246ef 100644 --- a/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java +++ b/mrlegacy/src/main/java/org/apache/mahout/vectorizer/term/TFPartialVectorReducer.java @@ -17,6 +17,7 @@ package org.apache.mahout.vectorizer.term; +import com.google.common.collect.Lists; import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; @@ -44,6 +45,7 @@ import org.apache.mahout.vectorizer.common.PartialVectorMerger; import java.io.IOException; import java.net.URI; import java.util.Iterator; +import java.util.List; /** * Converts a document in to a sparse vector @@ -61,15 +63,21 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException, InterruptedException { Iterator<StringTuple> it = values.iterator(); + if (!it.hasNext()) { return; } - StringTuple value = it.next(); - Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size + List<String> value = Lists.newArrayList(); + + while (it.hasNext()) { + value.addAll(it.next().getEntries()); + } + + Vector vector = new RandomAccessSparseVector(dimension, value.size()); // guess at initial size if (maxNGramSize >= 2) { - ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); + ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.iterator()), maxNGramSize); sf.reset(); try { do { @@ -85,7 +93,7 @@ public class TFPartialVectorReducer extends Reducer<Text, StringTuple, Text, Vec Closeables.close(sf, true); } } else { - for (String term : value.getEntries()) { + for (String term : value) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); http://git-wip-us.apache.org/repos/asf/mahout/blob/ccaec1b2/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java ---------------------------------------------------------------------- diff --git a/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java b/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java index edcc79b..835854f 100644 --- a/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java +++ b/mrlegacy/src/test/java/org/apache/mahout/vectorizer/DictionaryVectorizerTest.java @@ -18,14 +18,17 @@ package org.apache.mahout.vectorizer; import java.io.IOException; +import java.util.HashMap; import java.util.List; +import java.util.Map; -import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.lucene.analysis.Analyzer; @@ -34,6 +37,7 @@ import org.apache.mahout.common.MahoutTestCase; import org.apache.mahout.common.Pair; import org.apache.mahout.common.iterator.sequencefile.PathFilters; import org.apache.mahout.common.iterator.sequencefile.PathType; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable; import org.apache.mahout.math.NamedVector; import org.apache.mahout.math.RandomAccessSparseVector; @@ -51,6 +55,7 @@ import org.junit.Test; public final class DictionaryVectorizerTest extends MahoutTestCase { private static final int NUM_DOCS = 100; + private static final String SECOND_TEXT_BLOCK_IDENTIFIER = "2NDBLOCK"; private Path inputPath; @@ -69,13 +74,18 @@ public final class DictionaryVectorizerTest extends MahoutTestCase { RandomDocumentGenerator gen = new RandomDocumentGenerator(); for (int i = 0; i < NUM_DOCS; i++) { - writer.append(new Text("Document::ID::" + i), new Text(gen.getRandomDocument())); + writer.append( + new Text("Document::ID::" + i), + new Text(gen.getRandomDocument())); + writer.append( + new Text("Document::ID::" + i), + new Text(SECOND_TEXT_BLOCK_IDENTIFIER)); } } finally { Closeables.close(writer, false); } } - + @Test public void testCreateTermFrequencyVectors() throws Exception { runTest(false, false); @@ -85,7 +95,7 @@ public final class DictionaryVectorizerTest extends MahoutTestCase { public void testCreateTermFrequencyVectorsNam() throws Exception { runTest(false, true); } - + @Test public void testCreateTermFrequencyVectorsSeq() throws Exception { runTest(true, false); @@ -106,7 +116,7 @@ public final class DictionaryVectorizerTest extends MahoutTestCase { Path tfVectors = new Path(wordCount, "tf-vectors"); Path tfidf = getTestTempDirPath("output/tfidf"); Path tfidfVectors = new Path(tfidf, "tfidf-vectors"); - + Configuration conf = getConfiguration(); DocumentProcessor.tokenizeDocuments(inputPath, analyzer, tokenizedDocuments, conf); @@ -123,7 +133,7 @@ public final class DictionaryVectorizerTest extends MahoutTestCase { 100, sequential, named); - + validateVectors(conf, NUM_DOCS, tfVectors, sequential, named); Pair<Long[], List<Path>> docFrequenciesFeatures = TFIDFConverter.calculateDF(tfVectors, @@ -143,6 +153,10 @@ public final class DictionaryVectorizerTest extends MahoutTestCase { validateVectors(conf, NUM_DOCS, tfidfVectors, sequential, named); + + Integer secondTextBlockIdentifierDimensionId = validateDictionary(wordCount, conf); + + validateVectorContainingSecondTextBlock(conf, tfVectors, secondTextBlockIdentifierDimensionId); } public static void validateVectors(Configuration conf, @@ -166,9 +180,41 @@ public final class DictionaryVectorizerTest extends MahoutTestCase { } else { assertTrue("Expected RandomAccessSparseVector", v instanceof RandomAccessSparseVector); } + } + + assertEquals("Expected " + numDocs + " documents", numDocs, count); + } + + private Integer validateDictionary(Path dictionaryDirectoryPath, Configuration conf) { + PathFilter dictionaryChunkPathFilter = new PathFilter() { + @Override + public boolean accept(Path path) { + String name = path.getName(); + return name.startsWith("dictionary.file"); + } + }; + + Map<String, Integer> dictionary = new HashMap<String, Integer>(); + for (Pair<Text, IntWritable> value : + new SequenceFileDirIterable<Text, IntWritable>( + dictionaryDirectoryPath, PathType.LIST, dictionaryChunkPathFilter, null, true, conf)) { + dictionary.put(value.getFirst().toString(), value.getSecond().get()); } - assertEquals("Expected " + numDocs + " documents", numDocs, count); + Integer secondTextBlockIdentifierDimensionId = dictionary.get(SECOND_TEXT_BLOCK_IDENTIFIER.toLowerCase()); + + assertNotNull("Token '" + SECOND_TEXT_BLOCK_IDENTIFIER + "' must be in dictionary ", secondTextBlockIdentifierDimensionId); + assertTrue("Dictionary must contain more than just 1 element!", dictionary.size() > 1); + + return secondTextBlockIdentifierDimensionId; + } + + public static void validateVectorContainingSecondTextBlock(Configuration conf, Path vectorPath, int dimensionId) { + for (VectorWritable value : + new SequenceFileDirValueIterable<VectorWritable>( + vectorPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) { + assertTrue("The vector must contain the second text block", value.get().get(dimensionId) > 0); + } } }
