Author: drew
Date: Sat Sep 25 03:27:57 2010
New Revision: 1001130
URL: http://svn.apache.org/viewvc?rev=1001130&view=rev
Log:
MAHOUT-401: Use NamedVector in seq2sparse
Adds the -nv option to SparseVectorsFromSequenceFiles to create NamedVectors
instead of RandomAccess or SequentialAccess vectors
Enhances DictionaryVectorizerTest to assert that the proper vector types are
generated
Adds SparseVectorsFromSequenceFilesTest to validate the proper command-line
option behavior and vector types.
Extracts random document generation code to RandomDocumentGenerator utility class
Added:
mahout/trunk/utils/src/test/java/org/apache/mahout/text/
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Sat Sep 25 03:27:57 2010
@@ -60,6 +60,7 @@ public final class SparseVectorsFromSequ
Option outputDirOpt =
obuilder.withLongName("output").withRequired(true).withArgument(
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
"The output directory").withShortName("o").create();
+
Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
"(Optional) Minimum Support. Default Value:
2").withShortName("s").create();
@@ -98,16 +99,23 @@ public final class SparseVectorsFromSequ
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
"The norm to use, expressed as either a float or \"INF\" if you want to
use the Infinite norm. "
+ "Must be greater or equal to 0. The default is not to
normalize").withShortName("n").create();
+
Option maxNGramSizeOpt =
obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
.withDescription(
"(Optional) The maximum size of ngrams to create"
+ " (2 = bigrams, 3 = trigrams, etc) Default
Value:1").withShortName("ng").create();
+
Option sequentialAccessVectorOpt =
obuilder.withLongName("sequentialAccessVector").withRequired(false)
.withDescription(
"(Optional) Whether output vectors should be
SequentialAccessVectors. If set true else false")
.withShortName("seq").create();
+ Option namedVectorOpt =
obuilder.withLongName("namedVector").withRequired(false)
+ .withDescription(
+ "(Optional) Whether output vectors should be NamedVectors. If set true
else false")
+ .withShortName("nv").create();
+
Option overwriteOutput =
obuilder.withLongName("overwrite").withRequired(false).withDescription(
"If set, overwrite the output directory").withShortName("ow").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
@@ -117,7 +125,7 @@ public final class SparseVectorsFromSequ
.withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
.withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
.withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput).withOption(
- helpOpt).withOption(sequentialAccessVectorOpt).create();
+
helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -220,14 +228,19 @@ public final class SparseVectorsFromSequ
sequentialAccessOutput = true;
}
+ boolean namedVectors = false;
+ if (cmdLine.hasOption(namedVectorOpt)) {
+ namedVectors = true;
+ }
+
Configuration conf = new Configuration();
DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
outputDir, conf, minSupport, maxNGramSize,
- minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
+ minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput,
namedVectors);
if (processIdf) {
TFIDFConverter.processTfIdf(
new Path(outputDir,
DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
outputDir, chunkSize, minDf, maxDFPercent, norm,
- sequentialAccessOutput, reduceTasks);
+ sequentialAccessOutput, namedVectors, reduceTasks);
}
} catch (OptionException e) {
log.error("Exception", e);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMergeReducer.java
Sat Sep 25 03:27:57 2010
@@ -40,6 +40,8 @@ public class PartialVectorMergeReducer e
private boolean sequentialAccess;
+ private boolean namedVector;
+
@Override
protected void reduce(WritableComparable<?> key, Iterable<VectorWritable>
values, Context context) throws IOException,
InterruptedException {
@@ -54,7 +56,12 @@ public class PartialVectorMergeReducer e
if (sequentialAccess) {
vector = new SequentialAccessSparseVector(vector);
}
- VectorWritable vectorWritable = new VectorWritable(new NamedVector(vector,
key.toString()));
+
+ if (namedVector) {
+ vector = new NamedVector(vector, key.toString());
+ }
+
+ VectorWritable vectorWritable = new VectorWritable(vector);
context.write(key, vectorWritable);
}
@@ -65,6 +72,7 @@ public class PartialVectorMergeReducer e
normPower = conf.getFloat(PartialVectorMerger.NORMALIZATION_POWER,
PartialVectorMerger.NO_NORMALIZING);
dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS,
false);
+ namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
}
}
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/common/PartialVectorMerger.java
Sat Sep 25 03:27:57 2010
@@ -48,6 +48,8 @@ public final class PartialVectorMerger {
public static final String DIMENSION = "vector.dimension";
public static final String SEQUENTIAL_ACCESS = "vector.sequentialAccess";
+
+ public static final String NAMED_VECTOR = "vector.named";
/**
* Cannot be initialized. Use the static functions
@@ -66,6 +68,11 @@ public final class PartialVectorMerger {
* output directory were the partial vectors have to be created
* @param normPower
* The normalization value. Must be greater than or equal to 0 or
equal to {@link #NO_NORMALIZING}
+ * @param dimension
+ * @param sequentialAccess
+ * output vectors should be optimized for sequential access
+ * @param namedVectors
+ * output vectors should be named, retaining key (doc id) as a label
* @param numReducers
* The number of reducers to spawn
* @throws IOException
@@ -77,6 +84,7 @@ public final class PartialVectorMerger {
float normPower,
int dimension,
boolean sequentialAccess,
+ boolean namedVector,
int numReducers) throws IOException,
InterruptedException, ClassNotFoundException {
if (normPower != NO_NORMALIZING && normPower < 0) {
throw new IllegalArgumentException("normPower must either be -1 or >=
0");
@@ -87,6 +95,7 @@ public final class PartialVectorMerger {
conf.set("io.serializations",
"org.apache.hadoop.io.serializer.JavaSerialization,"
+ "org.apache.hadoop.io.serializer.WritableSerialization");
conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
+ conf.setBoolean(NAMED_VECTOR, namedVector);
conf.setInt(DIMENSION, dimension);
conf.setFloat(NORMALIZATION_POWER, normPower);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
Sat Sep 25 03:27:57 2010
@@ -119,7 +119,8 @@ public final class DictionaryVectorizer
float minLLRValue,
int numReducers,
int chunkSizeInMegabytes,
- boolean sequentialAccess)
+ boolean sequentialAccess,
+ boolean namedVectors)
throws IOException, InterruptedException, ClassNotFoundException {
if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
chunkSizeInMegabytes = MIN_CHUNKSIZE;
@@ -152,7 +153,7 @@ public final class DictionaryVectorizer
Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER +
partialVectorIndex++);
partialVectorPaths.add(partialVectorOutputPath);
makePartialVectors(input, maxNGramSize, dictionaryChunk,
partialVectorOutputPath,
- maxTermDimension[0], sequentialAccess, numReducers);
+ maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
}
Configuration conf = new Configuration();
@@ -161,7 +162,7 @@ public final class DictionaryVectorizer
Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
if (dictionaryChunks.size() > 1) {
PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir,
-1, maxTermDimension[0],
- sequentialAccess, numReducers);
+ sequentialAccess, namedVectors, numReducers);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
Path singlePartialVectorOutputPath = partialVectorPaths.get(0);
@@ -242,6 +243,11 @@ public final class DictionaryVectorizer
* location of the chunk of features and the id's
* @param output
* output directory were the partial vectors have to be created
+ * @param dimension
+ * @param sequentialAccess
+ * output vectors should be optimized for sequential access
+ * @param namedVectors
+ * output vectors should be named, retaining key (doc id) as a label
* @param numReducers
* the desired number of reducer tasks
* @throws IOException
@@ -254,6 +260,7 @@ public final class DictionaryVectorizer
Path output,
int dimension,
boolean sequentialAccess,
+ boolean namedVectors,
int numReducers) throws IOException,
InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
@@ -262,6 +269,7 @@ public final class DictionaryVectorizer
+
"org.apache.hadoop.io.serializer.WritableSerialization");
conf.setInt(PartialVectorMerger.DIMENSION, dimension);
conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
+ conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
conf.setInt(MAX_NGRAMS, maxNGramSize);
DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()},
conf);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java
Sat Sep 25 03:27:57 2010
@@ -54,6 +54,8 @@ public class TFPartialVectorReducer exte
private boolean sequentialAccess;
+ private boolean namedVector;
+
private int maxNGramSize = 1;
@Override
@@ -94,9 +96,14 @@ public class TFPartialVectorReducer exte
if (sequentialAccess) {
vector = new SequentialAccessSparseVector(vector);
}
+
+ if (namedVector) {
+ vector = new NamedVector(vector, key.toString());
+ }
+
// if the vector has no nonZero entries (nothing in the dictionary), let's
not waste space sending it to disk.
if (vector.getNumNondefaultElements() > 0) {
- VectorWritable vectorWritable = new VectorWritable(new
NamedVector(vector, key.toString()));
+ VectorWritable vectorWritable = new VectorWritable(vector);
context.write(key, vectorWritable);
} else {
context.getCounter("TFParticalVectorReducer",
"emptyVectorCount").increment(1);
@@ -110,6 +117,7 @@ public class TFPartialVectorReducer exte
try {
dimension = conf.getInt(PartialVectorMerger.DIMENSION,
Integer.MAX_VALUE);
sequentialAccess =
conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
+ namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS,
maxNGramSize);
URI[] localFiles = DistributedCache.getCacheFiles(conf);
if (localFiles == null || localFiles.length < 1) {
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java
Sat Sep 25 03:27:57 2010
@@ -120,6 +120,7 @@ public final class TFIDFConverter {
int maxDFPercent,
float normPower,
boolean sequentialAccessOutput,
+ boolean namedVector,
int numReducers) throws IOException,
InterruptedException, ClassNotFoundException {
if (chunkSizeInMegabytes < MIN_CHUNKSIZE) {
chunkSizeInMegabytes = MIN_CHUNKSIZE;
@@ -156,7 +157,8 @@ public final class TFIDFConverter {
maxDFPercent,
dictionaryChunk,
partialVectorOutputPath,
- sequentialAccessOutput);
+ sequentialAccessOutput,
+ namedVector);
}
Configuration conf = new Configuration();
@@ -169,6 +171,7 @@ public final class TFIDFConverter {
normPower,
datasetFeatures.getFirst()[0].intValue(),
sequentialAccessOutput,
+ namedVector,
numReducers);
HadoopUtil.deletePaths(partialVectorPaths, fs);
} else {
@@ -256,6 +259,10 @@ public final class TFIDFConverter {
* location of the chunk of features and the id's
* @param output
* output directory were the partial vectors have to be created
+ * @param sequentialAccess
+ * output vectors should be optimized for sequential access
+ * @param namedVectors
+ * output vectors should be named, retaining key (doc id) as a label
*/
private static void makePartialVectors(Path input,
Long featureCount,
@@ -264,7 +271,8 @@ public final class TFIDFConverter {
int maxDFPercent,
Path dictionaryFilePath,
Path output,
- boolean sequentialAccess)
+ boolean sequentialAccess,
+ boolean namedVector)
throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
@@ -276,6 +284,7 @@ public final class TFIDFConverter {
conf.setInt(MIN_DF, minDf);
conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
+ conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()},
conf);
Job job = new Job(conf);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFPartialVectorReducer.java
Sat Sep 25 03:27:57 2010
@@ -59,6 +59,8 @@ public class TFIDFPartialVectorReducer e
private boolean sequentialAccess;
+ private boolean namedVector;
+
@Override
protected void reduce(WritableComparable<?> key, Iterable<VectorWritable>
values, Context context)
throws IOException, InterruptedException {
@@ -86,7 +88,12 @@ public class TFIDFPartialVectorReducer e
if (sequentialAccess) {
vector = new SequentialAccessSparseVector(vector);
}
- VectorWritable vectorWritable = new VectorWritable(new NamedVector(vector,
key.toString()));
+
+ if (namedVector) {
+ vector = new NamedVector(vector, key.toString());
+ }
+
+ VectorWritable vectorWritable = new VectorWritable(vector);
context.write(key, vectorWritable);
}
@@ -105,6 +112,7 @@ public class TFIDFPartialVectorReducer e
minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
sequentialAccess =
conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
+ namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
Path dictionaryFile = new Path(localFiles[0].getPath());
FileSystem fs = dictionaryFile.getFileSystem(conf);
Added:
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java?rev=1001130&view=auto
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
(added)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/text/SparseVectorsFromSequenceFilesTest.java
Sat Sep 25 03:27:57 2010
@@ -0,0 +1,90 @@
+package org.apache.mahout.text;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.mahout.utils.MahoutTestCase;
+import org.apache.mahout.utils.vectors.text.DictionaryVectorizerTest;
+import org.apache.mahout.utils.vectors.text.RandomDocumentGenerator;
+import org.junit.Before;
+import org.junit.Test;
+
+
+public class SparseVectorsFromSequenceFilesTest extends MahoutTestCase {
+ private static final int NUM_DOCS = 100;
+
+ private Configuration conf;
+ private FileSystem fs;
+ private Path inputPath;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ conf = new Configuration();
+ fs = FileSystem.get(conf);
+
+ inputPath = getTestTempFilePath("documents/docs.file");
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath,
Text.class, Text.class);
+
+ RandomDocumentGenerator gen = new RandomDocumentGenerator();
+
+ for (int i = 0; i < NUM_DOCS; i++) {
+ writer.append(new Text("Document::ID::" + i), new
Text(gen.getRandomDocument()));
+ }
+ writer.close();
+ }
+
+
+ @Test
+ public void testCreateTermFrequencyVectors() throws Exception {
+ runTest(false, false);
+ }
+
+ @Test
+ public void testCreateTermFrequencyVectorsNam() throws Exception {
+ runTest(false, true);
+ }
+
+ @Test
+ public void testCreateTermFrequencyVectorsSeq() throws Exception {
+ runTest(true, false);
+ }
+
+ @Test
+ public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
+ runTest(true, true);
+ }
+
+ protected void runTest(boolean sequential, boolean named) throws Exception {
+ Path outputPath = getTestTempFilePath("output");
+
+
+ List<String> argList = new LinkedList<String>();
+ argList.add("-i");
+ argList.add(inputPath.toString());
+ argList.add("-o");
+ argList.add(outputPath.toString());
+
+ if (sequential)
+ argList.add("-seq");
+
+ if (named)
+ argList.add("-nv");
+
+ String[] args = argList.toArray(new String[0]);
+
+ SparseVectorsFromSequenceFiles.main(args);
+
+ Path tfVectors = new Path(outputPath, "tf-vectors");
+ Path tfidfVectors = new Path(outputPath, "tfidf-vectors");
+
+ DictionaryVectorizerTest.validateVectors(fs, conf, NUM_DOCS, tfVectors,
sequential, named);
+ DictionaryVectorizerTest.validateVectors(fs, conf, NUM_DOCS, tfidfVectors,
sequential, named);
+ }
+}
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=1001130&r1=1001129&r2=1001130&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Sat Sep 25 03:27:57 2010
@@ -19,15 +19,26 @@ package org.apache.mahout.utils.vectors.
import java.util.Random;
+import junit.framework.TestCase;
+
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.text.DefaultAnalyzer;
import org.apache.mahout.utils.MahoutTestCase;
+import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
+import org.apache.mahout.utils.vectors.text.DocumentProcessor;
import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;
import org.junit.Before;
import org.junit.Test;
@@ -37,89 +48,122 @@ import org.junit.Test;
*/
public final class DictionaryVectorizerTest extends MahoutTestCase {
- private static final int AVG_DOCUMENT_LENGTH = 20;
- private static final int AVG_SENTENCE_LENGTH = 8;
- private static final int AVG_WORD_LENGTH = 6;
private static final int NUM_DOCS = 100;
- private static final String CHARSET = "abcdef";
- private static final String DELIM = " .,?;:!\t\n\r";
- private static final String ERRORSET = "`1234567890" +
"-...@#$%^&*()_+[]{}'\"/<>|\\";
-
- private final Random random = RandomUtils.getRandom();
+
+ private Configuration conf;
private FileSystem fs;
-
- private char getRandomDelimiter() {
- return DELIM.charAt(random.nextInt(DELIM.length()));
- }
-
- private String getRandomDocument() {
- int length = (AVG_DOCUMENT_LENGTH >> 1) +
random.nextInt(AVG_DOCUMENT_LENGTH);
- StringBuilder sb = new StringBuilder(length * AVG_SENTENCE_LENGTH *
AVG_WORD_LENGTH);
- for (int i = 0; i < length; i++) {
- sb.append(getRandomSentence());
- }
- return sb.toString();
- }
-
- private String getRandomSentence() {
- int length = (AVG_SENTENCE_LENGTH >> 1) +
random.nextInt(AVG_SENTENCE_LENGTH);
- StringBuilder sb = new StringBuilder(length * AVG_WORD_LENGTH);
- for (int i = 0; i < length; i++) {
- sb.append(getRandomString()).append(' ');
- }
- sb.append(getRandomDelimiter());
- return sb.toString();
- }
-
- private String getRandomString() {
- int length = (AVG_WORD_LENGTH >> 1) + random.nextInt(AVG_WORD_LENGTH);
- StringBuilder sb = new StringBuilder(length);
- for (int i = 0; i < length; i++) {
- sb.append(CHARSET.charAt(random.nextInt(CHARSET.length())));
- }
- if (random.nextInt(10) == 0) {
- sb.append(ERRORSET.charAt(random.nextInt(ERRORSET.length())));
- }
- return sb.toString();
- }
-
+ private Path inputPath;
+
@Override
@Before
public void setUp() throws Exception {
super.setUp();
- Configuration conf = new Configuration();
+ conf = new Configuration();
fs = FileSystem.get(conf);
- }
- @Test
- public void testCreateTermFrequencyVectors() throws Exception {
- Configuration conf = new Configuration();
- Path path = getTestTempFilePath("documents/docs.file");
- SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
Text.class, Text.class);
+ inputPath = getTestTempFilePath("documents/docs.file");
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath,
Text.class, Text.class);
+ RandomDocumentGenerator gen = new RandomDocumentGenerator();
+
for (int i = 0; i < NUM_DOCS; i++) {
- writer.append(new Text("Document::ID::" + i), new
Text(getRandomDocument()));
+ writer.append(new Text("Document::ID::" + i), new
Text(gen.getRandomDocument()));
}
writer.close();
+ }
+
+ @Test
+ public void testCreateTermFrequencyVectors() throws Exception {
+ runTest(false, false);
+ }
+
+ @Test
+ public void testCreateTermFrequencyVectorsNam() throws Exception {
+ runTest(false, true);
+ }
+
+ @Test
+ public void testCreateTermFrequencyVectorsSeq() throws Exception {
+ runTest(true, false);
+ }
+
+ @Test
+ public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
+ runTest(true, true);
+ }
+
+ public void runTest(boolean sequential, boolean named) throws Exception {
+
Class<? extends Analyzer> analyzer = DefaultAnalyzer.class;
- DocumentProcessor.tokenizeDocuments(path, analyzer,
getTestTempDirPath("output/tokenized-documents"));
-
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
-
getTestTempDirPath("output/wordcount"),
+
+ Path tokenizedDocuments = getTestTempDirPath("output/tokenized-documents");
+ Path wordCount = getTestTempDirPath("output/wordcount");
+ Path tfVectors = new Path(wordCount, "tf-vectors");
+ Path tfidf = getTestTempDirPath("output/tfidf");
+ Path tfidfVectors = new Path(tfidf, "tfidf-vectors");
+
+ DocumentProcessor.tokenizeDocuments(inputPath, analyzer,
tokenizedDocuments);
+
+ DictionaryVectorizer.createTermFrequencyVectors(tokenizedDocuments,
+ wordCount,
conf,
2,
1,
0.0f,
1,
100,
- false);
-
TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/tf-vectors"),
- getTestTempDirPath("output/tfidf"),
+ sequential,
+ named);
+
+ validateVectors(fs, conf, NUM_DOCS, tfVectors, sequential, named);
+
+ TFIDFConverter.processTfIdf(tfVectors,
+ tfidf,
100,
1,
99,
1.0f,
- false,
+ sequential,
+ named,
1);
+
+
+ validateVectors(fs, conf, NUM_DOCS, tfidfVectors, sequential, named);
+ }
+
+ public static void validateVectors(FileSystem fs, Configuration conf, int
numDocs, Path vectorPath, boolean sequential, boolean named) throws Exception {
+ FileStatus[] stats = fs.listStatus(vectorPath, new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ return path.getName().startsWith("part-");
+ }
+
+ });
+
+ int count = 0;
+ Text key = new Text();
+ VectorWritable vw = new VectorWritable();
+ for (FileStatus s: stats) {
+ SequenceFile.Reader tfidfReader = new SequenceFile.Reader(fs,
s.getPath(), conf);
+ while (tfidfReader.next(key, vw)) {
+ count++;
+ Vector v = vw.get();
+ if (named) {
+ TestCase.assertTrue("Expected NamedVector", v instanceof
NamedVector);
+ v = ((NamedVector) v).getDelegate();
+ }
+
+ if (sequential) {
+ TestCase.assertTrue("Expected SequentialAccessSparseVector", v
instanceof SequentialAccessSparseVector);
+ }
+ else {
+ TestCase.assertTrue("Expected RandomAccessSparseVector", v
instanceof RandomAccessSparseVector);
+ }
+
+ }
+ tfidfReader.close();
+ }
+ TestCase.assertEquals("Expected " + numDocs + " documents", numDocs,
count);
}
}
Added:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java?rev=1001130&view=auto
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
(added)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/RandomDocumentGenerator.java
Sat Sep 25 03:27:57 2010
@@ -0,0 +1,52 @@
+package org.apache.mahout.utils.vectors.text;
+
+import java.util.Random;
+
+import org.apache.mahout.common.RandomUtils;
+
+public class RandomDocumentGenerator {
+
+ private static final int AVG_DOCUMENT_LENGTH = 20;
+ private static final int AVG_SENTENCE_LENGTH = 8;
+ private static final int AVG_WORD_LENGTH = 6;
+ private static final String CHARSET = "abcdef";
+ private static final String DELIM = " .,?;:!\t\n\r";
+ private static final String ERRORSET = "`1234567890" +
"-...@#$%^&*()_+[]{}'\"/<>|\\";
+
+ private final Random random = RandomUtils.getRandom();
+
+ private char getRandomDelimiter() {
+ return DELIM.charAt(random.nextInt(DELIM.length()));
+ }
+
+ public String getRandomDocument() {
+ int length = (AVG_DOCUMENT_LENGTH >> 1) +
random.nextInt(AVG_DOCUMENT_LENGTH);
+ StringBuilder sb = new StringBuilder(length * AVG_SENTENCE_LENGTH *
AVG_WORD_LENGTH);
+ for (int i = 0; i < length; i++) {
+ sb.append(getRandomSentence());
+ }
+ return sb.toString();
+ }
+
+ public String getRandomSentence() {
+ int length = (AVG_SENTENCE_LENGTH >> 1) +
random.nextInt(AVG_SENTENCE_LENGTH);
+ StringBuilder sb = new StringBuilder(length * AVG_WORD_LENGTH);
+ for (int i = 0; i < length; i++) {
+ sb.append(getRandomString()).append(' ');
+ }
+ sb.append(getRandomDelimiter());
+ return sb.toString();
+ }
+
+ public String getRandomString() {
+ int length = (AVG_WORD_LENGTH >> 1) + random.nextInt(AVG_WORD_LENGTH);
+ StringBuilder sb = new StringBuilder(length);
+ for (int i = 0; i < length; i++) {
+ sb.append(CHARSET.charAt(random.nextInt(CHARSET.length())));
+ }
+ if (random.nextInt(10) == 0) {
+ sb.append(ERRORSET.charAt(random.nextInt(ERRORSET.length())));
+ }
+ return sb.toString();
+ }
+}