Author: gsingers
Date: Sat Nov 5 00:21:17 2011
New Revision: 1197839
URL: http://svn.apache.org/viewvc?rev=1197839&view=rev
Log:
MAHOUT-873: baseline of simple vectorization encoding capabilities
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
- copied, changed from r1197293,
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java
- copied, changed from r1197293,
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
mahout/trunk/examples/bin/build-asf-email.sh
mahout/trunk/src/conf/driver.classes.props
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
Sat Nov 5 00:21:17 2011
@@ -408,31 +408,11 @@ public abstract class AbstractJob extend
Class<? extends Writable> mapperValue,
Class<? extends OutputFormat> outputFormat) throws
IOException {
- Job job = new Job(new Configuration(getConf()));
- Configuration jobConf = job.getConfiguration();
-
- if (mapper.equals(Mapper.class)) {
- throw new IllegalStateException("Can't figure out the user class jar
file from mapper/reducer");
- }
- job.setJarByClass(mapper);
-
- job.setInputFormatClass(inputFormat);
- jobConf.set("mapred.input.dir", inputPath.toString());
-
- job.setMapperClass(mapper);
- job.setMapOutputKeyClass(mapperKey);
- job.setMapOutputValueClass(mapperValue);
- job.setOutputKeyClass(mapperKey);
- job.setOutputValueClass(mapperValue);
- jobConf.setBoolean("mapred.compress.map.output", true);
- job.setNumReduceTasks(0);
-
- job.setJobName(getCustomJobName(job, mapper, Reducer.class));
-
- job.setOutputFormatClass(outputFormat);
- jobConf.set("mapred.output.dir", outputPath.toString());
-
+ Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+ inputFormat, mapper, mapperKey, mapperValue, outputFormat,
getConf());
+ job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(),
job, mapper, Reducer.class));
return job;
+
}
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends
Mapper> mapper,
@@ -452,67 +432,12 @@ public abstract class AbstractJob extend
Class<? extends Writable> reducerKey,
Class<? extends Writable> reducerValue,
Class<? extends OutputFormat> outputFormat) throws
IOException {
-
- Job job = new Job(new Configuration(getConf()));
- Configuration jobConf = job.getConfiguration();
-
- if (reducer.equals(Reducer.class)) {
- if (mapper.equals(Mapper.class)) {
- throw new IllegalStateException("Can't figure out the user class jar
file from mapper/reducer");
- }
- job.setJarByClass(mapper);
- } else {
- job.setJarByClass(reducer);
- }
-
- job.setInputFormatClass(inputFormat);
- jobConf.set("mapred.input.dir", inputPath.toString());
-
- job.setMapperClass(mapper);
- job.setMapOutputKeyClass(mapperKey);
- job.setMapOutputValueClass(mapperValue);
-
- jobConf.setBoolean("mapred.compress.map.output", true);
-
- job.setReducerClass(reducer);
- job.setOutputKeyClass(reducerKey);
- job.setOutputValueClass(reducerValue);
-
- job.setJobName(getCustomJobName(job, mapper, reducer));
-
- job.setOutputFormatClass(outputFormat);
- jobConf.set("mapred.output.dir", outputPath.toString());
-
+ Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+ inputFormat, mapper, mapperKey, mapperValue, reducer, reducerKey,
reducerValue, outputFormat, getConf());
+ job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(),
job, mapper, Reducer.class));
return job;
}
- private String getCustomJobName(JobContext job,Class<? extends Mapper>
mapper) {
- StringBuilder name = new StringBuilder(100);
- String customJobName = job.getJobName();
- if (customJobName == null || customJobName.trim().length() == 0) {
- name.append(getClass().getSimpleName());
- } else {
- name.append(customJobName);
- }
- name.append('-').append(mapper.getSimpleName());
- return name.toString();
- }
-
- private String getCustomJobName(JobContext job,
- Class<? extends Mapper> mapper,
- Class<? extends Reducer> reducer) {
- StringBuilder name = new StringBuilder(100);
- String customJobName = job.getJobName();
- if (customJobName == null || customJobName.trim().isEmpty()) {
- name.append(getClass().getSimpleName());
- } else {
- name.append(customJobName);
- }
- name.append('-').append(mapper.getSimpleName());
- name.append('-').append(reducer.getSimpleName());
- return name.toString();
- }
-
/**
* necessary to make this job (having a combined input path) work on Amazon
S3, hopefully this is obsolete when MultipleInputs is available
* again
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java
Sat Nov 5 00:21:17 2011
@@ -25,9 +25,23 @@ public final class ClassUtils {
public static <T> T instantiateAs(String classname, Class<T>
asSubclassOfClass) {
try {
- return
Class.forName(classname).asSubclass(asSubclassOfClass).getConstructor().newInstance();
- } catch (ClassNotFoundException cnfe) {
- throw new IllegalStateException(cnfe);
+ return
instantiateAs(Class.forName(classname).asSubclass(asSubclassOfClass),
asSubclassOfClass);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ public static <T> T instantiateAs(String classname, Class<T>
asSubclassOfClass, Class[] params, Object[] args) {
+ try {
+ return
instantiateAs(Class.forName(classname).asSubclass(asSubclassOfClass),
asSubclassOfClass, params, args);
+ } catch (ClassNotFoundException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+
+ public static <T> T instantiateAs(Class<? extends T> clazz, Class<T>
asSubclassOfClass, Class[] params, Object[] args) {
+ try {
+ return
clazz.asSubclass(asSubclassOfClass).getConstructor(params).newInstance(args);
} catch (InstantiationException ie) {
throw new IllegalStateException(ie);
} catch (IllegalAccessException iae) {
@@ -39,6 +53,7 @@ public final class ClassUtils {
}
}
+
public static <T> T instantiateAs(Class<? extends T> clazz, Class<T>
asSubclassOfClass) {
try {
return
clazz.asSubclass(asSubclassOfClass).getConstructor().newInstance();
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
Sat Nov 5 00:21:17 2011
@@ -31,6 +31,12 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
import
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
@@ -43,6 +49,134 @@ public final class HadoopUtil {
private HadoopUtil() { }
+ /**
+ * Create a map-only Hadoop Job out of the passed in parameters. Does not
set the
+ * Job name.
+ * @param inputPath
+ * @param outputPath
+ * @param inputFormat
+ * @param mapper
+ * @param mapperKey
+ * @param mapperValue
+ * @param outputFormat
+ * @param conf
+ * @return
+ * @throws IOException
+ *
+ * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext,
Class, Class)
+ */
+ public static Job prepareJob(Path inputPath,
+ Path outputPath,
+ Class<? extends InputFormat> inputFormat,
+ Class<? extends Mapper> mapper,
+ Class<? extends Writable> mapperKey,
+ Class<? extends Writable> mapperValue,
+ Class<? extends OutputFormat> outputFormat,
Configuration conf) throws IOException {
+
+ Job job = new Job(new Configuration(conf));
+ Configuration jobConf = job.getConfiguration();
+
+ if (mapper.equals(Mapper.class)) {
+ throw new IllegalStateException("Can't figure out the user class jar
file from mapper/reducer");
+ }
+ job.setJarByClass(mapper);
+
+ job.setInputFormatClass(inputFormat);
+ jobConf.set("mapred.input.dir", inputPath.toString());
+
+ job.setMapperClass(mapper);
+ job.setMapOutputKeyClass(mapperKey);
+ job.setMapOutputValueClass(mapperValue);
+ job.setOutputKeyClass(mapperKey);
+ job.setOutputValueClass(mapperValue);
+ jobConf.setBoolean("mapred.compress.map.output", true);
+ job.setNumReduceTasks(0);
+
+ job.setOutputFormatClass(outputFormat);
+ jobConf.set("mapred.output.dir", outputPath.toString());
+
+ return job;
+ }
+
+ /**
+ * Create a map and reduce Hadoop job. Does not set the name on the job.
+ * @param inputPath
+ * @param outputPath
+ * @param inputFormat
+ * @param mapper
+ * @param mapperKey
+ * @param mapperValue
+ * @param reducer
+ * @param reducerKey
+ * @param reducerValue
+ * @param outputFormat
+ * @param conf
+ * @return
+ * @throws IOException
+ *
+ * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext,
Class, Class)
+ * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path,
Class, Class, Class, Class, Class, org.apache.hadoop.conf.Configuration)
+ */
+ public static Job prepareJob(Path inputPath,
+ Path outputPath,
+ Class<? extends InputFormat> inputFormat,
+ Class<? extends Mapper> mapper,
+ Class<? extends Writable> mapperKey,
+ Class<? extends Writable> mapperValue,
+ Class<? extends Reducer> reducer,
+ Class<? extends Writable> reducerKey,
+ Class<? extends Writable> reducerValue,
+ Class<? extends OutputFormat> outputFormat,
+ Configuration conf) throws IOException {
+
+ Job job = new Job(new Configuration(conf));
+ Configuration jobConf = job.getConfiguration();
+
+ if (reducer.equals(Reducer.class)) {
+ if (mapper.equals(Mapper.class)) {
+ throw new IllegalStateException("Can't figure out the user class jar
file from mapper/reducer");
+ }
+ job.setJarByClass(mapper);
+ } else {
+ job.setJarByClass(reducer);
+ }
+
+ job.setInputFormatClass(inputFormat);
+ jobConf.set("mapred.input.dir", inputPath.toString());
+
+ job.setMapperClass(mapper);
+ job.setMapOutputKeyClass(mapperKey);
+ job.setMapOutputValueClass(mapperValue);
+
+ jobConf.setBoolean("mapred.compress.map.output", true);
+
+ job.setReducerClass(reducer);
+ job.setOutputKeyClass(reducerKey);
+ job.setOutputValueClass(reducerValue);
+
+ job.setOutputFormatClass(outputFormat);
+ jobConf.set("mapred.output.dir", outputPath.toString());
+
+ return job;
+ }
+
+
+ public static String getCustomJobName(String className, JobContext job,
+ Class<? extends Mapper> mapper,
+ Class<? extends Reducer> reducer) {
+ StringBuilder name = new StringBuilder(100);
+ String customJobName = job.getJobName();
+ if (customJobName == null || customJobName.trim().isEmpty()) {
+ name.append(className);
+ } else {
+ name.append(customJobName);
+ }
+ name.append('-').append(mapper.getSimpleName());
+ name.append('-').append(reducer.getSimpleName());
+ return name.toString();
+ }
+
+
public static void delete(Configuration conf, Iterable<Path> paths) throws
IOException {
if (conf == null) {
conf = new Configuration();
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
Sat Nov 5 00:21:17 2011
@@ -60,7 +60,8 @@ import org.apache.mahout.vectorizer.term
* This is a dictionary based Vectorizer.
*
*/
-public final class DictionaryVectorizer {
+
+public final class DictionaryVectorizer implements Vectorizer{
public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
@@ -88,10 +89,17 @@ public final class DictionaryVectorizer
/**
* Cannot be initialized. Use the static functions
*/
- private DictionaryVectorizer() {
+ public DictionaryVectorizer() {
}
-
+ //TODO: move more of SparseVectorsFromSequenceFiles in here, and then fold
SparseVectorsFromSequenceFiles with EncodedVectorsFromSequenceFiles to have one framework.
+
+ @Override
+ public void createVectors(Path input, Path output, VectorizerConfig config)
throws Exception {
+ createTermFrequencyVectors(input, output, config.conf, config.minSupport,
config.maxNGramSize,
+ config.minLLRValue, config.normPower, config.logNormalize,
config.numReducers, config.chunkSizeInMegabytes, config.sequentialAccess,
config.namedVectors);
+ }
+
/**
* Create Term Frequency (Tf) Vectors from the input set of documents in
{@link SequenceFile} format. This
* tries to fix the maximum memory used by the feature chunk per node
thereby splitting the process across
Copied:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
(from r1197293,
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java)
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?p2=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java&p1=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java&r1=1197293&r2=1197839&rev=1197839&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
Sat Nov 5 00:21:17 2011
@@ -17,252 +17,89 @@
package org.apache.mahout.vectorizer;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.vectorizer.collocations.llr.LLRReducer;
-import org.apache.mahout.vectorizer.common.PartialVectorMerger;
-import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Converts a given set of sequence files into SparseVectors
*/
-public final class SparseVectorsFromSequenceFiles extends AbstractJob {
-
- private static final Logger log =
LoggerFactory.getLogger(SparseVectorsFromSequenceFiles.class);
-
+public final class EncodedVectorsFromSequenceFiles extends AbstractJob {
+
+ private static final Logger log =
LoggerFactory.getLogger(EncodedVectorsFromSequenceFiles.class);
+
public static void main(String[] args) throws Exception {
- ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
+ ToolRunner.run(new EncodedVectorsFromSequenceFiles(), args);
}
-
+
@Override
public int run(String[] args) throws Exception {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputDirOpt = DefaultOptionCreator.inputOption().create();
-
- Option outputDirOpt = DefaultOptionCreator.outputOption().create();
-
- Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
-
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
- "(Optional) Minimum Support. Default Value:
2").withShortName("s").create();
-
- Option analyzerNameOpt =
obuilder.withLongName("analyzerName").withArgument(
-
abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
- "The class name of the analyzer").withShortName("a").create();
-
- Option chunkSizeOpt = obuilder.withLongName("chunkSize").withArgument(
-
abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
- "The chunkSize in MegaBytes. 100-10000
MB").withShortName("chunk").create();
-
- Option weightOpt =
obuilder.withLongName("weight").withRequired(false).withArgument(
-
abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
- "The kind of weight to use. Currently TF or
TFIDF").withShortName("wt").create();
-
- Option minDFOpt =
obuilder.withLongName("minDF").withRequired(false).withArgument(
-
abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
- "The minimum document frequency. Default is
1").withShortName("md").create();
-
- Option maxDFPercentOpt =
obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-
abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
- "The max percentage of docs for the DF. Can be used to remove really
high frequency terms."
- + " Expressed as an integer between 0 and 100. Default is
99.").withShortName("x").create();
-
- Option minLLROpt =
obuilder.withLongName("minLLR").withRequired(false).withArgument(
-
abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
- "(Optional)The minimum Log Likelihood Ratio(Float) Default is " +
LLRReducer.DEFAULT_MIN_LLR)
- .withShortName("ml").create();
-
- Option numReduceTasksOpt =
obuilder.withLongName("numReducers").withArgument(
-
abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
- "(Optional) Number of reduce tasks. Default Value:
1").withShortName("nr").create();
-
- Option powerOpt =
obuilder.withLongName("norm").withRequired(false).withArgument(
-
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
- "The norm to use, expressed as either a float or \"INF\" if you want to
use the Infinite norm. "
- + "Must be greater or equal to 0. The default is not to
normalize").withShortName("n").create();
-
- Option logNormalizeOpt =
obuilder.withLongName("logNormalize").withRequired(false)
- .withDescription(
- "(Optional) Whether output vectors should be logNormalize. If set true
else false")
- .withShortName("lnorm").create();
-
- Option maxNGramSizeOpt =
obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
- abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "(Optional) The maximum size of ngrams to create"
- + " (2 = bigrams, 3 = trigrams, etc) Default
Value:1").withShortName("ng").create();
-
- Option sequentialAccessVectorOpt =
obuilder.withLongName("sequentialAccessVector").withRequired(false)
- .withDescription(
- "(Optional) Whether output vectors should be
SequentialAccessVectors. If set true else false")
- .withShortName("seq").create();
-
- Option namedVectorOpt =
obuilder.withLongName("namedVector").withRequired(false)
- .withDescription(
- "(Optional) Whether output vectors should be NamedVectors. If set true
else false")
- .withShortName("nv").create();
-
- Option overwriteOutput =
obuilder.withLongName("overwrite").withRequired(false).withDescription(
- "If set, overwrite the output directory").withShortName("ow").create();
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
- .create();
-
- Group group =
gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
-
.withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
-
.withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
-
.withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
-
.withOption(helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt)
- .withOption(logNormalizeOpt)
- .create();
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- parser.setHelpOption(helpOpt);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return -1;
- }
-
- Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
- Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));
-
- int chunkSize = 100;
- if (cmdLine.hasOption(chunkSizeOpt)) {
- chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
- }
- int minSupport = 2;
- if (cmdLine.hasOption(minSupportOpt)) {
- String minSupportString = (String) cmdLine.getValue(minSupportOpt);
- minSupport = Integer.parseInt(minSupportString);
- }
-
- int maxNGramSize = 1;
-
- if (cmdLine.hasOption(maxNGramSizeOpt)) {
- try {
- maxNGramSize =
Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
- } catch (NumberFormatException ex) {
- log.warn("Could not parse ngram size option");
- }
- }
- log.info("Maximum n-gram size is: {}", maxNGramSize);
-
- if (cmdLine.hasOption(overwriteOutput)) {
- HadoopUtil.delete(getConf(), outputDir);
- }
-
- float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
- if (cmdLine.hasOption(minLLROpt)) {
- minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
- }
- log.info("Minimum LLR value: {}", minLLRValue);
-
- int reduceTasks = 1;
- if (cmdLine.hasOption(numReduceTasksOpt)) {
- reduceTasks =
Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
- }
- log.info("Number of reduce tasks: {}", reduceTasks);
-
- Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
- if (cmdLine.hasOption(analyzerNameOpt)) {
- String className = cmdLine.getValue(analyzerNameOpt).toString();
- analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
- // try instantiating it, b/c there isn't any point in setting it if
- // you can't instantiate it
- ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
- }
-
- boolean processIdf;
-
- if (cmdLine.hasOption(weightOpt)) {
- String wString = cmdLine.getValue(weightOpt).toString();
- if ("tf".equalsIgnoreCase(wString)) {
- processIdf = false;
- } else if ("tfidf".equalsIgnoreCase(wString)) {
- processIdf = true;
- } else {
- throw new OptionException(weightOpt);
- }
- } else {
- processIdf = true;
- }
-
- int minDf = 1;
- if (cmdLine.hasOption(minDFOpt)) {
- minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
- }
- int maxDFPercent = 99;
- if (cmdLine.hasOption(maxDFPercentOpt)) {
- maxDFPercent =
Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
- }
-
- float norm = PartialVectorMerger.NO_NORMALIZING;
- if (cmdLine.hasOption(powerOpt)) {
- String power = cmdLine.getValue(powerOpt).toString();
- if ("INF".equals(power)) {
- norm = Float.POSITIVE_INFINITY;
- } else {
- norm = Float.parseFloat(power);
- }
- }
-
- boolean logNormalize = false;
- if (cmdLine.hasOption(logNormalizeOpt)) {
- logNormalize = true;
- }
-
- Configuration conf = getConf();
- Path tokenizedPath = new Path(outputDir,
DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
- DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
tokenizedPath, conf);
-
- boolean sequentialAccessOutput = false;
- if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
- sequentialAccessOutput = true;
- }
-
- boolean namedVectors = false;
- if (cmdLine.hasOption(namedVectorOpt)) {
- namedVectors = true;
- }
-
- if (!processIdf) {
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
outputDir, conf, minSupport, maxNGramSize,
- minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
sequentialAccessOutput, namedVectors);
- } else if (processIdf) {
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
outputDir, conf, minSupport, maxNGramSize,
- minLLRValue, -1.0f, false, reduceTasks, chunkSize,
sequentialAccessOutput, namedVectors);
-
- TFIDFConverter.processTfIdf(
- new Path(outputDir,
DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
- outputDir, conf, chunkSize, minDf, maxDFPercent, norm, logNormalize,
- sequentialAccessOutput, namedVectors, reduceTasks);
- }
- } catch (OptionException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
+ addInputOption();
+ addOutputOption();
+ addOption("analyzerName", "an", "The class name of the analyzer",
DefaultAnalyzer.class.getName());
+ addOption(buildOption("sequentialAccessVector", "seq", "(Optional) Whether
output vectors should be SequentialAccessVectors. If set true else false",
false, false, null));
+ addOption(buildOption("namedVector", "nv", "Create named vectors using the
key. False by default", false, false, null));
+ addOption("cardinality", "c", "The cardinality to use for creating the
vectors. Default is 5000", String.valueOf(5000));
+ addOption("encoderFieldName", "en", "The name of the encoder to be passed
to the FeatureVectorEncoder constructor. Default is text. Note this is not
the class name of a FeatureValueEncoder, but is instead the construction
argument.", "text");
+ addOption("encoderClass", "ec", "The class name of the encoder to be used.
Default is " + LuceneTextValueEncoder.class.getName(),
LuceneTextValueEncoder.class.getName());
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+ HadoopUtil.delete(getConf(), output);
+ }
+
+ Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
+ if (hasOption("analyzerName")) {
+ String className = getOption("analyzerName").toString();
+ analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+ // try instantiating it, b/c there isn't any point in setting it if
+ // you can't instantiate it
+ ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
+ }
+
+
+ Configuration conf = getConf();
+
+ boolean sequentialAccessOutput = hasOption("sequentialAccessVector");
+
+
+ boolean namedVectors = hasOption("namedVector");
+ int cardinality = 5000;
+ if (hasOption("cardinality")){
+ cardinality = Integer.parseInt(getOption("cardinality"));
+ }
+ String encoderName = "text";
+ if (hasOption("encoderFieldName")){
+ encoderName = getOption("encoderFieldName");
}
+ String encoderClass = LuceneTextValueEncoder.class.getName();
+ if (hasOption("encoderClass")){
+ encoderClass = getOption("encoderClass");
+ ClassUtils.instantiateAs(encoderClass, FeatureVectorEncoder.class, new
Class[]{String.class}, new Object[]{encoderName});//try instantiating it
+ }
+
+ SimpleTextEncodingVectorizer vectorizer = new
SimpleTextEncodingVectorizer();
+ VectorizerConfig config = new VectorizerConfig(conf,
analyzerClass.getName(), encoderClass, encoderName, sequentialAccessOutput,
+ namedVectors, cardinality);
+
+ vectorizer.createVectors(input, output, config);
+
return 0;
}
-
+
}
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java?rev=1197839&view=auto
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
(added)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
Sat Nov 5 00:21:17 2011
@@ -0,0 +1,81 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;
+
+import java.io.IOException;
+
+/**
+* The Mapper that does the work of encoding text
+*
+**/
+public class EncodingMapper extends Mapper<Text, Text, Text, VectorWritable> {
+ public static final String USE_NAMED_VECTORS = "namedVectors";
+ public static final String USE_SEQUENTIAL = "sequential";
+ public static final String ANALYZER_NAME = "analyzer";
+ public static final String ENCODER_FIELD_NAME = "encoderFieldName";
+ public static final String ENCODER_CLASS = "encoderClass";
+ public static final String CARDINALITY = "cardinality";
+ boolean sequentialVecs, namedVectors;
+ FeatureVectorEncoder encoder;
+ int cardinality;
+
+ @Override
+ protected void setup(Context context) throws IOException,
InterruptedException {
+ Configuration conf = context.getConfiguration();
+ sequentialVecs = conf.getBoolean(USE_SEQUENTIAL, false);
+ namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false);
+ String analyzerName = conf.get(ANALYZER_NAME,
StandardAnalyzer.class.getName());
+ Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
+ String encoderName = conf.get(ENCODER_FIELD_NAME, "text");
+ cardinality = conf.getInt(CARDINALITY, 5000);
+ String encClass = conf.get(ENCODER_CLASS);
+ encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class,
new Class[]{String.class}, new Object[]{encoderName});
+ if (encoder instanceof LuceneTextValueEncoder){
+ ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer);
+ }
+ }
+
+ @Override
+ protected void map(Text key, Text value, Context context) throws
IOException, InterruptedException {
+ Vector vector = null;
+ if (sequentialVecs) {
+ vector = new SequentialAccessSparseVector(cardinality);
+ } else {
+ vector = new RandomAccessSparseVector(cardinality);
+ }
+ if (namedVectors){
+ vector = new NamedVector(vector, key.toString());
+ }
+ encoder.addToVector(value.toString(), vector);
+ context.write(new Text(key.toString()), new VectorWritable(vector));
+ }
+}
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java?rev=1197839&view=auto
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
(added)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
Sat Nov 5 00:21:17 2011
@@ -0,0 +1,66 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs a map-only Map/Reduce job that encodes the input with a
+ * {@link org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder} and writes
+ * the resulting vectors to the output as a sequence file.
+ *<p/>
+ * Only works on basic text, where the value in the SequenceFile is a blob of text.
+ */
+//TODO: find commonalities w/ DictionaryVectorizer and abstract them out
+public class SimpleTextEncodingVectorizer implements Vectorizer {
+  private static final Logger log = LoggerFactory.getLogger(SimpleTextEncodingVectorizer.class);
+
+  public SimpleTextEncodingVectorizer() {
+  }
+
+  /**
+   * Configures and runs the encoding job, blocking until it completes.
+   * <p/>
+   * The settings in {@code config} are copied into the job configuration under the keys
+   * that {@link EncodingMapper} reads. The job runs with zero reducers, so the mapper
+   * output is the final output.
+   *
+   * @param input  sequence file(s) to encode
+   * @param output location the encoded vectors are written to
+   * @param config supplies the base Hadoop configuration plus the analyzer, encoder,
+   *               vector layout and cardinality settings
+   * @throws IllegalStateException if the job does not complete successfully
+   * @throws Exception             if the job cannot be prepared or submitted
+   */
+  @Override
+  public void createVectors(final Path input, final Path output, final VectorizerConfig config) throws Exception {
+    //do this for convenience of using prepareJob
+    Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, EncodingMapper.class,
+        Text.class, VectorWritable.class, SequenceFileOutputFormat.class, config.conf);
+    Configuration conf = job.getConfiguration();
+    conf.setBoolean(EncodingMapper.USE_SEQUENTIAL, config.sequentialAccess);
+    conf.setBoolean(EncodingMapper.USE_NAMED_VECTORS, config.namedVectors);
+    conf.set(EncodingMapper.ANALYZER_NAME, config.analyzerClassName);
+    conf.set(EncodingMapper.ENCODER_FIELD_NAME, config.encoderName);
+    conf.set(EncodingMapper.ENCODER_CLASS, config.encoderClass);
+    conf.setInt(EncodingMapper.CARDINALITY, config.cardinality);
+    job.setNumReduceTasks(0);
+    boolean finished = job.waitForCompletion(true);
+
+    log.info("result of run: {}", finished);
+    // A failed job leaves no usable vectors behind, so report it instead of returning normally.
+    if (!finished) {
+      throw new IllegalStateException("Encoding job failed for input " + input);
+    }
+  }
+
+
+}
+
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
Sat Nov 5 00:21:17 2011
@@ -234,18 +234,18 @@ public final class SparseVectorsFromSequ
Configuration conf = getConf();
Path tokenizedPath = new Path(outputDir,
DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
+ //TODO: move this into DictionaryVectorizer , and then fold
SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass,
tokenizedPath, conf);
-
+
boolean sequentialAccessOutput = false;
if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
sequentialAccessOutput = true;
}
-
+
boolean namedVectors = false;
if (cmdLine.hasOption(namedVectorOpt)) {
namedVectors = true;
}
-
if (!processIdf) {
DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
outputDir, conf, minSupport, maxNGramSize,
minLLRValue, norm, logNormalize, reduceTasks, chunkSize,
sequentialAccessOutput, namedVectors);
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java?rev=1197839&view=auto
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
(added)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
Sat Nov 5 00:21:17 2011
@@ -0,0 +1,28 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Turns the data found at an input path into vectors written to an output path.
+ * <p/>
+ * NOTE(review): the interface itself fixes no input or output format; that is left to
+ * each implementation -- confirm against the implementors.
+ **/
+public interface Vectorizer {
+ /**
+  * Creates vectors from {@code input} and stores them under {@code output}.
+  *
+  * @param input where the data to vectorize is read from
+  * @param output where the vectors are written
+  * @param config settings for the run; an implementation need not use every field
+  * @throws Exception declared broadly; the concrete failures depend on the implementation
+  */
+ void createVectors(Path input, Path output, VectorizerConfig config) throws
Exception;
+}
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java?rev=1197839&view=auto
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
(added)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
Sat Nov 5 00:21:17 2011
@@ -0,0 +1,51 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * The config for a Vectorizer. Not all implementations need use all
variables.
+ *
+ **/
+public class VectorizerConfig {
+ public Configuration conf;
+ public String analyzerClassName;
+ public String encoderName;
+ public boolean sequentialAccess, namedVectors;
+ public int cardinality;
+ public String encoderClass;
+ public int minSupport;
+ public int maxNGramSize;
+ public float minLLRValue;
+ public float normPower;
+ public boolean logNormalize;
+ public int numReducers;
+ public int chunkSizeInMegabytes;
+
+
+ public VectorizerConfig(Configuration conf, String analyzerClassName, String
encoderClass, String encoderName, boolean sequentialAccess, boolean
namedVectors, int cardinality) {
+ this.conf = conf;
+ this.analyzerClassName = analyzerClassName;
+ this.encoderClass = encoderClass;
+ this.encoderName = encoderName;
+ this.sequentialAccess = sequentialAccess;
+ this.namedVectors = namedVectors;
+ this.cardinality = cardinality;
+ }
+}
Copied:
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java
(from r1197293,
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java)
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java?p2=mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java&p1=mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java&r1=1197293&r2=1197839&rev=1197839&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java
Sat Nov 5 00:21:17 2011
@@ -17,20 +17,29 @@
package org.apache.mahout.vectorizer;
-import java.util.LinkedList;
-import java.util.List;
-
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;
-public class SparseVectorsFromSequenceFilesTest extends MahoutTestCase {
+import java.util.LinkedList;
+import java.util.List;
+
+public class EncodedVectorsFromSequenceFilesTest extends MahoutTestCase {
private static final int NUM_DOCS = 100;
@@ -60,28 +69,28 @@ public class SparseVectorsFromSequenceFi
@Test
- public void testCreateTermFrequencyVectors() throws Exception {
+ public void testCreate() throws Exception {
runTest(false, false);
}
@Test
- public void testCreateTermFrequencyVectorsNam() throws Exception {
+ public void testCreateNamed() throws Exception {
runTest(false, true);
}
@Test
- public void testCreateTermFrequencyVectorsSeq() throws Exception {
+ public void testCreateSeq() throws Exception {
runTest(true, false);
}
@Test
- public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
+ public void testCreateSeqNamed() throws Exception {
runTest(true, true);
}
private void runTest(boolean sequential, boolean named) throws Exception {
- Path outputPath = getTestTempFilePath("output");
-
+ Path tmpPath = getTestTempDirPath();
+ Path outputPath = new Path(tmpPath, "output");
List<String> argList = new LinkedList<String>();
argList.add("-i");
@@ -98,13 +107,20 @@ public class SparseVectorsFromSequenceFi
}
String[] args = argList.toArray(new String[argList.size()]);
-
- SparseVectorsFromSequenceFiles.main(args);
- Path tfVectors = new Path(outputPath, "tf-vectors");
- Path tfidfVectors = new Path(outputPath, "tfidf-vectors");
-
- DictionaryVectorizerTest.validateVectors(conf, NUM_DOCS, tfVectors,
sequential, named);
- DictionaryVectorizerTest.validateVectors(conf, NUM_DOCS, tfidfVectors,
sequential, named);
+ EncodedVectorsFromSequenceFiles.main(args);
+
+ SequenceFileDirIterator<Text, VectorWritable> iter = new
SequenceFileDirIterator<Text, VectorWritable>(outputPath, PathType.LIST,
PathFilters.partFilter(), null, true, conf);
+ int seen = 0;
+ while (iter.hasNext()) {
+ Pair<Text, VectorWritable> next = iter.next();
+ if (sequential && !named){
+ assertTrue(next.getSecond().get() instanceof
SequentialAccessSparseVector);
+ } else if (named){
+ assertTrue(next.getSecond().get() instanceof NamedVector);
+ }
+ seen++;
+ }
+ assertEquals("Missed some vectors", NUM_DOCS, seen);
}
}
Modified: mahout/trunk/examples/bin/build-asf-email.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-asf-email.sh?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-asf-email.sh (original)
+++ mahout/trunk/examples/bin/build-asf-email.sh Sat Nov 5 00:21:17 2011
@@ -98,60 +98,92 @@ elif [ "x$alg" == "xclustering" ]; then
#classification
elif [ "x$alg" == "xclassification" ]; then
- algorithm=( standard complementary )
+ algorithm=( standard complementary sgd )
echo "Please select a number to choose the corresponding algorithm to run"
echo "1. ${algorithm[0]}"
echo "2. ${algorithm[1]}"
+# echo "3. ${algorithm[2]}"
read -p "Enter your choice : " choice
echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
- nbalg=${algorithm[$choice-1]}
-
- CLASS="$OUT/classification/"
- MAIL_OUT="$CLASS/seq-files"
- SEQ2SP="$CLASS/seq2sparse"
- SEQ2SPLABEL="$CLASS/labeled"
- SPLIT="$CLASS/splits"
- TRAIN="$SPLIT/train"
- TEST="$SPLIT/test"
- TEST_OUT="$CLASS/test-results"
- LABEL="$SPLIT/labels"
+ classAlg=${algorithm[$choice-1]}
#Convert mail to be formatted as:
# label\ttext
# One per line
# the label is the project_name_mailing_list, as in tomcat.apache.org_dev
- if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
- echo "Converting Mail files to Sequence Files"
- $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
- fi
#Convert to vectors
- if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
- echo "Converting the files to sparse vectors"
- $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+ if [ "x$classAlg" == "xstandard" ] || [ "x$classAlg" == "xcomplementary" ];
then
+ CLASS="$OUT/classification/bayesian"
+ MAIL_OUT="$CLASS/seq-files"
+ SEQ2SP="$CLASS/seq2sparse"
+ SEQ2SPLABEL="$CLASS/labeled"
+ SPLIT="$CLASS/splits"
+ TRAIN="$SPLIT/train"
+ TEST="$SPLIT/test"
+ TEST_OUT="$CLASS/test-results"
+ LABEL="$SPLIT/labels"
+ if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ echo "Converting Mail files to Sequence Files"
+ $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+ fi
+ if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+ echo "Converting the files to sparse vectors"
+ $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+ #We need to modify the vectors to have a better label
+ echo "Converting vector labels"
+ $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite
--maxItemsPerLabel 1000
+ fi
+ if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+ #setup train/test files
+ echo "Creating training and test inputs"
+ $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
+ fi
+ MODEL="$CLASS/model"
+ if [ "x$classAlg" == "xstandard" ]; then
+ echo "Running Standard Training"
+ $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite
+ echo "Running Test"
+ $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite
+
+ elif [ "x$classAlg" == "xcomplementary" ]; then
+ echo "Running Complementary Training"
+ $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite --trainComplementary
+ echo "Running Complementary Test"
+ $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite --testComplementary
+ fi
+ elif [ "x$classAlg" == "xsgd" ]; then
+ CLASS="$OUT/classification/sgd"
+ MAIL_OUT="$CLASS/seq-files"
+ SEQ2SP="$CLASS/seq2encoded"
+ SEQ2SPLABEL="$CLASS/labeled"
+ SPLIT="$CLASS/splits"
+ TRAIN="$SPLIT/train"
+ TEST="$SPLIT/test"
+ TEST_OUT="$CLASS/test-results"
+ LABEL="$SPLIT/labels"
+ if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+ echo "Converting Mail files to Sequence Files"
+ $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+ fi
+ echo "Converting the files to sparse vectors in $SEQ2SP"
+ $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName
org.apache.mahout.text.MailArchivesClusteringAnalyzer
#We need to modify the vectors to have a better label
echo "Converting vector labels"
- $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input
"$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite --maxItemsPerLabel
1000
- fi
- if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
- #setup train/test files
- echo "Creating training and test inputs"
- $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
- fi
- MODEL="$CLASS/model"
- if [ "x$nbalg" == "xstandard" ]; then
- echo "Running Standard Training"
- $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite
+ $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
+ if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+ #setup train/test files
+ echo "Creating training and test inputs from $SEQ2SPLABEL"
+ $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
+ fi
+ MODEL="$CLASS/model"
+
+ echo "Running SGD Training"
+ #$MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite
echo "Running Test"
- $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite
+#$MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL --overwrite
- elif [ "x$nbalg" == "xcomplementary" ]; then
- echo "Running Complementary Training"
- $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL
--overwrite --trainComplementary
- echo "Running Complementary Test"
- $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL
--overwrite --testComplementary
fi
-
fi
Modified: mahout/trunk/src/conf/driver.classes.props
URL:
http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.props?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.props (original)
+++ mahout/trunk/src/conf/driver.classes.props Sat Nov 5 00:21:17 2011
@@ -18,6 +18,7 @@ org.apache.mahout.utils.vectors.lucene.D
org.apache.mahout.utils.vectors.arff.Driver = arff.vector : Generate Vectors
from an ARFF file or directory
org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate
sequence files (of Text) from a directory
org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles = seq2sparse:
Sparse Vector generation from Text sequence files
+org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded:
Encoded Sparse Vector generation from Text sequence files
org.apache.mahout.utils.vectors.RowIdJob = rowid : Map
SequenceFile<Text,VectorWritable> to {SequenceFile<IntWritable,VectorWritable>,
SequenceFile<IntWritable,Text>}
org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump
to sequence file
org.apache.mahout.classifier.bayes.TestClassifier = testclassifier : Test the
text based Bayes Classifier