svn commit: r1197839 - in /mahout/trunk: core/src/main/java/org/apache/mahout/common/ core/src/main/java/org/apache/mahout/vectorizer/ core/src/test/java/org/apache/mahout/vectorizer/ examples/bin/ src/conf/

gsingers Fri, 04 Nov 2011 17:21:52 -0700

Author: gsingers
Date: Sat Nov  5 00:21:17 2011
New Revision: 1197839

URL: http://svn.apache.org/viewvc?rev=1197839&view=rev
Log:
MAHOUT-873: baseline of simple vectorization encoding capabilities


Added:
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
      - copied, changed from r1197293, 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java
      - copied, changed from r1197293, 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java
Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java
    mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
    mahout/trunk/examples/bin/build-asf-email.sh
    mahout/trunk/src/conf/driver.classes.props

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java 
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java 
Sat Nov  5 00:21:17 2011
@@ -408,31 +408,11 @@ public abstract class AbstractJob extend
                            Class<? extends Writable> mapperValue,
                            Class<? extends OutputFormat> outputFormat) throws 
IOException {
 
-    Job job = new Job(new Configuration(getConf()));
-    Configuration jobConf = job.getConfiguration();
-
-    if (mapper.equals(Mapper.class)) {
-        throw new IllegalStateException("Can't figure out the user class jar 
file from mapper/reducer");
-    }
-    job.setJarByClass(mapper);
-
-    job.setInputFormatClass(inputFormat);
-    jobConf.set("mapred.input.dir", inputPath.toString());
-
-    job.setMapperClass(mapper);
-    job.setMapOutputKeyClass(mapperKey);
-    job.setMapOutputValueClass(mapperValue);
-    job.setOutputKeyClass(mapperKey);
-    job.setOutputValueClass(mapperValue);
-    jobConf.setBoolean("mapred.compress.map.output", true);
-    job.setNumReduceTasks(0);
-
-    job.setJobName(getCustomJobName(job, mapper, Reducer.class));
-
-    job.setOutputFormatClass(outputFormat);
-    jobConf.set("mapred.output.dir", outputPath.toString());
-
+    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+            inputFormat, mapper, mapperKey, mapperValue, outputFormat, 
getConf());
+    job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), 
job, mapper, Reducer.class));
     return job;
+
   }
 
   protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends 
Mapper> mapper,
@@ -452,67 +432,12 @@ public abstract class AbstractJob extend
                            Class<? extends Writable> reducerKey,
                            Class<? extends Writable> reducerValue,
                            Class<? extends OutputFormat> outputFormat) throws 
IOException {
-
-    Job job = new Job(new Configuration(getConf()));
-    Configuration jobConf = job.getConfiguration();
-
-    if (reducer.equals(Reducer.class)) {
-      if (mapper.equals(Mapper.class)) {
-        throw new IllegalStateException("Can't figure out the user class jar 
file from mapper/reducer");
-      }
-      job.setJarByClass(mapper);
-    } else {
-      job.setJarByClass(reducer);
-    }
-
-    job.setInputFormatClass(inputFormat);
-    jobConf.set("mapred.input.dir", inputPath.toString());
-
-    job.setMapperClass(mapper);
-    job.setMapOutputKeyClass(mapperKey);
-    job.setMapOutputValueClass(mapperValue);
-
-    jobConf.setBoolean("mapred.compress.map.output", true);
-
-    job.setReducerClass(reducer);
-    job.setOutputKeyClass(reducerKey);
-    job.setOutputValueClass(reducerValue);
-
-    job.setJobName(getCustomJobName(job, mapper, reducer));
-
-    job.setOutputFormatClass(outputFormat);
-    jobConf.set("mapred.output.dir", outputPath.toString());
-
+    Job job = HadoopUtil.prepareJob(inputPath, outputPath,
+            inputFormat, mapper, mapperKey, mapperValue, reducer, reducerKey, 
reducerValue, outputFormat, getConf());
+    // Pass the reducer that was actually configured (not the Reducer base class) so the
+    // generated job name ends with that reducer's simple name.
+    job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), 
job, mapper, reducer));
     return job;
   }
 
-  private String getCustomJobName(JobContext job,Class<? extends Mapper> 
mapper) {
-    StringBuilder name = new StringBuilder(100);
-    String customJobName = job.getJobName();
-    if (customJobName == null || customJobName.trim().length() == 0) {
-      name.append(getClass().getSimpleName());
-    } else {
-      name.append(customJobName);
-    }
-    name.append('-').append(mapper.getSimpleName());
-    return name.toString();
-  }
-
-  private String getCustomJobName(JobContext job,
-                                  Class<? extends Mapper> mapper,
-                                  Class<? extends Reducer> reducer) {
-    StringBuilder name = new StringBuilder(100);
-    String customJobName = job.getJobName();
-    if (customJobName == null || customJobName.trim().isEmpty()) {
-      name.append(getClass().getSimpleName());
-    } else {
-      name.append(customJobName);
-    }
-    name.append('-').append(mapper.getSimpleName());
-    name.append('-').append(reducer.getSimpleName());
-    return name.toString();
-  }
-
   /**
    * necessary to make this job (having a combined input path) work on Amazon 
S3, hopefully this is obsolete when MultipleInputs is available
    * again

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java 
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/ClassUtils.java 
Sat Nov  5 00:21:17 2011
@@ -25,9 +25,23 @@ public final class ClassUtils {
 
   public static <T> T instantiateAs(String classname, Class<T> 
asSubclassOfClass) {
     try {
-      return 
Class.forName(classname).asSubclass(asSubclassOfClass).getConstructor().newInstance();
-    } catch (ClassNotFoundException cnfe) {
-      throw new IllegalStateException(cnfe);
+      return 
instantiateAs(Class.forName(classname).asSubclass(asSubclassOfClass), 
asSubclassOfClass);
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  public static <T> T instantiateAs(String classname, Class<T> 
asSubclassOfClass, Class[] params, Object[] args) {
+    try {
+      return 
instantiateAs(Class.forName(classname).asSubclass(asSubclassOfClass), 
asSubclassOfClass, params, args);
+    } catch (ClassNotFoundException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  public static <T> T instantiateAs(Class<? extends T> clazz, Class<T> 
asSubclassOfClass, Class[] params, Object[] args) {
+    try {
+      return 
clazz.asSubclass(asSubclassOfClass).getConstructor(params).newInstance(args);
     } catch (InstantiationException ie) {
       throw new IllegalStateException(ie);
     } catch (IllegalAccessException iae) {
@@ -39,6 +53,7 @@ public final class ClassUtils {
     }
   }
 
+
   public static <T> T instantiateAs(Class<? extends T> clazz, Class<T> 
asSubclassOfClass) {
     try {
       return 
clazz.asSubclass(asSubclassOfClass).getConstructor().newInstance();

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java 
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/HadoopUtil.java 
Sat Nov  5 00:21:17 2011
@@ -31,6 +31,12 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import 
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
 import 
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator;
@@ -43,6 +49,134 @@ public final class HadoopUtil {
 
   private HadoopUtil() { }
 
+  /**
+   * Create a map-only Hadoop Job out of the passed in parameters.  Does not set the
+   * Job name.
+   * @param inputPath path written to the job's "mapred.input.dir" property
+   * @param outputPath path written to the job's "mapred.output.dir" property
+   * @param inputFormat input format class set on the job
+   * @param mapper mapper class; also passed to setJarByClass, so it must not be {@code Mapper.class} itself
+   * @param mapperKey key class, set as both the map output key class and the job output key class
+   * @param mapperValue value class, set as both the map output value class and the job output value class
+   * @param outputFormat output format class set on the job
+   * @param conf base configuration; it is wrapped in a new Configuration before the Job is created
+   * @return the configured job, with zero reduce tasks and "mapred.compress.map.output" set to true
+   * @throws IOException declared by the signature; NOTE(review): presumably raised by Job construction - confirm
+   * @throws IllegalStateException if {@code mapper} is {@code Mapper.class}
+   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
+   */
+  public static Job prepareJob(Path inputPath,
+                           Path outputPath,
+                           Class<? extends InputFormat> inputFormat,
+                           Class<? extends Mapper> mapper,
+                           Class<? extends Writable> mapperKey,
+                           Class<? extends Writable> mapperValue,
+                           Class<? extends OutputFormat> outputFormat, 
Configuration conf) throws IOException {
+
+    Job job = new Job(new Configuration(conf));
+    Configuration jobConf = job.getConfiguration();
+
+    if (mapper.equals(Mapper.class)) {
+        throw new IllegalStateException("Can't figure out the user class jar 
file from mapper/reducer");
+    }
+    job.setJarByClass(mapper);
+
+    job.setInputFormatClass(inputFormat);
+    jobConf.set("mapred.input.dir", inputPath.toString());
+
+    job.setMapperClass(mapper);
+    job.setMapOutputKeyClass(mapperKey);
+    job.setMapOutputValueClass(mapperValue);
+    job.setOutputKeyClass(mapperKey); // no reducer is configured, so the job output types are the mapper's
+    job.setOutputValueClass(mapperValue);
+    jobConf.setBoolean("mapred.compress.map.output", true);
+    job.setNumReduceTasks(0); // map-only job
+
+    job.setOutputFormatClass(outputFormat);
+    jobConf.set("mapred.output.dir", outputPath.toString());
+
+    return job;
+  }
+
+  /**
+   * Create a map and reduce Hadoop job.  Does not set the name on the job.
+   * @param inputPath path written to the job's "mapred.input.dir" property
+   * @param outputPath path written to the job's "mapred.output.dir" property
+   * @param inputFormat input format class set on the job
+   * @param mapper mapper class; passed to setJarByClass only when {@code reducer} is {@code Reducer.class}
+   * @param mapperKey map output key class
+   * @param mapperValue map output value class
+   * @param reducer reducer class; passed to setJarByClass unless it is {@code Reducer.class} itself
+   * @param reducerKey job output key class
+   * @param reducerValue job output value class
+   * @param outputFormat output format class set on the job
+   * @param conf base configuration; it is wrapped in a new Configuration before the Job is created
+   * @return the configured job, with "mapred.compress.map.output" set to true
+   * @throws IOException declared by the signature; NOTE(review): presumably raised by Job construction - confirm
+   * @throws IllegalStateException if {@code reducer} is {@code Reducer.class} and {@code mapper} is {@code Mapper.class}
+   * @see #getCustomJobName(String, org.apache.hadoop.mapreduce.JobContext, Class, Class)
+   * @see #prepareJob(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path, Class, Class, Class, Class, Class, org.apache.hadoop.conf.Configuration)
+   */
+  public static Job prepareJob(Path inputPath,
+                           Path outputPath,
+                           Class<? extends InputFormat> inputFormat,
+                           Class<? extends Mapper> mapper,
+                           Class<? extends Writable> mapperKey,
+                           Class<? extends Writable> mapperValue,
+                           Class<? extends Reducer> reducer,
+                           Class<? extends Writable> reducerKey,
+                           Class<? extends Writable> reducerValue,
+                           Class<? extends OutputFormat> outputFormat,
+                           Configuration conf) throws IOException {
+
+    Job job = new Job(new Configuration(conf));
+    Configuration jobConf = job.getConfiguration();
+
+    if (reducer.equals(Reducer.class)) { // base Reducer given: fall back to the mapper for the jar lookup
+      if (mapper.equals(Mapper.class)) {
+        throw new IllegalStateException("Can't figure out the user class jar 
file from mapper/reducer");
+      }
+      job.setJarByClass(mapper);
+    } else {
+      job.setJarByClass(reducer);
+    }
+
+    job.setInputFormatClass(inputFormat);
+    jobConf.set("mapred.input.dir", inputPath.toString());
+
+    job.setMapperClass(mapper);
+    job.setMapOutputKeyClass(mapperKey);
+    job.setMapOutputValueClass(mapperValue);
+
+    jobConf.setBoolean("mapred.compress.map.output", true);
+
+    job.setReducerClass(reducer);
+    job.setOutputKeyClass(reducerKey);
+    job.setOutputValueClass(reducerValue);
+
+    job.setOutputFormatClass(outputFormat);
+    jobConf.set("mapred.output.dir", outputPath.toString());
+
+    return job;
+  }
+
+
+  /**
+   * Builds a job name of the form {@code <prefix>-<mapper simple name>-<reducer simple name>}.
+   * The prefix is the name already set on {@code job}; when that name is null or empty after
+   * trimming, {@code className} is used instead.
+   *
+   * @param className fallback prefix used when the job has no name of its own
+   * @param job context whose current job name is read
+   * @param mapper mapper class whose simple name is appended
+   * @param reducer reducer class whose simple name is appended
+   * @return the assembled job name
+   */
+  public static String getCustomJobName(String className, JobContext job,
+                                  Class<? extends Mapper> mapper,
+                                  Class<? extends Reducer> reducer) {
+    StringBuilder name = new StringBuilder(100);
+    String customJobName = job.getJobName();
+    if (customJobName == null || customJobName.trim().isEmpty()) {
+      name.append(className);
+    } else {
+      name.append(customJobName);
+    }
+    name.append('-').append(mapper.getSimpleName());
+    name.append('-').append(reducer.getSimpleName());
+    return name.toString();
+  }
+
+
   public static void delete(Configuration conf, Iterable<Path> paths) throws 
IOException {
     if (conf == null) {
       conf = new Configuration();

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
 Sat Nov  5 00:21:17 2011
@@ -60,7 +60,8 @@ import org.apache.mahout.vectorizer.term
  * This is a dictionary based Vectorizer.
  * 
  */
-public final class DictionaryVectorizer {
+
+public final class DictionaryVectorizer implements Vectorizer{
   
   public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
   
@@ -88,10 +89,17 @@ public final class DictionaryVectorizer 
   /**
    * Cannot be initialized. Use the static functions
    */
-  private DictionaryVectorizer() {
+  public DictionaryVectorizer() {
 
   }
-  
+  //TODO: move more of SparseVectorsFromSequenceFile in here, and then fold 
SparseVectorsFrom with EncodedVectorsFrom to have one framework.
+
+  @Override
+  public void createVectors(Path input, Path output, VectorizerConfig config) 
throws Exception {
+    createTermFrequencyVectors(input, output, config.conf, config.minSupport, 
config.maxNGramSize,
+            config.minLLRValue, config.normPower, config.logNormalize, 
config.numReducers, config.chunkSizeInMegabytes, config.sequentialAccess, 
config.namedVectors);
+  }
+
   /**
    * Create Term Frequency (Tf) Vectors from the input set of documents in 
{@link SequenceFile} format. This
    * tries to fix the maximum memory used by the feature chunk per node 
thereby splitting the process across

Copied: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
 (from r1197293, 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java)
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?p2=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java&p1=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java&r1=1197293&r2=1197839&rev=1197839&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
 Sat Nov  5 00:21:17 2011
@@ -17,252 +17,89 @@
 
 package org.apache.mahout.vectorizer;
 
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.ClassUtils;
-import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
-import org.apache.mahout.vectorizer.collocations.llr.LLRReducer;
-import org.apache.mahout.vectorizer.common.PartialVectorMerger;
-import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
  * Converts a given set of sequence files into SparseVectors
  */
-public final class SparseVectorsFromSequenceFiles extends AbstractJob {
-  
-  private static final Logger log = 
LoggerFactory.getLogger(SparseVectorsFromSequenceFiles.class);
-  
+public final class EncodedVectorsFromSequenceFiles extends AbstractJob {
+
+  private static final Logger log = 
LoggerFactory.getLogger(EncodedVectorsFromSequenceFiles.class);
+
   public static void main(String[] args) throws Exception {
-    ToolRunner.run(new SparseVectorsFromSequenceFiles(), args);
+    ToolRunner.run(new EncodedVectorsFromSequenceFiles(), args);
   }
-  
+
   @Override
   public int run(String[] args) throws Exception {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-    
-    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
-    
-    Option outputDirOpt = DefaultOptionCreator.outputOption().create();
-    
-    Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
-      
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Minimum Support. Default Value: 
2").withShortName("s").create();
-    
-    Option analyzerNameOpt = 
obuilder.withLongName("analyzerName").withArgument(
-      
abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The class name of the analyzer").withShortName("a").create();
-    
-    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withArgument(
-      
abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The chunkSize in MegaBytes. 100-10000 
MB").withShortName("chunk").create();
-    
-    Option weightOpt = 
obuilder.withLongName("weight").withRequired(false).withArgument(
-      
abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The kind of weight to use. Currently TF or 
TFIDF").withShortName("wt").create();
-    
-    Option minDFOpt = 
obuilder.withLongName("minDF").withRequired(false).withArgument(
-      
abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The minimum document frequency.  Default is 
1").withShortName("md").create();
-    
-    Option maxDFPercentOpt = 
obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-      
abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The max percentage of docs for the DF.  Can be used to remove really 
high frequency terms."
-          + " Expressed as an integer between 0 and 100. Default is 
99.").withShortName("x").create();
-    
-    Option minLLROpt = 
obuilder.withLongName("minLLR").withRequired(false).withArgument(
-      
abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional)The minimum Log Likelihood Ratio(Float)  Default is " + 
LLRReducer.DEFAULT_MIN_LLR)
-        .withShortName("ml").create();
-    
-    Option numReduceTasksOpt = 
obuilder.withLongName("numReducers").withArgument(
-      
abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Number of reduce tasks. Default Value: 
1").withShortName("nr").create();
-    
-    Option powerOpt = 
obuilder.withLongName("norm").withRequired(false).withArgument(
-      
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The norm to use, expressed as either a float or \"INF\" if you want to 
use the Infinite norm.  "
-          + "Must be greater or equal to 0.  The default is not to 
normalize").withShortName("n").create();
-    
-    Option logNormalizeOpt = 
obuilder.withLongName("logNormalize").withRequired(false)
-    .withDescription(
-      "(Optional) Whether output vectors should be logNormalize. If set true 
else false")
-    .withShortName("lnorm").create();
-    
-    Option maxNGramSizeOpt = 
obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
-      abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "(Optional) The maximum size of ngrams to create"
-              + " (2 = bigrams, 3 = trigrams, etc) Default 
Value:1").withShortName("ng").create();
-    
-    Option sequentialAccessVectorOpt = 
obuilder.withLongName("sequentialAccessVector").withRequired(false)
-        .withDescription(
-          "(Optional) Whether output vectors should be 
SequentialAccessVectors. If set true else false")
-        .withShortName("seq").create();
-    
-    Option namedVectorOpt = 
obuilder.withLongName("namedVector").withRequired(false)
-    .withDescription(
-      "(Optional) Whether output vectors should be NamedVectors. If set true 
else false")
-    .withShortName("nv").create();
-    
-    Option overwriteOutput = 
obuilder.withLongName("overwrite").withRequired(false).withDescription(
-      "If set, overwrite the output directory").withShortName("ow").create();
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
-        .create();
-    
-    Group group = 
gbuilder.withName("Options").withOption(minSupportOpt).withOption(analyzerNameOpt)
-        
.withOption(chunkSizeOpt).withOption(outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
-        
.withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt).withOption(minLLROpt)
-        
.withOption(numReduceTasksOpt).withOption(maxNGramSizeOpt).withOption(overwriteOutput)
-        
.withOption(helpOpt).withOption(sequentialAccessVectorOpt).withOption(namedVectorOpt)
-        .withOption(logNormalizeOpt)
-        .create();
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      parser.setHelpOption(helpOpt);
-      CommandLine cmdLine = parser.parse(args);
-      
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelp(group);
-        return -1;
-      }
-      
-      Path inputDir = new Path((String) cmdLine.getValue(inputDirOpt));
-      Path outputDir = new Path((String) cmdLine.getValue(outputDirOpt));
-      
-      int chunkSize = 100;
-      if (cmdLine.hasOption(chunkSizeOpt)) {
-        chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
-      }
-      int minSupport = 2;
-      if (cmdLine.hasOption(minSupportOpt)) {
-        String minSupportString = (String) cmdLine.getValue(minSupportOpt);
-        minSupport = Integer.parseInt(minSupportString);
-      }
-      
-      int maxNGramSize = 1;
-      
-      if (cmdLine.hasOption(maxNGramSizeOpt)) {
-        try {
-          maxNGramSize = 
Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
-        } catch (NumberFormatException ex) {
-          log.warn("Could not parse ngram size option");
-        }
-      }
-      log.info("Maximum n-gram size is: {}", maxNGramSize);
-      
-      if (cmdLine.hasOption(overwriteOutput)) {
-        HadoopUtil.delete(getConf(), outputDir);
-      }
-      
-      float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
-      if (cmdLine.hasOption(minLLROpt)) {
-        minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
-      }
-      log.info("Minimum LLR value: {}", minLLRValue);
-      
-      int reduceTasks = 1;
-      if (cmdLine.hasOption(numReduceTasksOpt)) {
-        reduceTasks = 
Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
-      }
-      log.info("Number of reduce tasks: {}", reduceTasks);
-
-      Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
-      if (cmdLine.hasOption(analyzerNameOpt)) {
-        String className = cmdLine.getValue(analyzerNameOpt).toString();
-        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
-        // try instantiating it, b/c there isn't any point in setting it if
-        // you can't instantiate it
-        ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
-      }
-      
-      boolean processIdf;
-      
-      if (cmdLine.hasOption(weightOpt)) {
-        String wString = cmdLine.getValue(weightOpt).toString();
-        if ("tf".equalsIgnoreCase(wString)) {
-          processIdf = false;
-        } else if ("tfidf".equalsIgnoreCase(wString)) {
-          processIdf = true;
-        } else {
-          throw new OptionException(weightOpt);
-        }
-      } else {
-        processIdf = true;
-      }
-      
-      int minDf = 1;
-      if (cmdLine.hasOption(minDFOpt)) {
-        minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
-      }
-      int maxDFPercent = 99;
-      if (cmdLine.hasOption(maxDFPercentOpt)) {
-        maxDFPercent = 
Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
-      }
-      
-      float norm = PartialVectorMerger.NO_NORMALIZING;
-      if (cmdLine.hasOption(powerOpt)) {
-        String power = cmdLine.getValue(powerOpt).toString();
-        if ("INF".equals(power)) {
-          norm = Float.POSITIVE_INFINITY;
-        } else {
-          norm = Float.parseFloat(power);
-        }
-      }
-      
-      boolean logNormalize = false;
-      if (cmdLine.hasOption(logNormalizeOpt)) {
-        logNormalize = true;
-      }
-
-      Configuration conf = getConf();
-      Path tokenizedPath = new Path(outputDir, 
DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
-      DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, 
tokenizedPath, conf);
-      
-      boolean sequentialAccessOutput = false;
-      if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
-        sequentialAccessOutput = true;
-      }
-      
-      boolean namedVectors = false;
-      if (cmdLine.hasOption(namedVectorOpt)) {
-        namedVectors = true;
-      }
-      
-      if (!processIdf) {
-        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, 
outputDir, conf, minSupport, maxNGramSize,
-          minLLRValue, norm, logNormalize, reduceTasks, chunkSize, 
sequentialAccessOutput, namedVectors);
-      } else if (processIdf) {
-        DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, 
outputDir, conf, minSupport, maxNGramSize,
-          minLLRValue, -1.0f, false, reduceTasks, chunkSize, 
sequentialAccessOutput, namedVectors);
-      
-        TFIDFConverter.processTfIdf(
-          new Path(outputDir, 
DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER),
-          outputDir, conf, chunkSize, minDf, maxDFPercent, norm, logNormalize,
-          sequentialAccessOutput, namedVectors, reduceTasks);
-      }
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      CommandLineUtil.printHelp(group);
+    addInputOption();
+    addOutputOption();
+    addOption("analyzerName", "an", "The class name of the analyzer", 
DefaultAnalyzer.class.getName());
+    addOption(buildOption("sequentialAccessVector", "seq", "(Optional) Whether 
output vectors should be SequentialAccessVectors. If set true else false", 
false, false, null));
+    addOption(buildOption("namedVector", "nv", "Create named vectors using the 
key.  False by default", false, false, null));
+    addOption("cardinality", "c", "The cardinality to use for creating the 
vectors.  Default is 5000", String.valueOf(5000));
+    addOption("encoderFieldName", "en", "The name of the encoder to be passed 
to the FeatureVectorEncoder constructor.  Default is text.  Note this is not 
the class name of a FeatureValueEncoder, but is instead the construction 
argument.", "text");
+    addOption("encoderClass", "ec", "The class name of the encoder to be used. 
Default is " + LuceneTextValueEncoder.class.getName(), 
LuceneTextValueEncoder.class.getName());
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Path input = getInputPath();
+    Path output = getOutputPath();
+
+    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
+      HadoopUtil.delete(getConf(), output);
+    }
+
+    Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
+    if (hasOption("analyzerName")) {
+      String className = getOption("analyzerName").toString();
+      analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+      // try instantiating it, b/c there isn't any point in setting it if
+      // you can't instantiate it
+      ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
+    }
+
+
+    Configuration conf = getConf();
+
+    boolean sequentialAccessOutput = hasOption("sequentialAccessVector");
+
+
+    boolean namedVectors = hasOption("namedVector");
+    int cardinality = 5000;
+    if (hasOption("cardinality")){
+      cardinality = Integer.parseInt(getOption("cardinality"));
+    }
+    String encoderName = "text";
+    if (hasOption("encoderFieldName")){
+      encoderName = getOption("encoderFieldName");
     }
+    String encoderClass = LuceneTextValueEncoder.class.getName();
+    if (hasOption("encoderClass")){
+      encoderClass = getOption("encoderClass");
+      ClassUtils.instantiateAs(encoderClass, FeatureVectorEncoder.class, new 
Class[]{String.class}, new Object[]{encoderName});//try instantiating it
+    }
+
+    SimpleTextEncodingVectorizer vectorizer = new 
SimpleTextEncodingVectorizer();
+    VectorizerConfig config = new VectorizerConfig(conf, 
analyzerClass.getName(), encoderClass, encoderName, sequentialAccessOutput,
+            namedVectors, cardinality);
+
+    vectorizer.createVectors(input, output, config);
+
     return 0;
   }
-  
+
 }

Added: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java?rev=1197839&view=auto
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
 (added)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodingMapper.java
 Sat Nov  5 00:21:17 2011
@@ -0,0 +1,81 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.ClassUtils;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
+import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;
+
+import java.io.IOException;
+
+/**
+ * Map-only worker that encodes the text value of each input record into a sparse feature vector.
+ * <p/>
+ * Every setting is read from the job {@link Configuration} in {@code setup}; each record is then
+ * encoded independently in {@code map} and emitted under its original key.
+ */
+public class EncodingMapper extends Mapper<Text, Text, Text, VectorWritable> {
+  /** Boolean job property: when true, each vector is wrapped in a {@link NamedVector} named after the key. */
+  public static final String USE_NAMED_VECTORS = "namedVectors";
+  /** Boolean job property: when true, vectors are {@link SequentialAccessSparseVector}s. */
+  public static final String USE_SEQUENTIAL = "sequential";
+  /** Job property holding the class name of the Lucene {@link Analyzer} to instantiate. */
+  public static final String ANALYZER_NAME = "analyzer";
+  /** Job property holding the name handed to the encoder's single-String constructor. */
+  public static final String ENCODER_FIELD_NAME = "encoderFieldName";
+  /** Job property holding the class name of the {@link FeatureVectorEncoder} to instantiate. */
+  public static final String ENCODER_CLASS = "encoderClass";
+  /** Integer job property: the size each output vector is created with. */
+  public static final String CARDINALITY = "cardinality";
+  boolean sequentialVecs, namedVectors;
+  FeatureVectorEncoder encoder;
+  int cardinality;
+
+  /**
+   * Reads the vector options from the job configuration and builds the encoder.
+   *
+   * @param context the Hadoop task context supplying the job configuration
+   */
+  @Override
+  protected void setup(Context context) throws IOException, InterruptedException {
+    Configuration conf = context.getConfiguration();
+    sequentialVecs = conf.getBoolean(USE_SEQUENTIAL, false);
+    namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false);
+    cardinality = conf.getInt(CARDINALITY, 5000);
+    // Default to DefaultAnalyzer, the same fallback the command-line driver uses. The previous
+    // fallback, StandardAnalyzer, needs a Lucene Version argument and so cannot be created by
+    // the no-arg reflective instantiation below.
+    String analyzerName = conf.get(ANALYZER_NAME, DefaultAnalyzer.class.getName());
+    Analyzer analyzer = ClassUtils.instantiateAs(analyzerName, Analyzer.class);
+    String encoderName = conf.get(ENCODER_FIELD_NAME, "text");
+    // Fall back to the driver's default encoder so a configuration that omits the property
+    // does not hand a null class name to instantiateAs.
+    String encClass = conf.get(ENCODER_CLASS, LuceneTextValueEncoder.class.getName());
+    encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class,
+        new Class[]{String.class}, new Object[]{encoderName});
+    // Only the Lucene-backed encoder takes an analyzer; other encoders are used as constructed.
+    if (encoder instanceof LuceneTextValueEncoder) {
+      ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer);
+    }
+  }
+
+  /**
+   * Encodes one record's text into a fresh vector and writes it out under the same key.
+   *
+   * @param key     the record key; also used as the vector name when named vectors are enabled
+   * @param value   the text to encode
+   * @param context the Hadoop task context the vector is written to
+   */
+  @Override
+  protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
+    Vector vector = sequentialVecs
+        ? new SequentialAccessSparseVector(cardinality)
+        : new RandomAccessSparseVector(cardinality);
+    if (namedVectors) {
+      vector = new NamedVector(vector, key.toString());
+    }
+    encoder.addToVector(value.toString(), vector);
+    // The key is written as-is; copying it into a new Text first added nothing.
+    context.write(key, new VectorWritable(vector));
+  }
+}

Added: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java?rev=1197839&view=auto
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
 (added)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SimpleTextEncodingVectorizer.java
 Sat Nov  5 00:21:17 2011
@@ -0,0 +1,66 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.math.VectorWritable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs a Map/Reduce job that encodes the input with a
+ * {@link org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder} and writes the
+ * result to the output as a sequence file.
+ *<p/>
+ * Only works on basic text, where the value in the SequenceFile is a blob of text.
+ */
+//TODO: find commonalities w/ DictionaryVectorizer and abstract them out
+public class SimpleTextEncodingVectorizer implements Vectorizer {
+  private static final Logger log = LoggerFactory.getLogger(SimpleTextEncodingVectorizer.class);
+
+  public SimpleTextEncodingVectorizer() {
+  }
+
+
+  /**
+   * Configures and runs the encoding job with zero reducers, using {@link EncodingMapper}.
+   *
+   * @param input  path of the sequence files to encode
+   * @param output path the encoded vectors are written to
+   * @param config source of the Hadoop configuration and of the analyzer, encoder and vector
+   *               options that are copied into the job for the mapper to read
+   * @throws IllegalStateException if the job runs but does not complete successfully
+   * @throws Exception             if the job cannot be prepared or submitted
+   */
+  @Override
+  public void createVectors(final Path input, final Path output, final VectorizerConfig config) throws Exception {
+    //do this for convenience of using prepareJob
+    Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, EncodingMapper.class,
+            Text.class, VectorWritable.class, SequenceFileOutputFormat.class, config.conf);
+    // Settings go on the job's own configuration, which is what the mapper's setup reads.
+    Configuration conf = job.getConfiguration();
+    conf.setBoolean(EncodingMapper.USE_SEQUENTIAL, config.sequentialAccess);
+    conf.setBoolean(EncodingMapper.USE_NAMED_VECTORS, config.namedVectors);
+    conf.set(EncodingMapper.ANALYZER_NAME, config.analyzerClassName);
+    conf.set(EncodingMapper.ENCODER_FIELD_NAME, config.encoderName);
+    conf.set(EncodingMapper.ENCODER_CLASS, config.encoderClass);
+    conf.setInt(EncodingMapper.CARDINALITY, config.cardinality);
+    job.setNumReduceTasks(0);
+    boolean finished = job.waitForCompletion(true);
+
+    log.info("result of run: {}", finished);
+    // Surface a failed job to the caller instead of returning as if vectors had been written.
+    if (!finished) {
+      throw new IllegalStateException("Encoding job failed for input " + input);
+    }
+  }
+
+
+}
+

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.java
 Sat Nov  5 00:21:17 2011
@@ -234,18 +234,18 @@ public final class SparseVectorsFromSequ
 
       Configuration conf = getConf();
       Path tokenizedPath = new Path(outputDir, 
DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
+      //TODO: move this into DictionaryVectorizer , and then fold 
SparseVectorsFrom with EncodedVectorsFrom to have one framework for all of this.
       DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, 
tokenizedPath, conf);
-      
+
       boolean sequentialAccessOutput = false;
       if (cmdLine.hasOption(sequentialAccessVectorOpt)) {
         sequentialAccessOutput = true;
       }
-      
+
       boolean namedVectors = false;
       if (cmdLine.hasOption(namedVectorOpt)) {
         namedVectors = true;
       }
-      
       if (!processIdf) {
         DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, 
outputDir, conf, minSupport, maxNGramSize,
           minLLRValue, norm, logNormalize, reduceTasks, chunkSize, 
sequentialAccessOutput, namedVectors);

Added: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java?rev=1197839&view=auto
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java 
(added)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/Vectorizer.java 
Sat Nov  5 00:21:17 2011
@@ -0,0 +1,28 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Contract for components that create vectors from the data at an input path and
+ * store them at an output path, driven by a {@link VectorizerConfig}.
+ **/
+public interface Vectorizer {
+  /**
+   * Creates vectors from {@code input} and writes them to {@code output}.
+   *
+   * @param input  location of the data to vectorize
+   * @param output location the vectors are written to
+   * @param config settings for the run; which of its fields are consulted depends on the implementation
+   * @throws Exception if vector creation fails; the concrete causes are implementation specific
+   */
+  void createVectors(Path input, Path output, VectorizerConfig config) throws 
Exception;
+}

Added: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java?rev=1197839&view=auto
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
 (added)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/VectorizerConfig.java
 Sat Nov  5 00:21:17 2011
@@ -0,0 +1,51 @@
+package org.apache.mahout.vectorizer;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Plain settings holder handed to a Vectorizer.  Implementations read only the
+ * fields they care about; the constructor fills in the encoding-related ones and
+ * leaves every other field at its Java default.
+ **/
+public class VectorizerConfig {
+  /** Hadoop configuration the vectorization job is built from. */
+  public Configuration conf;
+
+  // ---- populated by the constructor ----
+  /** Class name of the analyzer to use. */
+  public String analyzerClassName;
+  /** Class name of the feature encoder to use. */
+  public String encoderClass;
+  /** Name given to the feature encoder. */
+  public String encoderName;
+  /** Whether sequential-access vectors are requested. */
+  public boolean sequentialAccess;
+  /** Whether named vectors are requested. */
+  public boolean namedVectors;
+  /** Cardinality of the vectors to create. */
+  public int cardinality;
+
+  // ---- never set by the constructor ----
+  // NOTE(review): presumably these mirror the DictionaryVectorizer options of the same
+  // names; nothing in this class assigns or reads them -- confirm against their users.
+  public int minSupport;
+  public int maxNGramSize;
+  public float minLLRValue;
+  public float normPower;
+  public boolean logNormalize;
+  public int numReducers;
+  public int chunkSizeInMegabytes;
+
+
+  /**
+   * Builds a config carrying the settings needed for encoded vectorization.
+   *
+   * @param conf              Hadoop configuration to build the job from
+   * @param analyzerClassName class name of the analyzer
+   * @param encoderClass      class name of the feature encoder
+   * @param encoderName       name given to the feature encoder
+   * @param sequentialAccess  whether sequential-access vectors are requested
+   * @param namedVectors      whether named vectors are requested
+   * @param cardinality       cardinality of the vectors to create
+   */
+  public VectorizerConfig(Configuration conf,
+                          String analyzerClassName,
+                          String encoderClass,
+                          String encoderName,
+                          boolean sequentialAccess,
+                          boolean namedVectors,
+                          int cardinality) {
+    this.conf = conf;
+    this.analyzerClassName = analyzerClassName;
+    this.encoderClass = encoderClass;
+    this.encoderName = encoderName;
+    this.sequentialAccess = sequentialAccess;
+    this.namedVectors = namedVectors;
+    this.cardinality = cardinality;
+  }
+}

Copied: 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java
 (from r1197293, 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java)
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java?p2=mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java&p1=mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java&r1=1197293&r2=1197839&rev=1197839&view=diff
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/SparseVectorsFromSequenceFilesTest.java
 (original)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFilesTest.java
 Sat Nov  5 00:21:17 2011
@@ -17,20 +17,29 @@
 
 package org.apache.mahout.vectorizer;
 
-import java.util.LinkedList;
-import java.util.List;
-
 import com.google.common.io.Closeables;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
+import org.apache.mahout.common.iterator.sequencefile.PathType;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.SequentialAccessSparseVector;
+import org.apache.mahout.math.VectorWritable;
 import org.junit.Before;
 import org.junit.Test;
 
-public class SparseVectorsFromSequenceFilesTest extends MahoutTestCase {
+import java.util.LinkedList;
+import java.util.List;
+
+public class EncodedVectorsFromSequenceFilesTest extends MahoutTestCase {
 
   private static final int NUM_DOCS = 100;
   
@@ -60,28 +69,28 @@ public class SparseVectorsFromSequenceFi
   
   
   @Test
-  public void testCreateTermFrequencyVectors() throws Exception {
+  public void testCreate() throws Exception {
     runTest(false, false);
   }
 
   @Test
-  public void testCreateTermFrequencyVectorsNam() throws Exception {
+  public void testCreateNamed() throws Exception {
     runTest(false, true);
   }
   
   @Test
-  public void testCreateTermFrequencyVectorsSeq() throws Exception {
+  public void testCreateSeq() throws Exception {
     runTest(true, false);
   }
   
   @Test
-  public void testCreateTermFrequencyVectorsSeqNam() throws Exception {
+  public void testCreateSeqNamed() throws Exception {
     runTest(true, true);
   }
   
   private void runTest(boolean sequential, boolean named) throws Exception {
-    Path outputPath = getTestTempFilePath("output");
-
+    Path tmpPath = getTestTempDirPath();
+    Path outputPath = new Path(tmpPath, "output");
     
     List<String> argList = new LinkedList<String>();
     argList.add("-i");
@@ -98,13 +107,20 @@ public class SparseVectorsFromSequenceFi
     }
     
     String[] args = argList.toArray(new String[argList.size()]);
-    
-    SparseVectorsFromSequenceFiles.main(args);
 
-    Path tfVectors = new Path(outputPath, "tf-vectors");
-    Path tfidfVectors = new Path(outputPath, "tfidf-vectors");
-    
-    DictionaryVectorizerTest.validateVectors(conf, NUM_DOCS, tfVectors, 
sequential, named);
-    DictionaryVectorizerTest.validateVectors(conf, NUM_DOCS, tfidfVectors, 
sequential, named);
+    EncodedVectorsFromSequenceFiles.main(args);
+
+    SequenceFileDirIterator<Text, VectorWritable> iter = new 
SequenceFileDirIterator<Text, VectorWritable>(outputPath, PathType.LIST, 
PathFilters.partFilter(), null, true, conf);
+    int seen = 0;
+    while (iter.hasNext()) {
+      Pair<Text, VectorWritable> next = iter.next();
+      if (sequential && !named){
+        assertTrue(next.getSecond().get() instanceof 
SequentialAccessSparseVector);
+      } else if (named){
+        assertTrue(next.getSecond().get() instanceof NamedVector);
+      }
+      seen++;
+    }
+    assertEquals("Missed some vectors", NUM_DOCS, seen);
   }  
 }

Modified: mahout/trunk/examples/bin/build-asf-email.sh
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-asf-email.sh?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-asf-email.sh (original)
+++ mahout/trunk/examples/bin/build-asf-email.sh Sat Nov  5 00:21:17 2011
@@ -98,60 +98,92 @@ elif [ "x$alg" == "xclustering" ]; then
 
 #classification
 elif [ "x$alg" == "xclassification" ]; then
-  algorithm=( standard complementary )
+  algorithm=( standard complementary sgd )
 
   echo "Please select a number to choose the corresponding algorithm to run"
   echo "1. ${algorithm[0]}"
   echo "2. ${algorithm[1]}"
+#  echo "3. ${algorithm[2]}"
   read -p "Enter your choice : " choice
 
   echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-  nbalg=${algorithm[$choice-1]}
-
-  CLASS="$OUT/classification/"
-  MAIL_OUT="$CLASS/seq-files"
-  SEQ2SP="$CLASS/seq2sparse"
-  SEQ2SPLABEL="$CLASS/labeled"
-  SPLIT="$CLASS/splits"
-  TRAIN="$SPLIT/train"
-  TEST="$SPLIT/test"
-  TEST_OUT="$CLASS/test-results"
-  LABEL="$SPLIT/labels"
+  classAlg=${algorithm[$choice-1]}
   #Convert mail to be formatted as:
   # label\ttext
   # One per line
   # the label is the project_name_mailing_list, as in tomcat.apache.org_dev
-  if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
-    echo "Converting Mail files to Sequence Files"
-    $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
-  fi
   #Convert to vectors
-  if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
-    echo "Converting the files to sparse vectors"
-    $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight 
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+  if [ "x$classAlg" == "xstandard" ] || [ "x$classAlg" == "xcomplementary" ]; 
then
+    CLASS="$OUT/classification/bayesian"
+    MAIL_OUT="$CLASS/seq-files"
+    SEQ2SP="$CLASS/seq2sparse"
+    SEQ2SPLABEL="$CLASS/labeled"
+    SPLIT="$CLASS/splits"
+    TRAIN="$SPLIT/train"
+    TEST="$SPLIT/test"
+    TEST_OUT="$CLASS/test-results"
+    LABEL="$SPLIT/labels"
+    if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+      echo "Converting Mail files to Sequence Files"
+      $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+    fi
+    if [ "x$OVER" == "xover" ] || [ ! -e "$SEQ2SP/dictionary.file-0" ]; then
+      echo "Converting the files to sparse vectors"
+      $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight 
TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
+      #We need to modify the vectors to have a better label
+      echo "Converting vector labels"
+      $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver 
--input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite 
--maxItemsPerLabel 1000
+    fi
+    if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+      #setup train/test files
+      echo "Creating training and test inputs"
+      $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput 
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
+    fi
+    MODEL="$CLASS/model"
+    if [ "x$classAlg" == "xstandard" ]; then
+      echo "Running Standard Training"
+      $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL 
--overwrite
+      echo "Running Test"
+      $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL 
--overwrite
+
+    elif [ "x$classAlg" == "xcomplementary"  ]; then
+      echo "Running Complementary Training"
+      $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL 
--overwrite --trainComplementary
+      echo "Running Complementary Test"
+      $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL 
--overwrite --testComplementary
+    fi
+  elif [ "x$classAlg" == "xsgd"  ]; then
+    CLASS="$OUT/classification/sgd"
+    MAIL_OUT="$CLASS/seq-files"
+    SEQ2SP="$CLASS/seq2encoded"
+    SEQ2SPLABEL="$CLASS/labeled"
+    SPLIT="$CLASS/splits"
+    TRAIN="$SPLIT/train"
+    TEST="$SPLIT/test"
+    TEST_OUT="$CLASS/test-results"
+    LABEL="$SPLIT/labels"
+    if [ "x$OVER" == "xover" ] || [ ! -e "$MAIL_OUT/chunk-0" ]; then
+      echo "Converting Mail files to Sequence Files"
+      $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset 
"UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+    fi
+    echo "Converting the files to sparse vectors in $SEQ2SP"
+    $MAHOUT seq2encoded --input $MAIL_OUT --output $SEQ2SP --analyzerName 
org.apache.mahout.text.MailArchivesClusteringAnalyzer
     #We need to modify the vectors to have a better label
     echo "Converting vector labels"
-    $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input 
"$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite --maxItemsPerLabel 
1000
-  fi
-  if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
-    #setup train/test files
-    echo "Creating training and test inputs"
-    $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput 
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
-  fi
-  MODEL="$CLASS/model"
-  if [ "x$nbalg" == "xstandard" ]; then
-    echo "Running Standard Training"
-    $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL 
--overwrite
+    $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input 
"$SEQ2SP" --output $SEQ2SPLABEL --overwrite
+    if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
+      #setup train/test files
+      echo "Creating training and test inputs from $SEQ2SPLABEL"
+      $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput 
$TEST --randomSelectionPct 20 --overwrite --sequenceFiles
+    fi
+    MODEL="$CLASS/model"
+
+    echo "Running SGD Training"
+    #$MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL 
--overwrite
     echo "Running Test"
-    $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL 
--overwrite
+#$MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL --overwrite
 
-  elif [ "x$nbalg" == "xcomplementary"  ]; then
-    echo "Running Complementary Training"
-    $MAHOUT trainnb -i $TRAIN -o $MODEL --extractLabels --labelIndex $LABEL 
--overwrite --trainComplementary
-    echo "Running Complementary Test"
-    $MAHOUT testnb -i $TEST -o $TEST_OUT -m $MODEL --labelIndex $LABEL 
--overwrite --testComplementary
   fi
-
 fi
 
 

Modified: mahout/trunk/src/conf/driver.classes.props
URL: 
http://svn.apache.org/viewvc/mahout/trunk/src/conf/driver.classes.props?rev=1197839&r1=1197838&r2=1197839&view=diff
==============================================================================
--- mahout/trunk/src/conf/driver.classes.props (original)
+++ mahout/trunk/src/conf/driver.classes.props Sat Nov  5 00:21:17 2011
@@ -18,6 +18,7 @@ org.apache.mahout.utils.vectors.lucene.D
 org.apache.mahout.utils.vectors.arff.Driver = arff.vector : Generate Vectors 
from an ARFF file or directory 
 org.apache.mahout.text.SequenceFilesFromDirectory = seqdirectory : Generate 
sequence files (of Text) from a directory
 org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles = seq2sparse: 
Sparse Vector generation from Text sequence files
+org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded: 
Encoded Sparse Vector generation from Text sequence files
 org.apache.mahout.utils.vectors.RowIdJob = rowid : Map 
SequenceFile<Text,VectorWritable> to {SequenceFile<IntWritable,VectorWritable>, 
SequenceFile<IntWritable,Text>}
 org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump 
to sequence file
 org.apache.mahout.classifier.bayes.TestClassifier = testclassifier : Test the 
text based Bayes Classifier

svn commit: r1197839 - in /mahout/trunk: core/src/main/java/org/apache/mahout/common/ core/src/main/java/org/apache/mahout/vectorizer/ core/src/test/java/org/apache/mahout/vectorizer/ examples/bin/ src/conf/

Reply via email to