Author: drew
Date: Tue Jun  8 02:20:01 2010
New Revision: 952511

URL: http://svn.apache.org/viewvc?rev=952511&view=rev
Log:
MAHOUT-404: AbstractJob Improvements

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
 Tue Jun  8 02:20:01 2010
@@ -39,6 +39,7 @@ import org.apache.mahout.cf.taste.hadoop
 import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
 import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
 import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.VarIntWritable;
 import org.apache.mahout.math.VarLongWritable;
 import org.apache.mahout.math.VectorWritable;
@@ -73,23 +74,18 @@ public final class RecommenderJob extend
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, 
InterruptedException {
     
-    Option numReccomendationsOpt = 
AbstractJob.buildOption("numRecommendations", "n",
-      "Number of recommendations per user",
+    addOption("numRecommendations", "n", "Number of recommendations per user",
       
String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
-    Option usersFileOpt = AbstractJob.buildOption("usersFile", "u",
-      "File of users to recommend for", null);
-    Option booleanDataOpt = AbstractJob.buildOption("booleanData", "b",
-      "Treat input as without pref values", Boolean.FALSE.toString());
-    Option maxPrefsPerUserConsideredOpt = 
AbstractJob.buildOption("maxPrefsPerUserConsidered", null,
+    addOption("usersFile", "u", "File of users to recommend for", null);
+    addOption("booleanData", "b", "Treat input as without pref values", 
Boolean.FALSE.toString());
+    addOption("maxPrefsPerUserConsidered", null,
       "Maximum number of preferences considered per user in final 
recommendation phase",
       
String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
-    Option maxCooccurrencesPerItemConsideredOpt = 
AbstractJob.buildOption("maxCooccurrencesPerItemConsidered", null,
+    addOption("maxCooccurrencesPerItemConsidered", null,
       "Maximum number of cooccurrences considered per item in count phase",
       
String.valueOf(UserVectorToCooccurrenceMapper.DEFAULT_MAX_COOCCURRENCES_PER_ITEM_CONSIDERED));
 
-    Map<String,String> parsedArgs = AbstractJob.parseArguments(
-        args, numReccomendationsOpt, usersFileOpt, booleanDataOpt,
-        maxPrefsPerUserConsideredOpt, maxCooccurrencesPerItemConsideredOpt);
+    Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
       return -1;
     }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
 Tue Jun  8 02:20:01 2010
@@ -31,6 +31,7 @@ import org.apache.hadoop.mapreduce.lib.o
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
 import org.apache.mahout.math.VarLongWritable;
 
@@ -106,15 +107,13 @@ public final class RecommenderJob extend
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, 
InterruptedException {
     
-    Option recommendClassOpt = AbstractJob.buildOption("recommenderClassName", 
"r",
-      "Name of recommender class to instantiate");
-    Option numReccomendationsOpt = 
AbstractJob.buildOption("numRecommendations", "n",
-      "Number of recommendations per user", "10");
-    Option usersFileOpt = AbstractJob.buildOption("usersFile", "u", "Number of 
recommendations per user",
-      null);
+    addOption(DefaultOptionCreator.inputOption().create());
+    addOption(DefaultOptionCreator.outputOption().create());
+    addOption("recommenderClassName", "r", "Name of recommender class to 
instantiate");
+    addOption("numRecommendations", "n", "Number of recommendations per user", 
"10");
+    addOption("usersFile", "u", "Number of recommendations per user", null);
     
-    Map<String,String> parsedArgs = AbstractJob.parseArguments(args, 
recommendClassOpt,
-      numReccomendationsOpt, usersFileOpt);
+    Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
       return -1;
     }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
 Tue Jun  8 02:20:01 2010
@@ -118,10 +118,9 @@ public final class ItemSimilarityJob ext
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, 
InterruptedException {
 
-    Option similarityClassOpt = AbstractJob.buildOption(
-        "similarityClassname", "s", "Name of distributed similarity class to 
instantiate");
+    addOption("similarityClassname", "s", "Name of distributed similarity 
class to instantiate");
 
-    Map<String,String> parsedArgs = AbstractJob.parseArguments(args, 
similarityClassOpt);
+    Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
       return -1;
     }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
 Tue Jun  8 02:20:01 2010
@@ -37,6 +37,7 @@ import org.apache.mahout.cf.taste.hadoop
 import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
 import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
 import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.VarLongWritable;
 
 public final class SlopeOneAverageDiffsJob extends AbstractJob {
@@ -44,7 +45,10 @@ public final class SlopeOneAverageDiffsJ
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, 
InterruptedException {
     
-    Map<String,String> parsedArgs = AbstractJob.parseArguments(args);
+    addInputOption();
+    addOutputOption();
+    
+    Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
       return -1;
     }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java 
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java 
Tue Jun  8 02:20:01 2010
@@ -18,6 +18,8 @@
 package org.apache.mahout.common;
 
 import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -75,50 +77,199 @@ public abstract class AbstractJob extend
   
   private static final Logger log = LoggerFactory.getLogger(AbstractJob.class);
 
-  protected static Option buildOption(String name, String shortName, String 
description) {
-    return buildOption(name, shortName, description, true, null);
+  /** option used to specify the input path */
+  private Option inputOption;
+  
+  /** option used to specify the output path */
+  private Option outputOption;
+  
+  /** input path, populated by {@link #parseArguments(String[])} */
+  private Path   inputPath; 
+  
+  /** output path, populated by {@link #parseArguments(String[])} */
+  private Path   outputPath;
+  
+  /** internal list of options that have been added */
+  private final List<Option> options;
+  
+  public AbstractJob() {
+    options = new LinkedList<Option>();
   }
   
-  protected static Option buildOption(String name, String shortName, String 
description, String defaultValue) {
-    return buildOption(name, shortName, description, false, defaultValue);
+  /** Returns the input path established by a call to {@link 
#parseArguments(String[])}.
+   *  The source of the path may be an input option added using {@link 
#addInputOption()}
+   *  or it may be the value of the <code>mapred.input.dir</code> configuration
+   *  property. 
+   * @return
+   */
+  public Path getInputPath() {
+    return inputPath;
+  }
+  
+  /** Returns the output path established by a call to {@link 
#parseArguments(String[])}.
+   *  The source of the path may be an output option added using {@link 
#addOutputOption()}
+   *  or it may be the value of the <code>mapred.output.dir</code> configuration
+   *  property. 
+   * @return
+   */
+  public Path getOutputPath() {
+    return outputPath;
+  }
+  
+  /** Add an option with no argument whose presence can be checked for using
+   *  the <code>containsKey</code> method on the map returned by 
+   *  {@link #parseArguments(String[])}.
+   *  
+   * @param name
+   * @param shortName
+   * @param description
+   */
+  public void addFlag(String name, String shortName, String description) {
+    options.add(buildOption(name, shortName, description, true, false, null));
+  }
+  
+  /** Add an option to the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called. This option has an 
argument
+   *  with null as its default value.
+   *  
+   * @param name
+   * @param shortName
+   * @param description
+   */
+  public void addOption(String name, String shortName, String description) {
+    options.add(buildOption(name, shortName, description, true, false, null));
+  }
+  
+  /** Add an option to the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called.
+   * 
+   * @param name
+   * @param shortName
+   * @param description
+   * @param required if true, {@link #parseArguments(String[])} will 
fail
+   *    with an error and usage message if this option is not specified
+   *    on the command line.
+   */
+  public void addOption(String name, String shortName, String description, 
boolean required) {
+    options.add(buildOption(name, shortName, description, true, required, 
null));
+  }
+  
+  /** Add an option to the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called. If this option is not 
+   *  specified on the command line the default value will be 
+   *  used.
+   *  
+   * @param name
+   * @param shortName
+   * @param description
+   * @param defaultValue the default argument value if this argument is not
+   *   found on the command-line. null is allowed.
+   */
+  public void addOption(String name, String shortName, String description, 
String defaultValue) {
+    options.add(buildOption(name, shortName, description, true, false, 
defaultValue));
   }
 
-  protected static Option buildOption(String name, String shortName, String 
description,
-                                      boolean required) {
-    return buildOption(name, shortName, description, required, null);
+  /** Add an arbitrary option to the set of options this job will parse when
+   *  {@link #parseArguments(String[])} is called. If this option has no
+   *  argument, use <code>containsKey</code> on the map returned by 
+   *  <code>parseArguments</code> to check for its presence. Otherwise, the
+   *  string value of the option will be placed in the map using a key
+   *  equal to this option's long name preceded by '--'.
+   * @param option
+   * @return the option added.
+   */
+  public Option addOption(Option option) {
+    options.add(option);
+    return option;
   }
   
-  protected static Option buildOption(String name,
-                                    String shortName,
-                                    String description,
-                                    boolean required,
-                                    String defaultValue) {
-    ArgumentBuilder argBuilder = new 
ArgumentBuilder().withName(name).withMinimum(1).withMaximum(1);
-    if (defaultValue != null) {
-      argBuilder = argBuilder.withDefault(defaultValue);
-    }
-    Argument arg = argBuilder.create();
-    DefaultOptionBuilder optBuilder = new 
DefaultOptionBuilder().withLongName(name).withRequired(required)
-        .withArgument(arg).withDescription(description);
+  /** Add the default input directory option, which takes a directory
+   *  name as an argument. When {@link #parseArguments(String[])} is 
+   *  called, the inputPath will be set based upon the value for this option.
+   *  If this method is called, the input is required. 
+   */
+  public void addInputOption() {
+    this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
+  }
+  
+  /** Add the default output directory option, '-o' which takes a directory
+   *  name as an argument. When {@link #parseArguments(String[])} is 
+   *  called, the outputPath will be set based upon the value for this option.
+   *  If this method is called, the output is required. 
+   */
+  public void addOutputOption() {
+    this.outputOption = 
addOption(DefaultOptionCreator.outputOption().create());
+  }
+
+  /** Build an option with the given parameters. Name and description are
+   *  required.
+   * 
+   * @param name the long name of the option prefixed with '--' on the 
command-line
+   * @param shortName the short name of the option, prefixed with '-' on the 
command-line
+   * @param description description of the option displayed in help method
+   * @param hasArg true if the option has an argument.
+   * @param required true if the option is required.
+   * @param defaultValue default argument value, can be null.
+   * @return the option.
+   */
+  private static Option buildOption(String name,
+                                      String shortName,
+                                      String description,
+                                      boolean hasArg,
+                                      boolean required,
+                                      String defaultValue) {
+
+    DefaultOptionBuilder optBuilder = new DefaultOptionBuilder()
+      .withLongName(name)
+      .withDescription(description)
+      .withRequired(required);
+      
     if (shortName != null) {
-      optBuilder = optBuilder.withShortName(shortName);
+      optBuilder.withShortName(shortName);
     }
+    
+    if (hasArg) {
+      ArgumentBuilder argBuilder = new ArgumentBuilder()
+        .withName(name)
+        .withMinimum(1)
+        .withMaximum(1);
+      
+      if (defaultValue != null) {
+        argBuilder = argBuilder.withDefault(defaultValue);
+      }
+      
+      optBuilder.withArgument(argBuilder.create());
+    }
+
     return optBuilder.create();
   }
   
-  protected static Map<String,String> parseArguments(String[] args, Option... 
extraOpts) {
+  /** Parse the arguments specified based on the options defined using the 
+   *  various <code>addOption</code> methods. If -h is specified or an 
+   *  exception is encountered, print help and return null. Has the 
+   *  side effect of setting inputPath and outputPath 
+   *  if <code>addInputOption</code> or <code>addOutputOption</code> 
+   *  or <code>mapred.input.dir</code> or <code>mapred.output.dir</code>
+   *  are present in the Configuration.
+   * 
+   * @param args
+   * @return a Map<String,String> containing options and their argument values.
+   *  The presence of a flag can be tested using <code>containsKey</code>, 
while
+   *  argument values can be retrieved using <code>get(optionName)</code>. The
+   *  names used for keys are the option name parameter prefixed by '--'.
+   *  
+   * 
+   */
+  public Map<String,String> parseArguments(String[] args) {
     
-    Option tempDirOpt = buildOption("tempDir", null, "Intermediate output 
directory", "temp");
-    Option helpOpt = DefaultOptionCreator.helpOption();
-    Option startPhase = buildOption("startPhase", null, "First phase to run", 
"0");
-    Option endPhase = buildOption("endPhase", null, "Last phase to run", 
String.valueOf(Integer.MAX_VALUE));
-
-    GroupBuilder gBuilder = new GroupBuilder().withName("Options")
-        .withOption(tempDirOpt)
-        .withOption(helpOpt)
-        .withOption(startPhase).withOption(endPhase);
+    Option helpOpt = addOption(DefaultOptionCreator.helpOption());
+    addOption("tempDir", null, "Intermediate output directory", "temp");
+    addOption("startPhase", null, "First phase to run", "0");
+    addOption("endPhase", null, "Last phase to run", 
String.valueOf(Integer.MAX_VALUE));
+
+    GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific 
Options:");
     
-    for (Option opt : extraOpts) {
+    for (Option opt : options) {
       gBuilder = gBuilder.withOption(opt);
     }
     
@@ -132,28 +283,59 @@ public abstract class AbstractJob extend
       cmdLine = parser.parse(args);
     } catch (OptionException e) {
       log.error(e.getMessage());
-      CommandLineUtil.printHelp(group);
+      CommandLineUtil.printHelpWithGenericOptions(group);
       return null;
     }
     
     if (cmdLine.hasOption(helpOpt)) {
-      CommandLineUtil.printHelp(group);
+      CommandLineUtil.printHelpWithGenericOptions(group);
       return null;
     }
     
     Map<String,String> result = new TreeMap<String,String>();
-    maybePut(result, cmdLine, tempDirOpt, helpOpt, startPhase, endPhase);
-    maybePut(result, cmdLine, extraOpts);
+    maybePut(result, cmdLine, this.options.toArray(new Option[0]));
 
+    parseDirectories(cmdLine);
+    
     log.info("Command line arguments: {}", result);
     return result;
   }
   
+  /** Extract the values of the <code>inputOption</code> and 
<code>outputOption</code>
+   *  if present, otherwise attempt to retrieve the values of 
<code>mapred.input.dir</code>
+   *  and <code>mapred.output.dir</code>. If none of these are set, 
+   *  {@link #getInputPath()} and {@link #getOutputPath()} will return 
null.
+   * @param cmdLine
+   */
+  protected void parseDirectories(CommandLine cmdLine) {
+    Configuration conf = getConf();
+    
+    if (inputOption != null) {
+      if (cmdLine.hasOption(inputOption)) {
+        this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
+      }
+    }
+    else if (conf.get("mapred.input.dir") != null) {
+      this.inputPath = new Path(conf.get("mapred.input.dir"));
+    }
+    
+    if (outputOption != null) {
+      if (cmdLine.hasOption(outputOption)) {
+        this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
+      }
+    }
+    else if (conf.get("mapred.output.dir") != null) {
+      this.outputPath = new Path(conf.get("mapred.output.dir"));
+    }
+  }
+  
   protected static void maybePut(Map<String,String> args, CommandLine cmdLine, 
Option... opt) {
     for (Option o : opt) {
-      Object value = cmdLine.getValue(o);
-      if (value != null) {
-        args.put(o.getPreferredName(), value.toString());
+      if (cmdLine.hasOption(o)) {
+        // nulls are ok, for cases where options are simple flags.
+        Object vo = cmdLine.getValue(o);
+        String value = (vo == null) ? null : vo.toString();
+        args.put(o.getPreferredName(), value);
       }
     }
   }
@@ -219,5 +401,4 @@ public abstract class AbstractJob extend
     
     return job;
   }
-  
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java 
(original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java 
Tue Jun  8 02:20:01 2010
@@ -17,8 +17,12 @@
 
 package org.apache.mahout.common;
 
+import java.io.PrintWriter;
+
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.GenericOptionsParser;
 
 public final class CommandLineUtil {
   
@@ -29,5 +33,26 @@ public final class CommandLineUtil {
     formatter.setGroup(group);
     formatter.print();
   }
-  
+ 
+  /** print the options supported by <code>GenericOptionsParser</code>
+   *  in addition to the options supported by the job, passed in as the
+   *  group parameter.
+   *  @param group job-specific command-line options.
+   */
+  public static void printHelpWithGenericOptions(Group group) {
+    org.apache.commons.cli.Options ops = new org.apache.commons.cli.Options();
+    new GenericOptionsParser(new Configuration(), ops, new String[0]);
+    org.apache.commons.cli.HelpFormatter fmt = new 
org.apache.commons.cli.HelpFormatter();
+    fmt.printHelp("<command> [Generic Options] [Job-Specific Options]", 
+        "Generic Options:", ops, "");
+    
+    PrintWriter pw = new PrintWriter(System.out);
+    HelpFormatter formatter = new HelpFormatter();
+    formatter.setGroup(group);
+    formatter.setPrintWriter(pw);
+    formatter.printHelp();
+    pw.flush();
+    
+    
+  }
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
 Tue Jun  8 02:20:01 2010
@@ -70,33 +70,15 @@ public class MatrixMultiplicationJob ext
 
   @Override
   public int run(String[] strings) throws Exception {
-    Option numRowsAOpt = buildOption("numRowsA",
-                                     "nra",
-                                     "Number of rows of the first input 
matrix");
-    Option numColsAOpt = buildOption("numColsA",
-                                     "nca",
-                                     "Number of columns of the first input 
matrix");
-    Option numRowsBOpt = buildOption("numRowsB",
-                                     "nrb",
-                                     "Number of rows of the second input 
matrix");
-
-    Option numColsBOpt = buildOption("numColsB",
-                                     "ncb",
-                                     "Number of columns of the second input 
matrix");
-    Option inputPathA = buildOption("inputPathA",
-                                    "ia",
-                                    "Path to the first input matrix");
-    Option inputPathB = buildOption("inputPathB",
-                                    "ib",
-                                    "Path to the second input matrix");
-
-    Map<String, String> argMap = parseArguments(strings,
-                                                numRowsAOpt,
-                                                numRowsBOpt,
-                                                numColsAOpt,
-                                                numColsBOpt,
-                                                inputPathA,
-                                                inputPathB);
+    addOption("numRowsA", "nra", "Number of rows of the first input matrix");
+    addOption("numColsA", "nca", "Number of columns of the first input 
matrix");
+    addOption("numRowsB", "nrb", "Number of rows of the second input matrix");
+
+    addOption("numColsB", "ncb", "Number of columns of the second input 
matrix");
+    addOption("inputPathA", "ia", "Path to the first input matrix");
+    addOption("inputPathB", "ib", "Path to the second input matrix");
+
+    Map<String, String> argMap = parseArguments(strings);
 
     DistributedRowMatrix a = new 
DistributedRowMatrix(argMap.get("--inputPathA"),
                                                       argMap.get("--tempDir"),

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java 
(original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java 
Tue Jun  8 02:20:01 2010
@@ -55,13 +55,9 @@ public class TransposeJob extends Abstra
 
   @Override
   public int run(String[] strings) throws Exception {
-    Option numRowsOpt = buildOption("numRows",
-                                    "nr",
-                                    "Number of rows of the input matrix");
-    Option numColsOpt = buildOption("numCols",
-                                    "nc",
-                                    "Number of columns of the input matrix");
-    Map<String,String> parsedArgs = parseArguments(strings, numRowsOpt, 
numColsOpt);
+    addOption("numRows", "nr", "Number of rows of the input matrix");
+    addOption("numCols", "nc", "Number of columns of the input matrix");
+    Map<String,String> parsedArgs = parseArguments(strings);
     if (parsedArgs == null) {
       // FIXME
       return 0;

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
 Tue Jun  8 02:20:01 2010
@@ -144,25 +144,13 @@ public class DistributedLanczosSolver ex
     @Override
     public int run(String[] args) throws Exception {
 
-      Option numRowsOpt = buildOption("numRows",
-                                      "nr",
-                                      "Number of rows of the input matrix");
-      Option numColsOpt = buildOption("numCols",
-                                      "nc",
-                                      "Number of columns of the input matrix");
-      Option desiredRankOpt = buildOption("rank",
-                                          "r",
-                                          "Desired decomposition rank (note: 
only roughly 1/4 to 1/3 "
-                                        + "of these will have the top portion 
of the spectrum)");
-      Option isSymmetricOpt = buildOption("symmetric",
-                                          "sym",
-                                          "Is the input matrix square and 
symmetric?");
+      addOption("numRows", "nr", "Number of rows of the input matrix");
+      addOption("numCols", "nc", "Number of columns of the input matrix");
+      addOption("rank", "r", "Desired decomposition rank (note: only roughly 
1/4 to 1/3 "
+                           + "of these will have the top portion of the 
spectrum)");
+      addOption("symmetric", "sym", "Is the input matrix square and 
symmetric?");
 
-      DistributedLanczosSolver.this.parsedArgs = parseArguments(args,
-                                                                numRowsOpt,
-                                                                numColsOpt,
-                                                                desiredRankOpt,
-                                                                
isSymmetricOpt);
+      DistributedLanczosSolver.this.parsedArgs = parseArguments(args);
       if (DistributedLanczosSolver.this.parsedArgs == null) {
         return -1;
       } else {

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
 Tue Jun  8 02:20:01 2010
@@ -101,7 +101,7 @@ public class EigenVerificationJob extend
       return 0;
     }
     Configuration originalConf = getConf();
-    outPath = originalConf.get("mapred.output.class");
+    outPath = originalConf.get("mapred.output.dir");
     tmpOut = outPath + "/tmp";
 
     if (argMap.get("--eigenInput") != null && eigensToVerify == null) {
@@ -131,45 +131,18 @@ public class EigenVerificationJob extend
     return 0;
   }
 
-  public static Map<String,String> handleArgs(String[] args) {
-    Option eigenInputOpt = buildOption("eigenInput", "ei",
+  public Map<String,String> handleArgs(String[] args) {
+    addOption("eigenInput", "ei",
         "The Path for purported eigenVector input files 
(SequenceFile<WritableComparable,VectorWritable>.", null);
-    Option corpusInputOpt = buildOption("corpusInput", "ci",
+    addOption("corpusInput", "ci",
         "The Path for corpus input files 
(SequenceFile<WritableComparable,VectorWritable>.");
-    Option outOpt = DefaultOptionCreator.outputOption().create();
-    Option helpOpt = DefaultOptionCreator.helpOption();
-    Option inMemOpt = buildOption("inMemory", "mem", "Buffer eigen matrix into 
memory (if you have enough!)", "false");
-    Option errorOpt = buildOption("maxError", "err", "Maximum acceptable 
error", "0.05");
-    Option minEigenValOpt = buildOption("minEigenvalue", "mev", "Minimum 
eigenvalue to keep the vector for", "0.0");
-
-    GroupBuilder gBuilder = new GroupBuilder().withName("Options")
-                                              .withOption(eigenInputOpt)
-                                              .withOption(corpusInputOpt)
-                                              .withOption(helpOpt)
-                                              .withOption(outOpt)
-                                              .withOption(inMemOpt)
-                                              .withOption(errorOpt)
-                                              .withOption(minEigenValOpt);
-    Group group = gBuilder.create();
-
-    Map<String,String> argMap = new HashMap<String,String>();
-
-    CommandLine cmdLine;
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      cmdLine = parser.parse(args);
-    } catch (OptionException e) {
-      log.error(e.getMessage());
-      CommandLineUtil.printHelp(group);
-      return null;
-    }
-    if (cmdLine.hasOption(helpOpt)) {
-      CommandLineUtil.printHelp(group);
-      return argMap;
-    }
-    maybePut(argMap, cmdLine, eigenInputOpt, corpusInputOpt, helpOpt, outOpt, 
inMemOpt, errorOpt, minEigenValOpt);
-    return argMap;
+    addOption(DefaultOptionCreator.outputOption().create());
+    addOption(DefaultOptionCreator.helpOption());
+    addOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have 
enough!)", "false");
+    addOption("maxError", "err", "Maximum acceptable error", "0.05");
+    addOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector 
for", "0.0");
+
+    return parseArguments(args);
   }
 
   public VectorIterable computePairwiseInnerProducts() {

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
 Tue Jun  8 02:20:01 2010
@@ -25,6 +25,7 @@ import org.apache.commons.cli2.builder.A
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.mahout.common.CommandLineUtil;
@@ -219,7 +220,8 @@ public final class SparseVectorsFromSequ
         sequentialAccessOutput = true;
       }
       
-      DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, 
outputDir, minSupport, maxNGramSize,
+      Configuration conf = new Configuration();
+      DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, 
outputDir, conf, minSupport, maxNGramSize,
         minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
       if (processIdf) {
         TFIDFConverter.processTfIdf(

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
 Tue Jun  8 02:20:01 2010
@@ -18,16 +18,13 @@
 package org.apache.mahout.utils.nlp.collocations.llr;
 
 import java.io.IOException;
+import java.util.Map;
 
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
 import org.apache.commons.cli2.OptionException;
 import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.io.Text;
@@ -39,18 +36,19 @@ import org.apache.hadoop.mapred.RunningJ
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.mapred.lib.IdentityMapper;
-import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.text.DefaultAnalyzer;
 import org.apache.mahout.utils.vectors.text.DocumentProcessor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /** Driver for LLR Collocation discovery mapreduce job */
-public final class CollocDriver extends Configured implements Tool {
+public final class CollocDriver extends AbstractJob {
   public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
   public static final String SUBGRAM_OUTPUT_DIRECTORY = "subgrams";
   public static final String NGRAM_OUTPUT_DIRECTORY = "ngrams";
@@ -72,143 +70,105 @@ public final class CollocDriver extends 
 
   @Override
   public int run(String[] args) throws Exception {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-    
-    Option inputOpt = 
obuilder.withLongName("input").withRequired(true).withArgument(
-      
abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Path for input files.").withShortName("i").create();
-    
-    Option outputOpt = 
obuilder.withLongName("output").withRequired(true).withArgument(
-      
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Path write output to").withShortName("o").create();
-    
-    Option maxNGramSizeOpt = 
obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
-      abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
-        .withDescription(
-          "(Optional) The maximum size of ngrams to create"
-              + " (2 = bigrams, 3 = trigrams, etc) Default 
Value:2").withShortName("ng").create();
-    
-    Option minSupportOpt = 
obuilder.withLongName("minSupport").withRequired(false).withArgument(
-      
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Minimum Support. Default Value: " + 
CollocReducer.DEFAULT_MIN_SUPPORT).withShortName("s")
-        .create();
-    
-    Option minLLROpt = 
obuilder.withLongName("minLLR").withRequired(false).withArgument(
-      
abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional)The minimum Log Likelihood Ratio(Float)  Default is " + 
LLRReducer.DEFAULT_MIN_LLR)
-        .withShortName("ml").create();
-    
-    Option numReduceTasksOpt = 
obuilder.withLongName("numReducers").withRequired(false).withArgument(
-      
abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Number of reduce tasks. Default Value: " + 
DEFAULT_PASS1_NUM_REDUCE_TASKS)
-        .withShortName("nr").create();
-    
-    Option preprocessOpt = 
obuilder.withLongName("preprocess").withRequired(false).withDescription(
-      "If set, input is SequenceFile<Text,Text> where the value is the 
document, "
-          + " which will be tokenized using the specified 
analyzer.").withShortName("p").create();
-    
-    Option unigramOpt = 
obuilder.withLongName("unigram").withRequired(false).withDescription(
-      "If set, unigrams will be emitted in the final output alongside 
collocations").withShortName("u")
-        .create();
-    
-    Option overwriteOutput = 
obuilder.withLongName("overwrite").withRequired(false).withDescription(
-      "If set, overwrite the output directory").withShortName("w").create();
-    
-    Option analyzerNameOpt = 
obuilder.withLongName("analyzerName").withArgument(
-      
abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The class name of the analyzer").withShortName("a").create();
-    
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
-        .create();
-    
-    Group group = 
gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
-      
maxNGramSizeOpt).withOption(overwriteOutput).withOption(minSupportOpt).withOption(minLLROpt)
-        
.withOption(numReduceTasksOpt).withOption(analyzerNameOpt).withOption(preprocessOpt).withOption(
-          unigramOpt).withOption(helpOpt).create();
-    
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-      
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelp(group);
-        return 1;
-      }
-      
-      Path input = new Path(cmdLine.getValue(inputOpt).toString());
-      Path output = new Path(cmdLine.getValue(outputOpt).toString());
-      
-      int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
-      
-      if (cmdLine.hasOption(maxNGramSizeOpt)) {
-        try {
-          maxNGramSize = 
Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
-        } catch (NumberFormatException ex) {
-          log.warn("Could not parse ngram size option");
-        }
-      }
-      log.info("Maximum n-gram size is: {}", maxNGramSize);
-      
-      if (cmdLine.hasOption(overwriteOutput)) {
-        HadoopUtil.overwriteOutput(output);
-      }
-      
-      int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
-      if (cmdLine.hasOption(minSupportOpt)) {
-        minSupport = 
Integer.parseInt(cmdLine.getValue(minSupportOpt).toString());
-      }
-      log.info("Minimum Support value: {}", minSupport);
-      
-      float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
-      if (cmdLine.hasOption(minLLROpt)) {
-        minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
-      }
-      log.info("Minimum LLR value: {}", minLLRValue);
-      
-      int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
-      if (cmdLine.hasOption(numReduceTasksOpt)) {
-        reduceTasks = 
Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
+    addInputOption();
+    addOutputOption();
+    addOption(DefaultOptionCreator.numReducersOption().create());
+    
+    addOption("maxNGramSize", "ng", 
+        "(Optional) The max size of ngrams to create (2 = bigrams, 3 = 
trigrams, etc) default: 2",
+        String.valueOf(DEFAULT_MAX_NGRAM_SIZE));
+    addOption("minSupport", "s", 
+        "(Optional) Minimum Support. Default Value: " + 
CollocReducer.DEFAULT_MIN_SUPPORT, 
+        String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT));
+    addOption("minLLR", "ml",
+        "(Optional)The minimum Log Likelihood Ratio(Float)  Default is " + 
LLRReducer.DEFAULT_MIN_LLR,
+        String.valueOf(LLRReducer.DEFAULT_MIN_LLR));
+    addOption(DefaultOptionCreator.overwriteOption().create());
+    addOption("analyzerName", "a",
+        "The class name of the analyzer to use for preprocessing", null);
+    
+    addFlag("preprocess", "p",
+        "If set, input is SequenceFile<Text,Text> where the value is the 
document, "
+        + " which will be tokenized using the specified analyzer.");
+    addFlag("unigram", "u", 
+        "If set, unigrams will be emitted in the final output alongside 
collocations");
+    
+    Map<String, String> argMap = parseArguments(args);
+    
+    if (argMap == null) {
+      return -1;
+    }
+    
+    Path input = getInputPath();
+    Path output = getOutputPath();
+    
+    
+    int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
+    if (argMap.get("--maxNGramSize") != null) {
+      try {
+        maxNGramSize = Integer.parseInt(argMap.get("--maxNGramSize"));
+      } catch (NumberFormatException ex) {
+        log.warn("Could not parse ngram size option");
       }
-      log.info("Number of pass1 reduce tasks: {}", reduceTasks);
-      
-      boolean emitUnigrams = cmdLine.hasOption(unigramOpt);
+    }
+    log.info("Maximum n-gram size is: {}", maxNGramSize);
+    
+    
+    if (argMap.containsKey("--overwrite")) {
+      HadoopUtil.overwriteOutput(output);
+    }
+    
+    
+    int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
+    if (argMap.get("--minsupport") != null) {
+      minSupport = Integer.parseInt(argMap.get("--minsupport"));
+    }
+    log.info("Minimum Support value: {}", minSupport);
+    
+    
+    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
+    if (argMap.get("--minLLR") != null) {
+      minLLRValue = Float.parseFloat(argMap.get("--minLLR"));
+    }
+    log.info("Minimum LLR value: {}", minLLRValue);
+    
+    
+    int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
+    if (argMap.get("--maxRed") != null) {
+      reduceTasks = Integer.parseInt(argMap.get("--maxRed"));
+    }
+    log.info("Number of pass1 reduce tasks: {}", reduceTasks);
+    
+    
+    boolean emitUnigrams = argMap.containsKey("--emitUnigrams");
+
+    if (argMap.containsKey("--preprocess")) {
+      log.info("Input will be preprocessed");
       
-      if (cmdLine.hasOption(preprocessOpt)) {
-        log.info("Input will be preprocessed");
-        
-        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
-        if (cmdLine.hasOption(analyzerNameOpt)) {
-          String className = cmdLine.getValue(analyzerNameOpt).toString();
-          analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
-          // try instantiating it, b/c there isn't any point in setting it if
-          // you can't instantiate it
-          analyzerClass.newInstance();
-        }
-        
-        Path tokenizedPath = new Path(output, 
DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
-        
-        DocumentProcessor.tokenizeDocuments(input, analyzerClass, 
tokenizedPath);
-        input = tokenizedPath;
-      } else {
-        log.info("Input will NOT be preprocessed");
+      Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
+      if (argMap.get("--analyzerName") != null) {
+        String className = argMap.get("--analyzerName");
+        analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+        // try instantiating it, b/c there isn't any point in setting it if
+        // you can't instantiate it
+        analyzerClass.newInstance();
       }
       
-      // parse input and extract collocations
-      long ngramCount = generateCollocations(input, output, emitUnigrams, 
maxNGramSize,
-        reduceTasks, minSupport);
+      Path tokenizedPath = new Path(output, 
DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
       
-      // tally collocations and perform LLR calculation
-      computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue, 
reduceTasks);
-      
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      CommandLineUtil.printHelp(group);
-      return 1;
+      DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath);
+      input = tokenizedPath;
+    } else {
+      log.info("Input will NOT be preprocessed");
     }
     
+    // parse input and extract collocations
+    long ngramCount = generateCollocations(input, output, getConf(), 
emitUnigrams, maxNGramSize,
+      reduceTasks, minSupport);
+    
+    // tally collocations and perform LLR calculation
+    computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, 
minLLRValue, reduceTasks);
+
     return 0;
   }
   
@@ -231,16 +191,17 @@ public final class CollocDriver extends 
    */
   public static void generateAllGrams(Path input,
                                       Path output,
+                                      Configuration baseConf,
                                       int maxNGramSize,
                                       int minSupport,
                                       float minLLRValue,
                                       int reduceTasks) throws IOException {
     // parse input and extract collocations
-    long ngramCount = generateCollocations(input, output, true, maxNGramSize, 
reduceTasks,
+    long ngramCount = generateCollocations(input, output, baseConf, true, 
maxNGramSize, reduceTasks,
       minSupport);
     
     // tally collocations and perform LLR calculation
-    computeNGramsPruneByLLR(ngramCount, output, true, minLLRValue, 
reduceTasks);
+    computeNGramsPruneByLLR(output, baseConf, ngramCount, true, minLLRValue, 
reduceTasks);
   }
   
   /**
@@ -248,11 +209,13 @@ public final class CollocDriver extends 
    */
   public static long generateCollocations(Path input,
                                           Path output,
+                                          Configuration baseConf,
                                           boolean emitUnigrams,
                                           int maxNGramSize,
                                           int reduceTasks,
                                           int minSupport) throws IOException {
-    JobConf conf = new JobConf(CollocDriver.class);
+    
+    JobConf conf = new JobConf(baseConf, CollocDriver.class);
     conf.setJobName(CollocDriver.class.getSimpleName() + 
".generateCollocations:" + input);
     
     conf.setMapOutputKeyClass(GramKey.class);
@@ -268,8 +231,9 @@ public final class CollocDriver extends 
     conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
     
     FileInputFormat.setInputPaths(conf, input);
-    Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
-    FileOutputFormat.setOutputPath(conf, outPath);
+    
+    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
+    FileOutputFormat.setOutputPath(conf, outputPath);
     
     conf.setInputFormat(SequenceFileInputFormat.class);
     conf.setMapperClass(CollocMapper.class);
@@ -287,12 +251,13 @@ public final class CollocDriver extends 
   /**
    * pass2: perform the LLR calculation
    */
-  public static void computeNGramsPruneByLLR(long nGramTotal,
-                                             Path output,
-                                             boolean emitUnigrams,
-                                             float minLLRValue,
-                                             int reduceTasks) throws 
IOException {
-    JobConf conf = new JobConf(CollocDriver.class);
+  public static void computeNGramsPruneByLLR(Path output,
+                                                Configuration baseConf,
+                                                long nGramTotal,
+                                                boolean emitUnigrams,
+                                                float minLLRValue,
+                                                int reduceTasks) throws 
IOException {
+    JobConf conf = new JobConf(baseConf, CollocDriver.class);
     conf.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + 
output);
     
     

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
 Tue Jun  8 02:20:01 2010
@@ -116,6 +116,7 @@ public final class DictionaryVectorizer 
    */
   public static void createTermFrequencyVectors(Path input,
                                                 Path output,
+                                                Configuration baseConf,
                                                 int minSupport,
                                                 int maxNGramSize,
                                                 float minLLRValue,
@@ -140,7 +141,7 @@ public final class DictionaryVectorizer 
       dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath, 
output,
         chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
     } else {
-      CollocDriver.generateAllGrams(input, dictionaryJobPath, maxNGramSize,
+      CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, 
maxNGramSize,
         minSupport, minLLRValue, numReducers);
       dictionaryChunks = createDictionaryChunks(minSupport, new Path(
           new Path(output, DICTIONARY_JOB_FOLDER), 
CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
 Tue Jun  8 02:20:01 2010
@@ -117,7 +117,7 @@ public class DictionaryVectorizerTest ex
     DocumentProcessor.tokenizeDocuments(path, analyzer,
     getTestTempDirPath("output/tokenized-documents"));
     
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
-      getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
+      getTestTempDirPath("output/wordcount"), conf, 2, 1, 0.0f, 1, 100, false);
     TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
                                 getTestTempDirPath("output/tfidf"), 100, 1, 
99, 1.0f, false, 1);
     


Reply via email to