Author: drew
Date: Tue Jun 8 02:20:01 2010
New Revision: 952511
URL: http://svn.apache.org/viewvc?rev=952511&view=rev
Log:
MAHOUT-404: AbstractJob Improvements
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
Tue Jun 8 02:20:01 2010
@@ -39,6 +39,7 @@ import org.apache.mahout.cf.taste.hadoop
import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.VarIntWritable;
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;
@@ -73,23 +74,18 @@ public final class RecommenderJob extend
@Override
public int run(String[] args) throws IOException, ClassNotFoundException,
InterruptedException {
- Option numReccomendationsOpt =
AbstractJob.buildOption("numRecommendations", "n",
- "Number of recommendations per user",
+ addOption("numRecommendations", "n", "Number of recommendations per user",
String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
- Option usersFileOpt = AbstractJob.buildOption("usersFile", "u",
- "File of users to recommend for", null);
- Option booleanDataOpt = AbstractJob.buildOption("booleanData", "b",
- "Treat input as without pref values", Boolean.FALSE.toString());
- Option maxPrefsPerUserConsideredOpt =
AbstractJob.buildOption("maxPrefsPerUserConsidered", null,
+ addOption("usersFile", "u", "File of users to recommend for", null);
+ addOption("booleanData", "b", "Treat input as without pref values",
Boolean.FALSE.toString());
+ addOption("maxPrefsPerUserConsidered", null,
"Maximum number of preferences considered per user in final
recommendation phase",
String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
- Option maxCooccurrencesPerItemConsideredOpt =
AbstractJob.buildOption("maxCooccurrencesPerItemConsidered", null,
+ addOption("maxCooccurrencesPerItemConsidered", null,
"Maximum number of cooccurrences considered per item in count phase",
String.valueOf(UserVectorToCooccurrenceMapper.DEFAULT_MAX_COOCCURRENCES_PER_ITEM_CONSIDERED));
- Map<String,String> parsedArgs = AbstractJob.parseArguments(
- args, numReccomendationsOpt, usersFileOpt, booleanDataOpt,
- maxPrefsPerUserConsideredOpt, maxCooccurrencesPerItemConsideredOpt);
+ Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
Tue Jun 8 02:20:01 2010
@@ -31,6 +31,7 @@ import org.apache.hadoop.mapreduce.lib.o
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
import org.apache.mahout.math.VarLongWritable;
@@ -106,15 +107,13 @@ public final class RecommenderJob extend
@Override
public int run(String[] args) throws IOException, ClassNotFoundException,
InterruptedException {
- Option recommendClassOpt = AbstractJob.buildOption("recommenderClassName",
"r",
- "Name of recommender class to instantiate");
- Option numReccomendationsOpt =
AbstractJob.buildOption("numRecommendations", "n",
- "Number of recommendations per user", "10");
- Option usersFileOpt = AbstractJob.buildOption("usersFile", "u", "Number of
recommendations per user",
- null);
+ addOption(DefaultOptionCreator.inputOption().create());
+ addOption(DefaultOptionCreator.outputOption().create());
+ addOption("recommenderClassName", "r", "Name of recommender class to
instantiate");
+ addOption("numRecommendations", "n", "Number of recommendations per user",
"10");
+ addOption("usersFile", "u", "Number of recommendations per user", null);
- Map<String,String> parsedArgs = AbstractJob.parseArguments(args,
recommendClassOpt,
- numReccomendationsOpt, usersFileOpt);
+ Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
Tue Jun 8 02:20:01 2010
@@ -118,10 +118,9 @@ public final class ItemSimilarityJob ext
@Override
public int run(String[] args) throws IOException, ClassNotFoundException,
InterruptedException {
- Option similarityClassOpt = AbstractJob.buildOption(
- "similarityClassname", "s", "Name of distributed similarity class to
instantiate");
+ addOption("similarityClassname", "s", "Name of distributed similarity
class to instantiate");
- Map<String,String> parsedArgs = AbstractJob.parseArguments(args,
similarityClassOpt);
+ Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
Tue Jun 8 02:20:01 2010
@@ -37,6 +37,7 @@ import org.apache.mahout.cf.taste.hadoop
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
import org.apache.mahout.cf.taste.hadoop.ToItemPrefsMapper;
import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.VarLongWritable;
public final class SlopeOneAverageDiffsJob extends AbstractJob {
@@ -44,7 +45,10 @@ public final class SlopeOneAverageDiffsJ
@Override
public int run(String[] args) throws IOException, ClassNotFoundException,
InterruptedException {
- Map<String,String> parsedArgs = AbstractJob.parseArguments(args);
+ addInputOption();
+ addOutputOption();
+
+ Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
(original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
Tue Jun 8 02:20:01 2010
@@ -18,6 +18,8 @@
package org.apache.mahout.common;
import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
@@ -75,50 +77,199 @@ public abstract class AbstractJob extend
private static final Logger log = LoggerFactory.getLogger(AbstractJob.class);
- protected static Option buildOption(String name, String shortName, String
description) {
- return buildOption(name, shortName, description, true, null);
+ /** option used to specify the input path */
+ private Option inputOption;
+
+ /** option used to specify the output path */
+ private Option outputOption;
+
+ /** input path, populated by {@link #parseArguments(String[])} */
+ private Path inputPath;
+
+ /** output path, populated by {@link #parseArguments(String[])} */
+ private Path outputPath;
+
+ /** internal list of options that have been added */
+ private final List<Option> options;
+
+ public AbstractJob() {
+ options = new LinkedList<Option>();
}
- protected static Option buildOption(String name, String shortName, String
description, String defaultValue) {
- return buildOption(name, shortName, description, false, defaultValue);
+ /** Returns the input path established by a call to {@link #parseArguments(String[])}.
+ * The source of the path may be an input option added using {@link #addInputOption()}
+ * or it may be the value of the <code>mapred.input.dir</code> configuration
+ * property.
+ * @return the input path, or null if no input path was established.
+ */
+ public Path getInputPath() {
+ return inputPath;
+ }
+
+ /** Returns the output path established by a call to {@link #parseArguments(String[])}.
+ * The source of the path may be an output option added using {@link #addOutputOption()}
+ * or it may be the value of the <code>mapred.output.dir</code> configuration
+ * property.
+ * @return the output path, or null if no output path was established.
+ */
+ public Path getOutputPath() {
+ return outputPath;
+ }
+
+ /** Add an option with no argument whose presence can be checked for using
+ * the <code>containsKey</code> method on the map returned by
+ * {@link #parseArguments(String[])}.
+ *
+ * @param name
+ * @param shortName
+ * @param description
+ */
+ public void addFlag(String name, String shortName, String description) {
+ options.add(buildOption(name, shortName, description, true, false, null));
+ }
+
+ /** Add an option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called. This option has an argument
+ * with null as its default value.
+ *
+ * @param name
+ * @param shortName
+ * @param description
+ */
+ public void addOption(String name, String shortName, String description) {
+ options.add(buildOption(name, shortName, description, true, false, null));
+ }
+
+ /** Add an option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called.
+ *
+ * @param name
+ * @param shortName
+ * @param description
+ * @param required if true, {@link #parseArguments(String[])} will
+ * fail with an error and usage message if this option is not specified
+ * on the command line.
+ */
+ public void addOption(String name, String shortName, String description,
boolean required) {
+ options.add(buildOption(name, shortName, description, true, required,
null));
+ }
+
+ /** Add an option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called. If this option is not
+ * specified on the command line the default value will be
+ * used.
+ *
+ * @param name
+ * @param shortName
+ * @param description
+ * @param defaultValue the default argument value if this argument is not
+ * found on the command-line. null is allowed.
+ */
+ public void addOption(String name, String shortName, String description,
String defaultValue) {
+ options.add(buildOption(name, shortName, description, true, false,
defaultValue));
}
- protected static Option buildOption(String name, String shortName, String
description,
- boolean required) {
- return buildOption(name, shortName, description, required, null);
+ /** Add an arbitrary option to the set of options this job will parse when
+ * {@link #parseArguments(String[])} is called. If this option has no
+ * argument, use <code>containsKey</code> on the map returned by
+ * <code>parseArguments</code> to check for its presence. Otherwise, the
+ * string value of the option will be placed in the map using a key
+ * equal to this option's long name preceded by '--'.
+ * @param option
+ * @return the option added.
+ */
+ public Option addOption(Option option) {
+ options.add(option);
+ return option;
}
- protected static Option buildOption(String name,
- String shortName,
- String description,
- boolean required,
- String defaultValue) {
- ArgumentBuilder argBuilder = new
ArgumentBuilder().withName(name).withMinimum(1).withMaximum(1);
- if (defaultValue != null) {
- argBuilder = argBuilder.withDefault(defaultValue);
- }
- Argument arg = argBuilder.create();
- DefaultOptionBuilder optBuilder = new
DefaultOptionBuilder().withLongName(name).withRequired(required)
- .withArgument(arg).withDescription(description);
+ /** Add the default input directory option, '-i' which takes a directory
+ * name as an argument. When {@link #parseArguments(String[])} is
+ * called, the inputPath will be set based upon the value for this option.
+ * If this method is called, the input is required.
+ */
+ public void addInputOption() {
+ this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
+ }
+
+ /** Add the default output directory option, '-o' which takes a directory
+ * name as an argument. When {@link #parseArguments(String[])} is
+ * called, the outputPath will be set based upon the value for this option.
+ * If this method is called, the output is required.
+ */
+ public void addOutputOption() {
+ this.outputOption =
addOption(DefaultOptionCreator.outputOption().create());
+ }
+
+ /** Build an option with the given parameters. Name and description are
+ * required.
+ *
+ * @param name the long name of the option prefixed with '--' on the
command-line
+ * @param shortName the short name of the option, prefixed with '-' on the
command-line
+ * @param description description of the option displayed in help method
+ * @param hasArg true if the option has an argument.
+ * @param required true if the option is required.
+ * @param defaultValue default argument value, can be null.
+ * @return the option.
+ */
+ private static Option buildOption(String name,
+ String shortName,
+ String description,
+ boolean hasArg,
+ boolean required,
+ String defaultValue) {
+
+ DefaultOptionBuilder optBuilder = new DefaultOptionBuilder()
+ .withLongName(name)
+ .withDescription(description)
+ .withRequired(required);
+
if (shortName != null) {
- optBuilder = optBuilder.withShortName(shortName);
+ optBuilder.withShortName(shortName);
}
+
+ if (hasArg) {
+ ArgumentBuilder argBuilder = new ArgumentBuilder()
+ .withName(name)
+ .withMinimum(1)
+ .withMaximum(1);
+
+ if (defaultValue != null) {
+ argBuilder = argBuilder.withDefault(defaultValue);
+ }
+
+ optBuilder.withArgument(argBuilder.create());
+ }
+
return optBuilder.create();
}
- protected static Map<String,String> parseArguments(String[] args, Option...
extraOpts) {
+ /** Parse the arguments specified based on the options defined using the
+ * various <code>addOption</code> methods. If -h is specified or an
+ * exception is encountered print help and return null. Has the
+ * side effect of setting inputPath and outputPath
+ * if <code>addInputOption</code> or <code>addOutputOption</code>
+ * or <code>mapred.input.dir</code> or <code>mapred.output.dir</code>
+ * are present in the Configuration.
+ *
+ * @param args
+ * @return a Map<String,String> containing options and their argument values.
+ * The presence of a flag can be tested using <code>containsKey</code>, while
+ * argument values can be retrieved using <code>get(optionName)</code>. The
+ * names used for keys are the option name parameter prefixed by '--'.
+ *
+ *
+ */
+ public Map<String,String> parseArguments(String[] args) {
- Option tempDirOpt = buildOption("tempDir", null, "Intermediate output
directory", "temp");
- Option helpOpt = DefaultOptionCreator.helpOption();
- Option startPhase = buildOption("startPhase", null, "First phase to run",
"0");
- Option endPhase = buildOption("endPhase", null, "Last phase to run",
String.valueOf(Integer.MAX_VALUE));
-
- GroupBuilder gBuilder = new GroupBuilder().withName("Options")
- .withOption(tempDirOpt)
- .withOption(helpOpt)
- .withOption(startPhase).withOption(endPhase);
+ Option helpOpt = addOption(DefaultOptionCreator.helpOption());
+ addOption("tempDir", null, "Intermediate output directory", "temp");
+ addOption("startPhase", null, "First phase to run", "0");
+ addOption("endPhase", null, "Last phase to run",
String.valueOf(Integer.MAX_VALUE));
+
+ GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific
Options:");
- for (Option opt : extraOpts) {
+ for (Option opt : options) {
gBuilder = gBuilder.withOption(opt);
}
@@ -132,28 +283,59 @@ public abstract class AbstractJob extend
cmdLine = parser.parse(args);
} catch (OptionException e) {
log.error(e.getMessage());
- CommandLineUtil.printHelp(group);
+ CommandLineUtil.printHelpWithGenericOptions(group);
return null;
}
if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
+ CommandLineUtil.printHelpWithGenericOptions(group);
return null;
}
Map<String,String> result = new TreeMap<String,String>();
- maybePut(result, cmdLine, tempDirOpt, helpOpt, startPhase, endPhase);
- maybePut(result, cmdLine, extraOpts);
+ maybePut(result, cmdLine, this.options.toArray(new Option[0]));
+ parseDirectories(cmdLine);
+
log.info("Command line arguments: {}", result);
return result;
}
+ /** Extract the values of the <code>inputOption</code> and <code>outputOption</code>
+ * if present, otherwise attempt to retrieve the values of <code>mapred.input.dir</code>
+ * and <code>mapred.output.dir</code>. If none of these are set,
+ * {@link #getInputPath()} and {@link #getOutputPath()} will return null.
+ * @param cmdLine
+ */
+ protected void parseDirectories(CommandLine cmdLine) {
+ Configuration conf = getConf();
+
+ if (inputOption != null) {
+ if (cmdLine.hasOption(inputOption)) {
+ this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
+ }
+ }
+ else if (conf.get("mapred.input.dir") != null) {
+ this.inputPath = new Path(conf.get("mapred.input.dir"));
+ }
+
+ if (outputOption != null) {
+ if (cmdLine.hasOption(outputOption)) {
+ this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
+ }
+ }
+ else if (conf.get("mapred.output.dir") != null) {
+ this.outputPath = new Path(conf.get("mapred.output.dir"));
+ }
+ }
+
protected static void maybePut(Map<String,String> args, CommandLine cmdLine,
Option... opt) {
for (Option o : opt) {
- Object value = cmdLine.getValue(o);
- if (value != null) {
- args.put(o.getPreferredName(), value.toString());
+ if (cmdLine.hasOption(o)) {
+ // nulls are ok, for cases where options are simple flags.
+ Object vo = cmdLine.getValue(o);
+ String value = (vo == null) ? null : vo.toString();
+ args.put(o.getPreferredName(), value);
}
}
}
@@ -219,5 +401,4 @@ public abstract class AbstractJob extend
return job;
}
-
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/CommandLineUtil.java
Tue Jun 8 02:20:01 2010
@@ -17,8 +17,12 @@
package org.apache.mahout.common;
+import java.io.PrintWriter;
+
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.GenericOptionsParser;
public final class CommandLineUtil {
@@ -29,5 +33,26 @@ public final class CommandLineUtil {
formatter.setGroup(group);
formatter.print();
}
-
+
+ /** print the options supported by <code>GenericOptionsParser</code>
+ * in addition to the options supported by the job, passed in as the
+ * group parameter.
+ * @param group job-specific command-line options.
+ */
+ public static void printHelpWithGenericOptions(Group group) {
+ org.apache.commons.cli.Options ops = new org.apache.commons.cli.Options();
+ new GenericOptionsParser(new Configuration(), ops, new String[0]);
+ org.apache.commons.cli.HelpFormatter fmt = new
org.apache.commons.cli.HelpFormatter();
+ fmt.printHelp("<command> [Generic Options] [Job-Specific Options]",
+ "Generic Options:", ops, "");
+
+ PrintWriter pw = new PrintWriter(System.out);
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.setGroup(group);
+ formatter.setPrintWriter(pw);
+ formatter.printHelp();
+ pw.flush();
+
+
+ }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
Tue Jun 8 02:20:01 2010
@@ -70,33 +70,15 @@ public class MatrixMultiplicationJob ext
@Override
public int run(String[] strings) throws Exception {
- Option numRowsAOpt = buildOption("numRowsA",
- "nra",
- "Number of rows of the first input
matrix");
- Option numColsAOpt = buildOption("numColsA",
- "nca",
- "Number of columns of the first input
matrix");
- Option numRowsBOpt = buildOption("numRowsB",
- "nrb",
- "Number of rows of the second input
matrix");
-
- Option numColsBOpt = buildOption("numColsB",
- "ncb",
- "Number of columns of the second input
matrix");
- Option inputPathA = buildOption("inputPathA",
- "ia",
- "Path to the first input matrix");
- Option inputPathB = buildOption("inputPathB",
- "ib",
- "Path to the second input matrix");
-
- Map<String, String> argMap = parseArguments(strings,
- numRowsAOpt,
- numRowsBOpt,
- numColsAOpt,
- numColsBOpt,
- inputPathA,
- inputPathB);
+ addOption("numRowsA", "nra", "Number of rows of the first input matrix");
+ addOption("numColsA", "nca", "Number of columns of the first input
matrix");
+ addOption("numRowsB", "nrb", "Number of rows of the second input matrix");
+
+ addOption("numColsB", "ncb", "Number of columns of the second input
matrix");
+ addOption("inputPathA", "ia", "Path to the first input matrix");
+ addOption("inputPathB", "ib", "Path to the second input matrix");
+
+ Map<String, String> argMap = parseArguments(strings);
DistributedRowMatrix a = new
DistributedRowMatrix(argMap.get("--inputPathA"),
argMap.get("--tempDir"),
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
Tue Jun 8 02:20:01 2010
@@ -55,13 +55,9 @@ public class TransposeJob extends Abstra
@Override
public int run(String[] strings) throws Exception {
- Option numRowsOpt = buildOption("numRows",
- "nr",
- "Number of rows of the input matrix");
- Option numColsOpt = buildOption("numCols",
- "nc",
- "Number of columns of the input matrix");
- Map<String,String> parsedArgs = parseArguments(strings, numRowsOpt,
numColsOpt);
+ addOption("numRows", "nr", "Number of rows of the input matrix");
+ addOption("numCols", "nc", "Number of columns of the input matrix");
+ Map<String,String> parsedArgs = parseArguments(strings);
if (parsedArgs == null) {
// FIXME
return 0;
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
Tue Jun 8 02:20:01 2010
@@ -144,25 +144,13 @@ public class DistributedLanczosSolver ex
@Override
public int run(String[] args) throws Exception {
- Option numRowsOpt = buildOption("numRows",
- "nr",
- "Number of rows of the input matrix");
- Option numColsOpt = buildOption("numCols",
- "nc",
- "Number of columns of the input matrix");
- Option desiredRankOpt = buildOption("rank",
- "r",
- "Desired decomposition rank (note:
only roughly 1/4 to 1/3 "
- + "of these will have the top portion
of the spectrum)");
- Option isSymmetricOpt = buildOption("symmetric",
- "sym",
- "Is the input matrix square and
symmetric?");
+ addOption("numRows", "nr", "Number of rows of the input matrix");
+ addOption("numCols", "nc", "Number of columns of the input matrix");
+ addOption("rank", "r", "Desired decomposition rank (note: only roughly
1/4 to 1/3 "
+ + "of these will have the top portion of the
spectrum)");
+ addOption("symmetric", "sym", "Is the input matrix square and
symmetric?");
- DistributedLanczosSolver.this.parsedArgs = parseArguments(args,
- numRowsOpt,
- numColsOpt,
- desiredRankOpt,
-
isSymmetricOpt);
+ DistributedLanczosSolver.this.parsedArgs = parseArguments(args);
if (DistributedLanczosSolver.this.parsedArgs == null) {
return -1;
} else {
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
Tue Jun 8 02:20:01 2010
@@ -101,7 +101,7 @@ public class EigenVerificationJob extend
return 0;
}
Configuration originalConf = getConf();
- outPath = originalConf.get("mapred.output.class");
+ outPath = originalConf.get("mapred.output.dir");
tmpOut = outPath + "/tmp";
if (argMap.get("--eigenInput") != null && eigensToVerify == null) {
@@ -131,45 +131,18 @@ public class EigenVerificationJob extend
return 0;
}
- public static Map<String,String> handleArgs(String[] args) {
- Option eigenInputOpt = buildOption("eigenInput", "ei",
+ public Map<String,String> handleArgs(String[] args) {
+ addOption("eigenInput", "ei",
"The Path for purported eigenVector input files
(SequenceFile<WritableComparable,VectorWritable>.", null);
- Option corpusInputOpt = buildOption("corpusInput", "ci",
+ addOption("corpusInput", "ci",
"The Path for corpus input files
(SequenceFile<WritableComparable,VectorWritable>.");
- Option outOpt = DefaultOptionCreator.outputOption().create();
- Option helpOpt = DefaultOptionCreator.helpOption();
- Option inMemOpt = buildOption("inMemory", "mem", "Buffer eigen matrix into
memory (if you have enough!)", "false");
- Option errorOpt = buildOption("maxError", "err", "Maximum acceptable
error", "0.05");
- Option minEigenValOpt = buildOption("minEigenvalue", "mev", "Minimum
eigenvalue to keep the vector for", "0.0");
-
- GroupBuilder gBuilder = new GroupBuilder().withName("Options")
- .withOption(eigenInputOpt)
- .withOption(corpusInputOpt)
- .withOption(helpOpt)
- .withOption(outOpt)
- .withOption(inMemOpt)
- .withOption(errorOpt)
- .withOption(minEigenValOpt);
- Group group = gBuilder.create();
-
- Map<String,String> argMap = new HashMap<String,String>();
-
- CommandLine cmdLine;
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- cmdLine = parser.parse(args);
- } catch (OptionException e) {
- log.error(e.getMessage());
- CommandLineUtil.printHelp(group);
- return null;
- }
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return argMap;
- }
- maybePut(argMap, cmdLine, eigenInputOpt, corpusInputOpt, helpOpt, outOpt,
inMemOpt, errorOpt, minEigenValOpt);
- return argMap;
+ addOption(DefaultOptionCreator.outputOption().create());
+ addOption(DefaultOptionCreator.helpOption());
+ addOption("inMemory", "mem", "Buffer eigen matrix into memory (if you have
enough!)", "false");
+ addOption("maxError", "err", "Maximum acceptable error", "0.05");
+ addOption("minEigenvalue", "mev", "Minimum eigenvalue to keep the vector
for", "0.0");
+
+ return parseArguments(args);
}
public VectorIterable computePairwiseInnerProducts() {
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
Tue Jun 8 02:20:01 2010
@@ -25,6 +25,7 @@ import org.apache.commons.cli2.builder.A
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.CommandLineUtil;
@@ -219,7 +220,8 @@ public final class SparseVectorsFromSequ
sequentialAccessOutput = true;
}
- DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
outputDir, minSupport, maxNGramSize,
+ Configuration conf = new Configuration();
+ DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, conf, minSupport, maxNGramSize,
minLLRValue, reduceTasks, chunkSize, sequentialAccessOutput);
if (processIdf) {
TFIDFConverter.processTfIdf(
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
Tue Jun 8 02:20:01 2010
@@ -18,16 +18,13 @@
package org.apache.mahout.utils.nlp.collocations.llr;
import java.io.IOException;
+import java.util.Map;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
@@ -39,18 +36,19 @@ import org.apache.hadoop.mapred.RunningJ
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
-import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.text.DefaultAnalyzer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** Driver for LLR Collocation discovery mapreduce job */
-public final class CollocDriver extends Configured implements Tool {
+public final class CollocDriver extends AbstractJob {
public static final String DEFAULT_OUTPUT_DIRECTORY = "output";
public static final String SUBGRAM_OUTPUT_DIRECTORY = "subgrams";
public static final String NGRAM_OUTPUT_DIRECTORY = "ngrams";
@@ -72,143 +70,105 @@ public final class CollocDriver extends
@Override
public int run(String[] args) throws Exception {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option inputOpt =
obuilder.withLongName("input").withRequired(true).withArgument(
-
abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Path for input files.").withShortName("i").create();
-
- Option outputOpt =
obuilder.withLongName("output").withRequired(true).withArgument(
-
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Path write output to").withShortName("o").create();
-
- Option maxNGramSizeOpt =
obuilder.withLongName("maxNGramSize").withRequired(false).withArgument(
- abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
- .withDescription(
- "(Optional) The maximum size of ngrams to create"
- + " (2 = bigrams, 3 = trigrams, etc) Default
Value:2").withShortName("ng").create();
-
- Option minSupportOpt =
obuilder.withLongName("minSupport").withRequired(false).withArgument(
-
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
- "(Optional) Minimum Support. Default Value: " +
CollocReducer.DEFAULT_MIN_SUPPORT).withShortName("s")
- .create();
-
- Option minLLROpt =
obuilder.withLongName("minLLR").withRequired(false).withArgument(
-
abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create()).withDescription(
- "(Optional)The minimum Log Likelihood Ratio(Float) Default is " +
LLRReducer.DEFAULT_MIN_LLR)
- .withShortName("ml").create();
-
- Option numReduceTasksOpt =
obuilder.withLongName("numReducers").withRequired(false).withArgument(
-
abuilder.withName("numReducers").withMinimum(1).withMaximum(1).create()).withDescription(
- "(Optional) Number of reduce tasks. Default Value: " +
DEFAULT_PASS1_NUM_REDUCE_TASKS)
- .withShortName("nr").create();
-
- Option preprocessOpt =
obuilder.withLongName("preprocess").withRequired(false).withDescription(
- "If set, input is SequenceFile<Text,Text> where the value is the
document, "
- + " which will be tokenized using the specified
analyzer.").withShortName("p").create();
-
- Option unigramOpt =
obuilder.withLongName("unigram").withRequired(false).withDescription(
- "If set, unigrams will be emitted in the final output alongside
collocations").withShortName("u")
- .create();
-
- Option overwriteOutput =
obuilder.withLongName("overwrite").withRequired(false).withDescription(
- "If set, overwrite the output directory").withShortName("w").create();
-
- Option analyzerNameOpt =
obuilder.withLongName("analyzerName").withArgument(
-
abuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create()).withDescription(
- "The class name of the analyzer").withShortName("a").create();
-
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out
help").withShortName("h")
- .create();
-
- Group group =
gbuilder.withName("Options").withOption(inputOpt).withOption(outputOpt).withOption(
-
maxNGramSizeOpt).withOption(overwriteOutput).withOption(minSupportOpt).withOption(minLLROpt)
-
.withOption(numReduceTasksOpt).withOption(analyzerNameOpt).withOption(preprocessOpt).withOption(
- unigramOpt).withOption(helpOpt).create();
-
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelp(group);
- return 1;
- }
-
- Path input = new Path(cmdLine.getValue(inputOpt).toString());
- Path output = new Path(cmdLine.getValue(outputOpt).toString());
-
- int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
-
- if (cmdLine.hasOption(maxNGramSizeOpt)) {
- try {
- maxNGramSize =
Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt).toString());
- } catch (NumberFormatException ex) {
- log.warn("Could not parse ngram size option");
- }
- }
- log.info("Maximum n-gram size is: {}", maxNGramSize);
-
- if (cmdLine.hasOption(overwriteOutput)) {
- HadoopUtil.overwriteOutput(output);
- }
-
- int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
- if (cmdLine.hasOption(minSupportOpt)) {
- minSupport =
Integer.parseInt(cmdLine.getValue(minSupportOpt).toString());
- }
- log.info("Minimum Support value: {}", minSupport);
-
- float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
- if (cmdLine.hasOption(minLLROpt)) {
- minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
- }
- log.info("Minimum LLR value: {}", minLLRValue);
-
- int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
- if (cmdLine.hasOption(numReduceTasksOpt)) {
- reduceTasks =
Integer.parseInt(cmdLine.getValue(numReduceTasksOpt).toString());
+ addInputOption();
+ addOutputOption();
+ addOption(DefaultOptionCreator.numReducersOption().create());
+
+ addOption("maxNGramSize", "ng",
+ "(Optional) The max size of ngrams to create (2 = bigrams, 3 = trigrams, etc) default: 2",
+ String.valueOf(DEFAULT_MAX_NGRAM_SIZE));
+ addOption("minSupport", "s",
+ "(Optional) Minimum Support. Default Value: " + CollocReducer.DEFAULT_MIN_SUPPORT,
+ String.valueOf(CollocReducer.DEFAULT_MIN_SUPPORT));
+ addOption("minLLR", "ml",
+ "(Optional)The minimum Log Likelihood Ratio(Float) Default is " + LLRReducer.DEFAULT_MIN_LLR,
+ String.valueOf(LLRReducer.DEFAULT_MIN_LLR));
+ addOption(DefaultOptionCreator.overwriteOption().create());
+ addOption("analyzerName", "a",
+ "The class name of the analyzer to use for preprocessing", null);
+
+ addFlag("preprocess", "p",
+ "If set, input is SequenceFile<Text,Text> where the value is the document, "
+ + " which will be tokenized using the specified analyzer.");
+ addFlag("unigram", "u",
+ "If set, unigrams will be emitted in the final output alongside collocations");
+
+ Map<String, String> argMap = parseArguments(args);
+
+ if (argMap == null) {
+ return -1;
+ }
+
+ Path input = getInputPath();
+ Path output = getOutputPath();
+
+
+ int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
+ if (argMap.get("--maxNGramSize") != null) {
+ try {
+ maxNGramSize = Integer.parseInt(argMap.get("--maxNGramSize"));
+ } catch (NumberFormatException ex) {
+ log.warn("Could not parse ngram size option");
}
- log.info("Number of pass1 reduce tasks: {}", reduceTasks);
-
- boolean emitUnigrams = cmdLine.hasOption(unigramOpt);
+ }
+ log.info("Maximum n-gram size is: {}", maxNGramSize);
+
+
+ if (argMap.containsKey("--overwrite")) {
+ HadoopUtil.overwriteOutput(output);
+ }
+
+
+ int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
+ if (argMap.get("--minsupport") != null) {
+ minSupport = Integer.parseInt(argMap.get("--minsupport"));
+ }
+ log.info("Minimum Support value: {}", minSupport);
+
+
+ float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
+ if (argMap.get("--minLLR") != null) {
+ minLLRValue = Float.parseFloat(argMap.get("--minLLR"));
+ }
+ log.info("Minimum LLR value: {}", minLLRValue);
+
+
+ int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
+ if (argMap.get("--maxRed") != null) {
+ reduceTasks = Integer.parseInt(argMap.get("--maxRed"));
+ }
+ log.info("Number of pass1 reduce tasks: {}", reduceTasks);
+
+
+ boolean emitUnigrams = argMap.containsKey("--emitUnigrams");
+
+ if (argMap.containsKey("--preprocess")) {
+ log.info("Input will be preprocessed");
- if (cmdLine.hasOption(preprocessOpt)) {
- log.info("Input will be preprocessed");
-
- Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
- if (cmdLine.hasOption(analyzerNameOpt)) {
- String className = cmdLine.getValue(analyzerNameOpt).toString();
- analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
- // try instantiating it, b/c there isn't any point in setting it if
- // you can't instantiate it
- analyzerClass.newInstance();
- }
-
- Path tokenizedPath = new Path(output,
DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
-
- DocumentProcessor.tokenizeDocuments(input, analyzerClass,
tokenizedPath);
- input = tokenizedPath;
- } else {
- log.info("Input will NOT be preprocessed");
+ Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
+ if (argMap.get("--analyzerName") != null) {
+ String className = argMap.get("--analyzerName");
+ analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
+ // try instantiating it, b/c there isn't any point in setting it if
+ // you can't instantiate it
+ analyzerClass.newInstance();
}
- // parse input and extract collocations
- long ngramCount = generateCollocations(input, output, emitUnigrams,
maxNGramSize,
- reduceTasks, minSupport);
+ Path tokenizedPath = new Path(output, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
- // tally collocations and perform LLR calculation
- computeNGramsPruneByLLR(ngramCount, output, emitUnigrams, minLLRValue,
reduceTasks);
-
- } catch (OptionException e) {
- log.error("Exception", e);
- CommandLineUtil.printHelp(group);
- return 1;
+ DocumentProcessor.tokenizeDocuments(input, analyzerClass, tokenizedPath);
+ input = tokenizedPath;
+ } else {
+ log.info("Input will NOT be preprocessed");
}
+ // parse input and extract collocations
+ long ngramCount = generateCollocations(input, output, getConf(), emitUnigrams, maxNGramSize,
+ reduceTasks, minSupport);
+
+ // tally collocations and perform LLR calculation
+ computeNGramsPruneByLLR(output, getConf(), ngramCount, emitUnigrams, minLLRValue, reduceTasks);
+
return 0;
}
@@ -231,16 +191,17 @@ public final class CollocDriver extends
*/
public static void generateAllGrams(Path input,
Path output,
+ Configuration baseConf,
int maxNGramSize,
int minSupport,
float minLLRValue,
int reduceTasks) throws IOException {
// parse input and extract collocations
- long ngramCount = generateCollocations(input, output, true, maxNGramSize,
reduceTasks,
+ long ngramCount = generateCollocations(input, output, baseConf, true, maxNGramSize, reduceTasks,
minSupport);
// tally collocations and perform LLR calculation
- computeNGramsPruneByLLR(ngramCount, output, true, minLLRValue,
reduceTasks);
+ computeNGramsPruneByLLR(output, baseConf, ngramCount, true, minLLRValue, reduceTasks);
}
/**
@@ -248,11 +209,13 @@ public final class CollocDriver extends
*/
public static long generateCollocations(Path input,
Path output,
+ Configuration baseConf,
boolean emitUnigrams,
int maxNGramSize,
int reduceTasks,
int minSupport) throws IOException {
- JobConf conf = new JobConf(CollocDriver.class);
+
+ JobConf conf = new JobConf(baseConf, CollocDriver.class);
conf.setJobName(CollocDriver.class.getSimpleName() +
".generateCollocations:" + input);
conf.setMapOutputKeyClass(GramKey.class);
@@ -268,8 +231,9 @@ public final class CollocDriver extends
conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
FileInputFormat.setInputPaths(conf, input);
- Path outPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
- FileOutputFormat.setOutputPath(conf, outPath);
+
+ Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
+ FileOutputFormat.setOutputPath(conf, outputPath);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setMapperClass(CollocMapper.class);
@@ -287,12 +251,13 @@ public final class CollocDriver extends
/**
* pass2: perform the LLR calculation
*/
- public static void computeNGramsPruneByLLR(long nGramTotal,
- Path output,
- boolean emitUnigrams,
- float minLLRValue,
- int reduceTasks) throws
IOException {
- JobConf conf = new JobConf(CollocDriver.class);
+ public static void computeNGramsPruneByLLR(Path output,
+ Configuration baseConf,
+ long nGramTotal,
+ boolean emitUnigrams,
+ float minLLRValue,
+ int reduceTasks) throws IOException {
+ JobConf conf = new JobConf(baseConf, CollocDriver.class);
conf.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " +
output);
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java
Tue Jun 8 02:20:01 2010
@@ -116,6 +116,7 @@ public final class DictionaryVectorizer
*/
public static void createTermFrequencyVectors(Path input,
Path output,
+ Configuration baseConf,
int minSupport,
int maxNGramSize,
float minLLRValue,
@@ -140,7 +141,7 @@ public final class DictionaryVectorizer
dictionaryChunks = createDictionaryChunks(minSupport, dictionaryJobPath,
output,
chunkSizeInMegabytes, new LongWritable(), maxTermDimension);
} else {
- CollocDriver.generateAllGrams(input, dictionaryJobPath, maxNGramSize,
+ CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize,
minSupport, minLLRValue, numReducers);
dictionaryChunks = createDictionaryChunks(minSupport, new Path(
new Path(output, DICTIONARY_JOB_FOLDER),
CollocDriver.NGRAM_OUTPUT_DIRECTORY), output,
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=952511&r1=952510&r2=952511&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Tue Jun 8 02:20:01 2010
@@ -117,7 +117,7 @@ public class DictionaryVectorizerTest ex
DocumentProcessor.tokenizeDocuments(path, analyzer,
getTestTempDirPath("output/tokenized-documents"));
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),
- getTestTempDirPath("output/wordcount"), 2, 1, 0.0f, 1, 100, false);
+ getTestTempDirPath("output/wordcount"), conf, 2, 1, 0.0f, 1, 100, false);
TFIDFConverter.processTfIdf(getTestTempDirPath("output/wordcount/vectors"),
getTestTempDirPath("output/tfidf"), 100, 1,
99, 1.0f, false, 1);