Author: robinanil
Date: Thu Feb 11 04:34:04 2010
New Revision: 908839

URL: http://svn.apache.org/viewvc?rev=908839&view=rev
Log:
MAHOUT-285 Missed out the main class
Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=908839&r1=908838&r2=908839&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Thu Feb 11 04:34:04 2010
@@ -27,11 +27,15 @@
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer;
 import org.apache.mahout.utils.vectors.common.PartialVectorMerger;
 import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 import org.apache.mahout.utils.vectors.text.DocumentProcessor;
 import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Converts a given set of sequence files into SparseVectors
@@ -39,6 +43,9 @@
  */
 public final class SparseVectorsFromSequenceFiles {
+  private static final Logger log = LoggerFactory
+      .getLogger(SparseVectorsFromSequenceFiles.class);
+
   private SparseVectorsFromSequenceFiles() {}
 
   public static void main(String[] args) throws Exception {
@@ -79,7 +86,7 @@
       .withArgument(
         abuilder.withName("weight").withMinimum(1).withMaximum(1).create())
       .withDescription("The kind of weight to use. Currently TF or TFIDF")
-      .withShortName("w").create();
+      .withShortName("wt").create();
 
     Option minDFOpt = obuilder.withLongName("minDF").withRequired(false)
       .withArgument(
         abuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
@@ -96,6 +103,21 @@
       .withDescription(
         "The max percentage of docs for the DF. Can be used to remove really high frequency terms. Expressed as an integer between 0 and 100. Default is 99.")
       .withShortName("x").create();
+
+    Option minLLROpt = obuilder.withLongName("minLLR").withRequired(false)
+      .withArgument(
+        abuilder.withName("minLLR").withMinimum(1).withMaximum(1).create())
+      .withDescription(
+        "(Optional)The minimum Log Likelihood Ratio(Float) Default is "
+            + LLRReducer.DEFAULT_MIN_LLR).withShortName("ml").create();
+
+    Option numReduceTasksOpt = obuilder.withLongName("numReducers")
+      .withArgument(
+        abuilder.withName("numReducers").withMinimum(1).withMaximum(1)
+            .create()).withDescription(
+        "(Optional) Number of reduce tasks. Default Value: 1").withShortName(
+        "nr").create();
+
     Option powerOpt = obuilder
       .withLongName("norm")
       .withRequired(false)
       .withArgument(
         abuilder.withName("norm").withMinimum(1).withMaximum(1).create())
       .withDescription(
@@ -105,17 +127,39 @@
         "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm. "
            + "Must be greater or equal to 0. The default is not to normalize")
       .withShortName("n").create();
+    Option maxNGramSizeOpt = obuilder
+      .withLongName("maxNGramSize")
+      .withRequired(false)
+      .withArgument(
+        abuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
+      .withDescription(
+        "(Optional) The maximum size of ngrams to create"
+            + " (2 = bigrams, 3 = trigrams, etc) Default Value:2")
+      .withShortName("ng").create();
+
+    Option overwriteOutput = obuilder.withLongName("overwrite").withRequired(
+      false).withDescription("If set, overwrite the output directory")
+      .withShortName("w").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription(
+      "Print out help").withShortName("h").create();
 
     Group group = gbuilder.withName("Options").withOption(minSupportOpt)
       .withOption(analyzerNameOpt).withOption(chunkSizeOpt).withOption(
         outputDirOpt).withOption(inputDirOpt).withOption(minDFOpt)
       .withOption(maxDFPercentOpt).withOption(weightOpt).withOption(powerOpt)
+      .withOption(minLLROpt).withOption(numReduceTasksOpt).withOption(
+        maxNGramSizeOpt).withOption(overwriteOutput).withOption(helpOpt)
       .create();
 
     Parser parser = new Parser();
     parser.setGroup(group);
     CommandLine cmdLine = parser.parse(args);
 
+    if (cmdLine.hasOption(helpOpt)) {
+      CommandLineUtil.printHelp(group);
+      return;
+    }
+
     String inputDir = (String) cmdLine.getValue(inputDirOpt);
     String outputDir = (String) cmdLine.getValue(outputDirOpt);
@@ -129,6 +173,35 @@
       minSupport = Integer.parseInt(minSupportString);
     }
 
+    int maxNGramSize = 1;
+
+    if (cmdLine.hasOption(maxNGramSizeOpt) == true) {
+      try {
+        maxNGramSize = Integer.parseInt(cmdLine.getValue(maxNGramSizeOpt)
+            .toString());
+      } catch (NumberFormatException ex) {
+        log.warn("Could not parse ngram size option");
+      }
+    }
+    log.info("Maximum n-gram size is: {}", maxNGramSize);
+
+    if (cmdLine.hasOption(overwriteOutput) == true) {
+      HadoopUtil.overwriteOutput(outputDir);
+    }
+
+    float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
+    if (cmdLine.hasOption(minLLROpt)) {
+      minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
+    }
+    log.info("Minimum LLR value: {}", minLLRValue);
+
+    int reduceTasks = 1;
+    if (cmdLine.hasOption(numReduceTasksOpt)) {
+      reduceTasks = Integer.parseInt(cmdLine.getValue(numReduceTasksOpt)
+          .toString());
+    }
+    log.info("Pass1 reduce tasks: {}", reduceTasks);
+
     Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
     if (cmdLine.hasOption(analyzerNameOpt)) {
       String className = cmdLine.getValue(analyzerNameOpt).toString();
@@ -176,8 +249,8 @@
     String tokenizedPath = outputDir + "/tokenized-documents";
     DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath);
 
-    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath,
-      outputDir, minSupport, chunkSize);
+    DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir,
+      minSupport, maxNGramSize, minLLRValue, reduceTasks, chunkSize);
     if (processIdf) {
       TFIDFConverter.processTfIdf(
         outputDir + DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER,
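For orientation (not part of the commit), here is a minimal sketch of how the expanded option set introduced in this revision might be exercised once the examples module is built. It is a hypothetical driver: the input and output paths are placeholders, and the "--input"/"--output" long names are assumptions, since those options are defined outside the hunks shown; the "-wt", "--maxNGramSize", "--minLLR", "--numReducers" and "--overwrite" flags come from the option builders in the diff. Note that the weight option now uses the short name "wt", which frees "w" for --overwrite.

// Hypothetical driver exercising the options added in r908839.
// Assumptions: the two paths, the --input/--output long names, and the
// illustrative option values; only the flag names come from the diff above.
public final class SparseVectorsExample {
  public static void main(String[] args) throws Exception {
    org.apache.mahout.text.SparseVectorsFromSequenceFiles.main(new String[] {
        "--input", "/tmp/example-seqfiles",   // placeholder SequenceFile input dir
        "--output", "/tmp/example-vectors",   // placeholder output dir
        "-wt", "TFIDF",                       // weight: short name is now "wt", no longer "w"
        "--maxNGramSize", "2",                // "ng": emit bigrams as well as unigrams
        "--minLLR", "1.0",                    // "ml": LLR threshold for n-grams (illustrative value)
        "--numReducers", "2",                 // "nr": reduce tasks for the first pass
        "--overwrite"                         // "w": clear the output directory first
    });
  }
}

With --overwrite set, HadoopUtil.overwriteOutput(outputDir) clears the output directory before any job runs, and the parsed maxNGramSize, minLLRValue and reduceTasks values are passed straight through to DictionaryVectorizer.createTermFrequencyVectors, per the final hunk.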