FPGrowthDriver.java

gsingers Fri, 04 Nov 2011 21:10:37 -0700

Author: gsingers
Date: Sat Nov  5 04:10:05 2011
New Revision: 1197880

URL: http://svn.apache.org/viewvc?rev=1197880&view=rev
Log:
convert to use AbstractJob for arg parsing


Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java?rev=1197880&r1=1197879&r2=1197880&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java Sat Nov  5 04:10:05 2011
@@ -17,34 +17,19 @@
 
 package org.apache.mahout.fpm.pfpgrowth;
 
-import java.io.File;
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-
 import com.google.common.io.Closeables;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
-import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.iterator.FileLineIterable;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.iterator.FileLineIterable;
 import org.apache.mahout.common.iterator.StringRecordIterator;
-import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.fpm.pfpgrowth.convertors.ContextStatusUpdater;
 import org.apache.mahout.fpm.pfpgrowth.convertors.SequenceFileOutputCollector;
 import org.apache.mahout.fpm.pfpgrowth.convertors.string.StringOutputConverter;
@@ -53,161 +38,134 @@ import org.apache.mahout.fpm.pfpgrowth.f
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public final class FPGrowthDriver {
-  
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+public final class FPGrowthDriver extends AbstractJob {
+
   private static final Logger log = 
LoggerFactory.getLogger(FPGrowthDriver.class);
-  
-  private FPGrowthDriver() { }
-  
+
+  private FPGrowthDriver() {
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new FPGrowthDriver(), args);
+  }
+
   /**
    * Run TopK FPGrowth given the input file,
    */
-  public static void main(String[] args) throws Exception {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-    
-    Option inputDirOpt = DefaultOptionCreator.inputOption().create();
-    
-    Option outputOpt = DefaultOptionCreator.outputOption().create();
-    
-    Option helpOpt = DefaultOptionCreator.helpOption();
-    
-    // minSupport(3), maxHeapSize(50), numGroups(1000)
-    Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
-      
abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Minimum Support. Default Value: 
3").withShortName("s").create();
-    
-    Option maxHeapSizeOpt = obuilder.withLongName("maxHeapSize").withArgument(
-      
abuilder.withName("maxHeapSize").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Maximum Heap Size k, to denote the requirement to mine top K 
items. Default value: 50")
-        .withShortName("k").create();
-    
-    Option numGroupsOpt = obuilder.withLongName("numGroups").withArgument(
-      
abuilder.withName("numGroups").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Number of groups the features should be divided in the 
map-reduce version."
-          + " Doesn't work in sequential version Default 
Value:1000").withShortName("g").create();
-    
-    Option recordSplitterOpt = 
obuilder.withLongName("splitterPattern").withArgument(
-      
abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()).withDescription(
-      "Regular Expression pattern used to split given string transaction into 
itemsets."
-          + " Default value splits comma separated itemsets.  Default Value:"
-          + " \"[ ,\\t]*[,|\\t][ ,\\t]*\" ").withShortName("regex").create();
-    
-    Option treeCacheOpt = 
obuilder.withLongName("numTreeCacheEntries").withArgument(
-      
abuilder.withName("numTreeCacheEntries").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Number of entries in the tree cache to prevent duplicate 
tree building. "
-          + "(Warning) a first level conditional FP-Tree might consume a lot 
of memory, "
-          + "so keep this value small, but big enough to prevent duplicate 
tree building. "
-          + "Default Value:5 Recommended Values: 
[5-10]").withShortName("tc").create();
-    
-    Option methodOpt = 
obuilder.withLongName("method").withRequired(true).withArgument(
-      
abuilder.withName("method").withMinimum(1).withMaximum(1).create()).withDescription(
-      "Method of processing: 
sequential|mapreduce").withShortName("method").create();
-    Option encodingOpt = obuilder.withLongName("encoding").withArgument(
-      
abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) The file encoding.  Default value: 
UTF-8").withShortName("e").create();
-    
-    Group group = 
gbuilder.withName("Options").withOption(minSupportOpt).withOption(inputDirOpt).withOption(
-      
outputOpt).withOption(maxHeapSizeOpt).withOption(numGroupsOpt).withOption(methodOpt).withOption(
-      
encodingOpt).withOption(helpOpt).withOption(treeCacheOpt).withOption(recordSplitterOpt).create();
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-      
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelp(group);
-        return;
-      }
-      
-      Parameters params = new Parameters();
-      
-      if (cmdLine.hasOption(minSupportOpt)) {
-        String minSupportString = (String) cmdLine.getValue(minSupportOpt);
-        params.set("minSupport", minSupportString);
-      }
-      if (cmdLine.hasOption(maxHeapSizeOpt)) {
-        String maxHeapSizeString = (String) cmdLine.getValue(maxHeapSizeOpt);
-        params.set("maxHeapSize", maxHeapSizeString);
-      }
-      if (cmdLine.hasOption(numGroupsOpt)) {
-        String numGroupsString = (String) cmdLine.getValue(numGroupsOpt);
-        params.set("numGroups", numGroupsString);
-      }
-      
-      if (cmdLine.hasOption(treeCacheOpt)) {
-        String numTreeCacheString = (String) cmdLine.getValue(treeCacheOpt);
-        params.set("treeCacheSize", numTreeCacheString);
-      }
-      
-      if (cmdLine.hasOption(recordSplitterOpt)) {
-        String patternString = (String) cmdLine.getValue(recordSplitterOpt);
-        params.set("splitPattern", patternString);
-      }
-      
-      String encoding = "UTF-8";
-      if (cmdLine.hasOption(encodingOpt)) {
-        encoding = (String) cmdLine.getValue(encodingOpt);
-      }
-      params.set("encoding", encoding);
-      Path inputDir = new Path(cmdLine.getValue(inputDirOpt).toString());
-      Path outputDir = new Path(cmdLine.getValue(outputOpt).toString());
-      
-      params.set("input", inputDir.toString());
-      params.set("output", outputDir.toString());
-      
-      String classificationMethod = (String) cmdLine.getValue(methodOpt);
-      if ("sequential".equalsIgnoreCase(classificationMethod)) {
-        runFPGrowth(params);
-      } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) {
-        Configuration conf = new Configuration();
-        HadoopUtil.delete(conf, outputDir);
-        PFPGrowth.runPFPGrowth(params);
-      }
-    } catch (OptionException e) {
-      CommandLineUtil.printHelp(group);
+  public int run(String[] args) throws Exception {
+    addInputOption();
+    addOutputOption();
+
+    addOption("minSupport", "s", "(Optional) The minimum number of times a 
co-occurrence must be present. Default Value: 3", "3");
+    addOption("maxHeapSize", "k", "(Optional) Maximum Heap Size k, to denote 
the requirement to mine top K items. Default value: 50", "50");
+    addOption("numGroups", "g", "(Optional) Number of groups the features 
should be divided in the map-reduce version."
+            + " Doesn't work in sequential version Default Value:1000", 
"1000");
+    addOption("splitterPattern", "regex", "Regular Expression pattern used to 
split given string transaction into itemsets."
+            + " Default value splits comma separated itemsets.  Default Value:"
+            + " \"[ ,\\t]*[,|\\t][ ,\\t]*\" ", "[ ,\t]*[,|\t][ ,\t]*");
+    addOption("numTreeCacheEntries", "tc", "(Optional) Number of entries in 
the tree cache to prevent duplicate tree building. "
+            + "(Warning) a first level conditional FP-Tree might consume a lot 
of memory, "
+            + "so keep this value small, but big enough to prevent duplicate 
tree building. "
+            + "Default Value:5 Recommended Values: [5-10]", "5");
+    addOption("method", "method", "Method of processing: 
sequential|mapreduce", "sequential");
+    addOption("encoding", "e", "(Optional) The file encoding.  Default value: 
UTF-8", "UTF-8");
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Parameters params = new Parameters();
+
+    if (hasOption("minSupport")) {
+      String minSupportString = getOption("minSupport");
+      params.set("minSupport", minSupportString);
+    }
+    if (hasOption("maxHeapSize")) {
+      String maxHeapSizeString = getOption("maxHeapSize");
+      params.set("maxHeapSize", maxHeapSizeString);
+    }
+    if (hasOption("numGroups")) {
+      String numGroupsString = getOption("numGroups");
+      params.set("numGroups", numGroupsString);
+    }
+
+    if (hasOption("numTreeCacheEntries")) {
+      String numTreeCacheString = getOption("numTreeCacheEntries");
+      params.set("treeCacheSize", numTreeCacheString);
+    }
+
+    if (hasOption("splitterPattern")) {
+      String patternString = getOption("splitterPattern");
+      params.set("splitPattern", patternString);
+    }
+
+    String encoding = "UTF-8";
+    if (hasOption("encoding")) {
+      encoding = getOption("encoding");
     }
+    params.set("encoding", encoding);
+    Path inputDir = getInputPath();
+    Path outputDir = getOutputPath();
+
+    params.set("input", inputDir.toString());
+    params.set("output", outputDir.toString());
+
+    String classificationMethod = getOption("method");
+    if ("sequential".equalsIgnoreCase(classificationMethod)) {
+      runFPGrowth(params);
+    } else if ("mapreduce".equalsIgnoreCase(classificationMethod)) {
+      Configuration conf = new Configuration();
+      HadoopUtil.delete(conf, outputDir);
+      PFPGrowth.runPFPGrowth(params);
+    }
+
+    return 0;
   }
-  
+
   private static void runFPGrowth(Parameters params) throws IOException {
     log.info("Starting Sequential FPGrowth");
     int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50"));
     int minSupport = Integer.valueOf(params.get("minSupport", "3"));
-    
+
     String output = params.get("output", "output.txt");
-    
+
     Path path = new Path(output);
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
-    
+
     Charset encoding = Charset.forName(params.get("encoding"));
     String input = params.get("input");
-    
+
     String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());
-    
+
     SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, 
Text.class, TopKStringPatterns.class);
-    
+
     FPGrowth<String> fp = new FPGrowth<String>();
     Collection<String> features = new HashSet<String>();
 
     try {
       fp.generateTopKFrequentPatterns(
-          new StringRecordIterator(new FileLineIterable(new File(input), 
encoding, false), pattern),
-          fp.generateFList(
               new StringRecordIterator(new FileLineIterable(new File(input), 
encoding, false), pattern),
-              minSupport),
-          minSupport,
-          maxHeapSize,
-          features,
-          new StringOutputConverter(new 
SequenceFileOutputCollector<Text,TopKStringPatterns>(writer)),
-          new ContextStatusUpdater(null));
+              fp.generateFList(
+                      new StringRecordIterator(new FileLineIterable(new 
File(input), encoding, false), pattern),
+                      minSupport),
+              minSupport,
+              maxHeapSize,
+              features,
+              new StringOutputConverter(new SequenceFileOutputCollector<Text, 
TopKStringPatterns>(writer)),
+              new ContextStatusUpdater(null));
     } finally {
       Closeables.closeQuietly(writer);
     }
-    
-    List<Pair<String,TopKStringPatterns>> frequentPatterns = 
FPGrowth.readFrequentPattern(conf, path);
-    for (Pair<String,TopKStringPatterns> entry : frequentPatterns) {
+
+    List<Pair<String, TopKStringPatterns>> frequentPatterns = 
FPGrowth.readFrequentPattern(conf, path);
+    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
       log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), 
entry.getSecond());
     }
   }

svn commit: r1197880 - /mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java

Reply via email to