Author: robinanil
Date: Fri Feb 12 12:42:42 2010
New Revision: 909383

URL: http://svn.apache.org/viewvc?rev=909383&view=rev
Log:
FPGrowth driver edits
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java?rev=909383&r1=909382&r2=909383&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthDriver.java Fri Feb 12 12:42:42 2010
@@ -39,6 +39,7 @@
 import org.apache.hadoop.io.Text;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.FileLineIterable;
+import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.Parameters;
 import org.apache.mahout.common.StringRecordIterator;
@@ -52,12 +53,12 @@
 import org.slf4j.LoggerFactory;
 
 public final class FPGrowthDriver {
-  
-  private static final Logger log = LoggerFactory.getLogger(FPGrowthDriver.class);
-  
-  private FPGrowthDriver() {
-  }
-  
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(FPGrowthDriver.class);
+  
+  private FPGrowthDriver() {}
+  
   /**
    * Run TopK FPGrowth given the input file,
    * 
@@ -69,145 +70,183 @@
    * @throws InterruptedException
    * @throws ClassNotFoundException
    */
-  public static void main(String[] args) throws IOException, OptionException, NumberFormatException,
-    IllegalStateException, InterruptedException, ClassNotFoundException {
+  public static void main(String[] args) throws IOException,
+                                         OptionException,
+                                         NumberFormatException,
+                                         IllegalStateException,
+                                         InterruptedException,
+                                         ClassNotFoundException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();
-    
-    Option inputDirOpt = obuilder.withLongName("input").withRequired(true).withArgument(
-      abuilder.withName("input").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Directory on HDFS containing the transaction files").withShortName("i").create();
-    
+    
+    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
+        .withArgument(
+          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The Directory on HDFS containing the transaction files")
+        .withShortName("i").create();
+    
     Option outputOpt = DefaultOptionCreator.outputOption().create();
-    
+    
     Option helpOpt = DefaultOptionCreator.helpOption();
-    
+    
     // minSupport(3), maxHeapSize(50), numGroups(1000)
     Option minSupportOpt = obuilder.withLongName("minSupport").withArgument(
-      abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Minimum Support. Default Value: 3").withShortName("s").create();
-    
-    Option maxHeapSizeOpt = obuilder.withLongName("maxHeapSize").withArgument(
-      abuilder.withName("maxHeapSize").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Maximum Heap Size k, to denote the requirement to mine top K items. Default value: 50")
+      abuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
+        .withDescription("(Optional) Minimum Support. Default Value: 3")
+        .withShortName("s").create();
+    
+    Option maxHeapSizeOpt = obuilder
+        .withLongName("maxHeapSize")
+        .withArgument(
+          abuilder.withName("maxHeapSize").withMinimum(1).withMaximum(1)
+              .create())
+        .withDescription(
+          "(Optional) Maximum Heap Size k, to denote the requirement to mine top K items. Default value: 50")
         .withShortName("k").create();
-    
-    Option numGroupsOpt = obuilder.withLongName("numGroups").withArgument(
-      abuilder.withName("numGroups").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Number of groups the features should be divided in the map-reduce version."
-        + " Doesn't work in sequential version Default Value:1000").withShortName("g").create();
-    
-    Option recordSplitterOpt = obuilder.withLongName("splitterPattern").withArgument(
-      abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1).create()).withDescription(
-      "Regular Expression pattern used to split given string transaction into itemsets."
-        + " Default value splits comma separated itemsets. Default Value:"
-        + " \"[ ,\\t]*[,|\\t][ ,\\t]*\" ").withShortName("regex").create();
-    
-    Option treeCacheOpt = obuilder.withLongName("numTreeCacheEntries").withArgument(
-      abuilder.withName("numTreeCacheEntries").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) Number of entries in the tree cache to prevent duplicate tree building. "
-        + "(Warning) a first level conditional FP-Tree might consume a lot of memory, "
-        + "so keep this value small, but big enough to prevent duplicate tree building. "
-        + "Default Value:5 Recommended Values: [5-10]").withShortName("tc").create();
-    
-    Option methodOpt = obuilder.withLongName("method").withRequired(true).withArgument(
-      abuilder.withName("method").withMinimum(1).withMaximum(1).create()).withDescription(
-      "Method of processing: sequential|mapreduce").withShortName("method").create();
-    Option encodingOpt = obuilder.withLongName("encoding").withArgument(
-      abuilder.withName("encoding").withMinimum(1).withMaximum(1).create()).withDescription(
-      "(Optional) The file encoding. Default value: UTF-8").withShortName("e").create();
-    
-    Group group = gbuilder.withName("Options").withOption(minSupportOpt).withOption(inputDirOpt)
-      .withOption(outputOpt).withOption(maxHeapSizeOpt).withOption(numGroupsOpt).withOption(methodOpt)
-      .withOption(encodingOpt).withOption(helpOpt).withOption(treeCacheOpt).withOption(recordSplitterOpt)
+    
+    Option numGroupsOpt = obuilder
+        .withLongName("numGroups")
+        .withArgument(
+          abuilder.withName("numGroups").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "(Optional) Number of groups the features should be divided in the map-reduce version."
+              + " Doesn't work in sequential version Default Value:1000")
+        .withShortName("g").create();
+    
+    Option recordSplitterOpt = obuilder
+        .withLongName("splitterPattern")
+        .withArgument(
+          abuilder.withName("splitterPattern").withMinimum(1).withMaximum(1)
+              .create())
+        .withDescription(
+          "Regular Expression pattern used to split given string transaction into itemsets."
+              + " Default value splits comma separated itemsets. Default Value:"
+              + " \"[ ,\\t]*[,|\\t][ ,\\t]*\" ").withShortName("regex")
       .create();
-    
-    Parser parser = new Parser();
-    parser.setGroup(group);
-    CommandLine cmdLine = parser.parse(args);
-    
-    if (cmdLine.hasOption(helpOpt)) {
+    
+    Option treeCacheOpt = obuilder
+        .withLongName("numTreeCacheEntries")
+        .withArgument(
+          abuilder.withName("numTreeCacheEntries").withMinimum(1)
+              .withMaximum(1).create())
+        .withDescription(
+          "(Optional) Number of entries in the tree cache to prevent duplicate tree building. "
+              + "(Warning) a first level conditional FP-Tree might consume a lot of memory, "
+              + "so keep this value small, but big enough to prevent duplicate tree building. "
+              + "Default Value:5 Recommended Values: [5-10]").withShortName(
+          "tc").create();
+    
+    Option methodOpt = obuilder.withLongName("method").withRequired(true)
+        .withArgument(
+          abuilder.withName("method").withMinimum(1).withMaximum(1).create())
+        .withDescription("Method of processing: sequential|mapreduce")
+        .withShortName("method").create();
+    Option encodingOpt = obuilder.withLongName("encoding").withArgument(
+      abuilder.withName("encoding").withMinimum(1).withMaximum(1).create())
+        .withDescription("(Optional) The file encoding. Default value: UTF-8")
+        .withShortName("e").create();
+    
+    Group group = gbuilder.withName("Options").withOption(minSupportOpt)
+        .withOption(inputDirOpt).withOption(outputOpt).withOption(
+          maxHeapSizeOpt).withOption(numGroupsOpt).withOption(methodOpt)
+        .withOption(encodingOpt).withOption(helpOpt).withOption(treeCacheOpt)
+        .withOption(recordSplitterOpt).create();
+    try {
+      Parser parser = new Parser();
+      parser.setGroup(group);
+      CommandLine cmdLine = parser.parse(args);
+      
+      if (cmdLine.hasOption(helpOpt)) {
+        CommandLineUtil.printHelp(group);
+        return;
+      }
+      
+      Parameters params = new Parameters();
+      
+      if (cmdLine.hasOption(minSupportOpt)) {
+        String minSupportString = (String) cmdLine.getValue(minSupportOpt);
+        params.set("minSupport", minSupportString);
+      }
+      if (cmdLine.hasOption(maxHeapSizeOpt)) {
+        String maxHeapSizeString = (String) cmdLine.getValue(maxHeapSizeOpt);
+        params.set("maxHeapSize", maxHeapSizeString);
+      }
+      if (cmdLine.hasOption(numGroupsOpt)) {
+        String numGroupsString = (String) cmdLine.getValue(numGroupsOpt);
+        params.set("numGroups", numGroupsString);
+      }
+      
+      if (cmdLine.hasOption(treeCacheOpt)) {
+        String numTreeCacheString = (String) cmdLine.getValue(treeCacheOpt);
+        params.set("treeCacheSize", numTreeCacheString);
+      }
+      
+      if (cmdLine.hasOption(recordSplitterOpt)) {
+        String patternString = (String) cmdLine.getValue(recordSplitterOpt);
+        params.set("splitPattern", patternString);
+      }
+      
+      String encoding = "UTF-8";
+      if (cmdLine.hasOption(encodingOpt)) {
+        encoding = (String) cmdLine.getValue(encodingOpt);
+      }
+      params.set("encoding", encoding);
+      String inputDir = (String) cmdLine.getValue(inputDirOpt);
+      String outputDir = (String) cmdLine.getValue(outputOpt);
+      
+      params.set("input", inputDir);
+      params.set("output", outputDir);
+      
+      String classificationMethod = (String) cmdLine.getValue(methodOpt);
+      if (classificationMethod.equalsIgnoreCase("sequential")) {
+        runFPGrowth(params);
+      } else if (classificationMethod.equalsIgnoreCase("mapreduce")) {
+        HadoopUtil.overwriteOutput(outputDir);
+        PFPGrowth.runPFPGrowth(params);
+      }
+    } catch (OptionException e) {
       CommandLineUtil.printHelp(group);
-      return;
-    }
-    
-    Parameters params = new Parameters();
-    
-    if (cmdLine.hasOption(minSupportOpt)) {
-      String minSupportString = (String) cmdLine.getValue(minSupportOpt);
-      params.set("minSupport", minSupportString);
-    }
-    if (cmdLine.hasOption(maxHeapSizeOpt)) {
-      String maxHeapSizeString = (String) cmdLine.getValue(maxHeapSizeOpt);
-      params.set("maxHeapSize", maxHeapSizeString);
-    }
-    if (cmdLine.hasOption(numGroupsOpt)) {
-      String numGroupsString = (String) cmdLine.getValue(numGroupsOpt);
-      params.set("numGroups", numGroupsString);
-    }
-    
-    if (cmdLine.hasOption(treeCacheOpt)) {
-      String numTreeCacheString = (String) cmdLine.getValue(treeCacheOpt);
-      params.set("treeCacheSize", numTreeCacheString);
-    }
-    
-    if (cmdLine.hasOption(recordSplitterOpt)) {
-      String patternString = (String) cmdLine.getValue(recordSplitterOpt);
-      params.set("splitPattern", patternString);
-    }
-    
-    String encoding = "UTF-8";
-    if (cmdLine.hasOption(encodingOpt)) {
-      encoding = (String) cmdLine.getValue(encodingOpt);
-    }
-    params.set("encoding", encoding);
-    String inputDir = (String) cmdLine.getValue(inputDirOpt);
-    String outputDir = (String) cmdLine.getValue(outputOpt);
-    
-    params.set("input", inputDir);
-    params.set("output", outputDir);
-    
-    String classificationMethod = (String) cmdLine.getValue(methodOpt);
-    if (classificationMethod.equalsIgnoreCase("sequential")) {
-      runFPGrowth(params);
-    } else if (classificationMethod.equalsIgnoreCase("mapreduce")) {
-      PFPGrowth.runPFPGrowth(params);
     }
   }
-  
+  
   private static void runFPGrowth(Parameters params) throws IOException {
     log.info("Starting Sequential FPGrowth");
     int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50"));
     int minSupport = Integer.valueOf(params.get("minSupport", "3"));
-    
+    
     String output = params.get("output", "output.txt");
-    
+    
     Path path = new Path(output);
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
-    
+    
     Charset encoding = Charset.forName(params.get("encoding"));
     String input = params.get("input");
-    
+    
     String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());
-    
-    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class,
-      TopKStringPatterns.class);
-    
+    
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
+      Text.class, TopKStringPatterns.class);
+    
     FPGrowth<String> fp = new FPGrowth<String>();
     Set<String> features = new HashSet<String>();
-    
-    fp.generateTopKFrequentPatterns(new StringRecordIterator(new FileLineIterable(new File(input),
-      encoding, false), pattern), fp.generateFList(new StringRecordIterator(new FileLineIterable(
-      new File(input), encoding, false), pattern), minSupport), minSupport, maxHeapSize, features,
-      new StringOutputConverter(new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
-      new ContextStatusUpdater(null));
+    
+    fp.generateTopKFrequentPatterns(new StringRecordIterator(
+      new FileLineIterable(new File(input), encoding, false), pattern), fp
+        .generateFList(new StringRecordIterator(new FileLineIterable(new File(
+          input), encoding, false), pattern), minSupport), minSupport,
+      maxHeapSize, features, new StringOutputConverter(
+        new SequenceFileOutputCollector<Text,TopKStringPatterns>(writer)),
+      new ContextStatusUpdater(null));
     writer.close();
-    
-    List<Pair<String, TopKStringPatterns>> frequentPatterns = FPGrowth.readFrequentPattern(fs, conf, path);
-    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
-      log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond().toString());
+    
+    List<Pair<String,TopKStringPatterns>> frequentPatterns = FPGrowth
+        .readFrequentPattern(fs, conf, path);
+    for (Pair<String,TopKStringPatterns> entry : frequentPatterns) {
+      log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry
+          .getSecond().toString());
     }
   }
 }
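
For reference, the revised driver can be exercised end to end through its main() entry point. The sketch below is illustrative only: the option names (--input, --output, --method, --minSupport, --maxHeapSize) are the ones registered in the diff above, and --input and --method are the two marked required(true) there; the HDFS paths are hypothetical placeholders.

    import org.apache.mahout.fpm.pfpgrowth.FPGrowthDriver;

    // A minimal sketch of driving the new CLI programmatically; unspecified
    // options fall back to the defaults documented in the option descriptions.
    public class FPGrowthDriverExample {
      public static void main(String[] args) throws Exception {
        FPGrowthDriver.main(new String[] {
            "--input", "/user/hadoop/transactions", // hypothetical HDFS dir of transaction files
            "--output", "/user/hadoop/patterns",    // hypothetical output path
            "--method", "mapreduce",                // or "sequential" for the runFPGrowth() path
            "--minSupport", "3",                    // optional, default 3
            "--maxHeapSize", "50"                   // optional, mine top k=50 patterns per feature
        });
      }
    }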
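
Beyond the reformatting, the functional change here is the new HadoopUtil.overwriteOutput(outputDir) call made before PFPGrowth.runPFPGrowth(params), so a mapreduce run no longer trips over an output directory left behind by a previous run, and the parse logic now reports usage via the OptionException catch block instead of propagating the exception. A minimal sketch of the overwrite pattern, assuming overwriteOutput amounts to a recursive delete of the path (an assumption; see HadoopUtil in org.apache.mahout.common for the actual implementation):

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public final class OverwriteOutputSketch {
      private OverwriteOutputSketch() {}

      // Assumed behavior: delete the output directory, if present, so the
      // PFPGrowth job can recreate it from scratch on the next run.
      public static void overwriteOutput(String output) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(output);
        if (fs.exists(outputPath)) {
          fs.delete(outputPath, true); // true = recursive
        }
      }
    }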