This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 06b227122fac64533dd7e8912ae24ad2bac41c87 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Wed Apr 29 09:54:32 2020 +0200 NUTCH-2785 FreeGenerator: command-line option to define number of generated fetch lists - add command-line option `-numFetchers` to FreeGenerator - in local mode: generate one single fetch list --- src/java/org/apache/nutch/tools/FreeGenerator.java | 26 +++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java index 8c537d9..b51be74 100644 --- a/src/java/org/apache/nutch/tools/FreeGenerator.java +++ b/src/java/org/apache/nutch/tools/FreeGenerator.java @@ -146,27 +146,33 @@ public class FreeGenerator extends Configured implements Tool { @Override public int run(String[] args) throws Exception { if (args.length < 2) { - System.err - .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]"); + System.err.println( + "Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize] [-numFetchers <n>]"); System.err .println("\tinputDir\tinput directory containing one or more input files."); System.err - .println("\t\tEach text file contains a list of URLs, one URL per line"); + .println("\t \tEach text file contains a list of URLs, one URL per line"); System.err .println("\tsegmentsDir\toutput directory, where new segment will be created"); - System.err.println("\t-filter\trun current URLFilters on input URLs"); + System.err.println("\t-filter \trun current URLFilters on input URLs"); System.err .println("\t-normalize\trun current URLNormalizers on input URLs"); + System.err.println( + "\t-numFetchers <n>\tnumber of generated fetch lists, determines number of fetcher tasks"); return -1; } boolean filter = false; boolean normalize = false; + int numFetchers = -1; if (args.length > 2) { for (int i = 2; i < args.length; i++) { if (args[i].equals("-filter")) { filter = true; } else if (args[i].equals("-normalize")) { normalize = true; + } else if ("-numFetchers".equals(args[i])) { + numFetchers = Integer.parseInt(args[i + 1]); + i++; } else { LOG.error("Unknown argument: " + args[i] + ", exiting ..."); return -1; @@ -191,7 +197,17 @@ public class FreeGenerator extends Configured implements Tool { job.setPartitionerClass(URLPartitioner.class); job.setReducerClass(FG.FGReducer.class); String segName = Generator.generateSegmentName(); - job.setNumReduceTasks(Integer.parseInt(conf.get("mapreduce.job.maps"))); + if (numFetchers == -1) { + /* for politeness create exactly one partition per fetch task */ + numFetchers = Integer.parseInt(conf.get("mapreduce.job.maps")); + } + if ("local".equals(conf.get("mapreduce.framework.name")) + && numFetchers != 1) { + // override + LOG.info( + "FreeGenerator: running in local mode, generating exactly one partition."); + numFetchers = 1; + } job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class);