This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 72f3ff2 NUTCH-2785 FreeGenerator: command-line option to define
number of generated fetch lists - add command-line option `-numFetchers` to
FreeGenerator - in local mode: generate one single fetch list
new aa0c75e Merge pull request #519 from
sebastian-nagel/NUTCH-2785-freegenerator-num-fetch-lists
72f3ff2 is described below
commit 72f3ff20d28f2e19281a5d1c83139b152acac1e1
Author: Sebastian Nagel <[email protected]>
AuthorDate: Wed Apr 29 09:54:32 2020 +0200
NUTCH-2785 FreeGenerator: command-line option to define number of generated
fetch lists
- add command-line option `-numFetchers` to FreeGenerator
- in local mode: generate one single fetch list
---
src/java/org/apache/nutch/tools/FreeGenerator.java | 26 +++++++++++++++++-----
1 file changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java
index 8c537d9..b51be74 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -146,27 +146,33 @@ public class FreeGenerator extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err
- .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter]
[-normalize]");
+ System.err.println(
+ "Usage: FreeGenerator <inputDir> <segmentsDir> [-filter]
[-normalize] [-numFetchers <n>]");
System.err
.println("\tinputDir\tinput directory containing one or more input
files.");
System.err
- .println("\t\tEach text file contains a list of URLs, one URL per
line");
+ .println("\t \tEach text file contains a list of URLs, one
URL per line");
System.err
.println("\tsegmentsDir\toutput directory, where new segment will be
created");
- System.err.println("\t-filter\trun current URLFilters on input URLs");
+ System.err.println("\t-filter \trun current URLFilters on input URLs");
System.err
.println("\t-normalize\trun current URLNormalizers on input URLs");
+ System.err.println(
+ "\t-numFetchers <n>\tnumber of generated fetch lists, determines
number of fetcher tasks");
return -1;
}
boolean filter = false;
boolean normalize = false;
+ int numFetchers = -1;
if (args.length > 2) {
for (int i = 2; i < args.length; i++) {
if (args[i].equals("-filter")) {
filter = true;
} else if (args[i].equals("-normalize")) {
normalize = true;
+ } else if ("-numFetchers".equals(args[i])) {
+ numFetchers = Integer.parseInt(args[i + 1]);
+ i++;
} else {
LOG.error("Unknown argument: " + args[i] + ", exiting ...");
return -1;
@@ -191,7 +197,17 @@ public class FreeGenerator extends Configured implements Tool {
job.setPartitionerClass(URLPartitioner.class);
job.setReducerClass(FG.FGReducer.class);
String segName = Generator.generateSegmentName();
- job.setNumReduceTasks(Integer.parseInt(conf.get("mapreduce.job.maps")));
+ if (numFetchers == -1) {
+ /* for politeness create exactly one partition per fetch task */
+ numFetchers = Integer.parseInt(conf.get("mapreduce.job.maps"));
+ }
+ if ("local".equals(conf.get("mapreduce.framework.name"))
+ && numFetchers != 1) {
+ // override
+ LOG.info(
+ "FreeGenerator: running in local mode, generating exactly one
partition.");
+ numFetchers = 1;
+ }
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);