This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 06b227122fac64533dd7e8912ae24ad2bac41c87
Author: Sebastian Nagel <sna...@apache.org>
AuthorDate: Wed Apr 29 09:54:32 2020 +0200

    NUTCH-2785 FreeGenerator: command-line option to define number of generated fetch lists
    - add command-line option `-numFetchers` to FreeGenerator
    - in local mode: generate one single fetch list
---
 src/java/org/apache/nutch/tools/FreeGenerator.java | 26 +++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java
index 8c537d9..b51be74 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -146,27 +146,33 @@ public class FreeGenerator extends Configured implements Tool {
   @Override
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err
-          .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] 
[-normalize]");
+      System.err.println(
+          "Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] 
[-normalize] [-numFetchers <n>]");
       System.err
           .println("\tinputDir\tinput directory containing one or more input 
files.");
       System.err
-          .println("\t\tEach text file contains a list of URLs, one URL per 
line");
+          .println("\t        \tEach text file contains a list of URLs, one 
URL per line");
       System.err
           .println("\tsegmentsDir\toutput directory, where new segment will be 
created");
-      System.err.println("\t-filter\trun current URLFilters on input URLs");
+      System.err.println("\t-filter   \trun current URLFilters on input URLs");
       System.err
           .println("\t-normalize\trun current URLNormalizers on input URLs");
+      System.err.println(
+          "\t-numFetchers <n>\tnumber of generated fetch lists, determines 
number of fetcher tasks");
       return -1;
     }
     boolean filter = false;
     boolean normalize = false;
+    int numFetchers = -1;
     if (args.length > 2) {
       for (int i = 2; i < args.length; i++) {
         if (args[i].equals("-filter")) {
           filter = true;
         } else if (args[i].equals("-normalize")) {
           normalize = true;
+        } else if ("-numFetchers".equals(args[i])) {
+          numFetchers = Integer.parseInt(args[i + 1]);
+          i++;
         } else {
           LOG.error("Unknown argument: " + args[i] + ", exiting ...");
           return -1;
@@ -191,7 +197,17 @@ public class FreeGenerator extends Configured implements Tool {
     job.setPartitionerClass(URLPartitioner.class);
     job.setReducerClass(FG.FGReducer.class);
     String segName = Generator.generateSegmentName();
-    job.setNumReduceTasks(Integer.parseInt(conf.get("mapreduce.job.maps")));
+    if (numFetchers == -1) {
+      /* for politeness create exactly one partition per fetch task */
+      numFetchers = Integer.parseInt(conf.get("mapreduce.job.maps"));
+    }
+    if ("local".equals(conf.get("mapreduce.framework.name"))
+        && numFetchers != 1) {
+      // override
+      LOG.info(
+          "FreeGenerator: running in local mode, generating exactly one 
partition.");
+      numFetchers = 1;
+    }
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(CrawlDatum.class);

Reply via email to