This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 21d56a0  NUTCH-1763 Code comment Injector contributed by Diaa
21d56a0 is described below

commit 21d56a0c5626553a3bf5058588d9277e6844e00f
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Oct 17 22:31:22 2017 +0200

    NUTCH-1763 Code comment Injector contributed by Diaa
---
 src/java/org/apache/nutch/crawl/Injector.java | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 642056e..5f5fd15 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -57,10 +57,11 @@ import java.util.Map;
 import java.util.Random;
 
 /**
- * Injector takes a flat file of URLs and merges ("injects") these URLs into the
- * CrawlDb. Useful for bootstrapping a Nutch crawl. The URL files contain one
- * URL per line, optionally followed by custom metadata separated by tabs with
- * the metadata key separated from the corresponding value by '='.
+ * Injector takes a flat text file of URLs (or a folder containing text files)
+ * and merges ("injects") these URLs into the CrawlDb. Useful for bootstrapping
+ * a Nutch crawl. The URL files contain one URL per line, optionally followed by
+ * custom metadata separated by tabs with the metadata key separated from the
+ * corresponding value by '='.
  * <p>
  * Note, that some metadata keys are reserved:
  * <dl>
@@ -100,6 +101,16 @@ public class Injector extends NutchTool implements Tool {
    */
   public static String nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed";
 
+  /**
+   * InjectMapper reads
+   * <ul>
+   * <li>the CrawlDb seeds are injected into</li>
+   * <li>the plain-text seed files and parses each line into the URL and
+   * metadata. Seed URLs are passed to the reducer with STATUS_INJECTED.</li>
+   * </ul>
+   * Depending on configuration and command-line parameters the URLs are normalized
+   * and filtered using the configured plugins.
+   */
   public static class InjectMapper
       extends Mapper<Text, Writable, Text, CrawlDatum> {
     public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
@@ -163,7 +174,7 @@ public class Injector extends NutchTool implements Tool {
       for (String split : splits) {
         // find separation between name and value
         int indexEquals = split.indexOf(EQUAL_CHARACTER);
-        if (indexEquals == -1) // skip anything without a EQUAL_CHARACTER
+        if (indexEquals == -1) // skip anything without an = (EQUAL_CHARACTER)
           continue;
 
         String metaname = split.substring(0, indexEquals);
@@ -276,7 +287,7 @@ public class Injector extends NutchTool implements Tool {
     }
 
     /**
-     * Merge the input records as per rules below :
+     * Merge the input records of one URL as per rules below :
      * 
      * <pre>
      * 1. If there is ONLY new injected record ==&gt; emit injected record
@@ -456,9 +467,9 @@ public class Injector extends NutchTool implements Tool {
     System.err.println(
         "  <crawldb>\tPath to a crawldb directory. If not present, a new one would be created.");
     System.err.println(
-        "  <url_dir>\tPath to directory with URL file(s) containing urls to be injected. A URL file");
+        "  <url_dir>\tPath to URL file or directory with URL file(s) containing URLs to be injected.");
     System.err.println(
-        "           \tshould have one URL per line, optionally followed by custom metadata.");
+        "           \tA URL file should have one URL per line, optionally followed by custom metadata.");
     System.err.println(
         "           \tBlank lines or lines starting with a '#' would be ignored. Custom metadata must");
     System.err

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to