This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 21d56a0 NUTCH-1763 Code comment Injector contributed by Diaa
21d56a0 is described below
commit 21d56a0c5626553a3bf5058588d9277e6844e00f
Author: Sebastian Nagel <[email protected]>
AuthorDate: Tue Oct 17 22:31:22 2017 +0200
NUTCH-1763 Code comment Injector contributed by Diaa
---
src/java/org/apache/nutch/crawl/Injector.java | 27 +++++++++++++++++++--------
1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 642056e..5f5fd15 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -57,10 +57,11 @@ import java.util.Map;
import java.util.Random;
/**
- * Injector takes a flat file of URLs and merges ("injects") these URLs into the
- * CrawlDb. Useful for bootstrapping a Nutch crawl. The URL files contain one
- * URL per line, optionally followed by custom metadata separated by tabs with
- * the metadata key separated from the corresponding value by '='.
+ * Injector takes a flat text file of URLs (or a folder containing text files)
+ * and merges ("injects") these URLs into the CrawlDb. Useful for bootstrapping
+ * a Nutch crawl. The URL files contain one URL per line, optionally followed by
+ * custom metadata separated by tabs with the metadata key separated from the
+ * corresponding value by '='.
* <p>
* Note, that some metadata keys are reserved:
* <dl>
@@ -100,6 +101,16 @@ public class Injector extends NutchTool implements Tool {
*/
public static String nutchFixedFetchIntervalMDName =
"nutch.fetchInterval.fixed";
+ /**
+ * InjectMapper reads
+ * <ul>
+ * <li>the CrawlDb seeds are injected into</li>
+ * <li>the plain-text seed files and parses each line into the URL and
+ * metadata. Seed URLs are passed to the reducer with STATUS_INJECTED.</li>
+ * </ul>
+ * Depending on configuration and command-line parameters the URLs are normalized
+ * and filtered using the configured plugins.
+ */
public static class InjectMapper
extends Mapper<Text, Writable, Text, CrawlDatum> {
public static final String URL_NORMALIZING_SCOPE =
"crawldb.url.normalizers.scope";
@@ -163,7 +174,7 @@ public class Injector extends NutchTool implements Tool {
for (String split : splits) {
// find separation between name and value
int indexEquals = split.indexOf(EQUAL_CHARACTER);
- if (indexEquals == -1) // skip anything without a EQUAL_CHARACTER
+ if (indexEquals == -1) // skip anything without an = (EQUAL_CHARACTER)
continue;
String metaname = split.substring(0, indexEquals);
@@ -276,7 +287,7 @@ public class Injector extends NutchTool implements Tool {
}
/**
- * Merge the input records as per rules below :
+ * Merge the input records of one URL as per rules below :
*
* <pre>
* 1. If there is ONLY new injected record ==> emit injected record
@@ -456,9 +467,9 @@ public class Injector extends NutchTool implements Tool {
System.err.println(
" <crawldb>\tPath to a crawldb directory. If not present, a new one would be created.");
System.err.println(
- " <url_dir>\tPath to directory with URL file(s) containing urls to be injected. A URL file");
+ " <url_dir>\tPath to URL file or directory with URL file(s) containing URLs to be injected.");
System.err.println(
- " \tshould have one URL per line, optionally followed by custom metadata.");
+ " \tA URL file should have one URL per line, optionally followed by custom metadata.");
System.err.println(
" \tBlank lines or lines starting with a '#' would be ignored. Custom metadata must");
System.err
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].