This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 76c8cff  NUTCH-2700 Indexchecker: improve command-line help - add 
option `-doIndex` to pass "checked" document to index writers (the property 
`doIndex` is kept to ensure backward compatibility)
     new 510a4ea  Merge pull request #446 from 
sebastian-nagel/NUTCH-2700-indexchecker-cmd-line-help
76c8cff is described below

commit 76c8cff1402e217049942bac88a8a005d45abf43
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Mar 14 16:46:25 2019 +0100

    NUTCH-2700 Indexchecker: improve command-line help
    - add option `-doIndex` to pass "checked" document to index writers
      (the property `doIndex` is kept to ensure backward compatibility)
---
 .../nutch/indexer/IndexingFiltersChecker.java      | 27 ++++++++++++++++++++--
 src/java/org/apache/nutch/parse/ParserChecker.java |  2 +-
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java 
b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 08c85c3..fa62a00 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -59,6 +59,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
   protected URLNormalizers normalizers = null;
   protected boolean dumpText = false;
   protected boolean followRedirects = false;
+  protected boolean doIndex = false;
   // used to simulate the metadata propagated from injection
   protected HashMap<String, String> metadata = new HashMap<>();
 
@@ -68,7 +69,24 @@ public class IndexingFiltersChecker extends AbstractChecker {
   public int run(String[] args) throws Exception {
     String url = null;
 
-    usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] 
[-dumpText] [-md key=value] (-stdin | -listen <port> [-keepClientCnxOpen])";
+    String usage = "Usage:\n" //
+        + "  IndexingFiltersChecker [OPTIONS] <url>\n" //
+        + "    Fetch single URL and index it\n" //
+        + "  IndexingFiltersChecker [OPTIONS] -stdin\n" //
+        + "    Read URLs to be indexed from stdin\n" //
+        + "  IndexingFiltersChecker [OPTIONS] -listen <port> 
[-keepClientCnxOpen]\n" //
+        + "    Listen on <port> for URLs to be indexed\n" //
+        + "Options:\n" //
+        + "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
+        + "                  \t(a generic Hadoop option to be passed\n" //
+        + "                  \t before other command-specific options)\n"
+        + "  -normalize      \tnormalize URLs\n" //
+        + "  -followRedirects\tfollow redirects when fetching URL\n" //
+        + "  -dumpText       \tshow the entire plain-text content,\n" //"
+        + "                  \tnot only the first 100 characters\n" //
+        + "  -doIndex        \tpass document to configured index writers\n" //
+        + "                  \tand let them index it\n" //
+        + "  -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
 
     // Print help when no args given
     if (args.length < 1) {
@@ -76,6 +94,9 @@ public class IndexingFiltersChecker extends AbstractChecker {
       System.exit(-1);
     }
 
+    // read property "doIndex" for back-ward compatibility
+    doIndex = getConf().getBoolean("doIndex", false);
+
     int numConsumed;
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-normalize")) {
@@ -84,6 +105,8 @@ public class IndexingFiltersChecker extends AbstractChecker {
         followRedirects = true;
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
+      } else if (args[i].equals("-doIndex")) {
+        doIndex = true;
       } else if (args[i].equals("-md")) {
         String k = null, v = null;
         String nextOne = args[++i];
@@ -268,7 +291,7 @@ public class IndexingFiltersChecker extends AbstractChecker 
{
     
     output.append("\n"); // For readability if keepClientCnxOpen
 
-    if (getConf().getBoolean("doIndex", false)) {
+    if (doIndex) {
       IndexWriters writers = IndexWriters.get(getConf());
       writers.open(getConf(), "IndexingFilterChecker");
       writers.write(doc);
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java 
b/src/java/org/apache/nutch/parse/ParserChecker.java
index 454068b..8419fa3 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -91,7 +91,7 @@ public class ParserChecker extends AbstractChecker {
         + "Options:\n" //
         + "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
         + "                  \t(a generic Hadoop option to be passed\n" //
-        + "                  \t before other command-specific options)"
+        + "                  \t before other command-specific options)\n"
         + "  -normalize      \tnormalize URLs\n" //
         + "  -followRedirects\tfollow redirects when fetching URL\n" //
         + "  -dumpText       \talso show the plain-text extracted by 
parsers\n" //

Reply via email to