This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 76c8cff NUTCH-2700 Indexchecker: improve command-line help - add
option `-doIndex` to pass "checked" document to index writers (the property
`doIndex` is kept to ensure backward compatibility)
new 510a4ea Merge pull request #446 from
sebastian-nagel/NUTCH-2700-indexchecker-cmd-line-help
76c8cff is described below
commit 76c8cff1402e217049942bac88a8a005d45abf43
Author: Sebastian Nagel <[email protected]>
AuthorDate: Thu Mar 14 16:46:25 2019 +0100
NUTCH-2700 Indexchecker: improve command-line help
- add option `-doIndex` to pass "checked" document to index writers
(the property `doIndex` is kept to ensure backward compatibility)
---
.../nutch/indexer/IndexingFiltersChecker.java | 27 ++++++++++++++++++++--
src/java/org/apache/nutch/parse/ParserChecker.java | 2 +-
2 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 08c85c3..fa62a00 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -59,6 +59,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean doIndex = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
@@ -68,7 +69,24 @@ public class IndexingFiltersChecker extends AbstractChecker {
public int run(String[] args) throws Exception {
String url = null;
- usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] (-stdin | -listen <port> [-keepClientCnxOpen])";
+ String usage = "Usage:\n" //
+ + " IndexingFiltersChecker [OPTIONS] <url>\n" //
+ + " Fetch single URL and index it\n" //
+ + " IndexingFiltersChecker [OPTIONS] -stdin\n" //
+ + " Read URLs to be indexed from stdin\n" //
+ + " IndexingFiltersChecker [OPTIONS] -listen <port> [-keepClientCnxOpen]\n" //
+ + " Listen on <port> for URLs to be indexed\n" //
+ + "Options:\n" //
+ + " -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
+ + " \t(a generic Hadoop option to be passed\n" //
+ + " \t before other command-specific options)\n"
+ + " -normalize \tnormalize URLs\n" //
+ + " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -dumpText \tshow the entire plain-text content,\n" //
+ + " \tnot only the first 100 characters\n" //
+ + " -doIndex \tpass document to configured index writers\n" //
+ + " \tand let them index it\n" //
+ + " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
// Print help when no args given
if (args.length < 1) {
@@ -76,6 +94,9 @@ public class IndexingFiltersChecker extends AbstractChecker {
System.exit(-1);
}
+ // read property "doIndex" for backward compatibility
+ doIndex = getConf().getBoolean("doIndex", false);
+
int numConsumed;
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-normalize")) {
@@ -84,6 +105,8 @@ public class IndexingFiltersChecker extends AbstractChecker {
followRedirects = true;
} else if (args[i].equals("-dumpText")) {
dumpText = true;
+ } else if (args[i].equals("-doIndex")) {
+ doIndex = true;
} else if (args[i].equals("-md")) {
String k = null, v = null;
String nextOne = args[++i];
@@ -268,7 +291,7 @@ public class IndexingFiltersChecker extends AbstractChecker {
output.append("\n"); // For readability if keepClientCnxOpen
- if (getConf().getBoolean("doIndex", false)) {
+ if (doIndex) {
IndexWriters writers = IndexWriters.get(getConf());
writers.open(getConf(), "IndexingFilterChecker");
writers.write(doc);
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 454068b..8419fa3 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -91,7 +91,7 @@ public class ParserChecker extends AbstractChecker {
+ "Options:\n" //
+ " -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
+ " \t(a generic Hadoop option to be passed\n" //
- + " \t before other command-specific options)"
+ + " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ " -dumpText \talso show the plain-text extracted by parsers\n" //