Author: lewismc
Date: Wed Apr 17 23:22:59 2013
New Revision: 1469100
URL: http://svn.apache.org/r1469100
Log:
NUTCH-1501 Harmonize behavior of parsechecker and indexchecker
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1469100&r1=1469099&r2=1469100&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Apr 17 23:22:59 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel +
lewismc)
+
* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)
* NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)
Modified:
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1469100&r1=1469099&r2=1469100&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Wed Apr 17 23:22:59 2013
@@ -68,7 +68,7 @@ public class IndexingFiltersChecker exte
if (args.length != 1) {
System.err.println(usage);
- System.exit(-1);
+ return -1;
}
url = URLUtil.toASCII(args[0]);
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1469100&r1=1469099&r2=1469100&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Apr 17
23:22:59 2013
@@ -35,19 +35,38 @@ import org.apache.nutch.util.StringUtil;
/**
* Parser checker, useful for testing parser.
- *
+ * It also accurately reports possible fetching and
+ * parsing failures and presents protocol status signals to aid
+ * debugging. The tool enables us to retrieve the following data from
+ * any url:
+ * <ol>
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID)
and is used to remove
+ * duplicates during the dedup procedure.
+ * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Title</tt>: of the URL</li>
+ * <li><tt>Outlinks</tt>: associated with the URL</li>
+ * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
<i>Cache-Control</i>, etc.</li>
+ * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
+ * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length
depending on
+ * <code>content.length</code> configuration.</li>
+ * </ol>
* @author John Xing
*/
public class ParserChecker implements Tool {
public static final Logger LOG =
LoggerFactory.getLogger(ParserChecker.class);
+ private Configuration conf;
public ParserChecker() {
}
- Configuration conf = null;
-
public int run(String[] args) throws Exception {
boolean dumpText = false;
boolean force = false;
@@ -57,8 +76,8 @@ public class ParserChecker implements To
String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
if (args.length == 0) {
- System.err.println(usage);
- System.exit(-1);
+ LOG.error(usage);
+ return (-1);
}
for (int i = 0; i < args.length; i++) {
@@ -68,7 +87,7 @@ public class ParserChecker implements To
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (i != args.length - 1) {
- System.err.println(usage);
+ LOG.error(usage);
System.exit(-1);
} else {
url = URLUtil.toASCII(args[i]);
@@ -102,7 +121,7 @@ public class ParserChecker implements To
}
if (contentType == null) {
- System.err.println("");
+ LOG.error("Failed to determine content type!");
return (-1);
}
@@ -112,9 +131,14 @@ public class ParserChecker implements To
ParseResult parseResult = new ParseUtil(conf).parse(content);
+ if (parseResult == null) {
+ LOG.error("Problem with parse - check log");
+ return (-1);
+ }
+
// Calculate the signature
byte[] signature =
SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new
Text(url)));
-
+
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
@@ -123,12 +147,12 @@ public class ParserChecker implements To
for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
Parse parse = entry.getValue();
- System.out.print("---------\nUrl\n---------------\n");
+ LOG.info("---------\nUrl\n---------------\n");
System.out.print(entry.getKey());
- System.out.print("\n---------\nParseData\n---------\n");
+ LOG.info("\n---------\nParseData\n---------\n");
System.out.print(parse.getData().toString());
if (dumpText) {
- System.out.print("---------\nParseText\n---------\n");
+ LOG.info("---------\nParseText\n---------\n");
System.out.print(parse.getText());
}
}