Author: lewismc
Date: Wed Apr 17 23:20:36 2013
New Revision: 1469099
URL: http://svn.apache.org/r1469099
Log:
NUTCH-1501 Harmonize behavior of parsechecker and indexchecker
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/log4j.properties
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Apr 17 23:20:36 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel +
lewismc)
+
* NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc)
* NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via
lewismc)
Modified: nutch/branches/2.x/conf/log4j.properties
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/conf/log4j.properties (original)
+++ nutch/branches/2.x/conf/log4j.properties Wed Apr 17 23:20:36 2013
@@ -38,6 +38,8 @@ log4j.logger.org.apache.nutch.indexer.so
log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
+log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout
+log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout
log4j.logger.org.apache.nutch=INFO
log4j.logger.org.apache.hadoop=WARN
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Wed Apr 17 23:20:36 2013
@@ -37,6 +37,7 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.protocol.ProtocolStatusUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -67,7 +68,7 @@ public class IndexingFiltersChecker exte
return -1;
}
- url = args[0];
+ url = URLUtil.toASCII(args[0]);
if (LOG.isInfoEnabled()) {
LOG.info("fetching: " + url);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Wed
Apr 17 23:20:36 2013
@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
@@ -36,10 +37,32 @@ import org.apache.nutch.protocol.Protoco
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
/**
* Parser checker, useful for testing parser.
- *
+ * It also accurately reports possible fetching and
+ * parsing failures and presents protocol status signals to aid
+ * debugging. The tool enables us to retrieve the following data from
+ * any url:
+ * <ol>
+ * <li><tt>contentType</tt>: The URL {@link org.apache.nutch.protocol.Content}
type.</li>
+ * <li><tt>signature</tt>: Digest is used to identify pages (like unique ID)
and is used to remove
+ * duplicates during the dedup procedure.
+ * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or
+ * {@link org.apache.nutch.crawl.TextProfileSignature}.</li>
+ * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li>
+ * <li><tt>Title</tt>: of the URL</li>
+ * <li><tt>Outlinks</tt>: associated with the URL</li>
+ * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>,
+ * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>,
<i>Cache-Control</i>, etc.</li>
+ * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>,
+ * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li>
+ * <li><tt>ParseText</tt>: The page parse text which varies in length
depending on
+ * <code>content.length</code> configuration.</li>
+ * </ol>
* @author John Xing
*/
@@ -60,7 +83,7 @@ public class ParserChecker implements To
String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";
if (args.length == 0) {
- System.err.println(usage);
+ LOG.error(usage);
return (-1);
}
@@ -71,10 +94,10 @@ public class ParserChecker implements To
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (i != args.length - 1) {
- System.err.println(usage);
+ LOG.error(usage);
System.exit(-1);
} else {
- url = args[i];
+ url = URLUtil.toASCII(args[i]);
}
}
@@ -110,15 +133,10 @@ public class ParserChecker implements To
}
if (contentType == null) {
- System.err.println("");
+ LOG.error("Failed to determine content type!");
return (-1);
}
- if (LOG.isInfoEnabled()) {
- LOG.info("parsing: " + url);
- LOG.info("contentType: " + contentType);
- }
-
page.setContentType(new Utf8(contentType));
if (ParserJob.isTruncated(url, page)) {
@@ -128,13 +146,23 @@ public class ParserChecker implements To
Parse parse = new ParseUtil(conf).parse(url, page);
if (parse == null) {
- System.err.println("Problem with parse - check log");
+ LOG.error("Problem with parse - check log");
return (-1);
}
+
+ // Calculate the signature
+ byte[] signature =
SignatureFactory.getSignature(getConf()).calculate(page);
+
+ if (LOG.isInfoEnabled()) {
+ LOG.info("parsing: " + url);
+ LOG.info("contentType: " + contentType);
+ LOG.info("signature: " + StringUtil.toHexString(signature));
+ }
+
- System.out.print("---------\nUrl\n---------------\n");
+ LOG.info("---------\nUrl\n---------------\n");
System.out.print(url + "\n");
- System.out.print("---------\nMetadata\n---------\n");
+ LOG.info("---------\nMetadata\n---------\n");
Map<Utf8, ByteBuffer> metadata = page.getMetadata();
StringBuffer sb = new StringBuffer();
if (metadata != null) {
@@ -148,7 +176,7 @@ public class ParserChecker implements To
System.out.print(sb.toString());
}
if (dumpText) {
- System.out.print("---------\nParseText\n---------\n");
+ LOG.info("---------\nParseText\n---------\n");
System.out.print(parse.getText());
}
@@ -170,4 +198,5 @@ public class ParserChecker implements To
args);
System.exit(res);
}
+
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1469099&r1=1469098&r2=1469099&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Wed Apr 17
23:20:36 2013
@@ -18,7 +18,7 @@
package org.apache.nutch.util;
import java.net.MalformedURLException;
-import java.net.URL;
+import java.net.*;
import java.util.regex.Pattern;
import org.apache.nutch.util.domain.DomainSuffix;
@@ -333,6 +333,43 @@ public class URLUtil {
}
}
+ public static String toASCII(String url) {
+ try {
+ URL u = new URL(url);
+ URI p = new URI(u.getProtocol(),
+ null,
+ IDN.toASCII(u.getHost()),
+ u.getPort(),
+ u.getPath(),
+ u.getQuery(),
+ u.getRef());
+
+ return p.toString();
+ }
+ catch (Exception e) {
+ return null;
+ }
+ }
+
+ public static String toUNICODE(String url) {
+ try {
+ URL u = new URL(url);
+ URI p = new URI(u.getProtocol(),
+ null,
+ IDN.toUnicode(u.getHost()),
+ u.getPort(),
+ u.getPath(),
+ u.getQuery(),
+ u.getRef());
+
+ return p.toString();
+ }
+ catch (Exception e) {
+ return null;
+ }
+ }
+
+
/** For testing */
public static void main(String[] args){