Author: lewismc
Date: Wed May 6 23:32:39 2015
New Revision: 1678111
URL: http://svn.apache.org/r1678111
Log:
NUTCH-2004 ParseChecker does not handle redirects
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1678111&r1=1678110&r2=1678111&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 6 23:32:39 2015
@@ -2,7 +2,7 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
-* NUTCH-XX
+* NUTCH-2004 ParseChecker does not handle redirects (mjoyce via lewismc)
Nutch 1.10 Release - 29/04/2015 (dd/mm/yyyy)
Release Report: http://s.apache.org/nutch10
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1678111&r1=1678110&r2=1678111&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed May 6 23:32:39 2015
@@ -34,6 +34,7 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
@@ -135,9 +136,30 @@ public class ParserChecker implements To
Text turl = new Text(url);
ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
+ // If the configuration permits, handle redirects until we either run
+ // out of allowed redirects or we stop getting redirect statuses.
+ int maxRedirects = conf.getInt("http.redirect.max", 0);
+ int numRedirects = 0;
+ while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
+ String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
+ LOG.info("Handling redirect to " + newURL);
+
+ protocol = factory.getProtocol(newURL);
+ turl = new Text(newURL);
+ output = protocol.getProtocolOutput(turl, cd);
+
+ numRedirects++;
+ }
+
if (!output.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ output.getStatus());
+
+ if (output.getStatus().isRedirect()) {
+ System.err.println("Redirect(s) not handled due to configuration.");
+ System.err.println("Max Redirects to handle per config: " + maxRedirects);
+ System.err.println("Number of Redirects handled: " + numRedirects);
+ }
return (-1);
}
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=1678111&r1=1678110&r2=1678111&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Wed May 6 23:32:39 2015
@@ -227,6 +227,10 @@ public class ProtocolStatus implements W
|| code == ROBOTS_DENIED;
}
+ public boolean isRedirect() {
+ return code == MOVED || code == TEMP_MOVED;
+ }
+
public String getMessage() {
if (args != null && args.length > 0)
return args[0];