Author: ferdy
Date: Mon Sep 17 09:24:33 2012
New Revision: 1386526
URL: http://svn.apache.org/viewvc?rev=1386526&view=rev
Log:
NUTCH-1468 Redirects that are external links not adhering to
db.ignore.external.links
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1386526&r1=1386525&r2=1386526&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Sep 17 09:24:33 2012
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.1 - Current Development
+* NUTCH-1468 Redirects that are external links not adhering to db.ignore.external.links (Matt MacDonald via ferdy)
+
* NUTCH-1470 Ensure test files are included for runtime testing (lewismc)
* NUTCH-1162 Write JUnit tests for parse-js (lewismc)
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1386526&r1=1386525&r2=1386526&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Mon Sep 17 09:24:33 2012
@@ -18,6 +18,7 @@ package org.apache.nutch.fetcher;
import java.io.IOException;
import java.net.InetAddress;
+import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
@@ -434,6 +435,7 @@ extends GoraReducer<IntWritable, FetchEn
private String reprUrl;
private final Context context;
+ private final boolean ignoreExternalLinks;
public FetcherThread(Context context, int num) {
this.setDaemon(true); // don't hang JVM on exit
@@ -446,6 +448,7 @@ extends GoraReducer<IntWritable, FetchEn
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
// backward-compatible default setting
this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
+ this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
}
@Override
@@ -598,6 +601,16 @@ extends GoraReducer<IntWritable, FetchEn
if (newUrl == null || newUrl.equals(url)) {
return;
}
+
+ if (ignoreExternalLinks) {
+ String toHost = new URL(newUrl).getHost().toLowerCase();
+ String fromHost = new URL(url).getHost().toLowerCase();
+ if (toHost == null || !toHost.equals(fromHost)) {
+ // external links
+ return;
+ }
+ }
+
page.putToOutlinks(new Utf8(newUrl), new Utf8());
page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);