Author: ferdy
Date: Mon Sep 17 09:24:33 2012
New Revision: 1386526

URL: http://svn.apache.org/viewvc?rev=1386526&view=rev
Log:
NUTCH-1468 Redirects that are external links not adhering to 
db.ignore.external.links

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1386526&r1=1386525&r2=1386526&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Mon Sep 17 09:24:33 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.1 - Current Development
 
+* NUTCH-1468 Redirects that are external links not adhering to db.ignore.external.links (Matt MacDonald via ferdy)
+
 * NUTCH-1470 Ensure test files are included for runtime testing (lewismc)
 
 * NUTCH-1162 Write JUnit tests for parse-js (lewismc)

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1386526&r1=1386525&r2=1386526&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Mon Sep 17 09:24:33 2012
@@ -18,6 +18,7 @@ package org.apache.nutch.fetcher;
 
 import java.io.IOException;
 import java.net.InetAddress;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
@@ -434,6 +435,7 @@ extends GoraReducer<IntWritable, FetchEn
     private String reprUrl;
 
     private final Context context;
+    private final boolean ignoreExternalLinks;
 
     public FetcherThread(Context context, int num) {
       this.setDaemon(true);                       // don't hang JVM on exit
@@ -446,6 +448,7 @@ extends GoraReducer<IntWritable, FetchEn
       this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
       // backward-compatible default setting
       this.byIP = conf.getBoolean("fetcher.threads.per.host.by.ip", true);
+      this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", 
false);
     }
 
     @Override
@@ -598,6 +601,16 @@ extends GoraReducer<IntWritable, FetchEn
       if (newUrl == null || newUrl.equals(url)) {
         return;
       }
+
+      if (ignoreExternalLinks) {
+        String toHost   = new URL(newUrl).getHost().toLowerCase();
+        String fromHost = new URL(url).getHost().toLowerCase();
+        if (toHost == null || !toHost.equals(fromHost)) {
+          // external links
+          return;
+        }
+      }
+
       page.putToOutlinks(new Utf8(newUrl), new Utf8());
       page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
       reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);


Reply via email to