ParseOutputFormat.java

jnioche Fri, 20 Nov 2015 08:22:49 -0800

Author: jnioche
Date: Fri Nov 20 16:21:46 2015
New Revision: 1715386

URL: http://svn.apache.org/viewvc?rev=1715386&view=rev
Log:
NUTCH-2069


Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Nov 20 16:21:46 2015
@@ -3,6 +3,8 @@ Nutch Change Log
 Nutch 1.11 Release 25/10/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 
+* NUTCH-2069 Ignore external links based on domain (jnioche)
+
 * NUTCH-2173 String.join in FileDumper breaks the build (joyce)
 
 * NUTCH-2166 Add reverse URL format to dump tool (joyce)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Nov 20 16:21:46 2015
@@ -548,12 +548,19 @@
 <property>
   <name>db.ignore.external.links</name>
   <value>false</value>
-  <description>If true, outlinks leading from a page to external hosts
+  <description>If true, outlinks leading from a page to external hosts or domain
   will be ignored. This is an effective way to limit the crawl to include
   only initially injected hosts, without creating complex URLFilters.
+  See 'db.ignore.external.links.mode'.
   </description>
 </property>
 
+<property>
+  <name>db.ignore.external.links.mode</name>
+  <value>byHost</value>
+  <description>Alternative value is byDomain</description>
+</property>
+
  <property>
   <name>db.injector.overwrite</name>
   <value>false</value>

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Fri Nov 20 16:21:46 2015
@@ -85,6 +85,7 @@ public class FetcherThread extends Threa
   private boolean redirecting;
   private int redirectCount;
   private boolean ignoreExternalLinks;
+  private String ignoreExternalLinksMode;
 
   // Used by fetcher.follow.outlinks.depth in parse
   private int maxOutlinksPerPage;
@@ -168,14 +169,13 @@ public class FetcherThread extends Threa
     }
     LOG.info("Using queue mode : " + queueMode);
     this.maxRedirect = conf.getInt("http.redirect.max", 3);
-    this.ignoreExternalLinks = conf.getBoolean("db.ignore.external.links",
-        false);
 
     maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
     maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
         : maxOutlinksPerPage;
     interval = conf.getInt("db.fetch.interval.default", 2592000);
     ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
+    ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
     maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
     outlinksIgnoreExternal = conf.getBoolean(
         "fetcher.follow.outlinks.ignore.external", false);
@@ -616,19 +616,21 @@ public class FetcherThread extends Threa
             }
           }
 
-          String fromHost;
+          String origin = null;
 
           // collect outlinks for subsequent db update
           Outlink[] links = parseData.getOutlinks();
           int outlinksToStore = Math.min(maxOutlinks, links.length);
           if (ignoreExternalLinks) {
-            try {
-              fromHost = new URL(url.toString()).getHost().toLowerCase();
-            } catch (MalformedURLException e) {
-              fromHost = null;
+            URL originURL = new URL(url.toString());
+            // based on domain?
+            if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+              origin = URLUtil.getDomainName(originURL).toLowerCase();
+            } 
+            // use host 
+            else {
+              origin = originURL.getHost().toLowerCase();
             }
-          } else {
-            fromHost = null;
           }
           
           //used by fetchNode         
@@ -646,7 +648,7 @@ public class FetcherThread extends Threa
             String toUrl = links[i].getToUrl();
 
             toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
-                fromHost, ignoreExternalLinks, urlFilters, normalizers);
+                origin, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, normalizers);
             if (toUrl == null) {
               continue;
             }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1715386&r1=1715385&r2=1715386&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Fri Nov 20 16:21:46 2015
@@ -104,6 +104,9 @@ public class ParseOutputFormat implement
     final int interval = job.getInt("db.fetch.interval.default", 2592000);
     final boolean ignoreExternalLinks = job.getBoolean(
         "db.ignore.external.links", false);
+    final String ignoreExternalLinksMode = job.get(
+        "db.ignore.external.links.mode", "byHost");
+    
     int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
     final boolean isParsing = job.getBoolean("fetcher.parse", true);
     final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
@@ -152,7 +155,8 @@ public class ParseOutputFormat implement
       public void write(Text key, Parse parse) throws IOException {
 
         String fromUrl = key.toString();
-        String fromHost = null;
+        // host or domain name of the source URL
+        String origin = null;
         textOut.append(key, new ParseText(parse.getText()));
 
         ParseData parseData = parse.getData();
@@ -184,15 +188,17 @@ public class ParseOutputFormat implement
         if (parseMDCrawlDatum != null)
           crawlOut.append(key, parseMDCrawlDatum);
 
+        // need to determine origin (once for all outlinks)
         if (ignoreExternalLinks) {
-          // need to determine fromHost (once for all outlinks)
-          try {
-            fromHost = new URL(fromUrl).getHost().toLowerCase();
-          } catch (MalformedURLException e) {
-            fromHost = null;
+          URL originURL = new URL(fromUrl.toString());
+          // based on domain?
+          if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+            origin = URLUtil.getDomainName(originURL).toLowerCase();
+          } 
+          // use host 
+          else {
+            origin = originURL.getHost().toLowerCase();
           }
-        } else {
-          fromHost = null;
         }
 
         ParseStatus pstatus = parseData.getStatus();
@@ -200,8 +206,8 @@ public class ParseOutputFormat implement
             && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
           String newUrl = pstatus.getMessage();
           int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
-          newUrl = filterNormalize(fromUrl, newUrl, fromHost,
-              ignoreExternalLinks, filters, normalizers,
+          newUrl = filterNormalize(fromUrl, newUrl, origin,
+              ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers,
               URLNormalizers.SCOPE_FETCHER);
 
           if (newUrl != null) {
@@ -231,8 +237,8 @@ public class ParseOutputFormat implement
 
           // Only normalize and filter if fetcher.parse = false
           if (!isParsing) {
-            toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
-                ignoreExternalLinks, filters, normalizers);
+            toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
+                ignoreExternalLinks, ignoreExternalLinksMode, filters, normalizers);
             if (toUrl == null) {
               continue;
             }
@@ -310,29 +316,39 @@ public class ParseOutputFormat implement
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
-      String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+      String fromHost, boolean ignoreExternalLinks,
+      String ignoreExternalLinksMode, URLFilters filters,
       URLNormalizers normalizers) {
     return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
-        filters, normalizers, URLNormalizers.SCOPE_OUTLINK);
+        ignoreExternalLinksMode, filters, normalizers,
+        URLNormalizers.SCOPE_OUTLINK);
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
-      String fromHost, boolean ignoreExternalLinks, URLFilters filters,
+      String origin, boolean ignoreExternalLinks, String ignoreExternalLinksMode, URLFilters filters,
       URLNormalizers normalizers, String urlNormalizerScope) {
     // ignore links to self (or anchors within the page)
     if (fromUrl.equals(toUrl)) {
       return null;
     }
     if (ignoreExternalLinks) {
-      String toHost;
+      URL targetURL = null;
       try {
-        toHost = new URL(toUrl).getHost().toLowerCase();
-      } catch (MalformedURLException e) {
-        toHost = null;
-      }
-      if (toHost == null || !toHost.equals(fromHost)) { // external links
+        targetURL = new URL(toUrl);
+      } catch (MalformedURLException e1) {
         return null; // skip it
       }
+      if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+        String toDomain = URLUtil.getDomainName(targetURL).toLowerCase();
+        if (toDomain == null || !toDomain.equals(origin)) {
+          return null; // skip it
+        }
+      } else {
+        String toHost = targetURL.getHost().toLowerCase();
+        if (toHost == null || !toHost.equals(origin)) {
+          return null; // skip it
+        }
+      }
     }
     try {
       if (normalizers != null) {

svn commit: r1715386 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/FetcherThread.java src/java/org/apache/nutch/parse/ParseOutputFormat.java

Reply via email to