ParseOutputFormat.java

ab Wed, 19 Jul 2006 10:35:38 -0700

Author: ab
Date: Wed Jul 19 10:35:08 2006
New Revision: 423539

URL: http://svn.apache.org/viewvc?rev=423539&view=rev
Log:
Add ability to limit outlinks to only include initial hosts (NUTCH-173).


Modified:
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=423539&r1=423538&r2=423539&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Jul 19 10:35:08 2006
@@ -221,8 +221,17 @@
   <value>true</value>
   <description>If true, when adding new links to a page, links from
   the same host are ignored.  This is an effective way to limit the
-  size of the link database, keeping the only the highest quality
+  size of the link database, keeping only the highest quality
   links.
+  </description>
+</property>
+
+<property>
+  <name>db.ignore.external.links</name>
+  <value>false</value>
+  <description>If true, outlinks leading from a page to external hosts
+  will be ignored. This is an effective way to limit the crawl to include
+  only initially injected hosts, without creating complex URLFilters.
   </description>
 </property>
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=423539&r1=423538&r2=423539&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Wed Jul 19 10:35:08 2006
@@ -31,6 +31,9 @@
 import org.apache.nutch.net.*;
 
 import java.io.*;
+import java.net.MalformedURLException;
+import java.net.URL;
+
 import org.apache.hadoop.util.Progressable;
 
 /* Parse content in a segment. */
@@ -53,6 +56,7 @@
     this.filters = new URLFilters(job);
     this.scfilters = new ScoringFilters(job);
     final float interval = job.getFloat("db.default.fetch.interval", 30f);
+    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
     
     Path text =
       new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name);
@@ -77,7 +81,9 @@
           throws IOException {
           
           Parse parse = (Parse)value;
-          
+          String fromUrl = key.toString();
+          String fromHost = null; 
+          String toHost = null;          
           textOut.append(key, new ParseText(parse.getText()));
           
           ParseData parseData = parse.getData();
@@ -95,6 +101,15 @@
 
           // collect outlinks for subsequent db update
           Outlink[] links = parseData.getOutlinks();
+          if (ignoreExternalLinks) {
+            try {
+              fromHost = new URL(fromUrl).getHost().toLowerCase();
+            } catch (MalformedURLException e) {
+              fromHost = null;
+            }
+          } else {
+            fromHost = null;
+          }
 
           String[] toUrls = new String[links.length];
           int validCount = 0;
@@ -113,6 +128,16 @@
           // compute score contributions and adjustment to the original score
           for (int i = 0; i < toUrls.length; i++) {
             if (toUrls[i] == null) continue;
+            if (ignoreExternalLinks) {
+              try {
+                toHost = new URL(toUrls[i]).getHost().toLowerCase();
+              } catch (MalformedURLException e) {
+                toHost = null;
+              }
+              if (toHost == null || !toHost.equals(fromHost)) { // external links
+                continue; // skip it
+              }
+            }
             CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
             UTF8 targetUrl = new UTF8(toUrls[i]);
             adjust = null;



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys -- and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] svn commit: r423539 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/parse/ParseOutputFormat.java

Reply via email to