Author: ab
Date: Wed Jul 19 10:35:08 2006
New Revision: 423539

URL: http://svn.apache.org/viewvc?rev=423539&view=rev
Log:
Add ability to limit outlinks to only include initial hosts (NUTCH-173).
Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=423539&r1=423538&r2=423539&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Jul 19 10:35:08 2006 @@ -221,8 +221,17 @@ <value>true</value> <description>If true, when adding new links to a page, links from the same host are ignored. This is an effective way to limit the - size of the link database, keeping the only the highest quality + size of the link database, keeping only the highest quality links. + </description> +</property> + +<property> + <name>db.ignore.external.links</name> + <value>false</value> + <description>If true, outlinks leading from a page to external hosts + will be ignored. This is an effective way to limit the crawl to include + only initially injected hosts, without creating complex URLFilters. </description> </property> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=423539&r1=423538&r2=423539&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Wed Jul 19 10:35:08 2006 @@ -31,6 +31,9 @@ import org.apache.nutch.net.*; import java.io.*; +import java.net.MalformedURLException; +import java.net.URL; + import org.apache.hadoop.util.Progressable; /* Parse content in a segment. 
*/ @@ -53,6 +56,7 @@ this.filters = new URLFilters(job); this.scfilters = new ScoringFilters(job); final float interval = job.getFloat("db.default.fetch.interval", 30f); + final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false); Path text = new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name); @@ -77,7 +81,9 @@ throws IOException { Parse parse = (Parse)value; - + String fromUrl = key.toString(); + String fromHost = null; + String toHost = null; textOut.append(key, new ParseText(parse.getText())); ParseData parseData = parse.getData(); @@ -95,6 +101,15 @@ // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); + if (ignoreExternalLinks) { + try { + fromHost = new URL(fromUrl).getHost().toLowerCase(); + } catch (MalformedURLException e) { + fromHost = null; + } + } else { + fromHost = null; + } String[] toUrls = new String[links.length]; int validCount = 0; @@ -113,6 +128,16 @@ // compute score contributions and adjustment to the original score for (int i = 0; i < toUrls.length; i++) { if (toUrls[i] == null) continue; + if (ignoreExternalLinks) { + try { + toHost = new URL(toUrls[i]).getHost().toLowerCase(); + } catch (MalformedURLException e) { + toHost = null; + } + if (toHost == null || !toHost.equals(fromHost)) { // external links + continue; // skip it + } + } CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); UTF8 targetUrl = new UTF8(toUrls[i]); adjust = null; ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys -- and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs