Author: kubes Date: Thu Oct 18 09:53:48 2007 New Revision: 586032 URL: http://svn.apache.org/viewvc?rev=586032&view=rev Log: NUTCH-488 - Avoid parsing unnecessary links and get a more relevant outlink list. Thanks to Marcin Okraszewski and Emmanuel Joke.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=586032&r1=586031&r2=586032&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Oct 18 09:53:48 2007 @@ -147,6 +147,9 @@ 50. NUTCH-562 - Port mime type framework to use Tika mime detection framework. (mattmann) + +51. NUTCH-488 - Avoid parsing unnecessary links and get a more relevant outlink + list. (Emmanuel Joke, Marcin Okraszewski via kubes) Release 0.9 - 2007-04-02 Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=586032&r1=586031&r2=586032&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Oct 18 09:53:48 2007 @@ -887,6 +887,16 @@ be ignored.</description> </property> +<property> + <name>parser.html.outlinks.ignore_tags</name> + <value></value> + <description>Comma separated list of HTML tags, from which outlinks + shouldn't be extracted. Nutch takes links from: a, area, form, frame, + iframe, script, link, img. If you add any of those tags here, it + won't be taken. Default is empty list. 
Probably reasonable value + for most people would be "img,script,link".</description> +</property> + <!-- urlfilter plugin properties --> Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=586032&r1=586031&r2=586032&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Thu Oct 18 09:53:48 2007 @@ -19,6 +19,7 @@ import java.net.URL; import java.net.MalformedURLException; +import java.util.Collection; import java.util.ArrayList; import java.util.HashMap; import java.util.Stack; @@ -62,18 +63,30 @@ } public void setConf(Configuration conf) { + // forceTags is used to override configurable tag ignoring, later on + Collection<String> forceTags = new ArrayList<String>(1); + this.conf = conf; linkParams.clear(); linkParams.put("a", new LinkParams("a", "href", 1)); linkParams.put("area", new LinkParams("area", "href", 0)); - if (conf.getBoolean("parser.html.form.use_action", false)) { + if (conf.getBoolean("parser.html.form.use_action", true)) { linkParams.put("form", new LinkParams("form", "action", 1)); + if (conf.get("parser.html.form.use_action") != null) + forceTags.add("form"); } linkParams.put("frame", new LinkParams("frame", "src", 0)); linkParams.put("iframe", new LinkParams("iframe", "src", 0)); linkParams.put("script", new LinkParams("script", "src", 0)); linkParams.put("link", new LinkParams("link", "href", 0)); linkParams.put("img", new LinkParams("img", "src", 0)); + + // remove unwanted link tags from the linkParams map + String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); + for ( int i = 0 ; 
ignoreTags != null && i < ignoreTags.length ; i++ ) { + if ( ! forceTags.contains(ignoreTags[i]) ) + linkParams.remove(ignoreTags[i]); + } } /**