Author: kubes
Date: Thu Oct 18 09:53:48 2007
New Revision: 586032

URL: http://svn.apache.org/viewvc?rev=586032&view=rev
Log:
NUTCH-488 - Avoid parsing unnecessary links and get a more relevant outlink 
list.  Thanks to Marcin Okraszewski and Emmanuel Joke.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=586032&r1=586031&r2=586032&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Oct 18 09:53:48 2007
@@ -147,6 +147,9 @@
     
 50. NUTCH-562 - Port mime type framework to use Tika mime detection framework.
     (mattmann)
+    
+51. NUTCH-488 - Avoid parsing uneccessary links and get a more relevant outlink 
+    list. (Emmanuel Joke, Marcin Okraszewski via kubes)
 
 Release 0.9 - 2007-04-02
 

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=586032&r1=586031&r2=586032&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Oct 18 09:53:48 2007
@@ -887,6 +887,16 @@
   be ignored.</description>
 </property>
 
+<property>
+  <name>parser.html.outlinks.ignore_tags</name>
+  <value></value>
+  <description>Comma separated list of HTML tags, from which outlinks 
+  shouldn't be extracted. Nutch takes links from: a, area, form, frame, 
+  iframe, script, link, img. If you add any of those tags here, it
+  won't be taken. Default is empty list. Probably reasonable value
+  for most people would be "img,script,link".</description>
+</property>
+
 
 <!-- urlfilter plugin properties -->
 

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=586032&r1=586031&r2=586032&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Thu Oct 18 09:53:48 2007
@@ -19,6 +19,7 @@
 
 import java.net.URL;
 import java.net.MalformedURLException;
+import java.util.Collection;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Stack;
@@ -62,18 +63,30 @@
   }
   
   public void setConf(Configuration conf) {
+    // forceTags is used to override configurable tag ignoring, later on
+    Collection<String> forceTags = new ArrayList<String>(1);
+
     this.conf = conf;
     linkParams.clear();
     linkParams.put("a", new LinkParams("a", "href", 1));
     linkParams.put("area", new LinkParams("area", "href", 0));
-    if (conf.getBoolean("parser.html.form.use_action", false)) {
+    if (conf.getBoolean("parser.html.form.use_action", true)) {
       linkParams.put("form", new LinkParams("form", "action", 1));
+      if (conf.get("parser.html.form.use_action") != null)
+        forceTags.add("form");
     }
     linkParams.put("frame", new LinkParams("frame", "src", 0));
     linkParams.put("iframe", new LinkParams("iframe", "src", 0));
     linkParams.put("script", new LinkParams("script", "src", 0));
     linkParams.put("link", new LinkParams("link", "href", 0));
     linkParams.put("img", new LinkParams("img", "src", 0));
+
+    // remove unwanted link tags from the linkParams map
+    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
+    for ( int i = 0 ; ignoreTags != null && i < ignoreTags.length ; i++ ) {
+      if ( ! forceTags.contains(ignoreTags[i]) )
+        linkParams.remove(ignoreTags[i]);
+    }
   }
   
   /**


Reply via email to