Author: markus
Date: Tue Jul 19 13:12:43 2011
New Revision: 1148308

URL: http://svn.apache.org/viewvc?rev=1148308&view=rev
Log:
NUTCH-1037 Option to deduplicate anchors prior to indexing

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    
nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1148308&r1=1148307&r2=1148308&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 19 13:12:43 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.0 - Current Development
 
+* NUTCH-1037 Option to deduplicate anchors prior to indexing (markus)
+
 * NUTCH-1055 upgrade package.html file in language identifier plugin (lewismc)
 
 * NUTCH-1043 Add pattern for filtering .js in default url filters (jnioche)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1148308&r1=1148307&r2=1148308&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Jul 19 13:12:43 2011
@@ -716,7 +716,6 @@
   </description>
 </property>
 
-
 <!-- moreindexingfilter plugin properties -->
 
 <property>
@@ -728,6 +727,16 @@
   </description>
 </property>
 
+<!-- AnchorIndexing filter plugin properties -->
+
+<property>
+  <name>anchorIndexingFilter.deduplicate</name>
+  <value>false</value>
+  <description>With this enabled the indexer will case-insensitive deduplicate 
anchors
+  before indexing. This prevents possible hundreds or thousands of identical 
anchors for
+  a given page to be indexed but will affect the search scoring (i.e. tf=1.0f).
+  </description>
+</property>
 
 <!-- URL normalizer properties -->
 

Modified: 
nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1148308&r1=1148307&r2=1148308&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
 Tue Jul 19 13:12:43 2011
@@ -19,6 +19,7 @@ package org.apache.nutch.indexer.anchor;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Map.Entry;
+import java.util.WeakHashMap;
 
 import org.apache.avro.util.Utf8;
 import org.slf4j.Logger;
@@ -37,6 +38,7 @@ public class AnchorIndexingFilter implem
 
   public static final Logger LOG = 
LoggerFactory.getLogger(AnchorIndexingFilter.class);
   private Configuration conf;
+  private boolean deduplicate = false;
 
   private static final Collection<WebPage.Field> FIELDS = new 
HashSet<WebPage.Field>();
 
@@ -46,6 +48,9 @@ public class AnchorIndexingFilter implem
 
   public void setConf(Configuration conf) {
     this.conf = conf;
+
+    deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
+    LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
   }
 
   public Configuration getConf() {
@@ -59,8 +64,25 @@ public class AnchorIndexingFilter implem
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {
 
+    // https://issues.apache.org/jira/browse/NUTCH-1037
+    WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
+
     for (Entry<Utf8, Utf8> e : page.getInlinks().entrySet()) {
-      doc.add("anchor", TableUtil.toString(e.getValue()));
+      String anchor = TableUtil.toString(e.getValue());
+
+      if (deduplicate) {
+        String lcAnchor = anchor.toLowerCase();
+
+        // Check if already processed the current anchor
+        if (!map.containsKey(lcAnchor)) {
+          doc.add("anchor", anchor);
+
+          // Add to map
+          map.put(lcAnchor, 1);
+        }
+      } else {
+        doc.add("anchor", anchor);
+      }
     }
 
     return doc;


Reply via email to