Author: markus
Date: Tue Jul 19 13:01:45 2011
New Revision: 1148305

URL: http://svn.apache.org/viewvc?rev=1148305&view=rev
Log:
NUTCH-1037 Option to deduplicate anchors prior to indexing

Modified:
    nutch/branches/branch-1.4/CHANGES.txt
    nutch/branches/branch-1.4/conf/nutch-default.xml
    
nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1148305&r1=1148304&r2=1148305&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Tue Jul 19 13:01:45 2011
@@ -2,6 +2,10 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-1037 Option to deduplicate anchors prior to indexing (markus)
+
+* NUTCH-1050 Add segmentDir option to WebGraph (markus)
+
 * NUTCH-1055 upgrade package.html file in language identifier plugin (lewismc)
 
 * NUTCH-1059 Remove convdb command from /bin/nutch (lewismc)

Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1148305&r1=1148304&r2=1148305&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Tue Jul 19 13:01:45 2011
@@ -697,6 +697,17 @@
   </description>
 </property>
 
+<!-- AnchorIndexing filter plugin properties -->
+
+<property>
+  <name>anchorIndexingFilter.deduplicate</name>
+  <value>false</value>
+  <description>With this enabled the indexer will case-insensitive deduplicate anchors
+  before indexing. This prevents possible hundreds or thousands of identical anchors for
+  a given page to be indexed but will affect the search scoring (i.e. tf=1.0f).
+  </description>
+</property>
+
 <!-- indexingfilter plugin properties -->
 
 <property>

Modified: nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1148305&r1=1148304&r2=1148305&view=diff
==============================================================================
--- nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Tue Jul 19 13:01:45 2011
@@ -16,6 +16,8 @@
  */
 package org.apache.nutch.indexer.anchor;
 
+import java.util.WeakHashMap;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -35,9 +37,13 @@ public class AnchorIndexingFilter
 
   public static final Log LOG = LogFactory.getLog(AnchorIndexingFilter.class);
   private Configuration conf;
+  private boolean deduplicate = false;
 
   public void setConf(Configuration conf) {
     this.conf = conf;
+
+    deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
+    LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
   }
 
   public Configuration getConf() {
@@ -49,8 +55,24 @@ public class AnchorIndexingFilter
 
     String[] anchors = (inlinks != null ? inlinks.getAnchors()
       : new String[0]);
+
+    // https://issues.apache.org/jira/browse/NUTCH-1037
+    WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
+
     for (int i = 0; i < anchors.length; i++) {
-      doc.add("anchor", anchors[i]);
+      if (deduplicate) {
+        String lcAnchor = anchors[i].toLowerCase();
+
+        // Check if already processed the current anchor
+        if (!map.containsKey(lcAnchor)) {
+          doc.add("anchor", anchors[i]);
+
+          // Add to map
+          map.put(lcAnchor, 1);
+        }
+      } else {
+        doc.add("anchor", anchors[i]);
+      }
     }
 
     return doc;


Reply via email to