Author: markus
Date: Tue Jul 19 13:01:45 2011
New Revision: 1148305
URL: http://svn.apache.org/viewvc?rev=1148305&view=rev
Log:
NUTCH-1037 Option to deduplicate anchors prior to indexing
Modified:
nutch/branches/branch-1.4/CHANGES.txt
nutch/branches/branch-1.4/conf/nutch-default.xml
nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
Modified: nutch/branches/branch-1.4/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1148305&r1=1148304&r2=1148305&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Tue Jul 19 13:01:45 2011
@@ -2,6 +2,10 @@ Nutch Change Log
Release 1.4 - Current development
+* NUTCH-1037 Option to deduplicate anchors prior to indexing (markus)
+
+* NUTCH-1050 Add segmentDir option to WebGraph (markus)
+
* NUTCH-1055 upgrade package.html file in language identifier plugin (lewismc)
* NUTCH-1059 Remove convdb command from /bin/nutch (lewismc)
Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1148305&r1=1148304&r2=1148305&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Tue Jul 19 13:01:45 2011
@@ -697,6 +697,17 @@
</description>
</property>
+<!-- AnchorIndexing filter plugin properties -->
+
+<property>
+ <name>anchorIndexingFilter.deduplicate</name>
+ <value>false</value>
+ <description>With this enabled the indexer will case-insensitively deduplicate anchors
+ before indexing. This prevents possibly hundreds or thousands of identical anchors for
+ a given page from being indexed, but it will affect the search scoring (i.e. tf=1.0f).
+ </description>
+</property>
+
<!-- indexingfilter plugin properties -->
<property>
Modified:
nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1148305&r1=1148304&r2=1148305&view=diff
==============================================================================
---
nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
(original)
+++
nutch/branches/branch-1.4/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
Tue Jul 19 13:01:45 2011
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.indexer.anchor;
+import java.util.WeakHashMap;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@@ -35,9 +37,13 @@ public class AnchorIndexingFilter
public static final Log LOG = LogFactory.getLog(AnchorIndexingFilter.class);
private Configuration conf;
+ private boolean deduplicate = false;
public void setConf(Configuration conf) {
this.conf = conf;
+
+ deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
+ LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
}
public Configuration getConf() {
@@ -49,8 +55,24 @@ public class AnchorIndexingFilter
String[] anchors = (inlinks != null ? inlinks.getAnchors()
: new String[0]);
+
+ // https://issues.apache.org/jira/browse/NUTCH-1037
+ WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
+
for (int i = 0; i < anchors.length; i++) {
- doc.add("anchor", anchors[i]);
+ if (deduplicate) {
+ String lcAnchor = anchors[i].toLowerCase();
+
+ // Check if already processed the current anchor
+ if (!map.containsKey(lcAnchor)) {
+ doc.add("anchor", anchors[i]);
+
+ // Add to map
+ map.put(lcAnchor, 1);
+ }
+ } else {
+ doc.add("anchor", anchors[i]);
+ }
}
return doc;