Author: markus
Date: Tue Jul 19 13:12:43 2011
New Revision: 1148308
URL: http://svn.apache.org/viewvc?rev=1148308&view=rev
Log:
NUTCH-1037 Option to deduplicate anchors prior to indexing
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1148308&r1=1148307&r2=1148308&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 19 13:12:43 2011
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.0 - Current Development
+* NUTCH-1037 Option to deduplicate anchors prior to indexing (markus)
+
* NUTCH-1055 upgrade package.html file in language identifier plugin (lewismc)
* NUTCH-1043 Add pattern for filtering .js in default url filters (jnioche)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1148308&r1=1148307&r2=1148308&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Jul 19 13:12:43 2011
@@ -716,7 +716,6 @@
</description>
</property>
-
<!-- moreindexingfilter plugin properties -->
<property>
@@ -728,6 +727,16 @@
</description>
</property>
+<!-- AnchorIndexing filter plugin properties -->
+
+<property>
+ <name>anchorIndexingFilter.deduplicate</name>
+ <value>false</value>
+ <description>With this enabled the indexer will case-insensitively deduplicate anchors
+ before indexing. This prevents possibly hundreds or thousands of identical anchors for
+ a given page from being indexed, but will affect the search scoring (i.e. tf=1.0f).
+ </description>
+</property>
<!-- URL normalizer properties -->
Modified: nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1148308&r1=1148307&r2=1148308&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Tue Jul 19 13:12:43 2011
@@ -19,6 +19,7 @@ package org.apache.nutch.indexer.anchor;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map.Entry;
+import java.util.WeakHashMap;
import org.apache.avro.util.Utf8;
import org.slf4j.Logger;
@@ -37,6 +38,7 @@ public class AnchorIndexingFilter implem
public static final Logger LOG =
LoggerFactory.getLogger(AnchorIndexingFilter.class);
private Configuration conf;
+ private boolean deduplicate = false;
private static final Collection<WebPage.Field> FIELDS = new
HashSet<WebPage.Field>();
@@ -46,6 +48,9 @@ public class AnchorIndexingFilter implem
public void setConf(Configuration conf) {
this.conf = conf;
+
+ deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
+ LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
}
public Configuration getConf() {
@@ -59,8 +64,25 @@ public class AnchorIndexingFilter implem
public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
+ // https://issues.apache.org/jira/browse/NUTCH-1037
+ WeakHashMap<String,Integer> map = new WeakHashMap<String,Integer>();
+
for (Entry<Utf8, Utf8> e : page.getInlinks().entrySet()) {
- doc.add("anchor", TableUtil.toString(e.getValue()));
+ String anchor = TableUtil.toString(e.getValue());
+
+ if (deduplicate) {
+ String lcAnchor = anchor.toLowerCase();
+
+ // Check if already processed the current anchor
+ if (!map.containsKey(lcAnchor)) {
+ doc.add("anchor", anchor);
+
+ // Add to map
+ map.put(lcAnchor, 1);
+ }
+ } else {
+ doc.add("anchor", anchor);
+ }
}
return doc;