Author: lewismc
Date: Wed Apr  9 23:39:42 2014
New Revision: 1586175

URL: http://svn.apache.org/r1586175
Log:
NUTCH-1751 Empty anchors should not index

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1586175&r1=1586174&r2=1586175&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Apr  9 23:39:42 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1751 Empty anchors should not index (Sertac TURKEL via lewismc)
+
 * NUTCH-1733 parse-html to support HTML5 charset definitions (snagel)
 
 * NUTCH-1727 Configurable length for Tlds (Sertac TURKEL via lewismc)

Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1586175&r1=1586174&r2=1586175&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Wed Apr  9 23:39:42 2014
@@ -84,7 +84,10 @@ public class AnchorIndexingFilter implem
     
     for (Entry<Utf8, Utf8> e : page.getInlinks().entrySet()) {
       String anchor = TableUtil.toString(e.getValue());
-
+      
+      if(anchor.equals(""))
+        continue;
+      
       if (deduplicate) {
         if (set == null) set = new HashSet<String>();
         String lcAnchor = anchor.toLowerCase();


Reply via email to