Author: lewismc
Date: Wed Apr 9 23:39:42 2014
New Revision: 1586175
URL: http://svn.apache.org/r1586175
Log:
NUTCH-1751 Empty anchors should not index
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1586175&r1=1586174&r2=1586175&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Apr 9 23:39:42 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1751 Empty anchors should not index (Sertac TURKEL via lewismc)
+
* NUTCH-1733 parse-html to support HTML5 charset definitions (snagel)
* NUTCH-1727 Configurable length for Tlds (Sertac TURKEL via lewismc)
Modified: nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1586175&r1=1586174&r2=1586175&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Wed Apr 9 23:39:42 2014
@@ -84,7 +84,10 @@ public class AnchorIndexingFilter implem
for (Entry<Utf8, Utf8> e : page.getInlinks().entrySet()) {
String anchor = TableUtil.toString(e.getValue());
-
+
+ if(anchor.equals(""))
+ continue;
+
if (deduplicate) {
if (set == null) set = new HashSet<String>();
String lcAnchor = anchor.toLowerCase();