Author: markus
Date: Thu Jul  4 09:07:12 2013
New Revision: 1499696

URL: http://svn.apache.org/r1499696
Log:
NUTCH-1597  HeadingsParseFilter to trim and remove excess whitespace

Modified:
    nutch/trunk/CHANGES.txt
    
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499696&r1=1499695&r2=1499696&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul  4 09:07:12 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1597  HeadingsParseFilter to trim and remove excess whitespace (markus)
+
 * NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus)
 
 * NUTCH-1600 Injector overwrite does not always work properly (markus)

Modified: 
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499696&r1=1499695&r2=1499696&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
 Thu Jul  4 09:07:12 2013
@@ -19,6 +19,7 @@ package org.apache.nutch.parse.headings;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.*;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
@@ -33,6 +34,11 @@ import org.w3c.dom.*;
  */
 public class HeadingsParseFilter implements HtmlParseFilter {
 
+  /**
+   * Pattern used to strip surplus whitespace
+   */
+  protected static Pattern whitespacePattern = Pattern.compile("\\s+");
+    
   private Configuration conf;
   private DocumentFragment doc;
   private String[] headings;
@@ -113,6 +119,8 @@ public class HeadingsParseFilter impleme
       }
     }
 
-    return buffer.toString();
+    // Return with stripped surplus whitespace
+    Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
+    return matcher.replaceAll(" ").trim();
   }
 }


Reply via email to