Author: markus
Date: Thu Jul 4 09:07:12 2013
New Revision: 1499696
URL: http://svn.apache.org/r1499696
Log:
NUTCH-1597 HeadingsParseFilter to trim and remove excess whitespace
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499696&r1=1499695&r2=1499696&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 4 09:07:12 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1597 HeadingsParseFilter to trim and remove excess whitespace (markus)
+
* NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus)
* NUTCH-1600 Injector overwrite does not always work properly (markus)
Modified:
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499696&r1=1499695&r2=1499696&view=diff
==============================================================================
---
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
(original)
+++
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Thu Jul 4 09:07:12 2013
@@ -19,6 +19,7 @@ package org.apache.nutch.parse.headings;
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
@@ -33,6 +34,11 @@ import org.w3c.dom.*;
*/
public class HeadingsParseFilter implements HtmlParseFilter {
+ /**
+ * Pattern used to strip surplus whitespace
+ */
+ protected static Pattern whitespacePattern = Pattern.compile("\\s+");
+
private Configuration conf;
private DocumentFragment doc;
private String[] headings;
@@ -113,6 +119,8 @@ public class HeadingsParseFilter impleme
}
}
- return buffer.toString();
+ // Return with stripped surplus whitespace
+ Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
+ return matcher.replaceAll(" ").trim();
}
}