Author: markus
Date: Thu Jul 4 11:13:34 2013
New Revision: 1499722
URL: http://svn.apache.org/r1499722
Log:
NUTCH-1596 HeadingsParseFilter not thread safe
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499722&r1=1499721&r2=1499722&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 4 11:13:34 2013
@@ -2,7 +2,9 @@ Nutch Change Log
Nutch Development Trunk
-* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus)
+* NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus)
+
+* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus)
* NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus)
Modified:
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499722&r1=1499721&r2=1499722&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jul 4 11:13:34 2013
@@ -40,17 +40,14 @@ public class HeadingsParseFilter impleme
protected static Pattern whitespacePattern = Pattern.compile("\\s+");
private Configuration conf;
- private DocumentFragment doc;
private String[] headings;
private boolean multiValued = false;
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
- this.doc = doc;
-
Parse parse = parseResult.get(content.getUrl());
for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
- List<String> discoveredHeadings = getElement(headings[i]);
+ List<String> discoveredHeadings = getElement(doc, headings[i]);
if (discoveredHeadings.size() > 0) {
for (String heading : discoveredHeadings) {
@@ -82,7 +79,7 @@ public class HeadingsParseFilter impleme
/**
* Finds the specified element and returns its value
*/
- protected List<String> getElement(String element) {
+ protected List<String> getElement(DocumentFragment doc, String element) {
List<String> headings = new ArrayList<String>();
NodeWalker walker = new NodeWalker(doc);