Author: markus
Date: Thu Jul 4 11:13:34 2013
New Revision: 1499722

URL: http://svn.apache.org/r1499722
Log:
NUTCH-1596 HeadingsParseFilter not thread safe

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499722&r1=1499721&r2=1499722&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 4 11:13:34 2013
@@ -2,7 +2,9 @@ Nutch Change Log

 Nutch Development Trunk

-* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus)
+* NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus)
+
+* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus)

 * NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus)


Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499722&r1=1499721&r2=1499722&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jul 4 11:13:34 2013
@@ -40,17 +40,14 @@ public class HeadingsParseFilter impleme
   protected static Pattern whitespacePattern = Pattern.compile("\\s+");

   private Configuration conf;
-  private DocumentFragment doc;
   private String[] headings;
   private boolean multiValued = false;

   public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
-    this.doc = doc;
-
     Parse parse = parseResult.get(content.getUrl());

     for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
-      List<String> discoveredHeadings = getElement(headings[i]);
+      List<String> discoveredHeadings = getElement(doc, headings[i]);

       if (discoveredHeadings.size() > 0) {
         for (String heading : discoveredHeadings) {
@@ -82,7 +79,7 @@ public class HeadingsParseFilter impleme
   /**
    * Finds the specified element and returns its value
    */
-  protected List<String> getElement(String element) {
+  protected List<String> getElement(DocumentFragment doc, String element) {
     List<String> headings = new ArrayList<String>();
     NodeWalker walker = new NodeWalker(doc);