enhancement of the boilerpipe patch
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/32dd379d Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/32dd379d Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/32dd379d Branch: refs/heads/2.x Commit: 32dd379d4dbf46c707b4554b0385bb3345f74797 Parents: be91764 Author: Jérémie Bourseau <[email protected]> Authored: Mon Feb 29 14:36:49 2016 +0100 Committer: Jérémie Bourseau <[email protected]> Committed: Mon Feb 29 14:36:49 2016 +0100 ---------------------------------------------------------------------- conf/nutch-default.xml | 5 +++++ .../apache/nutch/indexer/IndexingFiltersChecker.java | 5 ++--- .../parse/tika/BoilerpipeExtractorRepository.java | 12 ++++++------ .../java/org/apache/nutch/parse/tika/TikaParser.java | 15 ++++++--------- 4 files changed, 19 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 30c5831..117737b 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -881,10 +881,15 @@ <property> <name>tika.boilerpipe</name> <value>false</value> + <description>Define if the parser tika uses boilerpipe or not. This property needs to activate the parse-tika in the plugin.includes property. + </description> </property> + <property> <name>tika.boilerpipe.extractor</name> <value>ArticleExtractor</value> + <description>Define what algorithm boilerpipe uses. + </description> </property> http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java index cc3af15..ec77607 100644 --- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java +++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java @@ -44,7 +44,7 @@ import org.slf4j.LoggerFactory; /** * Reads and parses a URL and run the indexers on it. Displays the fields - * obtained and the first 100 characters of their value + * obtained and all the characters of their value * * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker * http://www.lemonde.fr @@ -145,8 +145,7 @@ public class IndexingFiltersChecker extends Configured implements Tool { if (values != null) { for (Object value : values) { String str = value.toString(); - int minText = Math.min(100, str.length()); - System.out.println(fname + " :\t" + str.substring(0, minText)); + System.out.println(fname + " :\t" + str); } } } http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java index baa40d6..de9768e 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java @@ -19,15 +19,15 @@ package org.apache.nutch.parse.tika; import java.lang.ClassLoader; import java.lang.InstantiationException; import java.util.WeakHashMap; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.parser.html.BoilerpipeContentHandler; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.extractors.*; class BoilerpipeExtractorRepository { - public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class); + public static final Logger LOG = LoggerFactory.getLogger(BoilerpipeExtractorRepository.class); public static final WeakHashMap<String, BoilerpipeExtractor> extractorRepository = new WeakHashMap<String, BoilerpipeExtractor>(); /** @@ -48,11 +48,11 @@ class BoilerpipeExtractorRepository { extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance()); } catch (ClassNotFoundException e) { - LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!"); + LOG.error("BoilerpipeExtractor {} not found!", boilerpipeExtractorName); } catch (InstantiationException e) { - LOG.error("Could not instantiate " + boilerpipeExtractorName); + LOG.error("Could not instantiate {}!", boilerpipeExtractorName); } catch (Exception e) { - LOG.error(e); + LOG.error("Error due to the {}!",boilerpipeExtractorName, e); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index fb0bbe3..9da6160 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -106,19 +106,17 @@ public class TikaParser implements org.apache.nutch.parse.Parser { message, getConf()); } - LOG.debug("Using Tika parser " + parser.getClass().getName() - + " for mime-type " + mimeType); + LOG.debug("Using Tika parser {} for mime-type {}.", parser.getClass().getName(), mimeType); Metadata tikamd = new Metadata(); HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); - // DOMBuilder domhandler = new DOMBuilder(doc, root); ContentHandler domHandler; // Check whether to use Tika's BoilerplateContentHandler if (useBoilerpipe) { - LOG.debug("Using Tikas's Boilerpipe with Extractor: " + boilerpipeExtractorName); + LOG.debug("Using Tikas's Boilerpipe with Extractor: {}.", boilerpipeExtractorName); BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); bpHandler.setIncludeMarkup(true); domHandler = (ContentHandler)bpHandler; @@ -136,7 +134,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining()), (ContentHandler)domHandler, tikamd, context); } catch (Exception e) { - LOG.error("Error parsing " + url, e); + LOG.error("Error parsing {}.", url, e); return ParseStatusUtils.getEmptyParse(e, getConf()); } @@ -169,19 +167,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser { title = sb.toString().trim(); } - // Warning: very nasty - // Parse again without BP to get all outlinks + // Parse again without boilerpipe to get all outlinks + // TODO avoid this second parsing if (useBoilerpipe) { root = doc.createDocumentFragment(); domHandler = new DOMBuilder(doc, root); try { parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining()), (ContentHandler)domHandler, tikamd, context); } catch (Exception e) { - LOG.error("Error parsing "+url,e); + LOG.error("Error parsing {}.", url, e); return ParseStatusUtils.getEmptyParse(e, getConf()); } } - // END NASTY STUFF if (!metaTags.getNoFollow()) { // okay to follow links
