NUTCH-961 improve parser with boilerpipe
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/93ea2e51 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/93ea2e51 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/93ea2e51 Branch: refs/heads/2.x Commit: 93ea2e51f444447be41ec93b2c0b0b61c117eeb3 Parents: 3e80673 Author: Jérémie Bourseau <[email protected]> Authored: Fri Feb 26 11:37:28 2016 +0100 Committer: Jérémie Bourseau <[email protected]> Committed: Fri Feb 26 11:59:27 2016 +0100 ---------------------------------------------------------------------- conf/nutch-default.xml | 13 ++++ .../tika/BoilerpipeExtractorRepository.java | 62 ++++++++++++++++++++ .../org/apache/nutch/parse/tika/DOMBuilder.java | 4 +- .../org/apache/nutch/parse/tika/TikaParser.java | 35 ++++++++++- 4 files changed, 111 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/93ea2e51/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 76a2d59..30c5831 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -876,6 +876,19 @@ </description> </property> +<!-- tika properties --> + +<property> + <name>tika.boilerpipe</name> + <value>false</value> +</property> +<property> + <name>tika.boilerpipe.extractor</name> + <value>ArticleExtractor</value> +</property> + + + <!-- mime properties --> <!-- http://git-wip-us.apache.org/repos/asf/nutch/blob/93ea2e51/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java new file mode 100644 index 0000000..baa40d6 --- /dev/null +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.tika; + +import java.lang.ClassLoader; +import java.lang.InstantiationException; +import java.util.WeakHashMap; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.parser.html.BoilerpipeContentHandler; +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.extractors.*; + +class BoilerpipeExtractorRepository { + + public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class); + public static final WeakHashMap<String, BoilerpipeExtractor> extractorRepository = new WeakHashMap<String, BoilerpipeExtractor>(); + + /** + * Returns an instance of the specified extractor + */ + public static BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) { + // Check if there's no instance of this extractor + if (!extractorRepository.containsKey(boilerpipeExtractorName)) { + // FQCN + boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName; + + // Attempt to load the class + try { + ClassLoader loader = BoilerpipeExtractor.class.getClassLoader(); + Class extractorClass = loader.loadClass(boilerpipeExtractorName); + + // Add an instance to the repository + extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance()); + + } catch (ClassNotFoundException e) { + LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!"); + } catch (InstantiationException e) { + LOG.error("Could not instantiate " + boilerpipeExtractorName); + } catch (Exception e) { + LOG.error(e); + } + } + + return extractorRepository.get(boilerpipeExtractorName); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/93ea2e51/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java index 13d710f..4f4c8a7 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java @@ -345,7 +345,9 @@ class DOMBuilder implements ContentHandler, LexicalHandler { */ public void endElement(String ns, String localName, String name) throws org.xml.sax.SAXException { - m_elemStack.pop(); + if (!m_elemStack.isEmpty()) { + m_elemStack.pop(); + } m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek(); } http://git-wip-us.apache.org/repos/asf/nutch/blob/93ea2e51/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 00aa30b..fb0bbe3 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -37,9 +37,11 @@ import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.html.HtmlMapper; +import org.apache.tika.parser.html.BoilerpipeContentHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; +import org.xml.sax.ContentHandler; import java.io.ByteArrayInputStream; import java.io.DataInputStream; @@ -80,6 +82,9 @@ public class TikaParser implements org.apache.nutch.parse.Parser { @Override public Parse getParse(String url, WebPage page) { + boolean useBoilerpipe = getConf().getBoolean("tika.boilerpipe", false); + String boilerpipeExtractorName = getConf().get("tika.boilerpipe.extractor", "ArticleExtractor"); + String baseUrl = TableUtil.toString(page.getBaseUrl()); URL base; try { @@ -109,7 +114,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser { HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment root = doc.createDocumentFragment(); - DOMBuilder domhandler = new DOMBuilder(doc, root); + // DOMBuilder domhandler = new DOMBuilder(doc, root); + ContentHandler domHandler; + // Check whether to use Tika's BoilerplateContentHandler + if (useBoilerpipe) { + LOG.debug("Using Tikas's Boilerpipe with Extractor: " + boilerpipeExtractorName); + BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); + bpHandler.setIncludeMarkup(true); + domHandler = (ContentHandler)bpHandler; + } else { + domHandler = new DOMBuilder(doc, root); + } + ParseContext context = new ParseContext(); if (HTMLMapper != null) context.set(HtmlMapper.class, HTMLMapper); @@ -118,7 +134,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { tikamd.set(Metadata.CONTENT_TYPE, mimeType); try { parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() - + raw.position(), raw.remaining()), domhandler, tikamd, context); + + raw.position(), raw.remaining()), (ContentHandler)domHandler, tikamd, context); } catch (Exception e) { LOG.error("Error parsing " + url, e); return ParseStatusUtils.getEmptyParse(e, getConf()); @@ -153,6 +169,21 @@ public class TikaParser implements org.apache.nutch.parse.Parser { title = sb.toString().trim(); } + // Warning: very nasty + // Parse again without BP to get all outlinks + if (useBoilerpipe) { + root = doc.createDocumentFragment(); + domHandler = new DOMBuilder(doc, root); + try { + parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining()), (ContentHandler)domHandler, tikamd, context); + } catch (Exception e) { + LOG.error("Error parsing "+url,e); + return ParseStatusUtils.getEmptyParse(e, getConf()); + } + } + // END NASTY STUFF + + if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks URL baseTag = utils.getBase(root);
