[1/5] nutch git commit: improve parser with boilerpipe

mattmann Sat, 19 Mar 2016 17:47:27 -0700

Repository: nutch
Updated Branches:
  refs/heads/2.x 876aa4f3d -> d16b5afa2



improve parser with boilerpipe


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/f185bc44
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/f185bc44
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/f185bc44

Branch: refs/heads/2.x
Commit: f185bc4461c57a1a85578de0ecf0884c7026c3a6
Parents: 3e80673
Author: Jérémie Bourseau <[email protected]>
Authored: Fri Feb 26 11:37:28 2016 +0100
Committer: Jérémie Bourseau <[email protected]>
Committed: Fri Feb 26 11:37:28 2016 +0100

----------------------------------------------------------------------
 conf/nutch-default.xml                          | 13 ++++
 .../tika/BoilerpipeExtractorRepository.java     | 62 ++++++++++++++++++++
 .../org/apache/nutch/parse/tika/DOMBuilder.java |  4 +-
 .../org/apache/nutch/parse/tika/TikaParser.java | 35 ++++++++++-
 4 files changed, 111 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/f185bc44/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 76a2d59..30c5831 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -876,6 +876,19 @@
   </description>
 </property>
 
+<!-- tika properties -->
+
+<property>
+  <name>tika.boilerpipe</name>
+  <value>false</value>
+</property>
+<property>
+  <name>tika.boilerpipe.extractor</name>
+  <value>ArticleExtractor</value>
+</property>
+
+
+
 <!-- mime properties -->
 
 <!--

http://git-wip-us.apache.org/repos/asf/nutch/blob/f185bc44/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
 
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
new file mode 100644
index 0000000..baa40d6
--- /dev/null
+++ 
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.lang.ClassLoader;
+import java.lang.InstantiationException;
+import java.util.WeakHashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.extractors.*;
+
+class BoilerpipeExtractorRepository {
+
+    public static final Log LOG = 
LogFactory.getLog(BoilerpipeExtractorRepository.class);
+    public static final WeakHashMap<String, BoilerpipeExtractor> 
extractorRepository = new WeakHashMap<String, BoilerpipeExtractor>();
+ 
+    /**
+     * Returns an instance of the specified extractor
+     */
+    public static BoilerpipeExtractor getExtractor(String 
boilerpipeExtractorName) {
+      // Check if there's no instance of this extractor
+      if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+        // FQCN
+        boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + 
boilerpipeExtractorName;
+
+        // Attempt to load the class
+        try {
+          ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+          Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+
+          // Add an instance to the repository
+          extractorRepository.put(boilerpipeExtractorName, 
(BoilerpipeExtractor)extractorClass.newInstance());
+
+        } catch (ClassNotFoundException e) {
+          LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not 
found!");
+        } catch (InstantiationException e) {
+          LOG.error("Could not instantiate " + boilerpipeExtractorName);
+        } catch (Exception e) {
+          LOG.error(e);
+        }
+      }
+
+      return extractorRepository.get(boilerpipeExtractorName);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/f185bc44/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java 
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
index 13d710f..4f4c8a7 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
@@ -345,7 +345,9 @@ class DOMBuilder implements ContentHandler, LexicalHandler {
    */
   public void endElement(String ns, String localName, String name)
       throws org.xml.sax.SAXException {
-    m_elemStack.pop();
+    if (!m_elemStack.isEmpty()) {
+        m_elemStack.pop();
+    }
     m_currentNode = m_elemStack.isEmpty() ? null : (Node) m_elemStack.peek();
   }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/f185bc44/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java 
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 00aa30b..fb0bbe3 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -37,9 +37,11 @@ import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
+import org.xml.sax.ContentHandler;
 
 import java.io.ByteArrayInputStream;
 import java.io.DataInputStream;
@@ -80,6 +82,9 @@ public class TikaParser implements 
org.apache.nutch.parse.Parser {
   @Override
   public Parse getParse(String url, WebPage page) {
 
+    boolean useBoilerpipe = getConf().getBoolean("tika.boilerpipe", false); 
+    String boilerpipeExtractorName = 
getConf().get("tika.boilerpipe.extractor", "ArticleExtractor");
+
     String baseUrl = TableUtil.toString(page.getBaseUrl());
     URL base;
     try {
@@ -109,7 +114,18 @@ public class TikaParser implements 
org.apache.nutch.parse.Parser {
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
     doc.setErrorChecking(false);
     DocumentFragment root = doc.createDocumentFragment();
-    DOMBuilder domhandler = new DOMBuilder(doc, root);
+   // DOMBuilder domhandler = new DOMBuilder(doc, root);
+    ContentHandler domHandler;
+    // Check whether to use Tika's BoilerplateContentHandler
+    if (useBoilerpipe) {
+        LOG.debug("Using Tikas's Boilerpipe with Extractor: " + 
boilerpipeExtractorName);
+        BoilerpipeContentHandler bpHandler = new 
BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), 
BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+        bpHandler.setIncludeMarkup(true);
+        domHandler = (ContentHandler)bpHandler;
+    } else {
+        domHandler = new DOMBuilder(doc, root);
+    }
+    
     ParseContext context = new ParseContext();
     if (HTMLMapper != null)
       context.set(HtmlMapper.class, HTMLMapper);
@@ -118,7 +134,7 @@ public class TikaParser implements 
org.apache.nutch.parse.Parser {
     tikamd.set(Metadata.CONTENT_TYPE, mimeType);
     try {
       parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset()
-          + raw.position(), raw.remaining()), domhandler, tikamd, context);
+          + raw.position(), raw.remaining()), (ContentHandler)domHandler, 
tikamd, context);
     } catch (Exception e) {
       LOG.error("Error parsing " + url, e);
       return ParseStatusUtils.getEmptyParse(e, getConf());
@@ -153,6 +169,21 @@ public class TikaParser implements 
org.apache.nutch.parse.Parser {
       title = sb.toString().trim();
     }
 
+    // Warning: very nasty
+    // Parse again without BP to get all outlinks
+    if (useBoilerpipe) {
+        root = doc.createDocumentFragment();
+        domHandler = new DOMBuilder(doc, root);
+        try {
+            parser.parse(new ByteArrayInputStream(raw.array(), 
raw.arrayOffset() + raw.position(), raw.remaining()), 
(ContentHandler)domHandler, tikamd, context);
+        } catch (Exception e) {
+           LOG.error("Error parsing "+url,e);
+           return ParseStatusUtils.getEmptyParse(e, getConf());
+        }
+    }
+    // END NASTY STUFF
+    
+
     if (!metaTags.getNoFollow()) { // okay to follow links
       ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
       URL baseTag = utils.getBase(root);

[1/5] nutch git commit: improve parser with boilerpipe

Reply via email to