enhancement of the boilerpipe patch

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/32dd379d
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/32dd379d
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/32dd379d

Branch: refs/heads/2.x
Commit: 32dd379d4dbf46c707b4554b0385bb3345f74797
Parents: be91764
Author: Jérémie Bourseau <[email protected]>
Authored: Mon Feb 29 14:36:49 2016 +0100
Committer: Jérémie Bourseau <[email protected]>
Committed: Mon Feb 29 14:36:49 2016 +0100

----------------------------------------------------------------------
 conf/nutch-default.xml                               |  5 +++++
 .../apache/nutch/indexer/IndexingFiltersChecker.java |  5 ++---
 .../parse/tika/BoilerpipeExtractorRepository.java    | 12 ++++++------
 .../java/org/apache/nutch/parse/tika/TikaParser.java | 15 ++++++---------
 4 files changed, 19 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 30c5831..117737b 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -881,10 +881,15 @@
 <property>
   <name>tika.boilerpipe</name>
   <value>false</value>
+  <description>Defines whether the Tika parser uses boilerpipe. This property requires parse-tika to be activated in the plugin.includes property.
+  </description>
 </property>
+
 <property>
   <name>tika.boilerpipe.extractor</name>
   <value>ArticleExtractor</value>
+  <description>Define what algorithm boilerpipe uses.
+  </description>
 </property>
 
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index cc3af15..ec77607 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -44,7 +44,7 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Reads and parses a URL and run the indexers on it. Displays the fields
- * obtained and the first 100 characters of their value
+ * obtained and all the characters of their value
  * 
  * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker
  * http://www.lemonde.fr
@@ -145,8 +145,7 @@ public class IndexingFiltersChecker extends Configured implements Tool {
       if (values != null) {
         for (Object value : values) {
           String str = value.toString();
-          int minText = Math.min(100, str.length());
-          System.out.println(fname + " :\t" + str.substring(0, minText));
+          System.out.println(fname + " :\t" + str);
         }
       }
     }

http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
index baa40d6..de9768e 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -19,15 +19,15 @@ package org.apache.nutch.parse.tika;
 import java.lang.ClassLoader;
 import java.lang.InstantiationException;
 import java.util.WeakHashMap;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import de.l3s.boilerpipe.BoilerpipeExtractor;
 import de.l3s.boilerpipe.extractors.*;
 
 class BoilerpipeExtractorRepository {
 
-    public static final Log LOG = 
LogFactory.getLog(BoilerpipeExtractorRepository.class);
+       public static final Logger LOG = 
LoggerFactory.getLogger(BoilerpipeExtractorRepository.class);
     public static final WeakHashMap<String, BoilerpipeExtractor> 
extractorRepository = new WeakHashMap<String, BoilerpipeExtractor>();
  
     /**
@@ -48,11 +48,11 @@ class BoilerpipeExtractorRepository {
           extractorRepository.put(boilerpipeExtractorName, 
(BoilerpipeExtractor)extractorClass.newInstance());
 
         } catch (ClassNotFoundException e) {
-          LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not 
found!");
+          LOG.error("BoilerpipeExtractor {} not found!", 
boilerpipeExtractorName);
         } catch (InstantiationException e) {
-          LOG.error("Could not instantiate " + boilerpipeExtractorName);
+          LOG.error("Could not instantiate {}!", boilerpipeExtractorName);
         } catch (Exception e) {
-          LOG.error(e);
+          LOG.error("Error due to the {}!",boilerpipeExtractorName, e);
         }
       }
 

http://git-wip-us.apache.org/repos/asf/nutch/blob/32dd379d/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index fb0bbe3..9da6160 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -106,19 +106,17 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
           message, getConf());
     }
 
-    LOG.debug("Using Tika parser " + parser.getClass().getName()
-        + " for mime-type " + mimeType);
+    LOG.debug("Using Tika parser {} for mime-type {}.", 
parser.getClass().getName(), mimeType);
 
     Metadata tikamd = new Metadata();
 
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
     doc.setErrorChecking(false);
     DocumentFragment root = doc.createDocumentFragment();
-   // DOMBuilder domhandler = new DOMBuilder(doc, root);
     ContentHandler domHandler;
     // Check whether to use Tika's BoilerplateContentHandler
     if (useBoilerpipe) {
-        LOG.debug("Using Tikas's Boilerpipe with Extractor: " + 
boilerpipeExtractorName);
+        LOG.debug("Using Tikas's Boilerpipe with Extractor: {}.", 
boilerpipeExtractorName);
         BoilerpipeContentHandler bpHandler = new 
BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), 
BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
         bpHandler.setIncludeMarkup(true);
         domHandler = (ContentHandler)bpHandler;
@@ -136,7 +134,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
       parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset()
           + raw.position(), raw.remaining()), (ContentHandler)domHandler, 
tikamd, context);
     } catch (Exception e) {
-      LOG.error("Error parsing " + url, e);
+      LOG.error("Error parsing {}.", url, e);
       return ParseStatusUtils.getEmptyParse(e, getConf());
     }
 
@@ -169,19 +167,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
       title = sb.toString().trim();
     }
 
-    // Warning: very nasty
-    // Parse again without BP to get all outlinks
+    // Parse again without boilerpipe to get all outlinks
+    // TODO avoid this second parsing
     if (useBoilerpipe) {
         root = doc.createDocumentFragment();
         domHandler = new DOMBuilder(doc, root);
         try {
             parser.parse(new ByteArrayInputStream(raw.array(), 
raw.arrayOffset() + raw.position(), raw.remaining()), 
(ContentHandler)domHandler, tikamd, context);
         } catch (Exception e) {
-           LOG.error("Error parsing "+url,e);
+           LOG.error("Error parsing {}.", url, e);
            return ParseStatusUtils.getEmptyParse(e, getConf());
         }
     }
-    // END NASTY STUFF
     
 
     if (!metaTags.getNoFollow()) { // okay to follow links

Reply via email to