Author: kkrugler
Date: Thu Aug 12 14:06:50 2010
New Revision: 984791

URL: http://svn.apache.org/viewvc?rev=984791&view=rev
Log:
TIKA-477: Add GUI support for Boilerpipe, and improve output from Boilerpipe 
content handler.

Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=984791&r1=984790&r2=984791&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Aug 12 14:06:50 2010
@@ -42,6 +42,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.TeeContentHandler;
@@ -104,6 +105,11 @@ public class TikaGUI extends JFrame {
     private final JEditorPane text;
 
     /**
+     * Main content output.
+     */
+    private final JEditorPane textMain;
+    
+    /**
      * Raw XHTML source.
      */
     private final JEditorPane xml;
@@ -127,6 +133,7 @@ public class TikaGUI extends JFrame {
 
         html = createEditor("Formatted text", "text/html");
         text = createEditor("Plain text", "text/plain");
+        textMain = createEditor("Main content", "text/plain");
         xml = createEditor("Structured text", "text/plain");
         metadata = createEditor("Metadata", "text/plain");
         errors = createEditor("Errors", "text/plain");
@@ -144,12 +151,14 @@ public class TikaGUI extends JFrame {
         try {
             StringWriter htmlBuffer = new StringWriter();
             StringWriter textBuffer = new StringWriter();
+            StringWriter textMainBuffer = new StringWriter();
             StringWriter xmlBuffer = new StringWriter();
             StringBuilder metadataBuffer = new StringBuilder();
 
             ContentHandler handler = new TeeContentHandler(
                     getHtmlHandler(htmlBuffer),
                     getTextContentHandler(textBuffer),
+                    getTextMainContentHandler(textMainBuffer),
                     getXmlContentHandler(xmlBuffer));
 
             input = new ProgressMonitorInputStream(
@@ -169,6 +178,7 @@ public class TikaGUI extends JFrame {
             setText(metadata, metadataBuffer.toString());
             setText(xml, xmlBuffer.toString());
             setText(text, textBuffer.toString());
+            setText(textMain, textMainBuffer.toString());
             setText(html, htmlBuffer.toString());
             tabs.setSelectedIndex(0);
         } catch (Exception e) {
@@ -265,6 +275,9 @@ public class TikaGUI extends JFrame {
     private ContentHandler getTextContentHandler(Writer writer) {
         return new BodyContentHandler(writer);
     }
+    private ContentHandler getTextMainContentHandler(Writer writer) {
+        return new BoilerpipeContentHandler(writer);
+    }
 
     private ContentHandler getXmlContentHandler(Writer writer)
             throws TransformerConfigurationException {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=984791&r1=984790&r2=984791&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Thu Aug 12 14:06:50 2010
@@ -41,6 +41,11 @@ import de.l3s.boilerpipe.sax.BoilerpipeH
  */
 public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
 
+    /**
+     * The newline character that gets inserted after block elements.
+     */
+    private static final char[] NL = new char[] { '\n' };
+
     private ContentHandler delegate;
     private BoilerpipeExtractor extractor;
 
@@ -99,6 +104,7 @@ public class BoilerpipeContentHandler ex
                 char[] chars = block.getText().toCharArray();
                 delegate.characters(chars, 0, chars.length);
                 delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+                delegate.ignorableWhitespace(NL, 0, NL.length);
             }
         }
         


Reply via email to