Author: kkrugler
Date: Thu Aug 12 14:06:50 2010
New Revision: 984791
URL: http://svn.apache.org/viewvc?rev=984791&view=rev
Log:
TIKA-477: Add GUI support for Boilerpipe, and improve output from Boilerpipe
content handler.
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=984791&r1=984790&r2=984791&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java Thu Aug 12 14:06:50 2010
@@ -42,6 +42,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.TeeContentHandler;
@@ -104,6 +105,11 @@ public class TikaGUI extends JFrame {
private final JEditorPane text;
/**
+ * Main content output.
+ */
+ private final JEditorPane textMain;
+
+ /**
* Raw XHTML source.
*/
private final JEditorPane xml;
@@ -127,6 +133,7 @@ public class TikaGUI extends JFrame {
html = createEditor("Formatted text", "text/html");
text = createEditor("Plain text", "text/plain");
+ textMain = createEditor("Main content", "text/plain");
xml = createEditor("Structured text", "text/plain");
metadata = createEditor("Metadata", "text/plain");
errors = createEditor("Errors", "text/plain");
@@ -144,12 +151,14 @@ public class TikaGUI extends JFrame {
try {
StringWriter htmlBuffer = new StringWriter();
StringWriter textBuffer = new StringWriter();
+ StringWriter textMainBuffer = new StringWriter();
StringWriter xmlBuffer = new StringWriter();
StringBuilder metadataBuffer = new StringBuilder();
ContentHandler handler = new TeeContentHandler(
getHtmlHandler(htmlBuffer),
getTextContentHandler(textBuffer),
+ getTextMainContentHandler(textMainBuffer),
getXmlContentHandler(xmlBuffer));
input = new ProgressMonitorInputStream(
@@ -169,6 +178,7 @@ public class TikaGUI extends JFrame {
setText(metadata, metadataBuffer.toString());
setText(xml, xmlBuffer.toString());
setText(text, textBuffer.toString());
+ setText(textMain, textMainBuffer.toString());
setText(html, htmlBuffer.toString());
tabs.setSelectedIndex(0);
} catch (Exception e) {
@@ -265,6 +275,9 @@ public class TikaGUI extends JFrame {
private ContentHandler getTextContentHandler(Writer writer) {
return new BodyContentHandler(writer);
}
+ private ContentHandler getTextMainContentHandler(Writer writer) {
+ return new BoilerpipeContentHandler(writer);
+ }
private ContentHandler getXmlContentHandler(Writer writer)
throws TransformerConfigurationException {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=984791&r1=984790&r2=984791&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Thu Aug 12 14:06:50 2010
@@ -41,6 +41,11 @@ import de.l3s.boilerpipe.sax.BoilerpipeH
*/
public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+ /**
+ * The newline character that gets inserted after block elements.
+ */
+ private static final char[] NL = new char[] { '\n' };
+
private ContentHandler delegate;
private BoilerpipeExtractor extractor;
@@ -99,6 +104,7 @@ public class BoilerpipeContentHandler ex
char[] chars = block.getText().toCharArray();
delegate.characters(chars, 0, chars.length);
delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+ delegate.ignorableWhitespace(NL, 0, NL.length);
}
}