Author: kkrugler
Date: Mon Jul 12 17:32:12 2010
New Revision: 963375
URL: http://svn.apache.org/viewvc?rev=963375&view=rev
Log:
TIKA-420: Integration of Boilerpipe.
- Add BoilerpipeContentHandler, test case
- Roll in patch for new -T option to TikaCLI
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate.html
(with props)
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=963375&r1=963374&r2=963375&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Mon Jul
12 17:32:12 2010
@@ -43,6 +43,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
@@ -88,6 +89,12 @@ public class TikaCLI {
}
};
+ /**
+ * Output type selected by the -T / --text-main command line option.
+ * Its content handler is a {@link BoilerpipeContentHandler} constructed
+ * from the result of getSystemOutWriter(encoding).
+ */
+ private final OutputType TEXT_MAIN = new OutputType() {
+ public ContentHandler getContentHandler() throws Exception {
+ return new BoilerpipeContentHandler(getSystemOutWriter(encoding));
+ }
+ };
+
private final OutputType METADATA = new OutputType() {
public ContentHandler getContentHandler() throws Exception {
final PrintWriter writer =
@@ -146,6 +153,8 @@ public class TikaCLI {
type = HTML;
} else if (arg.equals("-t") || arg.equals("--text")) {
type = TEXT;
+ } else if (arg.equals("-T") || arg.equals("--text-main")) {
+ type = TEXT_MAIN;
} else if (arg.equals("-m") || arg.equals("--metadata")) {
type = METADATA;
} else {
@@ -171,6 +180,7 @@ public class TikaCLI {
metadata, context);
} finally {
input.close();
+ System.out.flush();
}
}
}
@@ -188,6 +198,7 @@ public class TikaCLI {
out.println(" -x or --xml Output XHTML content (default)");
out.println(" -h or --html Output HTML content");
out.println(" -t or --text Output plain text content");
+ out.println(" -T or --text-main Output plain text content (main
content only)");
out.println(" -m or --metadata Output only metadata");
out.println();
out.println("Description:");
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=963375&r1=963374&r2=963375&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon Jul 12 17:32:12 2010
@@ -38,6 +38,16 @@
<poi.version>3.7-beta1</poi.version>
</properties>
+ <repositories>
+ <!-- for boilerpipe, to be removed as per TIKA-462 when Boilerpipe is
sync'd to Maven central from Sonatype -->
+ <repository>
+ <id>maven2-repository.dev.java.net</id>
+ <name>Java.net Repository for Maven</name>
+ <url>http://download.java.net/maven/2/</url>
+ <layout>default</layout>
+ </repository>
+ </repositories>
+
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
@@ -123,6 +133,11 @@
<version>2.4.0-beta-1</version>
</dependency>
<dependency>
+ <groupId>de.l3s.boilerpipe</groupId>
+ <artifactId>boilerpipe</artifactId>
+ <version>1.0.4</version>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=963375&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
Mon Jul 12 17:32:12 2010
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}.
+ * When the document ends, the text document obtained from
+ * <code>toTextDocument()</code> is run through the configured extractor and
+ * every block flagged as content is forwarded to the delegate handler as
+ * one XHTML <code>p</code> element.
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+    /** Receives the extracted main content; assigned once at construction. */
+    private final ContentHandler delegate;
+
+    /** Extraction rules applied to the parsed document. */
+    private final BoilerpipeExtractor extractor;
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules and "delegate" as the
+     * content handler.
+     *
+     * @param delegate
+     *            The {@link ContentHandler} object
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate) {
+        this(delegate, DefaultExtractor.INSTANCE);
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules, that forwards the
+     * extracted main content to a {@link WriteOutContentHandler} wrapped
+     * around the given writer.
+     *
+     * @param writer
+     *            writer handed to the {@link WriteOutContentHandler}
+     */
+    public BoilerpipeContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the given
+     * extraction rules. The extracted main content will be passed to the
+     * <code>delegate</code> content handler.
+     *
+     * @param delegate
+     *            The {@link ContentHandler} object
+     * @param extractor
+     *            Extraction rules to use, e.g. {@link ArticleExtractor}
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+        this.delegate = delegate;
+        this.extractor = extractor;
+    }
+
+    /**
+     * Finishes the parsed document, applies the extraction rules and
+     * replays the resulting main content to the delegate as a complete
+     * document (startDocument, one <code>p</code> element per content
+     * block, endDocument).
+     *
+     * @throws SAXException
+     *             if the extractor fails (the
+     *             {@link BoilerpipeProcessingException} is kept as the
+     *             cause) or if the delegate rejects an event
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+
+        TextDocument td = toTextDocument();
+        try {
+            extractor.process(td);
+        } catch (BoilerpipeProcessingException e) {
+            throw new SAXException(e);
+        }
+
+        // SAX requires an empty Attributes object, never null, when an
+        // element has no attributes; delegates may dereference it.
+        org.xml.sax.Attributes noAttributes = new org.xml.sax.helpers.AttributesImpl();
+
+        delegate.startDocument();
+
+        for (TextBlock block : td.getTextBlocks()) {
+            if (block.isContent()) {
+                delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", noAttributes);
+                char[] chars = block.getText().toCharArray();
+                delegate.characters(chars, 0, chars.length);
+                delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+            }
+        }
+
+        delegate.endDocument();
+    }
+}
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=963375&r1=963374&r2=963375&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Mon Jul 12 17:32:12 2010
@@ -31,6 +31,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -377,4 +378,24 @@ public class HtmlParserTest extends Test
assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
}
+ /**
+  * Verifies that parsing the boilerplate test page through a
+  * {@link BoilerpipeContentHandler} keeps the main paragraphs and drops
+  * the navigation and footer links (TIKA-420).
+  *
+  * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
+  */
+ public void testBoilerplateRemoval() throws Exception {
+     WriteOutContentHandler extracted = new WriteOutContentHandler();
+     ContentHandler boilerpipe = new BoilerpipeContentHandler(extracted);
+
+     HtmlParser parser = new HtmlParser();
+     parser.parse(
+             HtmlParserTest.class.getResourceAsStream("/test-documents/boilerplate.html"),
+             boilerpipe, new Metadata(), new ParseContext());
+
+     String text = extracted.toString();
+
+     // Main content must survive, from its first to its last sentence.
+     assertTrue(text.startsWith("This is the real meat"));
+     assertTrue(text.endsWith("This is the end of the text."));
+
+     // Link-only blocks around it must have been filtered out.
+     assertFalse(text.contains("boilerplate"));
+     assertFalse(text.contains("footer"));
+ }
+
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate.html
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate.html?rev=963375&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate.html
(added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate.html
Mon Jul 12 17:32:12 2010
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+ <title>Title</title>
+</head>
+<body>
+
+<table>
+ <tr>
+ <td>
+ <table>
+ <tr>
+ <td ><a
href="Main.php">boilerplate</a></td>
+ <td ><a href="Main.php">text</a></td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+</table>
+
+<p>This is the real meat of the page,
+and represents the text we want.
+It has lots of juicy content.
+
+We assume that it won't get filtered out.
+And that all of the lines will be in the
+output.
+</p>
+
+<p>
+Here's another paragraph of text.
+This is the end of the text.
+</p>
+
+<p><a href="footer.html">footer</a></p>
+
+</body>
+</html>
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate.html
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/boilerplate.html
------------------------------------------------------------------------------
svn:mime-type = text/html