Author: nick
Date: Thu Aug 28 06:47:33 2014
New Revision: 1621062
URL: http://svn.apache.org/r1621062
Log:
More content handler examples
Added:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc
(with props)
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java?rev=1621062&r1=1621061&r2=1621062&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
Thu Aug 28 06:47:33 2014
@@ -24,6 +24,10 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -67,7 +71,51 @@ public class ContentHandlerExample {
}
}
- // TODO Only one part of the file as HTML
-
- // TODO Plain text, in chunks of a maximum size
+ /**
+ * Example of extracting just the body as HTML, without the
+ * head part, as a string
+ */
+ public String parseBodyToHTML() throws IOException, SAXException,
TikaException {
+ ContentHandler handler = new BodyContentHandler(
+ new ToXMLContentHandler());
+
+ InputStream stream =
ContentHandlerExample.class.getResourceAsStream("test.doc");
+ AutoDetectParser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ try {
+ parser.parse(stream, handler, metadata);
+ return handler.toString();
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Example of extracting just one part of the document's body
+ * as an HTML string, excluding the rest
+ */
+ public String parseOnePartToHTML() throws IOException, SAXException,
TikaException {
+ // Only get things under html -> body -> div (any such div; the XPath itself does not test class=header)
+ XPathParser xhtmlParser = new XPathParser("xhtml",
XHTMLContentHandler.XHTML);
+ Matcher divContentMatcher = xhtmlParser.parse(
+ "/xhtml:html/xhtml:body/xhtml:div/descendant::node()");
+ ContentHandler handler = new MatchingContentHandler(
+ new ToXMLContentHandler(), divContentMatcher);
+
+ InputStream stream =
ContentHandlerExample.class.getResourceAsStream("test2.doc");
+ AutoDetectParser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ try {
+ parser.parse(stream, handler, metadata);
+ return handler.toString();
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Example of extracting the plain text in chunks, with each chunk
+ * of no more than a certain maximum size
+ */
+ // TODO Implement
}
Added:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc?rev=1621062&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java?rev=1621062&r1=1621061&r2=1621062&view=diff
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
(original)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
Thu Aug 28 06:47:33 2014
@@ -26,6 +26,7 @@ import java.io.IOException;
import static org.junit.Assert.assertEquals;
import static org.apache.tika.TikaTest.assertContains;
+import static org.apache.tika.TikaTest.assertNotContained;
public class ContentHandlerExampleTest {
ContentHandlerExample example;
@@ -53,5 +54,32 @@ public class ContentHandlerExampleTest {
assertContains(">test", result);
}
+ @Test
+ public void testParseBodyToHTML() throws IOException, SAXException,
TikaException {
+ String result = example.parseBodyToHTML().trim();
+
+ assertNotContained("<html", result);
+ assertNotContained("<head>", result);
+ assertNotContained("<meta name=\"dc:creator\"", result);
+ assertNotContained("<title>", result);
+ assertNotContained("<body>", result);
+ assertContains(">test", result);
+ }
+
+ @Test
+ public void testParseOnePartToHTML() throws IOException, SAXException,
TikaException {
+ String result = example.parseOnePartToHTML().trim();
+
+ assertNotContained("<html", result);
+ assertNotContained("<head>", result);
+ assertNotContained("<meta name=\"dc:creator\"", result);
+ assertNotContained("<title>", result);
+ assertNotContained("<body>", result);
+ assertContains("<p class=\"header\"", result);
+ assertContains("This is in the header", result);
+ assertNotContained("<h1>Test Document", result);
+ assertNotContained("<p>1 2 3", result);
+ }
+
// TODO Implement then test the other two methods
}