Author: nick
Date: Thu Aug 28 06:47:33 2014
New Revision: 1621062

URL: http://svn.apache.org/r1621062
Log:
More content handler examples

Added:
    
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc   
(with props)
Modified:
    
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
    
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java

Modified: 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java?rev=1621062&r1=1621061&r2=1621062&view=diff
==============================================================================
--- 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
 (original)
+++ 
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
 Thu Aug 28 06:47:33 2014
@@ -24,6 +24,10 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -67,7 +71,51 @@ public class ContentHandlerExample {
         }
     }
     
-    // TODO Only one part of the file as HTML
-
-    // TODO Plain text, in chunks of a maximum size
+    /**
+     * Example of extracting just the body of the document as an
+     *  HTML string, without the html / head / body wrapper elements
+     */
+    public String parseBodyToHTML() throws IOException, SAXException, 
TikaException {
+        ContentHandler handler = new BodyContentHandler(
+                new ToXMLContentHandler());
+        
+        InputStream stream = 
ContentHandlerExample.class.getResourceAsStream("test.doc");
+        AutoDetectParser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        try {
+            parser.parse(stream, handler, metadata);
+            return handler.toString();
+        } finally {
+            stream.close();
+        }
+    }
+    
+    /**
+     * Example of extracting just one part of the document's body
+     *  as an HTML string, excluding the rest
+     */
+    public String parseOnePartToHTML() throws IOException, SAXException, 
TikaException {
+        // Only get things under html -> body -> div (expected to be the class=header div; the XPath does not test the class)
+        XPathParser xhtmlParser = new XPathParser("xhtml", 
XHTMLContentHandler.XHTML);
+        Matcher divContentMatcher = xhtmlParser.parse(
+                "/xhtml:html/xhtml:body/xhtml:div/descendant::node()");        
+        ContentHandler handler = new MatchingContentHandler(
+                new ToXMLContentHandler(), divContentMatcher);
+        
+        InputStream stream = 
ContentHandlerExample.class.getResourceAsStream("test2.doc");
+        AutoDetectParser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        try {
+            parser.parse(stream, handler, metadata);
+            return handler.toString();
+        } finally {
+            stream.close();
+        }
+    }
+    
+    /**
+     * Example of extracting the plain text in chunks, with each chunk
+     *  of no more than a certain maximum size
+     */
+    // TODO Implement
 }

Added: 
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc?rev=1621062&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/test2.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java?rev=1621062&r1=1621061&r2=1621062&view=diff
==============================================================================
--- 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
 (original)
+++ 
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
 Thu Aug 28 06:47:33 2014
@@ -26,6 +26,7 @@ import java.io.IOException;
 
 import static org.junit.Assert.assertEquals;
 import static org.apache.tika.TikaTest.assertContains;
+import static org.apache.tika.TikaTest.assertNotContained;
 
 public class ContentHandlerExampleTest {
     ContentHandlerExample example;
@@ -53,5 +54,32 @@ public class ContentHandlerExampleTest {
         assertContains(">test", result);
     }
 
+    @Test
+    public void testParseBodyToHTML() throws IOException, SAXException, 
TikaException {
+        String result = example.parseBodyToHTML().trim();
+        
+        assertNotContained("<html", result);
+        assertNotContained("<head>", result);
+        assertNotContained("<meta name=\"dc:creator\"", result);
+        assertNotContained("<title>", result);
+        assertNotContained("<body>", result);
+        assertContains(">test", result);
+    }
+
+    @Test
+    public void testParseOnePartToHTML() throws IOException, SAXException, 
TikaException {
+        String result = example.parseOnePartToHTML().trim();
+        
+        assertNotContained("<html", result);
+        assertNotContained("<head>", result);
+        assertNotContained("<meta name=\"dc:creator\"", result);
+        assertNotContained("<title>", result);
+        assertNotContained("<body>", result);
+        assertContains("<p class=\"header\"", result);
+        assertContains("This is in the header", result);
+        assertNotContained("<h1>Test Document", result);
+        assertNotContained("<p>1 2 3", result);
+    }
+
     // TODO Implement then test the other two methods
 }


Reply via email to