Author: nick
Date: Thu Aug 28 06:58:16 2014
New Revision: 1621064

URL: http://svn.apache.org/r1621064
Log:
ContentHandler example showing how to break the resulting text up by size

Modified:
    tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
    tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java

Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java?rev=1621064&r1=1621063&r2=1621064&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java Thu Aug 28 06:58:16 2014
@@ -18,11 +18,14 @@ package org.apache.tika.example;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.ToXMLContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.xpath.Matcher;
@@ -113,9 +116,36 @@ public class ContentHandlerExample {
         }
     }
     
+    protected final int MAXIMUM_TEXT_CHUNK_SIZE = 40;
     /**
      * Example of extracting the plain text in chunks, with each chunk
      *  of no more than a certain maximum size
      */
-    // TODO Implement
+    public List<String> parseToPlainTextChunks() throws IOException, SAXException, TikaException {
+        final List<String> chunks = new ArrayList<String>();
+        chunks.add("");
+        ContentHandlerDecorator handler = new ContentHandlerDecorator() {
+            @Override
+            public void characters(char[] ch, int start, int length) {
+                String lastChunk = chunks.get(chunks.size()-1);
+                String thisStr = new String(ch, start, length);
+                
+                if (lastChunk.length()+length > MAXIMUM_TEXT_CHUNK_SIZE) {
+                    chunks.add(thisStr);
+                } else {
+                    chunks.set(chunks.size()-1, lastChunk+thisStr);
+                }
+            }
+        };
+        
+        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc");
+        AutoDetectParser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        try {
+            parser.parse(stream, handler, metadata);
+            return chunks;
+        } finally {
+            stream.close();
+        }
+    }
 }

Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java?rev=1621064&r1=1621063&r2=1621064&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java (original)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java Thu Aug 28 06:58:16 2014
@@ -23,8 +23,10 @@ import org.junit.Test;
 import org.xml.sax.SAXException;
 
 import java.io.IOException;
+import java.util.List;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 import static org.apache.tika.TikaTest.assertContains;
 import static org.apache.tika.TikaTest.assertNotContained;
 
@@ -81,5 +83,23 @@ public class ContentHandlerExampleTest {
         assertNotContained("<p>1 2 3", result);
     }
 
-    // TODO Implement then test the other two methods
+
+    @Test
+    public void testParseToPlainTextChunks() throws IOException, SAXException, TikaException {
+        List<String> result = example.parseToPlainTextChunks();
+        
+        assertEquals(3, result.size());
+        for (String chunk : result) {
+            assertTrue("Chunk under max size", chunk.length() <= example.MAXIMUM_TEXT_CHUNK_SIZE);
+        }
+
+        assertContains("This is in the header", result.get(0));
+        assertContains("Test Document", result.get(0));
+        
+        assertContains("Testing", result.get(1));
+        assertContains("1 2 3", result.get(1));
+        assertContains("TestTable", result.get(1));
+        
+        assertContains("Testing 123", result.get(2));
+    }
 }


Reply via email to