Author: nick
Date: Thu Aug 28 06:58:16 2014
New Revision: 1621064
URL: http://svn.apache.org/r1621064
Log:
ContentHandler example showing how to break the resulting text up by size
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
Modified:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java?rev=1621064&r1=1621063&r2=1621064&view=diff
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
(original)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
Thu Aug 28 06:58:16 2014
@@ -18,11 +18,14 @@ package org.apache.tika.example;
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.ToXMLContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
@@ -113,9 +116,36 @@ public class ContentHandlerExample {
}
}
+ protected final int MAXIMUM_TEXT_CHUNK_SIZE = 40;
/**
* Example of extracting the plain text in chunks, with each chunk
* of no more than a certain maximum size
*/
- // TODO Implement
+ public List<String> parseToPlainTextChunks() throws IOException,
SAXException, TikaException {
+ final List<String> chunks = new ArrayList<String>();
+ chunks.add("");
+ ContentHandlerDecorator handler = new ContentHandlerDecorator() {
+ @Override
+ public void characters(char[] ch, int start, int length) {
+ String lastChunk = chunks.get(chunks.size()-1);
+ String thisStr = new String(ch, start, length);
+
+ if (lastChunk.length()+length > MAXIMUM_TEXT_CHUNK_SIZE) {
+ chunks.add(thisStr);
+ } else {
+ chunks.set(chunks.size()-1, lastChunk+thisStr);
+ }
+ }
+ };
+
+ InputStream stream =
ContentHandlerExample.class.getResourceAsStream("test2.doc");
+ AutoDetectParser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ try {
+ parser.parse(stream, handler, metadata);
+ return chunks;
+ } finally {
+ stream.close();
+ }
+ }
}
Modified:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java?rev=1621064&r1=1621063&r2=1621064&view=diff
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
(original)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
Thu Aug 28 06:58:16 2014
@@ -23,8 +23,10 @@ import org.junit.Test;
import org.xml.sax.SAXException;
import java.io.IOException;
+import java.util.List;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import static org.apache.tika.TikaTest.assertContains;
import static org.apache.tika.TikaTest.assertNotContained;
@@ -81,5 +83,23 @@ public class ContentHandlerExampleTest {
assertNotContained("<p>1 2 3", result);
}
- // TODO Implement then test the other two methods
+
+    @Test
+    public void testParseToPlainTextChunks() throws IOException, SAXException, TikaException {
+        final List<String> chunks = example.parseToPlainTextChunks();
+        final int limit = example.MAXIMUM_TEXT_CHUNK_SIZE;
+
+        // The sample document is expected to come back as exactly three chunks,
+        // each within the configured size limit
+        assertEquals(3, chunks.size());
+        for (int i = 0; i < chunks.size(); i++) {
+            assertTrue("Chunk under max size", chunks.get(i).length() <= limit);
+        }
+
+        // First chunk: header and title text
+        final String first = chunks.get(0);
+        assertContains("This is in the header", first);
+        assertContains("Test Document", first);
+
+        // Second chunk: body and table text
+        final String second = chunks.get(1);
+        assertContains("Testing", second);
+        assertContains("1 2 3", second);
+        assertContains("TestTable", second);
+
+        // Third chunk: trailing text
+        final String third = chunks.get(2);
+        assertContains("Testing 123", third);
+    }
}