Author: nick
Date: Wed Aug 27 18:57:18 2014
New Revision: 1620957
URL: http://svn.apache.org/r1620957
Log:
Start on examples of using different Content Handlers to get differing output
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java?rev=1620957&view=auto
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
(added)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ContentHandlerExample.java
Wed Aug 27 18:57:18 2014
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.example;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Examples of using different Content Handlers to
+ * get different parts of the file's contents.
+ *
+ * Every example parses the "test.doc" resource, looked up relative to
+ * this class, and is deliberately self-contained so that it can be read
+ * on its own.
+ */
+public class ContentHandlerExample {
+    /**
+     * Example of extracting the plain text of the contents.
+     * Will return only the "body" part of the document
+     *
+     * @return the string form of the {@link BodyContentHandler} after parsing
+     * @throws IOException if the "test.doc" resource cannot be found,
+     *         or if parsing or closing the stream reports an I/O failure
+     * @throws SAXException propagated from the parse call
+     * @throws TikaException propagated from the parse call
+     */
+    public String parseToPlainText() throws IOException, SAXException, TikaException {
+        BodyContentHandler handler = new BodyContentHandler();
+
+        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc");
+        // getResourceAsStream signals a missing resource by returning null; fail
+        // with a clear message here rather than with a NullPointerException later.
+        if (stream == null) {
+            throw new IOException("Resource test.doc not found relative to "
+                    + ContentHandlerExample.class.getName());
+        }
+        AutoDetectParser parser = new AutoDetectParser();
+        // The parse call requires a Metadata object; this example does not read it afterwards.
+        Metadata metadata = new Metadata();
+        try {
+            parser.parse(stream, handler, metadata);
+            return handler.toString();
+        } finally {
+            stream.close();
+        }
+    }
+
+    /**
+     * Example of extracting the contents as HTML, as a string.
+     * The markup is collected by a {@link ToXMLContentHandler}.
+     *
+     * @return the string form of the {@link ToXMLContentHandler} after parsing
+     * @throws IOException if the "test.doc" resource cannot be found,
+     *         or if parsing or closing the stream reports an I/O failure
+     * @throws SAXException propagated from the parse call
+     * @throws TikaException propagated from the parse call
+     */
+    public String parseToHTML() throws IOException, SAXException, TikaException {
+        ContentHandler handler = new ToXMLContentHandler();
+
+        InputStream stream = ContentHandlerExample.class.getResourceAsStream("test.doc");
+        // getResourceAsStream signals a missing resource by returning null; fail
+        // with a clear message here rather than with a NullPointerException later.
+        if (stream == null) {
+            throw new IOException("Resource test.doc not found relative to "
+                    + ContentHandlerExample.class.getName());
+        }
+        AutoDetectParser parser = new AutoDetectParser();
+        // The parse call requires a Metadata object; this example does not read it afterwards.
+        Metadata metadata = new Metadata();
+        try {
+            parser.parse(stream, handler, metadata);
+            return handler.toString();
+        } finally {
+            stream.close();
+        }
+    }
+
+    // TODO Only one part of the file as HTML
+
+    // TODO Plain text, in chunks of a maximum size
+}
Added:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java?rev=1620957&view=auto
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
(added)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/ContentHandlerExampleTest.java
Wed Aug 27 18:57:18 2014
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import org.apache.tika.exception.TikaException;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+public class ContentHandlerExampleTest {
+ ContentHandlerExample example;
+
+ @Before
+ public void setUp() {
+ example = new ContentHandlerExample();
+ }
+
+ @Test
+ public void testParseToPlainText() throws IOException, SAXException,
TikaException {
+ String result = example.parseToPlainText().trim();
+ assertEquals("Expected 'test', but got '" + result + "'", "test",
result);
+ }
+
+ @Test
+ public void testParseToHTML() throws IOException, SAXException,
TikaException {
+ String result = example.parseToHTML().trim();
+
+ assertContains("<html", result);
+ assertContains("<head>", result);
+ assertContains("<meta name=\"dc:creator\"", result);
+ assertContains("<title>", result);
+ assertContains("<body>", result);
+ assertContains(">test", result);
+ }
+
+ // TODO Implement then test the other two methods
+}