Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java?rev=1677694&view=auto
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java
(added)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java
Mon May 4 21:52:53 2015
@@ -0,0 +1,218 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Reader;
+import java.net.URL;
+import java.nio.CharBuffer;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.html.IdentityHtmlMapper;
+import org.apache.tika.parser.txt.TXTParser;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.LinkContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Short, self-contained snippets showing different ways to drive parsing:
+ * the {@link Tika} facade, explicit {@link Parser} implementations, several
+ * kinds of input stream, content handlers and {@link ParseContext} settings.
+ */
+public class TIAParsingExample {
+
+    /**
+     * Extracts the text of {@code example.doc} through the {@link Tika}
+     * facade, prints it to standard output and returns it.
+     *
+     * @return the extracted text
+     */
+    public static String parseToStringExample() throws Exception {
+        Tika facade = new Tika();
+        String text = facade.parseToString(new File("example.doc"));
+        System.out.print(text);
+        return text;
+    }
+
+    /**
+     * Reads the text of {@code example.doc} from the {@link Reader} returned
+     * by the {@link Tika} facade and copies it to standard output in chunks
+     * of at most 1000 characters. The reader is always closed.
+     */
+    public static void parseToReaderExample() throws Exception {
+        Reader text = new Tika().parse(new File("example.doc"));
+        try {
+            char[] chunk = new char[1000];
+            for (int count = text.read(chunk); count != -1;
+                    count = text.read(chunk)) {
+                System.out.append(CharBuffer.wrap(chunk, 0, count));
+            }
+        } finally {
+            text.close();
+        }
+    }
+
+    /**
+     * Parses the named file from a plain {@link FileInputStream} with an
+     * {@link AutoDetectParser}; the SAX events go to a no-op
+     * {@link DefaultHandler}. The stream is always closed.
+     *
+     * @param filename path of the file to parse
+     */
+    public static void parseFileInputStream(String filename) throws Exception {
+        Parser autoDetect = new AutoDetectParser();
+        InputStream input = new FileInputStream(new File(filename));
+        try {
+            autoDetect.parse(input, new DefaultHandler(), new Metadata(),
+                    new ParseContext());
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Parses the resource at the given URL with an {@link AutoDetectParser}.
+     * The URL stream is wrapped in a {@link GZIPInputStream}, so the resource
+     * is read as gzip-compressed data. The stream is always closed.
+     *
+     * @param address URL of the gzip-compressed resource
+     */
+    public static void parseURLStream(String address) throws Exception {
+        Parser autoDetect = new AutoDetectParser();
+        InputStream input = new GZIPInputStream(new URL(address).openStream());
+        try {
+            autoDetect.parse(input, new DefaultHandler(), new Metadata(),
+                    new ParseContext());
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Parses the named file with an {@link AutoDetectParser}, this time
+     * obtaining the stream from {@link TikaInputStream#get(File)}. The stream
+     * is always closed.
+     *
+     * @param filename path of the file to parse
+     */
+    public static void parseTikaInputStream(String filename) throws Exception {
+        Parser autoDetect = new AutoDetectParser();
+        InputStream input = TikaInputStream.get(new File(filename));
+        try {
+            autoDetect.parse(input, new DefaultHandler(), new Metadata(),
+                    new ParseContext());
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Opens the named file as a {@link TikaInputStream}, re-obtains a
+     * {@link TikaInputStream} view of that stream and asks it for its
+     * backing {@link File}. The stream is closed before returning.
+     *
+     * @param filename path of the file to open
+     * @return the file reported by {@link TikaInputStream#getFile()}
+     */
+    public static File tikaInputStreamGetFile(String filename) throws Exception {
+        InputStream input = TikaInputStream.get(new File(filename));
+        try {
+            return TikaInputStream.get(input).getFile();
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Runs an {@link HtmlParser} directly over an empty in-memory stream.
+     */
+    public static void useHtmlParser() throws Exception {
+        Parser html = new HtmlParser();
+        html.parse(new ByteArrayInputStream(new byte[0]), new DefaultHandler(),
+                new Metadata(), new ParseContext());
+    }
+
+    /**
+     * Builds a {@link CompositeParser} with an HTML and an XML parser keyed
+     * by media type plus a {@link TXTParser} fallback, then parses an empty
+     * in-memory stream whose metadata declares {@code text/html}.
+     */
+    public static void useCompositeParser() throws Exception {
+        Map<MediaType, Parser> byType = new HashMap<MediaType, Parser>();
+        byType.put(MediaType.parse("text/html"), new HtmlParser());
+        byType.put(MediaType.parse("application/xml"), new XMLParser());
+
+        CompositeParser composite = new CompositeParser();
+        composite.setParsers(byType);
+        composite.setFallback(new TXTParser());
+
+        Metadata declaredType = new Metadata();
+        declaredType.set(Metadata.CONTENT_TYPE, "text/html");
+
+        composite.parse(new ByteArrayInputStream(new byte[0]),
+                new DefaultHandler(), declaredType, new ParseContext());
+    }
+
+    /**
+     * Runs an {@link AutoDetectParser} over an empty in-memory stream.
+     */
+    public static void useAutoDetectParser() throws Exception {
+        Parser autoDetect = new AutoDetectParser();
+        autoDetect.parse(new ByteArrayInputStream(new byte[0]),
+                new DefaultHandler(), new Metadata(), new ParseContext());
+    }
+
+    /**
+     * Parses an empty in-memory stream while fanning the SAX events out to
+     * two handlers through a {@link TeeContentHandler}: a
+     * {@link BodyContentHandler} writing to the named file and a
+     * {@link LinkContentHandler}. The output file is always closed.
+     *
+     * @param filename path of the file the body handler writes to
+     */
+    public static void testTeeContentHandler(String filename) throws Exception {
+        Parser autoDetect = new AutoDetectParser();
+        LinkContentHandler links = new LinkContentHandler();
+        OutputStream sink = new FileOutputStream(new File(filename));
+        try {
+            ContentHandler tee = new TeeContentHandler(
+                    new BodyContentHandler(sink), links);
+            autoDetect.parse(new ByteArrayInputStream(new byte[0]), tee,
+                    new Metadata(), new ParseContext());
+        } finally {
+            sink.close();
+        }
+    }
+
+    /**
+     * Parses an empty in-memory stream with {@link Locale#ENGLISH} registered
+     * in the {@link ParseContext} under {@code Locale.class}.
+     */
+    public static void testLocale() throws Exception {
+        ParseContext english = new ParseContext();
+        english.set(Locale.class, Locale.ENGLISH);
+
+        Parser autoDetect = new AutoDetectParser();
+        autoDetect.parse(new ByteArrayInputStream(new byte[0]),
+                new DefaultHandler(), new Metadata(), english);
+    }
+
+    /**
+     * Parses an empty in-memory stream with an {@link IdentityHtmlMapper}
+     * registered in the {@link ParseContext} under {@code HtmlMapper.class}.
+     */
+    public static void testHtmlMapper() throws Exception {
+        ParseContext identityMapping = new ParseContext();
+        identityMapping.set(HtmlMapper.class, new IdentityHtmlMapper());
+
+        Parser autoDetect = new AutoDetectParser();
+        autoDetect.parse(new ByteArrayInputStream(new byte[0]),
+                new DefaultHandler(), new Metadata(), identityMapping);
+    }
+
+    /**
+     * Parses an empty in-memory stream with a custom {@link ParserDecorator}
+     * registered in the {@link ParseContext} under {@code Parser.class}. The
+     * decorator wraps the same {@link AutoDetectParser} that performs the
+     * top-level parse and overrides {@code parse} with an empty body.
+     */
+    public static void testCompositeDocument() throws Exception {
+        Parser autoDetect = new AutoDetectParser();
+        Parser componentParser = new ParserDecorator(autoDetect) {
+            private static final long serialVersionUID = 4424210691523343833L;
+
+            @Override
+            public void parse(InputStream stream, ContentHandler handler,
+                    Metadata metadata, ParseContext context)
+                    throws IOException, SAXException, TikaException {
+                // custom processing of the component document
+            }
+        };
+
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, componentParser);
+        autoDetect.parse(new ByteArrayInputStream(new byte[0]),
+                new DefaultHandler(), new Metadata(), context);
+    }
+
+}
Propchange:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java
------------------------------------------------------------------------------
svn:executable = *
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java?rev=1677694&view=auto
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
(added)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
Mon May 4 21:52:53 2015
@@ -0,0 +1,111 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Generates document summaries for corpus analysis in the Open Relevance
+ * project.
+ */
+@SuppressWarnings("deprecation")
+public class TrecDocumentGenerator {
+
+    /**
+     * Extracts the text of the given file with the {@link Tika} facade and
+     * packs it, together with the resource name and the {@code Metadata.DATE}
+     * value found in the metadata, into a {@link TrecDocument}.
+     *
+     * @param file the file to summarize
+     * @return a document holding the file name, the extracted text and the
+     *         date reported in the metadata (as returned by
+     *         {@code Metadata.getDate})
+     * @throws FileNotFoundException if the file cannot be opened
+     * @throws IOException if the file cannot be read
+     * @throws TikaException if the content cannot be parsed
+     */
+    public TrecDocument summarize(File file) throws FileNotFoundException,
+            IOException, TikaException {
+        Tika tika = new Tika();
+        Metadata met = new Metadata();
+        // A bare FileInputStream carries no file name, so record it in the
+        // metadata up front; it is read back below as the document name.
+        met.set(Metadata.RESOURCE_NAME_KEY, file.getName());
+
+        // NOTE(review): the stream is not closed here; this relies on
+        // Tika.parseToString(InputStream, Metadata) closing it, as its
+        // Javadoc states - confirm for the Tika version in use.
+        String contents = tika.parseToString(new FileInputStream(file), met);
+        return new TrecDocument(met.get(Metadata.RESOURCE_NAME_KEY), contents,
+                met.getDate(Metadata.DATE));
+
+    }
+
+    // copied from
+    // http://svn.apache.org/repos/asf/lucene/openrelevance/trunk/src/java/org/
+    // apache/orp/util/TrecDocument.java
+    // since the ORP jars aren't published anywhere
+    /**
+     * Simple mutable holder for a document name, its body text and a date.
+     */
+    class TrecDocument {
+        private CharSequence docname;
+        private CharSequence body;
+        private Date date;
+
+        /**
+         * Creates a document with all three properties set.
+         *
+         * @param docname the docname
+         * @param body the body
+         * @param date the date
+         */
+        public TrecDocument(CharSequence docname, CharSequence body, Date date) {
+            this.docname = docname;
+            this.body = body;
+            this.date = date;
+        }
+
+        /**
+         * Creates a document with every property left {@code null}.
+         */
+        public TrecDocument() {
+        }
+
+        /**
+         * @return the docname
+         */
+        public CharSequence getDocname() {
+            return docname;
+        }
+
+        /**
+         * @param docname
+         *            the docname to set
+         */
+        public void setDocname(CharSequence docname) {
+            this.docname = docname;
+        }
+
+        /**
+         * @return the body
+         */
+        public CharSequence getBody() {
+            return body;
+        }
+
+        /**
+         * @param body
+         *            the body to set
+         */
+        public void setBody(CharSequence body) {
+            this.body = body;
+        }
+
+        /**
+         * @return the date
+         */
+        public Date getDate() {
+            return date;
+        }
+
+        /**
+         * @param date
+         *            the date to set
+         */
+        public void setDate(Date date) {
+            this.date = date;
+        }
+    }
+
+}
Propchange:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
------------------------------------------------------------------------------
svn:executable = *
Added:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java?rev=1677694&view=auto
==============================================================================
---
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
(added)
+++
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
Mon May 4 21:52:53 2015
@@ -0,0 +1,47 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+//JDK imports
+import java.io.IOException;
+import java.util.Collections;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+/**
+ * Example code listing from Chapter 1. Lists a zip file's entries using JDK's
+ * standard APIs.
+ */
+public class ZipListFiles {
+
+    /**
+     * Prints, for every zip file named on the command line, a heading line
+     * followed by the names of its entries.
+     *
+     * @param args paths of the zip files to list; nothing is printed when
+     *            no path is given
+     * @throws Exception if one of the files cannot be opened or read as a zip
+     */
+    public static void main(String[] args) throws Exception {
+        // Looping over an empty array does nothing, so no length check is
+        // needed in front of the loop.
+        for (String file : args) {
+            System.out.println("Files in " + file + " file:");
+            listZipEntries(file);
+        }
+    }
+
+    /**
+     * Prints the name of every entry of the given zip file to standard
+     * output, one per line, in the order the entries are enumerated.
+     *
+     * @param path path of the zip file to list
+     * @throws IOException if the file cannot be opened or read as a zip
+     */
+    public static void listZipEntries(String path) throws IOException {
+        ZipFile zip = new ZipFile(path);
+        try {
+            for (ZipEntry entry : Collections.list(zip.entries())) {
+                System.out.println(entry.getName());
+            }
+        } finally {
+            // Release the underlying file handle even when listing fails.
+            zip.close();
+        }
+    }
+
+}
\ No newline at end of file
Propchange:
tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
------------------------------------------------------------------------------
svn:executable = *
Added:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/spring.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/resources/org/apache/tika/example/spring.xml?rev=1677694&view=auto
==============================================================================
---
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/spring.xml
(added)
+++
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/spring.xml
Mon May 4 21:52:53 2015
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+ -->
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+                           http://www.springframework.org/schema/beans/spring-beans-3.0.xsd">
+
+<!-- The XML declaration has to be the very first thing in the file, so the
+     license comment comes after it. The "tika" bean below is an
+     AutoDetectParser whose constructor receives the "txt" and "pdf" parser
+     beans as a list. -->
+
+<!--<start id="spring"/>-->
+  <bean id="tika" class="org.apache.tika.parser.AutoDetectParser">
+    <constructor-arg>
+      <list>
+        <ref bean="txt"/>
+        <ref bean="pdf"/>
+      </list>
+    </constructor-arg>
+  </bean>
+
+  <bean id="txt" class="org.apache.tika.parser.txt.TXTParser"/>
+  <bean id="pdf" class="org.apache.tika.parser.pdf.PDFParser"/>
+<!--<end id="spring"/>-->
+
+</beans>
\ No newline at end of file
Propchange:
tika/trunk/tika-example/src/main/resources/org/apache/tika/example/spring.xml
------------------------------------------------------------------------------
svn:executable = *
Added:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java?rev=1677694&view=auto
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
(added)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
Mon May 4 21:52:53 2015
@@ -0,0 +1,30 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+@SuppressWarnings("deprecation")
+public class AdvancedTypeDetectorTest {
+
+    /**
+     * Checks that {@code AdvancedTypeDetector.detectWithCustomConfig}
+     * reports {@code application/xml} for {@code pom.xml}.
+     */
+    @Test
+    public void testDetectWithCustomConfig() throws Exception {
+        String expectedType = "application/xml";
+        Assert.assertEquals(expectedType,
+                AdvancedTypeDetector.detectWithCustomConfig("pom.xml"));
+    }
+
+}
Propchange:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
------------------------------------------------------------------------------
svn:executable = *
Added:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java?rev=1677694&view=auto
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
(added)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
Mon May 4 21:52:53 2015
@@ -0,0 +1,52 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+
+import junit.framework.Assert;
+
+import org.apache.commons.io.FileUtils;
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+
+@SuppressWarnings("deprecation")
+public class SimpleTextExtractorTest {
+
+    /**
+     * Writes a short message to {@code target/test.txt}, runs
+     * {@code SimpleTextExtractor.main} on that file with standard output
+     * captured, and checks that the captured (trimmed) output equals the
+     * message.
+     */
+    @Test
+    public void testSimpleTextExtractor() throws Exception {
+        String message =
+            "Hello, World! This is simple UTF-8 text content written"
+            + " in English to test autodetection of the character"
+            + " encoding of the input stream.";
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        File file = new File("target", "test.txt");
+
+        PrintStream out = System.out;
+        System.setOut(new PrintStream(buffer, true, Charsets.UTF_8.name()));
+        try {
+            FileUtils.writeStringToFile(file, message);
+            SimpleTextExtractor.main(new String[] { file.getPath() });
+        } finally {
+            // Undo the global side effects even when the extractor throws,
+            // so later tests still see the real System.out and no stale
+            // test file is left behind.
+            file.delete();
+            System.setOut(out);
+        }
+
+        Assert.assertEquals(message,
+                buffer.toString(Charsets.UTF_8.name()).trim());
+    }
+
+}
Propchange:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTextExtractorTest.java
------------------------------------------------------------------------------
svn:executable = *
Added:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java?rev=1677694&view=auto
==============================================================================
---
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
(added)
+++
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
Mon May 4 21:52:53 2015
@@ -0,0 +1,45 @@
+/**
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.example;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+import com.google.common.base.Charsets;
+
+@SuppressWarnings("deprecation")
+public class SimpleTypeDetectorTest {
+
+    /**
+     * Runs {@code SimpleTypeDetector.main} on {@code pom.xml} with standard
+     * output captured and checks that the captured (trimmed) output is
+     * {@code pom.xml: application/xml}.
+     */
+    @Test
+    public void testSimpleTypeDetector() throws Exception {
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+
+        PrintStream out = System.out;
+        System.setOut(new PrintStream(buffer, true, Charsets.UTF_8.name()));
+        try {
+            SimpleTypeDetector.main(new String[] { "pom.xml" });
+        } finally {
+            // Restore the real System.out even when the detector throws, so
+            // the output of later tests is not swallowed by the buffer.
+            System.setOut(out);
+        }
+
+        Assert.assertEquals("pom.xml: application/xml",
+                buffer.toString(Charsets.UTF_8.name()).trim());
+    }
+
+}
Propchange:
tika/trunk/tika-example/src/test/java/org/apache/tika/example/SimpleTypeDetectorTest.java
------------------------------------------------------------------------------
svn:executable = *