Author: jukka
Date: Mon Oct 15 14:10:53 2007
New Revision: 584921
URL: http://svn.apache.org/viewvc?rev=584921&view=rev
Log:
TIKA-67 - Add an auto-detecting Parser implementation
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
(with props)
incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
(with props)
Modified:
incubator/tika/trunk/CHANGES.txt
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584921&r1=584920&r2=584921&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Oct 15 14:10:53 2007
@@ -113,3 +113,5 @@
50. TIKA-65 - Add encode detection support for HTML parser (siren)
51. TIKA-68 - Add dummy parser classes to be used as sentinels (jukka)
+
+52. TIKA-67 - Add an auto-detecting Parser implementation (jukka)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=584921&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Mon Oct 15 14:10:53 2007
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.ParserConfig;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.jdom.JDOMException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that detects the MIME type of a document and then hands the
+ * document over to the parser configured for that type.
+ * <p>
+ * Detection looks at three sources, in this order: the
+ * {@link Metadata#CONTENT_TYPE} hint, the <code>"filename"</code> metadata
+ * hint, and the leading bytes of the document stream.
+ */
+public class AutoDetectParser implements Parser {
+
+    /** Configuration that supplies the MIME repository and parser configs. */
+    private TikaConfig config;
+
+    /**
+     * Creates an auto-detecting parser instance using the default Tika
+     * configuration.
+     *
+     * @throws RuntimeException if loading the default configuration fails;
+     *                          the original exception is kept as the cause
+     */
+    public AutoDetectParser() {
+        try {
+            config = TikaConfig.getDefaultConfig();
+        } catch (IOException e) {
+            // FIXME: This should never happen
+            throw new RuntimeException(e);
+        } catch (JDOMException e) {
+            // FIXME: This should never happen
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Creates an auto-detecting parser instance using the given Tika
+     * configuration.
+     *
+     * @param config Tika configuration to use
+     */
+    public AutoDetectParser(TikaConfig config) {
+        this.config = config;
+    }
+
+    /**
+     * Returns the Tika configuration used by this parser.
+     *
+     * @return current Tika configuration
+     */
+    public TikaConfig getConfig() {
+        return config;
+    }
+
+    /**
+     * Replaces the Tika configuration used by this parser.
+     *
+     * @param config new Tika configuration
+     */
+    public void setConfig(TikaConfig config) {
+        this.config = config;
+    }
+
+    /**
+     * Detects the MIME type of the given document, records it in the
+     * metadata as {@link Metadata#CONTENT_TYPE}, and delegates parsing to
+     * the parser configured for that type. If no parser configuration
+     * exists for the detected type, the configuration registered for
+     * {@link MimeTypes#DEFAULT} is used instead.
+     *
+     * @param stream   document stream; wrapped in a
+     *                 {@link BufferedInputStream} if it does not support marks
+     * @param handler  handler that receives the parsed content
+     * @param metadata metadata hints on input; updated with the detected
+     *                 content type
+     * @throws IOException   if the document stream could not be read
+     * @throws SAXException  declared for the delegate parser
+     * @throws TikaException if no parser configuration is available for the
+     *                       detected type or for the default type
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        // We need buffering to enable MIME magic detection before parsing
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+
+        // Automatically detect the MIME type of the document
+        MimeType type = getMimeType(stream, metadata);
+        metadata.set(Metadata.CONTENT_TYPE, type.getName());
+
+        // Get the parser configuration for the detected MIME type,
+        // falling back to the configuration of the default type
+        ParserConfig pc = config.getParserConfig(type.getName());
+        if (pc == null) {
+            pc = config.getParserConfig(MimeTypes.DEFAULT);
+        }
+        if (pc == null) {
+            throw new TikaException("No parsers available for this document");
+        }
+
+        // Instantiate the configured parser and use it to parse the document
+        Parser parser = ParserFactory.getParser(pc);
+        parser.parse(stream, handler, metadata);
+    }
+
+    /**
+     * Automatically detects the MIME type of a document based on magic
+     * markers in the stream prefix and any given metadata hints.
+     * <p>
+     * The given stream is expected to support marks, so that this method
+     * can reset the stream to the position it was in before this method
+     * was called.
+     *
+     * @param stream document stream
+     * @param metadata metadata hints
+     * @return MIME type of the document
+     * @throws IOException if the document stream could not be read
+     */
+    private MimeType getMimeType(InputStream stream, Metadata metadata)
+            throws IOException {
+        MimeTypes types = config.getMimeRepository();
+        MimeType type = null;
+
+        // Get type based on metadata hint (if available)
+        String typename = metadata.get(Metadata.CONTENT_TYPE);
+        if (typename != null) {
+            try {
+                typename = MimeType.clean(typename);
+                type = types.forName(typename);
+            } catch (MimeTypeException e) {
+                // Malformed type name, ignore the hint and keep detecting
+            }
+        }
+
+        // Get (or verify) type based on filename hint (if available)
+        String filename = metadata.get("filename");
+        if (filename != null) {
+            MimeType match = types.getMimeType(filename);
+            if (match != null && (type == null || !type.matches(filename))) {
+                type = match;
+            }
+        }
+
+        // Get (or verify) type based on magic prefix; the mark/reset pair
+        // leaves the stream where it was, even if reading the prefix fails
+        stream.mark(types.getMinLength());
+        try {
+            byte[] prefix = getPrefix(stream, types.getMinLength());
+            MimeType match = types.getMimeType(prefix);
+            if (match != null && (type == null || !type.matches(prefix))) {
+                type = match;
+            }
+        } finally {
+            stream.reset();
+        }
+
+        // Finally, use the default type if no matches found
+        if (type == null) {
+            type = types.forName(MimeTypes.DEFAULT);
+        }
+
+        return type;
+    }
+
+    /**
+     * Reads and returns the first <code>length</code> bytes from the
+     * given stream. If the stream ends before that, returns all bytes
+     * from the stream.
+     *
+     * @param input input stream
+     * @param length number of bytes to read and return
+     * @return stream prefix
+     * @throws IOException if the stream could not be read
+     */
+    private byte[] getPrefix(InputStream input, int length) throws IOException {
+        ByteArrayOutputStream output = new ByteArrayOutputStream();
+        byte[] buffer = new byte[Math.min(1024, length)];
+        int remaining = length;
+        // A single read may return fewer bytes than requested, so keep
+        // reading until the prefix is complete or the stream ends
+        while (remaining > 0) {
+            int n = input.read(buffer, 0, Math.min(buffer.length, remaining));
+            if (n == -1) {
+                break;
+            }
+            output.write(buffer, 0, n);
+            remaining -= n;
+        }
+        return output.toByteArray();
+    }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=584921&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Mon Oct 15 14:10:53 2007
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that {@link AutoDetectParser} keeps the expected content type and
+ * extracts the expected text for a set of sample documents.
+ */
+public class AutoDetectParserTest extends TestCase {
+
+    /**
+     * Parses the given classpath resource with a default
+     * {@link AutoDetectParser} and checks the resulting content type and
+     * extracted text.
+     *
+     * @param resource classpath location of the test document; also passed
+     *                 to the parser as the "filename" metadata hint
+     * @param type     content type given as a hint and expected after parsing
+     * @param content  text fragment that the extracted output must contain
+     * @throws Exception if the document could not be read or parsed
+     */
+    private void assertAutoDetect(
+            String resource, String type, String content) throws Exception {
+        InputStream input =
+            AutoDetectParserTest.class.getResourceAsStream(resource);
+        // Fail with a clear message instead of a NullPointerException
+        // when the test document is missing from the classpath
+        assertNotNull("Test resource not found: " + resource, input);
+        try {
+            Metadata metadata = new Metadata();
+            metadata.set("filename", resource);
+            metadata.set(Metadata.CONTENT_TYPE, type);
+            StringWriter writer = new StringWriter();
+            ContentHandler handler = new WriteOutContentHandler(writer);
+            new AutoDetectParser().parse(input, handler, metadata);
+
+            assertEquals(type, metadata.get(Metadata.CONTENT_TYPE));
+            String output = writer.toString();
+            assertTrue(
+                    "Expected text not found in " + resource + ": " + content,
+                    output.contains(content));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Runs the auto-detection check against one sample document per
+     * supported format.
+     */
+    public void testAutoDetect() throws Exception {
+        assertAutoDetect(
+                "/test-documents/testEXCEL.xls",
+                "application/vnd.ms-excel",
+                "Sample Excel Worksheet");
+        assertAutoDetect(
+                "/test-documents/testHTML.html",
+                "text/html",
+                "Test Indexation Html");
+        /* FIXME: OpenDocument autodetection doesn't work
+        assertAutoDetect(
+                "/test-documents/testOpenOffice2.odt",
+                "application/vnd.oasis.opendocument.text",
+                "This is a sample Open Office document");
+        */
+        assertAutoDetect(
+                "/test-documents/testPDF.pdf",
+                "application/pdf",
+                "Content Analysis Toolkit");
+        assertAutoDetect(
+                "/test-documents/testPPT.ppt",
+                "application/vnd.ms-powerpoint",
+                "Sample Powerpoint Slide");
+        assertAutoDetect(
+                "/test-documents/testRTF.rtf",
+                "application/rtf",
+                "indexation Word");
+        assertAutoDetect(
+                "/test-documents/testTXT.txt",
+                "text/plain",
+                "indexation de Txt");
+        assertAutoDetect(
+                "/test-documents/testWORD.doc",
+                "application/msword",
+                "Sample Word Document");
+        // \u00e8 is "e with grave accent"; the escape keeps the expected
+        // text independent of the encoding used to compile this file
+        assertAutoDetect(
+                "/test-documents/testXML.xml",
+                "application/xml",
+                "Archim\u00e8de et Lius");
+    }
+
+}
Propchange:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native