Author: jukka
Date: Mon Oct 15 14:10:53 2007
New Revision: 584921

URL: http://svn.apache.org/viewvc?rev=584921&view=rev
Log:
TIKA-67 - Add an auto-detecting Parser implementation

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java   (with props)
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java   (with props)
Modified:
    incubator/tika/trunk/CHANGES.txt

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584921&r1=584920&r2=584921&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Oct 15 14:10:53 2007
@@ -113,3 +113,5 @@
 50. TIKA-65 - Add encode detection support for HTML parser (siren)
 
 51. TIKA-68 - Add dummy parser classes to be used as sentinels (jukka)
+
+52. TIKA-67 - Add an auto-detecting Parser implementation (jukka)

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=584921&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Mon Oct 15 14:10:53 2007
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.ParserConfig;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.jdom.JDOMException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that detects the MIME type of a document and delegates the
+ * actual parsing to the parser configured for that type.
+ */
+public class AutoDetectParser implements Parser {
+
+    /** Configuration that supplies the MIME repository and parser configs. */
+    private TikaConfig config;
+
+    /**
+     * Creates an auto-detecting parser instance using the default Tika
+     * configuration.
+     */
+    public AutoDetectParser() {
+        try {
+            config = TikaConfig.getDefaultConfig();
+        } catch (IOException e) {
+            // FIXME: This should never happen
+            throw new RuntimeException(e);
+        } catch (JDOMException e) {
+            // FIXME: This should never happen
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Creates an auto-detecting parser instance using the given
+     * configuration.
+     *
+     * @param config configuration to use for type detection and parsing
+     */
+    public AutoDetectParser(TikaConfig config) {
+        this.config = config;
+    }
+
+    /**
+     * Returns the configuration currently used by this parser.
+     *
+     * @return current configuration
+     */
+    public TikaConfig getConfig() {
+        return config;
+    }
+
+    /**
+     * Replaces the configuration used by this parser.
+     *
+     * @param config configuration to use for type detection and parsing
+     */
+    public void setConfig(TikaConfig config) {
+        this.config = config;
+    }
+
+    /**
+     * Detects the MIME type of the given document, stores it as the
+     * content type in the given metadata, and passes the document to the
+     * parser configured for that type. If no parser is configured for the
+     * detected type, the parser configured for the default type is used.
+     * <p>
+     * A stream that does not support marks is wrapped in a
+     * {@link BufferedInputStream} before detection.
+     *
+     * @param stream document stream
+     * @param handler handler that is passed on to the selected parser
+     * @param metadata metadata hints; updated with the detected type
+     * @throws IOException if the document stream could not be read
+     * @throws SAXException declared for the delegated parse call
+     * @throws TikaException if no parser configuration is available for
+     *                       the detected type or for the default type
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        // We need buffering to enable MIME magic detection before parsing
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+
+        // Automatically detect the MIME type of the document
+        MimeType type = getMimeType(stream, metadata);
+        metadata.set(Metadata.CONTENT_TYPE, type.getName());
+
+        // Get the parser configuration for the detected MIME type
+        ParserConfig pc = config.getParserConfig(type.getName());
+        if (pc == null) {
+            pc = config.getParserConfig(MimeTypes.DEFAULT);
+        }
+        if (pc == null) {
+            throw new TikaException("No parsers available for this document");
+        }
+
+        // Instantiate the configured parser and use it to parse the document
+        Parser parser = ParserFactory.getParser(pc);
+        parser.parse(stream, handler, metadata);
+    }
+
+    /**
+     * Automatically detects the MIME type of a document based on magic
+     * markers in the stream prefix and any given metadata hints.
+     * <p>
+     * The hints are consulted in order: the content type hint, the
+     * "filename" hint and finally the stream prefix. A later match
+     * replaces the current candidate when there is no candidate yet or
+     * when the candidate does not match the filename or prefix.
+     * <p>
+     * The given stream is expected to support marks, so that this method
+     * can reset the stream to the position it was in before this method
+     * was called.
+     *
+     * @param stream document stream
+     * @param metadata metadata hints
+     * @return MIME type of the document
+     * @throws IOException if the document stream could not be read
+     */
+    private MimeType getMimeType(InputStream stream, Metadata metadata)
+            throws IOException {
+        MimeTypes types = config.getMimeRepository();
+        MimeType type = null;
+
+        // Get type based on metadata hint (if available)
+        String typename = metadata.get(Metadata.CONTENT_TYPE);
+        if (typename != null) {
+            try {
+                typename = MimeType.clean(typename);
+                type = types.forName(typename);
+            } catch (MimeTypeException e) {
+                // Malformed type name, ignore
+            }
+        }
+
+        // Get (or verify) type based on filename hint (if available)
+        String filename = metadata.get("filename");
+        if (filename != null) {
+            MimeType match = types.getMimeType(filename);
+            if (match != null && (type == null || !type.matches(filename))) {
+                type = match;
+            }
+        }
+
+        // Get (or verify) type based on magic prefix. The length is read
+        // once so that the mark limit and the number of bytes read agree.
+        int prefixLength = types.getMinLength();
+        stream.mark(prefixLength);
+        try {
+            byte[] prefix = getPrefix(stream, prefixLength);
+            MimeType match = types.getMimeType(prefix);
+            if (match != null && (type == null || !type.matches(prefix))) {
+                type = match;
+            }
+        } finally {
+            stream.reset();
+        }
+
+        // Finally, use the default type if no matches found
+        if (type == null) {
+            type = types.forName(MimeTypes.DEFAULT);
+        }
+
+        return type;
+    }
+
+    /**
+     * Reads and returns the first <code>length</code> bytes from the
+     * given stream. If the stream ends before that, returns all bytes
+     * from the stream. A zero or negative length yields an empty prefix.
+     *
+     * @param input input stream
+     * @param length number of bytes to read and return
+     * @return stream prefix
+     * @throws IOException if the stream could not be read
+     */
+    private byte[] getPrefix(InputStream input, int length)
+            throws IOException {
+        ByteArrayOutputStream output = new ByteArrayOutputStream();
+        if (length <= 0) {
+            // Nothing requested; also avoids a negative array size below
+            return output.toByteArray();
+        }
+
+        byte[] buffer = new byte[Math.min(1024, length)];
+        int remaining = length;
+        while (remaining > 0) {
+            int n = input.read(buffer, 0, Math.min(buffer.length, remaining));
+            if (n == -1) {
+                break; // stream ended before the requested length
+            }
+            output.write(buffer, 0, n);
+            remaining -= n;
+        }
+        return output.toByteArray();
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=584921&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Mon Oct 15 14:10:53 2007
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that {@link AutoDetectParser} reports the expected content type
+ * and extracts the expected text for a set of sample documents.
+ */
+public class AutoDetectParserTest extends TestCase {
+
+    /**
+     * Parses the given classpath resource with a new
+     * {@link AutoDetectParser} created through its no-argument constructor
+     * and verifies the resulting content type and the extracted text.
+     *
+     * @param resource classpath location of the test document; also
+     *                 passed to the parser as the "filename" metadata hint
+     * @param type expected content type; also passed to the parser as the
+     *             content type metadata hint
+     * @param content text that the extracted output must contain
+     * @throws Exception if the document could not be read or parsed
+     */
+    private void assertAutoDetect(
+            String resource, String type, String content) throws Exception {
+        InputStream input =
+            AutoDetectParserTest.class.getResourceAsStream(resource);
+        // Fail with a clear message instead of a NullPointerException,
+        // both in the parser and in the finally block below
+        assertNotNull("Test document not found: " + resource, input);
+        try {
+            Metadata metadata = new Metadata();
+            metadata.set("filename", resource);
+            metadata.set(Metadata.CONTENT_TYPE, type);
+            StringWriter writer = new StringWriter();
+            ContentHandler handler = new WriteOutContentHandler(writer);
+            new AutoDetectParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "Content type of " + resource,
+                    type, metadata.get(Metadata.CONTENT_TYPE));
+            String text = writer.toString();
+            assertTrue(
+                    "Text of " + resource + " should contain: " + content,
+                    text.contains(content));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Runs the auto-detection check against each of the sample documents.
+     *
+     * @throws Exception if any of the documents could not be read or parsed
+     */
+    public void testAutoDetect() throws Exception {
+        assertAutoDetect(
+                "/test-documents/testEXCEL.xls",
+                "application/vnd.ms-excel",
+                "Sample Excel Worksheet");
+        assertAutoDetect(
+                "/test-documents/testHTML.html",
+                "text/html",
+                "Test Indexation Html");
+        /* FIXME: OpenDocument autodetection doesn't work
+        assertAutoDetect(
+                "/test-documents/testOpenOffice2.odt",
+                "application/vnd.oasis.opendocument.text",
+                "This is a sample Open Office document");
+         */
+        assertAutoDetect(
+                "/test-documents/testPDF.pdf",
+                "application/pdf",
+                "Content Analysis Toolkit");
+        assertAutoDetect(
+                "/test-documents/testPPT.ppt",
+                "application/vnd.ms-powerpoint",
+                "Sample Powerpoint Slide");
+        assertAutoDetect(
+                "/test-documents/testRTF.rtf",
+                "application/rtf",
+                "indexation Word");
+        assertAutoDetect(
+                "/test-documents/testTXT.txt",
+                "text/plain",
+                "indexation de Txt");
+        assertAutoDetect(
+                "/test-documents/testWORD.doc",
+                "application/msword",
+                "Sample Word Document");
+        assertAutoDetect(
+                "/test-documents/testXML.xml",
+                "application/xml",
+                "Archimède et Lius");
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to