Author: jukka
Date: Tue Jan 20 14:38:04 2009
New Revision: 736149

URL: http://svn.apache.org/viewvc?rev=736149&view=rev
Log:
TIKA-95: Pluggable magic header detectors

Added a TypeDetector class for handling content type hints.

Added:
    lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java
    lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
Modified:
    lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java

Added: lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java?rev=736149&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java (added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java Tue Jan 20 14:38:04 2009
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection based on a content type hint. This detector simply
+ * trusts any valid content type hint given in the input metadata, and returns
+ * that as the likely type of the input document.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TypeDetector implements Detector {
+
+    /**
+     * Detects the content type of an input document based on a type hint
+     * given in the input metadata. The CONTENT_TYPE attribute of the given
+     * input metadata is expected to contain the type of the input document.
+     * If that attribute exists and contains a valid type name, then that
+     * type is returned.
+     *
+     * @param input ignored
+     * @param metadata input metadata, possibly with a CONTENT_TYPE value
+     * @return detected media type, or <code>application/octet-stream</code>
+     */
+    public MediaType detect(InputStream input, Metadata metadata) {
+        // Look for a type hint in the input metadata
+        String hint = metadata.get(Metadata.CONTENT_TYPE);
+        if (hint != null) {
+            MediaType type = MediaType.parse(hint);
+            if (type != null) {
+                return type;
+            }
+        }
+        return MediaType.OCTET_STREAM;
+    }
+
+}

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java?rev=736149&r1=736148&r2=736149&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java Tue Jan 20 14:38:04 2009
@@ -52,29 +52,27 @@
      * in RFC 2045.
      * <p>
      * Note that currently this method only parses the "type/subtype" part
-     * of the string. Any parameters are simply discarded.
+     * of the string. Any parameters are simply discarded. TODO: Change this.
      *
      * @param string media type string to be parsed
-     * @return parsed media type
-     * @throws IllegalArgumentException if the string is not a media type
+     * @return parsed media type, or <code>null</code> if parsing fails
      */
     public static MediaType parse(String string) {
-        int slash = string.indexOf('/');
-        if (slash == -1) {
-            throw new IllegalArgumentException("Invalid media type: " + string);
+        int colon = string.indexOf(';');
+        if (colon != -1) {
+            string = string.substring(0, colon);
         }
 
-        String type = string.substring(0, slash);
-        String subtype = string.substring(slash + 1);
-        // String parameters = "";
-
-        int colon = subtype.indexOf(';');
-        if (colon != -1) {
-            // parameters = subtype.substring(colon + 1);
-            subtype = subtype.substring(0, colon);
+        int slash = string.indexOf('/');
+        if (slash != -1) {
+            String type = string.substring(0, slash).trim();
+            String subtype = string.substring(slash + 1).trim();
+            if (type.length() > 0 && subtype.length() > 0) {
+                return new MediaType(type, subtype);
+            }
         }
 
-        return new MediaType(type, subtype);
+        return null;
     }
 
     private final String type;

Added: 
lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java?rev=736149&view=auto
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java (added)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java Tue Jan 20 14:38:04 2009
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for the {@link TypeDetector} class.
+ */
+public class TypeDetectorTest extends TestCase {
+
+    private Detector detector = new TypeDetector();
+
+    public void testDetect() {
+        assertDetect(MediaType.TEXT_PLAIN, "text/plain");
+        assertDetect(MediaType.TEXT_PLAIN, "TEXT/PLAIN");
+        assertDetect(MediaType.TEXT_PLAIN, " text/\tplain\n");
+        assertDetect(MediaType.TEXT_PLAIN, "text/plain; a=b");
+        assertDetect(MediaType.TEXT_PLAIN, "\ttext/plain; a=b\n");
+
+        assertDetect(MediaType.OCTET_STREAM, "text\\plain");
+
+        // test also the zero input cases
+        assertDetect(MediaType.OCTET_STREAM, "");
+        assertDetect(MediaType.OCTET_STREAM, null);
+        try {
+            assertEquals(
+                    MediaType.OCTET_STREAM,
+                    detector.detect(null, new Metadata()));
+        } catch (IOException e) {
+            fail("TypeDetector should never throw an IOException");
+        }
+    }
+
+    private void assertDetect(MediaType type, String name){
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, name);
+        try {
+            assertEquals(type, detector.detect(null, metadata));
+        } catch (IOException e) {
+            fail("TypeDetector should never throw an IOException");
+        }
+    }
+
+}


Reply via email to