Author: jukka
Date: Tue Jan 20 14:38:04 2009
New Revision: 736149
URL: http://svn.apache.org/viewvc?rev=736149&view=rev
Log:
TIKA-95: Pluggable magic header detectors
Added a TypeDetector class for handling content type hints.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java
lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java
Added: lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java?rev=736149&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java
(added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/detect/TypeDetector.java
Tue Jan 20 14:38:04 2009
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection based on a content type hint. This detector simply
+ * trusts any valid content type hint given in the input metadata, and returns
+ * that as the likely type of the input document. The input stream is not read.
+ *
+ * @since Apache Tika 0.3
+ */
+public class TypeDetector implements Detector {
+
+ /**
+ * Detects the content type of an input document based on a type hint
+ * given in the input metadata. The CONTENT_TYPE attribute of the given
+ * input metadata is expected to contain the type of the input document.
+ * If that attribute exists and {@link MediaType#parse(String)} accepts it,
+ * then the parsed type is returned; otherwise the fallback type is returned.
+ *
+ * @param input ignored; the stream is never read by this detector
+ * @param metadata input metadata (must not be null), possibly with a CONTENT_TYPE value
+ * @return hinted media type, or <code>application/octet-stream</code> as fallback
+ */
+ public MediaType detect(InputStream input, Metadata metadata) {
+ // Look for a type hint in the input metadata; it may be absent (null)
+ String hint = metadata.get(Metadata.CONTENT_TYPE);
+ if (hint != null) {
+ MediaType type = MediaType.parse(hint);
+ if (type != null) {
+ return type;
+ }
+ }
+ return MediaType.OCTET_STREAM;
+ }
+
+}
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java?rev=736149&r1=736148&r2=736149&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/mime/MediaType.java Tue Jan
20 14:38:04 2009
@@ -52,29 +52,27 @@
* in RFC 2045.
* <p>
* Note that currently this method only parses the "type/subtype" part
- * of the string. Any parameters are simply discarded.
+ * of the string. Any parameters are simply discarded. TODO: Change this.
*
* @param string media type string to be parsed
- * @return parsed media type
- * @throws IllegalArgumentException if the string is not a media type
+ * @return parsed media type, or <code>null</code> if parsing fails
*/
public static MediaType parse(String string) {
- int slash = string.indexOf('/');
- if (slash == -1) {
- throw new IllegalArgumentException("Invalid media type: " +
string);
+ int colon = string.indexOf(';');
+ if (colon != -1) {
+ string = string.substring(0, colon);
}
- String type = string.substring(0, slash);
- String subtype = string.substring(slash + 1);
- // String parameters = "";
-
- int colon = subtype.indexOf(';');
- if (colon != -1) {
- // parameters = subtype.substring(colon + 1);
- subtype = subtype.substring(0, colon);
+ int slash = string.indexOf('/');
+ if (slash != -1) {
+ String type = string.substring(0, slash).trim();
+ String subtype = string.substring(slash + 1).trim();
+ if (type.length() > 0 && subtype.length() > 0) {
+ return new MediaType(type, subtype);
+ }
}
- return new MediaType(type, subtype);
+ return null;
}
private final String type;
Added:
lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java?rev=736149&view=auto
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
(added)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
Tue Jan 20 14:38:04 2009
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for the {@link TypeDetector} class.
+ */
+public class TypeDetectorTest extends TestCase {
+
+ private Detector detector = new TypeDetector();
+
+ public void testDetect() {
+ assertDetect(MediaType.TEXT_PLAIN, "text/plain");
+ assertDetect(MediaType.TEXT_PLAIN, "TEXT/PLAIN");
+ assertDetect(MediaType.TEXT_PLAIN, " text/\tplain\n");
+ assertDetect(MediaType.TEXT_PLAIN, "text/plain; a=b");
+ assertDetect(MediaType.TEXT_PLAIN, "\ttext/plain; a=b\n");
+
+ assertDetect(MediaType.OCTET_STREAM, "text\\plain");
+
+ // also test the empty hint, null hint and no-hint-at-all cases
+ assertDetect(MediaType.OCTET_STREAM, "");
+ assertDetect(MediaType.OCTET_STREAM, null);
+ try {
+ assertEquals(
+ MediaType.OCTET_STREAM,
+ detector.detect(null, new Metadata()));
+ } catch (IOException e) {
+ fail("TypeDetector should never throw an IOException");
+ }
+ }
+
+ private void assertDetect(MediaType type, String name){
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, name);
+ try {
+ assertEquals(type, detector.detect(null, metadata));
+ } catch (IOException e) {
+ fail("TypeDetector should never throw an IOException");
+ }
+ }
+
+}