Author: jukka
Date: Tue Jan 20 13:36:56 2009
New Revision: 736118
URL: http://svn.apache.org/viewvc?rev=736118&view=rev
Log:
TIKA-95: Pluggable magic header detectors
Added a NameDetector class for detecting file name patterns.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java
lucene/tika/trunk/src/test/java/org/apache/tika/detect/
lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java
Added: lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java?rev=736118&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java
(added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java
Tue Jan 20 13:36:56 2009
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ *
+ * @author Jukka Zitting
+ *
+ */
+public class NameDetector implements Detector {
+
+ private final Map<Pattern, MediaType> patterns;
+
+ public NameDetector(Map<Pattern, MediaType> patterns) {
+ this.patterns = patterns;
+ }
+
+ public MediaType detect(InputStream input, Metadata metadata) {
+ // Look for a resource name in the input metadata
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name != null) {
+ // If the name is a URL, skip the trailing query and fragment parts
+ int question = name.indexOf('?');
+ if (question != -1) {
+ name = name.substring(0, question);
+ }
+ int hash = name.indexOf('#');
+ if (hash != -1) {
+ name = name.substring(0, hash);
+ }
+
+ // If the name is a URL or a path, skip all but the last component
+ int slash = name.lastIndexOf('/');
+ if (slash != -1) {
+ name = name.substring(slash + 1);
+ }
+ int backslash = name.lastIndexOf('\\');
+ if (backslash != -1) {
+ name = name.substring(backslash + 1);
+ }
+
+ // Skip any leading or trailing whitespace
+ name = name.trim();
+ if (name.length() > 0) {
+ // Decode any potential URL encoding
+ int percent = name.indexOf('%');
+ if (percent != -1) {
+ try {
+ name = URLDecoder.decode(name, "UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new AssertionError("UTF-8 not supported");
+ }
+ }
+
+ // Match the name against the registered patterns
+ for (Pattern pattern : patterns.keySet()) {
+ if (pattern.matcher(name).matches()) {
+ return patterns.get(pattern);
+ }
+ }
+ }
+ }
+
+ return MediaType.OCTET_STREAM;
+ }
+
+}
Added:
lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java?rev=736118&view=auto
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java
(added)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java
Tue Jan 20 13:36:56 2009
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+import junit.framework.TestCase;
+
+/**
+ * Test cases for the {@link NameDetector} class.
+ */
+public class NameDetectorTest extends TestCase {
+
+    /** Detector under test, created in {@link #setUp()}. */
+    private Detector detector;
+
+    /**
+     * Creates a detector that maps any <code>*.txt</code> name (ignoring
+     * case) and the exact name <code>README</code> to plain text.
+     */
+    protected void setUp() {
+        Pattern anyTxtFile =
+            Pattern.compile(".*\\.txt", Pattern.CASE_INSENSITIVE);
+        Pattern readmeFile = Pattern.compile("README");
+
+        Map<Pattern, MediaType> mapping = new HashMap<Pattern, MediaType>();
+        mapping.put(anyTxtFile, MediaType.TEXT_PLAIN);
+        mapping.put(readmeFile, MediaType.TEXT_PLAIN);
+        detector = new NameDetector(mapping);
+    }
+
+    /** Runs all the name detection checks in a fixed order. */
+    public void testDetect() {
+        checkSuffixPattern();
+        checkExactPattern();
+        checkNamesWithinText();
+        checkMissingNames();
+    }
+
+    /** Names handled by the case insensitive <code>.txt</code> pattern. */
+    private void checkSuffixPattern() {
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt");
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt?a=b"); // URL query
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt#abc"); // URL fragment
+        assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive
+        assertDetect(MediaType.OCTET_STREAM, "text.txt.gz");
+    }
+
+    /** Names handled by the case sensitive <code>README</code> pattern. */
+    private void checkExactPattern() {
+        assertDetect(MediaType.TEXT_PLAIN, "README");
+        assertDetect(MediaType.TEXT_PLAIN, " README "); // space around
+        assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace
+        assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path
+        assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path
+        assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive
+        assertDetect(MediaType.OCTET_STREAM, "README.NOW");
+    }
+
+    /** Names that are embedded in a longer piece of text. */
+    private void checkNamesWithinText() {
+        // tough one
+        String sentenceWithUrl =
+            " See http://www.example.com:1234/README.txt?a=b#c \n";
+        assertDetect(MediaType.TEXT_PLAIN, sentenceWithUrl);
+        assertDetect(MediaType.TEXT_PLAIN, "See README.txt"); // even this!
+        assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this
+    }
+
+    /** The zero input cases: empty name, null name and no name at all. */
+    private void checkMissingNames() {
+        assertDetect(MediaType.OCTET_STREAM, "");
+        assertDetect(MediaType.OCTET_STREAM, null);
+        assertDetected(MediaType.OCTET_STREAM, new Metadata());
+    }
+
+    /**
+     * Asserts that the given resource name is detected as the given type.
+     */
+    private void assertDetect(MediaType type, String name){
+        Metadata named = new Metadata();
+        named.set(Metadata.RESOURCE_NAME_KEY, name);
+        assertDetected(type, named);
+    }
+
+    /**
+     * Asserts that the given metadata, together with a null input stream,
+     * is detected as the given type and that no IOException is thrown.
+     */
+    private void assertDetected(MediaType expected, Metadata metadata) {
+        try {
+            MediaType actual = detector.detect(null, metadata);
+            assertEquals(expected, actual);
+        } catch (IOException e) {
+            fail("NameDetector should never throw an IOException");
+        }
+    }
+
+}