NameDetectorTest.java

jukka Tue, 20 Jan 2009 13:37:19 -0800

Author: jukka
Date: Tue Jan 20 13:36:56 2009
New Revision: 736118

URL: http://svn.apache.org/viewvc?rev=736118&view=rev
Log:
TIKA-95: Pluggable magic header detectors


Added a NameDetector class for detecting file name patterns.

Added:
    lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java
    lucene/tika/trunk/src/test/java/org/apache/tika/detect/
    lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java

Added: lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java?rev=736118&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java 
(added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/detect/NameDetector.java 
Tue Jan 20 13:36:56 2009
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * 
+ * @author Jukka Zitting
+ *
+ */
+public class NameDetector implements Detector {
+
+    private final Map<Pattern, MediaType> patterns;
+
+    public NameDetector(Map<Pattern, MediaType> patterns) {
+        this.patterns = patterns;
+    }
+
+    public MediaType detect(InputStream input, Metadata metadata) {
+        // Look for a resource name in the input metadata
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name != null) {
+            // If the name is a URL, skip the trailing query and fragment parts
+            int question = name.indexOf('?');
+            if (question != -1) {
+                name = name.substring(0, question);
+            }
+            int hash = name.indexOf('#');
+            if (hash != -1) {
+                name = name.substring(0, hash);
+            }
+
+            // If the name is a URL or a path, skip all but the last component
+            int slash = name.lastIndexOf('/');
+            if (slash != -1) {
+                name = name.substring(slash + 1);
+            }
+            int backslash = name.lastIndexOf('\\');
+            if (backslash != -1) {
+                name = name.substring(backslash + 1);
+            }
+
+            // Skip any leading or trailing whitespace
+            name = name.trim();
+            if (name.length() > 0) {
+                // Decode any potential URL encoding
+                int percent = name.indexOf('%');
+                if (percent != -1) {
+                    try {
+                        name = URLDecoder.decode(name, "UTF-8");
+                    } catch (UnsupportedEncodingException e) {
+                        throw new AssertionError("UTF-8 not supported");
+                    }
+                }
+
+                // Match the name against the registered patterns
+                for (Pattern pattern : patterns.keySet()) {
+                    if (pattern.matcher(name).matches()) {
+                        return patterns.get(pattern);
+                    }
+                }
+            }
+        }
+
+        return MediaType.OCTET_STREAM;
+    }
+
+}

Added: 
lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java?rev=736118&view=auto
==============================================================================
--- 
lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java 
(added)
+++ 
lucene/tika/trunk/src/test/java/org/apache/tika/detect/NameDetectorTest.java 
Tue Jan 20 13:36:56 2009
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+import junit.framework.TestCase;
+
+/**
+ * Test cases for the {@link NameDetector} class.
+ */
+public class NameDetectorTest extends TestCase {
+
+    /** Detector under test, rebuilt before every test method. */
+    private Detector detector;
+
+    /**
+     * Builds a detector that knows two name patterns, both mapped to plain
+     * text: any name ending in ".txt" (ignoring case) and the exact name
+     * "README".
+     */
+    protected void setUp() {
+        Pattern anyTxtFile =
+            Pattern.compile(".*\\.txt", Pattern.CASE_INSENSITIVE);
+        Pattern readmeFile = Pattern.compile("README");
+
+        Map<Pattern, MediaType> mapping = new HashMap<Pattern, MediaType>();
+        mapping.put(anyTxtFile, MediaType.TEXT_PLAIN);
+        mapping.put(readmeFile, MediaType.TEXT_PLAIN);
+        detector = new NameDetector(mapping);
+    }
+
+    /**
+     * Runs the detector over a range of resource names and checks the
+     * detected media type of each one.
+     */
+    public void testDetect() {
+        // names matched by the case insensitive ".txt" suffix pattern
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt");
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt ");    // space at end
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt\n");   // newline at end
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt?a=b"); // with query
+        assertDetect(MediaType.TEXT_PLAIN, "text.txt#abc"); // with fragment
+        assertDetect(MediaType.TEXT_PLAIN, "text.TXT");     // upper case
+        assertDetect(MediaType.OCTET_STREAM, "text.txt.gz");
+
+        // names matched by the case sensitive "README" pattern
+        assertDetect(MediaType.TEXT_PLAIN, "README");
+        assertDetect(MediaType.TEXT_PLAIN, " README ");     // padded
+        assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n");   // tab and newline
+        assertDetect(MediaType.TEXT_PLAIN, "/a/README");    // unix path
+        assertDetect(MediaType.TEXT_PLAIN, "\\b\\README");  // windows path
+        assertDetect(MediaType.OCTET_STREAM, "ReadMe");     // wrong case
+        assertDetect(MediaType.OCTET_STREAM, "README.NOW");
+
+        // free text around a URL, only the last path component counts
+        assertDetect(
+                MediaType.TEXT_PLAIN,
+                " See http://www.example.com:1234/README.txt?a=b#c \n");
+        // still ends in ".txt", so the suffix pattern matches
+        assertDetect(MediaType.TEXT_PLAIN, "See README.txt");
+        // the "README" pattern has to match the whole name
+        assertDetect(MediaType.OCTET_STREAM, "See README");
+
+        // empty name, null name and no name entry at all
+        assertDetect(MediaType.OCTET_STREAM, "");
+        assertDetect(MediaType.OCTET_STREAM, null);
+        assertDetectFrom(MediaType.OCTET_STREAM, new Metadata());
+    }
+
+    /**
+     * Asserts that the given resource name is detected as the expected type.
+     */
+    private void assertDetect(MediaType type, String name) {
+        Metadata named = new Metadata();
+        named.set(Metadata.RESOURCE_NAME_KEY, name);
+        assertDetectFrom(type, named);
+    }
+
+    /**
+     * Asserts that detection with a null stream and the given metadata
+     * gives the expected type, and fails the test on an IOException.
+     */
+    private void assertDetectFrom(MediaType type, Metadata metadata) {
+        try {
+            MediaType detected = detector.detect(null, metadata);
+            assertEquals(type, detected);
+        } catch (IOException e) {
+            fail("NameDetector should never throw an IOException");
+        }
+    }
+
+}

svn commit: r736118 - in /lucene/tika/trunk/src: main/java/org/apache/tika/detect/NameDetector.java test/java/org/apache/tika/detect/ test/java/org/apache/tika/detect/NameDetectorTest.java

Reply via email to