Author: jukka
Date: Sun Jan 25 20:13:36 2009
New Revision: 737577

URL: http://svn.apache.org/viewvc?rev=737577&view=rev
Log:
TIKA-95: Pluggable magic header detectors

Added a MagicDetector class for detecting magic byte patterns.

Added:
    lucene/tika/trunk/src/main/java/org/apache/tika/detect/MagicDetector.java
    
lucene/tika/trunk/src/test/java/org/apache/tika/detect/MagicDetectorTest.java

Added: lucene/tika/trunk/src/main/java/org/apache/tika/detect/MagicDetector.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=737577&view=auto
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/detect/MagicDetector.java 
(added)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/detect/MagicDetector.java 
Sun Jan 25 20:13:36 2009
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Content type detection based on magic bytes, i.e. type-specific patterns
+ * near the beginning of the document input stream.
+ *
+ * @since Apache Tika 0.3
+ */
+public class MagicDetector implements Detector {
+
+    /**
+     * The matching media type. Returned by the
+     * {...@link #detect(InputStream, Metadata)} method if a match is found.
+     */
+    private final MediaType type;
+
+    /**
+     * Length of the comparison window. All the byte arrays here are this long.
+     */
+    private final int length;
+
+    /**
+     * The magic match pattern. If this byte pattern is equal to the
+     * possibly bit-masked bytes in the comparison window, then the type
+     * detection succeeds and the configured {...@link #type} is returned.
+     */
+    private final byte[] pattern;
+
+    /**
+     * Bit mask that is applied to the source bytes in the comparison window
+     * before pattern matching. This mask may be <code>null</code>, in which
+     * case the source bytes are compared as-is against the configured pattern.
+     */
+    private final byte[] mask;
+
+    /**
+     * Byte buffer that contains the raw input bytes in the current comparison
+     * window. This buffer is first filled with the byte sequence starting at
+     * the beginning of the configured offset range. Then the buffer is moved
+     * forward one byte at a time until a match is found or the entire offset
+     * range has been covered.
+     */
+    private final byte[] sourceBuffer;
+
+    /**
+     * The comparison buffer that contains the result of combining the raw
+     * input bytes in the current comparison window with the configured
+     * {...@link #mask bit mask}. If a bit mask is not configured, then this
+     * reference points to the {...@link #sourceBuffer raw source buffer} to
+     * avoid extra logic or copying when doing the pattern match.
+     */
+    private final byte[] compareBuffer;
+
+    /**
+     * First offset (inclusive) of the comparison window within the
+     * document input stream. Greater than or equal to zero.
+     */
+    private final long offsetRangeBegin;
+
+    /**
+     * Last offset (inclusive) of the comparison window within the document
+     * input stream. Greater than or equal to the
+     * {...@link #offsetRangeBegin first offset}.
+     * <p>
+     * Note that this is <em>not</em> the offset of the last byte read from
+     * the document stream. Instead, the last window of bytes to be compared
+     * starts at this offset.
+     */
+    private final long offsetRangeEnd;
+
+    /**
+     * Creates a detector for input documents that have the exact given byte
+     * pattern at the beginning of the document stream.
+     *
+     * @param type matching media type
+     * @param pattern magic match pattern
+     */
+    public MagicDetector(MediaType type, byte[] pattern) {
+        this(type, pattern, 0);
+    }
+
+    /**
+     * Creates a detector for input documents that have the exact given byte
+     * pattern at the given offset of the document stream.
+     *
+     * @param type matching media type
+     * @param pattern magic match pattern
+     * @param offset offset of the pattern match
+     */
+    public MagicDetector(MediaType type, byte[] pattern, long offset) {
+        this(type, pattern, null, offset, offset);
+    }
+
+    /**
+     * Creates a detector for input documents that meet the specified
+     * magic match.
+     *
+     */
+    public MagicDetector(
+            MediaType type, byte[] pattern, byte[] mask,
+            long offsetRangeBegin, long offsetRangeEnd) {
+        if (type == null) {
+            throw new IllegalArgumentException("Matching media type is null");
+        } else if (pattern == null) {
+            throw new IllegalArgumentException("Magic match pattern is null");
+        } else if (mask != null && mask.length != pattern.length) {
+            throw new IllegalArgumentException(
+                    "Different pattern and mask lengths: "
+                    + pattern.length + " != " + mask.length);
+        } else if (offsetRangeBegin < 0
+                || offsetRangeEnd < offsetRangeBegin) {
+            throw new IllegalArgumentException(
+                    "Invalid offset range: ["
+                    + offsetRangeBegin + "," + offsetRangeEnd + "]");
+        } else {
+            this.type = type;
+            this.length = pattern.length;
+            this.pattern = pattern;
+            this.mask = mask;
+            this.sourceBuffer = new byte[length];
+            if (mask != null) {
+                this.compareBuffer = new byte[length];
+            } else {
+                this.compareBuffer = this.sourceBuffer;
+            }
+            this.offsetRangeBegin = offsetRangeBegin;
+            this.offsetRangeEnd = offsetRangeEnd;
+        }
+    }
+
+    public MediaType detect(InputStream input, Metadata metadata)
+            throws IOException {
+        long offset = 0;
+
+        // Skip bytes at the beginning, using skip() or read()
+        while (offset < offsetRangeBegin) {
+            long n = input.skip(offsetRangeBegin - offset);
+            if (n > 0) {
+                offset += n;
+            } else if (input.read() != -1) {
+                offset += 1;
+            } else {
+                return MediaType.OCTET_STREAM;
+            }
+        }
+
+        // Fill in the comparison window
+        while (offset < offsetRangeBegin + sourceBuffer.length) {
+            int i = (int) (offset - offsetRangeBegin);
+            int n = input.read(sourceBuffer, i, sourceBuffer.length - i);
+            if (n == -1) {
+                return MediaType.OCTET_STREAM;
+            }
+            offset += n;
+        }
+
+        // Loop until we've covered the entire offset range
+        while (true) {
+            // Apply the mask, if any
+            if (mask != null) {
+                for (int i = 0; i < length; i++) {
+                    compareBuffer[i] = (byte) (sourceBuffer[i] & mask[i]);
+                }
+            }
+
+            if (Arrays.equals(pattern, compareBuffer)) {
+                // We have a match, so return the matching media type
+                return type;
+            } else if (offset < offsetRangeEnd + sourceBuffer.length) {
+                // No match, move the comparison window forward and try again
+                int c = input.read();
+                if (c == -1) {
+                    return MediaType.OCTET_STREAM;
+                }
+                System.arraycopy(sourceBuffer, 1, sourceBuffer, 0, length - 1);
+                sourceBuffer[length - 1] = (byte) c;
+                offset += 1;
+            } else {
+                // We have reached the end of the offset range, no match.
+                return MediaType.OCTET_STREAM;
+            }
+        }
+    }
+
+}

Added: 
lucene/tika/trunk/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/detect/MagicDetectorTest.java?rev=737577&view=auto
==============================================================================
--- 
lucene/tika/trunk/src/test/java/org/apache/tika/detect/MagicDetectorTest.java 
(added)
+++ 
lucene/tika/trunk/src/test/java/org/apache/tika/detect/MagicDetectorTest.java 
Sun Jan 25 20:13:36 2009
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Test cases for the {...@link MagicDetector} class.
+ */
+public class MagicDetectorTest extends TestCase {
+
+    public void testDetectSimple() throws Exception {
+        MediaType html = new MediaType("text", "html");
+        Detector detector = new MagicDetector(html, "<html".getBytes("ASCII"));
+
+        assertDetect(detector, html, "<html");
+        assertDetect(detector, html, "<html><head/><body/></html>");
+        assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
+        assertDetect(detector, MediaType.OCTET_STREAM, "<?xml?><html");
+        assertDetect(detector, MediaType.OCTET_STREAM, " <html");
+        assertDetect(detector, MediaType.OCTET_STREAM, "");
+    }
+
+    public void testDetectOffsetRange() throws Exception {
+        MediaType html = new MediaType("text", "html");
+        Detector detector = new MagicDetector(
+                html, "<html".getBytes("ASCII"), null, 0, 64);
+
+        assertDetect(detector, html, "<html");
+        assertDetect(detector, html, "<html><head/><body/></html>");
+        assertDetect(detector, html, "<?xml?><html/>");
+        assertDetect(detector, html, "\n    <html");
+        assertDetect(detector, html, "\u0000<html");
+        assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
+        assertDetect(detector, MediaType.OCTET_STREAM, " html");
+        assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
+
+        assertDetect(detector, html,
+                "0........1.........2.........3.........4.........5.........6"
+                + "1234<html");
+        assertDetect(detector, MediaType.OCTET_STREAM,
+                "0........1.........2.........3.........4.........5.........6"
+                + "12345<html");
+
+        assertDetect(detector, MediaType.OCTET_STREAM, "");
+}
+
+    public void testDetectMask() throws Exception {
+        MediaType html = new MediaType("text", "html");
+        byte up = (byte) 0xdf;
+        Detector detector = new MagicDetector(
+                html,
+                new byte[] { '<',  'H',  'T',  'M',  'L' },
+                new byte[] { (byte) 0xff, up, up, up, up },
+                0, 64);
+
+        assertDetect(detector, html, "<html");
+        assertDetect(detector, html, "<HTML><head/><body/></html>");
+        assertDetect(detector, html, "<?xml?><HtMl/>");
+        assertDetect(detector, html, "\n    <html");
+        assertDetect(detector, html, "\u0000<HTML");
+        assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
+        assertDetect(detector, MediaType.OCTET_STREAM, " html");
+
+        assertDetect(detector, html,
+                "0        1         2         3         4         5         6"
+                + "1234<html");
+        assertDetect(detector, MediaType.OCTET_STREAM,
+                "0        1         2         3         4         5         6"
+                + "12345<html");
+
+        assertDetect(detector, MediaType.OCTET_STREAM, "");
+    }
+
+    private void assertDetect(Detector detector, MediaType type, String data) {
+        try {
+            assertEquals(type, detector.detect(
+                    new ByteArrayInputStream(data.getBytes("ASCII")),
+                    new Metadata()));
+        } catch (IOException e) {
+            fail("Unexpected exception from MagicDetector");
+        }
+    }
+
+}


Reply via email to