Author: jukka
Date: Mon Nov 16 17:05:09 2009
New Revision: 880852
URL: http://svn.apache.org/viewvc?rev=880852&view=rev
Log:
TIKA-321: Optimize type detection speed
Make the MagicDetector class thread-safe and reduce the amount of memory writes
during matching.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java?rev=880852&r1=880851&r2=880852&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
Mon Nov 16 17:05:09 2009
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.io.InputStream;
-import java.util.Arrays;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -44,41 +43,21 @@
/**
* The magic match pattern. If this byte pattern is equal to the
- * possibly bit-masked bytes in the comparison window, then the type
+ * possibly bit-masked bytes from the input stream, then the type
* detection succeeds and the configured {@link #type} is returned.
*/
private final byte[] pattern;
/**
- * Bit mask that is applied to the source bytes in the comparison window
- * before pattern matching. This mask may be <code>null</code>, in which
- * case the source bytes are compared as-is against the configured pattern.
+ * Bit mask that is applied to the source bytes before pattern matching.
*/
private final byte[] mask;
/**
- * Byte buffer that contains the raw input bytes in the current comparison
- * window. This buffer is first filled with the byte sequence starting at
- * the beginning of the configured offset range. Then the buffer is moved
- * forward one byte at a time until a match is found or the entire offset
- * range has been covered.
- */
- private final byte[] sourceBuffer;
-
- /**
- * The comparison buffer that contains the result of combining the raw
- * input bytes in the current comparison window with the configured
- * {@link #mask bit mask}. If a bit mask is not configured, then this
- * reference points to the {@link #sourceBuffer raw source buffer} to
- * avoid extra logic or copying when doing the pattern match.
- */
- private final byte[] compareBuffer;
-
- /**
* First offset (inclusive) of the comparison window within the
* document input stream. Greater than or equal to zero.
*/
- private final long offsetRangeBegin;
+ private final int offsetRangeBegin;
/**
* Last offset (inclusive) of the comparison window within the document
@@ -89,7 +68,7 @@
* the document stream. Instead, the last window of bytes to be compared
* starts at this offset.
*/
- private final long offsetRangeEnd;
+ private final int offsetRangeEnd;
/**
* Creates a detector for input documents that have the exact given byte
@@ -110,45 +89,51 @@
* @param pattern magic match pattern
* @param offset offset of the pattern match
*/
- public MagicDetector(MediaType type, byte[] pattern, long offset) {
+ public MagicDetector(MediaType type, byte[] pattern, int offset) {
this(type, pattern, null, offset, offset);
}
/**
* Creates a detector for input documents that meet the specified
* magic match.
- *
*/
public MagicDetector(
MediaType type, byte[] pattern, byte[] mask,
- long offsetRangeBegin, long offsetRangeEnd) {
+ int offsetRangeBegin, int offsetRangeEnd) {
if (type == null) {
throw new IllegalArgumentException("Matching media type is null");
} else if (pattern == null) {
throw new IllegalArgumentException("Magic match pattern is null");
- } else if (mask != null && mask.length != pattern.length) {
- throw new IllegalArgumentException(
- "Different pattern and mask lengths: "
- + pattern.length + " != " + mask.length);
} else if (offsetRangeBegin < 0
|| offsetRangeEnd < offsetRangeBegin) {
throw new IllegalArgumentException(
"Invalid offset range: ["
+ offsetRangeBegin + "," + offsetRangeEnd + "]");
- } else {
- this.type = type;
- this.length = pattern.length;
- this.pattern = pattern;
- this.mask = mask;
- this.sourceBuffer = new byte[length];
- if (mask != null) {
- this.compareBuffer = new byte[length];
+ }
+
+ this.type = type;
+
+ this.length = Math.max(pattern.length, mask != null ? mask.length : 0);
+
+ this.mask = new byte[length];
+ this.pattern = new byte[length];
+
+ for (int i = 0; i < length; i++) {
+ if (mask != null && i < mask.length) {
+ this.mask[i] = mask[i];
+ } else {
+ this.mask[i] = -1;
+ }
+
+ if (i < pattern.length) {
+ this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
} else {
- this.compareBuffer = this.sourceBuffer;
+ this.pattern[i] = 0;
}
- this.offsetRangeBegin = offsetRangeBegin;
- this.offsetRangeEnd = offsetRangeEnd;
}
+
+ this.offsetRangeBegin = offsetRangeBegin;
+ this.offsetRangeEnd = offsetRangeEnd;
}
/**
@@ -162,9 +147,9 @@
return MediaType.OCTET_STREAM;
}
- input.mark(length);
+ input.mark(offsetRangeEnd + length);
try {
- long offset = 0;
+ int offset = 0;
// Skip bytes at the beginning, using skip() or read()
while (offset < offsetRangeBegin) {
@@ -179,42 +164,33 @@
}
// Fill in the comparison window
- while (offset < offsetRangeBegin + sourceBuffer.length) {
- int i = (int) (offset - offsetRangeBegin);
- int n = input.read(sourceBuffer, i, sourceBuffer.length - i);
- if (n == -1) {
- return MediaType.OCTET_STREAM;
- }
+ byte[] buffer =
+ new byte[length + (offsetRangeEnd - offsetRangeBegin)];
+ int n = input.read(buffer);
+ if (n > 0) {
offset += n;
}
+ while (n != -1 && offset < offsetRangeEnd + length) {
+ int bufferOffset = offset - offsetRangeBegin;
+ n = input.read(
+ buffer, bufferOffset, buffer.length - bufferOffset);
+ }
+ if (offset < offsetRangeBegin + length) {
+ return MediaType.OCTET_STREAM;
+ }
// Loop until we've covered the entire offset range
- while (true) {
- // Apply the mask, if any
- if (mask != null) {
- for (int i = 0; i < length; i++) {
- compareBuffer[i] = (byte) (sourceBuffer[i] & mask[i]);
- }
+ for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
+ boolean match = true;
+ for (int j = 0; match && j < length; j++) {
+ match = (buffer[i + j] & mask[j]) == pattern[j];
}
-
- if (Arrays.equals(pattern, compareBuffer)) {
- // We have a match, so return the matching media type
+ if (match) {
return type;
- } else if (offset < offsetRangeEnd + sourceBuffer.length) {
- // No match, move the comparison window forward
- int c = input.read();
- if (c == -1) {
- return MediaType.OCTET_STREAM;
- }
- System.arraycopy(
- sourceBuffer, 1, sourceBuffer, 0, length - 1);
- sourceBuffer[length - 1] = (byte) c;
- offset += 1;
- } else {
- // We have reached the end of the offset range, no match
- return MediaType.OCTET_STREAM;
}
}
+
+ return MediaType.OCTET_STREAM;
} finally {
input.reset();
}