Revision: 19678 http://sourceforge.net/p/gate/code/19678 Author: markagreenwood Date: 2016-10-14 12:03:14 +0000 (Fri, 14 Oct 2016) Log Message: ----------- We now look at the bytes, rather than the characters, when searching for magic numbers, as this allows us to sidestep the encoding issues.
Modified Paths: -------------- gate/trunk/src/main/gate/DocumentFormat.java Modified: gate/trunk/src/main/gate/DocumentFormat.java =================================================================== --- gate/trunk/src/main/gate/DocumentFormat.java 2016-10-14 05:51:46 UTC (rev 19677) +++ gate/trunk/src/main/gate/DocumentFormat.java 2016-10-14 12:03:14 UTC (rev 19678) @@ -16,19 +16,12 @@ package gate; -import gate.corpora.MimeType; -import gate.corpora.RepositioningInfo; -import gate.creole.AbstractLanguageResource; -import gate.event.StatusListener; -import gate.util.BomStrippingInputStreamReader; -import gate.util.DocumentFormatException; - import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLConnection; +import java.nio.charset.Charset; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; @@ -39,7 +32,14 @@ import java.util.Vector; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.CharSet; +import gate.corpora.MimeType; +import gate.corpora.RepositioningInfo; +import gate.creole.AbstractLanguageResource; +import gate.event.StatusListener; +import gate.util.DocumentFormatException; + /** The format of Documents. Subclasses of DocumentFormat know about * particular MIME types and how to unpack the information in any * markup or formatting they contain into GATE annotations. 
Each MIME @@ -363,7 +363,7 @@ protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream, String anEncoding){ - if (aInputStream == null) return null; + /*if (aInputStream == null) return null; Reader reader = null; if (anEncoding != null) try{ @@ -376,14 +376,92 @@ reader = new BomStrippingInputStreamReader(aInputStream); // We have a input stream reader - return runMagicNumbers(reader); + return runMagicNumbers(reader);*/ + MimeType detectedMimeType = null; + + // the offset of the first match now we use a "first wins" priority + int firstOffset = Integer.MAX_VALUE; + + byte[] header = new byte[2048]; + + try { + IOUtils.read(aInputStream, header); + } + catch (IOException e) { + return null; + } + + // Run the magic numbers test + for(Map.Entry<String, MimeType> kv : magic2mimeTypeMap.entrySet()) { + byte[] magic = null; + + try { + magic = kv.getKey().getBytes(anEncoding); + } + catch (Exception e) { + magic = kv.getKey().getBytes(); + } + + int offset = indexOf(header,magic); + if (offset != -1) { + if (offset < firstOffset) { + detectedMimeType = kv.getValue(); + } + } + } + + return detectedMimeType; }//guessTypeUsingMagicNumbers + + /** + * Finds the first occurrence of the pattern in the text. + */ + protected static int indexOf(byte[] data, byte[] pattern) { + int[] failure = computeFailure(pattern); + int j = 0; + if (data.length == 0) return -1; + + for (int i = 0; i < data.length; i++) { + while (j > 0 && pattern[j] != data[i]) { + j = failure[j - 1]; + } + if (pattern[j] == data[i]) { j++; } + if (j == pattern.length) { + return i - pattern.length + 1; + } + } + return -1; + } + + /** + * Computes the failure function using a boot-strapping process, + * where the pattern is matched against itself. 
+ */ + private static int[] computeFailure(byte[] pattern) { + int[] failure = new int[pattern.length]; + + int j = 0; + for (int i = 1; i < pattern.length; i++) { + while (j > 0 && pattern[j] != pattern[i]) { + j = failure[j - 1]; + } + if (pattern[j] == pattern[i]) { + j++; + } + failure[i] = j; + } + + return failure; + } + /** Performs magic over Gate Document */ protected static MimeType runMagicNumbers(Reader aReader) { // No reader, nothing to detect if( aReader == null) return null; + System.err.println("doing magic numbers"); + // Prepare to run the magic stuff String strBuffer = null; int bufferSize = 2048; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs