Revision: 19678
          http://sourceforge.net/p/gate/code/19678
Author:   markagreenwood
Date:     2016-10-14 12:03:14 +0000 (Fri, 14 Oct 2016)
Log Message:
-----------
We now search the raw bytes, rather than decoded characters, when looking for
magic numbers, as this allows us to sidestep the encoding issues.

Modified Paths:
--------------
    gate/trunk/src/main/gate/DocumentFormat.java

Modified: gate/trunk/src/main/gate/DocumentFormat.java
===================================================================
--- gate/trunk/src/main/gate/DocumentFormat.java        2016-10-14 05:51:46 UTC 
(rev 19677)
+++ gate/trunk/src/main/gate/DocumentFormat.java        2016-10-14 12:03:14 UTC 
(rev 19678)
@@ -16,19 +16,12 @@
 
 package gate;
 
-import gate.corpora.MimeType;
-import gate.corpora.RepositioningInfo;
-import gate.creole.AbstractLanguageResource;
-import gate.event.StatusListener;
-import gate.util.BomStrippingInputStreamReader;
-import gate.util.DocumentFormatException;
-
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
-import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.net.URLConnection;
+import java.nio.charset.Charset;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedList;
@@ -39,7 +32,14 @@
 import java.util.Vector;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.CharSet;
 
+import gate.corpora.MimeType;
+import gate.corpora.RepositioningInfo;
+import gate.creole.AbstractLanguageResource;
+import gate.event.StatusListener;
+import gate.util.DocumentFormatException;
+
 /** The format of Documents. Subclasses of DocumentFormat know about
   * particular MIME types and how to unpack the information in any
   * markup or formatting they contain into GATE annotations. Each MIME
@@ -363,7 +363,7 @@
   protected static MimeType guessTypeUsingMagicNumbers(InputStream 
aInputStream,
                                                             String anEncoding){
 
-    if (aInputStream == null) return null;
+    /*if (aInputStream == null) return null;
     Reader reader = null;
     if (anEncoding != null)
       try{
@@ -376,14 +376,92 @@
       reader = new BomStrippingInputStreamReader(aInputStream);
 
     // We have a input stream reader
-    return runMagicNumbers(reader);
+    return runMagicNumbers(reader);*/
+    MimeType detectedMimeType = null;
+
+    // the offset of the first match now we use a "first wins" priority
+    int firstOffset = Integer.MAX_VALUE;
+    
+    byte[] header = new byte[2048];
+    
+    try {
+      IOUtils.read(aInputStream, header);
+    }
+    catch (IOException e) {
+      return null;
+    }
+
+    // Run the magic numbers test
+    for(Map.Entry<String, MimeType> kv : magic2mimeTypeMap.entrySet()) {
+      byte[] magic = null;
+      
+      try {
+        magic = kv.getKey().getBytes(anEncoding);
+      }
+      catch (Exception e) {
+        magic = kv.getKey().getBytes();
+      }
+      
+      int offset = indexOf(header,magic);
+      if (offset != -1) {
+        if (offset < firstOffset) {
+          detectedMimeType = kv.getValue();
+        }
+      }
+    }
+    
+    return detectedMimeType;
   }//guessTypeUsingMagicNumbers
+  
+  /**
+   * Finds the first occurrence of the pattern in the text.
+   *
+   * The search walks data once from left to right and, on a mismatch,
+   * uses the table built by computeFailure to decide how much of the
+   * pattern is still matched instead of re-reading earlier bytes.
+   *
+   * NOTE(review): an empty pattern is not guarded against; with non-empty
+   * data the loop reads pattern[0] and would throw
+   * ArrayIndexOutOfBoundsException.
+   *
+   * @param data the bytes to search in
+   * @param pattern the bytes to search for
+   * @return the index in data at which the first match starts, or -1 if
+   *         data is empty or contains no match
+   */
+  protected static int indexOf(byte[] data, byte[] pattern) {
+      int[] failure = computeFailure(pattern);
 
+      // j is the number of leading pattern bytes matched so far
+      int j = 0;
+      if (data.length == 0) return -1;
+
+      for (int i = 0; i < data.length; i++) {
+          // mismatch: fall back to the next shorter partial match
+          while (j > 0 && pattern[j] != data[i]) {
+              j = failure[j - 1];
+          }
+          if (pattern[j] == data[i]) { j++; }
+          if (j == pattern.length) {
+              // the match ends at i, so it starts pattern.length - 1 earlier
+              return i - pattern.length + 1;
+          }
+      }
+      return -1;
+  }
+
+  /**
+   * Computes the failure function using a boot-strapping process,
+   * where the pattern is matched against itself.
+   *
+   * @param pattern the bytes to build the table for
+   * @return an array of the same length as pattern in which entry i is
+   *         the length of the longest proper prefix of pattern[0..i]
+   *         that is also a suffix of it; entry 0 is always 0
+   */
+  private static int[] computeFailure(byte[] pattern) {
+      int[] failure = new int[pattern.length];
+
+      // j is the prefix length carried over from position i - 1
+      int j = 0;
+      for (int i = 1; i < pattern.length; i++) {
+          // shrink the candidate prefix until pattern[i] can extend it
+          while (j > 0 && pattern[j] != pattern[i]) {
+              j = failure[j - 1];
+          }
+          if (pattern[j] == pattern[i]) {
+              j++;
+          }
+          failure[i] = j;
+      }
+
+      return failure;
+  }
+
   /** Performs magic over Gate Document */
   protected static MimeType runMagicNumbers(Reader aReader) {
     // No reader, nothing to detect
     if( aReader == null) return null;
 
+    System.err.println("doing magic numbers");
+    
     // Prepare to run the magic stuff
     String strBuffer = null;
     int bufferSize = 2048;

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most 
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to