Revision: 19678
http://sourceforge.net/p/gate/code/19678
Author: markagreenwood
Date: 2016-10-14 12:03:14 +0000 (Fri, 14 Oct 2016)
Log Message:
-----------
we now look at the bytes when searching for magic numbers not characters as
this allows us to sidestep the encoding issues
Modified Paths:
--------------
gate/trunk/src/main/gate/DocumentFormat.java
Modified: gate/trunk/src/main/gate/DocumentFormat.java
===================================================================
--- gate/trunk/src/main/gate/DocumentFormat.java 2016-10-14 05:51:46 UTC
(rev 19677)
+++ gate/trunk/src/main/gate/DocumentFormat.java 2016-10-14 12:03:14 UTC
(rev 19678)
@@ -16,19 +16,12 @@
package gate;
-import gate.corpora.MimeType;
-import gate.corpora.RepositioningInfo;
-import gate.creole.AbstractLanguageResource;
-import gate.event.StatusListener;
-import gate.util.BomStrippingInputStreamReader;
-import gate.util.DocumentFormatException;
-
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
-import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
+import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
@@ -39,7 +32,14 @@
import java.util.Vector;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.CharSet;
+import gate.corpora.MimeType;
+import gate.corpora.RepositioningInfo;
+import gate.creole.AbstractLanguageResource;
+import gate.event.StatusListener;
+import gate.util.DocumentFormatException;
+
/** The format of Documents. Subclasses of DocumentFormat know about
* particular MIME types and how to unpack the information in any
* markup or formatting they contain into GATE annotations. Each MIME
@@ -363,7 +363,7 @@
protected static MimeType guessTypeUsingMagicNumbers(InputStream
aInputStream,
String anEncoding){
- if (aInputStream == null) return null;
+ /*if (aInputStream == null) return null;
Reader reader = null;
if (anEncoding != null)
try{
@@ -376,14 +376,92 @@
reader = new BomStrippingInputStreamReader(aInputStream);
// We have a input stream reader
- return runMagicNumbers(reader);
+ return runMagicNumbers(reader);*/
+   // The original null guard now sits inside the comment above; it is
+   // restored here so a null stream still yields null instead of being
+   // handed to IOUtils.read.
+   if (aInputStream == null) return null;
+
+   MimeType detectedMimeType = null;
+
+   // "first wins" priority: the magic number found closest to the start of
+   // the stream decides the type, so track the smallest offset seen so far
+   int firstOffset = Integer.MAX_VALUE;
+
+   // only the first 2048 bytes of the stream are inspected
+   byte[] header = new byte[2048];
+
+   try {
+     IOUtils.read(aInputStream, header);
+   }
+   catch (IOException e) {
+     // the stream could not be read, so no guess can be made
+     return null;
+   }
+
+   // Run the magic numbers test
+   for(Map.Entry<String, MimeType> kv : magic2mimeTypeMap.entrySet()) {
+     byte[] magic = null;
+
+     try {
+       magic = kv.getKey().getBytes(anEncoding);
+     }
+     catch (Exception e) {
+       // anEncoding was missing or unusable: fall back to the platform
+       // default charset
+       magic = kv.getKey().getBytes();
+     }
+
+     int offset = indexOf(header,magic);
+     if (offset != -1 && offset < firstOffset) {
+       // Remember the new earliest match. Previously firstOffset was never
+       // updated, so whichever matching entry the map iterated last won.
+       firstOffset = offset;
+       detectedMimeType = kv.getValue();
+     }
+   }
+
+   return detectedMimeType;
}//guessTypeUsingMagicNumbers
+
+ /**
+  * Finds the first occurrence of the pattern in the data using the
+  * Knuth-Morris-Pratt algorithm.
+  *
+  * @param data the bytes to search through
+  * @param pattern the bytes to look for
+  * @return the index in data at which the first match starts, or -1 if
+  *         there is no match or either array is empty
+  */
+ protected static int indexOf(byte[] data, byte[] pattern) {
+   // An empty pattern used to fall through to pattern[0] below and throw an
+   // ArrayIndexOutOfBoundsException; treat it as "no match", which is what
+   // empty data has always returned.
+   if (pattern.length == 0 || data.length == 0) return -1;
+
+   int[] failure = computeFailure(pattern);
+   int j = 0; // number of pattern bytes matched so far
+
+   for (int i = 0; i < data.length; i++) {
+     // on a mismatch fall back to the next shorter prefix that still matches
+     while (j > 0 && pattern[j] != data[i]) {
+       j = failure[j - 1];
+     }
+     if (pattern[j] == data[i]) { j++; }
+     if (j == pattern.length) {
+       // whole pattern matched, ending at i
+       return i - pattern.length + 1;
+     }
+   }
+   return -1;
+ }
+
+ /**
+  * Builds the failure table used by {@link #indexOf(byte[], byte[])} by
+  * matching the pattern against itself: entry {@code i} is the length of
+  * the longest proper prefix of the pattern that is also a suffix of
+  * {@code pattern[0..i]}.
+  *
+  * @param pattern the bytes being searched for
+  * @return a table with one entry per pattern byte
+  */
+ private static int[] computeFailure(byte[] pattern) {
+   final int length = pattern.length;
+   final int[] table = new int[length];
+
+   // length of the prefix currently matched against the text before pos
+   int matched = 0;
+   int pos = 1;
+   while (pos < length) {
+     final byte current = pattern[pos];
+     // shrink the candidate prefix until it can be extended by current
+     while (matched > 0 && pattern[matched] != current) {
+       matched = table[matched - 1];
+     }
+     if (pattern[matched] == current) {
+       matched++;
+     }
+     table[pos] = matched;
+     pos++;
+   }
+
+   return table;
+ }
+
/** Performs magic over Gate Document */
protected static MimeType runMagicNumbers(Reader aReader) {
// No reader, nothing to detect
if( aReader == null) return null;
+ System.err.println("doing magic numbers");
+
// Prepare to run the magic stuff
String strBuffer = null;
int bufferSize = 2048;
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs