Revision: 19677
          http://sourceforge.net/p/gate/code/19677
Author:   markagreenwood
Date:     2016-10-14 05:51:46 +0000 (Fri, 14 Oct 2016)
Log Message:
-----------
Don't use the document content to determine whether the XML is in GATE format;
instead, peek into the input stream.

Modified Paths:
--------------
    
gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java

Modified: 
gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java
===================================================================
--- 
gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java
       2016-10-14 05:51:04 UTC (rev 19676)
+++ 
gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java
       2016-10-14 05:51:46 UTC (rev 19677)
@@ -105,11 +105,12 @@
     };
 
     // determine whether we have a GATE format document or not
-    String content = doc.getContent().toString();
+    /*String content = doc.getContent().toString();
     if(content.length() > 2048) {
       content = content.substring(0, 2048);
-    }
-    boolean gateFormat = isGateFIFormat(content);
+    }*/
+    boolean gateFormat = isGateFIFormat(doc);
+    
 
     if(gateFormat) {
       unpackGateFormatMarkup(doc, statusListener);
@@ -117,7 +118,74 @@
       unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, statusListener);
     }
   }
+  
+  /**
+   * Determines whether the given document is in the GATE-specific format by
+   * searching for the bytes of the string "GateDocument" within a header of
+   * at most 2048 bytes.
+   * <p>
+   * When {@code hasContentButNoValidUrl(doc)} is true, the header is built
+   * from the first 2048 characters of the document content, encoded with the
+   * document's encoding. Otherwise the header is read directly from a stream
+   * opened on the document's source URL, and that stream is closed again
+   * before this method returns.
+   *
+   * @param doc the document to inspect; it is cast to TextualDocument in
+   *          order to obtain its encoding
+   * @return {@code true} if the marker bytes occur in the inspected header,
+   *         {@code false} otherwise
+   * @throws DocumentFormatException wrapping any IOException raised while
+   *           obtaining the header or encoding the marker
+   */
+  protected static boolean isGateFIFormat(Document doc)
+      throws DocumentFormatException {
 
+    try {
+      byte[] header = new byte[2048];
+
+      if(hasContentButNoValidUrl(doc)) {
+        String content = doc.getContent().toString();
+        if(content.length() > 2048) {
+          content = content.substring(0, 2048);
+        }
+        header = content.getBytes(((TextualDocument)doc).getEncoding());
+      } else {
+        // the stream is only needed for the header, so release it as soon
+        // as the header has been read rather than leaving it open
+        java.io.InputStream in = doc.getSourceUrl().openStream();
+        try {
+          IOUtils.read(in, header);
+        } finally {
+          in.close();
+        }
+      }
+
+      // NOTE(review): the marker is encoded with the document's encoding even
+      // when the header holds raw bytes read from the stream -- confirm that
+      // the two encodings agree for the formats handled here
+      int index = indexOf(header,
+              "GateDocument".getBytes(((TextualDocument)doc).getEncoding()));
+
+      return index != -1;
+    } catch(IOException e) {
+      throw new DocumentFormatException(e);
+    }
+  }
+
+/**
+ * Finds the first occurrence of the pattern in the data, using the failure
+ * table produced by computeFailure to avoid re-examining bytes of the data.
+ *
+ * @param data the bytes to search in
+ * @param pattern the bytes to search for
+ * @return the index in data at which the first occurrence of the pattern
+ *         starts, or -1 if there is none; an empty data array always yields
+ *         -1, and an empty pattern matches at index 0 of non-empty data
+ */
+protected static int indexOf(byte[] data, byte[] pattern) {
+  int[] failure = computeFailure(pattern);
+
+  if (data.length == 0) {
+    return -1;
+  }
+  // without this guard the loop below would read pattern[0] of an empty
+  // array and fail with an ArrayIndexOutOfBoundsException
+  if (pattern.length == 0) {
+    return 0;
+  }
+
+  // j is the number of pattern bytes matched so far
+  int j = 0;
+  for (int i = 0; i < data.length; i++) {
+    // on a mismatch fall back to the longest shorter prefix that still fits
+    while (j > 0 && pattern[j] != data[i]) {
+      j = failure[j - 1];
+    }
+    if (pattern[j] == data[i]) {
+      j++;
+    }
+    if (j == pattern.length) {
+      return i - pattern.length + 1;
+    }
+  }
+  return -1;
+}
+
+/**
+* Computes the failure function using a boot-strapping process,
+* where the pattern is matched against itself.
+*/
+private static int[] computeFailure(byte[] pattern) {
+  int[] failure = new int[pattern.length];
+
+  int j = 0;
+  for (int i = 1; i < pattern.length; i++) {
+      while (j > 0 && pattern[j] != pattern[i]) {
+          j = failure[j - 1];
+      }
+      if (pattern[j] == pattern[i]) {
+          j++;
+      }
+      failure[i] = j;
+  }
+
+  return failure;
+}
+
   /**
    * Unpacks markup in the GATE-specific standoff XML markup format.
    * 

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most 
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to