Revision: 19676
          http://sourceforge.net/p/gate/code/19676
Author:   markagreenwood
Date:     2016-10-14 05:51:04 +0000 (Fri, 14 Oct 2016)
Log Message:
-----------
IF EVERYTHING GOES PEAR SHAPED IT WILL BE THIS CHECKIN: made loading of 
document content lazy so we avoid loading it multiple times and triggering the 
horrible encoding synchronization bug. It might mess up some things around save 
original content and repositioning info; we'll need to check.

Modified Paths:
--------------
    gate/trunk/src/main/gate/corpora/DocumentImpl.java
    gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java
    gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java

Modified: gate/trunk/src/main/gate/corpora/DocumentImpl.java
===================================================================
--- gate/trunk/src/main/gate/corpora/DocumentImpl.java  2016-10-14 01:22:27 UTC (rev 19675)
+++ gate/trunk/src/main/gate/corpora/DocumentImpl.java  2016-10-14 05:51:04 UTC (rev 19676)
@@ -250,13 +250,14 @@
       content = new DocumentContentImpl(stringContent);
       getFeatures().put("gate.SourceURL", "created from String");
     } else {
-      try {
-        content = new DocumentContentImpl(sourceUrl, getEncoding(),
-                sourceUrlStartOffset, sourceUrlEndOffset);
+      //try {
+        //content = new DocumentContentImpl(sourceUrl, getEncoding(),
+        //        sourceUrlStartOffset, sourceUrlEndOffset);
+        content = null;
         getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
-      } catch(IOException e) {
-        throw new ResourceInstantiationException("DocumentImpl.init: " + e);
-      }
+      //} catch(IOException e) {
+      //  throw new ResourceInstantiationException("DocumentImpl.init: " + e);
+      //}
     }
     if(preserveOriginalContent.booleanValue() && content != null) {
       String originalContent = new String(((DocumentContentImpl)content)
@@ -646,6 +647,25 @@
   /**
    * The content of the document: a String for text; MPEG for video; etc.
    * If no content has been set yet it is created here on first access:
    * from sourceUrl when that is non-null, otherwise as empty content.
    *
    * @return the document content; NOTE(review): null when loading from
    *         sourceUrl fails with an IOException (see the catch block)
    */
   @Override
   public DocumentContent getContent() {
+    if (content == null) { // nothing loaded yet: build the content now
+      if (sourceUrl != null) { // a source URL is set: read the content from it
+        try {
+          content = new DocumentContentImpl(sourceUrl, getEncoding(), 
sourceUrlStartOffset, sourceUrlEndOffset);
+          if(preserveOriginalContent.booleanValue()) { // also store the original text as a document feature
+            String originalContent = new String(((DocumentContentImpl)content)
+                    .getOriginalContent());
+            
getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
+                    originalContent);
+          }
+        } catch(IOException e) {
+          // NOTE(review): the failure is only printed, so content stays null and null is returned
+          e.printStackTrace();
+        }
+      }
+      else { // no source URL: fall back to empty content
+        content = new DocumentContentImpl("");
+      }
+    }
     return content;
   }
 
@@ -1861,7 +1881,7 @@
     // already know it does), make sure it ends at the end.
     if(anAnnotationList.size() == 1) {
       Annotation onlyAnn = anAnnotationList.get(0);
-      if(onlyAnn.getEndNode().getOffset().equals(content.size()))
+      if(onlyAnn.getEndNode().getOffset().equals(getContent().size()))
         return onlyAnn;
       return null;
     }
@@ -2124,8 +2144,8 @@
   public void edit(Long start, Long end, DocumentContent replacement)
           throws InvalidOffsetException {
     if(!isValidOffsetRange(start, end)) throw new 
InvalidOffsetException("Offsets: "+start+"/"+end);
-    if(content != null)
-      ((DocumentContentImpl)content).edit(start, end, replacement);
+    if(getContent() != null)
+      ((DocumentContentImpl)getContent()).edit(start, end, replacement);
     if(defaultAnnots != null)
       ((AnnotationSetImpl)defaultAnnots).edit(start, end, replacement);
     if(namedAnnotSets != null) {

Modified: gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java
===================================================================
--- gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java     2016-10-14 01:22:27 UTC (rev 19675)
+++ gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java     2016-10-14 05:51:04 UTC (rev 19676)
@@ -144,8 +144,11 @@
     }
     String documentText = readTextWithNodes(xsr, nodeIdToOffsetMap);
 
+    // TODO this is almost never needed and will cause the double
+    // loading so see if we can live without it for now 
     // save the content, in case anything goes wrong later
-    savedContent = doc.getContent();
+    //savedContent = doc.getContent();
+    
     // set the document content to the text with nodes text.
     doc.setContent(new DocumentContentImpl(documentText));
 

Modified: gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java
===================================================================
--- gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java     2016-10-14 01:22:27 UTC (rev 19675)
+++ gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java     2016-10-14 05:51:04 UTC (rev 19676)
@@ -34,6 +34,7 @@
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
 
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
@@ -42,6 +43,7 @@
 import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.XMLStreamReader;
 
+import org.apache.commons.io.IOUtils;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
@@ -123,11 +125,11 @@
 
     // determine whether we have a GATE format XML document or another
     // kind
-    String content = doc.getContent().toString();
+    /*String content = doc.getContent().toString();
     if(content.length() > 2048) {
       content = content.substring(0, 2048);
-    }
-    boolean gateFormat = isGateXmlFormat(content);
+    }*/
+    boolean gateFormat = isGateXmlFormat(doc);
 
     if(gateFormat) {
       unpackGateFormatMarkup(doc, statusListener);
@@ -354,11 +356,79 @@
    * Determine whether the given document content string represents a
    * GATE custom format XML document.
    */
+  @Deprecated // NOTE(review): presumably superseded by isGateXmlFormat(Document) - confirm
   protected static boolean isGateXmlFormat(String content) {
     // true when either marker substring occurs anywhere in the given string
     return (content.indexOf("<GateDocument") != -1 || content
             .indexOf(" GateDocument") != -1);
   }
+  /**
+   * Determines whether the given document is a GATE custom format XML
+   * document by searching a sample taken from the start of the document
+   * for the bytes of "GateDocument", encoded with the document's encoding.
+   *
+   * The sample is the first 2048 characters of doc.getContent() when
+   * hasContentButNoValidUrl(doc) is true; otherwise it is read into a
+   * 2048-byte buffer from doc.getSourceUrl().openStream().
+   *
+   * NOTE(review): the stream returned by openStream() is not closed in
+   * this method - confirm nothing else is expected to close it.
+   * NOTE(review): doc is cast to TextualDocument without a type check.
+   *
+   * @param doc the document to inspect
+   * @return true if "GateDocument" occurs in the sample
+   * @throws DocumentFormatException wrapping any IOException thrown while
+   *         reading or encoding the sample
+   */
+  protected static boolean isGateXmlFormat(Document doc)
+          throws DocumentFormatException {
 
+    try {
+      byte[] header = new byte[2048]; // zero-filled buffer for the start of the document
+
+      if(hasContentButNoValidUrl(doc)) { // take the sample from the document content
+        String content = doc.getContent().toString();
+        if(content.length() > 2048) {
+          content = content.substring(0, 2048);
+        }
+        header = content.getBytes(((TextualDocument)doc).getEncoding()); // replaces the buffer with the encoded sample
+      } else { // otherwise sample the bytes behind the source URL
+        IOUtils.read(doc.getSourceUrl().openStream(), header); // NOTE(review): stream is never closed here
+      }
+
+      int index = indexOf(header,
+              "GateDocument".getBytes(((TextualDocument)doc).getEncoding()));
+
+      return index != -1; // marker found somewhere in the sample
+    } catch(IOException e) {
+      throw new DocumentFormatException(e); // report I/O problems as a format error
+    }
+  }
+  
+  /**
+   * Finds the first occurrence of the pattern in the text, using a
+   * Knuth-Morris-Pratt style scan driven by the table returned from
+   * computeFailure(pattern).
+   *
+   * NOTE(review): with an empty pattern and non-empty data the loop reads
+   * pattern[0] and throws ArrayIndexOutOfBoundsException.
+   *
+   * @param data the bytes to search in
+   * @param pattern the byte sequence to look for
+   * @return the index in data at which the first match starts, or -1 if
+   *         data is empty or contains no match
+   */
+  protected static int indexOf(byte[] data, byte[] pattern) {
+      int[] failure = computeFailure(pattern); // fallback table used after a mismatch
+
+      int j = 0; // number of pattern bytes matched so far
+      if (data.length == 0) return -1; // nothing to search
+
+      for (int i = 0; i < data.length; i++) {
+          while (j > 0 && pattern[j] != data[i]) { // mismatch: fall back to a shorter matched prefix
+              j = failure[j - 1];
+          }
+          if (pattern[j] == data[i]) { j++; } // current byte extends the match
+          if (j == pattern.length) { // whole pattern matched, ending at index i
+              return i - pattern.length + 1;
+          }
+      }
+      return -1; // reached the end of data without a full match
+  }
+
+  /**
+   * Computes the failure function using a boot-strapping process,
+   * where the pattern is matched against itself.
+   *
+   * Entry i of the result is the length of the longest proper prefix of
+   * pattern[0..i] that is also a suffix of it; entry 0 is never written
+   * and so keeps the array default of 0. An empty pattern yields an
+   * empty array.
+   *
+   * @param pattern the byte sequence to build the table for
+   * @return an int array of the same length as pattern
+   */
+  private static int[] computeFailure(byte[] pattern) {
+      int[] failure = new int[pattern.length];
+
+      int j = 0; // length of the prefix currently matched against the suffix ending at i
+      for (int i = 1; i < pattern.length; i++) {
+          while (j > 0 && pattern[j] != pattern[i]) { // mismatch: retry with the next shorter prefix
+              j = failure[j - 1];
+          }
+          if (pattern[j] == pattern[i]) {
+              j++;
+          }
+          failure[i] = j;
+      }
+
+      return failure;
+  }
+
   /** Initialise this resource, and return it. */
   @Override
   public Resource init() throws ResourceInstantiationException {

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most 
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to