Revision: 19676 http://sourceforge.net/p/gate/code/19676 Author: markagreenwood Date: 2016-10-14 05:51:04 +0000 (Fri, 14 Oct 2016) Log Message: ----------- IF EVERYTHING GOES PEAR SHAPED IT WILL BE THIS CHECKIN: made loading of document content lazy so we avoid loading it multiple times and triggering the horrible encoding synchronization bug. it might mess up some things around save original content and repositioning info, we'll need to check
Modified Paths: -------------- gate/trunk/src/main/gate/corpora/DocumentImpl.java gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java Modified: gate/trunk/src/main/gate/corpora/DocumentImpl.java =================================================================== --- gate/trunk/src/main/gate/corpora/DocumentImpl.java 2016-10-14 01:22:27 UTC (rev 19675) +++ gate/trunk/src/main/gate/corpora/DocumentImpl.java 2016-10-14 05:51:04 UTC (rev 19676) @@ -250,13 +250,14 @@ content = new DocumentContentImpl(stringContent); getFeatures().put("gate.SourceURL", "created from String"); } else { - try { - content = new DocumentContentImpl(sourceUrl, getEncoding(), - sourceUrlStartOffset, sourceUrlEndOffset); + //try { + //content = new DocumentContentImpl(sourceUrl, getEncoding(), + // sourceUrlStartOffset, sourceUrlEndOffset); + content = null; getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm()); - } catch(IOException e) { - throw new ResourceInstantiationException("DocumentImpl.init: " + e); - } + //} catch(IOException e) { + // throw new ResourceInstantiationException("DocumentImpl.init: " + e); + //} } if(preserveOriginalContent.booleanValue() && content != null) { String originalContent = new String(((DocumentContentImpl)content) @@ -646,6 +647,25 @@ /** The content of the document: a String for text; MPEG for video; etc. 
*/ @Override public DocumentContent getContent() { + if (content == null) { + if (sourceUrl != null) { + try { + content = new DocumentContentImpl(sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset); + if(preserveOriginalContent.booleanValue()) { + String originalContent = new String(((DocumentContentImpl)content) + .getOriginalContent()); + getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, + originalContent); + } + } catch(IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + else { + content = new DocumentContentImpl(""); + } + } return content; } @@ -1861,7 +1881,7 @@ // already know it does), make sure it ends at the end. if(anAnnotationList.size() == 1) { Annotation onlyAnn = anAnnotationList.get(0); - if(onlyAnn.getEndNode().getOffset().equals(content.size())) + if(onlyAnn.getEndNode().getOffset().equals(getContent().size())) return onlyAnn; return null; } @@ -2124,8 +2144,8 @@ public void edit(Long start, Long end, DocumentContent replacement) throws InvalidOffsetException { if(!isValidOffsetRange(start, end)) throw new InvalidOffsetException("Offsets: "+start+"/"+end); - if(content != null) - ((DocumentContentImpl)content).edit(start, end, replacement); + if(getContent() != null) + ((DocumentContentImpl)getContent()).edit(start, end, replacement); if(defaultAnnots != null) ((AnnotationSetImpl)defaultAnnots).edit(start, end, replacement); if(namedAnnotSets != null) { Modified: gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java =================================================================== --- gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java 2016-10-14 01:22:27 UTC (rev 19675) +++ gate/trunk/src/main/gate/corpora/DocumentStaxUtils.java 2016-10-14 05:51:04 UTC (rev 19676) @@ -144,8 +144,11 @@ } String documentText = readTextWithNodes(xsr, nodeIdToOffsetMap); + // TODO this is almost never needed and will cause the double + // loading so see if we can live without it for now 
// save the content, in case anything goes wrong later - savedContent = doc.getContent(); + //savedContent = doc.getContent(); + // set the document content to the text with nodes text. doc.setContent(new DocumentContentImpl(documentText)); Modified: gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java =================================================================== --- gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java 2016-10-14 01:22:27 UTC (rev 19675) +++ gate/trunk/src/main/gate/corpora/XmlDocumentFormat.java 2016-10-14 05:51:04 UTC (rev 19676) @@ -34,6 +34,7 @@ import java.io.InputStreamReader; import java.io.Reader; import java.io.StringReader; +import java.io.UnsupportedEncodingException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; @@ -42,6 +43,7 @@ import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; +import org.apache.commons.io.IOUtils; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -123,11 +125,11 @@ // determine whether we have a GATE format XML document or another // kind - String content = doc.getContent().toString(); + /*String content = doc.getContent().toString(); if(content.length() > 2048) { content = content.substring(0, 2048); - } - boolean gateFormat = isGateXmlFormat(content); + }*/ + boolean gateFormat = isGateXmlFormat(doc); if(gateFormat) { unpackGateFormatMarkup(doc, statusListener); @@ -354,11 +356,79 @@ * Determine whether the given document content string represents a * GATE custom format XML document. 
*/ + @Deprecated protected static boolean isGateXmlFormat(String content) { return (content.indexOf("<GateDocument") != -1 || content .indexOf(" GateDocument") != -1); } + + protected static boolean isGateXmlFormat(Document doc) + throws DocumentFormatException { + try { + byte[] header = new byte[2048]; + + if(hasContentButNoValidUrl(doc)) { + String content = doc.getContent().toString(); + if(content.length() > 2048) { + content = content.substring(0, 2048); + } + header = content.getBytes(((TextualDocument)doc).getEncoding()); + } else { + IOUtils.read(doc.getSourceUrl().openStream(), header); + } + + int index = indexOf(header, + "GateDocument".getBytes(((TextualDocument)doc).getEncoding())); + + return index != -1; + } catch(IOException e) { + throw new DocumentFormatException(e); + } + } + + /** + * Finds the first occurrence of the pattern in the text. + */ + protected static int indexOf(byte[] data, byte[] pattern) { + int[] failure = computeFailure(pattern); + + int j = 0; + if (data.length == 0) return -1; + + for (int i = 0; i < data.length; i++) { + while (j > 0 && pattern[j] != data[i]) { + j = failure[j - 1]; + } + if (pattern[j] == data[i]) { j++; } + if (j == pattern.length) { + return i - pattern.length + 1; + } + } + return -1; + } + + /** + * Computes the failure function using a boot-strapping process, + * where the pattern is matched against itself. + */ + private static int[] computeFailure(byte[] pattern) { + int[] failure = new int[pattern.length]; + + int j = 0; + for (int i = 1; i < pattern.length; i++) { + while (j > 0 && pattern[j] != pattern[i]) { + j = failure[j - 1]; + } + if (pattern[j] == pattern[i]) { + j++; + } + failure[i] = j; + } + + return failure; + } + /** Initialise this resource, and return it. */ @Override public Resource init() throws ResourceInstantiationException { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. 
------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs