Revision: 19677 http://sourceforge.net/p/gate/code/19677 Author: markagreenwood Date: 2016-10-14 05:51:46 +0000 (Fri, 14 Oct 2016) Log Message: ----------- Don't use the document content to determine whether the XML is in GATE format; instead, peek into the input stream.
Modified Paths: -------------- gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java Modified: gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java =================================================================== --- gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java 2016-10-14 05:51:04 UTC (rev 19676) +++ gate/trunk/plugins/Format_FastInfoset/src/gate/corpora/FastInfosetDocumentFormat.java 2016-10-14 05:51:46 UTC (rev 19677) @@ -105,11 +105,12 @@ }; // determine whether we have a GATE format document or not - String content = doc.getContent().toString(); + /*String content = doc.getContent().toString(); if(content.length() > 2048) { content = content.substring(0, 2048); - } - boolean gateFormat = isGateFIFormat(content); + }*/ + boolean gateFormat = isGateFIFormat(doc); + if(gateFormat) { unpackGateFormatMarkup(doc, statusListener); @@ -117,7 +118,74 @@ unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, statusListener); } } + + protected static boolean isGateFIFormat(Document doc) + throws DocumentFormatException { +try { + byte[] header = new byte[2048]; + + if(hasContentButNoValidUrl(doc)) { + String content = doc.getContent().toString(); + if(content.length() > 2048) { + content = content.substring(0, 2048); + } + header = content.getBytes(((TextualDocument)doc).getEncoding()); + } else { + IOUtils.read(doc.getSourceUrl().openStream(), header); + } + + int index = indexOf(header, + "GateDocument".getBytes(((TextualDocument)doc).getEncoding())); + + return index != -1; +} catch(IOException e) { + throw new DocumentFormatException(e); +} +} + +/** +* Finds the first occurrence of the pattern in the text. 
+*/ +protected static int indexOf(byte[] data, byte[] pattern) { + int[] failure = computeFailure(pattern); + + int j = 0; + if (data.length == 0) return -1; + + for (int i = 0; i < data.length; i++) { + while (j > 0 && pattern[j] != data[i]) { + j = failure[j - 1]; + } + if (pattern[j] == data[i]) { j++; } + if (j == pattern.length) { + return i - pattern.length + 1; + } + } + return -1; +} + +/** +* Computes the failure function using a boot-strapping process, +* where the pattern is matched against itself. +*/ +private static int[] computeFailure(byte[] pattern) { + int[] failure = new int[pattern.length]; + + int j = 0; + for (int i = 1; i < pattern.length; i++) { + while (j > 0 && pattern[j] != pattern[i]) { + j = failure[j - 1]; + } + if (pattern[j] == pattern[i]) { + j++; + } + failure[i] = j; + } + + return failure; +} + /** * Unpacks markup in the GATE-specific standoff XML markup format. * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, SlashDot.org! http://sdm.link/slashdot _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs