Revision: 18317 http://sourceforge.net/p/gate/code/18317 Author: ian_roberts Date: 2014-09-11 18:21:24 +0000 (Thu, 11 Sep 2014) Log Message: ----------- Proper handling of supplementary characters
Unicode "supplementary" characters above U+FFFF are represented in a Java string as a "surrogate pair" of two char values. Previously the tokeniser ignored this, working through the document text one char at a time and reporting all surrogates separately as the SURROGATE character type. In the absence of rules including SURROGATE, all surrogates were thus marked as DEFAULT_TOKEN with the side-effect that any attempt to save the annotated document as XML would destroy the original characters when the unpaired surrogates were stripped out. Now supplementary characters are treated as single units, and will be reported as the appropriate character type (OTHER_LETTER, OTHER_SYMBOL, etc.). The resulting token length is still in code units (char values), so a Token covering a single supplementary character will have length 2. Modified Paths: -------------- gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java Modified: gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java =================================================================== --- gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java 2014-09-11 11:54:57 UTC (rev 18316) +++ gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java 2014-09-11 18:21:24 UTC (rev 18317) @@ -616,16 +616,14 @@ String content = document.getContent().toString(); int length = content.length(); - char currentChar; + int currentChar; + int charsInCurrentCP = 1; DFSMState graphPosition = dInitialState; //the index of the first character of the token trying to be recognised int tokenStart = 0; - //the index of the last character of the last token recognised - int lastMatch = -1; - DFSMState lastMatchingState = null; DFSMState nextState; String tokenString; @@ -634,7 +632,12 @@ FeatureMap newTokenFm; while(charIdx < length){ - currentChar = content.charAt(charIdx); + currentChar = content.codePointAt(charIdx); + // number of chars we have to advance after processing this code point. 
+ // 1 in the vast majority of cases, but 2 where the code point is a + // supplementary character represented as a surrogate pair. + charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1; + // Out.println( // currentChar + typesMnemonics[Character.getType(currentChar)+128]); nextState = graphPosition.next(typeIds.get( @@ -643,15 +646,17 @@ if( null != nextState ) { graphPosition = nextState; if(graphPosition.isFinal()) { - lastMatch = charIdx; lastMatchingState = graphPosition; } - charIdx ++; + charIdx += charsInCurrentCP; } else {//we have a match! newTokenFm = Factory.newFeatureMap(); if (null == lastMatchingState) { - tokenString = content.substring(tokenStart, tokenStart +1); + // no rule matches this character, so create a single-char + // DEFAULT_TOKEN annotation covering it and start again after it + charIdx = tokenStart + charsInCurrentCP; + tokenString = content.substring(tokenStart, charIdx); newTokenFm.put("type","UNKNOWN"); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, @@ -659,7 +664,7 @@ try { annotationSet.add(new Long(tokenStart), - new Long(tokenStart + 1), + new Long(charIdx), "DEFAULT_TOKEN", newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! 
@@ -667,9 +672,9 @@ } // Out.println("Default token: " + tokenStart + // "->" + tokenStart + " :" + tokenString + ";"); - charIdx = tokenStart + 1; } else { - tokenString = content.substring(tokenStart, lastMatch + 1); + // we've reached the end of a string that the FSM recognised + tokenString = content.substring(tokenStart, charIdx); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); @@ -684,7 +689,7 @@ try { annotationSet.add(new Long(tokenStart), - new Long(lastMatch + 1), + new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch(InvalidOffsetException ioe) { //This REALLY shouldn't happen! @@ -694,9 +699,10 @@ // Out.println(lastMatchingState.getTokenDesc()[0][0] + // ": " + tokenStart + "->" + lastMatch + // " :" + tokenString + ";"); - charIdx = lastMatch + 1; + //charIdx = lastMatch + 1; } + // reset to initial state and start looking again from here lastMatchingState = null; graphPosition = dInitialState; tokenStart = charIdx; @@ -711,7 +717,8 @@ } // while(charIdx < length) if (null != lastMatchingState) { - tokenString = content.substring(tokenStart, lastMatch + 1); + // we dropped off the end having found a match, annotate it + tokenString = content.substring(tokenStart, charIdx); newTokenFm = Factory.newFeatureMap(); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, @@ -725,7 +732,7 @@ try { annotationSet.add(new Long(tokenStart), - new Long(lastMatch + 1), + new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch(InvalidOffsetException ioe) { //This REALLY shouldn't happen! This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Want excitement? Manually upgrade your production database. 
When you want reliability, choose Perforce. Perforce version control. Predictably reliable. http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs