SimpleTokeniser.java

ian_roberts Thu, 11 Sep 2014 11:22:04 -0700

Revision: 18317
          http://sourceforge.net/p/gate/code/18317
Author:   ian_roberts
Date:     2014-09-11 18:21:24 +0000 (Thu, 11 Sep 2014)
Log Message:
-----------
Proper handling of supplementary characters


Unicode "supplementary" characters above U+FFFF are represented in a Java
string as a "surrogate pair" of two char values.  Previously the tokeniser
ignored this: it worked through the document text one char at a time and
reported each surrogate separately as the SURROGATE character type.  Unless
the tokeniser rules included SURROGATE, every surrogate was therefore marked
as its own DEFAULT_TOKEN.  As a side-effect, any attempt to save the annotated
document as XML would destroy the original characters when the unpaired
surrogates were stripped out.

Now supplementary characters are treated as single units, and will be reported
as the appropriate character type (OTHER_LETTER, OTHER_SYMBOL, etc.).  The
resulting token length is still in code units (char values), so a Token
covering a single supplementary character will have length 2.

Modified Paths:
--------------
    gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java

Modified: gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java
===================================================================
--- gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java      2014-09-11 11:54:57 UTC (rev 18316)
+++ gate/trunk/src/main/gate/creole/tokeniser/SimpleTokeniser.java      2014-09-11 18:21:24 UTC (rev 18317)
@@ -616,16 +616,14 @@
 
     String content = document.getContent().toString();
     int length = content.length();
-    char currentChar;
+    int currentChar;
+    int charsInCurrentCP = 1;
 
     DFSMState graphPosition = dInitialState;
 
     //the index of the first character of the token trying to be recognised
     int tokenStart = 0;
 
-    //the index of the last character of the last token recognised
-    int lastMatch = -1;
-
     DFSMState lastMatchingState = null;
     DFSMState nextState;
     String tokenString;
@@ -634,7 +632,12 @@
     FeatureMap newTokenFm;
 
     while(charIdx < length){
-      currentChar = content.charAt(charIdx);
+      currentChar = content.codePointAt(charIdx);
+      // number of chars we have to advance after processing this code point.
+      // 1 in the vast majority of cases, but 2 where the code point is a
+      // supplementary character represented as a surrogate pair.
+      charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 
1;
+      
 //      Out.println(
 //      currentChar + typesMnemonics[Character.getType(currentChar)+128]);
       nextState = graphPosition.next(typeIds.get(
@@ -643,15 +646,17 @@
       if( null != nextState ) {
         graphPosition = nextState;
         if(graphPosition.isFinal()) {
-          lastMatch = charIdx;
           lastMatchingState = graphPosition;
         }
-        charIdx ++;
+        charIdx += charsInCurrentCP;
       } else {//we have a match!
         newTokenFm = Factory.newFeatureMap();
 
         if (null == lastMatchingState) {
-          tokenString = content.substring(tokenStart, tokenStart +1);
+          // no rule matches this character, so create a single-char
+          // DEFAULT_TOKEN annotation covering it and start again after it
+          charIdx  = tokenStart + charsInCurrentCP;
+          tokenString = content.substring(tokenStart, charIdx);
           newTokenFm.put("type","UNKNOWN");
           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
@@ -659,7 +664,7 @@
 
           try {
             annotationSet.add(new Long(tokenStart),
-                              new Long(tokenStart + 1),
+                              new Long(charIdx),
                               "DEFAULT_TOKEN", newTokenFm);
           } catch (InvalidOffsetException ioe) {
             //This REALLY shouldn't happen!
@@ -667,9 +672,9 @@
           }
           // Out.println("Default token: " + tokenStart +
           //             "->" + tokenStart + " :" + tokenString + ";");
-          charIdx  = tokenStart + 1;
         } else {
-          tokenString = content.substring(tokenStart, lastMatch + 1);
+          // we've reached the end of a string that the FSM recognised
+          tokenString = content.substring(tokenStart, charIdx);
           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                          Integer.toString(tokenString.length()));
@@ -684,7 +689,7 @@
 
           try {
             annotationSet.add(new Long(tokenStart),
-                            new Long(lastMatch + 1),
+                            new Long(charIdx),
                             lastMatchingState.getTokenDesc()[0][0], 
newTokenFm);
           } catch(InvalidOffsetException ioe) {
             //This REALLY shouldn't happen!
@@ -694,9 +699,10 @@
           // Out.println(lastMatchingState.getTokenDesc()[0][0] +
           //              ": " + tokenStart + "->" + lastMatch +
           //              " :" + tokenString + ";");
-          charIdx = lastMatch + 1;
+          //charIdx = lastMatch + 1;
         }
 
+        // reset to initial state and start looking again from here
         lastMatchingState = null;
         graphPosition = dInitialState;
         tokenStart = charIdx;
@@ -711,7 +717,8 @@
     } // while(charIdx < length)
 
     if (null != lastMatchingState) {
-      tokenString = content.substring(tokenStart, lastMatch + 1);
+      // we dropped off the end having found a match, annotate it
+      tokenString = content.substring(tokenStart, charIdx);
       newTokenFm = Factory.newFeatureMap();
       newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
       newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
@@ -725,7 +732,7 @@
 
       try {
         annotationSet.add(new Long(tokenStart),
-                          new Long(lastMatch + 1),
+                          new Long(charIdx),
                           lastMatchingState.getTokenDesc()[0][0], newTokenFm);
       } catch(InvalidOffsetException ioe) {
         //This REALLY shouldn't happen!

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Want excitement?
Manually upgrade your production database.
When you want reliability, choose Perforce
Perforce version control. Predictably reliable.
http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

[gate-cvs] SF.net SVN: gate:[18317] gate/trunk/src/main/gate/creole/tokeniser/ SimpleTokeniser.java

Reply via email to