[cp-patches] RFC: gnu.regexp: Unicode-aware case folding

Ito Kazumitsu Sat, 08 Apr 2006 09:31:31 -0700

Hi,

gnu.regexp, by default, is Unicode-aware when it performs
case-insensitive matching, i.e. it uses Character.toLowerCase or
Character.toUpperCase.


On the other hand, java.util.regex, by default, should be aware
only of the US-ASCII alphabet when it performs case-insensitive
matching; Unicode-aware folding applies there only when the
UNICODE_CASE flag is given.

This patch fills the gap.

ChangeLog:
2006-04-08  Ito Kazumitsu  <[EMAIL PROTECTED]>

        * gnu/regexp/REToken.java(unicodeAware): New field,
        (toLowerCase, toUpperCase): New methods.
        * gnu/regexp/RETokenBackRef.java, gnu/regexp/RETokenChar.java,
        gnu/regexp/RETokenNamedProperty.java, gnu/regexp/RETokenRange.java:
        Use toLowerCase and toUpperCase defined in REToken instead of
        those defined in java.lang.Character.
        * gnu/regexp/RE.java(REG_ICASE_USASCII): New flag.
        (initialize): Sets unicodeAware of the generated REToken to false if
        REG_ICASE_USASCII is set.
        * java/util/regex/Pattern.java: Set the flag REG_ICASE_USASCII by
        default and clear it when UNICODE_CASE is specified.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.22
diff -u -r1.22 RE.java
--- classpath/gnu/regexp/RE.java        4 Apr 2006 16:20:50 -0000       1.22
+++ classpath/gnu/regexp/RE.java        8 Apr 2006 16:03:47 -0000
@@ -245,9 +245,13 @@
    * Compilation flag. Allow whitespace and comments in pattern.
    * This is equivalent to the "/x" operator in Perl.
    */
-
   public static final int REG_X_COMMENTS = 0x0400;
 
+  /**
+   * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
+   */
+  public static final int REG_ICASE_USASCII = 0x0800;
+
   /** Returns a string representing the version of the gnu.regexp package. */
   public static final String version() {
     return VERSION;
@@ -347,6 +351,7 @@
     // Precalculate these so we don't pay for the math every time we
     // need to access them.
     boolean insens = ((cflags & REG_ICASE) > 0);
+    boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
 
     // Parse pattern into tokens.  Does anyone know if it's more efficient
     // to use char[] than a String.charAt()?  I'm assuming so.
@@ -458,6 +463,7 @@
         else {
           addToken(currentToken);
           currentToken = new RETokenChar(subIndex,unit.ch,insens);
+         if (insensUSASCII) currentToken.unicodeAware = false;
         } 
       }
       
@@ -533,7 +539,7 @@
          case 'd':
          case 'm':
          case 's':
-         // case 'u':  not supported
+         case 'u':
          case 'x':
          case '-':
             if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
@@ -573,7 +579,13 @@
                    newCflags |= REG_DOT_NEWLINE;
                  flagIndex++;
                  break;
-               // case 'u': not supported
+               case 'u':
+                 if (negate)
+                   newCflags |= REG_ICASE_USASCII;
+                 else
+                   newCflags &= ~REG_ICASE_USASCII;
+                 flagIndex++;
+                 break;
                case 'x':
                  if (negate)
                    newCflags &= ~REG_X_COMMENTS;
@@ -597,6 +609,7 @@
                syntax = newSyntax;
                cflags = newCflags;
                insens = ((cflags & REG_ICASE) > 0);
+               insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
                // This can be treated as though it were a comment.
                comment = true;
                index = flagIndex - 1;
@@ -609,6 +622,7 @@
                syntax = newSyntax;
                cflags = newCflags;
                insens = ((cflags & REG_ICASE) > 0);
+               insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
                index = flagIndex -1;
                // Fall through to the next case.
            }
@@ -717,6 +731,7 @@
              syntax = savedSyntax;
              cflags = savedCflags;
              insens = ((cflags & REG_ICASE) > 0);
+             insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
              flagsSaved = false;
          }
        } // not a comment
@@ -829,6 +844,7 @@
        index = index - 2 + ce.len;
        addToken(currentToken);
        currentToken = new RETokenChar(subIndex,ce.ch,insens);
+       if (insensUSASCII) currentToken.unicodeAware = false;
       }
 
       // BACKREFERENCE OPERATOR
@@ -856,6 +872,7 @@
        int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
 
        currentToken = new RETokenBackRef(subIndex,num,insens);
+       if (insensUSASCII) currentToken.unicodeAware = false;
        index = numEnd;
       }
 
@@ -904,6 +921,7 @@
       else if (unit.bk && (unit.ch == 'd') && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
        addToken(currentToken);
        currentToken = new 
RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
+       if (insensUSASCII) currentToken.unicodeAware = false;
       }
 
       // NON-DIGIT OPERATOR
@@ -912,6 +930,7 @@
        else if (unit.bk && (unit.ch == 'D') && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
          addToken(currentToken);
          currentToken = new 
RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        }
 
        // NEWLINE ESCAPE
@@ -936,6 +955,7 @@
        else if (unit.bk && (unit.ch == 's') && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
          addToken(currentToken);
          currentToken = new 
RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        }
 
        // NON-WHITESPACE OPERATOR
@@ -944,6 +964,7 @@
        else if (unit.bk && (unit.ch == 'S') && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
          addToken(currentToken);
          currentToken = new 
RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        }
 
        // TAB ESCAPE
@@ -960,6 +981,7 @@
        else if (unit.bk && (unit.ch == 'w') && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
          addToken(currentToken);
          currentToken = new 
RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        }
 
        // NON-ALPHANUMERIC OPERATOR
@@ -968,6 +990,7 @@
        else if (unit.bk && (unit.ch == 'W') && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
          addToken(currentToken);
          currentToken = new 
RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        }
 
        // END OF STRING OPERATOR
@@ -995,6 +1018,7 @@
          index = index - 2 + ce.len;
          addToken(currentToken);
          currentToken = new RETokenChar(subIndex,ce.ch,insens);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        }
 
        // NAMED PROPERTY
@@ -1008,6 +1032,7 @@
          index = index - 2 + np.len;
          addToken(currentToken);
          currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        }
 
        // END OF PREVIOUS MATCH
@@ -1025,6 +1050,7 @@
        else {  // not a special character
          addToken(currentToken);
          currentToken = new RETokenChar(subIndex,unit.ch,insens);
+         if (insensUSASCII) currentToken.unicodeAware = false;
        } 
       } // end while
 
@@ -1065,6 +1091,7 @@
                throws REException {
 
        boolean insens = ((cflags & REG_ICASE) > 0);
+       boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
        Vector options = new Vector();
        Vector addition = new Vector();
        boolean additionAndAppeared = false;
@@ -1094,7 +1121,9 @@
          if ((ch == '-') && (lastCharIsSet)) {
            if (index == pLength) throw new 
REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
            if ((ch = pattern[index]) == ']') {
-             options.addElement(new RETokenChar(subIndex,lastChar,insens));
+             RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+             if (insensUSASCII) t.unicodeAware = false;
+             options.addElement(t);
              lastChar = '-';
            } else {
              if ((ch == '\\') && 
syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
@@ -1104,7 +1133,9 @@
                ch = ce.ch;
                index = index + ce.len - 1;
              }
-             options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+             RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens);
+             if (insensUSASCII) t.unicodeAware = false;
+             options.addElement(t);
              lastChar = 0; lastCharIsSet = false;
              index++;
            }
@@ -1147,12 +1178,20 @@
              asciiEsc = ce.ch; asciiEscIsSet = true;
              index = index - 1 + ce.len - 1;
            }
-           if (lastCharIsSet) options.addElement(new 
RETokenChar(subIndex,lastChar,insens));
+           if (lastCharIsSet) {
+             RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+             if (insensUSASCII) t.unicodeAware = false;
+             options.addElement(t);
+           }
            
            if (posixID != -1) {
-             options.addElement(new 
RETokenPOSIX(subIndex,posixID,insens,negate));
+             RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate);
+             if (insensUSASCII) t.unicodeAware = false;
+             options.addElement(t);
            } else if (np != null) {
-             
options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+             RETokenNamedProperty t = 
getRETokenNamedProperty(subIndex,np,insens,index);
+             if (insensUSASCII) t.unicodeAware = false;
+             options.addElement(t);
            } else if (asciiEscIsSet) {
              lastChar = asciiEsc; lastCharIsSet = true;
            } else {
@@ -1163,8 +1202,11 @@
            StringBuffer posixSet = new StringBuffer();
            index = getPosixSet(pattern,index+1,posixSet);
            int posixId = RETokenPOSIX.intValue(posixSet.toString());
-           if (posixId != -1)
-             options.addElement(new 
RETokenPOSIX(subIndex,posixId,insens,false));
+           if (posixId != -1) {
+             RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false);
+             if (insensUSASCII) t.unicodeAware = false;
+             options.addElement(t);
+           }
          } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) 
{
                ParseCharClassResult result = parseCharClass(
                    subIndex, pattern, index, pLength, cflags, syntax, 0);
@@ -1217,14 +1259,22 @@
                        result.index: result.index - 1);
                }
          } else {
-           if (lastCharIsSet) options.addElement(new 
RETokenChar(subIndex,lastChar,insens));
+           if (lastCharIsSet) {
+             RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+             if (insensUSASCII) t.unicodeAware = false;
+             options.addElement(t);
+           }
            lastChar = ch; lastCharIsSet = true;
          }
          if (index == pLength) throw new 
REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
        } // while in list
        // Out of list, index is one past ']'
            
-       if (lastCharIsSet) options.addElement(new 
RETokenChar(subIndex,lastChar,insens));
+       if (lastCharIsSet) {
+         RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+         if (insensUSASCII) t.unicodeAware = false;
+         options.addElement(t);
+       }
           
        ParseCharClassResult result = new ParseCharClassResult(); 
        // Create a new RETokenOneOf
Index: classpath/gnu/regexp/REToken.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REToken.java,v
retrieving revision 1.5
diff -u -r1.5 REToken.java
--- classpath/gnu/regexp/REToken.java   11 Mar 2006 01:39:49 -0000      1.5
+++ classpath/gnu/regexp/REToken.java   8 Apr 2006 16:03:47 -0000
@@ -43,6 +43,7 @@
   protected REToken next = null;
   protected REToken uncle = null;
   protected int subIndex;
+  protected boolean unicodeAware = true;
 
   public Object clone() {
     try {
@@ -157,4 +158,32 @@
     return os.toString();
   }
 
+  /**
+    * Converts the character argument to lowercase.
+    * @param ch the character to be converted.
+    * @param unicodeAware If true, use java.lang.Character#toLowerCase;
+    * otherwise, only US-ASCII characters can be converted.
+    * @return the lowercase equivalent of the character, if any;
+    * otherwise, the character itself.
+    */
+  public static char toLowerCase(char ch, boolean unicodeAware) {
+    if (unicodeAware) return Character.toLowerCase(ch);
+    if (ch >= 'A' && ch <= 'Z') return (char)(ch + 'a' - 'A');
+    return ch;
+  }
+
+  /**
+    * Converts the character argument to uppercase.
+    * @param ch the character to be converted.
+    * @param unicodeAware If true, use java.lang.Character#toUpperCase;
+    * otherwise, only US-ASCII characters can be converted.
+    * @return the uppercase equivalent of the character, if any;
+    * otherwise, the character itself.
+    */
+  public static char toUpperCase(char ch, boolean unicodeAware) {
+    if (unicodeAware) return Character.toUpperCase(ch);
+    if (ch >= 'a' && ch <= 'z') return (char)(ch + 'A' - 'a');
+    return ch;
+  }
+
 }
Index: classpath/gnu/regexp/RETokenBackRef.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenBackRef.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenBackRef.java
--- classpath/gnu/regexp/RETokenBackRef.java    18 Mar 2006 00:43:11 -0000      
1.6
+++ classpath/gnu/regexp/RETokenBackRef.java    8 Apr 2006 16:03:47 -0000
@@ -64,8 +64,8 @@
            char c2 = input.charAt(i);
            if (c1 != c2) {
                if (insens) {
-                   if (c1 != Character.toLowerCase(c2) &&
-                       c1 != Character.toUpperCase(c2)) {
+                   if (c1 != toLowerCase(c2, unicodeAware) &&
+                       c1 != toUpperCase(c2, unicodeAware)) {
                        return null;
                    }
                }
Index: classpath/gnu/regexp/RETokenChar.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenChar.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenChar.java
--- classpath/gnu/regexp/RETokenChar.java       11 Mar 2006 01:39:49 -0000      
1.5
+++ classpath/gnu/regexp/RETokenChar.java       8 Apr 2006 16:03:47 -0000
@@ -45,7 +45,7 @@
   RETokenChar(int subIndex, char c, boolean ins) {
     super(subIndex);
     ch = new char [1];
-    ch[0] = (insens = ins) ? Character.toLowerCase(c) : c;
+    ch[0] = (insens = ins) ? toLowerCase(c, unicodeAware) : c;
   }
 
   int getMinimumLength() {
@@ -70,7 +70,7 @@
        char c;
        for (int i=0; i<z; i++) {
            c = input.charAt(index+i);
-           if (( (insens) ? Character.toLowerCase(c) : c ) != ch[i]) {
+           if (( (insens) ? toLowerCase(c, unicodeAware) : c ) != ch[i]) {
                return false;
            }
        }
Index: classpath/gnu/regexp/RETokenNamedProperty.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenNamedProperty.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenNamedProperty.java
--- classpath/gnu/regexp/RETokenNamedProperty.java      11 Mar 2006 01:39:49 
-0000      1.5
+++ classpath/gnu/regexp/RETokenNamedProperty.java      8 Apr 2006 16:03:47 
-0000
@@ -122,8 +122,8 @@
     boolean retval = handler.includes(ch);
     if (insens) {
         retval = retval ||
-                 handler.includes(Character.toUpperCase(ch)) ||
-                 handler.includes(Character.toLowerCase(ch));
+                 handler.includes(toUpperCase(ch, unicodeAware)) ||
+                 handler.includes(toLowerCase(ch, unicodeAware));
     }
 
     if (negate) retval = !retval;
Index: classpath/gnu/regexp/RETokenRange.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRange.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenRange.java
--- classpath/gnu/regexp/RETokenRange.java      11 Mar 2006 01:39:49 -0000      
1.5
+++ classpath/gnu/regexp/RETokenRange.java      8 Apr 2006 16:03:47 -0000
@@ -69,10 +69,10 @@
        if (c == CharIndexed.OUT_OF_BOUNDS) return false;
        boolean matches = (c >= lo) && (c <= hi);
        if (! matches && insens) {
-         char c1 = Character.toLowerCase(c);
+         char c1 = toLowerCase(c, unicodeAware);
          matches = (c1 >= lo) && (c1 <= hi);
          if (!matches) {
-           c1 = Character.toUpperCase(c);
+           c1 = toUpperCase(c, unicodeAware);
            matches = (c1 >= lo) && (c1 <= hi);
          }
        }
Index: classpath/java/util/regex/Pattern.java
===================================================================
RCS file: /cvsroot/classpath/classpath/java/util/regex/Pattern.java,v
retrieving revision 1.15
diff -u -r1.15 Pattern.java
--- classpath/java/util/regex/Pattern.java      22 Mar 2006 22:25:00 -0000      
1.15
+++ classpath/java/util/regex/Pattern.java      8 Apr 2006 16:03:47 -0000
@@ -74,14 +74,16 @@
     this.flags = flags;
 
     int gnuFlags = 0;
+    gnuFlags |= RE.REG_ICASE_USASCII;
     if ((flags & CASE_INSENSITIVE) != 0)
       gnuFlags |= RE.REG_ICASE;
     if ((flags & MULTILINE) != 0)
       gnuFlags |= RE.REG_MULTILINE;
     if ((flags & DOTALL) != 0)
       gnuFlags |= RE.REG_DOT_NEWLINE;
+    if ((flags & UNICODE_CASE) != 0)
+      gnuFlags &= ~RE.REG_ICASE_USASCII;
     // not yet supported:
-    // if ((flags & UNICODE_CASE) != 0) gnuFlags =
     // if ((flags & CANON_EQ) != 0) gnuFlags =
 
     RESyntax syntax = RESyntax.RE_SYNTAX_JAVA_1_4;

[cp-patches] RFC: gnu.regexp: Unicode-aware case folding

Reply via email to