Hi,
gnu.regexp, by default, is Unicode-aware when it performs
case-insensitive matching, i.e. it uses Character.toLowerCase or
Character.toUpperCase.
On the other hand, java.util.regex, by default, should be aware
only of the US-ASCII alphabet when it performs case-insensitive
matching.
This patch fills the gap.
ChangeLog:
2006-04-08 Ito Kazumitsu <[EMAIL PROTECTED]>
* gnu/regexp/REToken.java(unicodeAware): New field,
(toLowerCase, toUpperCase): New methods.
* gnu/regexp/RETokenBackRef.java, gnu/regexp/RETokenChar.java,
gnu/regexp/RETokenNamedProperty.java, gnu/regexp/RETokenRange.java:
Use toLowerCase and toUpperCase defined in REToken instead of
those defined in java.lang.Character.
* gnu/regexp/RE.java(REG_ICASE_USASCII): New flag.
(initialize): Sets unicodeAware of the generated REToken to false if
REG_ICASE_USASCII is set.
* java/util/regex/Pattern.java: Sets the flag REG_ICASE_USASCII by default and clears it when UNICODE_CASE is specified.
Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.22
diff -u -r1.22 RE.java
--- classpath/gnu/regexp/RE.java 4 Apr 2006 16:20:50 -0000 1.22
+++ classpath/gnu/regexp/RE.java 8 Apr 2006 16:03:47 -0000
@@ -245,9 +245,13 @@
* Compilation flag. Allow whitespace and comments in pattern.
* This is equivalent to the "/x" operator in Perl.
*/
-
public static final int REG_X_COMMENTS = 0x0400;
+ /**
+ * Compilation flag. If set, REG_ICASE is effective only for US-ASCII.
+ */
+ public static final int REG_ICASE_USASCII = 0x0800;
+
/** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
return VERSION;
@@ -347,6 +351,7 @@
// Precalculate these so we don't pay for the math every time we
// need to access them.
boolean insens = ((cflags & REG_ICASE) > 0);
+ boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
// Parse pattern into tokens. Does anyone know if it's more efficient
// to use char[] than a String.charAt()? I'm assuming so.
@@ -458,6 +463,7 @@
else {
addToken(currentToken);
currentToken = new RETokenChar(subIndex,unit.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
}
@@ -533,7 +539,7 @@
case 'd':
case 'm':
case 's':
- // case 'u': not supported
+ case 'u':
case 'x':
case '-':
if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
@@ -573,7 +579,13 @@
newCflags |= REG_DOT_NEWLINE;
flagIndex++;
break;
- // case 'u': not supported
+ case 'u':
+ if (negate)
+ newCflags |= REG_ICASE_USASCII;
+ else
+ newCflags &= ~REG_ICASE_USASCII;
+ flagIndex++;
+ break;
case 'x':
if (negate)
newCflags &= ~REG_X_COMMENTS;
@@ -597,6 +609,7 @@
syntax = newSyntax;
cflags = newCflags;
insens = ((cflags & REG_ICASE) > 0);
+ insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
// This can be treated as though it were a comment.
comment = true;
index = flagIndex - 1;
@@ -609,6 +622,7 @@
syntax = newSyntax;
cflags = newCflags;
insens = ((cflags & REG_ICASE) > 0);
+ insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
index = flagIndex -1;
// Fall through to the next case.
}
@@ -717,6 +731,7 @@
syntax = savedSyntax;
cflags = savedCflags;
insens = ((cflags & REG_ICASE) > 0);
+ insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
flagsSaved = false;
}
} // not a comment
@@ -829,6 +844,7 @@
index = index - 2 + ce.len;
addToken(currentToken);
currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// BACKREFERENCE OPERATOR
@@ -856,6 +872,7 @@
int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
currentToken = new RETokenBackRef(subIndex,num,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
index = numEnd;
}
@@ -904,6 +921,7 @@
else if (unit.bk && (unit.ch == 'd') &&
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new
RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NON-DIGIT OPERATOR
@@ -912,6 +930,7 @@
else if (unit.bk && (unit.ch == 'D') &&
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new
RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NEWLINE ESCAPE
@@ -936,6 +955,7 @@
else if (unit.bk && (unit.ch == 's') &&
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new
RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NON-WHITESPACE OPERATOR
@@ -944,6 +964,7 @@
else if (unit.bk && (unit.ch == 'S') &&
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new
RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// TAB ESCAPE
@@ -960,6 +981,7 @@
else if (unit.bk && (unit.ch == 'w') &&
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new
RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NON-ALPHANUMERIC OPERATOR
@@ -968,6 +990,7 @@
else if (unit.bk && (unit.ch == 'W') &&
syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
addToken(currentToken);
currentToken = new
RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// END OF STRING OPERATOR
@@ -995,6 +1018,7 @@
index = index - 2 + ce.len;
addToken(currentToken);
currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// NAMED PROPERTY
@@ -1008,6 +1032,7 @@
index = index - 2 + np.len;
addToken(currentToken);
currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
// END OF PREVIOUS MATCH
@@ -1025,6 +1050,7 @@
else { // not a special character
addToken(currentToken);
currentToken = new RETokenChar(subIndex,unit.ch,insens);
+ if (insensUSASCII) currentToken.unicodeAware = false;
}
} // end while
@@ -1065,6 +1091,7 @@
throws REException {
boolean insens = ((cflags & REG_ICASE) > 0);
+ boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0);
Vector options = new Vector();
Vector addition = new Vector();
boolean additionAndAppeared = false;
@@ -1094,7 +1121,9 @@
if ((ch == '-') && (lastCharIsSet)) {
if (index == pLength) throw new
REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
if ((ch = pattern[index]) == ']') {
- options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
lastChar = '-';
} else {
if ((ch == '\\') &&
syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
@@ -1104,7 +1133,9 @@
ch = ce.ch;
index = index + ce.len - 1;
}
- options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+ RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
lastChar = 0; lastCharIsSet = false;
index++;
}
@@ -1147,12 +1178,20 @@
asciiEsc = ce.ch; asciiEscIsSet = true;
index = index - 1 + ce.len - 1;
}
- if (lastCharIsSet) options.addElement(new
RETokenChar(subIndex,lastChar,insens));
+ if (lastCharIsSet) {
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
if (posixID != -1) {
- options.addElement(new
RETokenPOSIX(subIndex,posixID,insens,negate));
+ RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
} else if (np != null) {
-
options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+ RETokenNamedProperty t =
getRETokenNamedProperty(subIndex,np,insens,index);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
} else if (asciiEscIsSet) {
lastChar = asciiEsc; lastCharIsSet = true;
} else {
@@ -1163,8 +1202,11 @@
StringBuffer posixSet = new StringBuffer();
index = getPosixSet(pattern,index+1,posixSet);
int posixId = RETokenPOSIX.intValue(posixSet.toString());
- if (posixId != -1)
- options.addElement(new
RETokenPOSIX(subIndex,posixId,insens,false));
+ if (posixId != -1) {
+ RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
} else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS)))
{
ParseCharClassResult result = parseCharClass(
subIndex, pattern, index, pLength, cflags, syntax, 0);
@@ -1217,14 +1259,22 @@
result.index: result.index - 1);
}
} else {
- if (lastCharIsSet) options.addElement(new
RETokenChar(subIndex,lastChar,insens));
+ if (lastCharIsSet) {
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
lastChar = ch; lastCharIsSet = true;
}
if (index == pLength) throw new
REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
} // while in list
// Out of list, index is one past ']'
- if (lastCharIsSet) options.addElement(new
RETokenChar(subIndex,lastChar,insens));
+ if (lastCharIsSet) {
+ RETokenChar t = new RETokenChar(subIndex,lastChar,insens);
+ if (insensUSASCII) t.unicodeAware = false;
+ options.addElement(t);
+ }
ParseCharClassResult result = new ParseCharClassResult();
// Create a new RETokenOneOf
Index: classpath/gnu/regexp/REToken.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REToken.java,v
retrieving revision 1.5
diff -u -r1.5 REToken.java
--- classpath/gnu/regexp/REToken.java 11 Mar 2006 01:39:49 -0000 1.5
+++ classpath/gnu/regexp/REToken.java 8 Apr 2006 16:03:47 -0000
@@ -43,6 +43,7 @@
protected REToken next = null;
protected REToken uncle = null;
protected int subIndex;
+ protected boolean unicodeAware = true;
public Object clone() {
try {
@@ -157,4 +158,32 @@
return os.toString();
}
+ /**
+ * Converts the character argument to lowercase.
+ * @param ch the character to be converted.
+ * @param unicodeAware If true, use java.lang.Character#toLowerCase;
+ * otherwise, only US-ASCII characters can be converted.
+ * @return the lowercase equivalent of the character, if any;
+ * otherwise, the character itself.
+ */
+ public static char toLowerCase(char ch, boolean unicodeAware) {
+ if (unicodeAware) return Character.toLowerCase(ch);
+ if (ch >= 'A' && ch <= 'Z') return (char)(ch + 'a' - 'A');
+ return ch;
+ }
+
+ /**
+ * Converts the character argument to uppercase.
+ * @param ch the character to be converted.
+ * @param unicodeAware If true, use java.lang.Character#toUpperCase;
+ * otherwise, only US-ASCII characters can be converted.
+ * @return the uppercase equivalent of the character, if any;
+ * otherwise, the character itself.
+ */
+ public static char toUpperCase(char ch, boolean unicodeAware) {
+ if (unicodeAware) return Character.toUpperCase(ch);
+ if (ch >= 'a' && ch <= 'z') return (char)(ch + 'A' - 'a');
+ return ch;
+ }
+
}
Index: classpath/gnu/regexp/RETokenBackRef.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenBackRef.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenBackRef.java
--- classpath/gnu/regexp/RETokenBackRef.java 18 Mar 2006 00:43:11 -0000
1.6
+++ classpath/gnu/regexp/RETokenBackRef.java 8 Apr 2006 16:03:47 -0000
@@ -64,8 +64,8 @@
char c2 = input.charAt(i);
if (c1 != c2) {
if (insens) {
- if (c1 != Character.toLowerCase(c2) &&
- c1 != Character.toUpperCase(c2)) {
+ if (c1 != toLowerCase(c2, unicodeAware) &&
+ c1 != toUpperCase(c2, unicodeAware)) {
return null;
}
}
Index: classpath/gnu/regexp/RETokenChar.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenChar.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenChar.java
--- classpath/gnu/regexp/RETokenChar.java 11 Mar 2006 01:39:49 -0000
1.5
+++ classpath/gnu/regexp/RETokenChar.java 8 Apr 2006 16:03:47 -0000
@@ -45,7 +45,7 @@
RETokenChar(int subIndex, char c, boolean ins) {
super(subIndex);
ch = new char [1];
- ch[0] = (insens = ins) ? Character.toLowerCase(c) : c;
+ ch[0] = (insens = ins) ? toLowerCase(c, unicodeAware) : c;
}
int getMinimumLength() {
@@ -70,7 +70,7 @@
char c;
for (int i=0; i<z; i++) {
c = input.charAt(index+i);
- if (( (insens) ? Character.toLowerCase(c) : c ) != ch[i]) {
+ if (( (insens) ? toLowerCase(c, unicodeAware) : c ) != ch[i]) {
return false;
}
}
Index: classpath/gnu/regexp/RETokenNamedProperty.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenNamedProperty.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenNamedProperty.java
--- classpath/gnu/regexp/RETokenNamedProperty.java 11 Mar 2006 01:39:49
-0000 1.5
+++ classpath/gnu/regexp/RETokenNamedProperty.java 8 Apr 2006 16:03:47
-0000
@@ -122,8 +122,8 @@
boolean retval = handler.includes(ch);
if (insens) {
retval = retval ||
- handler.includes(Character.toUpperCase(ch)) ||
- handler.includes(Character.toLowerCase(ch));
+ handler.includes(toUpperCase(ch, unicodeAware)) ||
+ handler.includes(toLowerCase(ch, unicodeAware));
}
if (negate) retval = !retval;
Index: classpath/gnu/regexp/RETokenRange.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRange.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenRange.java
--- classpath/gnu/regexp/RETokenRange.java 11 Mar 2006 01:39:49 -0000
1.5
+++ classpath/gnu/regexp/RETokenRange.java 8 Apr 2006 16:03:47 -0000
@@ -69,10 +69,10 @@
if (c == CharIndexed.OUT_OF_BOUNDS) return false;
boolean matches = (c >= lo) && (c <= hi);
if (! matches && insens) {
- char c1 = Character.toLowerCase(c);
+ char c1 = toLowerCase(c, unicodeAware);
matches = (c1 >= lo) && (c1 <= hi);
if (!matches) {
- c1 = Character.toUpperCase(c);
+ c1 = toUpperCase(c, unicodeAware);
matches = (c1 >= lo) && (c1 <= hi);
}
}
Index: classpath/java/util/regex/Pattern.java
===================================================================
RCS file: /cvsroot/classpath/classpath/java/util/regex/Pattern.java,v
retrieving revision 1.15
diff -u -r1.15 Pattern.java
--- classpath/java/util/regex/Pattern.java 22 Mar 2006 22:25:00 -0000
1.15
+++ classpath/java/util/regex/Pattern.java 8 Apr 2006 16:03:47 -0000
@@ -74,14 +74,16 @@
this.flags = flags;
int gnuFlags = 0;
+ gnuFlags |= RE.REG_ICASE_USASCII;
if ((flags & CASE_INSENSITIVE) != 0)
gnuFlags |= RE.REG_ICASE;
if ((flags & MULTILINE) != 0)
gnuFlags |= RE.REG_MULTILINE;
if ((flags & DOTALL) != 0)
gnuFlags |= RE.REG_DOT_NEWLINE;
+ if ((flags & UNICODE_CASE) != 0)
+ gnuFlags &= ~RE.REG_ICASE_USASCII;
// not yet supported:
- // if ((flags & UNICODE_CASE) != 0) gnuFlags =
// if ((flags & CANON_EQ) != 0) gnuFlags =
RESyntax syntax = RESyntax.RE_SYNTAX_JAVA_1_4;