Author: knoaman
Date: Mon Nov 2 15:40:37 2009
New Revision: 831927
URL: http://svn.apache.org/viewvc?rev=831927&view=rev
Log:
[RegEx] Fix for case insensitive matching
Modified:
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/Token.java
Modified:
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
URL:
http://svn.apache.org/viewvc/xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java?rev=831927&r1=831926&r2=831927&view=diff
==============================================================================
---
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
(original)
+++
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
Mon Nov 2 15:40:37 2009
@@ -250,14 +250,24 @@
if (c == '-' && this.chardata != ']' && !firstloop) throw this.ex("parser.cc.8", this.offset-2); // if regex = '[-]' then invalid
}
if (this.read() != T_CHAR || this.chardata != '-' || c == '-' && firstloop) { // Here is no '-'.
- tok.addRange(c, c);
+ if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
+ tok.addRange(c, c);
+ }
+ else {
+ addCaseInsensitiveChar(tok, c);
+ }
} else { // Found '-'
// Is this '-' is a from-to token??
this.next(); // Skips '-'
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
// c '-' ']' -> '-' is a single-range.
if(type == T_CHAR && this.chardata == ']') {
// if - is at the last position of the group
- tok.addRange(c, c);
+ if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
+ tok.addRange(c, c);
+ }
+ else {
+ addCaseInsensitiveChar(tok, c);
+ }
tok.addRange('-', '-');
}
else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
@@ -275,7 +285,13 @@
this.next();
if (c > rangeend) throw this.ex("parser.ope.3", this.offset-1);
- tok.addRange(c, rangeend);
+ if (!this.isSet(RegularExpression.IGNORE_CASE) ||
+ (c > 0xffff && rangeend > 0xffff)) {
+ tok.addRange(c, rangeend);
+ }
+ else {
+ addCaseInsensitiveCharRange(tok, c, rangeend);
+ }
}
}
}
Modified:
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
URL:
http://svn.apache.org/viewvc/xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegexParser.java?rev=831927&r1=831926&r2=831927&view=diff
==============================================================================
---
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
(original)
+++
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
Mon Nov 2 15:40:37 2009
@@ -107,7 +107,7 @@
return new ParseException(this.resources.getString(key), loc);
}
- private final boolean isSet(int flag) {
+ protected final boolean isSet(int flag) {
return (this.options & flag) == flag;
}
@@ -940,19 +940,35 @@
this.next();
if (!end) { // if not shorthands...
if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
- tok.addRange(c, c);
+ if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
+ tok.addRange(c, c);
+ }
+ else {
+ addCaseInsensitiveChar(tok, c);
+ }
} else {
this.next(); // Skips '-'
if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
if (type == T_CHAR && this.chardata == ']') {
- tok.addRange(c, c);
+ if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
+ tok.addRange(c, c);
+ }
+ else {
+ addCaseInsensitiveChar(tok, c);
+ }
tok.addRange('-', '-');
} else {
int rangeend = this.chardata;
if (type == T_BACKSOLIDUS)
rangeend = this.decodeEscaped();
this.next();
- tok.addRange(c, rangeend);
+ if (!this.isSet(RegularExpression.IGNORE_CASE) ||
+ (c > 0xffff && rangeend > 0xffff)) {
+ tok.addRange(c, rangeend);
+ }
+ else {
+ addCaseInsensitiveCharRange(tok, c, rangeend);
+ }
}
}
}
@@ -1151,4 +1167,46 @@
if (ch < 'a') return -1;
return ch-'a'+10;
}
+
+ static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
+ tok.addRange(c, c);
+ char cic = Character.toUpperCase((char)c);
+ if (cic != c) {
+ tok.addRange(cic, cic);
+ }
+ cic = Character.toLowerCase((char)c);
+ if (cic != c) {
+ tok.addRange(cic, cic);
+ }
+ }
+
+ static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
+ int r1, r2;
+ if (start <= end) {
+ r1 = start;
+ r2 = end;
+ } else {
+ r1 = end;
+ r2 = start;
+ }
+
+ tok.addRange(r1, r2);
+ for (int ch = r1; ch <= r2; ch++) {
+ if (ch <= 0xffff) {
+ char uch = Character.toUpperCase((char)ch);
+ if (uch != ch) {
+ tok.addRange(uch, uch);
+ }
+ }
+ }
+
+ for (int ch = r1; ch <= r2; ch++) {
+ if (ch <= 0xffff) {
+ char lch = Character.toLowerCase((char)ch);
+ if (lch != ch) {
+ tok.addRange(lch, lch);
+ }
+ }
+ }
+ }
}
Modified:
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
URL:
http://svn.apache.org/viewvc/xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java?rev=831927&r1=831926&r2=831927&view=diff
==============================================================================
---
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
(original)
+++
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
Mon Nov 2 15:40:37 2009
@@ -700,7 +700,7 @@
* @param match A Match instance for storing matching result.
* @return Offset of the start position in <VAR>target</VAR>; or -1 if not
match.
*/
- public boolean matches(char[] target, int start, int end, Match match) {
+ public boolean matches(char[] target, int start, int end, Match match) {
synchronized (this) {
if (this.operations == null)
@@ -807,33 +807,16 @@
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching:
"+this.firstChar);
RangeToken range = this.firstChar;
- if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
- range = this.firstChar.getCaseInsensitiveToken();
- for (matchStart = con.start; matchStart <= limit; matchStart
++) {
- int ch = target [ matchStart ] ;
- if (REUtil.isHighSurrogate(ch) && matchStart+1 <
con.limit) {
- ch = REUtil.composeFromSurrogates(ch, target [
matchStart+1 ] );
- if (!range.match(ch)) continue;
- } else {
- if (!range.match(ch)) {
- char ch1 = Character.toUpperCase((char)ch);
- if (!range.match(ch1))
- if (!range.match(Character.toLowerCase(ch1)))
- continue;
- }
- }
- if (0 <= (matchEnd = this. matchCharArray (con,
this.operations,
- matchStart, 1,
this.options)))
- break;
+ for (matchStart = con.start; matchStart <= limit; matchStart ++) {
+ int ch = target [matchStart] ;
+ if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
+ ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
}
- } else {
- for (matchStart = con.start; matchStart <= limit; matchStart
++) {
- int ch = target [ matchStart ] ;
- if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
- ch = REUtil.composeFromSurrogates(ch, target [
matchStart+1 ] );
- if (!range.match(ch)) continue;
- if (0 <= (matchEnd = this. matchCharArray (con,
this.operations,
- matchStart, 1,
this.options)))
+ if (!range.match(ch)) {
+ continue;
+ }
+ if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
+ matchStart, 1, this.options))) {
break;
}
}
@@ -945,20 +928,12 @@
if (offset >= con.limit)
return -1;
int ch = target [ offset ] ;
- if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
- ch = REUtil.composeFromSurrogates(ch, target [
++offset ] );
- RangeToken tok = op.getToken();
- if (isSet(opts, IGNORE_CASE)) {
- tok = tok.getCaseInsensitiveToken();
- if (!tok.match(ch)) {
- if (ch >= 0x10000) return -1;
- char uch;
- if (!tok.match(uch =
Character.toUpperCase((char)ch))
- && !tok.match(Character.toLowerCase(uch)))
- return -1;
- }
- } else {
- if (!tok.match(ch)) return -1;
+ if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) {
+ ch = REUtil.composeFromSurrogates(ch, target[++offset]);
+ }
+ final RangeToken tok = op.getToken();
+ if (!tok.match(ch)) {
+ return -1;
}
offset ++;
} else {
@@ -966,20 +941,12 @@
if (o1 >= con.limit || o1 < 0)
return -1;
int ch = target [ o1 ] ;
- if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
+ if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) {
ch = REUtil.composeFromSurrogates( target [ --o1 ] ,
ch);
- RangeToken tok = op.getToken();
- if (isSet(opts, IGNORE_CASE)) {
- tok = tok.getCaseInsensitiveToken();
- if (!tok.match(ch)) {
- if (ch >= 0x10000) return -1;
- char uch;
- if (!tok.match(uch =
Character.toUpperCase((char)ch))
- && !tok.match(Character.toLowerCase(uch)))
- return -1;
- }
- } else {
- if (!tok.match(ch)) return -1;
+ }
+ final RangeToken tok = op.getToken();
+ if (!tok.match(ch)) {
+ return -1;
}
offset = o1;
}
@@ -1522,33 +1489,16 @@
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching:
"+this.firstChar);
RangeToken range = this.firstChar;
- if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
- range = this.firstChar.getCaseInsensitiveToken();
- for (matchStart = con.start; matchStart <= limit; matchStart
++) {
- int ch = target .charAt( matchStart ) ;
- if (REUtil.isHighSurrogate(ch) && matchStart+1 <
con.limit) {
- ch = REUtil.composeFromSurrogates(ch, target .charAt(
matchStart+1 ) );
- if (!range.match(ch)) continue;
- } else {
- if (!range.match(ch)) {
- char ch1 = Character.toUpperCase((char)ch);
- if (!range.match(ch1))
- if (!range.match(Character.toLowerCase(ch1)))
- continue;
- }
- }
- if (0 <= (matchEnd = this. matchString (con,
this.operations,
- matchStart, 1,
this.options)))
- break;
+ for (matchStart = con.start; matchStart <= limit; matchStart ++) {
+ int ch = target .charAt( matchStart ) ;
+ if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
+ ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
}
- } else {
- for (matchStart = con.start; matchStart <= limit; matchStart
++) {
- int ch = target .charAt( matchStart ) ;
- if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
- ch = REUtil.composeFromSurrogates(ch, target .charAt(
matchStart+1 ) );
- if (!range.match(ch)) continue;
- if (0 <= (matchEnd = this. matchString (con,
this.operations,
- matchStart, 1,
this.options)))
+ if (!range.match(ch)) {
+ continue;
+ }
+ if (0 <= (matchEnd = this. matchString (con, this.operations,
+ matchStart, 1, this.options))) {
break;
}
}
@@ -1665,20 +1615,12 @@
if (offset >= con.limit)
return -1;
int ch = target .charAt( offset ) ;
- if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
- ch = REUtil.composeFromSurrogates(ch, target .charAt(
++offset ) );
- RangeToken tok = op.getToken();
- if (isSet(opts, IGNORE_CASE)) {
- tok = tok.getCaseInsensitiveToken();
- if (!tok.match(ch)) {
- if (ch >= 0x10000) return -1;
- char uch;
- if (!tok.match(uch =
Character.toUpperCase((char)ch))
- && !tok.match(Character.toLowerCase(uch)))
- return -1;
- }
- } else {
- if (!tok.match(ch)) return -1;
+ if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) {
+ ch = REUtil.composeFromSurrogates(ch, target.charAt(++offset));
+ }
+ final RangeToken tok = op.getToken();
+ if (!tok.match(ch)) {
+ return -1;
}
offset ++;
} else {
@@ -1686,20 +1628,12 @@
if (o1 >= con.limit || o1 < 0)
return -1;
int ch = target .charAt( o1 ) ;
- if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
+ if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) {
ch = REUtil.composeFromSurrogates( target .charAt(
--o1 ) , ch);
- RangeToken tok = op.getToken();
- if (isSet(opts, IGNORE_CASE)) {
- tok = tok.getCaseInsensitiveToken();
- if (!tok.match(ch)) {
- if (ch >= 0x10000) return -1;
- char uch;
- if (!tok.match(uch =
Character.toUpperCase((char)ch))
- && !tok.match(Character.toLowerCase(uch)))
- return -1;
- }
- } else {
- if (!tok.match(ch)) return -1;
+ }
+ final RangeToken tok = op.getToken();
+ if (!tok.match(ch)) {
+ return -1;
}
offset = o1;
}
@@ -2169,34 +2103,17 @@
else if (this.firstChar != null) {
//System.err.println("DEBUG: with firstchar-matching:
"+this.firstChar);
RangeToken range = this.firstChar;
- if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
- range = this.firstChar.getCaseInsensitiveToken();
- for (matchStart = con.start; matchStart <= limit; matchStart
++) {
- int ch = target .setIndex( matchStart ) ;
- if (REUtil.isHighSurrogate(ch) && matchStart+1 <
con.limit) {
- ch = REUtil.composeFromSurrogates(ch, target
.setIndex( matchStart+1 ) );
- if (!range.match(ch)) continue;
- } else {
- if (!range.match(ch)) {
- char ch1 = Character.toUpperCase((char)ch);
- if (!range.match(ch1))
- if (!range.match(Character.toLowerCase(ch1)))
- continue;
- }
- }
- if (0 <= (matchEnd = this. matchCharacterIterator (con,
this.operations,
-
matchStart, 1, this.options)))
- break;
+ for (matchStart = con.start; matchStart <= limit; matchStart ++) {
+ int ch = target .setIndex( matchStart ) ;
+ if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
+ ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
}
- } else {
- for (matchStart = con.start; matchStart <= limit; matchStart
++) {
- int ch = target .setIndex( matchStart ) ;
- if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
- ch = REUtil.composeFromSurrogates(ch, target
.setIndex( matchStart+1 ) );
- if (!range.match(ch)) continue;
- if (0 <= (matchEnd = this. matchCharacterIterator (con,
this.operations,
-
matchStart, 1, this.options)))
- break;
+ if (!range.match(ch)) {
+ continue;
+ }
+ if (0 <= (matchEnd = this.matchCharacterIterator(con, this.operations,
+ matchStart, 1, this.options))) {
+ break;
}
}
}
@@ -2314,18 +2231,9 @@
int ch = target .setIndex( offset ) ;
if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
ch = REUtil.composeFromSurrogates(ch, target
.setIndex( ++offset ) );
- RangeToken tok = op.getToken();
- if (isSet(opts, IGNORE_CASE)) {
- tok = tok.getCaseInsensitiveToken();
- if (!tok.match(ch)) {
- if (ch >= 0x10000) return -1;
- char uch;
- if (!tok.match(uch =
Character.toUpperCase((char)ch))
- && !tok.match(Character.toLowerCase(uch)))
- return -1;
- }
- } else {
- if (!tok.match(ch)) return -1;
+ final RangeToken tok = op.getToken();
+ if (!tok.match(ch)) {
+ return -1;
}
offset ++;
} else {
@@ -2335,18 +2243,9 @@
int ch = target .setIndex( o1 ) ;
if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
ch = REUtil.composeFromSurrogates( target .setIndex(
--o1 ) , ch);
- RangeToken tok = op.getToken();
- if (isSet(opts, IGNORE_CASE)) {
- tok = tok.getCaseInsensitiveToken();
- if (!tok.match(ch)) {
- if (ch >= 0x10000) return -1;
- char uch;
- if (!tok.match(uch =
Character.toUpperCase((char)ch))
- && !tok.match(Character.toLowerCase(uch)))
- return -1;
- }
- } else {
- if (!tok.match(ch)) return -1;
+ final RangeToken tok = op.getToken();
+ if (!tok.match(ch)) {
+ return -1;
}
offset = o1;
}
Modified:
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/Token.java
URL:
http://svn.apache.org/viewvc/xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/Token.java?rev=831927&r1=831926&r2=831927&view=diff
==============================================================================
---
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/Token.java
(original)
+++
xerces/java/branches/xml-schema-1.1-dev/src/org/apache/xerces/impl/xpath/regex/Token.java
Mon Nov 2 15:40:37 2009
@@ -442,19 +442,11 @@
return FC_ANY;
case RANGE:
- if (isSet(options, RegularExpression.IGNORE_CASE)) {
-
result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
- } else {
- result.mergeRanges(this);
- }
+ result.mergeRanges(this);
return FC_TERMINAL;
case NRANGE: // ****
- if (isSet(options, RegularExpression.IGNORE_CASE)) {
-
result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
- } else {
- result.mergeRanges(Token.complementRanges(this));
- }
+ result.mergeRanges(Token.complementRanges(this));
return FC_TERMINAL;
case INDEPENDENT:
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]