Author: knoaman
Date: Mon Nov  2 15:38:53 2009
New Revision: 831926

URL: http://svn.apache.org/viewvc?rev=831926&view=rev
Log:
[RegEx] Fix for case insensitive matching

Modified:
    xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
    xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
    xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
    xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/Token.java

Modified: 
xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
URL: 
http://svn.apache.org/viewvc/xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java?rev=831926&r1=831925&r2=831926&view=diff
==============================================================================
--- 
xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
 (original)
+++ 
xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/ParserForXMLSchema.java
 Mon Nov  2 15:38:53 2009
@@ -250,14 +250,24 @@
                     if (c == '-' && this.chardata != ']' && !firstloop)  throw 
this.ex("parser.cc.8", this.offset-2);  // if regex = '[-]' then invalid
                 }
                 if (this.read() != T_CHAR || this.chardata != '-' || c == '-' 
&& firstloop) { // Here is no '-'.
-                    tok.addRange(c, c);
+                    if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 
0xffff) {
+                        tok.addRange(c, c);
+                    }
+                    else {
+                        addCaseInsensitiveChar(tok, c);
+                    }
                 } else {                        // Found '-'
                                                 // Is this '-' is a from-to 
token??
                     this.next(); // Skips '-'
                     if ((type = this.read()) == T_EOF)  throw 
this.ex("parser.cc.2", this.offset);
                                                 // c '-' ']' -> '-' is a 
single-range.
                     if(type == T_CHAR && this.chardata == ']') {               
                // if - is at the last position of the group
-                       tok.addRange(c, c);
+                        if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 
0xffff) {
+                           tok.addRange(c, c);
+                        }
+                        else {
+                            addCaseInsensitiveChar(tok, c);
+                        }
                        tok.addRange('-', '-');
                     }
                     else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
@@ -275,7 +285,13 @@
                         this.next();
 
                         if (c > rangeend)  throw this.ex("parser.ope.3", 
this.offset-1);
-                        tok.addRange(c, rangeend);
+                        if (!this.isSet(RegularExpression.IGNORE_CASE) ||
+                                (c > 0xffff && rangeend > 0xffff)) {
+                            tok.addRange(c, rangeend);
+                        }
+                        else {
+                            addCaseInsensitiveCharRange(tok, c, rangeend);
+                        }
                     }
                 }
             }

Modified: 
xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegexParser.java
URL: 
http://svn.apache.org/viewvc/xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegexParser.java?rev=831926&r1=831925&r2=831926&view=diff
==============================================================================
--- xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegexParser.java 
(original)
+++ xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegexParser.java 
Mon Nov  2 15:38:53 2009
@@ -107,7 +107,7 @@
         return new ParseException(this.resources.getString(key), loc);
     }
 
-    private final boolean isSet(int flag) {
+    protected final boolean isSet(int flag) {
         return (this.options & flag) == flag;
     }
 
@@ -940,19 +940,35 @@
             this.next();
             if (!end) {                         // if not shorthands...
                 if (this.read() != T_CHAR || this.chardata != '-') { // Here 
is no '-'.
-                    tok.addRange(c, c);
+                    if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 
0xffff) {
+                        tok.addRange(c, c);
+                    }
+                    else {
+                        addCaseInsensitiveChar(tok, c);
+                    }
                 } else {
                     this.next(); // Skips '-'
                     if ((type = this.read()) == T_EOF)  throw 
this.ex("parser.cc.2", this.offset);
                     if (type == T_CHAR && this.chardata == ']') {
-                        tok.addRange(c, c);
+                        if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 
0xffff) {
+                            tok.addRange(c, c);
+                        }
+                        else {
+                            addCaseInsensitiveChar(tok, c);
+                        }
                         tok.addRange('-', '-');
                     } else {
                         int rangeend = this.chardata;
                         if (type == T_BACKSOLIDUS)
                             rangeend = this.decodeEscaped();
                         this.next();
-                        tok.addRange(c, rangeend);
+                        if (!this.isSet(RegularExpression.IGNORE_CASE) ||
+                                (c > 0xffff && rangeend > 0xffff)) {
+                            tok.addRange(c, rangeend);
+                        }
+                        else {
+                            addCaseInsensitiveCharRange(tok, c, rangeend);
+                        }
                     }
                 }
             }
@@ -1151,4 +1167,46 @@
         if (ch < 'a')  return -1;
         return ch-'a'+10;
     }
+    
+    /**
+     * Adds the single character <code>c</code> to <code>tok</code> and, when they
+     * differ from <code>c</code>, its upper-case and lower-case forms as well.
+     *
+     * @param tok the range token that receives the character(s)
+     * @param c   the code point to add; case variants are only computed when it
+     *            fits in a <code>char</code> (<code>c &lt;= 0xffff</code>)
+     */
+    static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
+        tok.addRange(c, c);
+        if (c > 0xffff) {
+            // The (char) casts below would truncate a supplementary code point
+            // and add the case variants of an unrelated BMP character.
+            return;
+        }
+        char cic = Character.toUpperCase((char)c);
+        if (cic != c) {
+            tok.addRange(cic, cic);
+        }
+        cic = Character.toLowerCase((char)c);
+        if (cic != c) {
+            tok.addRange(cic, cic);
+        }
+    }
+    
+    /**
+     * Adds the range spanned by <code>start</code> and <code>end</code> to
+     * <code>tok</code>, then adds the upper-case form and the lower-case form of
+     * every character in that range whose case form differs from the character
+     * itself. The two bounds may be given in either order.
+     *
+     * @param tok   the range token that receives the ranges
+     * @param start one bound of the range (inclusive)
+     * @param end   the other bound of the range (inclusive)
+     */
+    static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
+        int r1, r2;
+        if (start <= end) {
+            r1 = start;
+            r2 = end;
+        } else {
+            r1 = end;
+            r2 = start;
+        }
+
+        tok.addRange(r1, r2);
+
+        // Case mapping goes through a (char) cast, so only values up to 0xffff
+        // are mapped. Stop there instead of stepping through the rest of a
+        // range that extends beyond 0xffff without doing any work.
+        final int last = (r2 < 0xffff) ? r2 : 0xffff;
+
+        // All upper-case variants first, then all lower-case variants.
+        for (int ch = r1;  ch <= last;  ch++) {
+            char uch = Character.toUpperCase((char)ch);
+            if (uch != ch) {
+                tok.addRange(uch, uch);
+            }
+        }
+
+        for (int ch = r1;  ch <= last;  ch++) {
+            char lch = Character.toLowerCase((char)ch);
+            if (lch != ch) {
+                tok.addRange(lch, lch);
+            }
+        }
+    }
 }

Modified: 
xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java
URL: 
http://svn.apache.org/viewvc/xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java?rev=831926&r1=831925&r2=831926&view=diff
==============================================================================
--- 
xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java 
(original)
+++ 
xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java 
Mon Nov  2 15:38:53 2009
@@ -700,7 +700,7 @@
      * @param match A Match instance for storing matching result.
      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not 
match.
      */
-    public boolean matches(char[]  target, int start, int end, Match match) {
+    public boolean matches(char[] target, int start, int end, Match match) {
 
         synchronized (this) {
             if (this.operations == null)
@@ -807,33 +807,16 @@
         else if (this.firstChar != null) {
             //System.err.println("DEBUG: with firstchar-matching: 
"+this.firstChar);
             RangeToken range = this.firstChar;
-            if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
-                range = this.firstChar.getCaseInsensitiveToken();
-                for (matchStart = con.start;  matchStart <= limit;  matchStart 
++) {
-                    int ch =  target [  matchStart ] ;
-                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < 
con.limit) {
-                        ch = REUtil.composeFromSurrogates(ch,  target [  
matchStart+1 ] );
-                        if (!range.match(ch))  continue;
-                    } else {
-                        if (!range.match(ch)) {
-                            char ch1 = Character.toUpperCase((char)ch);
-                            if (!range.match(ch1))
-                                if (!range.match(Character.toLowerCase(ch1)))
-                                    continue;
-                        }
-                    }
-                    if (0 <= (matchEnd = this. matchCharArray (con, 
this.operations,
-                                                               matchStart, 1, 
this.options)))
-                        break;
+            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) 
{
+                int ch =  target [matchStart] ;
+                if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
+                    ch = REUtil.composeFromSurrogates(ch, 
target[matchStart+1]);
                 }
-            } else {
-                for (matchStart = con.start;  matchStart <= limit;  matchStart 
++) {
-                    int ch =  target [  matchStart ] ;
-                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
-                        ch = REUtil.composeFromSurrogates(ch,  target [  
matchStart+1 ] );
-                    if (!range.match(ch))  continue;
-                    if (0 <= (matchEnd = this. matchCharArray (con, 
this.operations,
-                                                               matchStart, 1, 
this.options)))
+                if (!range.match(ch))  {
+                    continue;
+                }
+                if (0 <= (matchEnd = this. matchCharArray (con, 
this.operations,
+                                                           matchStart, 1, 
this.options))) {
                         break;
                 }
             }
@@ -945,20 +928,12 @@
                     if (offset >= con.limit)
                         return -1;
                     int ch =  target [  offset ] ;
-                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
-                        ch = REUtil.composeFromSurrogates(ch,  target [  
++offset ] );
-                    RangeToken tok = op.getToken();
-                    if (isSet(opts, IGNORE_CASE)) {
-                        tok = tok.getCaseInsensitiveToken();
-                        if (!tok.match(ch)) {
-                            if (ch >= 0x10000)  return -1;
-                            char uch;
-                            if (!tok.match(uch = 
Character.toUpperCase((char)ch))
-                                && !tok.match(Character.toLowerCase(uch)))
-                                return -1;
-                        }
-                    } else {
-                        if (!tok.match(ch))  return -1;
+                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) {
+                        ch = REUtil.composeFromSurrogates(ch,  
target[++offset]);
+                    }
+                    final RangeToken tok = op.getToken();
+                    if (!tok.match(ch)) {
+                        return -1;
                     }
                     offset ++;
                 } else {
@@ -966,20 +941,12 @@
                     if (o1 >= con.limit || o1 < 0)
                         return -1;
                     int ch =  target [  o1 ] ;
-                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
+                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) {
                         ch = REUtil.composeFromSurrogates( target [  --o1 ] , 
ch);
-                    RangeToken tok = op.getToken();
-                    if (isSet(opts, IGNORE_CASE)) {
-                        tok = tok.getCaseInsensitiveToken();
-                        if (!tok.match(ch)) {
-                            if (ch >= 0x10000)  return -1;
-                            char uch;
-                            if (!tok.match(uch = 
Character.toUpperCase((char)ch))
-                                && !tok.match(Character.toLowerCase(uch)))
-                                return -1;
-                        }
-                    } else {
-                        if (!tok.match(ch))  return -1;
+                    }
+                    final RangeToken tok = op.getToken();
+                    if (!tok.match(ch)) {
+                        return -1;
                     }
                     offset = o1;
                 }
@@ -1522,33 +1489,16 @@
         else if (this.firstChar != null) {
             //System.err.println("DEBUG: with firstchar-matching: 
"+this.firstChar);
             RangeToken range = this.firstChar;
-            if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
-                range = this.firstChar.getCaseInsensitiveToken();
-                for (matchStart = con.start;  matchStart <= limit;  matchStart 
++) {
-                    int ch =  target .charAt(  matchStart ) ;
-                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < 
con.limit) {
-                        ch = REUtil.composeFromSurrogates(ch,  target .charAt( 
 matchStart+1 ) );
-                        if (!range.match(ch))  continue;
-                    } else {
-                        if (!range.match(ch)) {
-                            char ch1 = Character.toUpperCase((char)ch);
-                            if (!range.match(ch1))
-                                if (!range.match(Character.toLowerCase(ch1)))
-                                    continue;
-                        }
-                    }
-                    if (0 <= (matchEnd = this. matchString (con, 
this.operations,
-                                                            matchStart, 1, 
this.options)))
-                        break;
+            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) 
{
+                int ch =  target .charAt(  matchStart ) ;
+                if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
+                    ch = REUtil.composeFromSurrogates(ch, 
target.charAt(matchStart+1));
                 }
-            } else {
-                for (matchStart = con.start;  matchStart <= limit;  matchStart 
++) {
-                    int ch =  target .charAt(  matchStart ) ;
-                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
-                        ch = REUtil.composeFromSurrogates(ch,  target .charAt( 
 matchStart+1 ) );
-                    if (!range.match(ch))  continue;
-                    if (0 <= (matchEnd = this. matchString (con, 
this.operations,
-                                                            matchStart, 1, 
this.options)))
+                if (!range.match(ch)) {
+                    continue;
+                }
+                if (0 <= (matchEnd = this. matchString (con, this.operations,
+                                                        matchStart, 1, 
this.options))) {
                         break;
                 }
             }
@@ -1665,20 +1615,12 @@
                     if (offset >= con.limit)
                         return -1;
                     int ch =  target .charAt(  offset ) ;
-                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
-                        ch = REUtil.composeFromSurrogates(ch,  target .charAt( 
 ++offset ) );
-                    RangeToken tok = op.getToken();
-                    if (isSet(opts, IGNORE_CASE)) {
-                        tok = tok.getCaseInsensitiveToken();
-                        if (!tok.match(ch)) {
-                            if (ch >= 0x10000)  return -1;
-                            char uch;
-                            if (!tok.match(uch = 
Character.toUpperCase((char)ch))
-                                && !tok.match(Character.toLowerCase(uch)))
-                                return -1;
-                        }
-                    } else {
-                        if (!tok.match(ch))  return -1;
+                    if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) {
+                        ch = REUtil.composeFromSurrogates(ch, 
target.charAt(++offset));
+                    }
+                    final RangeToken tok = op.getToken();
+                    if (!tok.match(ch)) {
+                        return -1;
                     }
                     offset ++;
                 } else {
@@ -1686,20 +1628,12 @@
                     if (o1 >= con.limit || o1 < 0)
                         return -1;
                     int ch =  target .charAt(  o1 ) ;
-                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
+                    if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) {
                         ch = REUtil.composeFromSurrogates( target .charAt(  
--o1 ) , ch);
-                    RangeToken tok = op.getToken();
-                    if (isSet(opts, IGNORE_CASE)) {
-                        tok = tok.getCaseInsensitiveToken();
-                        if (!tok.match(ch)) {
-                            if (ch >= 0x10000)  return -1;
-                            char uch;
-                            if (!tok.match(uch = 
Character.toUpperCase((char)ch))
-                                && !tok.match(Character.toLowerCase(uch)))
-                                return -1;
-                        }
-                    } else {
-                        if (!tok.match(ch))  return -1;
+                    }
+                    final RangeToken tok = op.getToken();
+                    if (!tok.match(ch)) {
+                        return -1;
                     }
                     offset = o1;
                 }
@@ -2169,34 +2103,17 @@
         else if (this.firstChar != null) {
             //System.err.println("DEBUG: with firstchar-matching: 
"+this.firstChar);
             RangeToken range = this.firstChar;
-            if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
-                range = this.firstChar.getCaseInsensitiveToken();
-                for (matchStart = con.start;  matchStart <= limit;  matchStart 
++) {
-                    int ch =  target .setIndex(  matchStart ) ;
-                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < 
con.limit) {
-                        ch = REUtil.composeFromSurrogates(ch,  target 
.setIndex(  matchStart+1 ) );
-                        if (!range.match(ch))  continue;
-                    } else {
-                        if (!range.match(ch)) {
-                            char ch1 = Character.toUpperCase((char)ch);
-                            if (!range.match(ch1))
-                                if (!range.match(Character.toLowerCase(ch1)))
-                                    continue;
-                        }
-                    }
-                    if (0 <= (matchEnd = this. matchCharacterIterator (con, 
this.operations,
-                                                                       
matchStart, 1, this.options)))
-                        break;
+            for (matchStart = con.start;  matchStart <= limit;  matchStart ++) 
{
+                int ch =  target .setIndex(  matchStart ) ;
+                if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
+                    ch = REUtil.composeFromSurrogates(ch, 
target.setIndex(matchStart+1));
                 }
-            } else {
-                for (matchStart = con.start;  matchStart <= limit;  matchStart 
++) {
-                    int ch =  target .setIndex(  matchStart ) ;
-                    if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
-                        ch = REUtil.composeFromSurrogates(ch,  target 
.setIndex(  matchStart+1 ) );
-                    if (!range.match(ch))  continue;
-                    if (0 <= (matchEnd = this. matchCharacterIterator (con, 
this.operations,
-                                                                       
matchStart, 1, this.options)))
-                        break;
+                if (!range.match(ch)) {
+                    continue;
+                }
+                if (0 <= (matchEnd = this.matchCharacterIterator(con, 
this.operations,
+                                                                 matchStart, 
1, this.options))) {
+                    break;
                 }
             }
         }
@@ -2314,18 +2231,9 @@
                     int ch =  target .setIndex(  offset ) ;
                     if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
                         ch = REUtil.composeFromSurrogates(ch,  target 
.setIndex(  ++offset ) );
-                    RangeToken tok = op.getToken();
-                    if (isSet(opts, IGNORE_CASE)) {
-                        tok = tok.getCaseInsensitiveToken();
-                        if (!tok.match(ch)) {
-                            if (ch >= 0x10000)  return -1;
-                            char uch;
-                            if (!tok.match(uch = 
Character.toUpperCase((char)ch))
-                                && !tok.match(Character.toLowerCase(uch)))
-                                return -1;
-                        }
-                    } else {
-                        if (!tok.match(ch))  return -1;
+                    final RangeToken tok = op.getToken();
+                    if (!tok.match(ch)) {
+                        return -1;
                     }
                     offset ++;
                 } else {
@@ -2335,18 +2243,9 @@
                     int ch =  target .setIndex(  o1 ) ;
                     if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
                         ch = REUtil.composeFromSurrogates( target .setIndex(  
--o1 ) , ch);
-                    RangeToken tok = op.getToken();
-                    if (isSet(opts, IGNORE_CASE)) {
-                        tok = tok.getCaseInsensitiveToken();
-                        if (!tok.match(ch)) {
-                            if (ch >= 0x10000)  return -1;
-                            char uch;
-                            if (!tok.match(uch = 
Character.toUpperCase((char)ch))
-                                && !tok.match(Character.toLowerCase(uch)))
-                                return -1;
-                        }
-                    } else {
-                        if (!tok.match(ch))  return -1;
+                    final RangeToken tok = op.getToken();
+                    if (!tok.match(ch)) {
+                        return -1;
                     }
                     offset = o1;
                 }

Modified: xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/Token.java
URL: 
http://svn.apache.org/viewvc/xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/Token.java?rev=831926&r1=831925&r2=831926&view=diff
==============================================================================
--- xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/Token.java 
(original)
+++ xerces/java/trunk/src/org/apache/xerces/impl/xpath/regex/Token.java Mon Nov 
 2 15:38:53 2009
@@ -442,19 +442,11 @@
               return FC_ANY;
 
           case RANGE:
-            if (isSet(options, RegularExpression.IGNORE_CASE)) {
-                
result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
-            } else {
-                result.mergeRanges(this);
-            }
+            result.mergeRanges(this);
             return FC_TERMINAL;
 
           case NRANGE:                          // ****
-            if (isSet(options, RegularExpression.IGNORE_CASE)) {
-                
result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
-            } else {
-                result.mergeRanges(Token.complementRanges(this));
-            }
+            result.mergeRanges(Token.complementRanges(this));
             return FC_TERMINAL;
 
           case INDEPENDENT:



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to