Index: REUtil.java
===================================================================
RCS file: /home/cvspublic/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/REUtil.java,v
retrieving revision 1.3
diff -u -r1.3 REUtil.java
--- REUtil.java	29 Jan 2002 01:15:14 -0000	1.3
+++ REUtil.java	21 May 2002 07:24:05 -0000
@@ -293,13 +293,15 @@
         synchronized (REUtil.regexCache) {
             int i;
             for (i = 0;  i < REUtil.CACHESIZE;  i ++) {
-                re = REUtil.regexCache[i];
-                if (re == null) {
+                RegularExpression cached = REUtil.regexCache[i];
+                if (cached == null) {
                     i = -1;
                     break;
                 }
-                if (re.equals(pattern, intOptions))
+                if (cached.equals(pattern, intOptions)) {
+                    re = cached;
                     break;
+                }
             }
             if (re != null) {
                 if (i != 0) {
@@ -347,6 +349,7 @@
                     if (i > 0)  buffer.append(literal.substring(0, i));
                 }
                 buffer.append((char)'\\');
+                buffer.append((char)ch);
             } else if (buffer != null)
                 buffer.append((char)ch);
         }
Index: RangeToken.java
===================================================================
RCS file: /home/cvspublic/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RangeToken.java,v
retrieving revision 1.2
diff -u -r1.2 RangeToken.java
--- RangeToken.java	29 Jan 2002 01:15:14 -0000	1.2
+++ RangeToken.java	21 May 2002 07:24:05 -0000
@@ -228,8 +228,6 @@
     }
 
     protected void mergeRanges(Token token) {
-        if (token.type != this.type)
-            throw new IllegalArgumentException("Token#mergeRanges(): Mismatched Type: "+token.type);
         RangeToken tok = (RangeToken)token;
         this.sortRanges();
         tok.sortRanges();
Index: RegexParser.java
===================================================================
RCS file: /home/cvspublic/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RegexParser.java,v
retrieving revision 1.2
diff -u -r1.2 RegexParser.java
--- RegexParser.java	29 Jan 2002 01:15:14 -0000	1.2
+++ RegexParser.java	21 May 2002 07:24:06 -0000
@@ -301,9 +301,6 @@
 
           default:
             ret = T_CHAR;
-            if (REUtil.isHighSurrogate(this.chardata) && this.offset < this.regexlen)
-                this.chardata = REUtil.composeFromSurrogates(this.chardata,
-                                                             this.regex.charAt(this.offset++));
         }
         this.nexttoken = ret;
     }
@@ -775,7 +772,16 @@
 
           case T_CHAR:
             tok = Token.createChar(this.chardata);
+            int high = this.chardata;
             this.next();
+            if (REUtil.isHighSurrogate(high)
+                && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
+                char[] sur = new char[2];
+                sur[0] = (char)high;
+                sur[1] = (char)this.chardata;
+                tok = Token.createParen(Token.createString(new String(sur)), 0);
+                this.next();
+            }
             break;
 
           default:
@@ -810,8 +816,10 @@
             int namestart = this.offset;
             int nameend = this.regex.indexOf('}', namestart);
             if (nameend < 0)  throw this.ex("parser.atom.3", this.offset);
+            String pname = this.regex.substring(namestart, nameend);
             this.offset = nameend+1;
-            tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
+            tok = Token.getRange(pname, positive,
+                                 this.isSet(RegularExpression.XMLSCHEMA_MODE));
             /*
               if (this.isSet(RegularExpression.IGNORE_CASE))
               tok = RangeToken.createCaseInsensitiveToken(tok);
@@ -900,7 +908,8 @@
                     positive = false;
                 }
                 String name = this.regex.substring(this.offset, nameend);
-                RangeToken range = Token.getRange(name, positive);
+                RangeToken range = Token.getRange(name, positive,
+                                                  this.isSet(RegularExpression.XMLSCHEMA_MODE));
                 if (range == null)  throw this.ex("parser.cc.3", this.offset);
                 tok.mergeRanges(range);
                 end = true;
@@ -933,153 +942,6 @@
         }
         if (this.read() == T_EOF)
             throw this.ex("parser.cc.2", this.offset);
-        if (!useNrange && nrange) {
-            base.subtractRanges(tok);
-            tok = base;
-        }
-        tok.sortRanges();
-        tok.compactRanges();
-        //tok.dumpRanges();
-        /*
-        if (this.isSet(RegularExpression.IGNORE_CASE))
-            tok = RangeToken.createCaseInsensitiveToken(tok);
-        */
-        this.setContext(S_NORMAL);
-        this.next();                    // Skips ']'
-
-        return tok;
-    }
-    private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException {
-        this.setContext(S_INBRACKETS);
-        this.next();                            // '['
-        boolean nrange = false;
-        RangeToken base = null;
-        RangeToken tok;
-        if (this.read() == T_CHAR && this.chardata == '^') {
-            nrange = true;
-            this.next();                        // '^'
-            if (useNrange) {
-                tok = Token.createNRange();
-            } else {
-                base = Token.createRange();
-                base.addRange(0, Token.UTF16_MAX);
-                tok = Token.createRange();
-            }
-        } else {
-            tok = Token.createRange();
-        }
-        int type;
-        while ((type = this.read()) != T_EOF
-               && !(type == T_CHAR && this.chardata == ']')) {
-            int c = this.chardata;
-            /*
-            if (type == T_CHAR && c == '^') {
-                this.next();
-                type = this.read();
-                c = this.chardata;
-                if (type == T_EOF)  break;
-
-                nrange = !nrange;
-                if (nrange)
-                    tok = Token.createRange();
-                else {
-                    base.subtractRanges(tok);
-                    tok = base;
-                }
-            }
-            */
-            boolean end = false;
-            if (type == T_BACKSOLIDUS) {
-                switch (c) {
-                  case 'd':  case 'D':
-                  case 'w':  case 'W':
-                  case 's':  case 'S':
-                    tok.mergeRanges(this.getTokenForShorthand(c));
-                    end = true;
-                    break;
-
-                  case 'i':  case 'I':
-                  case 'c':  case 'C':
-                    c = this.processCIinCharacterClass(tok, c);
-                    if (c < 0)  end = true;
-                    break;
-                    
-                  case 'p':
-                  case 'P':
-                    boolean positive = c ==  'p';
-                    int pstart = this.offset;
-                    this.next();
-                    if (this.read() != T_CHAR)  throw ex("parser.atom.2", this.offset-1);
-                    RangeToken tok2 = null;
-                    switch (this.chardata) {
-                      case 'L':                 // Letter
-                        tok2 = Token.getRange("L", positive);  break;
-                      case 'M':                 // Mark
-                        tok2 = Token.getRange("M", positive);  break;
-                      case 'N':                 // Number
-                        tok2 = Token.getRange("N", positive);  break;
-                      case 'Z':                 // Separator
-                        tok2 = Token.getRange("Z", positive);  break;
-                      case 'C':                 // Other
-                        tok2 = Token.getRange("C", positive);  break;
-                      case 'P':                 // Punctuation
-                        tok2 = Token.getRange("P", positive);  break;
-                      case 'S':                 // Symbol
-                        tok2 = Token.getRange("S", positive);  break;
-                      case '{':
-                        // this.offset points the next of '{'.
-                        pstart = this.offset;
-                        int namestart = this.offset;
-                        int nameend = this.regex.indexOf('}', namestart);
-                        if (nameend < 0)  throw ex("parser.atom.3", this.offset);
-                        this.offset = nameend+1;
-                        tok2 = Token.getRange(this.regex.substring(namestart, nameend), positive);
-                        break;
-
-                      default:
-                        throw ex("parser.atom.2", this.offset-1);
-                    }
-                    if (tok2 == null)  throw ex("parser.atom.5", pstart);
-                    tok.mergeRanges(tok2);
-                    end = true;
-                    break;
-
-                  default:
-                    c = this.decodeEscaped();
-                } // \ + c
-            } // backsolidus
-                                                // POSIX Character class such as [:alnum:]
-            else if (type == T_POSIX_CHARCLASS_START) {
-                int nameend = this.regex.indexOf(':', this.offset);
-                if (nameend < 0) throw ex("parser.cc.1", this.offset);
-                String name = this.regex.substring(this.offset, nameend);
-                RangeToken range = Token.getRange(name, true);
-                if (range == null)  throw ex("parser.cc.3", this.offset);
-                tok.mergeRanges(range);
-                end = true;
-                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
-                    throw ex("parser.cc.1", nameend);
-                this.offset = nameend+2;
-            }
-            this.next();
-            if (!end) {
-                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
-                    tok.addRange(c, c);
-                } else {
-                    this.next(); // Skips '-'
-                    if ((type = this.read()) == T_EOF) throw ex("parser.cc.2", this.offset);
-                    int rangeend = this.chardata;
-                    if (type == T_BACKSOLIDUS)
-                        rangeend = this.decodeEscaped();
-                    this.next();
-                    tok.addRange(c, rangeend);
-                }
-            }
-            if (this.read() == T_CHAR && this.chardata == ',')
-                this.next();
-        }
-        if (this.read() == T_EOF)
-            throw ex("parser.cc.2", this.offset);
         if (!useNrange && nrange) {
             base.subtractRanges(tok);
             tok = base;
Index: RegularExpression.java
===================================================================
RCS file: /home/cvspublic/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/RegularExpression.java,v
retrieving revision 1.2
diff -u -r1.2 RegularExpression.java
--- RegularExpression.java	29 Jan 2002 01:15:14 -0000	1.2
+++ RegularExpression.java	21 May 2002 07:24:06 -0000
@@ -905,7 +905,7 @@
 
         while (true) {
             if (op == null)
-                return offset;
+                return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
             if (offset > con.limit || offset < con.start)
                 return -1;
             switch (op.type) {
@@ -1221,7 +1221,7 @@
                     if (DEBUG) {
                         System.err.println("UNION: "+i+", ret="+ret);
                     }
-                    if (ret == con.length )  return ret;
+                    if (ret >= 0)  return ret;
                 }
                 return -1;
 
@@ -1625,7 +1625,7 @@
 
         while (true) {
             if (op == null)
-                return offset;
+                return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
             if (offset > con.limit || offset < con.start)
                 return -1;
             switch (op.type) {
@@ -1940,7 +1940,7 @@
                     if (DEBUG) {
                         System.err.println("UNION: "+i+", ret="+ret);
                     }
-                    if (ret == con.length )  return ret;
+                    if (ret >= 0)  return ret;
                 }
                 return -1;
 
@@ -2272,7 +2272,7 @@
 
         while (true) {
             if (op == null)
-                return offset;
+                return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
             if (offset > con.limit || offset < con.start)
                 return -1;
             switch (op.type) {
@@ -2588,7 +2588,7 @@
                     if (DEBUG) {
                         System.err.println("UNION: "+i+", ret="+ret);
                     }
-                    if (ret == con.length)  return ret;
+                    if (ret >= 0)  return ret;
                 }
                 return -1;
 
Index: Token.java
===================================================================
RCS file: /home/cvspublic/xml-xerces/java/src/org/apache/xerces/impl/xpath/regex/Token.java,v
retrieving revision 1.2
diff -u -r1.2 Token.java
--- Token.java	29 Jan 2002 01:15:14 -0000	1.2
+++ Token.java	21 May 2002 07:24:07 -0000
@@ -744,7 +744,7 @@
         /*FEFF..FEFF;*/ "Specials",
         /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
          //missing Specials add manually
-        /*10300..1032F;*/ "Old Italic",
+        /*10300..1032F;*/ "Old Italic",		// 87
         /*10330..1034F;*/ "Gothic",
         /*10400..1044F;*/ "Deseret",
         /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
@@ -771,10 +771,21 @@
         +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
         +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
         +"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
-        +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF\u10300\u1032F\u10330\u1034F"
-        +"\u10400\u1044F\u1D000\u1D0FFs\u1D100\u1D1FF\u1D400\u1D7FF\u20000\u2A6D6\u2F800\u2FA1F\uE0000\uE007F";
+        +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
+    static final int[] nonBMPBlockRanges = {
+        0x10300, 0x1032F,       // 87
+        0x10330, 0x1034F,
+        0x10400, 0x1044F,
+        0x1D000, 0x1D0FF,
+        0x1D100, 0x1D1FF,
+        0x1D400, 0x1D7FF,
+        0x20000, 0x2A6D6,
+        0x2F800, 0x2FA1F,
+        0xE0000, 0xE007F
+    };
+    private static final int NONBMP_BLOCK_START = 87;
 
-     static protected RangeToken getRange(String name, boolean positive) {
+    static protected RangeToken getRange(String name, boolean positive) {
         if (Token.categories.size() == 0) {
             synchronized (Token.categories) {
                 Token[] ranges = new Token[Token.categoryNames.length];
@@ -864,17 +875,23 @@
                 //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
                 //
                 StringBuffer buffer = new StringBuffer(50);
-                int location = 0;
                 for (int i = 0;  i < Token.blockNames.length;  i ++) {
                     Token r1 = Token.createRange();
-                    location = i*2;
-                    int rstart = Token.blockRanges.charAt(location);
-                    int rend = Token.blockRanges.charAt(location+1);
+                    int location;
+                    if (i < NONBMP_BLOCK_START) {
+                        location = i*2;
+                        int rstart = Token.blockRanges.charAt(location);
+                        int rend = Token.blockRanges.charAt(location+1);
+                        //DEBUGING
+                        //System.out.println(n+" " +Integer.toHexString(rstart)
+                        //                     +"-"+ Integer.toHexString(rend));
+                        r1.addRange(rstart, rend);
+                    } else {
+                        location = (i - NONBMP_BLOCK_START) * 2;
+                        r1.addRange(Token.nonBMPBlockRanges[location],
+                                    Token.nonBMPBlockRanges[location + 1]);
+                    }
                     String n = Token.blockNames[i];
-                    //DEBUGING
-                    //System.out.println(n+" " +Integer.toHexString(rstart)
-                    //                     +"-"+ Integer.toHexString(rend));
-                    r1.addRange(rstart, rend);
                     if (n.equals("Specials"))
                         r1.addRange(0xfff0, 0xfffd);
                     if (n.equals("Private Use")) {
@@ -883,7 +900,7 @@
                     }
                     Token.categories.put(n, r1);
                     Token.categories2.put(n, Token.complementRanges(r1));
-                    buffer.setLength(0);                    
+                    buffer.setLength(0);
                     buffer.append("Is");
                     if (n.indexOf(' ') >= 0) {
                         for (int ci = 0;  ci < n.length();  ci ++)
@@ -895,11 +912,6 @@
                     Token.setAlias(buffer.toString(), n, true);
                 }
 
-                // REVISIT: remove this code later 
-                // the following does not match the XML Schema definition
-                // for Regular Expressions 
-                
-                /*
                 // TR#18 1.2
                 Token.setAlias("ASSIGNED", "Cn", false);
                 Token.setAlias("UNASSIGNED", "Cn", true);
@@ -907,44 +919,51 @@
                 all.addRange(0, Token.UTF16_MAX);
                 Token.categories.put("ALL", all);
                 Token.categories2.put("ALL", Token.complementRanges(all));
-                */
-                
-                /*
+                Token.registerNonXS("ASSIGNED");
+                Token.registerNonXS("UNASSIGNED");
+                Token.registerNonXS("ALL");
+
                 Token isalpha = Token.createRange();
                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
                 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
                 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
                 Token.categories.put("IsAlpha", isalpha);
                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
-                
+                Token.registerNonXS("IsAlpha");
+
                 Token isalnum = Token.createRange();
                 isalnum.mergeRanges(isalpha);   // Lu Ll Lo
                 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
                 Token.categories.put("IsAlnum", isalnum);
                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
+                Token.registerNonXS("IsAlnum");
 
                 Token isspace = Token.createRange();
                 isspace.mergeRanges(Token.token_spaces);
                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
                 Token.categories.put("IsSpace", isspace);
                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
+                Token.registerNonXS("IsSpace");
 
                 Token isword = Token.createRange();
                 isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
                 isword.addRange('_', '_');
                 Token.categories.put("IsWord", isword);
                 Token.categories2.put("IsWord", Token.complementRanges(isword));
+                Token.registerNonXS("IsWord");
 
                 Token isascii = Token.createRange();
                 isascii.addRange(0, 127);
                 Token.categories.put("IsASCII", isascii);
                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
+                Token.registerNonXS("IsASCII");
 
                 Token isnotgraph = Token.createRange();
                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
                 isnotgraph.addRange(' ', ' ');
                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
                 Token.categories2.put("IsGraph", isnotgraph);
+                Token.registerNonXS("IsGraph");
 
                 Token isxdigit = Token.createRange();
                 isxdigit.addRange('0', '9');
@@ -952,13 +971,20 @@
                 isxdigit.addRange('a', 'f');
                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
                 Token.categories2.put("IsXDigit", isxdigit);
-                
+                Token.registerNonXS("IsXDigit");
+
                 Token.setAlias("IsDigit", "Nd", true);
                 Token.setAlias("IsUpper", "Lu", true);
                 Token.setAlias("IsLower", "Ll", true);
                 Token.setAlias("IsCntrl", "C", true);
                 Token.setAlias("IsPrint", "C", false);
                 Token.setAlias("IsPunct", "P", true);
+                Token.registerNonXS("IsDigit");
+                Token.registerNonXS("IsUpper");
+                Token.registerNonXS("IsLower");
+                Token.registerNonXS("IsCntrl");
+                Token.registerNonXS("IsPrint");
+                Token.registerNonXS("IsPunct");
 
                 Token.setAlias("alpha", "IsAlpha", true);
                 Token.setAlias("alnum", "IsAlnum", true);
@@ -973,13 +999,48 @@
                 Token.setAlias("upper", "IsUpper", true);
                 Token.setAlias("word", "IsWord", true); // Perl extension
                 Token.setAlias("xdigit", "IsXDigit", true);
-                 */
+                Token.registerNonXS("alpha");
+                Token.registerNonXS("alnum");
+                Token.registerNonXS("ascii");
+                Token.registerNonXS("cntrl");
+                Token.registerNonXS("digit");
+                Token.registerNonXS("graph");
+                Token.registerNonXS("lower");
+                Token.registerNonXS("print");
+                Token.registerNonXS("punct");
+                Token.registerNonXS("space");
+                Token.registerNonXS("upper");
+                Token.registerNonXS("word");
+                Token.registerNonXS("xdigit");
             } // synchronized
         } // if null
         RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
             : (RangeToken)Token.categories2.get(name);
-        if (tok == null) System.out.println(name);
+        //if (tok == null) System.out.println(name);
         return tok;
+    }
+    static protected RangeToken getRange(String name, boolean positive, boolean xs) {
+        // With xs set, a name recorded by registerNonXS() is reported as unknown (null).
+        RangeToken found = Token.getRange(name, positive);
+        boolean hidden = xs && found != null && Token.isRegisterNonXS(name);
+        return hidden ? null : found;
+    }
+
+    static Hashtable nonxs = null;
+    /**
+     * This method is called only by getRange(),
+     * so it need not be MT-safe.
+     */
+    static protected void registerNonXS(String name) {
+        // Create the table on first use, then record the name as its own value.
+        if (Token.nonxs == null)  Token.nonxs = new Hashtable();
+        Token.nonxs.put(name, name);
+    }
+    static protected boolean isRegisterNonXS(String name) {  // true iff name was passed to registerNonXS()
+        if (Token.nonxs == null)    // nothing registered yet
+            return false;
+        // Pure lookup: must not write to System.err (getRange() calls this during pattern parsing).
+        return Token.nonxs.containsKey(name);
     }
 
     private static void setAlias(String newName, String name, boolean positive) {

