dfs 01/03/29 08:33:19
Modified: src/java/org/apache/oro/text/regex OpCode.java
Perl5Compiler.java Perl5Debug.java
Perl5Matcher.java
Log:
Applied Takashi's fix for his posix character class patch.
Revision Changes Path
1.5 +22 -11 jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java
Index: OpCode.java
===================================================================
RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- OpCode.java 2001/01/29 00:19:00 1.4
+++ OpCode.java 2001/03/29 16:33:17 1.5
@@ -63,7 +63,7 @@
* op-codes used in a compiled regular expression.
@author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
- @version $Id: OpCode.java,v 1.4 2001/01/29 00:19:00 dfs Exp $
+ @version $Id: OpCode.java,v 1.5 2001/03/29 16:33:17 dfs Exp $
*/
final class OpCode {
@@ -91,8 +91,8 @@
_NOTHING = 15, // no Match empty string.
_STAR = 16, // yes Match this (simple) thing 0 or more times.
_PLUS = 17, // yes Match this (simple) thing 1 or more times.
- _ALNUM = 18, // no Match any alphanumeric character
- _NALNUM = 19, // no Match any non-alphanumeric character
+ _WORD = 18, // no Match any word character
+ _NWORD = 19, // no Match any non-word character
_BOUND = 20, // no Match "" at any word boundary
_NBOUND = 21, // no Match "" at any word non-boundary
_SPACE = 22, // no Match any whitespace character
@@ -123,20 +123,29 @@
_UPPER = 45,
_XDIGIT = 46,
_OPCODE = 47,
- _ONECHAR = 48;
+ _NOPCODE = 48,
+ _ONECHAR = 49,
+ _ALNUM = 50,
+ _ASCII = 51;
// Lengths of the various operands.
static final int _operandLength[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 0-9
+ 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 10-19
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, // OpCode 20-29
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 30-39
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // OpCode 40-49
+ 0, 0 // OpCode 50-51
};
static final char _opType[] = {
_END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
- _CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
- _NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
+ _CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _WORD,
+ _NWORD, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
_OPEN, _CLOSE, _MINMOD, _BOL, _BRANCH, _BRANCH, _END, _WHILEM,
- _ANYOFUN, _NANYOFUN
+ _ANYOFUN, _NANYOFUN, _RANGE, _ALPHA, _BLANK, _CNTRL, _GRAPH,
+ _LOWER, _PRINT, _PUNCT, _UPPER, _XDIGIT, _OPCODE, _NOPCODE,
+ _ONECHAR, _ALNUM, _ASCII
};
static final char _opLengthVaries[] = {
@@ -144,8 +153,10 @@
};
static final char _opLengthOne[] = {
- _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT,
- _ANYOFUN, _NANYOFUN
+ _ANY, _SANY, _ANYOF, _WORD, _NWORD, _SPACE, _NSPACE, _DIGIT, _NDIGIT,
+ _ANYOFUN, _NANYOFUN, _ALPHA, _BLANK, _CNTRL, _GRAPH, _LOWER, _PRINT,
+ _PUNCT, _UPPER, _XDIGIT, _OPCODE, _NOPCODE, _ONECHAR, _ALNUM,
+ _ASCII
};
static final int _NULL_OFFSET = -1;
1.8 +99 -84 jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java
Index: Perl5Compiler.java
===================================================================
RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- Perl5Compiler.java 2001/01/29 00:22:05 1.7
+++ Perl5Compiler.java 2001/03/29 16:33:17 1.8
@@ -67,7 +67,7 @@
* information about Perl5 regular expressions.
@author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
- @version $Id: Perl5Compiler.java,v 1.7 2001/01/29 00:22:05 dfs Exp $
+ @version $Id: Perl5Compiler.java,v 1.8 2001/03/29 16:33:17 dfs Exp $
* @see PatternCompiler
* @see MalformedPatternException
@@ -110,18 +110,20 @@
static {
__hashPOSIX = new HashMap();
- __hashPOSIX.put("alnum", new Character('w'));
+ __hashPOSIX.put("alnum", new Character(OpCode._ALNUM));
+ __hashPOSIX.put("word", new Character(OpCode._WORD));
__hashPOSIX.put("alpha", new Character(OpCode._ALPHA));
__hashPOSIX.put("blank", new Character(OpCode._BLANK));
__hashPOSIX.put("cntrl", new Character(OpCode._CNTRL));
- __hashPOSIX.put("digit", new Character('d'));
+ __hashPOSIX.put("digit", new Character(OpCode._DIGIT));
__hashPOSIX.put("graph", new Character(OpCode._GRAPH));
__hashPOSIX.put("lower", new Character(OpCode._LOWER));
__hashPOSIX.put("print", new Character(OpCode._PRINT));
__hashPOSIX.put("punct", new Character(OpCode._PUNCT));
- __hashPOSIX.put("space", new Character('s'));
+ __hashPOSIX.put("space", new Character(OpCode._SPACE));
__hashPOSIX.put("upper", new Character(OpCode._UPPER));
__hashPOSIX.put("xdigit", new Character(OpCode._XDIGIT));
+ __hashPOSIX.put("ascii", new Character(OpCode._ASCII));
}
@@ -642,12 +644,12 @@
__getNextChar();
break;
case 'w':
- offset = __emitNode(OpCode._ALNUM);
+ offset = __emitNode(OpCode._WORD);
retFlags[0] |= (__NONNULL | __SIMPLE);
__getNextChar();
break;
case 'W':
- offset = __emitNode(OpCode._NALNUM);
+ offset = __emitNode(OpCode._NWORD);
retFlags[0] |= (__NONNULL | __SIMPLE);
__getNextChar();
break;
@@ -732,7 +734,8 @@
if(__input._isAtEnd())
throw new
MalformedPatternException("Trailing \\ in expression.");
- // fall through to default
+
+ // fall through to default
default:
doDefault = true;
break tryAgain;
@@ -864,7 +867,6 @@
break forLoop;
}
break;
-
case CharStringPointer._END_OF_STRING:
case '\0':
if(pOffset >= maxOffset)
@@ -876,7 +878,6 @@
break;
} // end backslash switch
break;
-
case '#':
if((__modifierFlags[0] & __EXTENDED) != 0) {
while(pOffset < maxOffset && __input._getValue(pOffset) != '\n')
@@ -1106,7 +1107,9 @@
private int __parseUnicodeClass() throws MalformedPatternException {
boolean range = false, skipTest;
char clss, lastclss = Character.MAX_VALUE;
+
int offset, numLength[] = { 0 };
+ boolean negFlag[] = new boolean[1];
boolean opcodeFlag; /* clss isn't character when this flag true. */
if(__input._getValue() == '^') {
@@ -1136,83 +1139,85 @@
clss = __input._postIncrement();
} else {
/* try POSIX expression */
- char posixOpCode = __parsePOSIX();
+ char posixOpCode = __parsePOSIX(negFlag);
if(posixOpCode != 0){
opcodeFlag = true;
clss = posixOpCode;
}
}
-
- switch(clss){
- case 'w':
- opcodeFlag = true;
- clss = OpCode._ALNUM;
- lastclss = Character.MAX_VALUE;
- break;
- case 'W':
- opcodeFlag = true;
- clss = OpCode._NALNUM;
- lastclss = Character.MAX_VALUE;
- break;
- case 's':
- opcodeFlag = true;
- clss = OpCode._SPACE;
- lastclss = Character.MAX_VALUE;
- break;
- case 'S':
- opcodeFlag = true;
- clss = OpCode._NSPACE;
- lastclss = Character.MAX_VALUE;
- break;
- case 'd':
- opcodeFlag = true;
- clss = OpCode._DIGIT;
- lastclss = Character.MAX_VALUE;
- break;
- case 'D':
- opcodeFlag = true;
- clss = OpCode._NDIGIT;
- lastclss = Character.MAX_VALUE;
- break;
- case 'n':
- clss = '\n';
- break;
- case 'r':
- clss = '\r';
- break;
- case 't':
- clss = '\t';
- break;
- case 'f':
- clss = '\f';
- break;
- case 'b':
- clss = '\b';
- break;
- case 'e':
- clss = '\033';
- break;
- case 'a':
- clss = '\007';
- break;
- case 'x':
- clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
- numLength);
- __input._increment(numLength[0]);
- break;
- case 'c':
- clss = __input._postIncrement();
- if(Character.isLowerCase(clss))
- clss = Character.toUpperCase(clss);
- clss ^= 64;
- break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
- 3, numLength);
- __input._increment(numLength[0] - 1);
- break;
- default:
+ if (opcodeFlag != true) {
+ switch(clss){
+ case 'w':
+ opcodeFlag = true;
+ clss = OpCode._WORD;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'W':
+ opcodeFlag = true;
+ clss = OpCode._NWORD;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 's':
+ opcodeFlag = true;
+ clss = OpCode._SPACE;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'S':
+ opcodeFlag = true;
+ clss = OpCode._NSPACE;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'd':
+ opcodeFlag = true;
+ clss = OpCode._DIGIT;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'D':
+ opcodeFlag = true;
+ clss = OpCode._NDIGIT;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'n':
+ clss = '\n';
+ break;
+ case 'r':
+ clss = '\r';
+ break;
+ case 't':
+ clss = '\t';
+ break;
+ case 'f':
+ clss = '\f';
+ break;
+ case 'b':
+ clss = '\b';
+ break;
+ case 'e':
+ clss = '\033';
+ break;
+ case 'a':
+ clss = '\007';
+ break;
+ case 'x':
+ clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
+ numLength);
+ __input._increment(numLength[0]);
+ break;
+ case 'c':
+ clss = __input._postIncrement();
+ if(Character.isLowerCase(clss))
+ clss = Character.toUpperCase(clss);
+ clss ^= 64;
+ break;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
+ 3, numLength);
+ __input._increment(numLength[0] - 1);
+ break;
+ default:
+ break;
+ }
}
}
@@ -1235,7 +1240,10 @@
if(lastclss == clss) {
if(opcodeFlag == true) {
- __emitCode(OpCode._OPCODE);
+ if(negFlag[0] == false)
+ __emitCode(OpCode._OPCODE);
+ else
+ __emitCode(OpCode._NOPCODE);
} else {
__emitCode(OpCode._ONECHAR);
}
@@ -1281,7 +1289,7 @@
*
* @return OpCode. return 0 when fail parsing POSIX expression.
*/
- private char __parsePOSIX() throws MalformedPatternException {
+ private char __parsePOSIX(boolean negFlag[]) throws MalformedPatternException {
int offset = __input._getOffset();
int len = __input._getLength();
int pos = offset;
@@ -1290,6 +1298,12 @@
Object opcode;
if( value != ':' ) return 0;
+ if( __input._getValue(pos) == '^' ) {
+ negFlag[0] = true;
+ pos++;
+ } else {
+ negFlag[0] = false;
+ }
buf = new StringBuffer();
@@ -1311,7 +1325,8 @@
return 0;
__input._setOffset(pos);
-
+// System.out.println("posix="+buf.toString()+":"+((Character)opcode).charValue());
+
return ((Character)opcode).charValue();
}
1.4 +14 -3 jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java
Index: Perl5Debug.java
===================================================================
RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- Perl5Debug.java 2001/01/29 00:19:01 1.3
+++ Perl5Debug.java 2001/03/29 16:33:17 1.4
@@ -68,7 +68,7 @@
* comparison with the program generated by Perl5 with the -r option.
@author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
- @version $Id: Perl5Debug.java,v 1.3 2001/01/29 00:19:01 dfs Exp $
+ @version $Id: Perl5Debug.java,v 1.4 2001/03/29 16:33:17 dfs Exp $
* @see Perl5Pattern
*/
@@ -199,14 +199,25 @@
case OpCode._NOTHING: str = "NOTHING"; break;
case OpCode._BACK : str = "BACK"; break;
case OpCode._END : str = "END"; break;
- case OpCode._ALNUM : str = "ALNUM"; break;
- case OpCode._NALNUM: str = "NALNUM"; break;
+ case OpCode._WORD : str = "WORD"; break;
+ case OpCode._NWORD: str = "NWORD"; break;
case OpCode._BOUND : str = "BOUND"; break;
case OpCode._NBOUND: str = "NBOUND"; break;
case OpCode._SPACE : str = "SPACE"; break;
case OpCode._NSPACE: str = "NSPACE"; break;
case OpCode._DIGIT : str = "DIGIT"; break;
case OpCode._NDIGIT: str = "NDIGIT"; break;
+ case OpCode._ALPHA : str = "ALPHA"; break;
+ case OpCode._BLANK : str = "BLANK"; break;
+ case OpCode._CNTRL : str = "CNTRL"; break;
+ case OpCode._GRAPH : str = "GRAPH"; break;
+ case OpCode._LOWER : str = "LOWER"; break;
+ case OpCode._PRINT : str = "PRINT"; break;
+ case OpCode._PUNCT : str = "PUNCT"; break;
+ case OpCode._UPPER : str = "UPPER"; break;
+ case OpCode._XDIGIT: str = "XDIGIT"; break;
+ case OpCode._ALNUM : str = "ALNUM"; break;
+ case OpCode._ASCII : str = "ASCII"; break;
case OpCode._CURLY :
buffer.append("CURLY {");
buffer.append((int)OpCode._getArg1(program, offset));
1.10 +22 -14 jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java
Index: Perl5Matcher.java
===================================================================
RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- Perl5Matcher.java 2001/01/29 00:22:05 1.9
+++ Perl5Matcher.java 2001/03/29 16:33:18 1.10
@@ -66,7 +66,7 @@
* Perl5Compiler.
@author <a href="mailto:[EMAIL PROTECTED]">Daniel F. Savarese</a>
- @version $Id: Perl5Matcher.java,v 1.9 2001/01/29 00:22:05 dfs Exp $
+ @version $Id: Perl5Matcher.java,v 1.10 2001/03/29 16:33:18 dfs Exp $
* @see PatternMatcher
* @see Perl5Compiler
@@ -512,7 +512,7 @@
}
break;
- case OpCode._ALNUM:
+ case OpCode._WORD:
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(OpCode._isWordCharacter(ch)) {
@@ -527,7 +527,7 @@
}
break;
- case OpCode._NALNUM:
+ case OpCode._NWORD:
while(__currentOffset < endOffset) {
ch = __input[__currentOffset];
if(!OpCode._isWordCharacter(ch)) {
@@ -637,14 +637,24 @@
} else {
offset+=2;
}
+
+ } else if(__program[offset] == OpCode._ONECHAR) {
+ offset++;
+ if(__program[offset++] == code) return isANYOF;
- } else if( __program[offset] == OpCode._OPCODE ){
+ } else {
+ isANYOF = (__program[offset] == OpCode._OPCODE)
+ ? isANYOF : !isANYOF;
+
offset++;
switch ( __program[offset++] ) {
case OpCode._ALNUM:
+ if(Character.isLetterOrDigit(code)) return isANYOF;
+ break;
+ case OpCode._WORD:
if(OpCode._isWordCharacter(code)) return isANYOF;
break;
- case OpCode._NALNUM:
+ case OpCode._NWORD:
if(!OpCode._isWordCharacter(code)) return isANYOF;
break;
case OpCode._SPACE:
@@ -697,12 +707,10 @@
(code >= 'a' && code <= 'f') ||
(code >= 'A' && code <= 'F')) return isANYOF;
break;
- }
- } else if((__program[offset++] == OpCode._ONECHAR) &&
- (__program[offset++] == code))
- {
- return isANYOF;
+ case OpCode._ASCII:
+ if(code < 0x80)return isANYOF;
}
+ }
}
return !isANYOF;
}
@@ -785,12 +793,12 @@
}
break;
- case OpCode._ALNUM:
+ case OpCode._WORD:
while(scan < eol && OpCode._isWordCharacter(__input[scan]))
++scan;
break;
- case OpCode._NALNUM:
+ case OpCode._NWORD:
while(scan < eol && !OpCode._isWordCharacter(__input[scan]))
++scan;
break;
@@ -953,7 +961,7 @@
nextChar = (inputRemains ? __input[input] : __EOS);
break;
- case OpCode._ALNUM:
+ case OpCode._WORD:
if(!inputRemains)
return false;
if(!OpCode._isWordCharacter(nextChar))
@@ -962,7 +970,7 @@
nextChar = (inputRemains ? __input[input] : __EOS);
break;
- case OpCode._NALNUM:
+ case OpCode._NWORD:
if(!inputRemains && input >= __eol)
return false;
if(OpCode._isWordCharacter(nextChar))