This is an automated email from the git hooks/post-receive script. henrich pushed a commit to branch debian/sid in repository jruby-joni.
commit 7d7a5189d47bed7b5ec47e9f01b428706eceb249 Author: Marcin Mielzynski <[email protected]> Date: Mon Feb 13 01:36:57 2012 +0100 Bump dependency version for jcodings and support nonunicode \s \d \w --- pom.xml | 2 +- src/org/joni/Config.java | 56 +++---- src/org/joni/Lexer.java | 336 +++++++++++++++++++-------------------- src/org/joni/Parser.java | 319 ++++++++++++++++++++----------------- src/org/joni/ast/CClassNode.java | 25 ++- 5 files changed, 392 insertions(+), 346 deletions(-) diff --git a/pom.xml b/pom.xml index dff5b07..a8a3eba 100644 --- a/pom.xml +++ b/pom.xml @@ -75,7 +75,7 @@ <dependency> <groupId>org.jruby.jcodings</groupId> <artifactId>jcodings</artifactId> - <version>1.0.4</version> + <version>1.0.6</version> </dependency> <dependency> <groupId>junit</groupId> diff --git a/src/org/joni/Config.java b/src/org/joni/Config.java index 07762f0..f1f4947 100644 --- a/src/org/joni/Config.java +++ b/src/org/joni/Config.java @@ -1,20 +1,20 @@ /* - * Permission is hereby granted, free of charge, to any person obtaining a copy of - * this software and associated documentation files (the "Software"), to deal in - * the Software without restriction, including without limitation the rights to - * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is furnished to do * so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ package org.joni; @@ -23,25 +23,25 @@ import java.io.PrintStream; public interface Config extends org.jcodings.Config { final int CHAR_TABLE_SIZE = 256; - + final boolean USE_NAMED_GROUP = true; final boolean USE_SUBEXP_CALL = true; final boolean USE_BACKREF_WITH_LEVEL = true; /* \k<name+n>, \k<name-n> */ - + final boolean USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT = true; /* /(?:()|())*\2/ */ final boolean USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE = true; /* /\n$/ =~ "\n" */ final boolean USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR = false; final boolean CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS = true; - + final boolean USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE = false; final boolean USE_CAPTURE_HISTORY = false; final boolean USE_VARIABLE_META_CHARS = true; final boolean USE_WORD_BEGIN_END = true; /* "\<": word-begin, "\>": word-end */ - final boolean USE_POSIX_API_REGION_OPTION = true; /* needed for POSIX API support */ + final boolean USE_POSIX_API_REGION_OPTION = true; /* needed for POSIX API support */ final boolean USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE = true; final boolean USE_COMBINATION_EXPLOSION_CHECK = false; - + final int NREGION = 10; final int MAX_BACKREF_NUM = 1000; final int MAX_REPEAT_NUM = 100000; @@ -53,34 +53,36 @@ public interface Config extends org.jcodings.Config { // internal config final boolean USE_PARSE_TREE_NODE_RECYCLE = true; final boolean USE_OP_PUSH_OR_JUMP_EXACT = true; - final boolean USE_SHARED_CCLASS_TABLE = false; - final boolean USE_QTFR_PEEK_NEXT = true; + final boolean USE_SHARED_CCLASS_TABLE = false; + final boolean USE_QTFR_PEEK_NEXT = true; final int INIT_MATCH_STACK_SIZE = 64; final int DEFAULT_MATCH_STACK_LIMIT_SIZE = 0; /* unlimited */ final int NUMBER_OF_POOLED_STACKS = 4; - - + + final boolean DONT_OPTIMIZE = false; - - + + final int MAX_CAPTURE_HISTORY_GROUP = 31; - + final int CHECK_STRING_THRESHOLD_LEN = 7; final int CHECK_BUFF_MAX_SIZE = 0x4000; - - + + final boolean NON_UNICODE_SDW = false; + + final PrintStream log = System.out; final PrintStream err = System.err; final boolean DEBUG_ALL = false; - final boolean DEBUG = DEBUG_ALL; + final boolean DEBUG = DEBUG_ALL; final boolean DEBUG_PARSE_TREE = DEBUG_ALL; final boolean DEBUG_COMPILE = DEBUG_ALL; final boolean DEBUG_COMPILE_BYTE_CODE_INFO = DEBUG_ALL; - final boolean DEBUG_SEARCH = DEBUG_ALL; + final boolean DEBUG_SEARCH = DEBUG_ALL; final boolean DEBUG_MATCH = DEBUG_ALL; final boolean DEBUG_ASM = true; final boolean DEBUG_ASM_EXEC = true; diff --git a/src/org/joni/Lexer.java b/src/org/joni/Lexer.java index 172132f..9094757 100644 --- a/src/org/joni/Lexer.java +++ b/src/org/joni/Lexer.java @@ -1,20 +1,20 @@ /* - * Permission is hereby granted, free of charge, to any person obtaining a copy of - * this software and associated documentation files (the "Software"), to deal in - * the Software without restriction, including without limitation the rights to - * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is furnished to do * so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ package org.joni; @@ -31,7 +31,7 @@ import org.joni.constants.TokenType; import org.joni.exception.ErrorMessages; class Lexer extends ScannerSupport { - protected final ScanEnvironment env; + protected final ScanEnvironment env; protected final Syntax syntax; // fast access to syntax protected final Token token = new Token(); // current token @@ -40,17 +40,17 @@ class Lexer extends ScannerSupport { this.env = env; this.syntax = env.syntax; } - + /** * @return 0: normal {n,m}, 2: fixed {n} - * !introduce returnCode here + * !introduce returnCode here */ private int fetchRangeQuantifier() { mark(); boolean synAllow = syntax.allowInvalidInterval(); - + if (!left()) { - if (synAllow) { + if (synAllow) { return 1; /* "....{" : OK! */ } else { newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE); @@ -63,7 +63,7 @@ class Lexer extends ScannerSupport { newSyntaxException(ERR_END_PATTERN_AT_LEFT_BRACE); } } - + int low = scanUnsignedNumber(); if (low < 0) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); if (low > Config.MAX_REPEAT_NUM) newSyntaxException(ErrorMessages.ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); @@ -77,18 +77,18 @@ class Lexer extends ScannerSupport { return invalidRangeQuantifier(synAllow); } } - + if (!left()) return invalidRangeQuantifier(synAllow); - + fetch(); int up; int ret = 0; if (c == ',') { - int prev = p; // ??? last + int prev = p; // ??? last up = scanUnsignedNumber(); if (up < 0) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); if (up > Config.MAX_REPEAT_NUM) newValueException(ERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE); - + if (p == prev) { if (nonLow) return invalidRangeQuantifier(synAllow); up = QuantifierNode.REPEAT_INFINITE; /* {n,} : {n,infinite} */ @@ -99,28 +99,28 @@ class Lexer extends ScannerSupport { up = low; /* {n} : exact n times */ ret = 2; /* fixed */ } - + if (!left()) return invalidRangeQuantifier(synAllow); fetch(); - + if (syntax.opEscBraceInterval()) { if (c != syntax.metaCharTable.esc) return invalidRangeQuantifier(synAllow); fetch(); } - + if (c != '}') return invalidRangeQuantifier(synAllow); - + if (!isRepeatInfinite(up) && low > up) { newValueException(ERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE); } - + token.type = TokenType.INTERVAL; token.setRepeatLower(low); token.setRepeatUpper(up); - + return ret; /* 0: normal {n,m}, 2: fixed {n} */ } - + private int invalidRangeQuantifier(boolean synAllow) { if (synAllow) { restore(); @@ -130,7 +130,7 @@ class Lexer extends ScannerSupport { return 0; // not reached } } - + /* \M-, \C-, \c, or \... */ private int fetchEscapedValue() { if (!left()) newSyntaxException(ERR_END_PATTERN_AT_ESCAPE); @@ -164,20 +164,20 @@ class Lexer extends ScannerSupport { fetchEscapedValueBackSlash(); } break; - + case 'c': if (syntax.opEscCControl()) { fetchEscapedValueControl(); } /* fall through */ - + default: fetchEscapedValueBackSlash(); } // switch - + return c; // ??? } - + private void fetchEscapedValueBackSlash() { c = env.convertBackslashValue(c); } @@ -194,7 +194,7 @@ class Lexer extends ScannerSupport { c &= 0x9f; } } - + private int nameEndCodePoint(int start) { switch(start) { case '<': @@ -212,16 +212,16 @@ class Lexer extends ScannerSupport { \k<num+n>, \k<num-n> \k<-num+n>, \k<-num-n> */ - + // value implicit (rnameEnd) private boolean fetchNameWithLevel(int startCode, int[]rbackNum, int[]rlevel) { int src = p; boolean existLevel = false; int isNum = 0; int sign = 1; - + int endCode = nameEndCodePoint(startCode); - int pnumHead = p; + int pnumHead = p; int nameEnd = stop; String err = null; @@ -232,15 +232,15 @@ class Lexer extends ScannerSupport { if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME); if (enc.isDigit(c)) { isNum = 1; - } else if (c == '-') { + } else if (c == '-') { isNum = 2; sign = -1; pnumHead = p; - } else if (!enc.isWord(c)) { + } else if (!enc.isWord(c)) { err = ERR_INVALID_GROUP_NAME; } } - + while (left()) { nameEnd = p; fetch(); @@ -248,7 +248,7 @@ class Lexer extends ScannerSupport { if (isNum == 2) err = ERR_INVALID_GROUP_NAME; break; } - + if (isNum != 0) { if (enc.isDigit(c)) { isNum = 1; @@ -273,11 +273,11 @@ class Lexer extends ScannerSupport { if (level < 0) newValueException(ERR_TOO_BIG_NUMBER); rlevel[0] = level * flag; existLevel = true; - + fetch(); isEndCode = c == endCode; } - + if (!isEndCode) { err = ERR_INVALID_GROUP_NAME; nameEnd = stop; @@ -295,7 +295,7 @@ class Lexer extends ScannerSupport { } else if (backNum == 0) { newValueException(ERR_INVALID_GROUP_NAME, src, stop); } - rbackNum[0] = backNum * sign; + rbackNum[0] = backNum * sign; } value = nameEnd; return existLevel; @@ -304,14 +304,14 @@ class Lexer extends ScannerSupport { return false; // not reached } } - + // USE_NAMED_GROUP // ref: 0 -> define name (don't allow number name) // 1 -> reference name (allow number name) private int fetchNameForNamedGroup(int startCode, boolean ref) { int src = p; value = 0; - + int isNum = 0; int sign = 1; @@ -332,7 +332,7 @@ class Lexer extends ScannerSupport { err = ERR_INVALID_GROUP_NAME; // isNum = 0; } - } else if (c == '-') { + } else if (c == '-') { if (ref) { isNum = 2; sign = -1; @@ -342,10 +342,10 @@ class Lexer extends ScannerSupport { // isNum = 0; } } else if (!enc.isWord(c)) { - err = ERR_INVALID_CHAR_IN_GROUP_NAME; + err = ERR_INVALID_CHAR_IN_GROUP_NAME; } } - + if (err == null) { while (left()) { nameEnd = p; @@ -354,7 +354,7 @@ class Lexer extends ScannerSupport { if (isNum == 2) err = ERR_INVALID_GROUP_NAME; break; } - + if (isNum != 0) { if (enc.isDigit(c)) { isNum = 1; @@ -372,7 +372,7 @@ class Lexer extends ScannerSupport { } } } - + if (c != endCode) { err = ERR_INVALID_GROUP_NAME; nameEnd = stop; @@ -410,12 +410,12 @@ class Lexer extends ScannerSupport { private final int fetchNameForNoNamedGroup(int startCode, boolean ref) { int src = p; value = 0; - + int isNum = 0; int sign = 1; - + int endCode = nameEndCodePoint(startCode); - int pnumHead = p; + int pnumHead = p; int nameEnd = stop; String err = null; @@ -424,7 +424,7 @@ class Lexer extends ScannerSupport { } else { fetch(); if (c == endCode) newValueException(ERR_EMPTY_GROUP_NAME); - + if (enc.isDigit(c)) { isNum = 1; } else if (c == '-') { @@ -438,17 +438,17 @@ class Lexer extends ScannerSupport { while(left()) { nameEnd = p; - + fetch(); if (c == endCode || c == ')') break; if (!enc.isDigit(c)) err = ERR_INVALID_CHAR_IN_GROUP_NAME; } - - if (err == null && c != endCode) { + + if (err == null && c != endCode) { err = ERR_INVALID_GROUP_NAME; nameEnd = stop; } - + if (err == null) { mark(); p = pnumHead; @@ -460,7 +460,7 @@ class Lexer extends ScannerSupport { newValueException(ERR_INVALID_GROUP_NAME, src, nameEnd); } backNum *= sign; - + value = nameEnd; return backNum; } else { @@ -468,7 +468,7 @@ class Lexer extends ScannerSupport { return 0; // not reached } } - + protected final int fetchName(int startCode, boolean ref) { if (Config.USE_NAMED_GROUP) { return fetchNameForNamedGroup(startCode, ref); @@ -476,11 +476,11 @@ class Lexer extends ScannerSupport { return fetchNameForNoNamedGroup(startCode, ref); } } - + private boolean strExistCheckWithEsc(int[]s, int n, int bad) { int p = this.p; int to = this.stop; - + boolean inEsc = false; int i=0; @@ -508,14 +508,14 @@ class Lexer extends ScannerSupport { } } return false; - } - - private static final int send[] = new int[]{':', ']'}; - + } + + private static final int send[] = new int[]{':', ']'}; + protected final TokenType fetchTokenInCC() { int last; int c2; - + if (!left()) { token.type = TokenType.EOT; return token.type; @@ -526,7 +526,7 @@ class Lexer extends ScannerSupport { token.base = 0; token.setC(c); token.escaped = false; - + if (c == ']') { token.type = TokenType.CC_CLOSE; } else if (c == '-') { @@ -539,40 +539,40 @@ class Lexer extends ScannerSupport { token.setC(c); switch (c) { - + case 'w': token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.WORD); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); token.setPropNot(false); break; - + case 'W': token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.WORD); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); token.setPropNot(true); break; - + case 'd': token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.DIGIT); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); token.setPropNot(false); break; case 'D': token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.DIGIT); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); token.setPropNot(true); break; case 's': token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.SPACE); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); token.setPropNot(false); break; - + case 'S': token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.SPACE); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); token.setPropNot(true); break; @@ -589,41 +589,41 @@ class Lexer extends ScannerSupport { token.setPropCType(CharacterType.XDIGIT); token.setPropNot(true); break; - + case 'p': case 'P': - c2 = peek(); // !!! migrate to peekIs + c2 = peek(); // !!! migrate to peekIs if (c2 == '{' && syntax.op2EscPBraceCharProperty()) { inc(); token.type = TokenType.CHAR_PROPERTY; token.setPropNot(c == 'P'); - + if (syntax.op2EscPBraceCircumflexNot()) { c2 = fetchTo(); if (c2 == '^') { - token.setPropNot(!token.getPropNot()); + token.setPropNot(!token.getPropNot()); } else { unfetch(); } } } break; - + case 'x': if (!left()) break; last = p; - + if (peekIs('{') && syntax.opEscXBraceHex8()) { inc(); int num = scanUnsignedHexadecimalNumber(8); if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE); if (left()) { c2 = peek(); - if (enc.isXDigit(c2)) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE); + if (enc.isXDigit(c2)) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE); } - + if (p > last + enc.length(bytes, last, stop) && left() && peekIs('}')) { - inc(); + inc(); token.type = TokenType.CODE_POINT; token.base = 16; token.setCode(num); @@ -642,11 +642,11 @@ class Lexer extends ScannerSupport { token.setC(num); } break; - + case 'u': if (!left()) break; last = p; - + if (syntax.op2EscUHex4()) { int num = scanUnsignedHexadecimalNumber(4); if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); @@ -658,7 +658,7 @@ class Lexer extends ScannerSupport { token.setCode(num); } break; - + case '0': case '1': case '2': @@ -680,7 +680,7 @@ class Lexer extends ScannerSupport { token.setC(num); } break; - + default: unfetch(); int num = fetchEscapedValue(); @@ -690,7 +690,7 @@ class Lexer extends ScannerSupport { } break; } // switch - + } else if (c == '[') { if (syntax.opPosixBracket() && peekIs(':')) { token.backP = p; /* point at '[' is readed */ @@ -721,24 +721,24 @@ class Lexer extends ScannerSupport { } return token.type; } - + protected final int backrefRelToAbs(int relNo) { return env.numMem + 1 + relNo; } - + protected final TokenType fetchToken() { int last; - + // mark(); // out - + start: while(true) { - + if (!left()) { token.type = TokenType.EOT; return token.type; } - + token.type = TokenType.STRING; token.base = 0; token.backP = p; @@ -814,14 +814,14 @@ class Lexer extends ScannerSupport { case 'w': if (!syntax.opEscWWord()) break; token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.WORD); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); token.setPropNot(false); break; case 'W': if (!syntax.opEscWWord()) break; token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.WORD); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.W : CharacterType.WORD); token.setPropNot(true); break; @@ -845,7 +845,7 @@ class Lexer extends ScannerSupport { break; } // USE_WORD_BEGIN_END break; // ? - + case '>': if (Config.USE_WORD_BEGIN_END) { if (!syntax.opEscLtGtWordBeginEnd()) break; @@ -858,28 +858,28 @@ class Lexer extends ScannerSupport { case 's': if (!syntax.opEscSWhiteSpace()) break; token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.SPACE); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); token.setPropNot(false); break; case 'S': if (!syntax.opEscSWhiteSpace()) break; token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.SPACE); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.S : CharacterType.SPACE); token.setPropNot(true); break; - + case 'd': if (!syntax.opEscDDigit()) break; token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.DIGIT); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); token.setPropNot(false); break; - + case 'D': if (!syntax.opEscDDigit()) break; token.type = TokenType.CHAR_TYPE; - token.setPropCType(CharacterType.DIGIT); + token.setPropCType(Config.NON_UNICODE_SDW ? CharacterType.D : CharacterType.DIGIT); token.setPropNot(true); break; @@ -903,26 +903,26 @@ class Lexer extends ScannerSupport { token.type = TokenType.ANCHOR; token.setSubtype(AnchorType.BEGIN_BUF); break; - + case 'Z': if (!syntax.opEscAZBufAnchor()) break; token.type = TokenType.ANCHOR; token.setSubtype(AnchorType.SEMI_END_BUF); break; - + case 'z': if (!syntax.opEscAZBufAnchor()) break; - // end_buf label - token.type = TokenType.ANCHOR; + // end_buf label + token.type = TokenType.ANCHOR; token.setSubtype(AnchorType.END_BUF); break; - + case 'G': if (!syntax.opEscCapitalGBeginAnchor()) break; token.type = TokenType.ANCHOR; token.setSubtype(AnchorType.BEGIN_POSITION); break; - + case '`': if (!syntax.op2EscGnuBufAnchor()) break; // goto begin_buf @@ -932,8 +932,8 @@ class Lexer extends ScannerSupport { case '\'': if (!syntax.op2EscGnuBufAnchor()) break; - // goto end_buf - token.type = TokenType.ANCHOR; + // goto end_buf + token.type = TokenType.ANCHOR; token.setSubtype(AnchorType.END_BUF); break; @@ -945,9 +945,9 @@ class Lexer extends ScannerSupport { int num = scanUnsignedHexadecimalNumber(8); if (num < 0) newValueException(ERR_TOO_BIG_WIDE_CHAR_VALUE); if (left()) { - if (enc.isXDigit(peek())) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE); + if (enc.isXDigit(peek())) newValueException(ERR_TOO_LONG_WIDE_CHAR_VALUE); } - + if (p > last + enc.length(bytes, last, stop) && left() && peekIs('}')) { inc(); token.type = TokenType.CODE_POINT; @@ -967,11 +967,11 @@ class Lexer extends ScannerSupport { token.setC(num); } break; - + case 'u': // extract to helper if (!left()) break; last = p; - + if (syntax.op2EscUHex4()) { int num = scanUnsignedHexadecimalNumber(4); if (num < 0) newValueException(ERR_TOO_BIG_NUMBER); @@ -983,7 +983,7 @@ class Lexer extends ScannerSupport { token.setCode(num); } break; - + case '1': case '2': case '3': @@ -992,11 +992,11 @@ class Lexer extends ScannerSupport { case '6': case '7': case '8': - case '9': + case '9': unfetch(); last = p; int num = scanUnsignedNumber(); - if (num < 0 || num > Config.MAX_BACKREF_NUM) { + if (num < 0 || num > Config.MAX_BACKREF_NUM) { // goto skip_backref } else if (syntax.opDecimalBackref() && (num <= env.numMem || num <= 9)) { /* This spec. from GNU regex */ if (syntax.strictCheckBackref()) { @@ -1018,7 +1018,7 @@ class Lexer extends ScannerSupport { } p = last; /* fall through */ - + case '0': if (syntax.opEscOctal3()) { last = p; @@ -1034,7 +1034,7 @@ class Lexer extends ScannerSupport { inc(); } break; - + case 'k': if (Config.USE_NAMED_GROUP) { if (syntax.op2EscKNamedBackref()) { @@ -1052,13 +1052,13 @@ class Lexer extends ScannerSupport { backNum = fetchName(c, true); } // USE_BACKREF_AT_LEVEL int nameEnd = value; // set by fetchNameWithLevel/fetchName - + if (backNum != 0) { if (backNum < 0) { backNum = backrefRelToAbs(backNum); if (backNum <= 0) newValueException(ERR_INVALID_BACKREF); } - + if (syntax.strictCheckBackref() && (backNum > env.numMem || env.memNodes == null)) { newValueException(ERR_INVALID_BACKREF); } @@ -1099,11 +1099,11 @@ class Lexer extends ScannerSupport { unfetch(); } } - + break; } // USE_NAMED_GROUP break; - + case 'g': if (Config.USE_SUBEXP_CALL) { if (syntax.op2EscGSubexpCall()) { @@ -1120,34 +1120,34 @@ class Lexer extends ScannerSupport { unfetch(); } } - break; + break; } // USE_SUBEXP_CALL break; - + case 'Q': if (syntax.op2EscCapitalQQuote()) { token.type = TokenType.QUOTE_OPEN; } break; - + case 'p': case 'P': if (peekIs('{') && syntax.op2EscPBraceCharProperty()) { inc(); token.type = TokenType.CHAR_PROPERTY; token.setPropNot(c == 'P'); - + if (syntax.op2EscPBraceCircumflexNot()) { fetch(); if (c == '^') { - token.setPropNot(!token.getPropNot()); + token.setPropNot(!token.getPropNot()); } else { unfetch(); } } } break; - + default: unfetch(); num = fetchEscapedValue(); @@ -1160,13 +1160,13 @@ class Lexer extends ScannerSupport { p = token.backP + enc.length(bytes, token.backP, stop); } break; - + } // switch (c) - + } else { token.setC(c); token.escaped = false; - + // remove code duplication if (Config.USE_VARIABLE_META_CHARS) { if (c != MetaChar.INEFFECTIVE_META_CHAR && syntax.opVariableMetaCharacters()) { @@ -1198,16 +1198,16 @@ class Lexer extends ScannerSupport { } } } // USE_VARIABLE_META_CHARS - - { + + { switch(c) { - + case '.': if (!syntax.opDotAnyChar()) break; // any_char: token.type = TokenType.ANYCHAR; break; - + case '*': if (!syntax.opAsteriskZeroInf()) break; // anytime: @@ -1225,8 +1225,8 @@ class Lexer extends ScannerSupport { token.setRepeatUpper(QuantifierNode.REPEAT_INFINITE); greedyCheck(); break; - - case '?': + + case '?': if (!syntax.opQMarkZeroOne()) break; // zero_or_one_time: token.type = TokenType.OP_REPEAT; @@ -1234,7 +1234,7 @@ class Lexer extends ScannerSupport { token.setRepeatUpper(1); greedyCheck(); break; - + case '{': if (!syntax.opBraceInterval()) break; switch(fetchRangeQuantifier()) { @@ -1251,12 +1251,12 @@ class Lexer extends ScannerSupport { default: /* 1 : normal char */ } // inner switch break; - + case '|': if (!syntax.opVBarAlt()) break; token.type = TokenType.ALT; break; - + case '(': if (peekIs('?') && syntax.op2QMarkGroupEffect()) { inc(); @@ -1275,49 +1275,49 @@ class Lexer extends ScannerSupport { } unfetch(); } - + if (!syntax.opLParenSubexp()) break; token.type = TokenType.SUBEXP_OPEN; break; - + case ')': if (!syntax.opLParenSubexp()) break; - token.type = TokenType.SUBEXP_CLOSE; + token.type = TokenType.SUBEXP_CLOSE; break; - + case '^': if (!syntax.opLineAnchor()) break; token.type = TokenType.ANCHOR; token.setSubtype(isSingleline(env.option) ? AnchorType.BEGIN_BUF : AnchorType.BEGIN_LINE); break; - + case '$': if (!syntax.opLineAnchor()) break; token.type = TokenType.ANCHOR; token.setSubtype(isSingleline(env.option) ? AnchorType.SEMI_END_BUF : AnchorType.END_LINE); break; - + case '[': if (!syntax.opBracketCC()) break; token.type = TokenType.CC_CC_OPEN; break; - + case ']': //if (*src > env->pattern) /* /].../ is allowed. */ //CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); break; - + case '#': if (Option.isExtend(env.option)) { while (left()) { fetch(); if (enc.isNewLine(c)) break; } - continue start; // goto start - + continue start; // goto start + } break; - + case ' ': case '\t': case '\n': @@ -1327,22 +1327,22 @@ class Lexer extends ScannerSupport { continue start; // goto start } break; - + default: // string break; - + } // switch } } - + break; } // while - return token.type; + return token.type; } - + private void greedyCheck() { if (left() && peekIs('?') && syntax.opQMarkNonGreedy()) { - + fetch(); token.setRepeatGreedy(false); @@ -1351,14 +1351,14 @@ class Lexer extends ScannerSupport { possessiveCheck(); } } - + private void possessiveCheck() { - if (left() && peekIs('+') && + if (left() && peekIs('+') && (syntax.op2PlusPossessiveRepeat() && token.type != TokenType.INTERVAL || syntax.op2PlusPossessiveInterval() && token.type == TokenType.INTERVAL)) { - + fetch(); - + token.setRepeatGreedy(true); token.setRepeatPossessive(true); } else { diff --git a/src/org/joni/Parser.java b/src/org/joni/Parser.java index a787d16..71d29fd 100644 --- a/src/org/joni/Parser.java +++ b/src/org/joni/Parser.java @@ -1,20 +1,20 @@ /* - * Permission is hereby granted, free of charge, to any person obtaining a copy of - * this software and associated documentation files (the "Software"), to deal in - * the Software without restriction, including without limitation the rights to - * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies * of the Software, and to permit persons to whom the Software is furnished to do * so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. - * + * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ package org.joni; @@ -49,28 +49,28 @@ class Parser extends Lexer { protected final Regex regex; protected Node root; - + protected int returnCode; // return code used by parser methods (they itself return parsed nodes) - // this approach will not affect recursive calls - + // this approach will not affect recursive calls + protected Parser(ScanEnvironment env, byte[]bytes, int p, int end) { super(env, bytes, p, end); regex = env.reg; } - + // onig_parse_make_tree protected final Node parse() { root = parseRegexp(); regex.numMem = env.numMem; return root; } - + private static final int POSIX_BRACKET_NAME_MIN_LEN = 4; private static final int POSIX_BRACKET_CHECK_LIMIT_LENGTH = 20; private static final byte BRACKET_END[] = ":]".getBytes(); private boolean parsePosixBracket(CClassNode cc) { mark(); - + boolean not; if (peekIs('^')) { inc(); @@ -94,7 +94,7 @@ class Parser extends Lexer { return false; } } - + } // not_posix_bracket: @@ -104,7 +104,7 @@ class Parser extends Lexer { inc(); if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; } - + if (c == ':' && left()) { inc(); if (left()) { @@ -115,7 +115,7 @@ class Parser extends Lexer { restore(); return true; /* 1: is not POSIX bracket, but no error. */ } - + private CClassNode parseCharProperty() { int ctype = fetchCharPropertyToCType(); CClassNode n = new CClassNode(); @@ -123,28 +123,28 @@ class Parser extends Lexer { if (token.getPropNot()) n.setNot(); return n; } - + private boolean codeExistCheck(int code, boolean ignoreEscaped) { mark(); - + boolean inEsc = false; while(left()) { - if (ignoreEscaped && inEsc) { + if (ignoreEscaped && inEsc) { inEsc = false; } else { fetch(); if (c == code) { restore(); - return true; + return true; } if (c == syntax.metaCharTable.esc) inEsc = true; } } - + restore(); return false; } - + private CClassNode parseCharClass() { fetchTokenInCC(); @@ -155,35 +155,45 @@ class Parser extends Lexer { } else { neg = false; } - + if (token.type == TokenType.CC_CLOSE) { if (!codeExistCheck(']', true)) newSyntaxException(ERR_EMPTY_CHAR_CLASS); env.ccEscWarn("]"); token.type = TokenType.CHAR; /* allow []...] */ } - + CClassNode cc = new CClassNode(); CClassNode prevCC = null; CClassNode workCC = null; CCStateArg arg = new CCStateArg(); - + boolean andStart = false; arg.state = CCSTATE.START; while(token.type != TokenType.CC_CLOSE) { boolean fetched = false; - + switch (token.type) { - + case CHAR: - int len = enc.codeToMbcLength(token.getC()); - if (len > 1) { - arg.inType = CCVALTYPE.CODE_POINT; - } else { - // !sb_char:! - arg.inType = CCVALTYPE.SB; - } + int len; +// if (Config.VANILLA) { + len = enc.codeToMbcLength(token.getC()); + if (len > 1) { + arg.inType = CCVALTYPE.CODE_POINT; + } else { + // !sb_char:! + arg.inType = CCVALTYPE.SB; + } +// } else { +// if (token.getCode() >= BitSet.SINGLE_BYTE_SIZE || (len = enc.codeToMbcLength(token.getC())) > 1) { +// arg.inType = CCVALTYPE.CODE_POINT; +// } else { +// // !sb_char:! +// arg.inType = CCVALTYPE.SB; +// } +// } arg.v = token.getC(); arg.vIsRaw = false; // !goto val_entry2;! @@ -207,9 +217,9 @@ class Parser extends Lexer { buf[i] = (byte)token.getC(); } if (i < enc.minLength()) newValueException(ERR_TOO_SHORT_MULTI_BYTE_STRING); - + len = enc.length(buf, 0, i); - if (i < len) { + if (i < len) { newValueException(ERR_TOO_SHORT_MULTI_BYTE_STRING); } else if (i > len) { /* fetch back */ p = psave; @@ -233,7 +243,7 @@ class Parser extends Lexer { // !goto val_entry2;! valEntry2(cc, arg); break; - + case CODE_POINT: arg.v = token.getCode(); arg.vIsRaw = true; @@ -241,7 +251,7 @@ class Parser extends Lexer { // !val_entry2:! valEntry(cc, arg); break; - + case POSIX_BRACKET_OPEN: if (parsePosixBracket(cc)) { /* true: is not POSIX bracket */ env.ccEscWarn("["); @@ -255,20 +265,20 @@ class Parser extends Lexer { // !goto next_class;! cc.nextStateClass(arg, env); break; - + case CHAR_TYPE: cc.addCType(token.getPropCType(), token.getPropNot(), env, this); // !next_class:! cc.nextStateClass(arg, env); break; - + case CHAR_PROPERTY: int ctype = fetchCharPropertyToCType(); cc.addCType(ctype, token.getPropNot(), env, this); // !goto next_class;! cc.nextStateClass(arg, env); break; - + case CC_RANGE: if (arg.state == CCSTATE.VALUE) { fetchTokenInCC(); @@ -315,7 +325,7 @@ class Parser extends Lexer { rangeEndVal(cc, arg); break; } - + if (syntax.allowDoubleRangeOpInCC()) { env.ccEscWarn("-"); /* [0-9-a] is allowed as [0-9\-a] */ @@ -326,12 +336,12 @@ class Parser extends Lexer { newSyntaxException(ERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS); } break; - + case CC_CC_OPEN: /* [ */ CClassNode acc = parseCharClass(); cc.or(acc, enc); break; - + case CC_AND: /* && */ if (arg.state == CCSTATE.VALUE) { arg.v = 0; // ??? safe v ? @@ -349,36 +359,36 @@ class Parser extends Lexer { cc = workCC; } // initialize_cclass(cc); // clear it ?? - break; - + break; + case EOT: newSyntaxException(ERR_PREMATURE_END_OF_CHAR_CLASS); - - default: - newInternalException(ERR_PARSER_BUG); + + default: + newInternalException(ERR_PARSER_BUG); } // switch - + if (!fetched) fetchTokenInCC(); - + } // while - + if (arg.state == CCSTATE.VALUE) { arg.v = 0; // ??? safe v ? arg.vIsRaw = false; cc.nextStateValue(arg, env); } - + if (prevCC != null) { prevCC.and(cc, enc); cc = prevCC; } - + if (neg) { cc.setNot(); } else { cc.clearNot(); } - + if (cc.isNot() && syntax.notNewlineInNegativeCC()) { if (!cc.isEmpty()) { final int NEW_LINE = 0x0a; @@ -391,21 +401,21 @@ class Parser extends Lexer { } } } - + return cc; } - + private void valEntry2(CClassNode cc, CCStateArg arg) { cc.nextStateValue(arg, env); } - + private void valEntry(CClassNode cc, CCStateArg arg) { int len = enc.codeToMbcLength(arg.v); arg.inType = len == 1 ? CCVALTYPE.SB : CCVALTYPE.CODE_POINT; // !val_entry2:! valEntry2(cc, arg); } - + private void sbChar(CClassNode cc, CCStateArg arg) { arg.inType = CCVALTYPE.SB; arg.v = token.getC(); @@ -420,20 +430,20 @@ class Parser extends Lexer { // !goto val_entry;! valEntry(cc, arg); } - + private Node parseEnclose(TokenType term) { Node node = null; - + if (!left()) newSyntaxException(ERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS); - + int option = env.option; - - if (peekIs('?') && syntax.op2QMarkGroupEffect()) { + + if (peekIs('?') && syntax.op2QMarkGroupEffect()) { inc(); if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP); - + boolean listCapture = false; - + fetch(); switch(c) { case ':': /* (?:...) grouping only */ @@ -442,19 +452,19 @@ class Parser extends Lexer { node = parseSubExp(term); returnCode = 1; /* group */ return node; - + case '=': node = new AnchorNode(AnchorType.PREC_READ); break; - + case '!': /* preceding read */ node = new AnchorNode(AnchorType.PREC_READ_NOT); break; - + case '>': /* (?>...) stop backtrack */ node = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose break; - + case '\'': if (Config.USE_NAMED_GROUP) { if (syntax.op2QMarkLtNamedGroup()) { @@ -479,7 +489,7 @@ class Parser extends Lexer { if (syntax.op2QMarkLtNamedGroup()) { unfetch(); c = '<'; - + // !named_group1:! listCapture = false; // !named_group2:! @@ -488,17 +498,17 @@ class Parser extends Lexer { } else { newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); } - + } else { // USE_NAMED_GROUP newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); } // USE_NAMED_GROUP } break; - + case '@': - if (syntax.op2AtMarkCaptureHistory()) { + if (syntax.op2AtMarkCaptureHistory()) { if (Config.USE_NAMED_GROUP) { - if (syntax.op2QMarkLtNamedGroup()) { + if (syntax.op2QMarkLtNamedGroup()) { fetch(); if (c == '<' || c == '\'') { listCapture = true; @@ -518,7 +528,7 @@ class Parser extends Lexer { newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); } break; - + // case 'p': #ifdef USE_POSIXLINE_OPTION case '-': case 'i': @@ -531,19 +541,19 @@ class Parser extends Lexer { case ':': case ')': break; - + case '-': neg = true; break; - + case 'x': option = bsOnOff(option, Option.EXTEND, neg); break; - + case 'i': option = bsOnOff(option, Option.IGNORECASE, neg); break; - + case 's': if (syntax.op2OptionPerl()) { option = bsOnOff(option, Option.MULTILINE, neg); @@ -551,7 +561,7 @@ class Parser extends Lexer { newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); } break; - + case 'm': if (syntax.op2OptionPerl()) { option = bsOnOff(option, Option.SINGLELINE, !neg); @@ -561,15 +571,15 @@ class Parser extends Lexer { newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); } break; - + // case 'p': #ifdef USE_POSIXLINE_OPTION // not defined // option = bsOnOff(option, Option.MULTILINE|Option.SINGLELINE, neg); // break; - + default: newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); } // switch - + if (c == ')') { EncloseNode en = new EncloseNode(option, 0); // node_new_option node = en; @@ -590,11 +600,11 @@ class Parser extends Lexer { if (!left()) newSyntaxException(ERR_END_PATTERN_IN_GROUP); fetch(); } // while - + default: newSyntaxException(ERR_UNDEFINED_GROUP_OPTION); } // switch - + } else { if (isDontCaptureGroup(env.option)) { // !goto group;! @@ -608,7 +618,7 @@ class Parser extends Lexer { en.regNum = num; node = en; } - + fetchToken(); Node target = parseSubExp(term); @@ -626,25 +636,25 @@ class Parser extends Lexer { returnCode = 0; return node; // ?? } - + private Node namedGroup2(boolean listCapture) { int nm = p; int num = fetchName(c, false); int nameEnd = value; num = env.addMemEntry(); if (listCapture && num >= BitStatus.BIT_STATUS_BITS_NUM) newValueException(ERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY); - + regex.nameAdd(bytes, nm, nameEnd, num, syntax); EncloseNode en = new EncloseNode(env.option, true); // node_new_enclose_memory en.regNum = num; Node node = en; - + if (listCapture) env.captureHistory = bsOnAtSimple(env.captureHistory, num); env.numNamed++; return node; } - + private int nextChar; // hidden var private int findStrPosition(int[]s, int n, int from, int to) { int x; @@ -661,7 +671,7 @@ class Parser extends Lexer { q += enc.length(bytes, q, to); } if (i >= n) { - if (bytes[nextChar] != 0) nextChar = q; // we may need zero term semantics... + if (bytes[nextChar] != 0) nextChar = q; // we may need zero term semantics... return p; } } @@ -669,13 +679,13 @@ class Parser extends Lexer { } return -1; } - + private Node parseExp(TokenType term) { if (token.type == term) { //!goto end_of_token;! return new StringNode(); } - + Node node = null; boolean group = false; @@ -684,7 +694,7 @@ class Parser extends Lexer { case EOT: // !end_of_token:! return new StringNode(); // node_new_empty - + case SUBEXP_OPEN: node = parseEnclose(TokenType.SUBEXP_CLOSE); if (returnCode == 1) { @@ -697,13 +707,13 @@ class Parser extends Lexer { Node target = parseSubExp(term); env.option = prev; en.setTarget(target); - return node; + return node; } break; - + case SUBEXP_CLOSE: if (!syntax.allowUnmatchedCloseSubexp()) newSyntaxException(ERR_UNMATCHED_CLOSE_PARENTHESIS); - + if (token.escaped) { // !goto tk_raw_byte;! return parseExpTkRawByte(group); @@ -711,22 +721,22 @@ class Parser extends Lexer { // !goto tk_byte;! return parseExpTkByte(group); } - + case STRING: // !tk_byte:! return parseExpTkByte(group); - + case RAW_BYTE: // !tk_raw_byte:! return parseExpTkRawByte(group); - + case CODE_POINT: byte[]buf = new byte[Config.ENC_CODE_TO_MBC_MAXLEN]; int num = enc.codeToMbc(token.getCode(), buf, 0); - // #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG ... // setRaw() #else + // #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG ... // setRaw() #else node = new StringNode(buf, 0, num); break; - + case QUOTE_OPEN: int[]endOp = new int[]{syntax.metaCharTable.esc, 'E'}; int qstart = p; @@ -740,10 +750,21 @@ class Parser extends Lexer { case CHAR_TYPE: switch(token.getPropCType()) { + case CharacterType.D: + case CharacterType.S: + case CharacterType.W: + if (Config.NON_UNICODE_SDW) { + CClassNode cc = new CClassNode(); + cc.addCType(token.getPropCType(), false, env, this); + if (token.getPropNot()) cc.setNot(); + node = cc; + } + break; + case CharacterType.WORD: node = new CTypeNode(token.getPropCType(), token.getPropNot()); break; - + case CharacterType.SPACE: case CharacterType.DIGIT: case CharacterType.XDIGIT: @@ -753,41 +774,41 @@ class Parser extends Lexer { if (token.getPropNot()) ccn.setNot(); node = ccn; break; - + default: newInternalException(ERR_PARSER_BUG); - + } // inner switch break; - + case CHAR_PROPERTY: node = parseCharProperty(); break; - + case CC_CC_OPEN: CClassNode cc = parseCharClass(); node = cc; if (isIgnoreCase(env.option)) { ApplyCaseFoldArg arg = new ApplyCaseFoldArg(env, cc); enc.applyAllCaseFold(env.caseFoldFlag, ApplyCaseFold.INSTANCE, arg); - + if (arg.altRoot != null) { node = ConsAltNode.newAltNode(node, arg.altRoot); } } break; - + case ANYCHAR: node = new AnyCharNode(); break; - + case ANYCHAR_ANYTIME: node = new AnyCharNode(); QuantifierNode qn = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false); qn.setTarget(node); node = qn; break; - + case BACKREF: int[]backRefs = token.getBackrefNum() > 1 ? token.getBackrefRefs() : new int[]{token.getBackrefRef1()}; node = new BackRefNode(token.getBackrefNum(), @@ -796,9 +817,9 @@ class Parser extends Lexer { token.getBackrefExistLevel(), // #ifdef USE_BACKREF_AT_LEVEL token.getBackrefLevel(), // ... env); - + break; - + case CALL: if (Config.USE_SUBEXP_CALL) { int gNum = token.getCallGNum(); @@ -816,7 +837,7 @@ class Parser extends Lexer { case ANCHOR: node = new AnchorNode(token.getAnchor()); // possible bug in oniguruma break; - + case OP_REPEAT: case INTERVAL: if (syntax.contextIndepRepeatOps()) { @@ -830,75 +851,75 @@ class Parser extends Lexer { return parseExpTkByte(group); } break; - + default: newInternalException(ERR_PARSER_BUG); } //switch - + //targetp = node; - + // !re_entry:! fetchToken(); - + // !repeat:! return parseExpRepeat(node, group); } - + private Node parseExpTkByte(boolean group) { // !tk_byte:! StringNode node = new StringNode(bytes, token.backP, p); while (true) { fetchToken(); if (token.type != TokenType.STRING) break; - + if (token.backP == node.end) { node.end = p; // non escaped character, remain shared, just increase shared range } else { - node.cat(bytes, token.backP, p); // non continuous string stream, need to COW + node.cat(bytes, token.backP, p); // non continuous string stream, need to COW } - } + } // !string_end:! // targetp = node; // !goto repeat;! return parseExpRepeat(node, group); } - + private Node parseExpTkRawByte(boolean group) { // !tk_raw_byte:! // important: we don't use 0xff mask here neither in the compiler // (in the template string) so we won't have to mask target - // strings when comparing against them in the matcher + // strings when comparing against them in the matcher StringNode node = new StringNode((byte)token.getC()); node.setRaw(); - int len = 1; + int len = 1; while (true) { - if (len >= enc.minLength()) { - if (len == enc.length(node.bytes, node.p, node.end)) { + if (len >= enc.minLength()) { + if (len == enc.length(node.bytes, node.p, node.end)) { fetchToken(); node.clearRaw(); // !goto string_end;! return parseExpRepeat(node, group); } } - + fetchToken(); if (token.type != TokenType.RAW_BYTE) { /* Don't use this, it is wrong for little endian encodings. */ // USE_PAD_TO_SHORT_BYTE_CHAR ... - + newValueException(ERR_TOO_SHORT_MULTI_BYTE_STRING); } // important: we don't use 0xff mask here neither in the compiler // (in the template string) so we won't have to mask target - // strings when comparing against them in the matcher + // strings when comparing against them in the matcher node.cat((byte)token.getC()); len++; } // while } - + private Node parseExpRepeat(Node target, boolean group) { // !repeat:! while (token.type == TokenType.OP_REPEAT || token.type == TokenType.INTERVAL) { @@ -907,11 +928,11 @@ class Parser extends Lexer { QuantifierNode qtfr = new QuantifierNode(token.getRepeatLower(), token.getRepeatUpper(), token.type == TokenType.INTERVAL); - + qtfr.greedy = token.getRepeatGreedy(); int ret = qtfr.setQuantifier(target, group, env, bytes, getBegin(), getEnd()); Node qn = qtfr; - + if (token.getRepeatPossessive()) { EncloseNode en = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose en.setTarget(qn); @@ -923,7 +944,7 @@ class Parser extends Lexer { } else if (ret == 2) { /* split case: /abc+/ */ target = ConsAltNode.newListNode(target, null); ConsAltNode tmp = ((ConsAltNode)target).setCdr(ConsAltNode.newListNode(qn, null)); - + fetchToken(); return parseExpRepeatForCar(target, tmp, group); } @@ -941,11 +962,11 @@ class Parser extends Lexer { QuantifierNode qtfr = new QuantifierNode(token.getRepeatLower(), token.getRepeatUpper(), token.type == TokenType.INTERVAL); - + qtfr.greedy = token.getRepeatGreedy(); int ret = qtfr.setQuantifier(target.car, group, env, bytes, getBegin(), getEnd()); Node qn = qtfr; - + if (token.getRepeatPossessive()) { EncloseNode en = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose en.setTarget(qn); @@ -961,7 +982,7 @@ class Parser extends Lexer { fetchToken(); } return top; - } + } private Node parseBranch(TokenType term) { Node node = parseExp(term); @@ -971,13 +992,13 @@ class Parser extends Lexer { } else { ConsAltNode top = ConsAltNode.newListNode(node, null); ConsAltNode t = top; - + while (token.type != TokenType.EOT && token.type != term && token.type != TokenType.ALT) { node = parseExp(term); if (node.getType() == NodeType.LIST) { t.setCdr((ConsAltNode)node); while (((ConsAltNode)node).cdr != null ) node = ((ConsAltNode)node).cdr; - + t = ((ConsAltNode)node); } else { t.setCdr(ConsAltNode.newListNode(node, null)); @@ -987,7 +1008,7 @@ class Parser extends Lexer { return top; } } - + /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ private Node parseSubExp(TokenType term) { Node node = parseBranch(term); @@ -1000,11 +1021,11 @@ class Parser extends Lexer { while (token.type == TokenType.ALT) { fetchToken(); node = parseBranch(term); - + t.setCdr(ConsAltNode.newAltNode(node, null)); t = t.cdr; } - + if (token.type != term) parseSubExpError(term); return top; } else { @@ -1012,7 +1033,7 @@ class Parser extends Lexer { return null; //not reached } } - + private void parseSubExpError(TokenType term) { if (term == TokenType.SUBEXP_CLOSE) { newSyntaxException(ERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS); @@ -1020,7 +1041,7 @@ class Parser extends Lexer { newInternalException(ERR_PARSER_BUG); } } - + private Node parseRegexp() { fetchToken(); return parseSubExp(TokenType.EOT); diff --git a/src/org/joni/ast/CClassNode.java b/src/org/joni/ast/CClassNode.java index c05c9f3..86c82fb 100644 --- a/src/org/joni/ast/CClassNode.java +++ b/src/org/joni/ast/CClassNode.java @@ -22,8 +22,10 @@ package org.joni.ast; import org.jcodings.CodeRange; import org.jcodings.Encoding; import org.jcodings.IntHolder; +import org.jcodings.ascii.AsciiTables; import org.jcodings.constants.CharacterType; import org.jcodings.exception.EncodingException; +import org.jcodings.specific.ASCIIEncoding; import org.joni.BitSet; import org.joni.CodeRangeBuffer; import org.joni.Config; @@ -326,8 +328,29 @@ public final class CClassNode extends Node { public void addCType(int ctype, boolean not, ScanEnvironment env, IntHolder sbOut) { Encoding enc = env.enc; - int[]ranges = enc.ctypeCodeRange(ctype, sbOut); + if (Config.NON_UNICODE_SDW) { + switch(ctype) { + case CharacterType.D: + case CharacterType.S: + case CharacterType.W: + ctype ^= CharacterType.SPECIAL_MASK; + if (not) { + for (int c = 0; c < BitSet.SINGLE_BYTE_SIZE; c++) { + if (!ASCIIEncoding.INSTANCE.isCodeCType(c, ctype)) bs.set(c); + //if ((AsciiTables.AsciiCtypeTable[c] & (1 << ctype)) == 0) bs.set(c); + } + addAllMultiByteRange(enc); + } else { + for (int c = 0; c < BitSet.SINGLE_BYTE_SIZE; c++) { + if (ASCIIEncoding.INSTANCE.isCodeCType(c, ctype)) bs.set(c); + //if ((AsciiTables.AsciiCtypeTable[c] & (1 << ctype)) != 0) bs.set(c); + } + } + return; + } + } + int[]ranges = enc.ctypeCodeRange(ctype, sbOut); if (ranges != null) { addCTypeByRange(ctype, not, enc, sbOut.value, ranges); return; -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/jruby-joni.git _______________________________________________ pkg-java-commits mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-java-commits

