From: Ito Kazumitsu <[EMAIL PROTECTED]> Date: Sun, 29 Jan 2006 01:15:27 +0900 (JST)
> The patch proposed on Mon, 23 Jan 2006 has been cancelled, > and here is another patch. Hopefully improved. TODO: This patch cannot make the following test pass. /^(b+?|a){1,2}?c/ bbac 0: bbac 1: a /b+?/ matches only "b", and "bb" is not tried. ChangeLog 2006-01-29 Ito Kazumitsu <[EMAIL PROTECTED]> Fixes bug #24876 * gnu/regexp/gnu/regexp/RE.java(REG_TRY_ENTIRE_MATCH): New execution flag. (getMatchImpl): if REG_TRY_ENTIRE_MATCH is set, add an implicit RETokenEnd at the end of the regexp chain. Do not select the longest match, but select the first match. (match): Do not take care of REMatch.empty. * gnu/regexp/REMatch.java(empty): To be used only in RETokenRepeated. * gnu/regexp/RETokenOneOf.java: Corrected a typo in a comment. * gnu/regexp/RETokenBackRef.java: Do not take care of REMatch.empty. * gnu/regexp/RETokenRepeated.java (match): Rewrote stingy matching. Do not take care of REMatch.empty. Set and check REMatch.empty when trying to match the single token.
Index: classpath/gnu/regexp/RE.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v retrieving revision 1.12 diff -u -r1.12 RE.java --- classpath/gnu/regexp/RE.java 22 Jan 2006 02:22:21 -0000 1.12 +++ classpath/gnu/regexp/RE.java 29 Jan 2006 15:17:37 -0000 @@ -217,6 +217,13 @@ */ public static final int REG_NO_INTERPOLATE = 128; + /** + * Execution flag. + * Try to match the whole input string. An implicit match-end operator + * is added to this regexp. + */ + public static final int REG_TRY_ENTIRE_MATCH = 256; + /** Returns a string representing the version of the gnu.regexp package. */ public static final String version() { return VERSION; @@ -1265,20 +1272,14 @@ /* Implements abstract method REToken.match() */ boolean match(CharIndexed input, REMatch mymatch) { - int origin = mymatch.index; - boolean b; if (firstToken == null) { - b = next(input, mymatch); - if (b) mymatch.empty = (mymatch.index == origin); - return b; + return next(input, mymatch); } // Note the start of this subexpression mymatch.start[subIndex] = mymatch.index; - b = firstToken.match(input, mymatch); - if (b) mymatch.empty = (mymatch.index == origin); - return b; + return firstToken.match(input, mymatch); } /** @@ -1337,23 +1338,34 @@ } REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) { + boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0); + RE re = (tryEntireMatch ? (RE) this.clone() : this); + if (tryEntireMatch) { + re.chain(new RETokenEnd(0, null)); + } // Create a new REMatch to hold results REMatch mymatch = new REMatch(numSubs, anchor, eflags); do { // Optimization: check if anchor + minimumLength > length if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { - if (match(input, mymatch)) { - // Find longest match of them all to observe leftmost longest - REMatch longest = mymatch; + if (re.match(input, mymatch)) { + REMatch best = mymatch; + // We assume that the match that coms first is the best. + // And the following "The longer, the better" rule has + // been commented out. The longest is not neccesarily + // the best. For example, "a" out of "aaa" is the best + // match for /a+?/. + /* + // Find best match of them all to observe leftmost longest while ((mymatch = mymatch.next) != null) { - if (mymatch.index > longest.index) { - longest = mymatch; + if (mymatch.index > best.index) { + best = mymatch; } } - - longest.end[0] = longest.index; - longest.finish(input); - return longest; + */ + best.end[0] = best.index; + best.finish(input); + return best; } } mymatch.clear(++anchor); Index: classpath/gnu/regexp/REMatch.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v retrieving revision 1.3 diff -u -r1.3 REMatch.java --- classpath/gnu/regexp/REMatch.java 22 Jan 2006 02:22:21 -0000 1.3 +++ classpath/gnu/regexp/REMatch.java 29 Jan 2006 15:17:37 -0000 @@ -67,7 +67,8 @@ int[] start; // start positions (relative to offset) for each (sub)exp. int[] end; // end positions for the same REMatch next; // other possibility (to avoid having to use arrays) - boolean empty; // empty string matched + boolean empty; // empty string matched. This flag is used only within + // RETokenRepeated. public Object clone() { try { @@ -89,7 +90,6 @@ index = other.index; // need to deep clone? next = other.next; - empty = other.empty; } REMatch(int subs, int anchor, int eflags) { @@ -126,7 +126,6 @@ start[i] = end[i] = -1; } next = null; // cut off alternates - empty = false; } /** Index: classpath/gnu/regexp/RETokenBackRef.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenBackRef.java,v retrieving revision 1.3 diff -u -r1.3 RETokenBackRef.java --- classpath/gnu/regexp/RETokenBackRef.java 22 Jan 2006 02:22:21 -0000 1.3 +++ classpath/gnu/regexp/RETokenBackRef.java 29 Jan 2006 15:17:37 -0000 @@ -57,7 +57,6 @@ b = mymatch.start[num]; e = mymatch.end[num]; if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but... - int origin = mymatch.index; for (int i=b; i<e; i++) { char c1 = input.charAt(mymatch.index+i-b); char c2 = input.charAt(i); @@ -74,9 +73,7 @@ } } mymatch.index += e-b; - boolean result = next(input, mymatch); - if (result) mymatch.empty = (mymatch.index == origin); - return result; + return next(input, mymatch); } void dump(StringBuffer os) { Index: classpath/gnu/regexp/RETokenOneOf.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v retrieving revision 1.3 diff -u -r1.3 RETokenOneOf.java --- classpath/gnu/regexp/RETokenOneOf.java 23 Jan 2006 13:18:10 -0000 1.3 +++ classpath/gnu/regexp/RETokenOneOf.java 29 Jan 2006 15:17:37 -0000 @@ -98,7 +98,7 @@ REMatch last = null; REToken tk; for (int i=0; i < options.size(); i++) { - // In ordaer that the backtracking can work, + // In order that the backtracking can work, // each option must be chained to the next token. // But the chain method has some side effect, so // we use clones. Index: classpath/gnu/regexp/RETokenRepeated.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v retrieving revision 1.6 diff -u -r1.6 RETokenRepeated.java --- classpath/gnu/regexp/RETokenRepeated.java 22 Jan 2006 02:22:21 -0000 1.6 +++ classpath/gnu/regexp/RETokenRepeated.java 29 Jan 2006 15:17:37 -0000 @@ -84,6 +84,8 @@ return (min * token.getMinimumLength()); } + boolean stopMatchingIfSatisfied = true; + // We do need to save every possible point, but the number of clone() // invocations here is really a killer for performance on non-stingy // repeat operators. I'm open to suggestions... @@ -93,7 +95,6 @@ // the subexpression back-reference operator allow that? boolean match(CharIndexed input, REMatch mymatch) { - int origin = mymatch.index; // number of times we've matched so far int numRepeats = 0; @@ -113,19 +114,38 @@ REMatch recurrent; int lastIndex = mymatch.index; + // {0} needs some special treatment. + if (alwaysEmpty) { + REMatch result = matchRest(input, newMatch); + if (result != null) { + mymatch.assignFrom(result); + return true; + } + else { + return false; + } + } + do { - // Check for stingy match for each possibility. - if ((stingy && (numRepeats >= min)) || alwaysEmpty) { - REMatch result = matchRest(input, newMatch); - if (result != null) { - mymatch.assignFrom(result); - mymatch.empty = (mymatch.index == origin); - return true; - } - else { - // Special case of {0}. It must always match an empty string. - if (alwaysEmpty) return false; - } + // We want to check something like + // if (stingy && (numRepeats >= min)) + // and neglect the further matching. But experience tells + // such neglection may cause incomplete matching. + // For example, if we neglect the seemingly unnecessay + // matching, /^(b+?|a){1,2}?c/ cannot match "bbc". + // On the other hand, if we do not stop the unnecessary + // matching, /(([a-c])b*?\2)*/ matches "ababbbcbc" + // entirely when we wan to find only "ababb". + // In order to make regression tests pass, we do as we did. + if (stopMatchingIfSatisfied && stingy && (numRepeats >= min)) { + REMatch result = matchRest(input, newMatch); + if (result != null) { + StringBuffer sb = new StringBuffer(); + token.dumpAll(sb); + System.err.println("token=" + sb + " result.index=" + result.index); + mymatch.assignFrom(result); + return true; + } } doables = null; @@ -134,7 +154,9 @@ // try next repeat at all possible positions for (current = newMatch; current != null; current = current.next) { recurrent = (REMatch) current.clone(); + int origin = recurrent.index; if (token.match(input, recurrent)) { + if (recurrent.index == origin) recurrent.empty = true; // add all items in current to doables array if (doables == null) { doables = recurrent; @@ -222,8 +244,11 @@ // desired. indexCount = 1; } + if (stingy) { + posIndex = (posIndex - indexCount - 1); + } while (indexCount-- > 0) { - --posIndex; + if (stingy) ++posIndex; else --posIndex; newMatch = (REMatch) positions.elementAt(posIndex); results = matchRest(input, newMatch); if (results != null) { @@ -233,6 +258,7 @@ } else { // Order these from longest to shortest // Start by assuming longest (more repeats) + // If stingy the order is shortest to longest. allResultsLast.next = results; } // Find new doablesLast @@ -246,7 +272,6 @@ } if (allResults != null) { mymatch.assignFrom(allResults); // does this get all? - mymatch.empty = (mymatch.index == origin); return true; } // If we fall out, no matches.