From: Ito Kazumitsu <[EMAIL PROTECTED]>
Date: Sun, 29 Jan 2006 01:15:27 +0900 (JST)
> The patch proposed on Mon, 23 Jan 2006 has been cancelled,
> and here is another patch.
Hopefully improved.
TODO:
This patch cannot make the following test pass.
/^(b+?|a){1,2}?c/
bbac
0: bbac
1: a
/b+?/ matches only "b", and "bb" is not tried.
ChangeLog
2006-01-29 Ito Kazumitsu <[EMAIL PROTECTED]>
Fixes bug #24876
* gnu/regexp/gnu/regexp/RE.java(REG_TRY_ENTIRE_MATCH):
New execution flag.
(getMatchImpl): if REG_TRY_ENTIRE_MATCH is set, add an
implicit RETokenEnd at the end of the regexp chain.
Do not select the longest match, but select the first match.
(match): Do not take care of REMatch.empty.
* gnu/regexp/REMatch.java(empty): To be used only in RETokenRepeated.
* gnu/regexp/RETokenOneOf.java: Corrected a typo in a comment.
* gnu/regexp/RETokenBackRef.java: Do not take care of REMatch.empty.
* gnu/regexp/RETokenRepeated.java (match): Rewrote stingy matching.
Do not take care of REMatch.empty. Set and check REMatch.empty
when trying to match the single token.
Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.12
diff -u -r1.12 RE.java
--- classpath/gnu/regexp/RE.java 22 Jan 2006 02:22:21 -0000 1.12
+++ classpath/gnu/regexp/RE.java 29 Jan 2006 15:17:37 -0000
@@ -217,6 +217,13 @@
*/
public static final int REG_NO_INTERPOLATE = 128;
+ /**
+ * Execution flag.
+ * Try to match the whole input string. An implicit match-end operator
+ * is added to this regexp.
+ */
+ public static final int REG_TRY_ENTIRE_MATCH = 256;
+
/** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
return VERSION;
@@ -1265,20 +1272,14 @@
/* Implements abstract method REToken.match() */
boolean match(CharIndexed input, REMatch mymatch) {
- int origin = mymatch.index;
- boolean b;
if (firstToken == null) {
- b = next(input, mymatch);
- if (b) mymatch.empty = (mymatch.index == origin);
- return b;
+ return next(input, mymatch);
}
// Note the start of this subexpression
mymatch.start[subIndex] = mymatch.index;
- b = firstToken.match(input, mymatch);
- if (b) mymatch.empty = (mymatch.index == origin);
- return b;
+ return firstToken.match(input, mymatch);
}
/**
@@ -1337,23 +1338,34 @@
}
REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer
buffer) {
+ boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
+ RE re = (tryEntireMatch ? (RE) this.clone() : this);
+ if (tryEntireMatch) {
+ re.chain(new RETokenEnd(0, null));
+ }
// Create a new REMatch to hold results
REMatch mymatch = new REMatch(numSubs, anchor, eflags);
do {
// Optimization: check if anchor + minimumLength > length
if (minimumLength == 0 || input.charAt(minimumLength-1) !=
CharIndexed.OUT_OF_BOUNDS) {
- if (match(input, mymatch)) {
- // Find longest match of them all to observe leftmost longest
- REMatch longest = mymatch;
+ if (re.match(input, mymatch)) {
+ REMatch best = mymatch;
+ // We assume that the match that coms first is the best.
+ // And the following "The longer, the better" rule has
+ // been commented out. The longest is not neccesarily
+ // the best. For example, "a" out of "aaa" is the best
+ // match for /a+?/.
+ /*
+ // Find best match of them all to observe leftmost longest
while ((mymatch = mymatch.next) != null) {
- if (mymatch.index > longest.index) {
- longest = mymatch;
+ if (mymatch.index > best.index) {
+ best = mymatch;
}
}
-
- longest.end[0] = longest.index;
- longest.finish(input);
- return longest;
+ */
+ best.end[0] = best.index;
+ best.finish(input);
+ return best;
}
}
mymatch.clear(++anchor);
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.3
diff -u -r1.3 REMatch.java
--- classpath/gnu/regexp/REMatch.java 22 Jan 2006 02:22:21 -0000 1.3
+++ classpath/gnu/regexp/REMatch.java 29 Jan 2006 15:17:37 -0000
@@ -67,7 +67,8 @@
int[] start; // start positions (relative to offset) for each (sub)exp.
int[] end; // end positions for the same
REMatch next; // other possibility (to avoid having to use arrays)
- boolean empty; // empty string matched
+ boolean empty; // empty string matched. This flag is used only within
+ // RETokenRepeated.
public Object clone() {
try {
@@ -89,7 +90,6 @@
index = other.index;
// need to deep clone?
next = other.next;
- empty = other.empty;
}
REMatch(int subs, int anchor, int eflags) {
@@ -126,7 +126,6 @@
start[i] = end[i] = -1;
}
next = null; // cut off alternates
- empty = false;
}
/**
Index: classpath/gnu/regexp/RETokenBackRef.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenBackRef.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenBackRef.java
--- classpath/gnu/regexp/RETokenBackRef.java 22 Jan 2006 02:22:21 -0000
1.3
+++ classpath/gnu/regexp/RETokenBackRef.java 29 Jan 2006 15:17:37 -0000
@@ -57,7 +57,6 @@
b = mymatch.start[num];
e = mymatch.end[num];
if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but...
- int origin = mymatch.index;
for (int i=b; i<e; i++) {
char c1 = input.charAt(mymatch.index+i-b);
char c2 = input.charAt(i);
@@ -74,9 +73,7 @@
}
}
mymatch.index += e-b;
- boolean result = next(input, mymatch);
- if (result) mymatch.empty = (mymatch.index == origin);
- return result;
+ return next(input, mymatch);
}
void dump(StringBuffer os) {
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java 23 Jan 2006 13:18:10 -0000
1.3
+++ classpath/gnu/regexp/RETokenOneOf.java 29 Jan 2006 15:17:37 -0000
@@ -98,7 +98,7 @@
REMatch last = null;
REToken tk;
for (int i=0; i < options.size(); i++) {
- // In ordaer that the backtracking can work,
+ // In order that the backtracking can work,
// each option must be chained to the next token.
// But the chain method has some side effect, so
// we use clones.
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java 22 Jan 2006 02:22:21 -0000
1.6
+++ classpath/gnu/regexp/RETokenRepeated.java 29 Jan 2006 15:17:37 -0000
@@ -84,6 +84,8 @@
return (min * token.getMinimumLength());
}
+ boolean stopMatchingIfSatisfied = true;
+
// We do need to save every possible point, but the number of clone()
// invocations here is really a killer for performance on non-stingy
// repeat operators. I'm open to suggestions...
@@ -93,7 +95,6 @@
// the subexpression back-reference operator allow that?
boolean match(CharIndexed input, REMatch mymatch) {
- int origin = mymatch.index;
// number of times we've matched so far
int numRepeats = 0;
@@ -113,19 +114,38 @@
REMatch recurrent;
int lastIndex = mymatch.index;
+ // {0} needs some special treatment.
+ if (alwaysEmpty) {
+ REMatch result = matchRest(input, newMatch);
+ if (result != null) {
+ mymatch.assignFrom(result);
+ return true;
+ }
+ else {
+ return false;
+ }
+ }
+
do {
- // Check for stingy match for each possibility.
- if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
- REMatch result = matchRest(input, newMatch);
- if (result != null) {
- mymatch.assignFrom(result);
- mymatch.empty = (mymatch.index == origin);
- return true;
- }
- else {
- // Special case of {0}. It must always match an empty string.
- if (alwaysEmpty) return false;
- }
+ // We want to check something like
+ // if (stingy && (numRepeats >= min))
+ // and neglect the further matching. But experience tells
+ // such neglection may cause incomplete matching.
+ // For example, if we neglect the seemingly unnecessay
+ // matching, /^(b+?|a){1,2}?c/ cannot match "bbc".
+ // On the other hand, if we do not stop the unnecessary
+ // matching, /(([a-c])b*?\2)*/ matches "ababbbcbc"
+ // entirely when we wan to find only "ababb".
+ // In order to make regression tests pass, we do as we did.
+ if (stopMatchingIfSatisfied && stingy && (numRepeats >= min)) {
+ REMatch result = matchRest(input, newMatch);
+ if (result != null) {
+ StringBuffer sb = new StringBuffer();
+ token.dumpAll(sb);
+ System.err.println("token=" + sb + " result.index=" +
result.index);
+ mymatch.assignFrom(result);
+ return true;
+ }
}
doables = null;
@@ -134,7 +154,9 @@
// try next repeat at all possible positions
for (current = newMatch; current != null; current = current.next) {
recurrent = (REMatch) current.clone();
+ int origin = recurrent.index;
if (token.match(input, recurrent)) {
+ if (recurrent.index == origin) recurrent.empty = true;
// add all items in current to doables array
if (doables == null) {
doables = recurrent;
@@ -222,8 +244,11 @@
// desired.
indexCount = 1;
}
+ if (stingy) {
+ posIndex = (posIndex - indexCount - 1);
+ }
while (indexCount-- > 0) {
- --posIndex;
+ if (stingy) ++posIndex; else --posIndex;
newMatch = (REMatch) positions.elementAt(posIndex);
results = matchRest(input, newMatch);
if (results != null) {
@@ -233,6 +258,7 @@
} else {
// Order these from longest to shortest
// Start by assuming longest (more repeats)
+ // If stingy the order is shortest to longest.
allResultsLast.next = results;
}
// Find new doablesLast
@@ -246,7 +272,6 @@
}
if (allResults != null) {
mymatch.assignFrom(allResults); // does this get all?
- mymatch.empty = (mymatch.index == origin);
return true;
}
// If we fall out, no matches.