Hi, I am rewriting gnu/regexp/RETokenRepeated.java so that it performs a depth-first search.
The good news is that we can move a few more cases from gnu/testlet/java/util/regex/Pattern/testdata2 (tests which fail) to gnu/testlet/java/util/regex/Pattern/testdata1 (tests which pass). The bad news is that the performance has worsened a bit (although the number of nodes we have to visit is equal to that of a width-first search, the overhead of loop control may be heavier, I guess). And there remains a very difficult case: /^(b+?|a){1,2}?c/ bbc 0: bbc 1: b The reluctant operator ? of b+? comes befoe that of (){1,2}?, so the former should be stronger than the latter. But in our implementation, b+? belongs to (){1,2}? and the reluctance of the latter is stronger. ChangeLog 2006-02-16 Ito Kazumitsu <[EMAIL PROTECTED]> * gnu/regexp/REMatch.java(matchFlags): New int field used as option flags passed to match methods. (MF_FIND_ALL): New flag. * gnu/regexp/RETokenOneOf.java(matchP): Unless MF_FIND_ALL is set, do not try other possibilties once a match is found. * gnu/regexp/RETokenRepeated.java(findDoables): Set MF_FIND_ALL so that all possibilities can be found. (match): Rewritten using new methods matchMinimum and _match. (_match): New method which performs a depth-first recursive search. (matchMinimum): New method. (initVisited), (visitedContains), (addVisited): New methods for manipulating an array of icharacter positions which _match has already visited.
Index: classpath/gnu/regexp/REMatch.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v retrieving revision 1.7 diff -u -r1.7 REMatch.java --- classpath/gnu/regexp/REMatch.java 9 Feb 2006 13:44:59 -0000 1.7 +++ classpath/gnu/regexp/REMatch.java 16 Feb 2006 15:14:35 -0000 @@ -1,5 +1,5 @@ /* gnu/regexp/REMatch.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -69,6 +69,8 @@ REMatch next; // other possibility (to avoid having to use arrays) boolean empty; // empty string matched. This flag is used only within // RETokenRepeated. + int matchFlags; // flags passed to match methods + static final int MF_FIND_ALL = 0x01; public Object clone() { try { Index: classpath/gnu/regexp/RETokenOneOf.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v retrieving revision 1.7 diff -u -r1.7 RETokenOneOf.java --- classpath/gnu/regexp/RETokenOneOf.java 13 Feb 2006 13:19:44 -0000 1.7 +++ classpath/gnu/regexp/RETokenOneOf.java 16 Feb 2006 15:14:35 -0000 @@ -187,6 +187,8 @@ } private boolean matchP(CharIndexed input, REMatch mymatch, boolean tryOnly) { + boolean stopMatchingIfSatisfied = + (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0; REMatch.REMatchList newMatch = new REMatch.REMatchList(); REToken tk; for (int i=0; i < options.size(); i++) { @@ -204,6 +206,7 @@ if (tk.match(input, tryMatch)) { // match was successful if (tryOnly) return true; newMatch.addTail(tryMatch); + if (stopMatchingIfSatisfied) break; } // is a match } // try next option if (tryOnly) return false; Index: classpath/gnu/regexp/RETokenRepeated.java =================================================================== RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v retrieving revision 1.9 diff -u -r1.9 RETokenRepeated.java --- classpath/gnu/regexp/RETokenRepeated.java 6 Feb 2006 14:03:59 -0000 1.9 +++ classpath/gnu/regexp/RETokenRepeated.java 16 Feb 2006 15:14:35 -0000 @@ -1,5 +1,5 @@ /* gnu/regexp/RETokenRepeated.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -39,20 +39,19 @@ package gnu.regexp; import java.util.Vector; +import java.util.Arrays; final class RETokenRepeated extends REToken { private REToken token; private int min,max; private boolean stingy; private boolean possessive; - private boolean alwaysEmpty; // Special case of {0} RETokenRepeated(int subIndex, REToken token, int min, int max) { super(subIndex); this.token = token; this.min = min; this.max = max; - alwaysEmpty = (min == 0 && max == 0); } /** Sets the minimal matching mode to true. */ @@ -91,8 +90,6 @@ return (max * tmax); } - boolean stopMatchingIfSatisfied = true; - private static REMatch findDoables(REToken tk, CharIndexed input, REMatch mymatch) { @@ -105,7 +102,11 @@ int origin = recurrent.index; tk = (REToken) tk.clone(); tk.next = tk.uncle = null; + recurrent.matchFlags |= REMatch.MF_FIND_ALL; if (tk.match(input, recurrent)) { + for (REMatch m = recurrent; m != null; m = m.next) { + m.matchFlags &= ~REMatch.MF_FIND_ALL; + } if (recurrent.index == origin) recurrent.empty = true; // add all items in current to doables array doables.addTail(recurrent); @@ -123,131 +124,179 @@ // the subexpression back-reference operator allow that? boolean match(CharIndexed input, REMatch mymatch) { - // Possible positions for the next repeat to match at - REMatch newMatch = mymatch; - // {0} needs some special treatment. - if (alwaysEmpty) { - REMatch result = matchRest(input, newMatch); - if (result != null) { - mymatch.assignFrom(result); - return true; - } - else { - return false; - } - } + boolean stopMatchingIfSatisfied = + (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0; - // number of times we've matched so far - int numRepeats = 0; - - REMatch doables; - int lastIndex = mymatch.index; - boolean emptyMatchFound = false; + REMatch newMatch = matchMinimum(input, mymatch); + if (newMatch == null) return false; - while (numRepeats < min) { - doables = findDoables(token, input, newMatch); + // Array of positions we have already visited + int[] visited = initVisited(); + for (REMatch m = newMatch; m != null; m = m.next) { + visited = addVisited(m.index, visited); + } - // if none of the possibilities worked out, - // it means that minimum number of repeats could not be found. - if (doables == null) return false; - - // reassign where the next repeat can match - newMatch = doables; - - // increment how many repeats we've successfully found - ++numRepeats; - - if (newMatch.empty) { - numRepeats = min; - emptyMatchFound = true; - break; - } - lastIndex = newMatch.index; + int max1 = decreaseMax(max, min); + + newMatch = _match(input, newMatch, max1, + stopMatchingIfSatisfied, visited); + if (newMatch != null) { + mymatch.assignFrom(newMatch); + return true; } + return false; + } - Vector positions = new Vector(); + private static int decreaseMax(int m, int n) { + if (m == Integer.MAX_VALUE) return m; + return m - n; + } - while (numRepeats <= max) { - // We want to check something like - // if (stingy) - // and neglect the further matching. But experience tells - // such neglection may cause incomplete matching. - // For example, if we neglect the seemingly unnecessay - // matching, /^(b+?|a){1,2}?c/ cannot match "bbc". - // On the other hand, if we do not stop the unnecessary - // matching, /(([a-c])b*?\2)*/ matches "ababbbcbc" - // entirely when we wan to find only "ababb". - // In order to make regression tests pass, we do as we did. - if (stopMatchingIfSatisfied && stingy) { - REMatch results = matchRest(input, newMatch); - if (results != null) { - mymatch.assignFrom(results); - return true; - } - } - positions.add(newMatch); - if (emptyMatchFound) break; + private static int[] initVisited() { + int[] visited = new int[32]; + for (int i = 0; i < visited.length; i++) { + visited[i] = Integer.MAX_VALUE; + } + return visited; + } - doables = findDoables(token, input, newMatch); - if (doables == null) break; + private static boolean visitedContains(int n, int[] visited) { + return (Arrays.binarySearch(visited, n) >= 0); + } - // doables.index == lastIndex occurs either - // (1) when an empty string was the longest - // that matched this token. - // or - // (2) when the same string matches this token many times. - // For example, "acbab" itself matches "a.*b" and - // its substrings "acb" and "ab" also match. - // In this case, we do not have to go further until - // numRepeats == max because the more numRepeats grows, - // the shorter the substring matching this token becomes. - // So the previous succesful match must have bee the best - // match. But this is not necessarily the case if stingy. - if (doables.index == lastIndex) { - if (doables.empty) { - emptyMatchFound = true; - } - else { - if (!stingy) break; - } + private static int[] addVisited(int n, int[] visited) { + int p = Arrays.binarySearch(visited, n); + if (p >= 0) return visited; + p = -1 - p; + if (p >= visited.length) { + System.err.println("p=" + p); + int[] newvisited = new int[visited.length + 32]; + System.arraycopy(visited, 0, newvisited, 0, visited.length); + for (int i = visited.length; i < newvisited.length; i++) { + newvisited[i] = Integer.MAX_VALUE; } - numRepeats++; - newMatch = doables; - lastIndex = newMatch.index; + visited = newvisited; + } + if (visited[p] != Integer.MAX_VALUE) { + System.arraycopy(visited, p, visited, p+1, visited.length - p - 1); } + visited[p] = n; + return visited; + } + + private REMatch _match(CharIndexed input, REMatch mymatch, + int max1, boolean stopMatchingIfSatisfied, + int[] visited) { - // We're greedy, but ease off until a true match is found. - // At this point we've either got too many or just the right amount. - // See if this numRepeats works with the rest of the regexp. + if (max1 == 0) { + return matchRest(input, mymatch); + } + max1 = decreaseMax(max1, 1); REMatch.REMatchList allResults = new REMatch.REMatchList(); - int posCount = positions.size(); - int posIndex = (stingy ? 0 : posCount - 1); + // Depth-first search + + for (REMatch cur = mymatch; cur != null; cur = cur.next) { + + REMatch cur1 = (REMatch) cur.clone(); - while (posCount-- > 0) { - REMatch m = (REMatch) positions.elementAt(posIndex); - if (stingy) posIndex++; else posIndex--; - - REMatch results = matchRest(input, m); - if (results != null) { - // Order these from longest to shortest - // Start by assuming longest (more repeats) - // If stingy the order is shortest to longest. - allResults.addTail(results); + if (stingy) { + REMatch results = matchRest(input, cur1); + if (results != null) { + if (stopMatchingIfSatisfied) { + return results; + } + allResults.addTail(results); + } } - else { - if (possessive) break; + + DO_THIS: + do { + + boolean emptyMatchFound = false; + REMatch doables = findDoables(token, input, cur1); + if (doables == null) break DO_THIS; + if (doables.empty) emptyMatchFound = true; + + if (!emptyMatchFound) { + REMatch.REMatchList list = new REMatch.REMatchList(); + for (REMatch m = doables; m != null; m = m.next) { + REMatch m1 = (REMatch) m.clone(); + int n = m1.index; + if (! visitedContains(n, visited)) { + visited = addVisited(n, visited); + list.addTail(m1); + } + } + if (list.head == null) break DO_THIS; + doables = list.head; + } + + for (REMatch m = doables; m != null; m = m.next) { + if (! emptyMatchFound) { + REMatch m1 = _match(input, m, max1, + stopMatchingIfSatisfied, visited); + if (possessive) return m1; + if (m1 != null) { + if (stopMatchingIfSatisfied) { + return m1; + } + allResults.addTail(m1); + } + } + else { + REMatch m1 = matchRest(input, m); + if (m1 != null) { + if (stopMatchingIfSatisfied) { + return m1; + } + allResults.addTail(m1); + } + } + } + + } while (false); // DO_THIS only once; + + // This point itself is a candidate. + if (!stingy) { + REMatch m2 = matchRest(input, cur1); + if (m2 != null) { + if (stopMatchingIfSatisfied) { + return m2; + } + allResults.addTail(m2); + } } } - if (allResults.head != null) { - mymatch.assignFrom(allResults.head); // does this get all? - return true; + return allResults.head; + } + + private REMatch matchMinimum(CharIndexed input, final REMatch mymatch) { + // Possible positions for the next repeat to match at + REMatch newMatch = mymatch; + + // number of times we've matched so far + int numRepeats = 0; + + while (numRepeats < min) { + REMatch doables = findDoables(token, input, newMatch); + + // if none of the possibilities worked out, + // it means that minimum number of repeats could not be found. + if (doables == null) return null; + + // reassign where the next repeat can match + newMatch = doables; + + // increment how many repeats we've successfully found + ++numRepeats; + + if (newMatch.empty) break; } - // If we fall out, no matches. - return false; + return newMatch; } private REMatch matchRest(CharIndexed input, final REMatch newMatch) {