Hi,

I am rewriting gnu/regexp/RETokenRepeated.java so that it
performs a depth-first search.

The good news is that we can move a few more cases from
gnu/testlet/java/util/regex/Pattern/testdata2
(tests which fail) 
 to
gnu/testlet/java/util/regex/Pattern/testdata1
(tests which pass).

The bad news is that the performance has worsened
a bit (although the number of nodes we have to visit
is equal to that of a width-first search, the overhead
of loop control may be heavier, I guess). And there
remains a very difficult case:

/^(b+?|a){1,2}?c/
    bbc
 0: bbc
 1: b

The reluctant operator ? of b+? comes befoe that of
(){1,2}?, so the former should be stronger than the latter.

But in our implementation, b+? belongs to (){1,2}? and the
reluctance of the latter is stronger.

ChangeLog

2006-02-16  Ito Kazumitsu  <[EMAIL PROTECTED]>

        * gnu/regexp/REMatch.java(matchFlags): New int field used as
        option flags passed to match methods.
        (MF_FIND_ALL): New flag.
        * gnu/regexp/RETokenOneOf.java(matchP): Unless MF_FIND_ALL is set,
        do not try other possibilties once a match is found.
        * gnu/regexp/RETokenRepeated.java(findDoables): Set MF_FIND_ALL
        so that all possibilities can be found.
        (match): Rewritten using new methods matchMinimum and _match.
        (_match): New method which performs a depth-first recursive search.
        (matchMinimum): New method.
        (initVisited), (visitedContains), (addVisited): New methods for
        manipulating an array of icharacter positions which _match has
        already visited.

Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.7
diff -u -r1.7 REMatch.java
--- classpath/gnu/regexp/REMatch.java   9 Feb 2006 13:44:59 -0000       1.7
+++ classpath/gnu/regexp/REMatch.java   16 Feb 2006 15:14:35 -0000
@@ -1,5 +1,5 @@
 /* gnu/regexp/REMatch.java
-   Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2006 Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -69,6 +69,8 @@
     REMatch next; // other possibility (to avoid having to use arrays)
     boolean empty; // empty string matched. This flag is used only within
                   // RETokenRepeated.
+    int matchFlags; // flags passed to match methods
+    static final int MF_FIND_ALL = 0x01;
 
     public Object clone() {
        try {
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.7
diff -u -r1.7 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java      13 Feb 2006 13:19:44 -0000      
1.7
+++ classpath/gnu/regexp/RETokenOneOf.java      16 Feb 2006 15:14:35 -0000
@@ -187,6 +187,8 @@
     }
 
     private boolean matchP(CharIndexed input, REMatch mymatch, boolean 
tryOnly) {
+      boolean stopMatchingIfSatisfied =
+         (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
       REMatch.REMatchList newMatch = new REMatch.REMatchList();
       REToken tk;
       for (int i=0; i < options.size(); i++) {
@@ -204,6 +206,7 @@
        if (tk.match(input, tryMatch)) { // match was successful
          if (tryOnly) return true;
          newMatch.addTail(tryMatch);
+         if (stopMatchingIfSatisfied) break;
        } // is a match
       } // try next option
       if (tryOnly) return false;
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.9
diff -u -r1.9 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java   6 Feb 2006 14:03:59 -0000       
1.9
+++ classpath/gnu/regexp/RETokenRepeated.java   16 Feb 2006 15:14:35 -0000
@@ -1,5 +1,5 @@
 /* gnu/regexp/RETokenRepeated.java
-   Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2006 Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -39,20 +39,19 @@
 package gnu.regexp;
 
 import java.util.Vector;
+import java.util.Arrays;
 
 final class RETokenRepeated extends REToken {
     private REToken token;
     private int min,max;
     private boolean stingy;
     private boolean possessive;
-    private boolean alwaysEmpty; // Special case of {0}
     
     RETokenRepeated(int subIndex, REToken token, int min, int max) {
        super(subIndex);
        this.token = token;
        this.min = min;
        this.max = max;
-       alwaysEmpty = (min == 0 && max == 0);
     }
 
     /** Sets the minimal matching mode to true. */
@@ -91,8 +90,6 @@
        return (max * tmax);
     }
 
-    boolean stopMatchingIfSatisfied = true;
-
     private static REMatch findDoables(REToken tk,
                        CharIndexed input, REMatch mymatch) {
 
@@ -105,7 +102,11 @@
                int origin = recurrent.index;
                tk = (REToken) tk.clone();
                tk.next = tk.uncle = null;
+               recurrent.matchFlags |= REMatch.MF_FIND_ALL;
                if (tk.match(input, recurrent)) {
+                   for (REMatch m = recurrent; m != null; m = m.next) {
+                       m.matchFlags &= ~REMatch.MF_FIND_ALL;
+                   }
                    if (recurrent.index == origin) recurrent.empty = true;
                    // add all items in current to doables array
                    doables.addTail(recurrent);
@@ -123,131 +124,179 @@
     // the subexpression back-reference operator allow that?
 
     boolean match(CharIndexed input, REMatch mymatch) {
-       // Possible positions for the next repeat to match at
-       REMatch newMatch = mymatch;
 
-       // {0} needs some special treatment.
-       if (alwaysEmpty) {
-           REMatch result = matchRest(input, newMatch);
-           if (result != null) {
-               mymatch.assignFrom(result);
-               return true;
-           }
-           else {
-               return false;
-           }
-       }
+        boolean stopMatchingIfSatisfied =
+               (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
 
-       // number of times we've matched so far
-       int numRepeats = 0; 
-       
-       REMatch doables;
-       int lastIndex = mymatch.index;
-       boolean emptyMatchFound = false;
+       REMatch newMatch = matchMinimum(input, mymatch);
+       if (newMatch == null) return false;
 
-       while (numRepeats < min) {
-           doables = findDoables(token, input, newMatch);
+       // Array of positions we have already visited
+       int[] visited = initVisited();
+       for (REMatch m = newMatch; m != null; m = m.next) {
+           visited = addVisited(m.index, visited);
+       }
 
-           // if none of the possibilities worked out, 
-           // it means that minimum number of repeats could not be found.
-           if (doables == null) return false;
-           
-           // reassign where the next repeat can match
-           newMatch = doables;
-           
-           // increment how many repeats we've successfully found
-           ++numRepeats;
-           
-           if (newMatch.empty) {
-               numRepeats = min;
-               emptyMatchFound = true;
-               break;
-           }
-           lastIndex = newMatch.index;
+       int max1 = decreaseMax(max, min);
+
+       newMatch = _match(input, newMatch, max1,
+           stopMatchingIfSatisfied, visited);
+       if (newMatch != null) {
+           mymatch.assignFrom(newMatch);
+           return true;
        }
+       return false;
+    }
 
-       Vector positions = new Vector();
+    private static int decreaseMax(int m, int n) {
+        if (m == Integer.MAX_VALUE) return m;
+       return m - n;
+    }
 
-       while (numRepeats <= max) {
-           // We want to check something like  
-           //    if (stingy)
-           // and neglect the further matching.  But experience tells
-           // such neglection may cause incomplete matching.
-           // For example, if we neglect the seemingly unnecessay
-           // matching, /^(b+?|a){1,2}?c/ cannot match "bbc".
-           // On the other hand, if we do not stop the unnecessary
-           // matching, /(([a-c])b*?\2)*/ matches "ababbbcbc"
-           // entirely when we wan to find only "ababb".
-           // In order to make regression tests pass, we do as we did.
-           if (stopMatchingIfSatisfied && stingy) {
-               REMatch results = matchRest(input, newMatch);
-               if (results != null) {
-                   mymatch.assignFrom(results);
-                   return true;
-               }
-           }
-           positions.add(newMatch);
-           if (emptyMatchFound) break;
+    private static int[] initVisited() {
+       int[] visited = new int[32];
+       for (int i = 0; i < visited.length; i++) {
+           visited[i] = Integer.MAX_VALUE;
+       }
+       return visited;
+    }
 
-           doables = findDoables(token, input, newMatch);
-           if (doables == null) break;
+    private static boolean visitedContains(int n, int[] visited) {
+       return (Arrays.binarySearch(visited, n) >= 0);
+    }
 
-           // doables.index == lastIndex occurs either
-           //   (1) when an empty string was the longest
-           //       that matched this token.
-           // or
-           //   (2) when the same string matches this token many times.
-           //       For example, "acbab" itself matches "a.*b" and
-           //       its substrings "acb" and "ab" also match.
-           //       In this case, we do not have to go further until
-           //       numRepeats == max because the more numRepeats grows,
-           //       the shorter the substring matching this token becomes.
-           //       So the previous succesful match must have bee the best
-           //       match.  But this is not necessarily the case if stingy.
-           if (doables.index == lastIndex) {
-               if (doables.empty) {
-                   emptyMatchFound = true;
-                }
-               else {
-                   if (!stingy) break;
-               }
+    private static int[] addVisited(int n, int[] visited) {
+       int p = Arrays.binarySearch(visited, n);
+       if (p >= 0) return visited;
+       p = -1 - p;
+       if (p >= visited.length) {
+           System.err.println("p=" + p);
+           int[] newvisited = new int[visited.length + 32];
+           System.arraycopy(visited, 0, newvisited, 0, visited.length);
+           for (int i = visited.length; i < newvisited.length; i++) {
+               newvisited[i] = Integer.MAX_VALUE;
            }
-           numRepeats++;
-           newMatch = doables;
-           lastIndex = newMatch.index;
+           visited = newvisited;
+       }
+       if (visited[p] != Integer.MAX_VALUE) {
+           System.arraycopy(visited, p, visited, p+1, visited.length - p - 1);
        }
+       visited[p] = n;
+       return visited;
+    }
+
+    private REMatch _match(CharIndexed input, REMatch mymatch,
+           int max1, boolean stopMatchingIfSatisfied,
+           int[] visited) {
 
-       // We're greedy, but ease off until a true match is found.
-       // At this point we've either got too many or just the right amount.
-       // See if this numRepeats works with the rest of the regexp.
+        if (max1 == 0) {
+           return matchRest(input, mymatch);
+       }
+       max1 = decreaseMax(max1, 1);
 
        REMatch.REMatchList allResults = new REMatch.REMatchList();
 
-       int posCount = positions.size();
-       int posIndex = (stingy ? 0 : posCount - 1);
+       // Depth-first search
+
+       for (REMatch cur = mymatch; cur != null; cur = cur.next) {
+
+           REMatch cur1 = (REMatch) cur.clone();
 
-       while (posCount-- > 0) {
-           REMatch m = (REMatch) positions.elementAt(posIndex);
-            if (stingy) posIndex++; else posIndex--;
-
-           REMatch results = matchRest(input, m);
-            if (results != null) {
-               // Order these from longest to shortest
-               // Start by assuming longest (more repeats)
-               // If stingy the order is shortest to longest.
-               allResults.addTail(results);
+           if (stingy) {
+               REMatch results = matchRest(input, cur1);
+               if (results != null) {
+                   if (stopMatchingIfSatisfied) {
+                       return results;
+                   }
+                   allResults.addTail(results);
+               }
            }
-           else {
-               if (possessive) break;
+
+           DO_THIS:
+           do {
+
+           boolean emptyMatchFound = false;
+           REMatch doables = findDoables(token, input, cur1);
+           if (doables == null) break DO_THIS;
+           if (doables.empty) emptyMatchFound = true;
+
+           if (!emptyMatchFound) {
+               REMatch.REMatchList list = new REMatch.REMatchList();
+               for (REMatch m = doables; m != null; m = m.next) {
+                   REMatch m1 = (REMatch) m.clone();
+                   int n = m1.index;
+                   if (! visitedContains(n, visited)) {
+                       visited = addVisited(n, visited);
+                       list.addTail(m1);
+                   }
+               }
+               if (list.head == null) break DO_THIS;
+               doables = list.head;
+           }
+
+           for (REMatch m = doables; m != null; m = m.next) {
+               if (! emptyMatchFound) {
+                   REMatch m1 = _match(input, m, max1,
+                       stopMatchingIfSatisfied, visited);
+                   if (possessive) return m1;
+                   if (m1 != null) {
+                       if (stopMatchingIfSatisfied) {
+                           return m1;
+                       }
+                       allResults.addTail(m1);
+                   }
+               }
+               else {
+                   REMatch m1 = matchRest(input, m);
+                   if (m1 != null) {
+                       if (stopMatchingIfSatisfied) {
+                           return m1;
+                       }
+                       allResults.addTail(m1);
+                   }
+               }
+           }
+
+           } while (false); // DO_THIS only once;
+
+           // This point itself is a candidate.
+           if (!stingy) {
+               REMatch m2 = matchRest(input, cur1);
+               if (m2 != null) {
+                   if (stopMatchingIfSatisfied) {
+                       return m2;
+                   }
+                   allResults.addTail(m2);
+               }
            }
        }
 
-       if (allResults.head != null) {
-           mymatch.assignFrom(allResults.head); // does this get all?
-           return true;
+        return allResults.head;
+    }
+
+    private REMatch matchMinimum(CharIndexed input, final REMatch mymatch) {
+       // Possible positions for the next repeat to match at
+       REMatch newMatch = mymatch;
+
+       // number of times we've matched so far
+       int numRepeats = 0; 
+       
+       while (numRepeats < min) {
+           REMatch doables = findDoables(token, input, newMatch);
+
+           // if none of the possibilities worked out, 
+           // it means that minimum number of repeats could not be found.
+           if (doables == null) return null;
+           
+           // reassign where the next repeat can match
+           newMatch = doables;
+           
+           // increment how many repeats we've successfully found
+           ++numRepeats;
+           
+           if (newMatch.empty) break;
        }
-       // If we fall out, no matches.
-       return false;
+       return newMatch;
     }
 
     private REMatch matchRest(CharIndexed input, final REMatch newMatch) {

Reply via email to