From: Ito Kazumitsu <[EMAIL PROTECTED]>
Date: Fri, 17 Feb 2006 01:13:11 +0900 (JST)

> The bad news is that the performance has worsened
> a bit (although the number of nodes we have to visit
> is equal to that of a width-first search, the overhead
> of loop control may be heavier, I guess). And there
> remains a very difficult case:
> 
> /^(b+?|a){1,2}?c/
>     bbc
>  0: bbc
>  1: b
> 
> The reluctant operator ? of b+? comes befoe that of
> (){1,2}?, so the former should be stronger than the latter.
> 
> But in our implementation, b+? belongs to (){1,2}? and the
> reluctance of the latter is stronger.

I may have been doing something wrong and now this case also
passes.  I changed the binary search to simple linear search,
which is faster in this case where the array size is usually
small.

ChangeLog

2006-02-18  Ito Kazumitsu  <[EMAIL PROTECTED]>

        * gnu/regexp/REMatch.java(matchFlags): New int field used as
        option flags passed to match methods.
        (MF_FIND_ALL): New flag.
        * gnu/regexp/RETokenOneOf.java(matchP): Unless MF_FIND_ALL is set,
        do not try other possibilties once a match is found.
        * gnu/regexp/RETokenRepeated.java(findDoables): Set MF_FIND_ALL
        so that all possibilities can be found.
        (match): Rewritten using new methods matchMinimum and _match.
        (_match): New method which performs a depth-first recursive search.
        (matchMinimum): New method.
        (initVisited), (visitedContains), (addVisited): New methods for
        manipulating an array of icharacter positions which _match has
        already visited.

Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.7
diff -u -r1.7 REMatch.java
--- classpath/gnu/regexp/REMatch.java   9 Feb 2006 13:44:59 -0000       1.7
+++ classpath/gnu/regexp/REMatch.java   17 Feb 2006 16:11:39 -0000
@@ -1,5 +1,5 @@
 /* gnu/regexp/REMatch.java
-   Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2006 Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -69,6 +69,8 @@
     REMatch next; // other possibility (to avoid having to use arrays)
     boolean empty; // empty string matched. This flag is used only within
                   // RETokenRepeated.
+    int matchFlags; // flags passed to match methods
+    static final int MF_FIND_ALL = 0x01;
 
     public Object clone() {
        try {
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.7
diff -u -r1.7 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java      13 Feb 2006 13:19:44 -0000      
1.7
+++ classpath/gnu/regexp/RETokenOneOf.java      17 Feb 2006 16:11:39 -0000
@@ -187,6 +187,8 @@
     }
 
     private boolean matchP(CharIndexed input, REMatch mymatch, boolean 
tryOnly) {
+      boolean stopMatchingIfSatisfied =
+         (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
       REMatch.REMatchList newMatch = new REMatch.REMatchList();
       REToken tk;
       for (int i=0; i < options.size(); i++) {
@@ -204,6 +206,7 @@
        if (tk.match(input, tryMatch)) { // match was successful
          if (tryOnly) return true;
          newMatch.addTail(tryMatch);
+         if (stopMatchingIfSatisfied) break;
        } // is a match
       } // try next option
       if (tryOnly) return false;
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.9
diff -u -r1.9 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java   6 Feb 2006 14:03:59 -0000       
1.9
+++ classpath/gnu/regexp/RETokenRepeated.java   17 Feb 2006 16:11:39 -0000
@@ -1,5 +1,5 @@
 /* gnu/regexp/RETokenRepeated.java
-   Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+   Copyright (C) 2006 Free Software Foundation, Inc.
 
 This file is part of GNU Classpath.
 
@@ -39,20 +39,19 @@
 package gnu.regexp;
 
 import java.util.Vector;
+import java.util.Arrays;
 
 final class RETokenRepeated extends REToken {
     private REToken token;
     private int min,max;
     private boolean stingy;
     private boolean possessive;
-    private boolean alwaysEmpty; // Special case of {0}
     
     RETokenRepeated(int subIndex, REToken token, int min, int max) {
        super(subIndex);
        this.token = token;
        this.min = min;
        this.max = max;
-       alwaysEmpty = (min == 0 && max == 0);
     }
 
     /** Sets the minimal matching mode to true. */
@@ -91,8 +90,6 @@
        return (max * tmax);
     }
 
-    boolean stopMatchingIfSatisfied = true;
-
     private static REMatch findDoables(REToken tk,
                        CharIndexed input, REMatch mymatch) {
 
@@ -105,7 +102,11 @@
                int origin = recurrent.index;
                tk = (REToken) tk.clone();
                tk.next = tk.uncle = null;
+               recurrent.matchFlags |= REMatch.MF_FIND_ALL;
                if (tk.match(input, recurrent)) {
+                   for (REMatch m = recurrent; m != null; m = m.next) {
+                       m.matchFlags &= ~REMatch.MF_FIND_ALL;
+                   }
                    if (recurrent.index == origin) recurrent.empty = true;
                    // add all items in current to doables array
                    doables.addTail(recurrent);
@@ -123,34 +124,167 @@
     // the subexpression back-reference operator allow that?
 
     boolean match(CharIndexed input, REMatch mymatch) {
-       // Possible positions for the next repeat to match at
-       REMatch newMatch = mymatch;
 
-       // {0} needs some special treatment.
-       if (alwaysEmpty) {
-           REMatch result = matchRest(input, newMatch);
-           if (result != null) {
-               mymatch.assignFrom(result);
-               return true;
+        boolean stopMatchingIfSatisfied =
+               (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
+
+       REMatch newMatch = matchMinimum(input, mymatch);
+       if (newMatch == null) return false;
+
+       // Array of positions we have already visited
+       int[] visited = initVisited();
+       for (REMatch m = newMatch; m != null; m = m.next) {
+           visited = addVisited(m.index, visited);
+       }
+
+       int max1 = decreaseMax(max, min);
+
+       newMatch = _match(input, newMatch, max1,
+           stopMatchingIfSatisfied, visited);
+       if (newMatch != null) {
+           mymatch.assignFrom(newMatch);
+           return true;
+       }
+       return false;
+    }
+
+    private static int decreaseMax(int m, int n) {
+        if (m == Integer.MAX_VALUE) return m;
+       return m - n;
+    }
+
+    // Array visited is an array of character positions we have already
+    // visited. visited[0] is used to store the effective length of the
+    // array.
+    private static int[] initVisited() {
+       int[] visited = new int[32];
+       visited[0] = 0;
+       return visited;
+    }
+
+    private static boolean visitedContains(int n, int[] visited) {
+       // Experience tells that for a small array like this,
+       // simple linear search is faster than binary search.
+       for (int i = 1; i < visited[0]; i++) {
+           if (n == visited[i]) return true;
+       }
+       return false;
+    }
+
+    private static int[] addVisited(int n, int[] visited) {
+       if (visitedContains(n, visited)) return visited;
+       if (visited[0] >= visited.length - 1) {
+           int[] newvisited = new int[visited.length + 32];
+           System.arraycopy(visited, 0, newvisited, 0, visited.length);
+           visited = newvisited;
+       }
+       visited[0]++;
+       visited[visited[0]] = n;
+       return visited;
+    }
+
+    private REMatch _match(CharIndexed input, REMatch mymatch,
+           int max1, boolean stopMatchingIfSatisfied,
+           int[] visited) {
+
+        if (max1 == 0) {
+           return matchRest(input, mymatch);
+       }
+       max1 = decreaseMax(max1, 1);
+
+       REMatch.REMatchList allResults = new REMatch.REMatchList();
+
+       // Depth-first search
+
+       for (REMatch cur = mymatch; cur != null; cur = cur.next) {
+
+           REMatch cur1 = (REMatch) cur.clone();
+
+           if (stingy) {
+               REMatch results = matchRest(input, cur1);
+               if (results != null) {
+                   if (stopMatchingIfSatisfied) {
+                       return results;
+                   }
+                   allResults.addTail(results);
+               }
+           }
+
+           DO_THIS:
+           do {
+
+           boolean emptyMatchFound = false;
+           REMatch doables = findDoables(token, input, cur1);
+           if (doables == null) break DO_THIS;
+           if (doables.empty) emptyMatchFound = true;
+
+           if (!emptyMatchFound) {
+               REMatch.REMatchList list = new REMatch.REMatchList();
+               for (REMatch m = doables; m != null; m = m.next) {
+                   REMatch m1 = (REMatch) m.clone();
+                   int n = m1.index;
+                   if (! visitedContains(n, visited)) {
+                       visited = addVisited(n, visited);
+                       list.addTail(m1);
+                   }
+               }
+               if (list.head == null) break DO_THIS;
+               doables = list.head;
            }
-           else {
-               return false;
+
+           for (REMatch m = doables; m != null; m = m.next) {
+               if (! emptyMatchFound) {
+                   REMatch m1 = _match(input, m, max1,
+                       stopMatchingIfSatisfied, visited);
+                   if (possessive) return m1;
+                   if (m1 != null) {
+                       if (stopMatchingIfSatisfied) {
+                           return m1;
+                       }
+                       allResults.addTail(m1);
+                   }
+               }
+               else {
+                   REMatch m1 = matchRest(input, m);
+                   if (m1 != null) {
+                       if (stopMatchingIfSatisfied) {
+                           return m1;
+                       }
+                       allResults.addTail(m1);
+                   }
+               }
+           }
+
+           } while (false); // DO_THIS only once;
+
+           // This point itself is a candidate.
+           if (!stingy) {
+               REMatch m2 = matchRest(input, cur1);
+               if (m2 != null) {
+                   if (stopMatchingIfSatisfied) {
+                       return m2;
+                   }
+                   allResults.addTail(m2);
+               }
            }
        }
 
+        return allResults.head;
+    }
+
+    private REMatch matchMinimum(CharIndexed input, final REMatch mymatch) {
+       // Possible positions for the next repeat to match at
+       REMatch newMatch = mymatch;
+
        // number of times we've matched so far
        int numRepeats = 0; 
        
-       REMatch doables;
-       int lastIndex = mymatch.index;
-       boolean emptyMatchFound = false;
-
        while (numRepeats < min) {
-           doables = findDoables(token, input, newMatch);
+           REMatch doables = findDoables(token, input, newMatch);
 
            // if none of the possibilities worked out, 
            // it means that minimum number of repeats could not be found.
-           if (doables == null) return false;
+           if (doables == null) return null;
            
            // reassign where the next repeat can match
            newMatch = doables;
@@ -158,96 +292,9 @@
            // increment how many repeats we've successfully found
            ++numRepeats;
            
-           if (newMatch.empty) {
-               numRepeats = min;
-               emptyMatchFound = true;
-               break;
-           }
-           lastIndex = newMatch.index;
-       }
-
-       Vector positions = new Vector();
-
-       while (numRepeats <= max) {
-           // We want to check something like  
-           //    if (stingy)
-           // and neglect the further matching.  But experience tells
-           // such neglection may cause incomplete matching.
-           // For example, if we neglect the seemingly unnecessay
-           // matching, /^(b+?|a){1,2}?c/ cannot match "bbc".
-           // On the other hand, if we do not stop the unnecessary
-           // matching, /(([a-c])b*?\2)*/ matches "ababbbcbc"
-           // entirely when we wan to find only "ababb".
-           // In order to make regression tests pass, we do as we did.
-           if (stopMatchingIfSatisfied && stingy) {
-               REMatch results = matchRest(input, newMatch);
-               if (results != null) {
-                   mymatch.assignFrom(results);
-                   return true;
-               }
-           }
-           positions.add(newMatch);
-           if (emptyMatchFound) break;
-
-           doables = findDoables(token, input, newMatch);
-           if (doables == null) break;
-
-           // doables.index == lastIndex occurs either
-           //   (1) when an empty string was the longest
-           //       that matched this token.
-           // or
-           //   (2) when the same string matches this token many times.
-           //       For example, "acbab" itself matches "a.*b" and
-           //       its substrings "acb" and "ab" also match.
-           //       In this case, we do not have to go further until
-           //       numRepeats == max because the more numRepeats grows,
-           //       the shorter the substring matching this token becomes.
-           //       So the previous succesful match must have bee the best
-           //       match.  But this is not necessarily the case if stingy.
-           if (doables.index == lastIndex) {
-               if (doables.empty) {
-                   emptyMatchFound = true;
-                }
-               else {
-                   if (!stingy) break;
-               }
-           }
-           numRepeats++;
-           newMatch = doables;
-           lastIndex = newMatch.index;
-       }
-
-       // We're greedy, but ease off until a true match is found.
-       // At this point we've either got too many or just the right amount.
-       // See if this numRepeats works with the rest of the regexp.
-
-       REMatch.REMatchList allResults = new REMatch.REMatchList();
-
-       int posCount = positions.size();
-       int posIndex = (stingy ? 0 : posCount - 1);
-
-       while (posCount-- > 0) {
-           REMatch m = (REMatch) positions.elementAt(posIndex);
-            if (stingy) posIndex++; else posIndex--;
-
-           REMatch results = matchRest(input, m);
-            if (results != null) {
-               // Order these from longest to shortest
-               // Start by assuming longest (more repeats)
-               // If stingy the order is shortest to longest.
-               allResults.addTail(results);
-           }
-           else {
-               if (possessive) break;
-           }
+           if (newMatch.empty) break;
        }
-
-       if (allResults.head != null) {
-           mymatch.assignFrom(allResults.head); // does this get all?
-           return true;
-       }
-       // If we fall out, no matches.
-       return false;
+       return newMatch;
     }
 
     private REMatch matchRest(CharIndexed input, final REMatch newMatch) {

Reply via email to