From: Ito Kazumitsu <[EMAIL PROTECTED]>
Date: Sun, 29 Jan 2006 01:15:27 +0900 (JST)

> The patch proposed on Mon, 23 Jan 2006 has been cancelled,
> and here is another patch.

Hopefully improved.

TODO:

This patch cannot make the following test pass.

/^(b+?|a){1,2}?c/
    bbac
 0: bbac
 1: a

/b+?/ matches only "b", and "bb" is not tried. 

ChangeLog
2006-01-29  Ito Kazumitsu  <[EMAIL PROTECTED]>

        Fixes bug #24876
        * gnu/regexp/gnu/regexp/RE.java(REG_TRY_ENTIRE_MATCH):
        New execution flag.
        (getMatchImpl): if REG_TRY_ENTIRE_MATCH is set, add an
        implicit RETokenEnd at the end of the regexp chain.
        Do not select the longest match, but select the first match.
        (match): Do not take care of REMatch.empty.
        * gnu/regexp/REMatch.java(empty): To be used only in RETokenRepeated.
        * gnu/regexp/RETokenOneOf.java: Corrected a typo in a comment.
        * gnu/regexp/RETokenBackRef.java: Do not take care of REMatch.empty.
        * gnu/regexp/RETokenRepeated.java (match): Rewrote stingy matching.
        Do not take care of REMatch.empty. Set and check REMatch.empty
        when trying to match the single token.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.12
diff -u -r1.12 RE.java
--- classpath/gnu/regexp/RE.java        22 Jan 2006 02:22:21 -0000      1.12
+++ classpath/gnu/regexp/RE.java        29 Jan 2006 15:17:37 -0000
@@ -217,6 +217,13 @@
    */
   public static final int REG_NO_INTERPOLATE = 128;
 
+  /**
+   * Execution flag.
+   * Try to match the whole input string. An implicit match-end operator
+   * is added to this regexp.
+   */
+  public static final int REG_TRY_ENTIRE_MATCH = 256;
+
   /** Returns a string representing the version of the gnu.regexp package. */
   public static final String version() {
     return VERSION;
@@ -1265,20 +1272,14 @@
   
     /* Implements abstract method REToken.match() */
     boolean match(CharIndexed input, REMatch mymatch) { 
-       int origin = mymatch.index;
-       boolean b;
        if (firstToken == null) {
-           b = next(input, mymatch);
-           if (b) mymatch.empty = (mymatch.index == origin);
-           return b;
+           return next(input, mymatch);
        }
 
        // Note the start of this subexpression
        mymatch.start[subIndex] = mymatch.index;
 
-       b = firstToken.match(input, mymatch);
-       if (b) mymatch.empty = (mymatch.index == origin);
-       return b;
+       return firstToken.match(input, mymatch);
     }
   
   /**
@@ -1337,23 +1338,34 @@
   }
 
   REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer 
buffer) {
+      boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
+      RE re = (tryEntireMatch ? (RE) this.clone() : this);
+      if (tryEntireMatch) {
+         re.chain(new RETokenEnd(0, null));
+      }
       // Create a new REMatch to hold results
       REMatch mymatch = new REMatch(numSubs, anchor, eflags);
       do {
          // Optimization: check if anchor + minimumLength > length
          if (minimumLength == 0 || input.charAt(minimumLength-1) != 
CharIndexed.OUT_OF_BOUNDS) {
-             if (match(input, mymatch)) {
-                 // Find longest match of them all to observe leftmost longest
-                 REMatch longest = mymatch;
+             if (re.match(input, mymatch)) {
+                 REMatch best = mymatch;
+                 // We assume that the match that coms first is the best.
+                 // And the following "The longer, the better" rule has
+                 // been commented out. The longest is not neccesarily
+                 // the best. For example, "a" out of "aaa" is the best
+                 // match for /a+?/.
+                 /*
+                 // Find best match of them all to observe leftmost longest
                  while ((mymatch = mymatch.next) != null) {
-                     if (mymatch.index > longest.index) {
-                         longest = mymatch;
+                     if (mymatch.index > best.index) {
+                       best = mymatch;
                      }
                  }
-                 
-                 longest.end[0] = longest.index;
-                 longest.finish(input);
-                 return longest;
+                 */
+                 best.end[0] = best.index;
+                 best.finish(input);
+                 return best;
              }
          }
          mymatch.clear(++anchor);
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.3
diff -u -r1.3 REMatch.java
--- classpath/gnu/regexp/REMatch.java   22 Jan 2006 02:22:21 -0000      1.3
+++ classpath/gnu/regexp/REMatch.java   29 Jan 2006 15:17:37 -0000
@@ -67,7 +67,8 @@
     int[] start; // start positions (relative to offset) for each (sub)exp.
     int[] end;   // end positions for the same
     REMatch next; // other possibility (to avoid having to use arrays)
-    boolean empty; // empty string matched
+    boolean empty; // empty string matched. This flag is used only within
+                  // RETokenRepeated.
 
     public Object clone() {
        try {
@@ -89,7 +90,6 @@
        index = other.index;
        // need to deep clone?
        next = other.next;
-       empty = other.empty;
     }
 
     REMatch(int subs, int anchor, int eflags) {
@@ -126,7 +126,6 @@
            start[i] = end[i] = -1;
        }
        next = null; // cut off alternates
-       empty = false;
     }
     
     /**
Index: classpath/gnu/regexp/RETokenBackRef.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenBackRef.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenBackRef.java
--- classpath/gnu/regexp/RETokenBackRef.java    22 Jan 2006 02:22:21 -0000      
1.3
+++ classpath/gnu/regexp/RETokenBackRef.java    29 Jan 2006 15:17:37 -0000
@@ -57,7 +57,6 @@
        b = mymatch.start[num];
        e = mymatch.end[num];
        if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but...
-       int origin = mymatch.index;
        for (int i=b; i<e; i++) {
            char c1 = input.charAt(mymatch.index+i-b);
            char c2 = input.charAt(i);
@@ -74,9 +73,7 @@
            }
        }
        mymatch.index += e-b;
-       boolean result = next(input, mymatch);
-       if (result) mymatch.empty = (mymatch.index == origin);
-       return result;
+       return next(input, mymatch);
     }
     
     void dump(StringBuffer os) {
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java      23 Jan 2006 13:18:10 -0000      
1.3
+++ classpath/gnu/regexp/RETokenOneOf.java      29 Jan 2006 15:17:37 -0000
@@ -98,7 +98,7 @@
     REMatch last = null;
     REToken tk;
     for (int i=0; i < options.size(); i++) {
-       // In ordaer that the backtracking can work,
+       // In order that the backtracking can work,
        // each option must be chained to the next token.
        // But the chain method has some side effect, so
        // we use clones.
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java   22 Jan 2006 02:22:21 -0000      
1.6
+++ classpath/gnu/regexp/RETokenRepeated.java   29 Jan 2006 15:17:37 -0000
@@ -84,6 +84,8 @@
        return (min * token.getMinimumLength());
     }
 
+    boolean stopMatchingIfSatisfied = true;
+
     // We do need to save every possible point, but the number of clone()
     // invocations here is really a killer for performance on non-stingy
     // repeat operators.  I'm open to suggestions...
@@ -93,7 +95,6 @@
     // the subexpression back-reference operator allow that?
 
     boolean match(CharIndexed input, REMatch mymatch) {
-       int origin = mymatch.index;
        // number of times we've matched so far
        int numRepeats = 0; 
        
@@ -113,19 +114,38 @@
        REMatch recurrent;
        int lastIndex = mymatch.index;
 
+       // {0} needs some special treatment.
+       if (alwaysEmpty) {
+           REMatch result = matchRest(input, newMatch);
+           if (result != null) {
+               mymatch.assignFrom(result);
+               return true;
+           }
+           else {
+               return false;
+           }
+       }
+
        do {
-           // Check for stingy match for each possibility.
-           if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
-               REMatch result = matchRest(input, newMatch);
-               if (result != null) {
-                   mymatch.assignFrom(result);
-                   mymatch.empty = (mymatch.index == origin);
-                   return true;
-               }
-               else {
-               // Special case of {0}. It must always match an empty string.
-                   if (alwaysEmpty) return false;
-               }
+           // We want to check something like  
+           //    if (stingy && (numRepeats >= min))
+           // and neglect the further matching.  But experience tells
+           // such neglection may cause incomplete matching.
+           // For example, if we neglect the seemingly unnecessay
+           // matching, /^(b+?|a){1,2}?c/ cannot match "bbc".
+           // On the other hand, if we do not stop the unnecessary
+           // matching, /(([a-c])b*?\2)*/ matches "ababbbcbc"
+           // entirely when we wan to find only "ababb".
+           // In order to make regression tests pass, we do as we did.
+           if (stopMatchingIfSatisfied && stingy && (numRepeats >= min)) {
+               REMatch result = matchRest(input, newMatch);
+               if (result != null) {
+                   StringBuffer sb = new StringBuffer();
+                   token.dumpAll(sb);
+                   System.err.println("token=" + sb + " result.index=" + 
result.index);
+                   mymatch.assignFrom(result);
+                   return true;
+               }
            }
 
            doables = null;
@@ -134,7 +154,9 @@
            // try next repeat at all possible positions
            for (current = newMatch; current != null; current = current.next) {
                recurrent = (REMatch) current.clone();
+               int origin = recurrent.index;
                if (token.match(input, recurrent)) {
+                   if (recurrent.index == origin) recurrent.empty = true;
                    // add all items in current to doables array
                    if (doables == null) {
                        doables = recurrent;
@@ -222,8 +244,11 @@
            // desired.
            indexCount = 1;
        }
+       if (stingy) {
+           posIndex = (posIndex - indexCount - 1);
+       }
        while (indexCount-- > 0) {
-           --posIndex;
+           if (stingy) ++posIndex; else --posIndex;
            newMatch = (REMatch) positions.elementAt(posIndex);
            results = matchRest(input, newMatch);
            if (results != null) {
@@ -233,6 +258,7 @@
                } else {
                    // Order these from longest to shortest
                    // Start by assuming longest (more repeats)
+                   // If stingy the order is shortest to longest.
                    allResultsLast.next = results;
                }
                // Find new doablesLast
@@ -246,7 +272,6 @@
        }
        if (allResults != null) {
            mymatch.assignFrom(allResults); // does this get all?
-           mymatch.empty = (mymatch.index == origin);
            return true;
        }
        // If we fall out, no matches.

Reply via email to