Author: vgritsenko Date: Mon Mar 12 20:02:04 2007 New Revision: 517503 URL: http://svn.apache.org/viewvc?view=rev&rev=517503 Log: Reimplemented */+/? closures to use STAR/PLUS/MAYBE operations. Fixed bug #9153.
Modified: jakarta/regexp/trunk/docs/RETest.txt jakarta/regexp/trunk/docs/changes.html jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java jakarta/regexp/trunk/src/java/org/apache/regexp/RECompiler.java jakarta/regexp/trunk/src/java/org/apache/regexp/REDebugCompiler.java jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java jakarta/regexp/trunk/xdocs/RETest.txt jakarta/regexp/trunk/xdocs/changes.xml Modified: jakarta/regexp/trunk/docs/RETest.txt URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/docs/RETest.txt?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/docs/RETest.txt (original) +++ jakarta/regexp/trunk/docs/RETest.txt Mon Mar 12 20:02:04 2007 @@ -1566,3 +1566,8 @@ YES 1 aaabcca + +#231 +^a{1,35}$ +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab +NO Modified: jakarta/regexp/trunk/docs/changes.html URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/docs/changes.html?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/docs/changes.html (original) +++ jakarta/regexp/trunk/docs/changes.html Mon Mar 12 20:02:04 2007 @@ -92,6 +92,9 @@ <h3>Version 1.5-dev</h3> <ul> <li>Fixed Bug + <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=9153">9153</a>: + {m,n} implementation had exponential complexity (VG)</li> +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=27763">27763</a>: RE incorrectly processed reluctant matchers (VG)</li> <li>Fixed Bug Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java (original) +++ jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java Mon Mar 12 20:02:04 2007 @@ -817,6 +817,46 @@ switch (opcode) { + case OP_MAYBE: + { + // Try to match the following subexpr. + // If it succeeds, it will continue matching by itself without returning here. + if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) + { + return idxNew; + } + + // If failed, just continue with the rest of expression + break; + } + + case OP_PLUS: + { + // Try to match the following subexpr again (and again (and ... + if ((idxNew = matchNodes(next, maxNode, idx)) != -1) + { + return idxNew; + } + + // If failed, just continue with the rest of expression + // Rest is located at the next pointer of the next instruction + // which must be OP_CONTINUE. + node = next + instruction[next + offsetNext]; + continue; + } + + case OP_STAR: + { + // Try to match the following subexpr (and again (and again (and ... + if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) + { + return idxNew; + } + + // If failed, just continue with the rest of expression + break; + } + case OP_RELUCTANTMAYBE: { // Try to match the rest without using the reluctant subexpr Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RECompiler.java URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/src/java/org/apache/regexp/RECompiler.java?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/src/java/org/apache/regexp/RECompiler.java (original) +++ jakarta/regexp/trunk/src/java/org/apache/regexp/RECompiler.java Mon Mar 12 20:02:04 2007 @@ -56,13 +56,9 @@ static final int ESC_CLASS = 0xffffd; // Escape represents a whole class of characters // {m,n} stacks - int maxBrackets = 10; // Maximum number of bracket pairs static final int bracketUnbounded = -1; // Unbounded value - int brackets = 0; // Number of bracket sets - int[] bracketStart = null; // Starting point - int[] bracketEnd = null; // Ending point - int[] bracketMin = null; // Minimum number of matches - int[] bracketOpt = null; // Additional optional matches + int bracketMin; // Minimum number of matches + int bracketOpt; // Additional optional matches // Lookup table for POSIX character class names static final Hashtable hashPOSIX = new Hashtable(); @@ -236,59 +232,10 @@ } /** - * Allocate storage for brackets only as needed - */ - void allocBrackets() - { - // Allocate bracket stacks if not already done - if (bracketStart == null) - { - // Allocate storage - bracketStart = new int[maxBrackets]; - bracketEnd = new int[maxBrackets]; - bracketMin = new int[maxBrackets]; - bracketOpt = new int[maxBrackets]; - - // Initialize to invalid values - for (int i = 0; i < maxBrackets; i++) - { - bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1; - } - } - } - - /** Enlarge storage for brackets only as needed. */ - synchronized void reallocBrackets() { - // trick the tricky - if (bracketStart == null) { - allocBrackets(); - } - - int new_size = maxBrackets * 2; - int[] new_bS = new int[new_size]; - int[] new_bE = new int[new_size]; - int[] new_bM = new int[new_size]; - int[] new_bO = new int[new_size]; - // Initialize to invalid values - for (int i=brackets; i<new_size; i++) { - new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1; - } - System.arraycopy(bracketStart,0, new_bS,0, brackets); - System.arraycopy(bracketEnd,0, new_bE,0, brackets); - System.arraycopy(bracketMin,0, new_bM,0, brackets); - System.arraycopy(bracketOpt,0, new_bO,0, brackets); - bracketStart = new_bS; - bracketEnd = new_bE; - bracketMin = new_bM; - bracketOpt = new_bO; - maxBrackets = new_size; - } - - /** * Match bracket {m,n} expression put results in bracket member variables * @exception RESyntaxException Thrown if the regular expression has invalid syntax. */ - void bracket(int bracket) throws RESyntaxException + void bracket() throws RESyntaxException { // Current character must be a '{' if (idx >= len || pattern.charAt(idx++) != '{') @@ -310,7 +257,7 @@ } try { - bracketMin[bracket] = Integer.parseInt(number.toString()); + bracketMin = Integer.parseInt(number.toString()); } catch (NumberFormatException e) { @@ -327,7 +274,7 @@ if (pattern.charAt(idx) == '}') { idx++; - bracketOpt[bracket] = 0; + bracketOpt = 0; return; } @@ -347,7 +294,7 @@ if (pattern.charAt(idx) == '}') { idx++; - bracketOpt[bracket] = bracketUnbounded; + bracketOpt = bracketUnbounded; return; } @@ -365,7 +312,7 @@ } try { - bracketOpt[bracket] = Integer.parseInt(number.toString()) - bracketMin[bracket]; + bracketOpt = Integer.parseInt(number.toString()) - bracketMin; } catch (NumberFormatException e) { @@ -373,7 +320,7 @@ } // Optional repetitions must be >= 0 - if (bracketOpt[bracket] < 0) + if (bracketOpt < 0) { syntaxError("Bad range"); } @@ -1018,132 +965,92 @@ { case '{': { - // We look for our bracket in the list - boolean found = false; - int i; - allocBrackets(); - for (i = 0; i < brackets; i++) - { - if (bracketStart[i] == idx) - { - // Re-initialize bracketMin, bracketOpt since they are all used up by now - if (bracketMin[i] == -1) { - bracket(i); - } - found = true; - break; - } - } + bracket(); + int bracketEnd = idx; + int bracketMin = this.bracketMin; + int bracketOpt = this.bracketOpt; - // If its not in the list we parse the {m,n} - if (!found) - { - if (brackets >= maxBrackets) - { - reallocBrackets(); - } - bracketStart[brackets] = idx; - bracket(brackets); - bracketEnd[brackets] = idx; - i = brackets++; - } + // Pointer to the last terminal + int pos = ret; // Process min first - if (bracketMin[i] > 0) + for (int c = 0; c < bracketMin; c++) { - if (--bracketMin[i] > 0 || bracketOpt[i] != 0) { - // Rewind stream and run it through again - more matchers coming - for (int j = 0; j < i; j++) { - bracketMin[j] = -1; - } - idx = idxBeforeTerminal; - } else { - // Bug #1030: No optional matches - no need to rewind - idx = bracketEnd[i]; - } - break; + // Rewind stream and run it through again - more matchers coming + idx = idxBeforeTerminal; + setNextOfEnd(pos, pos = terminal(terminalFlags)); } // Do the right thing for maximum ({m,}) - if (bracketOpt[i] == bracketUnbounded) + if (bracketOpt == bracketUnbounded) { // Drop through now and closure expression. // We are done with the {m,} expr, so skip rest - closureType = '*'; - bracketOpt[i] = 0; - idx = bracketEnd[i]; + idx = bracketEnd; + nodeInsert(RE.OP_STAR, 0, pos); + setNextOfEnd(pos + RE.nodeSize, pos); + break; } - else - if (bracketOpt[i]-- > 0) + else if (bracketOpt > 0) + { + int opt[] = new int[bracketOpt + 1]; + // Surround first optional terminal with MAYBE + nodeInsert(RE.OP_MAYBE, 0, pos); + opt[0] = pos; + + // Add all the rest optional terminals with preceeding MAYBEs + for (int c = 1; c < bracketOpt; c++) { - if (bracketOpt[i] > 0) { - // More optional matchers - 'play it again sam!' - for (int j = 0; j < i; j++) { - bracketMin[j] = -1; - } - idx = idxBeforeTerminal; - } else { - // Bug #1030: We are done - this one is last and optional - idx = bracketEnd[i]; - } - // Drop through to optionally close - closureType = '?'; + opt[c] = node(RE.OP_MAYBE, 0); + // Rewind stream and run it through again - more matchers coming + idx = idxBeforeTerminal; + terminal(terminalFlags); } - else - { - // Rollback terminal - neither min nor opt matchers present - lenInstruction = ret; - node(RE.OP_NOTHING, 0); - // We are done. skip the rest of {m,n} expr - idx = bracketEnd[i]; - break; + // Tie ends together + int end = opt[bracketOpt] = node(RE.OP_NOTHING, 0); + for (int c = 0; c < bracketOpt; c++) + { + setNextOfEnd(opt[c], end); + setNextOfEnd(opt[c] + RE.nodeSize, opt[c + 1]); } - } - - // Fall through! - - case '?': - case '*': - - if (!greedy) - { - break; } - - if (closureType == '?') + else { - // X? is compiled as (X|) - nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X - setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option - int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING - setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING - setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node + // Rollback terminal - no opt matchers present + lenInstruction = pos; + node(RE.OP_NOTHING, 0); } - if (closureType == '*') - { - // X* is compiled as (X{gotoX}|) - nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X - setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option - setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto - setNextOfEnd(ret + RE.nodeSize, ret); // the start again - setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is - setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING - } + // We are done. skip the reminder of {m,n} expr + idx = bracketEnd; break; + } + + case '?': + { + nodeInsert(RE.OP_MAYBE, 0, ret); + int n = node(RE.OP_NOTHING, 0); + setNextOfEnd(ret, n); + setNextOfEnd(ret + RE.nodeSize, n); + break; + } + + case '*': + { + nodeInsert(RE.OP_STAR, 0, ret); + setNextOfEnd(ret + RE.nodeSize, ret); + break; + } case '+': { - // X+ is compiled as X({gotoX}|) - int branch; - branch = node(RE.OP_BRANCH, 0); // a new branch - setNextOfEnd(ret, branch); // is added to the end of X - setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start - setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option - setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING + nodeInsert(RE.OP_CONTINUE, 0, ret); + int n = node(RE.OP_PLUS, 0); + setNextOfEnd(ret + RE.nodeSize, n); + setNextOfEnd(n, ret); + break; } - break; } } else @@ -1177,6 +1084,7 @@ } } } + return ret; } @@ -1309,16 +1217,16 @@ // Hook the ends of each branch to the end node int currentNode = ret; - int nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; + int nextNodeOffset = instruction[currentNode + RE.offsetNext]; // while the next node o - while ( nextNodeOffset != 0 && currentNode < lenInstruction ) + while (nextNodeOffset != 0 && currentNode < lenInstruction) { // If branch, make the end of the branch's operand chain point to the end node. - if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH ) + if (instruction[currentNode /* + RE.offsetOpcode */] == RE.OP_BRANCH) { - setNextOfEnd( currentNode + RE.nodeSize, end ); + setNextOfEnd(currentNode + RE.nodeSize, end); } - nextNodeOffset = instruction[ currentNode + RE.offsetNext ]; + nextNodeOffset = instruction[currentNode + RE.offsetNext]; currentNode += nextNodeOffset; } @@ -1344,7 +1252,6 @@ idx = 0; // Set parsing index to the first character lenInstruction = 0; // Set emitted instruction count to zero parens = 1; // Set paren level to 1 (the implicit outer parens) - brackets = 0; // No bracketed closures yet // Initialize pass by reference flags value int[] flags = { NODE_TOPLEVEL }; Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/REDebugCompiler.java URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/src/java/org/apache/regexp/REDebugCompiler.java?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/src/java/org/apache/regexp/REDebugCompiler.java (original) +++ jakarta/regexp/trunk/src/java/org/apache/regexp/REDebugCompiler.java Mon Mar 12 20:02:04 2007 @@ -49,6 +49,7 @@ hashOpcode.put(new Integer(RE.OP_MAYBE), "OP_MAYBE"); hashOpcode.put(new Integer(RE.OP_NOTHING), "OP_NOTHING"); hashOpcode.put(new Integer(RE.OP_GOTO), "OP_GOTO"); + hashOpcode.put(new Integer(RE.OP_CONTINUE), "OP_CONTINUE"); hashOpcode.put(new Integer(RE.OP_ESCAPE), "OP_ESCAPE"); hashOpcode.put(new Integer(RE.OP_OPEN), "OP_OPEN"); hashOpcode.put(new Integer(RE.OP_CLOSE), "OP_CLOSE"); Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java (original) +++ jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java Mon Mar 12 20:02:04 2007 @@ -17,17 +17,16 @@ package org.apache.regexp; import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.FileReader; +import java.io.IOException; import java.io.InputStreamReader; -import java.io.PrintWriter; -import java.io.File; -import java.io.ByteArrayOutputStream; -import java.io.ObjectOutputStream; -import java.io.ByteArrayInputStream; import java.io.ObjectInputStream; -import java.io.StringBufferInputStream; +import java.io.ObjectOutputStream; +import java.io.PrintWriter; import java.io.StringReader; -import java.io.IOException; /** * Data driven (and optionally interactive) testing harness to exercise regular @@ -760,7 +759,7 @@ return; log.append(" Match using StreamCharacterIterator\n"); - if (!tryMatchUsingCI(new StreamCharacterIterator(new StringBufferInputStream(toMatch)))) + if (!tryMatchUsingCI(new StreamCharacterIterator(new ByteArrayInputStream(toMatch.getBytes())))) return; log.append(" Match using ReaderCharacterIterator\n"); Modified: jakarta/regexp/trunk/xdocs/RETest.txt URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/xdocs/RETest.txt?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/xdocs/RETest.txt (original) +++ jakarta/regexp/trunk/xdocs/RETest.txt Mon Mar 12 20:02:04 2007 @@ -1566,3 +1566,8 @@ YES 1 aaabcca + +#231 +^a{1,30}$ +aaaaaaaaaaaaaaaaaaaaaaaaaaaab +NO Modified: jakarta/regexp/trunk/xdocs/changes.xml URL: http://svn.apache.org/viewvc/jakarta/regexp/trunk/xdocs/changes.xml?view=diff&rev=517503&r1=517502&r2=517503 ============================================================================== --- jakarta/regexp/trunk/xdocs/changes.xml (original) +++ jakarta/regexp/trunk/xdocs/changes.xml Mon Mar 12 20:02:04 2007 @@ -35,6 +35,9 @@ <h3>Version 1.5-dev</h3> <ul> <li>Fixed Bug + <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=9153">9153</a>: + {m,n} implementation had exponential complexity (VG)</li> +<li>Fixed Bug <a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=27763">27763</a>: RE incorrectly processed reluctant matchers (VG)</li> <li>Fixed Bug --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]