OK, I have a fix.
It's not perfect, though. I think there is a small design flaw in the way
reluctant matches are handled, which makes a proper fix very difficult.
I have made sure that programs get dumped on test failures and also for
interactive tests. (A small helper script is attached which, if put into the
build directory, can be used to run RETest.)
I also fixed a bug that caused various failures when the same compiler was used for
multiple compilations: the compiler would cruise past the end of its program,
chaining together whatever it found there. (This usually did not affect the program,
but occasionally caused the compiler to throw an ArrayIndexOutOfBoundsException.)
Michael
? Clustering.patch
? Clustering-v2.patch
? RECompiler.java
? RETest.java
? Reluctant.patch
? build/run-tests.sh
? docs/RETest2.txt
? docs/RETest3.txt
Index: build/build-regexp.xml
===================================================================
RCS file: /home/cvspublic/jakarta-regexp/build/build-regexp.xml,v
retrieving revision 1.6
diff -u -r1.6 build-regexp.xml
--- build/build-regexp.xml 2001/02/11 23:04:21 1.6
+++ build/build-regexp.xml 2001/02/18 12:44:26
@@ -8,6 +8,7 @@
<!-- =================================================================== -->
<!-- Initializes some variables -->
<!-- =================================================================== -->
+ <property file="${user.home}/.jakarta-regexp.properties"/>
<property name="ant.home" value="."/>
<property name="Name" value="Jakarta-Regexp"/>
<property name="year" value="2001"/>
Index: docs/RETest.txt
===================================================================
RCS file: /home/cvspublic/jakarta-regexp/docs/RETest.txt,v
retrieving revision 1.2
diff -u -r1.2 RETest.txt
--- docs/RETest.txt 2001/02/11 23:04:21 1.2
+++ docs/RETest.txt 2001/02/18 12:44:27
@@ -978,3 +978,37 @@
www.test.com
YES
www.test.com
+
+#163
+abc.*?x+yz
+abcaaaaaxyzbbbbxyz
+YES
+abcaaaaaxyz
+
+#164
+abc.+?x+yz
+abcaaaaaxyzbbbbxyz
+YES
+abcaaaaaxyz
+
+#165
+a.+?(c|d)
+aaaacaaaaad
+YES
+aaaac
+c
+
+#166
+a.+(c|d)
+aaaacaaaaad
+YES
+aaaacaaaaad
+d
+
+#167
+a+?b+?c+?
+aaabccaaabbbccc
+YES
+aaabc
+
+
Index: src/java/org/apache/regexp/RECompiler.java
===================================================================
RCS file: /home/cvspublic/jakarta-regexp/src/java/org/apache/regexp/RECompiler.java,v
retrieving revision 1.3
diff -u -r1.3 RECompiler.java
--- src/java/org/apache/regexp/RECompiler.java 2001/02/11 23:04:22 1.3
+++ src/java/org/apache/regexp/RECompiler.java 2001/02/18 12:44:32
@@ -71,6 +71,7 @@
* @see recompile
*
* @author <a href="mailto:[EMAIL PROTECTED]">Jonathan Locke</a>
+ * @author <a href="mailto:[EMAIL PROTECTED]">Michael McCallum</a>
* @version $Id: RECompiler.java,v 1.3 2001/02/11 23:04:22 jon Exp $
*/
public class RECompiler
@@ -203,14 +204,29 @@
void setNextOfEnd(int node, int pointTo)
{
// Traverse the chain until the next offset is 0
- int next;
- while ((next = instruction[node + RE.offsetNext]) != 0)
- {
+ int next = instruction[node + RE.offsetNext];
+ // while the 'node' is not the last in the chain
+ // and the 'node' is not the last in the program.
+ while ( next != 0 && node < lenInstruction )
+ {
+ // if the node we are supposed to point to is in the chain then
+ // point to the end of the program instead.
+ // Michael McCallum <[EMAIL PROTECTED]>
+ // FIXME: // This is a _hack_ to stop infinite programs.
+ // I believe that the implementation of the reluctant matches is wrong but
+ // have not worked out a better way yet.
+ if ( node == pointTo ) {
+ pointTo = lenInstruction;
+ }
node += next;
+ next = instruction[node + RE.offsetNext];
}
-
- // Point the last node in the chain to pointTo.
- instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
+ // if we have reached the end of the program then dont set the pointTo.
+ // im not sure if this will break any thing but passes all the tests.
+ if ( node < lenInstruction ) {
+ // Point the last node in the chain to pointTo.
+ instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
+ }
}
/**
@@ -1258,13 +1274,18 @@
setNextOfEnd(ret, end);
// Hook the ends of each branch to the end node
- for (int next = -1, i = ret; next != 0; next = instruction[i +
RE.offsetNext], i += next)
+ int currentNode = ret;
+ int nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
+ // while the next node o
+ while ( nextNodeOffset != 0 && currentNode < lenInstruction )
{
// If branch, make the end of the branch's operand chain point to the end
node.
- if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH)
+ if ( instruction[ currentNode + RE.offsetOpcode ] == RE.OP_BRANCH )
{
- setNextOfEnd(i + RE.nodeSize, end);
+ setNextOfEnd( currentNode + RE.nodeSize, end );
}
+ nextNodeOffset = instruction[ currentNode + RE.offsetNext ];
+ currentNode += nextNodeOffset;
}
// Return the node list
Index: src/java/org/apache/regexp/REDebugCompiler.java
===================================================================
RCS file:
/home/cvspublic/jakarta-regexp/src/java/org/apache/regexp/REDebugCompiler.java,v
retrieving revision 1.1
diff -u -r1.1 REDebugCompiler.java
--- src/java/org/apache/regexp/REDebugCompiler.java 2000/04/27 01:22:33 1.1
+++ src/java/org/apache/regexp/REDebugCompiler.java 2001/02/18 12:44:33
@@ -95,6 +95,8 @@
hashOpcode.put(new Integer(RE.OP_CLOSE), "OP_CLOSE");
hashOpcode.put(new Integer(RE.OP_BACKREF), "OP_BACKREF");
hashOpcode.put(new Integer(RE.OP_POSIXCLASS), "OP_POSIXCLASS");
+ hashOpcode.put(new Integer(RE.OP_OPEN_CLUSTER), "OP_OPEN_CLUSTER");
+ hashOpcode.put(new Integer(RE.OP_CLOSE_CLUSTER), "OP_CLOSE_CLUSTER");
}
/**
@@ -146,6 +148,38 @@
// Return opcode as a string and opdata value
return opcodeToString(opcode) + ", opdata = " + opdata;
}
+
+ /**
+ * Inserts a node with a given opcode and opdata at insertAt. The node relative
+next
+ * pointer is initialized to 0.
+ * @param opcode Opcode for new node
+ * @param opdata Opdata for new node (only the low 16 bits are currently used)
+ * @param insertAt Index at which to insert the new node in the program * /
+ void nodeInsert(char opcode, int opdata, int insertAt) {
+ System.out.println( "====> " + opcode + " " + opdata + " " + insertAt );
+ PrintWriter writer = new PrintWriter( System.out );
+ dumpProgram( writer );
+ super.nodeInsert( opcode, opdata, insertAt );
+ System.out.println( "====< " );
+ dumpProgram( writer );
+ writer.flush();
+ }/**/
+
+
+ /**
+ * Appends a node to the end of a node chain
+ * @param node Start of node chain to traverse
+ * @param pointTo Node to have the tail of the chain point to * /
+ void setNextOfEnd(int node, int pointTo) {
+ System.out.println( "====> " + node + " " + pointTo );
+ PrintWriter writer = new PrintWriter( System.out );
+ dumpProgram( writer );
+ super.setNextOfEnd( node, pointTo );
+ System.out.println( "====< " );
+ dumpProgram( writer );
+ writer.flush();
+ }/**/
+
/**
* Dumps the current program to a PrintWriter
Index: src/java/org/apache/regexp/RETest.java
===================================================================
RCS file: /home/cvspublic/jakarta-regexp/src/java/org/apache/regexp/RETest.java,v
retrieving revision 1.3
diff -u -r1.3 RETest.java
--- src/java/org/apache/regexp/RETest.java 2001/02/11 23:04:22 1.3
+++ src/java/org/apache/regexp/RETest.java 2001/02/18 12:44:35
@@ -65,6 +65,7 @@
*
* @author <a href="mailto:[EMAIL PROTECTED]">Jonathan Locke</a>
* @author <a href="mailto:[EMAIL PROTECTED]">Jon S. Stevens</a>
+ * @author <a href="mailto:[EMAIL PROTECTED]">Michael McCallum</a>
* @version $Id: RETest.java,v 1.3 2001/02/11 23:04:22 jon Exp $
*/
public class RETest
@@ -144,10 +145,13 @@
say("\n" + expr + "\n");
// Show program for compiled expression
- compiler.dumpProgram(new PrintWriter(System.out));
+ PrintWriter writer = new PrintWriter( System.out );
+ compiler.dumpProgram( writer );
+ writer.flush();
+ boolean running = true;
// Test matching against compiled expression
- while (true)
+ while ( running )
{
// Read from keyboard
BufferedReader br = new BufferedReader(new
InputStreamReader(System.in));
@@ -155,18 +159,26 @@
System.out.flush();
String match = br.readLine();
- // Try a match against the keyboard input
- if (r.match(match))
+ if ( match != null )
{
- say("Match successful.");
+ // Try a match against the keyboard input
+ if (r.match(match))
+ {
+ say("Match successful.");
+ }
+ else
+ {
+ say("Match failed.");
+ }
+
+ // Show subparen registers
+ showParens(r);
}
else
{
- say("Match failed.");
+ running = false;
+ System.out.println();
}
-
- // Show subparen registers
- showParens(r);
}
}
catch (Exception e)
@@ -187,8 +199,9 @@
}
/**
- * Fail with an error
- * @param s Failure description
+ * Fail with an error.
+ * Will print a big failure message to System.out.
+ * @param s Failure description
*/
void fail(String s)
{
@@ -199,8 +212,11 @@
say("*******************************************************");
say("\n");
say(s);
- say("");
- compiler.dumpProgram(new PrintWriter(System.out));
+ say("");
+ // make sure the writer gets flushed.
+ PrintWriter writer = new PrintWriter( System.out );
+ compiler.dumpProgram( writer );
+ writer.flush();
say("\n");
}
@@ -371,7 +387,9 @@
}
// Wasn't supposed to be an error
- fail("Produces the unexpected error \"" + e.getMessage() + "\"");
+ String message = e.getMessage() == null ? e.toString() :
+e.getMessage();
+ fail("Produces an unexpected exception \"" + message + "\"");
+ e.printStackTrace();
}
catch (Error e)
{
Index: xdocs/RETest.txt
===================================================================
RCS file: /home/cvspublic/jakarta-regexp/xdocs/RETest.txt,v
retrieving revision 1.2
diff -u -r1.2 RETest.txt
--- xdocs/RETest.txt 2001/02/11 23:04:23 1.2
+++ xdocs/RETest.txt 2001/02/18 12:44:36
@@ -978,3 +978,35 @@
www.test.com
YES
www.test.com
+
+#163
+abc.*?x+yz
+abcaaaaaxyzbbbbxyz
+YES
+abcaaaaaxyz
+
+#164
+abc.+?x+yz
+abcaaaaaxyzbbbbxyz
+YES
+abcaaaaaxyz
+
+#165
+a.+?(c|d)
+aaaacaaaaad
+YES
+aaaac
+c
+
+#166
+a.+(c|d)
+aaaacaaaaad
+YES
+aaaacaaaaad
+d
+
+#167
+a+?b+?c+?
+aaabccaaabbbccc
+YES
+aaabc
run-tests.sh