Author: vgritsenko
Date: Thu Aug 11 20:04:07 2005
New Revision: 232192
URL: http://svn.apache.org/viewcvs?rev=232192&view=rev
Log:
Applied patches for Bug #27795:
Add optimization for regexps which start with ^ (BOL)
Modified:
jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java
jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java
jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java
jakarta/regexp/trunk/xdocs/changes.xml
Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java
URL:
http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java (original)
+++ jakarta/regexp/trunk/src/java/org/apache/regexp/RE.java Thu Aug 11 20:04:07 2005
@@ -1414,6 +1414,43 @@
// Save string to search
this.search = search;
+ // Can we optimize the search by looking for new lines?
+ if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL)
+ {
+ // Non multi-line matching with BOL: Must match at '0' index
+ if ((matchFlags & MATCH_MULTILINE) == 0)
+ {
+ return i == 0 && matchAt(i);
+ }
+
+ // Multi-line matching with BOL: Seek to next line
+ for ( ;! search.isEnd(i); i++)
+ {
+ // Skip if we are at the beginning of the line
+ if (isNewline(i))
+ {
+ continue;
+ }
+
+ // Match at the beginning of the line
+ if (matchAt(i))
+ {
+ return true;
+ }
+
+ // Skip to the end of line
+ for ( ;! search.isEnd(i); i++)
+ {
+ if (isNewline(i))
+ {
+ break;
+ }
+ }
+ }
+
+ return false;
+ }
+
// Can we optimize the search by looking for a prefix string?
if (program.prefix == null)
{
Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java
URL:
http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java (original)
+++ jakarta/regexp/trunk/src/java/org/apache/regexp/REProgram.java Thu Aug 11 20:04:07 2005
@@ -33,6 +33,7 @@
public class REProgram implements Serializable
{
static final int OPT_HASBACKREFS = 1;
+ static final int OPT_HASBOL = 2;
char[] instruction; // The compiled regular expression 'program'
int lenInstruction; // The amount of the instruction buffer in use
@@ -81,7 +82,7 @@
// Ensure program has been compiled!
if (lenInstruction != 0)
{
- // Return copy of program
+ // Return copy of program
char[] ret = new char[lenInstruction];
System.arraycopy(instruction, 0, ret, 0, lenInstruction);
return ret;
@@ -116,16 +117,23 @@
if (lenInstruction >= RE.nodeSize && instruction[0 + RE.offsetOpcode] == RE.OP_BRANCH)
{
// to the end node
- int next = instruction[0 + RE.offsetNext];
- if (instruction[next + RE.offsetOpcode] == RE.OP_END)
+ char next = instruction[0 + RE.offsetNext];
+ if (instruction[next + RE.offsetOpcode] == RE.OP_END && lenInstruction >= (RE.nodeSize * 2))
{
- // and the branch starts with an atom
- if (lenInstruction >= (RE.nodeSize * 2) && instruction[RE.nodeSize + RE.offsetOpcode] == RE.OP_ATOM)
+ final char nextOp = instruction[RE.nodeSize + RE.offsetOpcode];
+ // the branch starts with an atom
+ if (nextOp == RE.OP_ATOM)
{
// then get that atom as an prefix because there's no other choice
int lenAtom = instruction[RE.nodeSize + RE.offsetOpdata];
prefix = new char[lenAtom];
System.arraycopy(instruction, RE.nodeSize * 2, prefix, 0, lenAtom);
+ }
+ // the branch starts with a BOL
+ else if (nextOp == RE.OP_BOL)
+ {
+ // then set the flag indicating that BOL is present
+ flags |= OPT_HASBOL;
}
}
}
Modified: jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java
URL:
http://svn.apache.org/viewcvs/jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java (original)
+++ jakarta/regexp/trunk/src/java/org/apache/regexp/RETest.java Thu Aug 11 20:04:07 2005
@@ -378,6 +378,12 @@
showParens(r);
}
+ // Test for eol/bol symbols.
+ r = new RE("^abc$");
+ if (r.match("\nabc")) {
+ fail("\"\\nabc\" matches \"^abc$\"");
+ }
+
// Test MATCH_MULTILINE. Test for eol/bol symbols.
r = new RE("^abc$", RE.MATCH_MULTILINE);
if (!r.match("\nabc")) {
Modified: jakarta/regexp/trunk/xdocs/changes.xml
URL:
http://svn.apache.org/viewcvs/jakarta/regexp/trunk/xdocs/changes.xml?rev=232192&r1=232191&r2=232192&view=diff
==============================================================================
--- jakarta/regexp/trunk/xdocs/changes.xml (original)
+++ jakarta/regexp/trunk/xdocs/changes.xml Thu Aug 11 20:04:07 2005
@@ -34,53 +34,56 @@
<h3>Version 1.4-dev</h3>
<ul>
-<li>Fixed Bug
+<li>Applied patches for Bug
+ <a
href="http://issues.apache.org/bugzilla/show_bug.cgi?id=27795">27795</a>:
+ Add optimization for regexps which start with ^ (BOL) (VG)</li>
+<li>Fixed Bug
<a
href="http://issues.apache.org/bugzilla/show_bug.cgi?id=25985">25985</a>:
In MATCH_MULTILINE mode $ does not match end of line (VG)</li>
-<li>Fixed Bug
+<li>Fixed Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=2121">2121</a>:
'.' or '-' in bracket expression gives unexpected results (VG)</li>
<li>Regexp is relicensed to <a
href="http://www.apache.org/licenses/LICENSE-2.0">
Apache License, Version 2.0</a> (VG)</li>
-<li>Fixed Bug
+<li>Fixed Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=2525">2525</a>:
Leading zero-length string splitted by RE (VG)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=4137">4137</a>:
Regexp match gets different results on different platforms (VG)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3303">3303</a>:
Unicode 3.0 character \\uFFFD (VG)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3773">3773</a>:
Problem with parsing greedy match modifiers (VG)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3273">3273</a>:
CharacterArrayCharacterIterator docs and implementation mismatch (VG)</li>
-<li>Fixed Bug
+<li>Fixed Bug
<a
href="http://issues.apache.org/bugzilla/show_bug.cgi?id=22928">22928</a>:
subst() with REPLACE_BACKREFERENCES cuts first 2 characters (VG)</li>
</ul>
<h3>Version 1.3</h3>
<ul>
-<li>Fixed Bug
+<li>Fixed Bug
<a
href="http://issues.apache.org/bugzilla/show_bug.cgi?id=22804">22804</a>:
ArrayIndexOutOfBoundsException on negated classes (VG)</li>
<li>New Feature: subst() can now process backreferences when flag
REPLACE_BACKREFERENCES is set. See API docs for details.
Patch provided by Tobias Schaefer. (VG)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a
href="http://issues.apache.org/bugzilla/show_bug.cgi?id=16592">16592</a>:
Syntax error: Too many bracketed closures (limit is 10) (VG)</li>
-<li>Fixed Bug
+<li>Fixed Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=5212">5212</a>,
aka
<a
href="http://issues.apache.org/bugzilla/show_bug.cgi?id=14954">14954</a>:
A bug caused by '-' in character class definition ('[...]') (VG)</li>
-<li>Fixed Bug
+<li>Fixed Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=4057">4057</a>:
\w does not match underscore (VG)</li>
-<li>Fixed Bug
+<li>Fixed Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=1030">1030</a>,
aka
<a
href="http://issues.apache.org/bugzilla/show_bug.cgi?id=10893">10893</a>:
{n.m} notation work incorrect if n=0 (VG)</li>
@@ -89,22 +92,22 @@
Expressions using {0,n} match 0 to n+1 times instead of 0 to n times.
Now, expression "[a-z]{0,3}" matches "123abcdefg123" resulting in ""
(empty string). (VG)</li>
-<li>Fixed Bug
+<li>Fixed Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=306">306</a>:
Why is the RE class not Serializable? (VG)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3879">3879</a>:
Expressions using {0,n} match 0 to n+1 times instead of 0 to n times.
(JSS)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=7288">7288</a>:
Bug in negative character ranges. (JSS)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=986">986</a>:
Leading "\b" word boundary is ignored. (JSS)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=3877">3877</a>:
{n} and {n,m} not thread safe. (JSS)</li>
-<li>Applied patches for Bug
+<li>Applied patches for Bug
<a href="http://issues.apache.org/bugzilla/show_bug.cgi?id=8467">8467</a>:
Number of paren pairs limited to 16 (JSS)</li>
<li>Fixed RE.grep() documentation to reflect a String[] is returned
@@ -117,7 +120,7 @@
<h3>Version 1.2</h3>
<ul>
<li>Updated to Ant 1.2 (JSS)</li>
-<li>Documentation now built with <a
+<li>Documentation now built with <a
href="http://jakarta.apache.org/site/jakarta-site2.html">Anakia</a> (JSS)</li>
<li><a
href="http://jakarta.apache.org/cvsweb/index.cgi/jakarta-regexp/src/java/org/apache/regexp/RE.java?rev=1.3&content-type=text/vnd.viewcvs-markup">Fixed
bug</a></li>
<li><a
href="http://jakarta.apache.org/cvsweb/index.cgi/jakarta-regexp/src/java/org/apache/regexp/RE.java?rev=1.4&content-type=text/vnd.viewcvs-markup">
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]