Author: amassari
Date: Thu Aug 30 09:23:48 2007
New Revision: 571231
URL: http://svn.apache.org/viewvc?rev=571231&view=rev
Log:
- Don't allocate a stack unless the string to be matched is longer than 256
characters
- Don't use backtracking if the regex pattern doesn't have ambiguities
(XERCESC-1242)
Modified:
xerces/c/trunk/src/xercesc/util/regx/Op.hpp
xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp
xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp
xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp
Modified: xerces/c/trunk/src/xercesc/util/regx/Op.hpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/Op.hpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/Op.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/Op.hpp Thu Aug 30 09:23:48 2007
@@ -41,26 +41,28 @@
public:
typedef enum {
- O_DOT = 0,
- O_CHAR = 1,
- O_RANGE = 3,
- O_NRANGE = 4,
- O_ANCHOR = 5,
- O_STRING = 6,
- O_CLOSURE = 7,
- O_NONGREEDYCLOSURE = 8,
- O_QUESTION = 9,
- O_NONGREEDYQUESTION = 10,
- O_UNION = 11,
- O_CAPTURE = 15,
- O_BACKREFERENCE = 16,
- O_LOOKAHEAD = 20,
- O_NEGATIVELOOKAHEAD = 21,
- O_LOOKBEHIND = 22,
- O_NEGATIVELOOKBEHIND = 23,
- O_INDEPENDENT = 24,
- O_MODIFIER = 25,
- O_CONDITION = 26
+ O_DOT = 0,
+ O_CHAR = 1,
+ O_RANGE = 3,
+ O_NRANGE = 4,
+ O_ANCHOR = 5,
+ O_STRING = 6,
+ O_CLOSURE = 7,
+ O_NONGREEDYCLOSURE = 8,
+ O_FINITE_CLOSURE = 9,
+ O_FINITE_NONGREEDYCLOSURE = 10,
+ O_QUESTION = 11,
+ O_NONGREEDYQUESTION = 12,
+ O_UNION = 13,
+ O_CAPTURE = 15,
+ O_BACKREFERENCE = 16,
+ O_LOOKAHEAD = 20,
+ O_NEGATIVELOOKAHEAD = 21,
+ O_LOOKBEHIND = 22,
+ O_NEGATIVELOOKBEHIND = 23,
+ O_INDEPENDENT = 24,
+ O_MODIFIER = 25,
+ O_CONDITION = 26
} opType;
// -----------------------------------------------------------------------
Modified: xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp Thu Aug 30 09:23:48 2007
@@ -76,6 +76,8 @@
TokenFactory* const tokFactory,
MemoryManager* const manager =
XMLPlatformUtils::fgMemoryManager);
+ bool empty() const;
+
// -----------------------------------------------------------------------
// Match methods
// -----------------------------------------------------------------------
@@ -85,8 +87,7 @@
// Creates the map. This will happen automatically,
// necessary.
// -----------------------------------------------------------------------
- void
- createMap();
+ void createMap();
private:
// -----------------------------------------------------------------------
@@ -130,6 +131,10 @@
}
}
+inline bool RangeToken::empty() const
+{
+ return fElemCount==0;
+}
XERCES_CPP_NAMESPACE_END
Modified: xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp Thu Aug 30
09:23:48 2007
@@ -601,7 +601,7 @@
/*
* Check whether the expression start with ".*"
*/
- if (fOperations != 0 && fOperations->getOpType() == Op::O_CLOSURE
+ if (fOperations != 0 && (fOperations->getOpType() == Op::O_CLOSURE ||
fOperations->getOpType() == Op::O_FINITE_CLOSURE)
&& fOperations->getChild()->getOpType() == Op::O_DOT) {
if (isSet(fOptions, SINGLE_LINE)) {
@@ -995,7 +995,13 @@
int RegularExpression::match(Context* const context, const Op* const operations
, XMLSize_t offset,
const short direction)
{
- ValueStackOf<RE_RuntimeContext> opStack(0, context->fMemoryManager);
+ ValueStackOf<RE_RuntimeContext>* opStack=NULL;
+ Janitor<ValueStackOf<RE_RuntimeContext> > janStack(NULL);
+ if(context->fLimit > 256)
+ {
+ opStack=new ValueStackOf<RE_RuntimeContext>(16,
context->fMemoryManager);
+ janStack.reset(opStack);
+ }
const Op* tmpOp = operations;
bool ignoreCase = isSet(fOptions, IGNORE_CASE);
int doReturn;
@@ -1049,6 +1055,59 @@
else
tmpOp = tmpOp->getNextOp();
break;
+ case Op::O_FINITE_CLOSURE:
+ {
+ XMLInt32 id = tmpOp->getData();
+ // if id is not -1, it's a closure with a child token
having a minumum length,
+ // where id is the index of the fOffsets array where its
status is stored
+ if (id >= 0) {
+ int prevOffset = context->fOffsets[id];
+ if (prevOffset < 0 || prevOffset !=
(int)offset) {
+ context->fOffsets[id] =
(int)offset;
+ }
+ else {
+ // the status didn't change, we haven't found
other copies; move on to the next match
+ context->fOffsets[id] = -1;
+ tmpOp = tmpOp->getNextOp();
+ break;
+ }
+ }
+
+ // match the subitems until they do
+ int ret;
+ while((ret = match(context, tmpOp->getChild(), offset,
direction)) != -1)
+ {
+ if(offset == (XMLSize_t)ret)
+ break;
+ offset = ret;
+ }
+
+ if (id >= 0) {
+ // loop has ended, reset the status for this closure
+ context->fOffsets[id] = -1;
+ }
+ tmpOp = tmpOp->getNextOp();
+ }
+ break;
+ case Op::O_FINITE_NONGREEDYCLOSURE:
+ {
+ int ret =
match(context,tmpOp->getNextOp(),offset,direction);
+ if (ret >= 0)
+ doReturn = ret;
+ else
+ {
+ // match the subitems until they do
+ int ret;
+ while((ret = match(context, tmpOp->getChild(), offset,
direction)) != -1)
+ {
+ if(offset == (XMLSize_t)ret)
+ break;
+ offset = ret;
+ }
+ tmpOp = tmpOp->getNextOp();
+ }
+ }
+ break;
case Op::O_CLOSURE:
{
XMLInt32 id = tmpOp->getData();
@@ -1067,14 +1126,40 @@
}
}
- opStack.push(RE_RuntimeContext(tmpOp,
offset));
- tmpOp = tmpOp->getChild();
+ if(opStack!=NULL)
+ {
+ opStack->push(RE_RuntimeContext(tmpOp,
offset));
+ tmpOp = tmpOp->getChild();
+ }
+ else
+ {
+ int ret = match(context,
tmpOp->getChild(), offset, direction);
+ if (id >= 0) {
+ context->fOffsets[id] = -1;
+ }
+
+ if (ret >= 0)
+ doReturn = ret;
+ else
+ tmpOp = tmpOp->getNextOp();
+ }
}
break;
case Op::O_QUESTION:
{
- opStack.push(RE_RuntimeContext(tmpOp,
offset));
- tmpOp = tmpOp->getChild();
+ if(opStack!=NULL)
+ {
+ opStack->push(RE_RuntimeContext(tmpOp,
offset));
+ tmpOp = tmpOp->getChild();
+ }
+ else
+ {
+ int ret = match(context,
tmpOp->getChild(), offset, direction);
+ if (ret >= 0)
+ doReturn = ret;
+ else
+ tmpOp = tmpOp->getNextOp();
+ }
}
break;
case Op::O_NONGREEDYCLOSURE:
@@ -1151,9 +1236,9 @@
}
}
if (doReturn != -5) {
- if (opStack.size() == 0)
+ if (opStack==NULL || opStack->size() == 0)
return doReturn;
- RE_RuntimeContext ctx = opStack.pop();
+ RE_RuntimeContext ctx = opStack->pop();
tmpOp = ctx.op_;
offset = ctx.offs_;
if (tmpOp->getOpType() == Op::O_CLOSURE) {
@@ -1462,7 +1547,7 @@
bestResult=ret;
bestResultContext=tmpContext;
// exit early, if we reached the end of the string
- if(ret == context->fLimit)
+ if((XMLSize_t)ret == context->fLimit)
break;
}
}
@@ -1788,6 +1873,57 @@
return wordTypeOther;
}
+bool RegularExpression::doTokenOverlap(const Op* op, Token* token)
+{
+ if(op->getOpType()==Op::O_RANGE)
+ {
+ RangeToken* t1=(RangeToken*)op->getToken();
+ switch(token->getTokenType())
+ {
+ case Token::T_CHAR:
+ return t1->match(token->getChar());
+ case Token::T_STRING:
+ return t1->match(*token->getString());
+ case Token::T_RANGE:
+ {
+ try
+ {
+ RangeToken tempRange(Token::T_RANGE, fMemoryManager);
+ tempRange.mergeRanges(t1);
+ tempRange.intersectRanges((RangeToken*)token);
+ return !tempRange.empty();
+ }
+ catch(RuntimeException&)
+ {
+ }
+ break;
+ }
+ }
+ return true;
+ }
+
+ XMLInt32 ch=0;
+ if(op->getOpType()==Op::O_CHAR)
+ ch=op->getData();
+ else if(op->getOpType()==Op::O_STRING)
+ ch=*op->getLiteral();
+
+ if(ch!=0)
+ {
+ switch(token->getTokenType())
+ {
+ case Token::T_CHAR:
+ return token->getChar()==ch;
+ case Token::T_STRING:
+ return *token->getString()==ch;
+ case Token::T_RANGE:
+ case Token::T_NRANGE:
+ return ((RangeToken*)token)->match(ch);
+ }
+ }
+ // in any other case, there is the chance that they overlap
+ return true;
+}
XERCES_CPP_NAMESPACE_END
Modified: xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp
URL:
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp Thu Aug 30
09:23:48 2007
@@ -291,6 +291,8 @@
Op* compileClosure(const Token* const token, Op* const next,
const bool reverse, const Token::tokType tkType);
+ bool doTokenOverlap(const Op* op, Token* token);
+
// -----------------------------------------------------------------------
// Private data members
// -----------------------------------------------------------------------
@@ -544,7 +546,15 @@
}
childOp->setNextOp(next);
- childOp->setChild(compile(childTok, childOp, reverse));
+ if(next==NULL || !doTokenOverlap(next, childTok))
+ {
+ childOp->setOpType(tkType ==
Token::T_NONGREEDYCLOSURE?Op::O_FINITE_NONGREEDYCLOSURE:Op::O_FINITE_CLOSURE);
+ childOp->setChild(compile(childTok, NULL, reverse));
+ }
+ else
+ {
+ childOp->setChild(compile(childTok, childOp, reverse));
+ }
ret = childOp;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]