Author: amassari
Date: Thu Aug 30 09:23:48 2007
New Revision: 571231

URL: http://svn.apache.org/viewvc?rev=571231&view=rev
Log:
- Don't allocate a stack unless the string to be matched is longer than 256 
characters
- Don't use backtracking if the regex pattern doesn't have ambiguities 
(XERCESC-1242)

Modified:
    xerces/c/trunk/src/xercesc/util/regx/Op.hpp
    xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp
    xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp
    xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp

Modified: xerces/c/trunk/src/xercesc/util/regx/Op.hpp
URL: 
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/Op.hpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/Op.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/Op.hpp Thu Aug 30 09:23:48 2007
@@ -41,26 +41,28 @@
 public:
 
     typedef enum {
-        O_DOT                = 0,
-        O_CHAR               = 1,
-        O_RANGE              = 3,
-        O_NRANGE             = 4,
-        O_ANCHOR             = 5,
-        O_STRING             = 6,
-        O_CLOSURE            = 7,
-        O_NONGREEDYCLOSURE   = 8,
-        O_QUESTION           = 9,
-        O_NONGREEDYQUESTION  = 10,
-        O_UNION              = 11,
-        O_CAPTURE            = 15,
-        O_BACKREFERENCE      = 16,
-        O_LOOKAHEAD          = 20,
-        O_NEGATIVELOOKAHEAD  = 21,
-        O_LOOKBEHIND         = 22,
-        O_NEGATIVELOOKBEHIND = 23,
-        O_INDEPENDENT        = 24,
-        O_MODIFIER           = 25,
-        O_CONDITION          = 26
+        O_DOT                       = 0,
+        O_CHAR                      = 1,
+        O_RANGE                     = 3,
+        O_NRANGE                    = 4,
+        O_ANCHOR                    = 5,
+        O_STRING                    = 6,
+        O_CLOSURE                   = 7,
+        O_NONGREEDYCLOSURE          = 8,
+        O_FINITE_CLOSURE            = 9,
+        O_FINITE_NONGREEDYCLOSURE   = 10,
+        O_QUESTION                  = 11,
+        O_NONGREEDYQUESTION         = 12,
+        O_UNION                     = 13,
+        O_CAPTURE                   = 15,
+        O_BACKREFERENCE             = 16,
+        O_LOOKAHEAD                 = 20,
+        O_NEGATIVELOOKAHEAD         = 21,
+        O_LOOKBEHIND                = 22,
+        O_NEGATIVELOOKBEHIND        = 23,
+        O_INDEPENDENT               = 24,
+        O_MODIFIER                  = 25,
+        O_CONDITION                 = 26
     } opType;
 
     // -----------------------------------------------------------------------

Modified: xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp
URL: 
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RangeToken.hpp Thu Aug 30 09:23:48 2007
@@ -76,6 +76,8 @@
                                    TokenFactory* const tokFactory,
                                    MemoryManager* const manager = 
XMLPlatformUtils::fgMemoryManager);
 
+    bool empty() const;
+
     // -----------------------------------------------------------------------
     //  Match methods
     // -----------------------------------------------------------------------
@@ -85,8 +87,7 @@
     //  Creates the map.  This will happen automatically,
     //  necessary.
     // -----------------------------------------------------------------------
-    void
-    createMap();
+    void createMap();
 
 private:
     // -----------------------------------------------------------------------
@@ -130,6 +131,10 @@
     }
 }
 
+inline bool RangeToken::empty() const
+{
+    return fElemCount==0; 
+}
 
 XERCES_CPP_NAMESPACE_END
 

Modified: xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp
URL: 
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp Thu Aug 30 
09:23:48 2007
@@ -601,7 +601,7 @@
        /*
         *      Check whether the expression start with ".*"
         */
-       if (fOperations != 0 && fOperations->getOpType() == Op::O_CLOSURE
+       if (fOperations != 0 && (fOperations->getOpType() == Op::O_CLOSURE || 
fOperations->getOpType() == Op::O_FINITE_CLOSURE)
         && fOperations->getChild()->getOpType() == Op::O_DOT) {
 
                if (isSet(fOptions, SINGLE_LINE)) {
@@ -995,7 +995,13 @@
 int RegularExpression::match(Context* const context, const Op* const operations
                                                         , XMLSize_t offset, 
const short direction)
 {
-    ValueStackOf<RE_RuntimeContext>    opStack(0, context->fMemoryManager);
+    ValueStackOf<RE_RuntimeContext>* opStack=NULL;
+    Janitor<ValueStackOf<RE_RuntimeContext> > janStack(NULL);
+    if(context->fLimit > 256)
+    {
+        opStack=new ValueStackOf<RE_RuntimeContext>(16, 
context->fMemoryManager);
+        janStack.reset(opStack);
+    }
        const Op* tmpOp = operations;
        bool ignoreCase = isSet(fOptions, IGNORE_CASE);
        int     doReturn;
@@ -1049,6 +1055,59 @@
                            else
                            tmpOp = tmpOp->getNextOp();
                            break;
+                   case Op::O_FINITE_CLOSURE:
+                           {
+                                   XMLInt32 id = tmpOp->getData();
+                    // if id is not -1, it's a closure with a child token 
having a minumum length, 
+                    // where id is the index of the fOffsets array where its 
status is stored
+                               if (id >= 0) {
+                                       int prevOffset = context->fOffsets[id];
+                                       if (prevOffset < 0 || prevOffset != 
(int)offset) {
+                                               context->fOffsets[id] = 
(int)offset;
+                                       }
+                                       else {
+                            // the status didn't change, we haven't found 
other copies; move on to the next match
+                                               context->fOffsets[id] = -1;
+                                               tmpOp = tmpOp->getNextOp();
+                                               break;
+                                       }
+                               }
+
+                    // match the subitems until they do
+                    int ret;
+                    while((ret = match(context, tmpOp->getChild(), offset, 
direction)) != -1)
+                    {
+                        if(offset == (XMLSize_t)ret)
+                            break;
+                        offset = ret;
+                    }
+
+                    if (id >= 0) {
+                        // loop has ended, reset the status for this closure
+                                           context->fOffsets[id] = -1;
+                                   }
+                                   tmpOp = tmpOp->getNextOp();
+                           }
+                           break;
+                   case Op::O_FINITE_NONGREEDYCLOSURE:
+                           {
+                                   int ret = 
match(context,tmpOp->getNextOp(),offset,direction);
+                                   if (ret >= 0)
+                        doReturn = ret;
+                    else
+                    {
+                        // match the subitems until they do
+                        int ret;
+                        while((ret = match(context, tmpOp->getChild(), offset, 
direction)) != -1)
+                        {
+                            if(offset == (XMLSize_t)ret)
+                                break;
+                            offset = ret;
+                        }
+                                   tmpOp = tmpOp->getNextOp();
+                                   }
+                           }
+                           break;
                    case Op::O_CLOSURE:
                            {
                                    XMLInt32 id = tmpOp->getData();
@@ -1067,14 +1126,40 @@
                                        }
                                }
 
-                                   opStack.push(RE_RuntimeContext(tmpOp, 
offset));
-                                   tmpOp = tmpOp->getChild();
+                    if(opStack!=NULL)
+                    {
+                                       opStack->push(RE_RuntimeContext(tmpOp, 
offset));
+                                       tmpOp = tmpOp->getChild();
+                    }
+                    else
+                    {
+                                       int ret = match(context, 
tmpOp->getChild(), offset, direction);
+                                       if (id >= 0) {
+                                               context->fOffsets[id] = -1;
+                                       }
+
+                                       if (ret >= 0)
+                                               doReturn = ret;
+                        else 
+                                           tmpOp = tmpOp->getNextOp();
+                    }
                            }
                            break;
                    case Op::O_QUESTION:
                            {
-                                   opStack.push(RE_RuntimeContext(tmpOp, 
offset));
-                                   tmpOp = tmpOp->getChild();
+                    if(opStack!=NULL)
+                    {
+                                       opStack->push(RE_RuntimeContext(tmpOp, 
offset));
+                                       tmpOp = tmpOp->getChild();
+                    }
+                    else
+                    {
+                                       int ret = match(context, 
tmpOp->getChild(), offset, direction);
+                                       if (ret >= 0)
+                                               doReturn = ret;
+                        else
+                                           tmpOp = tmpOp->getNextOp();
+                    }
                            }
                            break;
                    case Op::O_NONGREEDYCLOSURE:
@@ -1151,9 +1236,9 @@
                    }
         }
                if (doReturn != -5) {
-                       if (opStack.size() == 0)
+                       if (opStack==NULL || opStack->size() == 0)
                                return doReturn;
-                       RE_RuntimeContext       ctx = opStack.pop();
+                       RE_RuntimeContext       ctx = opStack->pop();
                        tmpOp = ctx.op_;
                        offset = ctx.offs_;
                        if (tmpOp->getOpType() == Op::O_CLOSURE) {
@@ -1462,7 +1547,7 @@
             bestResult=ret;
             bestResultContext=tmpContext;
             // exit early, if we reached the end of the string
-            if(ret == context->fLimit)
+            if((XMLSize_t)ret == context->fLimit)
                 break;
         }
     }
@@ -1788,6 +1873,57 @@
     return wordTypeOther;
 }
 
+bool RegularExpression::doTokenOverlap(const Op* op, Token* token)
+{
+    if(op->getOpType()==Op::O_RANGE)
+    {
+        RangeToken* t1=(RangeToken*)op->getToken();
+        switch(token->getTokenType())
+        {
+        case Token::T_CHAR:
+            return t1->match(token->getChar());
+        case Token::T_STRING:
+            return t1->match(*token->getString());
+        case Token::T_RANGE:
+            {
+                try
+                {
+                    RangeToken tempRange(Token::T_RANGE, fMemoryManager);
+                    tempRange.mergeRanges(t1);
+                    tempRange.intersectRanges((RangeToken*)token);
+                    return !tempRange.empty();
+                }
+                catch(RuntimeException&)
+                {
+                }
+                break;
+            }
+        }
+        return true;
+    }
+
+    XMLInt32 ch=0;
+    if(op->getOpType()==Op::O_CHAR)
+        ch=op->getData();
+    else if(op->getOpType()==Op::O_STRING)
+        ch=*op->getLiteral();
+
+    if(ch!=0)
+    {
+        switch(token->getTokenType())
+        {
+        case Token::T_CHAR:
+            return token->getChar()==ch;
+        case Token::T_STRING:
+            return *token->getString()==ch;
+        case Token::T_RANGE:
+        case Token::T_NRANGE:
+            return ((RangeToken*)token)->match(ch);
+        }
+    }
+    // in any other case, there is the chance that they overlap
+    return true;
+}
 
 XERCES_CPP_NAMESPACE_END
 

Modified: xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp
URL: 
http://svn.apache.org/viewvc/xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp?rev=571231&r1=571230&r2=571231&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp Thu Aug 30 
09:23:48 2007
@@ -291,6 +291,8 @@
     Op* compileClosure(const Token* const token, Op* const next,
                        const bool reverse, const Token::tokType tkType);
 
+    bool doTokenOverlap(const Op* op, Token* token);
+
     // -----------------------------------------------------------------------
     //  Private data members
     // -----------------------------------------------------------------------
@@ -544,7 +546,15 @@
           }
 
           childOp->setNextOp(next);
-          childOp->setChild(compile(childTok, childOp, reverse));
+          if(next==NULL || !doTokenOverlap(next, childTok))
+          {
+              childOp->setOpType(tkType == 
Token::T_NONGREEDYCLOSURE?Op::O_FINITE_NONGREEDYCLOSURE:Op::O_FINITE_CLOSURE);
+              childOp->setChild(compile(childTok, NULL, reverse));
+          }
+          else
+          {
+              childOp->setChild(compile(childTok, childOp, reverse));
+          }
           ret = childOp;
       }
 



---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to