gareth      2002/12/18 05:01:02

  Modified:    c/src/xercesc/util/regx RegularExpression.hpp
                        RegularExpression.cpp Match.cpp Match.hpp
  Log:
  New functionality - tokenize and replace. Fixed REVISIT for case insensitive match. Patch by Jennifer Schachter.
  
  Revision  Changes    Path
  1.4       +315 -266  xml-xerces/c/src/xercesc/util/regx/RegularExpression.hpp
  
  Index: RegularExpression.hpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/RegularExpression.hpp,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- RegularExpression.hpp     4 Nov 2002 15:17:00 -0000       1.3
  +++ RegularExpression.hpp     18 Dec 2002 13:01:02 -0000      1.4
  @@ -65,6 +65,8 @@
   //  Includes
   // ---------------------------------------------------------------------------
   #include <xercesc/util/XMLUniDefs.hpp>
  +#include <xercesc/util/RefArrayVectorOf.hpp>
  +#include <xercesc/util/XMLString.hpp>
   #include <xercesc/util/regx/Op.hpp>
   #include <xercesc/util/regx/TokenFactory.hpp>
   #include <xercesc/util/regx/BMPattern.hpp>
  @@ -136,6 +138,32 @@
       bool matches(const XMLCh* const matchString, const int start,
                    const int end, Match* const pMatch);
   
  +    // -----------------------------------------------------------------------
  +    //  Tokenize methods
  +    // -----------------------------------------------------------------------
  +    // Note: The caller owns the string vector that is returned, and is responsible
  +    //       for deleting it. 
  +    RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString);
  +    RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const int start,
  +                                      const int end);
  +
  +    RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString);
  +    RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, 
  +                                      const int start, const int end);
  +
  +    // -----------------------------------------------------------------------
  +    //  Replace methods
  +    // -----------------------------------------------------------------------
  +    // Note: The caller owns the XMLCh* that is returned, and is responsible for 
  +    //       deleting it. 
  +    XMLCh *replace(const char* const matchString, const char* const replaceString);
  +    XMLCh *replace(const char* const matchString, const char* const replaceString,
  +                   const int start, const int end);
  +
  +    XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString);
  +    XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString,
  +                   const int start, const int end);
  +
   private:
       // -----------------------------------------------------------------------
       //  Private data types
  @@ -225,8 +253,29 @@
                         const short direction);
   
       /**
  -      *    Converts a token tree into an operation tree
  -      */
  +     *    Tokenize helper
  +     * 
  +     *    This overloaded tokenize is for internal use only. It provides a way to
  +     *    keep track of the sub-expressions in each match of the pattern.
  +     *    
  +     *    It is called by the other tokenize methods, and by the replace method.
  +     *    The caller is responsible for the deletion of the returned 
  +     *    RefArrayVectorOf<XMLCh*>
  +     */
  +    RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, 
  +                                      const int start, const int end, 
  +                                      RefVectorOf<Match> *subEx);
  +    /**
  +     *    Replace helpers
  +     *
  +     *    Note: the caller owns the XMLCh* that is returned
  +     */
  +    const XMLCh *subInExp(const XMLCh* const repString,
  +                          const XMLCh* const origString, 
  +                          const Match* subEx);
  +    /**
  +     *    Converts a token tree into an operation tree
  +     */
       void compile(const Token* const token);
       Op*  compile(const Token* const token, Op* const next,
                    const bool reverse);
  @@ -271,309 +320,309 @@
   };
   
   
  -// ---------------------------------------------------------------------------
  -//  RegularExpression: Cleanup methods
  -// ---------------------------------------------------------------------------
  -inline void RegularExpression::cleanUp() {
  -
  -    delete [] fPattern;
  -    delete [] fFixedString;
  -    delete fContext;
  -    delete fBMPattern;
  -    delete fTokenFactory;
  -}
  -
  -// ---------------------------------------------------------------------------
  -//  RegularExpression: Helper methods
  -// ---------------------------------------------------------------------------
  -inline bool RegularExpression::isSet(const int options, const int flag) {
  -
  -    return (options & flag) == flag;
  -}
  -
  -inline Op* RegularExpression::compileLook(const Token* const token,
  -                                          const Op* const next,
  -                                          const bool reverse,
  -                                          const unsigned short tokType) {
  -
  -    Op*    ret = 0;
  -    Op*    result = compile(token->getChild(0), 0, reverse);
  -
  -    switch(tokType) {
  -    case Token::T_LOOKAHEAD:
  -        ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result);
  -        break;
  -    case Token::T_NEGATIVELOOKAHEAD:
  -        ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result);
  -        break;
  -    case Token::T_LOOKBEHIND:
  -        ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result);
  -        break;
  -    case Token::T_NEGATIVELOOKBEHIND:
  -        ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result);
  -        break;
  -    case Token::T_INDEPENDENT:
  -        ret = fOpFactory.createIndependentOp(next, result);
  -        break;
  -    case Token::T_MODIFIERGROUP:
  -        ret = fOpFactory.createModifierOp(next, result,
  -                                   ((ModifierToken *) token)->getOptions(),
  -                                   ((ModifierToken *) token)->getOptionsMask());
  -        break;
  -    }
  -
  -
  -    return ret;
  -}
  -
  -inline Op* RegularExpression::compileSingle(const Token* const token,
  -                                            Op* const next,
  +  // ---------------------------------------------------------------------------
  +  //  RegularExpression: Cleanup methods
  +  // ---------------------------------------------------------------------------
  +  inline void RegularExpression::cleanUp() {
  +
  +      delete [] fPattern;
  +      delete [] fFixedString;
  +      delete fContext;
  +      delete fBMPattern;
  +      delete fTokenFactory;
  +  }
  +
  +  // ---------------------------------------------------------------------------
  +  //  RegularExpression: Helper methods
  +  // ---------------------------------------------------------------------------
  +  inline bool RegularExpression::isSet(const int options, const int flag) {
  +
  +      return (options & flag) == flag;
  +  }
  +
  +  inline Op* RegularExpression::compileLook(const Token* const token,
  +                                            const Op* const next,
  +                                            const bool reverse,
                                               const unsigned short tokType) {
   
  -    Op* ret = 0;
  -
  -    switch (tokType) {
  -    case Token::T_DOT:
  -        ret = fOpFactory.createDotOp();
  -        break;
  -    case Token::T_CHAR:
  -        ret = fOpFactory.createCharOp(token->getChar());
  -        break;
  -    case Token::T_ANCHOR:
  -        ret = fOpFactory.createAnchorOp(token->getChar());
  -        break;
  -    case Token::T_RANGE:
  -    case Token::T_NRANGE:
  -        ret = fOpFactory.createRangeOp(token);
  -        break;
  -    case Token::T_EMPTY:
  -        ret = next;
  -        break;
  -    case Token::T_STRING:
  -        ret = fOpFactory.createStringOp(token->getString());
  -        break;
  -    case Token::T_BACKREFERENCE:
  -        ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
  -        break;
  -    }
  -
  -    if (tokType != Token::T_EMPTY)
  -        ret->setNextOp(next);
  -
  -    return ret;
  -}
  -
  +      Op*    ret = 0;
  +      Op*    result = compile(token->getChild(0), 0, reverse);
   
  -inline Op* RegularExpression::compileUnion(const Token* const token,
  -                                           Op* const next,
  -                                           const bool reverse) {
  +      switch(tokType) {
  +      case Token::T_LOOKAHEAD:
  +          ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result);
  +          break;
  +      case Token::T_NEGATIVELOOKAHEAD:
  +          ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result);
  +          break;
  +      case Token::T_LOOKBEHIND:
  +          ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result);
  +          break;
  +      case Token::T_NEGATIVELOOKBEHIND:
  +          ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result);
  +          break;
  +      case Token::T_INDEPENDENT:
  +          ret = fOpFactory.createIndependentOp(next, result);
  +          break;
  +      case Token::T_MODIFIERGROUP:
  +          ret = fOpFactory.createModifierOp(next, result,
  +                                     ((ModifierToken *) token)->getOptions(),
  +                                     ((ModifierToken *) token)->getOptionsMask());
  +          break;
  +      }
  +
  +
  +      return ret;
  +  }
  +
  +  inline Op* RegularExpression::compileSingle(const Token* const token,
  +                                              Op* const next,
  +                                              const unsigned short tokType) {
  +
  +      Op* ret = 0;
  +
  +      switch (tokType) {
  +      case Token::T_DOT:
  +          ret = fOpFactory.createDotOp();
  +          break;
  +      case Token::T_CHAR:
  +          ret = fOpFactory.createCharOp(token->getChar());
  +          break;
  +      case Token::T_ANCHOR:
  +          ret = fOpFactory.createAnchorOp(token->getChar());
  +          break;
  +      case Token::T_RANGE:
  +      case Token::T_NRANGE:
  +          ret = fOpFactory.createRangeOp(token);
  +          break;
  +      case Token::T_EMPTY:
  +          ret = next;
  +          break;
  +      case Token::T_STRING:
  +          ret = fOpFactory.createStringOp(token->getString());
  +          break;
  +      case Token::T_BACKREFERENCE:
  +          ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
  +          break;
  +      }
   
  -    int tokSize = token->size();
  -    UnionOp* uniOp = fOpFactory.createUnionOp(tokSize);
  +      if (tokType != Token::T_EMPTY)
  +          ret->setNextOp(next);
   
  -    for (int i=0; i<tokSize; i++) {
  +      return ret;
  +  }
   
  -        uniOp->addElement(compile(token->getChild(i), next, reverse));
  -    }
   
  -    return uniOp;
  -}
  +  inline Op* RegularExpression::compileUnion(const Token* const token,
  +                                             Op* const next,
  +                                             const bool reverse) {
   
  +      int tokSize = token->size();
  +      UnionOp* uniOp = fOpFactory.createUnionOp(tokSize);
   
  -inline Op* RegularExpression::compileCondition(const Token* const token,
  -                                               Op* const next,
  -                                               const bool reverse) {
  +      for (int i=0; i<tokSize; i++) {
   
  -    Token* condTok = ((ConditionToken*) token)->getConditionToken();
  -    Token* yesTok  = token->getChild(0);
  -    Token* noTok   = token->getChild(1);
  -    int    refNo   = token->getReferenceNo();
  -    Op*    condOp  = (condTok == 0) ? 0 : compile(condTok, 0, reverse);
  -    Op*    yesOp   = compile(yesTok, next, reverse);
  -    Op*    noOp    = (noTok == 0) ? 0 : compile(noTok, next, reverse);
  +          uniOp->addElement(compile(token->getChild(i), next, reverse));
  +      }
   
  -    return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp);
  -}
  +      return uniOp;
  +  }
   
   
  -inline Op* RegularExpression::compileParenthesis(const Token* const token,
  +  inline Op* RegularExpression::compileCondition(const Token* const token,
                                                    Op* const next,
                                                    const bool reverse) {
   
  -    if (token->getNoParen() == 0)
  -        return compile(token->getChild(0), next, reverse);
  +      Token* condTok = ((ConditionToken*) token)->getConditionToken();
  +      Token* yesTok  = token->getChild(0);
  +      Token* noTok   = token->getChild(1);
  +      int    refNo   = token->getReferenceNo();
  +      Op*    condOp  = (condTok == 0) ? 0 : compile(condTok, 0, reverse);
  +      Op*    yesOp   = compile(yesTok, next, reverse);
  +      Op*    noOp    = (noTok == 0) ? 0 : compile(noTok, next, reverse);
   
  -    Op* captureOp    = 0;
  +      return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp);
  +  }
   
  -    if (reverse) {
   
  -        captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next);
  -        captureOp = compile(token->getChild(0), captureOp, reverse);
  +  inline Op* RegularExpression::compileParenthesis(const Token* const token,
  +                                                   Op* const next,
  +                                                   const bool reverse) {
   
  -        return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp);
  -    }
  +      if (token->getNoParen() == 0)
  +          return compile(token->getChild(0), next, reverse);
   
  -    captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next);
  -    captureOp = compile(token->getChild(0), captureOp, reverse);
  +      Op* captureOp    = 0;
   
  -    return fOpFactory.createCaptureOp(token->getNoParen(), captureOp);
  -}
  +      if (reverse) {
   
  -inline Op* RegularExpression::compileConcat(const Token* const token,
  -                                            Op*  const next,
  -                                            const bool reverse) {
  -
  -    Op* ret = next;
  -    int tokSize = token->size();
  -
  -    if (!reverse) {
  -
  -        for (int i= tokSize - 1; i>=0; i--) {
  -            ret = compile(token->getChild(i), ret, false);
  -        }
  -    }
  -    else {
  -
  -        for (int i= 0; i< tokSize; i++) {
  -            ret = compile(token->getChild(i), ret, true);
  -        }
  -    }
  +          captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next);
  +          captureOp = compile(token->getChild(0), captureOp, reverse);
   
  -    return ret;
  -}
  +          return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp);
  +      }
   
  -inline Op* RegularExpression::compileClosure(const Token* const token,
  -                                             Op* const next,
  -                                             const bool reverse,
  -                                             const unsigned short tokType) {
  +      captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next);
  +      captureOp = compile(token->getChild(0), captureOp, reverse);
   
  -    Op*    ret      = 0;
  -    Token* childTok = token->getChild(0);
  -    int    min      = token->getMin();
  -    int    max      = token->getMax();
  +      return fOpFactory.createCaptureOp(token->getNoParen(), captureOp);
  +  }
   
  -    if (min >= 0 && min == max) {
  +  inline Op* RegularExpression::compileConcat(const Token* const token,
  +                                              Op*  const next,
  +                                              const bool reverse) {
   
  -        ret = next;
  -        for (int i=0; i< min; i++) {
  -            ret = compile(childTok, ret, reverse);
  -        }
  +      Op* ret = next;
  +      int tokSize = token->size();
   
  -        return ret;
  -    }
  +      if (!reverse) {
   
  -    if (min > 0 && max > 0)
  -        max -= min;
  +          for (int i= tokSize - 1; i>=0; i--) {
  +              ret = compile(token->getChild(i), ret, false);
  +          }
  +      }
  +      else {
   
  -    if (max > 0) {
  +          for (int i= 0; i< tokSize; i++) {
  +              ret = compile(token->getChild(i), ret, true);
  +          }
  +      }
   
  -        ret = next;
  -        for (int i=0; i<max; i++) {
  +      return ret;
  +  }
   
  -            ChildOp* childOp = fOpFactory.createQuestionOp(
  -                tokType == Token::T_NONGREEDYCLOSURE);
  -
  -            childOp->setNextOp(next);
  -            childOp->setChild(compile(childTok, ret, reverse));
  -            ret = childOp;
  -        }
  -    }
  -    else {
  -
  -        ChildOp* childOp = 0;
  -
  -        if (tokType == Token::T_NONGREEDYCLOSURE) {
  -            childOp = fOpFactory.createNonGreedyClosureOp();
  -        }
  -        else {
  -
  -            if (childTok->getMinLength() == 0)
  -                childOp = fOpFactory.createClosureOp(fNoClosures++);
  -            else
  -                childOp = fOpFactory.createClosureOp(-1);
  -        }
  -
  -        childOp->setNextOp(next);
  -        childOp->setChild(compile(childTok, childOp, reverse));
  -        ret = childOp;
  -    }
  -
  -    if (min > 0) {
  -
  -        for (int i=0; i< min; i++) {
  -            ret = compile(childTok, ret, reverse);
  -        }
  -    }
  -
  -    return ret;
  -}
  -
  -inline int RegularExpression::matchUnion(Context* const context,
  -                                         const Op* const op, int offset,
  -                                         const short direction)
  -{
  -    unsigned int opSize = op->getSize();
  -    int ret = -1;
  -
  -    for (unsigned int i=0; i < opSize; i++) {
  -
  -        ret = match(context, op->elementAt(i), offset, direction);
  -
  -        if (ret == context->fLimit)
  -            return ret;
  -    }
  -
  -    return -1;
  -}
  -
  -inline int RegularExpression::matchModifier(Context* const context,
  -                                            const Op* const op, int offset,
  -                                            const short direction)
  -{
  -    int saveOptions = fOptions;
  -    fOptions |= (int) op->getData();
  -    fOptions &= (int) ~op->getData2();
  -
  -    int ret = match(context, op->getChild(), offset, direction);
  -
  -    fOptions = saveOptions;
  -
  -    return ret;
  -}
  -
  -inline unsigned short RegularExpression::getWordType(const XMLCh* const target
  -                                                     , const int begin
  -                                                     , const int end
  -                                                     , const int offset)
  -{
  -    if (offset < begin || offset >= end)
  -        return WT_OTHER;
  -
  -    return getCharType(target[offset]);
  -}
  -
  -inline
  -unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target
  -                                                      , const int start
  -                                                      , const int end
  -                                                      , int offset)
  -{
  -    unsigned short ret = getWordType(target, start, end, --offset);
  +  inline Op* RegularExpression::compileClosure(const Token* const token,
  +                                               Op* const next,
  +                                               const bool reverse,
  +                                               const unsigned short tokType) {
   
  -    while (ret == WT_IGNORE) {
  -        ret = getWordType(target, start, end, --offset);
  -    }
  +      Op*    ret      = 0;
  +      Token* childTok = token->getChild(0);
  +      int    min      = token->getMin();
  +      int    max      = token->getMax();
  +
  +      if (min >= 0 && min == max) {
  +
  +          ret = next;
  +          for (int i=0; i< min; i++) {
  +              ret = compile(childTok, ret, reverse);
  +          }
  +
  +          return ret;
  +      }
  +
  +      if (min > 0 && max > 0)
  +          max -= min;
  +
  +      if (max > 0) {
  +
  +          ret = next;
  +          for (int i=0; i<max; i++) {
  +
  +              ChildOp* childOp = fOpFactory.createQuestionOp(
  +                  tokType == Token::T_NONGREEDYCLOSURE);
  +
  +              childOp->setNextOp(next);
  +              childOp->setChild(compile(childTok, ret, reverse));
  +              ret = childOp;
  +          }
  +      }
  +      else {
  +
  +          ChildOp* childOp = 0;
  +
  +          if (tokType == Token::T_NONGREEDYCLOSURE) {
  +              childOp = fOpFactory.createNonGreedyClosureOp();
  +          }
  +          else {
  +
  +              if (childTok->getMinLength() == 0)
  +                  childOp = fOpFactory.createClosureOp(fNoClosures++);
  +              else
  +                  childOp = fOpFactory.createClosureOp(-1);
  +          }
  +
  +          childOp->setNextOp(next);
  +          childOp->setChild(compile(childTok, childOp, reverse));
  +          ret = childOp;
  +      }
  +
  +      if (min > 0) {
  +
  +          for (int i=0; i< min; i++) {
  +              ret = compile(childTok, ret, reverse);
  +          }
  +      }
  +
  +      return ret;
  +  }
  +
  +  inline int RegularExpression::matchUnion(Context* const context,
  +                                           const Op* const op, int offset,
  +                                           const short direction)
  +  {
  +      unsigned int opSize = op->getSize();
  +      int ret = -1;
  +
  +      for (unsigned int i=0; i < opSize; i++) {
  +
  +          ret = match(context, op->elementAt(i), offset, direction);
  +
  +          if (ret == context->fLimit)
  +              return ret;
  +      }
  +
  +      return -1;
  +  }
  +
  +  inline int RegularExpression::matchModifier(Context* const context,
  +                                              const Op* const op, int offset,
  +                                              const short direction)
  +  {
  +      int saveOptions = fOptions;
  +      fOptions |= (int) op->getData();
  +      fOptions &= (int) ~op->getData2();
  +
  +      int ret = match(context, op->getChild(), offset, direction);
  +
  +      fOptions = saveOptions;
  +
  +      return ret;
  +  }
  +
  +  inline unsigned short RegularExpression::getWordType(const XMLCh* const target
  +                                                       , const int begin
  +                                                       , const int end
  +                                                       , const int offset)
  +  {
  +      if (offset < begin || offset >= end)
  +          return WT_OTHER;
  +
  +      return getCharType(target[offset]);
  +  }
  +
  +  inline
  +  unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target
  +                                                        , const int start
  +                                                        , const int end
  +                                                        , int offset)
  +  {
  +      unsigned short ret = getWordType(target, start, end, --offset);
  +
  +      while (ret == WT_IGNORE) {
  +          ret = getWordType(target, start, end, --offset);
  +      }
   
  -    return ret;
  -}
  +      return ret;
  +  }
   
  -inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
  +  inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
                                                  const XMLInt32 ch2)
   {
   
  -    //REVISIT - for now we will return a case sensitive match
  -    return (ch1 == ch2);
  +    return (0==XMLString::compareNIString((XMLCh*)&ch1,(XMLCh*)&ch2, 1));
   }
  +
   
   XERCES_CPP_NAMESPACE_END
   
  
  
  
  1.5       +334 -3    xml-xerces/c/src/xercesc/util/regx/RegularExpression.cpp
  
  Index: RegularExpression.cpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/RegularExpression.cpp,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- RegularExpression.cpp     4 Nov 2002 15:17:00 -0000       1.4
  +++ RegularExpression.cpp     18 Dec 2002 13:01:02 -0000      1.5
  @@ -56,6 +56,9 @@
   
   /*
    * $Log$
  + * Revision 1.5  2002/12/18 13:01:02  gareth
  + * New functionality - tokenize and replace. Fixed REVISIT for case insensitive match. Patch by Jennifer Schachter.
  + *
    * Revision 1.4  2002/11/04 15:17:00  tng
    * C++ Namespace Support.
    *
  @@ -104,7 +107,6 @@
   //  Includes
   // ---------------------------------------------------------------------------
   #include <xercesc/util/regx/RegularExpression.hpp>
  -#include <xercesc/util/XMLString.hpp>
   #include <xercesc/util/PlatformUtils.hpp>
   #include <xercesc/util/regx/RegxUtil.hpp>
   #include <xercesc/util/regx/Match.hpp>
  @@ -114,6 +116,7 @@
   #include <xercesc/util/regx/ParserForXMLSchema.hpp>
   #include <xercesc/util/Janitor.hpp>
   #include <xercesc/util/ParseException.hpp>
  +#include <xercesc/framework/XMLBuffer.hpp>
   
   XERCES_CPP_NAMESPACE_BEGIN
   
  @@ -620,6 +623,253 @@
   }
   
   // ---------------------------------------------------------------------------
  +//  RegularExpression: Tokenize methods
  +// ---------------------------------------------------------------------------
  +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression) {
  +
  +  XMLCh* tmpBuf = XMLString::transcode(expression);
  +  ArrayJanitor<XMLCh> janBuf(tmpBuf);
  +  return tokenize(tmpBuf, 0, XMLString::stringLen(tmpBuf));
  +}
  +
  +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression,
  +                                                             const int start, const int end) {
  +
  +  XMLCh* tmpBuf = XMLString::transcode(expression);
  +  ArrayJanitor<XMLCh> janBuf(tmpBuf);
  +  return tokenize(tmpBuf, start, end);
  +}
  +
  +
  +
  +// ---------------------------------------------------------------------------
  +//  RegularExpression: Tokenize methods - Wide char version
  +// ---------------------------------------------------------------------------
  +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression) 
{
  +  return tokenize(expression, 0, XMLString::stringLen(expression), 0);
  +}
  +
  +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression,
  +                                                                                    
              const int start, const int end)
  +{
  +  return tokenize(expression, start, end, 0);
  +}
  +
  +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression, 
  +                                                     const int start, const int end,
  +                                                     RefVectorOf<Match> *subEx){
  +  
  +  if (fOperations == 0)
  +       prepare();
  +
  +  Match* pMatch = 0;
  +
  +  if (subEx){
  +    pMatch = new Match();
  +  }
  +
  +  RefArrayVectorOf<XMLCh>* tokenStack = new RefArrayVectorOf<XMLCh>(16, true);
  +
  +  Context* context = 0;
  +  Context* tmpContext = 0;
  +
  +  int                 strLength = XMLString::stringLen(expression);
  + 
  +  {
  +        XMLMutexLock lockInit(&fMutex);
  +
  +        if (fContext == 0)
  +          fContext = new Context();
  +
  +        if (fContext->fInUse) {
  +          context = new Context();
  +          tmpContext = context;
  +        }
  +        else {
  +          context = fContext;
  +        }
  +
  +        context->reset(expression, start, end, fNoClosures);
  +  }
  +
  +  Janitor<Context> janContext(tmpContext);
  +
  +  bool adoptMatch = false;
  +  Match* lMatch = pMatch;
  +
  +  if (lMatch != 0) {
  +       lMatch->setNoGroups(fNoGroups);
  +  }
  +  else if (fHasBackReferences) {
  +
  +       lMatch = new Match();
  +       lMatch->setNoGroups(fNoGroups);
  +       adoptMatch = true;
  +  }
  +
  +  if (context->fAdoptMatch)
  +       delete context->fMatch;
  +  
  +  context->fMatch = lMatch;
  +  context->fAdoptMatch = adoptMatch;
  +
  +  int tokStart = start;
  +  int matchStart = start;
  +
  +  for (; matchStart <= end; matchStart++) { 
  +  
  +       int matchEnd = match(context, fOperations, matchStart, 1);
  +  
  +       if (matchEnd != -1) {
  +
  +         if (context->fMatch != 0) {
  +           context->fMatch->setStartPos(0, context->fStart);
  +           context->fMatch->setEndPos(0, matchEnd);
  +         }
  +
  +      if (subEx){
  +        subEx->addElement(lMatch);
  +        lMatch = new Match(*(context->fMatch));
  +        context->fMatch = lMatch;
  +      }
  +  
  +         context->fInUse = false;
  +
  +      XMLCh* token;
  +      if (tokStart == matchStart){
  +  
  +        if (tokStart == strLength){
  +          tokStart--;
  +          break;  
  +        }
  +
  +        token = new XMLCh[1];
  +        token[0] = chNull;
  +
  +        // When tokenizing with a zero-length pattern, each token in the
  +        // string is returned. Since the zero-length pattern also matches
  +        // at the start/end of the string, resulting in empty tokens, we
  +        // ignore those and do not add them to the stack.
  +        if (!XMLString::equals(fPattern, &chNull)) 
  +          tokenStack->addElement(token); 
  +  
  +      } else {
  +        token = new XMLCh[matchStart + 1 - tokStart];
  +        XMLString::subString(token, expression, tokStart, matchStart);
  +        tokenStack->addElement(token);
  +      } 
  +
  +      tokStart = matchEnd;
  +
  +      //decrement matchStart as will increment it at the top of the loop
  +      if (matchStart < matchEnd - 1) 
  +        matchStart = matchEnd - 1;       
  +    }
  +  }
  + 
  +  XMLCh* token;
  + 
  +  if (matchStart == tokStart + 1){
  +    token = new XMLCh[1];
  +    token[0] = chNull;
  +  
  +  } else {
  +    token = new XMLCh[strLength + 1 - tokStart];
  +    XMLString::subString(token, expression, tokStart, strLength);
  +  }  
  +
  +  if (!XMLString::equals(fPattern, &chNull)) 
  +    tokenStack->addElement(token);
  +
  +  return tokenStack;
  +
  +}
  +
  +
  +// -----------------------------------------------------------------------
  +//  RegularExpression: Replace methods
  +// -----------------------------------------------------------------------
  +XMLCh* RegularExpression::replace(const char* const matchString, 
  +                                  const char* const replaceString){
  +
  +     XMLCh* tmpBuf = XMLString::transcode(matchString);
  +    ArrayJanitor<XMLCh> janBuf(tmpBuf);
  +     XMLCh* tmpBuf2 = XMLString::transcode(replaceString);
  +    ArrayJanitor<XMLCh> janBuf2(tmpBuf2);
  +
  +     return replace(tmpBuf, tmpBuf2, 0, XMLString::stringLen(tmpBuf));
  +}
  +
  +XMLCh* RegularExpression::replace(const char* const matchString, 
  +                                  const char* const replaceString,
  +                                  const int start, const int end){
  +
  +     XMLCh* tmpBuf = XMLString::transcode(matchString);
  +    ArrayJanitor<XMLCh> janBuf(tmpBuf);
  +     XMLCh* tmpBuf2 = XMLString::transcode(replaceString);
  +    ArrayJanitor<XMLCh> janBuf2(tmpBuf2);
  +  
  +  return replace(tmpBuf, tmpBuf2, start, end);
  +}
  +
  +
  +// ---------------------------------------------------------------------------
  +//  RegularExpression: Replace methods - Wide char version
  +// ---------------------------------------------------------------------------
  +XMLCh* RegularExpression::replace(const XMLCh* const matchString, 
  +                                  const XMLCh* const replaceString){
  +
  +  return replace(matchString, replaceString, 0, 
  +                 XMLString::stringLen(matchString));
  +}
  +
  +XMLCh* RegularExpression::replace(const XMLCh* const matchString,  
  +                                  const XMLCh* const replaceString,
  +                                  const int start, const int end)
  +{
  +  //check if matches zero length string - throw error if so
  +  if (matches(XMLUni::fgZeroLenString)){
  +             ThrowXML(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString);
  +  }
  +      
  +  RefVectorOf<Match> *subEx = new RefVectorOf<Match>(10, true);
  +       Janitor<RefVectorOf<Match> > janSubEx(subEx);
  +
  +  //Call to tokenize with Match vector so that we keep track of the locations
  +  //of the subExpression within each of the matches
  +  RefArrayVectorOf<XMLCh>* tokenStack = tokenize(matchString, start, end, subEx);
  +       Janitor<RefArrayVectorOf<XMLCh> > janTokStack(tokenStack);
  +    
  +  XMLBuffer result;
  +  
  +  int numSubEx = 0;
  +  
  +  if (subEx && subEx->size() > 0)
  +    numSubEx = subEx->elementAt(0)->getNoGroups() - 1;
  +  
  +  int tokStackSize = tokenStack->size();
  +  const XMLCh* curRepString = XMLString::replicate(replaceString);
  +    
  +  for (int i = 0; i < tokStackSize; i++){
  +      
  +    result.append(tokenStack->elementAt(i));
  +  
  +    if (i != tokStackSize - 1) {
  +       
  +      //if there are subExpressions, then determine the string we want to 
  +      //substitute in.
  +      if (numSubEx != 0)
  +        curRepString = subInExp(replaceString, matchString, subEx->elementAt(i)); 
  +    
  +      result.append(curRepString);
  +    }
  +  }  
  +    
  +  return XMLString::replicate(result.getRawBuffer()); 
  +    
  +}
  +
  +// ---------------------------------------------------------------------------
   //  RegularExpression: Helpers methods
   // ---------------------------------------------------------------------------
   int RegularExpression::getOptionValue(const XMLCh ch) {
  @@ -665,6 +915,7 @@
        return ret;
   }
   
  +
   int RegularExpression::match(Context* const context, const Op* const operations
                                                         , int offset, const short 
direction)
   {
  @@ -815,7 +1066,6 @@
        
        return offset;
   }
  -
   bool RegularExpression::matchChar(Context* const context,
                                                                  const XMLInt32 ch, 
int& offset,
                                                                  const short 
direction, const bool ignoreCase)
  @@ -832,7 +1082,6 @@
   
        bool match = ignoreCase ? matchIgnoreCase(ch, strCh)
                                    : (ch == strCh);
  -     
        if (!match)
                return false;
   
  @@ -1183,6 +1432,87 @@
   }
   
   /*
  + * Helper for Replace. This method prepares the replacement string by substituting
  + * in actual values for parenthesized sub expressions. 
  + *
  + * An error will be thrown if:
  + *  1) repString references an undefined subExpression
  + *  2) there is an unescaped chDollar which is not followed by a digit
  + *
  + */
  +const XMLCh* RegularExpression::subInExp(const XMLCh* const repString, 
  +                                         const XMLCh* const origString, 
  +                                         const Match* subEx){
  +
  +  int numSubExp = subEx->getNoGroups() - 1;
  +
  +  if (numSubExp == 0)
  +    return XMLString::replicate(repString);
  +  
  +  bool notEscaped = true;                 
  +  
  +  XMLBuffer newString;                   
  +  
  +  XMLCh *indexStr = new XMLCh[2];                   //holds the string rep of a 
  +        ArrayJanitor<XMLCh> indexJan(indexStr);     //digit
  +
  +  indexStr[1] = chNull;
  +  int index = -1;
  +
  +  for (const XMLCh* ptr = repString; *ptr != chNull; ptr++){
  +
  +    if ((*ptr == chDollarSign) && notEscaped) {
  +      
  +      ptr++;
  +      
  +      //check that after the $ is a digit 
  +      if (!XMLString::isDigit(*ptr)){
  +       
  +        //invalid replace string - $ must be followed by a digit
  +                             ThrowXML(RuntimeException, 
XMLExcepts::Regex_InvalidRepPattern);
  +      }
  +        
  +      indexStr[0] = *ptr;                     //get the digit 
  +      index = XMLString::parseInt(indexStr);  //convert it to an int
  +
  +      //now check that the index is legal
  +      if (index > numSubExp){
  +                             ThrowXML(RuntimeException, 
XMLExcepts::Regex_InvalidRepPattern);
  +      }
  +        
  +      int start = subEx->getStartPos(index);
  +      int end = subEx->getEndPos(index);
  +
  +      //now copy the substring into the new string
  +      for (int i=start; i<end; i++){
  +        newString.append(origString[i]);
  +      }
  +          
  +    } else {
  + 
  +      //if you have a backslash followed by a character that is neither a $
  +      //nor a backslash, then it's an invalid replace string
  +      if (!notEscaped && (*ptr != chDollarSign && *ptr != chBackSlash)){
  +                             ThrowXML(RuntimeException, 
XMLExcepts::Regex_InvalidRepPattern);
  +      }
  +      
  +      if (*ptr == chBackSlash){
  +        notEscaped = false;
  +        continue;
  +        
  +      }else   
  +        notEscaped = true;  
  +
  +      newString.append(*ptr);
  +    }
  +  }
  +
  +  return XMLString::replicate(newString.getRawBuffer());
  +       
  +}
  +
  +
  +/*
    * Prepares for matching. This method is called just before starting matching
    */
   void RegularExpression::prepare() {
  @@ -1312,6 +1642,7 @@
   
       return WT_OTHER;
   }
  +
   
   XERCES_CPP_NAMESPACE_END
   
  
  
  
  1.3       +32 -0     xml-xerces/c/src/xercesc/util/regx/Match.cpp
  
  Index: Match.cpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/Match.cpp,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Match.cpp 4 Nov 2002 15:17:00 -0000       1.2
  +++ Match.cpp 18 Dec 2002 13:01:02 -0000      1.3
  @@ -56,6 +56,9 @@
   
   /*
    * $Log$
  + * Revision 1.3  2002/12/18 13:01:02  gareth
  + * New functionality - tokenize and replace. Fixed REVISIT for case insensitive match. Patch by Jennifer Schachter.
  + *
    * Revision 1.2  2002/11/04 15:17:00  tng
    * C++ Namespace Support.
    *
  @@ -87,6 +90,20 @@
   
   }
   
  +Match::Match(const Match& toCopy) : fNoGroups(0),
  +                                                         fPositionsSize(0),
  +                                                         fStartPositions(0),
  +                                                         fEndPositions(0){
  +  initialize(toCopy);
  +}
  +
  +Match& Match::operator=(const Match& toAssign){
  +  
  +  initialize(toAssign);
  +  return *this;
  +}
  +
  +
   Match::~Match() {
   
        cleanUp();
  @@ -117,6 +134,21 @@
   // ---------------------------------------------------------------------------
   //  Match: private helpers methods
   // ---------------------------------------------------------------------------
  +void Match::initialize(const Match &toCopy){
  +
  +  //do not copy over the value of fPositionsSize as it is irrelevant to the
  +  //state of the Match
  +   
  +  int toCopySize = toCopy.getNoGroups();
  +  setNoGroups(toCopySize);
  +
  +  for (int i=0; i<toCopySize; i++){
  +    setStartPos(i, toCopy.getStartPos(i));
  +    setEndPos(i, toCopy.getEndPos(i));
  +  }           
  +
  +}
  +
   void Match::cleanUp() {
   
        delete [] fStartPositions;
  
  
  
  1.3       +19 -11    xml-xerces/c/src/xercesc/util/regx/Match.hpp
  
  Index: Match.hpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/Match.hpp,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Match.hpp 4 Nov 2002 15:17:00 -0000       1.2
  +++ Match.hpp 18 Dec 2002 13:01:02 -0000      1.3
  @@ -76,10 +76,17 @@
   public:
   
        // -----------------------------------------------------------------------
  -    //  Public Constructors and Destructor
  -    // -----------------------------------------------------------------------
  +  //  Public Constructors and Destructor
  +  // -----------------------------------------------------------------------
        Match();
  -     ~Match();
  +     
  +  /**
  +  * Copy constructor
  +  */
  +  Match(const Match& toCopy);
  +  Match& operator=(const Match& toAssign);
  +
  +     virtual ~Match();
   
        // -----------------------------------------------------------------------
        // Getter functions
  @@ -99,16 +106,17 @@
        // -----------------------------------------------------------------------
        // Initialize/Clean up methods
        // -----------------------------------------------------------------------
  +  void initialize(const Match& toCopy);
        void cleanUp();
   
        // -----------------------------------------------------------------------
  -    //  Private data members
  -    //
  -    //  fNoGroups
  -    //      Represents no of regular expression groups
  +  //  Private data members
  +  //
  +  //  fNoGroups
  +  //      Represents no of regular expression groups
        //              
  -    //  fStartPositions
  -    //      Array of start positions in the target text matched to specific
  +  //  fStartPositions
  +  //      Array of start positions in the target text matched to specific
        //              regular expression group
        //
        //      fEndPositions
  @@ -117,7 +125,7 @@
        //
        //      fPositionsSize
        //              Actual size of Start/EndPositions array.
  -    // -----------------------------------------------------------------------
  +  // -----------------------------------------------------------------------
        int fNoGroups;
        int fPositionsSize;
        int* fStartPositions;
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to