gareth 2002/12/18 05:01:02 Modified: c/src/xercesc/util/regx RegularExpression.hpp RegularExpression.cpp Match.cpp Match.hpp Log: New functionality - tokenize and replace. Fixed REVISIT for case insensitive match. Patch by Jennifer Schachter. Revision Changes Path 1.4 +315 -266 xml-xerces/c/src/xercesc/util/regx/RegularExpression.hpp Index: RegularExpression.hpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/RegularExpression.hpp,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- RegularExpression.hpp 4 Nov 2002 15:17:00 -0000 1.3 +++ RegularExpression.hpp 18 Dec 2002 13:01:02 -0000 1.4 @@ -65,6 +65,8 @@ // Includes // --------------------------------------------------------------------------- #include <xercesc/util/XMLUniDefs.hpp> +#include <xercesc/util/RefArrayVectorOf.hpp> +#include <xercesc/util/XMLString.hpp> #include <xercesc/util/regx/Op.hpp> #include <xercesc/util/regx/TokenFactory.hpp> #include <xercesc/util/regx/BMPattern.hpp> @@ -136,6 +138,32 @@ bool matches(const XMLCh* const matchString, const int start, const int end, Match* const pMatch); + // ----------------------------------------------------------------------- + // Tokenize methods + // ----------------------------------------------------------------------- + // Note: The caller owns the string vector that is returned, and is responsible + // for deleting it. + RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString); + RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const int start, + const int end); + + RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString); + RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, + const int start, const int end); + + // ----------------------------------------------------------------------- + // Replace methods + // ----------------------------------------------------------------------- + // Note: The caller owns the XMLCh* that is returned, and is responsible for + // deleting it. + XMLCh *replace(const char* const matchString, const char* const replaceString); + XMLCh *replace(const char* const matchString, const char* const replaceString, + const int start, const int end); + + XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString); + XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString, + const int start, const int end); + private: // ----------------------------------------------------------------------- // Private data types @@ -225,8 +253,29 @@ const short direction); /** - * Converts a token tree into an operation tree - */ + * Tokenize helper + * + * This overloaded tokenize is for internal use only. It provides a way to + * keep track of the sub-expressions in each match of the pattern. + * + * It is called by the other tokenize methods, and by the replace method. + * The caller is responsible for the deletion of the returned + * RefArrayVectorOf<XMLCh*> + */ + RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, + const int start, const int end, + RefVectorOf<Match> *subEx); + /** + * Replace helpers + * + * Note: the caller owns the XMLCh* that is returned + */ + const XMLCh *subInExp(const XMLCh* const repString, + const XMLCh* const origString, + const Match* subEx); + /** + * Converts a token tree into an operation tree + */ void compile(const Token* const token); Op* compile(const Token* const token, Op* const next, const bool reverse); @@ -271,309 +320,309 @@ }; -// --------------------------------------------------------------------------- -// RegularExpression: Cleanup methods -// --------------------------------------------------------------------------- -inline void RegularExpression::cleanUp() { - - delete [] fPattern; - delete [] fFixedString; - delete fContext; - delete fBMPattern; - delete fTokenFactory; -} - -// --------------------------------------------------------------------------- -// RegularExpression: Helper methods -// --------------------------------------------------------------------------- -inline bool RegularExpression::isSet(const int options, const int flag) { - - return (options & flag) == flag; -} - -inline Op* RegularExpression::compileLook(const Token* const token, - const Op* const next, - const bool reverse, - const unsigned short tokType) { - - Op* ret = 0; - Op* result = compile(token->getChild(0), 0, reverse); - - switch(tokType) { - case Token::T_LOOKAHEAD: - ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result); - break; - case Token::T_NEGATIVELOOKAHEAD: - ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result); - break; - case Token::T_LOOKBEHIND: - ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result); - break; - case Token::T_NEGATIVELOOKBEHIND: - ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result); - break; - case Token::T_INDEPENDENT: - ret = fOpFactory.createIndependentOp(next, result); - break; - case Token::T_MODIFIERGROUP: - ret = fOpFactory.createModifierOp(next, result, - ((ModifierToken *) token)->getOptions(), - ((ModifierToken *) token)->getOptionsMask()); - break; - } - - - return ret; -} - -inline Op* RegularExpression::compileSingle(const Token* const token, - Op* const next, + // --------------------------------------------------------------------------- + // RegularExpression: Cleanup methods + // --------------------------------------------------------------------------- + inline void RegularExpression::cleanUp() { + + delete [] fPattern; + delete [] fFixedString; + delete fContext; + delete fBMPattern; + delete fTokenFactory; + } + + // --------------------------------------------------------------------------- + // RegularExpression: Helper methods + // --------------------------------------------------------------------------- + inline bool RegularExpression::isSet(const int options, const int flag) { + + return (options & flag) == flag; + } + + inline Op* RegularExpression::compileLook(const Token* const token, + const Op* const next, + const bool reverse, const unsigned short tokType) { - Op* ret = 0; - - switch (tokType) { - case Token::T_DOT: - ret = fOpFactory.createDotOp(); - break; - case Token::T_CHAR: - ret = fOpFactory.createCharOp(token->getChar()); - break; - case Token::T_ANCHOR: - ret = fOpFactory.createAnchorOp(token->getChar()); - break; - case Token::T_RANGE: - case Token::T_NRANGE: - ret = fOpFactory.createRangeOp(token); - break; - case Token::T_EMPTY: - ret = next; - break; - case Token::T_STRING: - ret = fOpFactory.createStringOp(token->getString()); - break; - case Token::T_BACKREFERENCE: - ret = fOpFactory.createBackReferenceOp(token->getReferenceNo()); - break; - } - - if (tokType != Token::T_EMPTY) - ret->setNextOp(next); - - return ret; -} - + Op* ret = 0; + Op* result = compile(token->getChild(0), 0, reverse); -inline Op* RegularExpression::compileUnion(const Token* const token, - Op* const next, - const bool reverse) { + switch(tokType) { + case Token::T_LOOKAHEAD: + ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result); + break; + case Token::T_NEGATIVELOOKAHEAD: + ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result); + break; + case Token::T_LOOKBEHIND: + ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result); + break; + case Token::T_NEGATIVELOOKBEHIND: + ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result); + break; + case Token::T_INDEPENDENT: + ret = fOpFactory.createIndependentOp(next, result); + break; + case Token::T_MODIFIERGROUP: + ret = fOpFactory.createModifierOp(next, result, + ((ModifierToken *) token)->getOptions(), + ((ModifierToken *) token)->getOptionsMask()); + break; + } + + + return ret; + } + + inline Op* RegularExpression::compileSingle(const Token* const token, + Op* const next, + const unsigned short tokType) { + + Op* ret = 0; + + switch (tokType) { + case Token::T_DOT: + ret = fOpFactory.createDotOp(); + break; + case Token::T_CHAR: + ret = fOpFactory.createCharOp(token->getChar()); + break; + case Token::T_ANCHOR: + ret = fOpFactory.createAnchorOp(token->getChar()); + break; + case Token::T_RANGE: + case Token::T_NRANGE: + ret = fOpFactory.createRangeOp(token); + break; + case Token::T_EMPTY: + ret = next; + break; + case Token::T_STRING: + ret = fOpFactory.createStringOp(token->getString()); + break; + case Token::T_BACKREFERENCE: + ret = fOpFactory.createBackReferenceOp(token->getReferenceNo()); + break; + } - int tokSize = token->size(); - UnionOp* uniOp = fOpFactory.createUnionOp(tokSize); + if (tokType != Token::T_EMPTY) + ret->setNextOp(next); - for (int i=0; i<tokSize; i++) { + return ret; + } - uniOp->addElement(compile(token->getChild(i), next, reverse)); - } - return uniOp; -} + inline Op* RegularExpression::compileUnion(const Token* const token, + Op* const next, + const bool reverse) { + int tokSize = token->size(); + UnionOp* uniOp = fOpFactory.createUnionOp(tokSize); -inline Op* RegularExpression::compileCondition(const Token* const token, - Op* const next, - const bool reverse) { + for (int i=0; i<tokSize; i++) { - Token* condTok = ((ConditionToken*) token)->getConditionToken(); - Token* yesTok = token->getChild(0); - Token* noTok = token->getChild(1); - int refNo = token->getReferenceNo(); - Op* condOp = (condTok == 0) ? 0 : compile(condTok, 0, reverse); - Op* yesOp = compile(yesTok, next, reverse); - Op* noOp = (noTok == 0) ? 0 : compile(noTok, next, reverse); + uniOp->addElement(compile(token->getChild(i), next, reverse)); + } - return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp); -} + return uniOp; + } -inline Op* RegularExpression::compileParenthesis(const Token* const token, + inline Op* RegularExpression::compileCondition(const Token* const token, Op* const next, const bool reverse) { - if (token->getNoParen() == 0) - return compile(token->getChild(0), next, reverse); + Token* condTok = ((ConditionToken*) token)->getConditionToken(); + Token* yesTok = token->getChild(0); + Token* noTok = token->getChild(1); + int refNo = token->getReferenceNo(); + Op* condOp = (condTok == 0) ? 0 : compile(condTok, 0, reverse); + Op* yesOp = compile(yesTok, next, reverse); + Op* noOp = (noTok == 0) ? 0 : compile(noTok, next, reverse); - Op* captureOp = 0; + return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp); + } - if (reverse) { - captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next); - captureOp = compile(token->getChild(0), captureOp, reverse); + inline Op* RegularExpression::compileParenthesis(const Token* const token, + Op* const next, + const bool reverse) { - return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp); - } + if (token->getNoParen() == 0) + return compile(token->getChild(0), next, reverse); - captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next); - captureOp = compile(token->getChild(0), captureOp, reverse); + Op* captureOp = 0; - return fOpFactory.createCaptureOp(token->getNoParen(), captureOp); -} + if (reverse) { -inline Op* RegularExpression::compileConcat(const Token* const token, - Op* const next, - const bool reverse) { - - Op* ret = next; - int tokSize = token->size(); - - if (!reverse) { - - for (int i= tokSize - 1; i>=0; i--) { - ret = compile(token->getChild(i), ret, false); - } - } - else { - - for (int i= 0; i< tokSize; i++) { - ret = compile(token->getChild(i), ret, true); - } - } + captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next); + captureOp = compile(token->getChild(0), captureOp, reverse); - return ret; -} + return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp); + } -inline Op* RegularExpression::compileClosure(const Token* const token, - Op* const next, - const bool reverse, - const unsigned short tokType) { + captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next); + captureOp = compile(token->getChild(0), captureOp, reverse); - Op* ret = 0; - Token* childTok = token->getChild(0); - int min = token->getMin(); - int max = token->getMax(); + return fOpFactory.createCaptureOp(token->getNoParen(), captureOp); + } - if (min >= 0 && min == max) { + inline Op* RegularExpression::compileConcat(const Token* const token, + Op* const next, + const bool reverse) { - ret = next; - for (int i=0; i< min; i++) { - ret = compile(childTok, ret, reverse); - } + Op* ret = next; + int tokSize = token->size(); - return ret; - } + if (!reverse) { - if (min > 0 && max > 0) - max -= min; + for (int i= tokSize - 1; i>=0; i--) { + ret = compile(token->getChild(i), ret, false); + } + } + else { - if (max > 0) { + for (int i= 0; i< tokSize; i++) { + ret = compile(token->getChild(i), ret, true); + } + } - ret = next; - for (int i=0; i<max; i++) { + return ret; + } - ChildOp* childOp = fOpFactory.createQuestionOp( - tokType == Token::T_NONGREEDYCLOSURE); - - childOp->setNextOp(next); - childOp->setChild(compile(childTok, ret, reverse)); - ret = childOp; - } - } - else { - - ChildOp* childOp = 0; - - if (tokType == Token::T_NONGREEDYCLOSURE) { - childOp = fOpFactory.createNonGreedyClosureOp(); - } - else { - - if (childTok->getMinLength() == 0) - childOp = fOpFactory.createClosureOp(fNoClosures++); - else - childOp = fOpFactory.createClosureOp(-1); - } - - childOp->setNextOp(next); - childOp->setChild(compile(childTok, childOp, reverse)); - ret = childOp; - } - - if (min > 0) { - - for (int i=0; i< min; i++) { - ret = compile(childTok, ret, reverse); - } - } - - return ret; -} - -inline int RegularExpression::matchUnion(Context* const context, - const Op* const op, int offset, - const short direction) -{ - unsigned int opSize = op->getSize(); - int ret = -1; - - for (unsigned int i=0; i < opSize; i++) { - - ret = match(context, op->elementAt(i), offset, direction); - - if (ret == context->fLimit) - return ret; - } - - return -1; -} - -inline int RegularExpression::matchModifier(Context* const context, - const Op* const op, int offset, - const short direction) -{ - int saveOptions = fOptions; - fOptions |= (int) op->getData(); - fOptions &= (int) ~op->getData2(); - - int ret = match(context, op->getChild(), offset, direction); - - fOptions = saveOptions; - - return ret; -} - -inline unsigned short RegularExpression::getWordType(const XMLCh* const target - , const int begin - , const int end - , const int offset) -{ - if (offset < begin || offset >= end) - return WT_OTHER; - - return getCharType(target[offset]); -} - -inline -unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target - , const int start - , const int end - , int offset) -{ - unsigned short ret = getWordType(target, start, end, --offset); + inline Op* RegularExpression::compileClosure(const Token* const token, + Op* const next, + const bool reverse, + const unsigned short tokType) { - while (ret == WT_IGNORE) { - ret = getWordType(target, start, end, --offset); - } + Op* ret = 0; + Token* childTok = token->getChild(0); + int min = token->getMin(); + int max = token->getMax(); + + if (min >= 0 && min == max) { + + ret = next; + for (int i=0; i< min; i++) { + ret = compile(childTok, ret, reverse); + } + + return ret; + } + + if (min > 0 && max > 0) + max -= min; + + if (max > 0) { + + ret = next; + for (int i=0; i<max; i++) { + + ChildOp* childOp = fOpFactory.createQuestionOp( + tokType == Token::T_NONGREEDYCLOSURE); + + childOp->setNextOp(next); + childOp->setChild(compile(childTok, ret, reverse)); + ret = childOp; + } + } + else { + + ChildOp* childOp = 0; + + if (tokType == Token::T_NONGREEDYCLOSURE) { + childOp = fOpFactory.createNonGreedyClosureOp(); + } + else { + + if (childTok->getMinLength() == 0) + childOp = fOpFactory.createClosureOp(fNoClosures++); + else + childOp = fOpFactory.createClosureOp(-1); + } + + childOp->setNextOp(next); + childOp->setChild(compile(childTok, childOp, reverse)); + ret = childOp; + } + + if (min > 0) { + + for (int i=0; i< min; i++) { + ret = compile(childTok, ret, reverse); + } + } + + return ret; + } + + inline int RegularExpression::matchUnion(Context* const context, + const Op* const op, int offset, + const short direction) + { + unsigned int opSize = op->getSize(); + int ret = -1; + + for (unsigned int i=0; i < opSize; i++) { + + ret = match(context, op->elementAt(i), offset, direction); + + if (ret == context->fLimit) + return ret; + } + + return -1; + } + + inline int RegularExpression::matchModifier(Context* const context, + const Op* const op, int offset, + const short direction) + { + int saveOptions = fOptions; + fOptions |= (int) op->getData(); + fOptions &= (int) ~op->getData2(); + + int ret = match(context, op->getChild(), offset, direction); + + fOptions = saveOptions; + + return ret; + } + + inline unsigned short RegularExpression::getWordType(const XMLCh* const target + , const int begin + , const int end + , const int offset) + { + if (offset < begin || offset >= end) + return WT_OTHER; + + return getCharType(target[offset]); + } + + inline + unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target + , const int start + , const int end + , int offset) + { + unsigned short ret = getWordType(target, start, end, --offset); + + while (ret == WT_IGNORE) { + ret = getWordType(target, start, end, --offset); + } - return ret; -} + return ret; + } -inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1, + inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2) { - //REVISIT - for now we will return a case sensitive match - return (ch1 == ch2); + return (0==XMLString::compareNIString((XMLCh*)&ch1,(XMLCh*)&ch2, 1)); } + XERCES_CPP_NAMESPACE_END 1.5 +334 -3 xml-xerces/c/src/xercesc/util/regx/RegularExpression.cpp Index: RegularExpression.cpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/RegularExpression.cpp,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- RegularExpression.cpp 4 Nov 2002 15:17:00 -0000 1.4 +++ RegularExpression.cpp 18 Dec 2002 13:01:02 -0000 1.5 @@ -56,6 +56,9 @@ /* * $Log$ + * Revision 1.5 2002/12/18 13:01:02 gareth + * New functionality - tokenize and replace. Fixed REVISIT for case insensitive match. Patch by Jennifer Schachter. + * * Revision 1.4 2002/11/04 15:17:00 tng * C++ Namespace Support. * @@ -104,7 +107,6 @@ // Includes // --------------------------------------------------------------------------- #include <xercesc/util/regx/RegularExpression.hpp> -#include <xercesc/util/XMLString.hpp> #include <xercesc/util/PlatformUtils.hpp> #include <xercesc/util/regx/RegxUtil.hpp> #include <xercesc/util/regx/Match.hpp> @@ -114,6 +116,7 @@ #include <xercesc/util/regx/ParserForXMLSchema.hpp> #include <xercesc/util/Janitor.hpp> #include <xercesc/util/ParseException.hpp> +#include <xercesc/framework/XMLBuffer.hpp> XERCES_CPP_NAMESPACE_BEGIN @@ -620,6 +623,253 @@ } // --------------------------------------------------------------------------- +// RegularExpression: Tokenize methods +// --------------------------------------------------------------------------- +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression) { + + XMLCh* tmpBuf = XMLString::transcode(expression); + ArrayJanitor<XMLCh> janBuf(tmpBuf); + return tokenize(tmpBuf, 0, XMLString::stringLen(tmpBuf)); +} + +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression, + const int start, const int end) { + + XMLCh* tmpBuf = XMLString::transcode(expression); + ArrayJanitor<XMLCh> janBuf(tmpBuf); + return tokenize(tmpBuf, start, end); +} + + + +// --------------------------------------------------------------------------- +// RegularExpression: Tokenize methods - Wide char version +// --------------------------------------------------------------------------- +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression) { + return tokenize(expression, 0, XMLString::stringLen(expression), 0); +} + +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression, + const int start, const int end) +{ + return tokenize(expression, start, end, 0); +} + +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression, + const int start, const int end, + RefVectorOf<Match> *subEx){ + + if (fOperations == 0) + prepare(); + + Match* pMatch = 0; + + if (subEx){ + pMatch = new Match(); + } + + RefArrayVectorOf<XMLCh>* tokenStack = new RefArrayVectorOf<XMLCh>(16, true); + + Context* context = 0; + Context* tmpContext = 0; + + int strLength = XMLString::stringLen(expression); + + { + XMLMutexLock lockInit(&fMutex); + + if (fContext == 0) + fContext = new Context(); + + if (fContext->fInUse) { + context = new Context(); + tmpContext = context; + } + else { + context = fContext; + } + + context->reset(expression, start, end, fNoClosures); + } + + Janitor<Context> janContext(tmpContext); + + bool adoptMatch = false; + Match* lMatch = pMatch; + + if (lMatch != 0) { + lMatch->setNoGroups(fNoGroups); + } + else if (fHasBackReferences) { + + lMatch = new Match(); + lMatch->setNoGroups(fNoGroups); + adoptMatch = true; + } + + if (context->fAdoptMatch) + delete context->fMatch; + + context->fMatch = lMatch; + context->fAdoptMatch = adoptMatch; + + int tokStart = start; + int matchStart = start; + + for (; matchStart <= end; matchStart++) { + + int matchEnd = match(context, fOperations, matchStart, 1); + + if (matchEnd != -1) { + + if (context->fMatch != 0) { + context->fMatch->setStartPos(0, context->fStart); + context->fMatch->setEndPos(0, matchEnd); + } + + if (subEx){ + subEx->addElement(lMatch); + lMatch = new Match(*(context->fMatch)); + context->fMatch = lMatch; + } + + context->fInUse = false; + + XMLCh* token; + if (tokStart == matchStart){ + + if (tokStart == strLength){ + tokStart--; + break; + } + + token = new XMLCh[1]; + token[0] = chNull; + + // When you tokenize using zero string, will return each + // token in the string. Since the zero string will also + // match the start/end characters, resulting in empty + // tokens, we ignore them and do not add them to the stack. + if (!XMLString::equals(fPattern, &chNull)) + tokenStack->addElement(token); + + } else { + token = new XMLCh[matchStart + 1 - tokStart]; + XMLString::subString(token, expression, tokStart, matchStart); + tokenStack->addElement(token); + } + + tokStart = matchEnd; + + //decrement matchStart as will increment it at the top of the loop + if (matchStart < matchEnd - 1) + matchStart = matchEnd - 1; + } + } + + XMLCh* token; + + if (matchStart == tokStart + 1){ + token = new XMLCh[1]; + token[0] = chNull; + + } else { + token = new XMLCh[strLength + 1 - tokStart]; + XMLString::subString(token, expression, tokStart, strLength); + } + + if (!XMLString::equals(fPattern, &chNull)) + tokenStack->addElement(token); + + return tokenStack; + +} + + +// ----------------------------------------------------------------------- +// RegularExpression: Replace methods +// ----------------------------------------------------------------------- +XMLCh* RegularExpression::replace(const char* const matchString, + const char* const replaceString){ + + XMLCh* tmpBuf = XMLString::transcode(matchString); + ArrayJanitor<XMLCh> janBuf(tmpBuf); + XMLCh* tmpBuf2 = XMLString::transcode(replaceString); + ArrayJanitor<XMLCh> janBuf2(tmpBuf2); + + return replace(tmpBuf, tmpBuf2, 0, XMLString::stringLen(tmpBuf)); +} + +XMLCh* RegularExpression::replace(const char* const matchString, + const char* const replaceString, + const int start, const int end){ + + XMLCh* tmpBuf = XMLString::transcode(matchString); + ArrayJanitor<XMLCh> janBuf(tmpBuf); + XMLCh* tmpBuf2 = XMLString::transcode(replaceString); + ArrayJanitor<XMLCh> janBuf2(tmpBuf2); + + return replace(tmpBuf, tmpBuf2, start, end); +} + + +// --------------------------------------------------------------------------- +// RegularExpression: Replace methods - Wide char version +// --------------------------------------------------------------------------- +XMLCh* RegularExpression::replace(const XMLCh* const matchString, + const XMLCh* const replaceString){ + + return replace(matchString, replaceString, 0, + XMLString::stringLen(matchString)); +} + +XMLCh* RegularExpression::replace(const XMLCh* const matchString, + const XMLCh* const replaceString, + const int start, const int end) +{ + //check if matches zero length string - throw error if so + if (matches(XMLUni::fgZeroLenString)){ + ThrowXML(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString); + } + + RefVectorOf<Match> *subEx = new RefVectorOf<Match>(10, true); + Janitor<RefVectorOf<Match> > janSubEx(subEx); + + //Call to tokenize with Match vector so that we keep track of the locations + //of the subExpression within each of the matches + RefArrayVectorOf<XMLCh>* tokenStack = tokenize(matchString, start, end, subEx); + Janitor<RefArrayVectorOf<XMLCh> > janTokStack(tokenStack); + + XMLBuffer result; + + int numSubEx = 0; + + if (subEx && subEx->size() > 0) + numSubEx = subEx->elementAt(0)->getNoGroups() - 1; + + int tokStackSize = tokenStack->size(); + const XMLCh* curRepString = XMLString::replicate(replaceString); + + for (int i = 0; i < tokStackSize; i++){ + + result.append(tokenStack->elementAt(i)); + + if (i != tokStackSize - 1) { + + //if there are subExpressions, then determine the string we want to + //substitute in. + if (numSubEx != 0) + curRepString = subInExp(replaceString, matchString, subEx->elementAt(i)); + + result.append(curRepString); + } + } + + return XMLString::replicate(result.getRawBuffer()); + +} + +// --------------------------------------------------------------------------- // RegularExpression: Helpers methods // --------------------------------------------------------------------------- int RegularExpression::getOptionValue(const XMLCh ch) { @@ -665,6 +915,7 @@ return ret; } + int RegularExpression::match(Context* const context, const Op* const operations , int offset, const short direction) { @@ -815,7 +1066,6 @@ return offset; } - bool RegularExpression::matchChar(Context* const context, const XMLInt32 ch, int& offset, const short direction, const bool ignoreCase) @@ -832,7 +1082,6 @@ bool match = ignoreCase ? matchIgnoreCase(ch, strCh) : (ch == strCh); - if (!match) return false; @@ -1183,6 +1432,87 @@ } /* + * Helper for Replace. This method prepares the replacement string by substituting + * in actual values for parenthesized sub expressions. + * + * An error will be thrown if: + * 1) repString references an undefined subExpression + * 2) there is an unescaped chDollar which is not followed by a digit + * + */ +const XMLCh* RegularExpression::subInExp(const XMLCh* const repString, + const XMLCh* const origString, + const Match* subEx){ + + int numSubExp = subEx->getNoGroups() - 1; + + if (numSubExp == 0) + return XMLString::replicate(repString); + + bool notEscaped = true; + + XMLBuffer newString; + + XMLCh *indexStr = new XMLCh[2]; //holds the string rep of a + ArrayJanitor<XMLCh> indexJan(indexStr); //digit + + indexStr[1] = chNull; + int index = -1; + + for (const XMLCh* ptr = repString; *ptr != chNull; ptr++){ + + if ((*ptr == chDollarSign) && notEscaped) { + + ptr++; + + //check that after the $ is a digit + if (!XMLString::isDigit(*ptr)){ + + //invalid replace string - $ must be followed by a digit + ThrowXML(RuntimeException, XMLExcepts::Regex_InvalidRepPattern); + } + + indexStr[0] = *ptr; //get the digit + index = XMLString::parseInt(indexStr); //convert it to an int + + //now check that the index is legal + if (index > numSubExp){ + ThrowXML(RuntimeException, XMLExcepts::Regex_InvalidRepPattern); + } + + int start = subEx->getStartPos(index); + int end = subEx->getEndPos(index); + + //now copy the substring into the new string + for (int i=start; i<end; i++){ + newString.append(origString[i]); + } + + } else { + + //if you have a slash and then a character that's not a $ or /, + //then it's an invalid replace string + if (!notEscaped && (*ptr != chDollarSign && *ptr != chBackSlash)){ + ThrowXML(RuntimeException, XMLExcepts::Regex_InvalidRepPattern); + } + + if (*ptr == chBackSlash){ + notEscaped = false; + continue; + + }else + notEscaped = true; + + newString.append(*ptr); + } + } + + return XMLString::replicate(newString.getRawBuffer()); + +} + + +/* * Prepares for matching. This method is called just before starting matching */ void RegularExpression::prepare() { @@ -1312,6 +1642,7 @@ return WT_OTHER; } + XERCES_CPP_NAMESPACE_END 1.3 +32 -0 xml-xerces/c/src/xercesc/util/regx/Match.cpp Index: Match.cpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/Match.cpp,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Match.cpp 4 Nov 2002 15:17:00 -0000 1.2 +++ Match.cpp 18 Dec 2002 13:01:02 -0000 1.3 @@ -56,6 +56,9 @@ /* * $Log$ + * Revision 1.3 2002/12/18 13:01:02 gareth + * New functionality - tokenize and replace. Fixed REVISIT for case insensitive match. Patch by Jennifer Schachter. + * * Revision 1.2 2002/11/04 15:17:00 tng * C++ Namespace Support. * @@ -87,6 +90,20 @@ } +Match::Match(const Match& toCopy) : fNoGroups(0), + fPositionsSize(0), + fStartPositions(0), + fEndPositions(0){ + initialize(toCopy); +} + +Match& Match::operator=(const Match& toAssign){ + + initialize(toAssign); + return *this; +} + + Match::~Match() { cleanUp(); @@ -117,6 +134,21 @@ // --------------------------------------------------------------------------- // Match: private helpers methods // --------------------------------------------------------------------------- +void Match::initialize(const Match &toCopy){ + + //do not copy over value of fPositionSize as it is irrelevant to the + //state of the Match + + int toCopySize = toCopy.getNoGroups(); + setNoGroups(toCopySize); + + for (int i=0; i<toCopySize; i++){ + setStartPos(i, toCopy.getStartPos(i)); + setEndPos(i, toCopy.getEndPos(i)); + } + +} + void Match::cleanUp() { delete [] fStartPositions; 1.3 +19 -11 xml-xerces/c/src/xercesc/util/regx/Match.hpp Index: Match.hpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/Match.hpp,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- Match.hpp 4 Nov 2002 15:17:00 -0000 1.2 +++ Match.hpp 18 Dec 2002 13:01:02 -0000 1.3 @@ -76,10 +76,17 @@ public: // ----------------------------------------------------------------------- - // Public Constructors and Destructor - // ----------------------------------------------------------------------- + // Public Constructors and Destructor + // ----------------------------------------------------------------------- Match(); - ~Match(); + + /** + * Copy constructor + */ + Match(const Match& toCopy); + Match& operator=(const Match& toAssign); + + virtual ~Match(); // ----------------------------------------------------------------------- // Getter functions @@ -99,16 +106,17 @@ // ----------------------------------------------------------------------- // Initialize/Clean up methods // ----------------------------------------------------------------------- + void initialize(const Match& toCopy); void cleanUp(); // ----------------------------------------------------------------------- - // Private data members - // - // fNoGroups - // Represents no of regular expression groups + // Private data members + // + // fNoGroups + // Represents no of regular expression groups // - // fStartPositions - // Array of start positions in the target text matched to specific + // fStartPositions + // Array of start positions in the target text matched to specific // regular expression group // // fEndPositions @@ -117,7 +125,7 @@ // // fPositionsSize // Actual size of Start/EndPositions array. - // ----------------------------------------------------------------------- + // ----------------------------------------------------------------------- int fNoGroups; int fPositionsSize; int* fStartPositions;
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]