gareth 2002/12/18 05:01:02
Modified: c/src/xercesc/util/regx RegularExpression.hpp
RegularExpression.cpp Match.cpp Match.hpp
Log:
New functionality - tokenize and replace. Fixed REVISIT for case insensitive match.
Patch by Jennifer Schachter.
Revision Changes Path
1.4 +315 -266 xml-xerces/c/src/xercesc/util/regx/RegularExpression.hpp
Index: RegularExpression.hpp
===================================================================
RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/RegularExpression.hpp,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- RegularExpression.hpp 4 Nov 2002 15:17:00 -0000 1.3
+++ RegularExpression.hpp 18 Dec 2002 13:01:02 -0000 1.4
@@ -65,6 +65,8 @@
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/XMLUniDefs.hpp>
+#include <xercesc/util/RefArrayVectorOf.hpp>
+#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/regx/Op.hpp>
#include <xercesc/util/regx/TokenFactory.hpp>
#include <xercesc/util/regx/BMPattern.hpp>
@@ -136,6 +138,32 @@
bool matches(const XMLCh* const matchString, const int start,
const int end, Match* const pMatch);
+ // -----------------------------------------------------------------------
+ // Tokenize methods
+ // -----------------------------------------------------------------------
+ // Note: The caller owns the string vector that is returned, and is responsible
+ // for deleting it.
+ RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString);
+ RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const int
start,
+ const int end);
+
+ RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString);
+ RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString,
+ const int start, const int end);
+
+ // -----------------------------------------------------------------------
+ // Replace methods
+ // -----------------------------------------------------------------------
+ // Note: The caller owns the XMLCh* that is returned, and is responsible for
+ // deleting it.
+ XMLCh *replace(const char* const matchString, const char* const replaceString);
+ XMLCh *replace(const char* const matchString, const char* const replaceString,
+ const int start, const int end);
+
+ XMLCh *replace(const XMLCh* const matchString, const XMLCh* const
replaceString);
+ XMLCh *replace(const XMLCh* const matchString, const XMLCh* const
replaceString,
+ const int start, const int end);
+
private:
// -----------------------------------------------------------------------
// Private data types
@@ -225,8 +253,29 @@
const short direction);
/**
- * Converts a token tree into an operation tree
- */
+ * Tokenize helper
+ *
+ * This overloaded tokenize is for internal use only. It provides a way to
+ * keep track of the sub-expressions in each match of the pattern.
+ *
+ * It is called by the other tokenize methods, and by the replace method.
+ * The caller is responsible for the deletion of the returned
+ * RefArrayVectorOf<XMLCh>.
+ */
+ RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString,
+ const int start, const int end,
+ RefVectorOf<Match> *subEx);
+ /**
+ * Replace helpers
+ *
+ * Note: the caller owns the XMLCh* that is returned
+ */
+ const XMLCh *subInExp(const XMLCh* const repString,
+ const XMLCh* const origString,
+ const Match* subEx);
+ /**
+ * Converts a token tree into an operation tree
+ */
void compile(const Token* const token);
Op* compile(const Token* const token, Op* const next,
const bool reverse);
@@ -271,309 +320,309 @@
};
-// ---------------------------------------------------------------------------
-// RegularExpression: Cleanup methods
-// ---------------------------------------------------------------------------
-inline void RegularExpression::cleanUp() {
-
- delete [] fPattern;
- delete [] fFixedString;
- delete fContext;
- delete fBMPattern;
- delete fTokenFactory;
-}
-
-// ---------------------------------------------------------------------------
-// RegularExpression: Helper methods
-// ---------------------------------------------------------------------------
-inline bool RegularExpression::isSet(const int options, const int flag) {
-
- return (options & flag) == flag;
-}
-
-inline Op* RegularExpression::compileLook(const Token* const token,
- const Op* const next,
- const bool reverse,
- const unsigned short tokType) {
-
- Op* ret = 0;
- Op* result = compile(token->getChild(0), 0, reverse);
-
- switch(tokType) {
- case Token::T_LOOKAHEAD:
- ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result);
- break;
- case Token::T_NEGATIVELOOKAHEAD:
- ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result);
- break;
- case Token::T_LOOKBEHIND:
- ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result);
- break;
- case Token::T_NEGATIVELOOKBEHIND:
- ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result);
- break;
- case Token::T_INDEPENDENT:
- ret = fOpFactory.createIndependentOp(next, result);
- break;
- case Token::T_MODIFIERGROUP:
- ret = fOpFactory.createModifierOp(next, result,
- ((ModifierToken *) token)->getOptions(),
- ((ModifierToken *) token)->getOptionsMask());
- break;
- }
-
-
- return ret;
-}
-
-inline Op* RegularExpression::compileSingle(const Token* const token,
- Op* const next,
+ // ---------------------------------------------------------------------------
+ // RegularExpression: Cleanup methods
+ // ---------------------------------------------------------------------------
+ // Releases the members this object holds on the heap: the pattern and
+ // fixed-string arrays (array delete) and the context, BM pattern and token
+ // factory objects (scalar delete). The pointers are not reset to 0 here, so
+ // they must not be used or deleted again unless they are re-assigned first.
+ inline void RegularExpression::cleanUp() {
+
+ delete [] fPattern;
+ delete [] fFixedString;
+ delete fContext;
+ delete fBMPattern;
+ delete fTokenFactory;
+ }
+
+ // ---------------------------------------------------------------------------
+ // RegularExpression: Helper methods
+ // ---------------------------------------------------------------------------
+ // Tells whether every bit of 'flag' is also present in 'options'.
+ // A multi-bit 'flag' therefore only counts as set when all of its bits are.
+ inline bool RegularExpression::isSet(const int options, const int flag) {
+
+     const int sharedBits = options & flag;
+     return sharedBits == flag;
+ }
+
+ inline Op* RegularExpression::compileLook(const Token* const token,
+ const Op* const next,
+ const bool reverse,
const unsigned short tokType) {
- Op* ret = 0;
-
- switch (tokType) {
- case Token::T_DOT:
- ret = fOpFactory.createDotOp();
- break;
- case Token::T_CHAR:
- ret = fOpFactory.createCharOp(token->getChar());
- break;
- case Token::T_ANCHOR:
- ret = fOpFactory.createAnchorOp(token->getChar());
- break;
- case Token::T_RANGE:
- case Token::T_NRANGE:
- ret = fOpFactory.createRangeOp(token);
- break;
- case Token::T_EMPTY:
- ret = next;
- break;
- case Token::T_STRING:
- ret = fOpFactory.createStringOp(token->getString());
- break;
- case Token::T_BACKREFERENCE:
- ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
- break;
- }
-
- if (tokType != Token::T_EMPTY)
- ret->setNextOp(next);
-
- return ret;
-}
-
+ Op* ret = 0;
+ Op* result = compile(token->getChild(0), 0, reverse);
-inline Op* RegularExpression::compileUnion(const Token* const token,
- Op* const next,
- const bool reverse) {
+ switch(tokType) {
+ case Token::T_LOOKAHEAD:
+ ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result);
+ break;
+ case Token::T_NEGATIVELOOKAHEAD:
+ ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result);
+ break;
+ case Token::T_LOOKBEHIND:
+ ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result);
+ break;
+ case Token::T_NEGATIVELOOKBEHIND:
+ ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result);
+ break;
+ case Token::T_INDEPENDENT:
+ ret = fOpFactory.createIndependentOp(next, result);
+ break;
+ case Token::T_MODIFIERGROUP:
+ ret = fOpFactory.createModifierOp(next, result,
+ ((ModifierToken *) token)->getOptions(),
+ ((ModifierToken *) token)->getOptionsMask());
+ break;
+ }
+
+
+ return ret;
+ }
+
+ inline Op* RegularExpression::compileSingle(const Token* const token,
+ Op* const next,
+ const unsigned short tokType) {
+
+ Op* ret = 0;
+
+ switch (tokType) {
+ case Token::T_DOT:
+ ret = fOpFactory.createDotOp();
+ break;
+ case Token::T_CHAR:
+ ret = fOpFactory.createCharOp(token->getChar());
+ break;
+ case Token::T_ANCHOR:
+ ret = fOpFactory.createAnchorOp(token->getChar());
+ break;
+ case Token::T_RANGE:
+ case Token::T_NRANGE:
+ ret = fOpFactory.createRangeOp(token);
+ break;
+ case Token::T_EMPTY:
+ ret = next;
+ break;
+ case Token::T_STRING:
+ ret = fOpFactory.createStringOp(token->getString());
+ break;
+ case Token::T_BACKREFERENCE:
+ ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
+ break;
+ }
- int tokSize = token->size();
- UnionOp* uniOp = fOpFactory.createUnionOp(tokSize);
+ if (tokType != Token::T_EMPTY)
+ ret->setNextOp(next);
- for (int i=0; i<tokSize; i++) {
+ return ret;
+ }
- uniOp->addElement(compile(token->getChild(i), next, reverse));
- }
- return uniOp;
-}
+ inline Op* RegularExpression::compileUnion(const Token* const token,
+ Op* const next,
+ const bool reverse) {
+ int tokSize = token->size();
+ UnionOp* uniOp = fOpFactory.createUnionOp(tokSize);
-inline Op* RegularExpression::compileCondition(const Token* const token,
- Op* const next,
- const bool reverse) {
+ for (int i=0; i<tokSize; i++) {
- Token* condTok = ((ConditionToken*) token)->getConditionToken();
- Token* yesTok = token->getChild(0);
- Token* noTok = token->getChild(1);
- int refNo = token->getReferenceNo();
- Op* condOp = (condTok == 0) ? 0 : compile(condTok, 0, reverse);
- Op* yesOp = compile(yesTok, next, reverse);
- Op* noOp = (noTok == 0) ? 0 : compile(noTok, next, reverse);
+ uniOp->addElement(compile(token->getChild(i), next, reverse));
+ }
- return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp);
-}
+ return uniOp;
+ }
-inline Op* RegularExpression::compileParenthesis(const Token* const token,
+ inline Op* RegularExpression::compileCondition(const Token* const token,
Op* const next,
const bool reverse) {
- if (token->getNoParen() == 0)
- return compile(token->getChild(0), next, reverse);
+ Token* condTok = ((ConditionToken*) token)->getConditionToken();
+ Token* yesTok = token->getChild(0);
+ Token* noTok = token->getChild(1);
+ int refNo = token->getReferenceNo();
+ Op* condOp = (condTok == 0) ? 0 : compile(condTok, 0, reverse);
+ Op* yesOp = compile(yesTok, next, reverse);
+ Op* noOp = (noTok == 0) ? 0 : compile(noTok, next, reverse);
- Op* captureOp = 0;
+ return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp);
+ }
- if (reverse) {
- captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next);
- captureOp = compile(token->getChild(0), captureOp, reverse);
+ inline Op* RegularExpression::compileParenthesis(const Token* const token,
+ Op* const next,
+ const bool reverse) {
- return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp);
- }
+ if (token->getNoParen() == 0)
+ return compile(token->getChild(0), next, reverse);
- captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next);
- captureOp = compile(token->getChild(0), captureOp, reverse);
+ Op* captureOp = 0;
- return fOpFactory.createCaptureOp(token->getNoParen(), captureOp);
-}
+ if (reverse) {
-inline Op* RegularExpression::compileConcat(const Token* const token,
- Op* const next,
- const bool reverse) {
-
- Op* ret = next;
- int tokSize = token->size();
-
- if (!reverse) {
-
- for (int i= tokSize - 1; i>=0; i--) {
- ret = compile(token->getChild(i), ret, false);
- }
- }
- else {
-
- for (int i= 0; i< tokSize; i++) {
- ret = compile(token->getChild(i), ret, true);
- }
- }
+ captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next);
+ captureOp = compile(token->getChild(0), captureOp, reverse);
- return ret;
-}
+ return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp);
+ }
-inline Op* RegularExpression::compileClosure(const Token* const token,
- Op* const next,
- const bool reverse,
- const unsigned short tokType) {
+ captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next);
+ captureOp = compile(token->getChild(0), captureOp, reverse);
- Op* ret = 0;
- Token* childTok = token->getChild(0);
- int min = token->getMin();
- int max = token->getMax();
+ return fOpFactory.createCaptureOp(token->getNoParen(), captureOp);
+ }
- if (min >= 0 && min == max) {
+ inline Op* RegularExpression::compileConcat(const Token* const token,
+ Op* const next,
+ const bool reverse) {
- ret = next;
- for (int i=0; i< min; i++) {
- ret = compile(childTok, ret, reverse);
- }
+ Op* ret = next;
+ int tokSize = token->size();
- return ret;
- }
+ if (!reverse) {
- if (min > 0 && max > 0)
- max -= min;
+ for (int i= tokSize - 1; i>=0; i--) {
+ ret = compile(token->getChild(i), ret, false);
+ }
+ }
+ else {
- if (max > 0) {
+ for (int i= 0; i< tokSize; i++) {
+ ret = compile(token->getChild(i), ret, true);
+ }
+ }
- ret = next;
- for (int i=0; i<max; i++) {
+ return ret;
+ }
- ChildOp* childOp = fOpFactory.createQuestionOp(
- tokType == Token::T_NONGREEDYCLOSURE);
-
- childOp->setNextOp(next);
- childOp->setChild(compile(childTok, ret, reverse));
- ret = childOp;
- }
- }
- else {
-
- ChildOp* childOp = 0;
-
- if (tokType == Token::T_NONGREEDYCLOSURE) {
- childOp = fOpFactory.createNonGreedyClosureOp();
- }
- else {
-
- if (childTok->getMinLength() == 0)
- childOp = fOpFactory.createClosureOp(fNoClosures++);
- else
- childOp = fOpFactory.createClosureOp(-1);
- }
-
- childOp->setNextOp(next);
- childOp->setChild(compile(childTok, childOp, reverse));
- ret = childOp;
- }
-
- if (min > 0) {
-
- for (int i=0; i< min; i++) {
- ret = compile(childTok, ret, reverse);
- }
- }
-
- return ret;
-}
-
-inline int RegularExpression::matchUnion(Context* const context,
- const Op* const op, int offset,
- const short direction)
-{
- unsigned int opSize = op->getSize();
- int ret = -1;
-
- for (unsigned int i=0; i < opSize; i++) {
-
- ret = match(context, op->elementAt(i), offset, direction);
-
- if (ret == context->fLimit)
- return ret;
- }
-
- return -1;
-}
-
-inline int RegularExpression::matchModifier(Context* const context,
- const Op* const op, int offset,
- const short direction)
-{
- int saveOptions = fOptions;
- fOptions |= (int) op->getData();
- fOptions &= (int) ~op->getData2();
-
- int ret = match(context, op->getChild(), offset, direction);
-
- fOptions = saveOptions;
-
- return ret;
-}
-
-inline unsigned short RegularExpression::getWordType(const XMLCh* const target
- , const int begin
- , const int end
- , const int offset)
-{
- if (offset < begin || offset >= end)
- return WT_OTHER;
-
- return getCharType(target[offset]);
-}
-
-inline
-unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target
- , const int start
- , const int end
- , int offset)
-{
- unsigned short ret = getWordType(target, start, end, --offset);
+ inline Op* RegularExpression::compileClosure(const Token* const token,
+ Op* const next,
+ const bool reverse,
+ const unsigned short tokType) {
- while (ret == WT_IGNORE) {
- ret = getWordType(target, start, end, --offset);
- }
+ Op* ret = 0;
+ Token* childTok = token->getChild(0);
+ int min = token->getMin();
+ int max = token->getMax();
+
+ if (min >= 0 && min == max) {
+
+ ret = next;
+ for (int i=0; i< min; i++) {
+ ret = compile(childTok, ret, reverse);
+ }
+
+ return ret;
+ }
+
+ if (min > 0 && max > 0)
+ max -= min;
+
+ if (max > 0) {
+
+ ret = next;
+ for (int i=0; i<max; i++) {
+
+ ChildOp* childOp = fOpFactory.createQuestionOp(
+ tokType == Token::T_NONGREEDYCLOSURE);
+
+ childOp->setNextOp(next);
+ childOp->setChild(compile(childTok, ret, reverse));
+ ret = childOp;
+ }
+ }
+ else {
+
+ ChildOp* childOp = 0;
+
+ if (tokType == Token::T_NONGREEDYCLOSURE) {
+ childOp = fOpFactory.createNonGreedyClosureOp();
+ }
+ else {
+
+ if (childTok->getMinLength() == 0)
+ childOp = fOpFactory.createClosureOp(fNoClosures++);
+ else
+ childOp = fOpFactory.createClosureOp(-1);
+ }
+
+ childOp->setNextOp(next);
+ childOp->setChild(compile(childTok, childOp, reverse));
+ ret = childOp;
+ }
+
+ if (min > 0) {
+
+ for (int i=0; i< min; i++) {
+ ret = compile(childTok, ret, reverse);
+ }
+ }
+
+ return ret;
+ }
+
+ // Tries the alternatives of a union operation in order, each from the same
+ // 'offset'. The first alternative for which match() returns exactly
+ // context->fLimit wins and that value is returned; any other result is
+ // discarded. Returns -1 when no alternative qualifies.
+ inline int RegularExpression::matchUnion(Context* const context,
+                                          const Op* const op, int offset,
+                                          const short direction)
+ {
+     const unsigned int branchCount = op->getSize();
+     unsigned int branch = 0;
+
+     while (branch < branchCount) {
+
+         const int reached = match(context, op->elementAt(branch), offset, direction);
+
+         if (reached == context->fLimit)
+             return reached;
+
+         ++branch;
+     }
+
+     return -1;
+ }
+
+ // Matches the child of a modifier operation with temporarily altered options:
+ // the bits from op->getData() are switched on, the bits from op->getData2()
+ // are switched off, the child is matched, and the previous fOptions value is
+ // put back before returning the child's result.
+ // Note: fOptions is a member, so the change is visible to anything reading
+ // it while the child match is running.
+ inline int RegularExpression::matchModifier(Context* const context,
+                                             const Op* const op, int offset,
+                                             const short direction)
+ {
+     const int optionsBefore = fOptions;
+
+     fOptions |= (int) op->getData();      // options to add
+     fOptions &= (int) ~op->getData2();    // options to remove
+
+     const int childResult = match(context, op->getChild(), offset, direction);
+
+     fOptions = optionsBefore;
+     return childResult;
+ }
+
+ // Classifies the character at 'offset' via getCharType(). Offsets outside
+ // the half-open range [begin, end) are reported as WT_OTHER without touching
+ // 'target'.
+ inline unsigned short RegularExpression::getWordType(const XMLCh* const target
+                                                      , const int begin
+                                                      , const int end
+                                                      , const int offset)
+ {
+     if (begin <= offset && offset < end)
+         return getCharType(target[offset]);
+
+     return WT_OTHER;
+ }
+
+ inline
+ unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target
+ , const int start
+ , const int end
+ , int offset)
+ {
+ unsigned short ret = getWordType(target, start, end, --offset);
+
+ while (ret == WT_IGNORE) {
+ ret = getWordType(target, start, end, --offset);
+ }
- return ret;
-}
+ return ret;
+ }
-inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
+ // Case-insensitive comparison of two code points.
+ //
+ // The values are copied into real, null-terminated XMLCh buffers before
+ // calling XMLString::compareNIString. Reinterpreting the address of an
+ // XMLInt32 as an XMLCh* reads whichever bytes come first in memory, so the
+ // result would depend on byte order and on the width of XMLCh.
+ // Code points above 0xFFFF do not fit into the single-unit buffers used here
+ // (assuming XMLCh holds one UTF-16 code unit) and only match when identical.
+ inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
const XMLInt32 ch2)
{
- //REVISIT - for now we will return a case sensitive match
- return (ch1 == ch2);
+     // Identical values match regardless of case.
+     if (ch1 == ch2)
+         return true;
+
+     // Truncating a larger value could make it compare equal to an
+     // unrelated character, so refuse instead.
+     if (ch1 > 0xFFFF || ch2 > 0xFFFF)
+         return false;
+
+     const XMLCh str1[2] = { (XMLCh) ch1, chNull };
+     const XMLCh str2[2] = { (XMLCh) ch2, chNull };
+
+     return (0==XMLString::compareNIString(str1, str2, 1));
}
+
XERCES_CPP_NAMESPACE_END
1.5 +334 -3 xml-xerces/c/src/xercesc/util/regx/RegularExpression.cpp
Index: RegularExpression.cpp
===================================================================
RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/RegularExpression.cpp,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- RegularExpression.cpp 4 Nov 2002 15:17:00 -0000 1.4
+++ RegularExpression.cpp 18 Dec 2002 13:01:02 -0000 1.5
@@ -56,6 +56,9 @@
/*
* $Log$
+ * Revision 1.5 2002/12/18 13:01:02 gareth
+ * New functionality - tokenize and replace. Fixed REVISIT for case insensitive
match. Patch by Jennifer Schachter.
+ *
* Revision 1.4 2002/11/04 15:17:00 tng
* C++ Namespace Support.
*
@@ -104,7 +107,6 @@
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/regx/RegularExpression.hpp>
-#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/regx/RegxUtil.hpp>
#include <xercesc/util/regx/Match.hpp>
@@ -114,6 +116,7 @@
#include <xercesc/util/regx/ParserForXMLSchema.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/ParseException.hpp>
+#include <xercesc/framework/XMLBuffer.hpp>
XERCES_CPP_NAMESPACE_BEGIN
@@ -620,6 +623,253 @@
}
// ---------------------------------------------------------------------------
+// RegularExpression: Tokenize methods
+// ---------------------------------------------------------------------------
+// Narrow-character convenience overload: transcodes 'expression' to XMLCh and
+// tokenizes the whole transcoded string. The temporary wide copy is released
+// by the janitor on return; the caller owns the returned vector.
+RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression) {
+
+    XMLCh* const wideExpr = XMLString::transcode(expression);
+    ArrayJanitor<XMLCh> wideExprGuard(wideExpr);
+
+    const int wideEnd = XMLString::stringLen(wideExpr);
+    return tokenize(wideExpr, 0, wideEnd);
+}
+
+// Narrow-character convenience overload with an explicit range: transcodes
+// 'expression' to XMLCh and forwards 'start' and 'end' unchanged to the wide
+// overload. The temporary wide copy is released by the janitor on return; the
+// caller owns the returned vector.
+RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression,
+                                                     const int start, const int end) {
+
+    XMLCh* const wideExpr = XMLString::transcode(expression);
+    ArrayJanitor<XMLCh> wideExprGuard(wideExpr);
+
+    return tokenize(wideExpr, start, end);
+}
+
+
+
+// ---------------------------------------------------------------------------
+// RegularExpression: Tokenize methods - Wide char version
+// ---------------------------------------------------------------------------
+// Tokenizes the complete string: range [0, length of expression], with no
+// sub-expression tracking. The caller owns the returned vector.
+RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression)
+{
+    const int wholeLength = XMLString::stringLen(expression);
+    return tokenize(expression, 0, wholeLength, 0);
+}
+
+// Tokenizes the given range of 'expression' with no sub-expression tracking
+// (a null Match vector is handed to the internal helper). The caller owns the
+// returned vector.
+RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression,
+                                                     const int start, const int end)
+{
+    RefVectorOf<Match>* const noSubExpressions = 0;
+    return tokenize(expression, start, end, noSubExpressions);
+}
+
+// Internal tokenize helper.
+//
+// Splits 'expression' at every place the pattern matches, scanning match
+// start positions from 'start' up to and including 'end'. The text between
+// consecutive matches becomes a token; the last token runs to the end of the
+// string. When the pattern is the empty string, zero-length tokens and the
+// trailing token are not added to the result.
+//
+// If 'subEx' is non-null, one Match per pattern match is appended to it
+// (the vector receives its own copy), so callers such as replace() can look
+// up sub-expression positions.
+//
+// The caller owns the returned vector.
+RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression,
+                                                     const int start, const int end,
+                                                     RefVectorOf<Match> *subEx){
+
+    if (fOperations == 0)
+        prepare();
+
+    RefArrayVectorOf<XMLCh>* tokenStack = new RefArrayVectorOf<XMLCh>(16, true);
+
+    Context* context = 0;
+    Context* tmpContext = 0;
+
+    int strLength = XMLString::stringLen(expression);
+
+    {
+        XMLMutexLock lockInit(&fMutex);
+
+        if (fContext == 0)
+            fContext = new Context();
+
+        // Use a private context when the shared one is busy.
+        if (fContext->fInUse) {
+            context = new Context();
+            tmpContext = context;
+        }
+        else {
+            context = fContext;
+        }
+
+        context->reset(expression, start, end, fNoClosures);
+    }
+
+    Janitor<Context> janContext(tmpContext);
+
+    // The working Match is always handed to the context as an adopted object,
+    // so it is released through the same fAdoptMatch mechanism used below
+    // instead of being leaked when nothing matches or after the last match.
+    Match* lMatch = 0;
+    bool adoptMatch = false;
+
+    if (subEx || fHasBackReferences) {
+        lMatch = new Match();
+        lMatch->setNoGroups(fNoGroups);
+        adoptMatch = true;
+    }
+
+    if (context->fAdoptMatch)
+        delete context->fMatch;
+
+    context->fMatch = lMatch;
+    context->fAdoptMatch = adoptMatch;
+
+    // fPattern does not change while tokenizing, so test it only once.
+    const bool emptyPattern = XMLString::equals(fPattern, &chNull);
+
+    int tokStart = start;
+    int matchStart = start;
+
+    for (; matchStart <= end; matchStart++) {
+
+        int matchEnd = match(context, fOperations, matchStart, 1);
+
+        if (matchEnd != -1) {
+
+            // NOTE(review): group 0 is recorded as starting at
+            // context->fStart rather than at matchStart - confirm intended.
+            if (context->fMatch != 0) {
+                context->fMatch->setStartPos(0, context->fStart);
+                context->fMatch->setEndPos(0, matchEnd);
+            }
+
+            // Give the caller its own snapshot; the context keeps (and owns)
+            // the working Match.
+            if (subEx){
+                subEx->addElement(new Match(*(context->fMatch)));
+            }
+
+            XMLCh* token;
+            if (tokStart == matchStart){
+
+                // A match at the very end of the string: step back so the
+                // code after the loop emits a single empty trailing token.
+                if (tokStart == strLength){
+                    tokStart--;
+                    break;
+                }
+
+                token = new XMLCh[1];
+                token[0] = chNull;
+
+                // When you tokenize using zero string, will return each
+                // token in the string. Since the zero string will also
+                // match the start/end characters, resulting in empty
+                // tokens, we ignore them and do not add them to the stack.
+                if (!emptyPattern)
+                    tokenStack->addElement(token);
+                else
+                    delete [] token;    // not adopted by the vector
+
+            } else {
+                token = new XMLCh[matchStart + 1 - tokStart];
+                XMLString::subString(token, expression, tokStart, matchStart);
+                tokenStack->addElement(token);
+            }
+
+            tokStart = matchEnd;
+
+            //decrement matchStart as will increment it at the top of the loop
+            if (matchStart < matchEnd - 1)
+                matchStart = matchEnd - 1;
+        }
+    }
+
+    XMLCh* token;
+
+    if (matchStart == tokStart + 1){
+        token = new XMLCh[1];
+        token[0] = chNull;
+
+    } else {
+        token = new XMLCh[strLength + 1 - tokStart];
+        XMLString::subString(token, expression, tokStart, strLength);
+    }
+
+    if (!emptyPattern)
+        tokenStack->addElement(token);
+    else
+        delete [] token;    // not adopted by the vector
+
+    // Release the context only once we are completely done with it, and do
+    // so even when the pattern never matched.
+    context->fInUse = false;
+
+    return tokenStack;
+
+}
+
+
+// -----------------------------------------------------------------------
+// RegularExpression: Replace methods
+// -----------------------------------------------------------------------
+// Narrow-character convenience overload: transcodes both arguments to XMLCh
+// and replaces over the whole transcoded match string. Both temporary wide
+// copies are released by their janitors; the caller owns the returned string.
+XMLCh* RegularExpression::replace(const char* const matchString,
+                                  const char* const replaceString){
+
+    XMLCh* const wideMatch = XMLString::transcode(matchString);
+    ArrayJanitor<XMLCh> wideMatchGuard(wideMatch);
+
+    XMLCh* const wideReplacement = XMLString::transcode(replaceString);
+    ArrayJanitor<XMLCh> wideReplacementGuard(wideReplacement);
+
+    const int wideEnd = XMLString::stringLen(wideMatch);
+    return replace(wideMatch, wideReplacement, 0, wideEnd);
+}
+
+// Narrow-character convenience overload with an explicit range: transcodes
+// both arguments to XMLCh and forwards 'start' and 'end' unchanged. Both
+// temporary wide copies are released by their janitors; the caller owns the
+// returned string.
+XMLCh* RegularExpression::replace(const char* const matchString,
+                                  const char* const replaceString,
+                                  const int start, const int end){
+
+    XMLCh* const wideMatch = XMLString::transcode(matchString);
+    ArrayJanitor<XMLCh> wideMatchGuard(wideMatch);
+
+    XMLCh* const wideReplacement = XMLString::transcode(replaceString);
+    ArrayJanitor<XMLCh> wideReplacementGuard(wideReplacement);
+
+    return replace(wideMatch, wideReplacement, start, end);
+}
+
+
+// ---------------------------------------------------------------------------
+// RegularExpression: Replace methods - Wide char version
+// ---------------------------------------------------------------------------
+// Replaces over the complete match string: range [0, length of matchString].
+// The caller owns the returned string.
+XMLCh* RegularExpression::replace(const XMLCh* const matchString,
+                                  const XMLCh* const replaceString){
+
+    const int wholeLength = XMLString::stringLen(matchString);
+    return replace(matchString, replaceString, 0, wholeLength);
+}
+
+// Returns a newly allocated copy of 'matchString' in which every match of
+// the pattern is replaced by 'replaceString'. If the pattern has
+// parenthesized sub-expressions, the replacement text is expanded for each
+// match by subInExp().
+//
+// Throws RuntimeException (Regex_RepPatMatchesZeroString) if the pattern
+// matches the empty string; subInExp() may throw for an invalid replacement
+// string.
+//
+// The caller owns the returned string.
+XMLCh* RegularExpression::replace(const XMLCh* const matchString,
+                                  const XMLCh* const replaceString,
+                                  const int start, const int end)
+{
+    //check if matches zero length string - throw error if so
+    if (matches(XMLUni::fgZeroLenString)){
+        ThrowXML(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString);
+    }
+
+    RefVectorOf<Match> *subEx = new RefVectorOf<Match>(10, true);
+    Janitor<RefVectorOf<Match> > janSubEx(subEx);
+
+    //Call to tokenize with Match vector so that we keep track of the locations
+    //of the subExpression within each of the matches
+    RefArrayVectorOf<XMLCh>* tokenStack = tokenize(matchString, start, end, subEx);
+    Janitor<RefArrayVectorOf<XMLCh> > janTokStack(tokenStack);
+
+    XMLBuffer result;
+
+    int numSubEx = 0;
+
+    if (subEx->size() > 0)
+        numSubEx = subEx->elementAt(0)->getNoGroups() - 1;
+
+    int tokStackSize = tokenStack->size();
+
+    for (int i = 0; i < tokStackSize; i++){
+
+        result.append(tokenStack->elementAt(i));
+
+        // A replacement goes between tokens, not after the last one.
+        if (i != tokStackSize - 1) {
+
+            if (numSubEx != 0) {
+
+                //if there are subExpressions, then determine the string we
+                //want to substitute in. subInExp() hands back a buffer we
+                //own, so release it once it has been appended.
+                const XMLCh* expanded = subInExp(replaceString, matchString, subEx->elementAt(i));
+                ArrayJanitor<XMLCh> janExpanded((XMLCh*) expanded);
+
+                result.append(expanded);
+            }
+            else {
+                // No sub-expressions: the replacement text is used as is,
+                // so no private copy is needed.
+                result.append(replaceString);
+            }
+        }
+    }
+
+    return XMLString::replicate(result.getRawBuffer());
+
+}
+
+// ---------------------------------------------------------------------------
// RegularExpression: Helpers methods
// ---------------------------------------------------------------------------
int RegularExpression::getOptionValue(const XMLCh ch) {
@@ -665,6 +915,7 @@
return ret;
}
+
int RegularExpression::match(Context* const context, const Op* const operations
, int offset, const short
direction)
{
@@ -815,7 +1066,6 @@
return offset;
}
-
bool RegularExpression::matchChar(Context* const context,
const XMLInt32 ch,
int& offset,
const short
direction, const bool ignoreCase)
@@ -832,7 +1082,6 @@
bool match = ignoreCase ? matchIgnoreCase(ch, strCh)
: (ch == strCh);
-
if (!match)
return false;
@@ -1183,6 +1432,87 @@
}
/*
+ * Helper for Replace. This method prepares the replacement string by substituting
+ * in actual values for parenthesized sub expressions.
+ *
+ * An error will be thrown if:
+ * 1) repString references an undefined subExpression
+ * 2) there is an unescaped chDollar which is not followed by a digit
+ *
+ */
+// Builds the replacement text for one match.
+//
+//   repString   replacement pattern; "$d" (d a single digit) is replaced by
+//               the text of sub-expression d, "\$" yields a literal '$' and
+//               "\\" yields a literal '\'
+//   origString  the string the match positions in 'subEx' refer to
+//   subEx       positions of the sub-expressions for this match
+//
+// Throws RuntimeException (Regex_InvalidRepPattern) when an unescaped '$' is
+// not followed by a digit, when the digit exceeds the number of
+// sub-expressions, when a backslash is followed by anything other than '$'
+// or '\', or when the string ends with a lone backslash.
+//
+// The caller owns the returned string.
+const XMLCh* RegularExpression::subInExp(const XMLCh* const repString,
+                                         const XMLCh* const origString,
+                                         const Match* subEx){
+
+    int numSubExp = subEx->getNoGroups() - 1;
+
+    if (numSubExp == 0)
+        return XMLString::replicate(repString);
+
+    bool notEscaped = true;
+
+    XMLBuffer newString;
+
+    // Holds the string form of a single digit; a local array needs no
+    // clean-up when an exception is thrown.
+    XMLCh indexStr[2];
+    indexStr[1] = chNull;
+
+    for (const XMLCh* ptr = repString; *ptr != chNull; ptr++){
+
+        if ((*ptr == chDollarSign) && notEscaped) {
+
+            ptr++;
+
+            //check that after the $ is a digit (also rejects end of string)
+            if (!XMLString::isDigit(*ptr)){
+
+                //invalid replace string - $ must be followed by a digit
+                ThrowXML(RuntimeException, XMLExcepts::Regex_InvalidRepPattern);
+            }
+
+            indexStr[0] = *ptr;                          //get the digit
+            int index = XMLString::parseInt(indexStr);   //convert it to an int
+
+            //now check that the index is legal
+            if (index > numSubExp){
+                ThrowXML(RuntimeException, XMLExcepts::Regex_InvalidRepPattern);
+            }
+
+            int start = subEx->getStartPos(index);
+            int end = subEx->getEndPos(index);
+
+            //now copy the substring into the new string
+            for (int i=start; i<end; i++){
+                newString.append(origString[i]);
+            }
+
+        } else {
+
+            //if you have a backslash and then a character that's not a $ or
+            //a backslash, then it's an invalid replace string
+            if (!notEscaped && (*ptr != chDollarSign && *ptr != chBackSlash)){
+                ThrowXML(RuntimeException, XMLExcepts::Regex_InvalidRepPattern);
+            }
+
+            // Only an unescaped backslash starts an escape. An escaped one
+            // falls through and is copied as a literal character.
+            if (*ptr == chBackSlash && notEscaped){
+                notEscaped = false;
+                continue;
+            }
+
+            notEscaped = true;
+            newString.append(*ptr);
+        }
+    }
+
+    // A backslash at the very end escapes nothing.
+    if (!notEscaped){
+        ThrowXML(RuntimeException, XMLExcepts::Regex_InvalidRepPattern);
+    }
+
+    return XMLString::replicate(newString.getRawBuffer());
+
+}
+
+
+/*
* Prepares for matching. This method is called just before starting matching
*/
void RegularExpression::prepare() {
@@ -1312,6 +1642,7 @@
return WT_OTHER;
}
+
XERCES_CPP_NAMESPACE_END
1.3 +32 -0 xml-xerces/c/src/xercesc/util/regx/Match.cpp
Index: Match.cpp
===================================================================
RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/Match.cpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Match.cpp 4 Nov 2002 15:17:00 -0000 1.2
+++ Match.cpp 18 Dec 2002 13:01:02 -0000 1.3
@@ -56,6 +56,9 @@
/*
* $Log$
+ * Revision 1.3 2002/12/18 13:01:02 gareth
+ * New functionality - tokenize and replace. Fixed REVISIT for case insensitive
match. Patch by Jennifer Schachter.
+ *
* Revision 1.2 2002/11/04 15:17:00 tng
* C++ Namespace Support.
*
@@ -87,6 +90,20 @@
}
+// Copy constructor: starts from an empty state (zero groups, zero positions
+// size, null position arrays) and then lets initialize() set the group count
+// and copy the start/end position of every group from 'toCopy'.
+Match::Match(const Match& toCopy) : fNoGroups(0),
+ fPositionsSize(0),
+ fStartPositions(0),
+ fEndPositions(0){
+ initialize(toCopy);
+}
+
+// Copy assignment: takes over the group count and the start/end position of
+// every group from 'toAssign' via initialize(). Assigning an object to
+// itself is a no-op.
+Match& Match::operator=(const Match& toAssign){
+
+    if (this != &toAssign)
+        initialize(toAssign);
+
+    return *this;
+}
+
+
Match::~Match() {
cleanUp();
@@ -117,6 +134,21 @@
// ---------------------------------------------------------------------------
// Match: private helpers methods
// ---------------------------------------------------------------------------
+// Makes this Match describe the same groups as 'toCopy': adopts its group
+// count through setNoGroups() and then copies the start and end position of
+// each group. fPositionsSize is deliberately not copied, as it is irrelevant
+// to the state of the Match.
+void Match::initialize(const Match &toCopy){
+
+    const int groupCount = toCopy.getNoGroups();
+    setNoGroups(groupCount);
+
+    int group = 0;
+    while (group < groupCount){
+        setStartPos(group, toCopy.getStartPos(group));
+        setEndPos(group, toCopy.getEndPos(group));
+        ++group;
+    }
+
+}
+
void Match::cleanUp() {
delete [] fStartPositions;
1.3 +19 -11 xml-xerces/c/src/xercesc/util/regx/Match.hpp
Index: Match.hpp
===================================================================
RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/regx/Match.hpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Match.hpp 4 Nov 2002 15:17:00 -0000 1.2
+++ Match.hpp 18 Dec 2002 13:01:02 -0000 1.3
@@ -76,10 +76,17 @@
public:
// -----------------------------------------------------------------------
- // Public Constructors and Destructor
- // -----------------------------------------------------------------------
+ // Public Constructors and Destructor
+ // -----------------------------------------------------------------------
Match();
- ~Match();
+
+ /**
+ * Copy constructor
+ */
+ Match(const Match& toCopy);
+ Match& operator=(const Match& toAssign);
+
+ virtual ~Match();
// -----------------------------------------------------------------------
// Getter functions
@@ -99,16 +106,17 @@
// -----------------------------------------------------------------------
// Initialize/Clean up methods
// -----------------------------------------------------------------------
+ void initialize(const Match& toCopy);
void cleanUp();
// -----------------------------------------------------------------------
- // Private data members
- //
- // fNoGroups
- // Represents no of regular expression groups
+ // Private data members
+ //
+ // fNoGroups
+ // Represents no of regular expression groups
//
- // fStartPositions
- // Array of start positions in the target text matched to specific
+ // fStartPositions
+ // Array of start positions in the target text matched to specific
// regular expression group
//
// fEndPositions
@@ -117,7 +125,7 @@
//
// fPositionsSize
// Actual size of Start/EndPositions array.
- // -----------------------------------------------------------------------
+ // -----------------------------------------------------------------------
int fNoGroups;
int fPositionsSize;
int* fStartPositions;
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]