peiyongz 2003/03/15 22:00:44 Modified: c/src/xercesc/framework XMLFormatter.hpp XMLFormatter.cpp Log: Bug#17983 Formatter does not escape control characters Revision Changes Path 1.11 +23 -4 xml-xerces/c/src/xercesc/framework/XMLFormatter.hpp Index: XMLFormatter.hpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/framework/XMLFormatter.hpp,v retrieving revision 1.10 retrieving revision 1.11 diff -u -r1.10 -r1.11 --- XMLFormatter.hpp 11 Mar 2003 12:58:36 -0000 1.10 +++ XMLFormatter.hpp 16 Mar 2003 06:00:43 -0000 1.11 @@ -56,6 +56,9 @@ /* * $Log$ + * Revision 1.11 2003/03/16 06:00:43 peiyongz + * Bug#17983 Formatter does not escape control characters + * * Revision 1.10 2003/03/11 12:58:36 tng * Fix compilation error on AIX. * @@ -273,6 +276,7 @@ XMLFormatter ( const XMLCh* const outEncoding + , const XMLCh* const docVersion , XMLFormatTarget* const target , const EscapeFlags escapeFlags = NoEscapes , const UnRepFlags unrepFlags = UnRep_Fail @@ -281,6 +285,7 @@ XMLFormatter ( const char* const outEncoding + , const char* const docVersion , XMLFormatTarget* const target , const EscapeFlags escapeFlags = NoEscapes , const UnRepFlags unrepFlags = UnRep_Fail @@ -411,9 +416,15 @@ XMLByte * ref, const XMLCh * stdRef); - unsigned int handleUnEscapedChars(const XMLCh * srcPtr, - const unsigned int count, - const UnRepFlags unrepFlags); + const void writeCharRef(const XMLCh &toWrite); + + bool inEscapeList(const XMLFormatter::EscapeFlags escStyle + , const XMLCh toCheck); + + + unsigned int handleUnEscapedChars(const XMLCh * srcPtr, + const unsigned int count, + const UnRepFlags unrepFlags); void specialFormat ( @@ -458,6 +469,11 @@ // These are character refs for the standard char refs, in the // output encoding. They are faulted in as required, by transcoding // them from fixed Unicode versions. + // + // fIsXML11 + // for performance reason, we do not store the actual version string + // and do the string comparison again and again. + // // ----------------------------------------------------------------------- EscapeFlags fEscapeFlags; XMLCh* fOutEncoding; @@ -476,6 +492,9 @@ unsigned int fLTLen; XMLByte* fQuoteRef; unsigned int fQuoteLen; + + bool fIsXML11; + }; @@ -493,7 +512,7 @@ // ----------------------------------------------------------------------- virtual void writeChars ( - const XMLByte* const toWrite + const XMLByte* const toWrite , const unsigned int count , XMLFormatter* const formatter ) = 0; 1.9 +110 -53 xml-xerces/c/src/xercesc/framework/XMLFormatter.cpp Index: XMLFormatter.cpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/framework/XMLFormatter.cpp,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- XMLFormatter.cpp 7 Mar 2003 21:42:37 -0000 1.8 +++ XMLFormatter.cpp 16 Mar 2003 06:00:43 -0000 1.9 @@ -69,6 +69,9 @@ #include <xercesc/util/TranscodingException.hpp> #include <xercesc/util/XMLExceptMsgs.hpp> #include <xercesc/framework/XMLFormatter.hpp> +#include <xercesc/util/Janitor.hpp> +#include <xercesc/util/XMLChar.hpp> + #include <string.h> XERCES_CPP_NAMESPACE_BEGIN @@ -124,8 +127,8 @@ // --------------------------------------------------------------------------- // Local methods // --------------------------------------------------------------------------- -static inline bool inEscapeList(const XMLFormatter::EscapeFlags escStyle - , const XMLCh toCheck) +bool XMLFormatter::inEscapeList(const XMLFormatter::EscapeFlags escStyle + , const XMLCh toCheck) { const XMLCh* escList = gEscapeChars[escStyle]; while (*escList) @@ -133,7 +136,42 @@ if (*escList++ == toCheck) return true; } - return false; + + /*** + * XML1.1 + * + * Finally, there is considerable demand to define a standard representation of + * arbitrary Unicode characters in XML documents. Therefore, XML 1.1 allows the + * use of character references to the control characters #x1 through #x1F, + * most of which are forbidden in XML 1.0. For reasons of robustness, however, + * these characters still cannot be used directly in documents. + * In order to improve the robustness of character encoding detection, the + * additional control characters #x7F through #x9F, which were freely allowed in + * XML 1.0 documents, now must also appear only as character references. + * (Whitespace characters are of course exempt.) The minor sacrifice of backward + * compatibility is considered not significant. + * Due to potential problems with APIs, #x0 is still forbidden both directly and + * as a character reference. + * + ***/ + if (fIsXML11) + { + // for XML11 + if ( XMLChar1_1::isControlChar(toCheck, 0) && + !XMLChar1_1::isWhitespace(toCheck, 0) ) + { + return true; + } + else + { + return false; + } + } + else + { + return false; + } + } @@ -141,24 +179,26 @@ // XMLFormatter: Constructors and Destructor // --------------------------------------------------------------------------- XMLFormatter::XMLFormatter( const char* const outEncoding + , const char* const docVersion , XMLFormatTarget* const target , const EscapeFlags escapeFlags - , const UnRepFlags unrepFlags) : - fEscapeFlags(escapeFlags) + , const UnRepFlags unrepFlags) + : fEscapeFlags(escapeFlags) , fOutEncoding(0) , fTarget(target) , fUnRepFlags(unrepFlags) , fXCoder(0) , fAposRef(0) - , fAmpRef(0) - , fGTRef(0) - , fLTRef(0) - , fQuoteRef(0) , fAposLen(0) + , fAmpRef(0) , fAmpLen(0) + , fGTRef(0) , fGTLen(0) + , fLTRef(0) , fLTLen(0) + , fQuoteRef(0) , fQuoteLen(0) + , fIsXML11(false) { // Transcode the encoding string fOutEncoding = XMLString::transcode(outEncoding); @@ -183,24 +223,34 @@ , outEncoding ); } + + XMLCh* const tmpDocVer = XMLString::transcode(docVersion); + ArrayJanitor<XMLCh> jname(tmpDocVer); + fIsXML11 = XMLString::equals(tmpDocVer, XMLUni::fgVersion1_1); } XMLFormatter::XMLFormatter( const XMLCh* const outEncoding + , const XMLCh* const docVersion , XMLFormatTarget* const target , const EscapeFlags escapeFlags - , const UnRepFlags unrepFlags) : - fEscapeFlags(escapeFlags) + , const UnRepFlags unrepFlags) + : fEscapeFlags(escapeFlags) , fOutEncoding(0) , fTarget(target) , fUnRepFlags(unrepFlags) - , fXCoder(0) - + , fXCoder(0) , fAposRef(0) - , fAmpRef(0) + , fAposLen(0) + , fAmpRef(0) + , fAmpLen(0) , fGTRef(0) + , fGTLen(0) , fLTRef(0) + , fLTLen(0) , fQuoteRef(0) + , fQuoteLen(0) + , fIsXML11(false) { // Copy the encoding string fOutEncoding = XMLString::replicate(outEncoding); @@ -225,6 +275,8 @@ , outEncoding ); } + + fIsXML11 = XMLString::equals(docVersion, XMLUni::fgVersion1_1); } XMLFormatter::~XMLFormatter() @@ -323,32 +375,33 @@ const XMLByte * theChars; switch (*srcPtr) { case chAmpersand : - theChars = getCharRef(fAmpLen, fAmpRef, gAmpRef); + theChars = getCharRef(fAmpLen, fAmpRef, gAmpRef); fTarget->writeChars(theChars, fAmpLen, this); break; case chSingleQuote : - theChars = getCharRef(fAposLen, fAposRef, gAposRef); + theChars = getCharRef(fAposLen, fAposRef, gAposRef); fTarget->writeChars(theChars, fAposLen, this); break; case chDoubleQuote : - theChars = getCharRef(fQuoteLen, fQuoteRef, gQuoteRef); + theChars = getCharRef(fQuoteLen, fQuoteRef, gQuoteRef); fTarget->writeChars(theChars, fQuoteLen, this); break; case chCloseAngle : - theChars = getCharRef(fGTLen, fGTRef, gGTRef); + theChars = getCharRef(fGTLen, fGTRef, gGTRef); fTarget->writeChars(theChars, fGTLen, this); break; case chOpenAngle : - theChars = getCharRef(fLTLen, fLTRef, gLTRef); + theChars = getCharRef(fLTLen, fLTRef, gLTRef); fTarget->writeChars(theChars, fLTLen, this); break; default: - // <TBD> This is obviously an error + // control characters + writeCharRef(*srcPtr); break; } srcPtr++; @@ -357,7 +410,7 @@ } } - + unsigned int XMLFormatter::handleUnEscapedChars(const XMLCh * srcPtr, const unsigned int oCount, @@ -432,29 +485,52 @@ // --------------------------------------------------------------------------- // XMLFormatter: Private helper methods // --------------------------------------------------------------------------- +const void XMLFormatter::writeCharRef(const XMLCh &toWrite) +{ + XMLCh tmpBuf[32]; + tmpBuf[0] = chAmpersand; + tmpBuf[1] = chPound; + tmpBuf[2] = chLatin_x; + + // Build a char ref for the current char + XMLString::binToText(toWrite, &tmpBuf[3], 8, 16); + const unsigned int bufLen = XMLString::stringLen(tmpBuf); + tmpBuf[bufLen] = chSemiColon; + tmpBuf[bufLen+1] = chNull; + + // write it out + formatBuf(tmpBuf + , bufLen + 1 + , XMLFormatter::NoEscapes + , XMLFormatter::UnRep_Fail); + +} + const XMLByte* XMLFormatter::getCharRef(unsigned int & count, XMLByte * ref, const XMLCh * stdRef) { if (!ref) { - unsigned int charsEaten; - const unsigned int outBytes - = fXCoder->transcodeTo(stdRef, XMLString::stringLen(stdRef), + + unsigned int charsEaten; + const unsigned int outBytes = + fXCoder->transcodeTo(stdRef, XMLString::stringLen(stdRef), fTmpBuf, kTmpBufSize, charsEaten, XMLTranscoder::UnRep_Throw); - fTmpBuf[outBytes] = 0; fTmpBuf[outBytes + 1] = 0; - fTmpBuf[outBytes + 2] = 0; fTmpBuf[outBytes + 3] = 0; - - ref = new XMLByte[outBytes + 4]; - memcpy(ref, fTmpBuf, outBytes + 4); - count = outBytes; - } + fTmpBuf[outBytes] = 0; + fTmpBuf[outBytes + 1] = 0; + fTmpBuf[outBytes + 2] = 0; + fTmpBuf[outBytes + 3] = 0; + + ref = new XMLByte[outBytes + 4]; + memcpy(ref, fTmpBuf, outBytes + 4); + count = outBytes; + } return ref; } - void XMLFormatter::specialFormat(const XMLCh* const toFormat , const unsigned int count , const EscapeFlags escapeFlags) @@ -470,12 +546,6 @@ const XMLCh* srcPtr = toFormat; const XMLCh* endPtr = toFormat + count; - // Set up the common part of the buffer that we build char refs into - XMLCh tmpBuf[32]; - tmpBuf[0] = chAmpersand; - tmpBuf[1] = chPound; - tmpBuf[2] = chLatin_x; - while (srcPtr < endPtr) { const XMLCh* tmpPtr = srcPtr; @@ -510,20 +580,7 @@ // while (srcPtr < endPtr) { - // Build a char ref for the current char - XMLString::binToText(*srcPtr, &tmpBuf[3], 8, 16); - const unsigned int bufLen = XMLString::stringLen(tmpBuf); - tmpBuf[bufLen] = chSemiColon; - tmpBuf[bufLen+1] = chNull; - - // And now call recursively back to our caller to format this - formatBuf - ( - tmpBuf - , bufLen + 1 - , XMLFormatter::NoEscapes - , XMLFormatter::UnRep_Fail - ); + writeCharRef(*srcPtr); // Move up the source pointer and break out if needed srcPtr++;
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]