cargilld 2005/03/22 12:43:27 Modified: c/src/xercesc/internal XMLReader.cpp XMLReader.hpp Log: Check in Christian's patches for Xercesc-1369 and 1370. Revision Changes Path 1.29 +188 -8 xml-xerces/c/src/xercesc/internal/XMLReader.cpp Index: XMLReader.cpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/internal/XMLReader.cpp,v retrieving revision 1.28 retrieving revision 1.29 diff -u -r1.28 -r1.29 --- XMLReader.cpp 20 Mar 2005 19:02:45 -0000 1.28 +++ XMLReader.cpp 22 Mar 2005 20:43:27 -0000 1.29 @@ -40,7 +40,7 @@ // not. Breaks out on the first non-whitespace. // bool XMLReader::isAllSpaces(const XMLCh* const toCheck - , const unsigned int count) + , const unsigned int count) const { const XMLCh* curCh = toCheck; const XMLCh* endPtr = toCheck + count; @@ -58,7 +58,7 @@ // not. // bool XMLReader::containsWhiteSpace(const XMLCh* const toCheck - , const unsigned int count) + , const unsigned int count) const { const XMLCh* curCh = toCheck; const XMLCh* endPtr = toCheck + count; @@ -73,7 +73,7 @@ // // This one is not called terribly often, so call the XMLChar utility // -bool XMLReader::isPublicIdChar(const XMLCh toCheck) +bool XMLReader::isPublicIdChar(const XMLCh toCheck) const { if (fXMLVersion == XMLV1_1) return XMLChar1_1::isPublicIdChar(toCheck); @@ -704,7 +704,28 @@ // Eat this char fCharIndex++; - handleEOL(curCh, false); + // + // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have + // end-of-line combinations with a leading chCR(xD) or chLF(xA) + // + // 100000 x20 + // 001001 x9 + // 001010 chLF + // 001101 chCR + // ----------- + // 000110 == (chCR|chLF) & ~(0x9|0x20) + // + // if the result of the logical-& operation is + // true : 'curCh' must be xA or xD + // false : 'curCh' must be x20 or x9 + // + if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) + { + fCurCol++; + } else + { + handleEOL(curCh, false); + } // Ok we can add this guy to our buffer toFill.append(curCh); @@ -747,7 
+768,27 @@ // Eat this char fCharIndex++; - handleEOL(curCh, false); + // + // 'curCh' is not a whitespace(x20|x9|xD|xA), so we only can + // have end-of-line combinations with a leading chNEL(x85) or + // chLineSeparator(x2028) + // + // 0010000000101000 chLineSeparator + // 0000000010000101 chNEL + // --------------------- + // 1101111101010010 == ~(chNEL|chLineSeparator) + // + // if the result of the logical-& operation is + // true : 'curCh' can not be chNEL or chLineSeparator + // false : 'curCh' can be chNEL or chLineSeparator + // + if ( curCh & (XMLCh) ~(chNEL|chLineSeparator) ) + { + fCurCol++; + } else + { + handleEOL(curCh, false); + } // Add it to our buffer toFill.append(curCh); @@ -813,7 +854,28 @@ // Get the current char out of the buffer and eat it XMLCh curCh = fCharBuf[fCharIndex++]; - handleEOL(curCh, inDecl); + // + // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have + // end-of-line combinations with a leading chCR(xD) or chLF(xA) + // + // 100000 x20 + // 001001 x9 + // 001010 chLF + // 001101 chCR + // ----------- + // 000110 == (chCR|chLF) & ~(0x9|0x20) + // + // if the result of the logical-& operation is + // true : 'curCh' must be xA or xD + // false : 'curCh' must be x20 or x9 + // + if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) + { + fCurCol++; + } else + { + handleEOL(curCh, inDecl); + } } else @@ -883,7 +945,28 @@ // Eat the character fCharIndex++; - handleEOL((XMLCh&)curCh, false); + // + // 'curCh' is a whitespace(x20|x9|xD|xA), so we only can have + // end-of-line combinations with a leading chCR(xD) or chLF(xA) + // + // 100000 x20 + // 001001 x9 + // 001010 chLF + // 001101 chCR + // ----------- + // 000110 == (chCR|chLF) & ~(0x9|0x20) + // + // if the result of the logical-& operation is + // true : 'curCh' must be xA or xD + // false : 'curCh' must be x20 or x9 + // + if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 ) + { + fCurCol++; + } else + { + handleEOL((XMLCh&)curCh, false); + } return true; } @@ -1522,4 
+1605,101 @@ return charsDone; } +/*** + * + * XML1.1 + * + * 2.11 End-of-Line Handling + * + * XML parsed entities are often stored in computer files which, for editing + * convenience, are organized into lines. These lines are typically separated + * by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA). + * + * To simplify the tasks of applications, the XML processor MUST behave as if + * it normalized all line breaks in external parsed entities (including the document + * entity) on input, before parsing, by translating all of the following to a single + * #xA character: + * + * 1. the two-character sequence #xD #xA + * 2. the two-character sequence #xD #x85 + * 3. the single character #x85 + * 4. the single character #x2028 + * 5. any #xD character that is not immediately followed by #xA or #x85. + * + * + ***/ +void XMLReader::handleEOL(XMLCh& curCh, bool inDecl) +{ + // 1. the two-character sequence #xD #xA + // 2. the two-character sequence #xD #x85 + // 5. any #xD character that is not immediately followed by #xA or #x85. + if (curCh == chCR) + { + fCurCol = 1; + fCurLine++; + + // + // If not already internalized, then convert it to an + // LF and eat any following LF. + // + if (fSource == Source_External) + { + if ((fCharIndex < fCharsAvail) || refreshCharBuffer()) + { + if ( fCharBuf[fCharIndex] == chLF || + ((fCharBuf[fCharIndex] == chNEL) && fNEL) ) + { + fCharIndex++; + } + } + curCh = chLF; + } + } + else if (curCh == chLF) + { + fCurCol = 1; + fCurLine++; + } + // 3. the single character #x85 + // 4. the single character #x2028 + else if (curCh == chNEL || curCh == chLineSeparator) + { + if (inDecl && fXMLVersion == XMLV1_1) + { + + /*** + * XML1.1 + * + * 2.11 End-of-Line Handling + * ... + * The characters #x85 and #x2028 cannot be reliably recognized and translated + * until an entity's encoding declaration (if present) has been read. 
+ * Therefore, it is a fatal error to use them within the XML declaration or + * text declaration. + * + ***/ + ThrowXMLwithMemMgr1 + ( + TranscodingException + , XMLExcepts::Reader_NelLsepinDecl + , fSystemId + , fMemoryManager + ); + } + + if (fNEL && fSource == Source_External) + { + fCurCol = 1; + fCurLine++; + curCh = chLF; + } + } + else + { + fCurCol++; + } + + return; +} + XERCES_CPP_NAMESPACE_END 1.21 +67 -119 xml-xerces/c/src/xercesc/internal/XMLReader.hpp Index: XMLReader.hpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/internal/XMLReader.hpp,v retrieving revision 1.20 retrieving revision 1.21 diff -u -r1.20 -r1.21 --- XMLReader.hpp 29 Sep 2004 00:24:01 -0000 1.20 +++ XMLReader.hpp 22 Mar 2005 20:43:27 -0000 1.21 @@ -16,6 +16,9 @@ /* * $Log$ + * Revision 1.21 2005/03/22 20:43:27 cargilld + * Check in Christian's patches for Xercesc-1369 and 1370. + * * Revision 1.20 2004/09/29 00:24:01 knoaman * Performance: improve src offset calculation. Patch by Anthony O'Dowd. 
* @@ -219,24 +222,24 @@ ( const XMLCh* const toCheck , const unsigned int count - ); + ) const; bool containsWhiteSpace ( const XMLCh* const toCheck , const unsigned int count - ); + ) const; - bool isXMLLetter(const XMLCh toCheck); - bool isFirstNameChar(const XMLCh toCheck); - bool isNameChar(const XMLCh toCheck); - bool isPlainContentChar(const XMLCh toCheck); - bool isSpecialStartTagChar(const XMLCh toCheck); - bool isXMLChar(const XMLCh toCheck); - bool isWhitespace(const XMLCh toCheck); - bool isControlChar(const XMLCh toCheck); - bool isPublicIdChar(const XMLCh toCheck); + bool isXMLLetter(const XMLCh toCheck) const; + bool isFirstNameChar(const XMLCh toCheck) const; + bool isNameChar(const XMLCh toCheck) const; + bool isPlainContentChar(const XMLCh toCheck) const; + bool isSpecialStartTagChar(const XMLCh toCheck) const; + bool isXMLChar(const XMLCh toCheck) const; + bool isWhitespace(const XMLCh toCheck) const; + bool isControlChar(const XMLCh toCheck) const; + bool isPublicIdChar(const XMLCh toCheck) const; // ----------------------------------------------------------------------- // Constructors and Destructor @@ -400,7 +403,7 @@ , const unsigned int maxChars ); - inline void handleEOL + void handleEOL ( XMLCh& curCh , bool inDecl = false @@ -590,43 +593,43 @@ // --------------------------------------------------------------------------- // XMLReader: Public, query methods // --------------------------------------------------------------------------- -inline bool XMLReader::isNameChar(const XMLCh toCheck) +inline bool XMLReader::isNameChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gNameCharMask) != 0); } -inline bool XMLReader::isPlainContentChar(const XMLCh toCheck) +inline bool XMLReader::isPlainContentChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gPlainContentCharMask) != 0); } -inline bool XMLReader::isFirstNameChar(const XMLCh toCheck) +inline bool XMLReader::isFirstNameChar(const XMLCh toCheck) const 
{ return ((fgCharCharsTable[toCheck] & gFirstNameCharMask) != 0); } -inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck) +inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gSpecialStartTagCharMask) != 0); } -inline bool XMLReader::isXMLChar(const XMLCh toCheck) +inline bool XMLReader::isXMLChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gXMLCharMask) != 0); } -inline bool XMLReader::isXMLLetter(const XMLCh toCheck) +inline bool XMLReader::isXMLLetter(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gLetterCharMask) != 0); } -inline bool XMLReader::isWhitespace(const XMLCh toCheck) +inline bool XMLReader::isWhitespace(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gWhitespaceCharMask) != 0); } -inline bool XMLReader::isControlChar(const XMLCh toCheck) +inline bool XMLReader::isControlChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gControlCharMask) != 0); } @@ -784,7 +787,28 @@ chGotten = fCharBuf[fCharIndex++]; // Handle end of line normalization and line/col member maintenance. - handleEOL(chGotten, false); + // + // we can have end-of-line combinations with a leading + // chCR(xD), chLF(xA), chNEL(x85), or chLineSeparator(x2028) + // + // 0000000000001101 chCR + // 0000000000001010 chLF + // 0000000010000101 chNEL + // 0010000000101000 chLineSeparator + // ----------------------- + // 1101111101010000 == ~(chCR|chLF|chNEL|chLineSeparator) + // + // if the result of the logical-& operation is + // true : 'chGotten' can not be chCR, chLF, chNEL or chLineSeparator + // false : 'chGotten' can be chCR, chLF, chNEL or chLineSeparator + // + if ( chGotten & (XMLCh) ~(chCR|chLF|chNEL|chLineSeparator) ) + { + fCurCol++; + } else + { + handleEOL(chGotten, false); + } return true; } @@ -812,7 +836,28 @@ chGotten = fCharBuf[fCharIndex++]; // Handle end of line normalization and line/col member maintenance. 
- handleEOL(chGotten, false); + // + // we can have end-of-line combinations with a leading + // chCR(xD), chLF(xA), chNEL(x85), or chLineSeparator(x2028) + // + // 0000000000001101 chCR + // 0000000000001010 chLF + // 0000000010000101 chNEL + // 0010000000101000 chLineSeparator + // ----------------------- + // 1101111101010000 == ~(chCR|chLF|chNEL|chLineSeparator) + // + // if the result of the logical-& operation is + // true : 'chGotten' can not be chCR, chLF, chNEL or chLineSeparator + // false : 'chGotten' can be chCR, chLF, chNEL or chLineSeparator + // + if ( chGotten & (XMLCh) ~(chCR|chLF|chNEL|chLineSeparator) ) + { + fCurCol++; + } else + { + handleEOL(chGotten, false); + } return true; } @@ -851,103 +896,6 @@ return true; } -/*** - * - * XML1.1 - * - * 2.11 End-of-Line Handling - * - * XML parsed entities are often stored in computer files which, for editing - * convenience, are organized into lines. These lines are typically separated - * by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA). - * - * To simplify the tasks of applications, the XML processor MUST behave as if - * it normalized all line breaks in external parsed entities (including the document - * entity) on input, before parsing, by translating all of the following to a single - * #xA character: - * - * 1. the two-character sequence #xD #xA - * 2. the two-character sequence #xD #x85 - * 3. the single character #x85 - * 4. the single character #x2028 - * 5. any #xD character that is not immediately followed by #xA or #x85. - * - * - ***/ -inline void XMLReader::handleEOL(XMLCh& curCh, bool inDecl) -{ - // 1. the two-character sequence #xD #xA - // 2. the two-character sequence #xD #x85 - // 5. any #xD character that is not immediately followed by #xA or #x85. - if (curCh == chCR) - { - fCurCol = 1; - fCurLine++; - - // - // If not already internalized, then convert it to an - // LF and eat any following LF. 
- // - if (fSource == Source_External) - { - if ((fCharIndex < fCharsAvail) || refreshCharBuffer()) - { - if ( fCharBuf[fCharIndex] == chLF || - ((fCharBuf[fCharIndex] == chNEL) && fNEL) ) - { - fCharIndex++; - } - } - curCh = chLF; - } - } - else if (curCh == chLF) - { - fCurCol = 1; - fCurLine++; - } - // 3. the single character #x85 - // 4. the single character #x2028 - else if (curCh == chNEL || curCh == chLineSeparator) - { - if (inDecl && fXMLVersion == XMLV1_1) - { - - /*** - * XML1.1 - * - * 2.11 End-of-Line Handling - * ... - * The characters #x85 and #x2028 cannot be reliably recognized and translated - * until an entity's encoding declaration (if present) has been read. - * Therefore, it is a fatal error to use them within the XML declaration or - * text declaration. - * - ***/ - ThrowXMLwithMemMgr1 - ( - TranscodingException - , XMLExcepts::Reader_NelLsepinDecl - , fSystemId - , fMemoryManager - ); - } - - if (fNEL && fSource == Source_External) - { - fCurCol = 1; - fCurLine++; - curCh = chLF; - } - } - else - { - fCurCol++; - } - - return; -} - XERCES_CPP_NAMESPACE_END #endif
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]