peiyongz 2004/06/30 12:03:26 Modified: c/src/xercesc/util XMLUTF8Transcoder.cpp Log: XML1.0-3rd Edition: UTF_8 Revision Changes Path 1.9 +39 -26 xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp Index: XMLUTF8Transcoder.cpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/util/XMLUTF8Transcoder.cpp,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- XMLUTF8Transcoder.cpp 19 May 2004 20:51:20 -0000 1.8 +++ XMLUTF8Transcoder.cpp 30 Jun 2004 19:03:26 -0000 1.9 @@ -106,7 +106,7 @@ , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + , 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 @@ -262,24 +262,21 @@ * ***/ + XMLUInt32 tmpVal = 0; + switch(trailingBytes) { case 1 : // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] // - if (*srcPtr < 0xC2) - { - char byte[2] = {*srcPtr,0}; - - ThrowXMLwithMemMgr1(UTFDataFormatException - , XMLExcepts::UTF8_Invalid_2BytesSeq - , byte - , getMemoryManager()); - } - + // 0xC0, 0xC1 has been filtered out checkTrailingBytes(*(srcPtr+1), 1, 1); + tmpVal = *srcPtr++; + tmpVal <<= 6; + tmpVal += *srcPtr++; + break; case 2 : // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx] @@ -300,6 +297,19 @@ checkTrailingBytes(*(srcPtr+1), 2, 1); checkTrailingBytes(*(srcPtr+2), 2, 2); + // + // D36 (a) UTF-8 is the Unicode Transformation Format that serializes + // a Unicode code point as a sequence of one to four bytes, + // as specified in Table 3.1, UTF-8 Bit Distribution. + // (b) An illegal UTF-8 code unit sequence is any byte sequence that + // does not match the patterns listed in Table 3.1B, Legal UTF-8 + // Byte Sequences. 
+ // (c) An irregular UTF-8 code unit sequence is a six-byte sequence + // where the first three bytes correspond to a high surrogate, + // and the next three bytes correspond to a low surrogate. + // As a consequence of C12, these irregular UTF-8 sequences shall + // not be generated by a conformant process. + // //irregular three bytes sequence // that is zzzzyy matches leading surrogate tag 110110 or // trailing surrogate tag 110111 @@ -322,6 +332,12 @@ , getMemoryManager()); } + tmpVal = *srcPtr++; + tmpVal <<= 6; + tmpVal += *srcPtr++; + tmpVal <<= 6; + tmpVal += *srcPtr++; + break; case 3 : // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]* @@ -342,10 +358,18 @@ , getMemoryManager()); } - checkTrailingBytes(*(srcPtr+1), 2, 1); - checkTrailingBytes(*(srcPtr+2), 2, 1); - checkTrailingBytes(*(srcPtr+3), 2, 1); + checkTrailingBytes(*(srcPtr+1), 3, 1); + checkTrailingBytes(*(srcPtr+2), 3, 2); + checkTrailingBytes(*(srcPtr+3), 3, 3); + tmpVal = *srcPtr++; + tmpVal <<= 6; + tmpVal += *srcPtr++; + tmpVal <<= 6; + tmpVal += *srcPtr++; + tmpVal <<= 6; + tmpVal += *srcPtr++; + break; default: // trailingBytes > 3 @@ -369,17 +393,6 @@ break; } - // All bytes have been verified, need not to check any more - - XMLUInt32 tmpVal = *srcPtr++; - tmpVal <<= 6; - for(unsigned int i=1; i<trailingBytes; i++) - { - tmpVal += *srcPtr++; - tmpVal <<= 6; - } - - tmpVal += *srcPtr++; // since trailingBytes comes from an array, this logic is redundant // default :
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]