peiyongz 2003/02/05 09:30:20 Modified: c/src/xercesc/framework XMLRecognizer.cpp Log: Bug#16796: Possible out of bounds memory read in XMLRecognizer::basicEncodingProbe Revision Changes Path 1.7 +43 -10 xml-xerces/c/src/xercesc/framework/XMLRecognizer.cpp Index: XMLRecognizer.cpp =================================================================== RCS file: /home/cvs/xml-xerces/c/src/xercesc/framework/XMLRecognizer.cpp,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- XMLRecognizer.cpp 29 Jan 2003 16:44:27 -0000 1.6 +++ XMLRecognizer.cpp 5 Feb 2003 17:30:20 -0000 1.7 @@ -149,9 +149,50 @@ // if (rawByteCount < 2) return UTF_8; + + // + // We have two to four bytes, so lets check for a UTF-16 BOM. That + // is quick to check and enough to identify two major encodings. + // + + if (rawByteCount < 4) + { + if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF)) + return UTF_16B; + else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) + return UTF_16L; + else + return UTF_8; + } + + /*** + * F.1 Detection Without External Encoding Information + * + * Because each XML entity not accompanied by external encoding information and + * not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, + * in which the first characters must be '<?xml', any conforming processor can detect, + * after two to four octets of input, which of the following cases apply. + * + * In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and + * '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is + * "#xFEFF". The notation ## is used to denote any byte value except that two consecutive + * ##s cannot be both 00. + * + * With a Byte Order Mark: + * + * 00 00 FE FF UCS-4, big-endian machine (1234 order) + * FF FE 00 00 UCS-4, little-endian machine (4321 order) + * 00 00 FF FE UCS-4, unusual octet order (2143) + * FE FF 00 00 UCS-4, unusual octet order (3412) + * FE FF ## ## UTF-16, big-endian + * FF FE ## ## UTF-16, little-endian + * EF BB BF UTF-8 + * + ***/ // - // Checking BOM for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE + // We have at least four bytes, so we can check all BOM + // for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well. // if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF)) return UCS_4B; @@ -161,14 +202,6 @@ return UTF_16B; else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE)) return UTF_16L; - - // - // Oh well, not one of those. So now lets see if we have at least 4 - // bytes. If not, then we are out of ideas and can return UTF-8 as the - // fallback. - // - if (rawByteCount < 4) - return UTF_8; // // We have at least 4 bytes. So lets check the 4 byte sequences that
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]