framework XMLRecognizer.cpp

peiyongz Wed, 05 Feb 2003 09:29:55 -0800

peiyongz    2003/02/05 09:30:20

  Modified:    c/src/xercesc/framework XMLRecognizer.cpp
  Log:
  Bug#16796: Possible out of bounds memory read in XMLRecognizer::basicEncodingProbe
  
  Revision  Changes    Path
  1.7       +43 -10    xml-xerces/c/src/xercesc/framework/XMLRecognizer.cpp
  
  Index: XMLRecognizer.cpp
  ===================================================================
  RCS file: /home/cvs/xml-xerces/c/src/xercesc/framework/XMLRecognizer.cpp,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- XMLRecognizer.cpp 29 Jan 2003 16:44:27 -0000      1.6
  +++ XMLRecognizer.cpp 5 Feb 2003 17:30:20 -0000       1.7
  @@ -149,9 +149,50 @@
       //
       if (rawByteCount < 2)
           return UTF_8;
  +         
  +    //  
  +    //  We have two to four bytes, so lets check for a UTF-16 BOM. That
  +    //  is quick to check and enough to identify two major encodings.   
  +    // 
  +
  +    if (rawByteCount < 4)
  +    {
  +        if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
  +            return UTF_16B;
  +        else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
  +            return UTF_16L;
  +        else 
  +            return UTF_8;
  +    }
  +
  +    /***
  +     *    F.1 Detection Without External Encoding Information
  +     *
  +     *    Because each XML entity not accompanied by external encoding information and
  +     *    not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration,
  +     *    in which the first characters must be '<?xml', any conforming processor can detect,
  +     *    after two to four octets of input, which of the following cases apply.
  +     *
  +     *    In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and
  +     *    '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is
  +     *    "#xFEFF". The notation ## is used to denote any byte value except that two consecutive
  +     *    ##s cannot be both 00.
  +     *
  +     *    With a Byte Order Mark:
  +     *
  +     *    00 00 FE FF           UCS-4,    big-endian machine    (1234 order) 
  +     *    FF FE 00 00           UCS-4,    little-endian machine (4321 order) 
  +     *    00 00 FF FE           UCS-4,    unusual octet order   (2143) 
  +     *    FE FF 00 00           UCS-4,    unusual octet order   (3412) 
  +     *    FE FF ## ##           UTF-16,   big-endian 
  +     *    FF FE ## ##           UTF-16,   little-endian 
  +     *    EF BB BF              UTF-8 
  +     *
  +     ***/
   
       //
  -    //  Checking BOM for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE
  +    //  We have at least four bytes, so we can check all BOM
  +    //  for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well.
       //
       if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF))
           return UCS_4B;
  @@ -161,14 +202,6 @@
           return UTF_16B;
       else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
           return UTF_16L;
  -
  -    //
  -    //  Oh well, not one of those. So now lets see if we have at least 4
  -    //  bytes. If not, then we are out of ideas and can return UTF-8 as the
  -    //  fallback.
  -    //
  -    if (rawByteCount < 4)
  -        return UTF_8;
   
       //
       //  We have at least 4 bytes. So lets check the 4 byte sequences that


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: xml-xerces/c/src/xercesc/framework XMLRecognizer.cpp

Reply via email to