> Original message from: "Tomas Frydrych" <[EMAIL PROTECTED]>
> >
> >> Unfortunately my original patch was not correct. Here is an
> ammended
> >> patch:
> >
> >Could you please make the patch against the current sources?
> >Thanks.
Sorry it's late. This patch fixes two bugs: BOMs caused an infinite
loop, and the un-cast char type caused sign-related problems
which resulted in misidentified files.
Andrew.
--
http://linguaphile.sourceforge.net
Index: src/wp/impexp/xp/ie_imp_UTF8.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_UTF8.cpp,v
retrieving revision 1.8
diff -u -r1.8 ie_imp_UTF8.cpp
--- src/wp/impexp/xp/ie_imp_UTF8.cpp 2001/04/10 09:36:18 1.8
+++ src/wp/impexp/xp/ie_imp_UTF8.cpp 2001/04/12 16:51:27
@@ -309,21 +309,26 @@
bool IE_Imp_UTF8::RecognizeContents(const char * szBuf, UT_uint32 iNumbytes)
{
bool bSuccess = false;
- const char *p = szBuf;
+ const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf);
- while (p < szBuf + iNumbytes)
+ while (p < reinterpret_cast<const unsigned char *>(szBuf + iNumbytes))
{
int len;
- if ((*p & 0x80) == 0) // ASCII
+ if ((*p & 0x80) == 0) // ASCII
{
++p;
continue;
}
+ else if ((*p & 0xc0) == 0x80) // not UTF-8
+ {
+ return false;
+ }
else if (*p == 0xfe || *p == 0xff) // BOM markers? RFC2279 says illegal
{
//UT_DEBUGMSG((" BOM?\n"));
- break;
+ ++p;
+ continue;
}
else if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence
len = 6;
@@ -335,16 +340,17 @@
len = 3;
else if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence
len = 2;
- else // not UTF-8 lead byte
+ else
{
- //UT_DEBUGMSG((" not utf-8 lead byte\n"));
+ // the above code covers all cases - if we reach here the logic is wrong
+ UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
return false;
}
while (--len)
{
++p;
- if (p >= szBuf + iNumbytes)
+ if (p >= reinterpret_cast<const unsigned char *>(szBuf + iNumbytes))
{
//UT_DEBUGMSG((" out of data!\n"));
break;