Patch: RecognizeContents for UTF-8

Andrew Dunbar Mon, 09 Apr 2001 05:58:01 -0700
Here's my patch to allow loading UTF-8 files regardless
of the filename extension, including .txt, which
is generally the case on Windows at least.

Andrew.


--- ie_imp_Text.cpp.orig        Wed Feb  7 08:55:08 2001
+++ ie_imp_Text.cpp     Mon Apr  9 23:08:12 2001
@@ -250,7 +250,10 @@
 
 bool IE_Imp_Text::RecognizeSuffix(const char * szSuffix)
 {
-       return (UT_stricmp(szSuffix,".txt") == 0);
+       // TODO: We give the other guys a chance, since this
+       // TODO: importer is so generic.  Does this seem
+       // TODO: like a sensible strategy?
+       return(false);
 }
 
 UT_Error IE_Imp_Text::StaticConstructor(PD_Document * pDocument,

--- ie_imp_UTF8.cpp.orig        Wed Feb  7 08:55:08 2001
+++ ie_imp_UTF8.cpp     Sun Apr  8 00:20:56 2001
@@ -308,8 +308,58 @@
 
 bool IE_Imp_UTF8::RecognizeContents(const char * szBuf, UT_uint32 iNumbytes)
 {
-       // TODO: Not yet written
-       return(false);
+       bool bSuccess = false;
+       const char *p = szBuf;
+
+       while (p < szBuf + iNumbytes)
+       {
+               int len;
+               
+               if ((*p & 0x80) == 0)                           // ASCII
+               {
+                       ++p;
+                       continue;
+               }
+               else if (*p == 0xfe || *p == 0xff)              // BOM markers?  RFC2279 says illegal
+               {
+                       UT_DEBUGMSG(("  BOM?\n"));
+                       break;
+               }
+               else if ((*p & 0xfe) == 0xfc)                   // lead byte in 6-byte sequence
+                       len = 6;
+               else if ((*p & 0xfc) == 0xf8)                   // lead byte in 5-byte sequence
+                       len = 5;
+               else if ((*p & 0xf8) == 0xf0)                   // lead byte in 4-byte sequence
+                       len = 4;
+               else if ((*p & 0xf0) == 0xe0)                   // lead byte in 3-byte sequence
+                       len = 3;
+               else if ((*p & 0xe0) == 0xc0)                   // lead byte in 2-byte sequence
+                       len = 2;
+               else                                            // not UTF-8 lead byte
+               {
+                       UT_DEBUGMSG(("  not utf-8 lead byte\n"));
+                       UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+                       return(false);
+               }
+       
+               while (--len)
+               {
+                       ++p;
+                       if (p >= szBuf + iNumbytes)
+                       {
+                               UT_DEBUGMSG(("  out of data!\n"));
+                               //return(false);
+                               break;
+                       }
+                       if ((*p & 0xc0) == 0x80)
+                               bSuccess = true;
+                       else
+                               return(false);
+               }
+               ++p;
+       }
+       
+       return(bSuccess);
 }
 
 bool IE_Imp_UTF8::RecognizeSuffix(const char * szSuffix)



__________________________________________________________________
Get your free Australian email account at http://www.start.com.au
Patch: RecognizeContents for UTF-8

Reply via email to