util strutil.cpp,1.133,1.134

Vadim Zeitlin Wed, 22 Sep 2004 15:05:07 -0700

Update of /cvsroot/mahogany/M/src/util
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16291/src/util


Modified Files:
        strutil.cpp 
Log Message:
use new ConvertUTFToMB() instead of ConvertUnicodeToSystem(): it now tries to guess 
the correct encoding for UTF text

Index: strutil.cpp
===================================================================
RCS file: /cvsroot/mahogany/M/src/util/strutil.cpp,v
retrieving revision 1.133
retrieving revision 1.134
diff -b -u -2 -r1.133 -r1.134
--- strutil.cpp 14 Jul 2004 16:17:07 -0000      1.133
+++ strutil.cpp 22 Sep 2004 22:03:28 -0000      1.134
@@ -38,8 +38,14 @@
 #include <wx/textfile.h>  // just for strutil_enforceNativeCRLF()
 #include <wx/regex.h>
+#include <wx/fontmap.h>
 
 extern "C"
 {
    #include "utf8.h"  // for utf8_text_utf7()
+
+   // arrays used by GuessUnicodeCharset()
+   #include "charset/iso_8859.c"
+   #include "charset/windows.c"
+   #include "charset/koi8_r.c"
 }
 
@@ -1374,18 +1380,105 @@
 }
 
-// convert a string in UTF-8 or 7 into the string in the current encoding: of
-// course, this doesn't work in general as Unicode is not representable as an 8
-// bit charset but it works in some common cases and is better than no UTF-8
-// support at all
+// guess an 8 bit encoding able to represent the given Unicode text: only the
+// first non ASCII character is examined and the first encoding containing it
+// which is available on this system is returned (or wxFONTENCODING_SYSTEM)
+static wxFontEncoding GuessUnicodeCharset(const wchar_t *pwz)
+{
+   typedef const unsigned short *codepage;
+   struct CodePageInfo
+   {
+      codepage cp;
+      wxFontEncoding enc;
+   };
+   static const CodePageInfo s_codepages[] =
+   {
+      { iso8859_2tab,         wxFONTENCODING_ISO8859_2 },
+      { iso8859_3tab,         wxFONTENCODING_ISO8859_3 },
+      { iso8859_4tab,         wxFONTENCODING_ISO8859_4 },
+      { iso8859_5tab,         wxFONTENCODING_ISO8859_5 },
+      { iso8859_6tab,         wxFONTENCODING_ISO8859_6 },
+      { iso8859_7tab,         wxFONTENCODING_ISO8859_7 },
+      { iso8859_8tab,         wxFONTENCODING_ISO8859_8 },
+      { iso8859_9tab,         wxFONTENCODING_ISO8859_9 },
+      { iso8859_10tab,        wxFONTENCODING_ISO8859_10 },
+      { iso8859_13tab,        wxFONTENCODING_ISO8859_13 },
+      { iso8859_14tab,        wxFONTENCODING_ISO8859_14 },
+      { iso8859_15tab,        wxFONTENCODING_ISO8859_15 },
+      { windows_1250tab,      wxFONTENCODING_CP1250 },
+      { windows_1251tab,      wxFONTENCODING_CP1251 },
+      { windows_1252tab,      wxFONTENCODING_CP1252 },
+      { windows_1253tab,      wxFONTENCODING_CP1253 },
+      { windows_1254tab,      wxFONTENCODING_CP1254 },
+      { windows_1255tab,      wxFONTENCODING_CP1255 },
+      { windows_1256tab,      wxFONTENCODING_CP1256 },
+      { windows_1257tab,      wxFONTENCODING_CP1257 },
+      { koi8rtab,             wxFONTENCODING_KOI8 },
+   };
+
+   // default value: use system default encoding
+   wxFontEncoding enc = wxFONTENCODING_SYSTEM;
+
+   // first find a non ASCII character as ASCII ones are present in all (well,
+   // many) code pages
+   while ( *pwz && *pwz < 0x80 )
+      pwz++;
+
+   const wchar_t wch = *pwz;
+   if ( !wch )
+      return enc;
+
+   // build the array of encodings in which the character appears: there is one
+   // extra slot for iso8859-1 which has no table of its own in s_codepages
+   wxFontEncoding encodings[WXSIZEOF(s_codepages) + 1];
+   size_t numEncodings = 0;
+
+   // special test for iso8859-1 which is identical to first 256 Unicode
+   // characters (so 0xff itself must be accepted as well)
+   if ( wch <= 0xff )
+      encodings[numEncodings++] = wxFONTENCODING_ISO8859_1;
+
+   for ( size_t nPage = 0; nPage < WXSIZEOF(s_codepages); nPage++ )
+   {
+      codepage cp = s_codepages[nPage].cp;
+      for ( size_t i = 0; i < 0x80; i++ )
+      {
+         if ( wch == cp[i] )
+         {
+            ASSERT_MSG( numEncodings < WXSIZEOF(encodings),
+                           _T("encodings array index out of bounds") );
+
+            encodings[numEncodings++] = s_codepages[nPage].enc;
+            break;
+         }
+      }
+   }
+
+   // now find an encoding which is available on this system
+   for ( size_t nEnc = 0; nEnc < numEncodings; nEnc++ )
+   {
+      if ( wxFontMapper::Get()->IsEncodingAvailable(encodings[nEnc]) )
+      {
+         enc = encodings[nEnc];
+         break;
+      }
+   }
+
+   return enc;
+}
+
+// convert a string in UTF-8 or 7 into the string in some multibyte encoding:
+// of course, this doesn't work in general as Unicode is not representable as
+// an 8 bit charset but it works in some common cases and is better than no
+// UTF-8 support at all
 //
 // FIXME this won't be needed when full Unicode support is available
 wxFontEncoding
-ConvertUnicodeToSystem(wxString *strUtf, wxFontEncoding enc)
+ConvertUTFToMB(wxString *strUtf, wxFontEncoding enc)
 {
-   CHECK( strUtf, wxFONTENCODING_SYSTEM,
-          _T("NULL string in ConvertUnicodeToSystem") );
+   CHECK( strUtf, wxFONTENCODING_SYSTEM, _T("NULL string in ConvertUTFToMB") );
 
    if ( !strUtf->empty() )
    {
+      // first convert to UTF-8
       if ( enc == wxFONTENCODING_UTF7 )
       {
@@ -1395,9 +1488,10 @@
          SIZEDTEXT text7, text8;
          text7.data = (unsigned char *) strUtf->c_str();
-         text7.size = strUtf->Length();
+         text7.size = strUtf->length();
 
          utf8_text_utf7 (&text7, &text8);
 
          strUtf->clear();
+         strUtf->reserve(text8.size);
          for ( unsigned long k = 0; k < text8.size; k++ )
          {
@@ -1410,5 +1504,20 @@
       }
 
-      wxString str(strUtf->wc_str(wxConvUTF8), wxConvLocal);
+      // try to determine which multibyte encoding is best suited for this
+      // Unicode string
+      wxWCharBuffer wbuf(strUtf->wc_str(wxConvUTF8));
+      enc = GuessUnicodeCharset(wbuf);
+
+      // finally convert to multibyte
+      wxString str;
+      if ( enc == wxFONTENCODING_SYSTEM )
+      {
+         str = wxString(wbuf);
+      }
+      else
+      {
+         wxCSConv conv(enc);
+         str = wxString(wbuf, conv);
+      }
       if ( str.empty() )
       {
@@ -1422,10 +1531,10 @@
       }
    }
+   else // doesn't really matter what we return from here
+   {
+      enc = wxFONTENCODING_SYSTEM;
+   }
 
-#if wxUSE_INTL
-   return wxLocale::GetSystemEncoding();
-#else // !wxUSE_INTL
-   return wxFONTENCODING_ISO8859_1;
-#endif // wxUSE_INTL/!wxUSE_INTL
+   return enc;
 }
 



-------------------------------------------------------
This SF.Net email is sponsored by: YOU BE THE JUDGE. Be one of 170
Project Admins to receive an Apple iPod Mini FREE for your judgement on
who ports your project to Linux PPC the best. Sponsored by IBM.
Deadline: Sept. 24. Go here: http://sf.net/ppc_contest.php
_______________________________________________
Mahogany-cvsupdates mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/mahogany-cvsupdates

[M-CVS] CVS: M/src/util strutil.cpp,1.133,1.134

Reply via email to