Patch to fix Bug 1462: zh-CN copy&paste problem

Anthony Fok Thu, 17 Jan 2002 03:31:47 -0800

Hello all,

This patch (for AbiWord-0.99.1 or the latest CVS) fixes the copy-and-paste
problem for zh_CN locale. Previously, simplified Chinese characters were
pasted as "?". (See Bug 1462)


I guess the problem was that entries like zh_CN.BIG5 and zh_TW.GB2312
confused AbiWord into choosing the wrong codepage for zh_CN.
Previously, CP950 was invoked during a copy-and-paste, but CP950 is for
zh_TW; zh_CN uses CP936, or the equivalent of GBK.  (GB2312 is ancient).
The locale entries are now corrected, and a few new ones have been added too.
This should take care of the problems with Bug 1462.  After applying this
patch, you can go ahead and close the bug.

The next step is to extend full support for BIG5-HKSCS and GB18030, but
as I currently do not know how Windows handles these encodings, this will
be a future project.  AFAIK, CP950 only covers BIG5, and CP936 only covers
GBK.  Does Microsoft use new codepages, or ignore codepages altogether and
use straight Unicode?  Not sure.  :-)  Anyhow, this will be a wishlist item.

Cheers,

Anthony

-- 
Anthony Fok Tung-Ling
ThizLinux Laboratory   <[EMAIL PROTECTED]> http://www.thizlinux.com/
Debian Chinese Project <[EMAIL PROTECTED]>       http://www.debian.org/intl/zh/
Come visit Our Lady of Victory Camp!           http://www.olvc.ab.ca/

diff -aur abiword-0.99.1~/abi/src/af/util/xp/ut_Encoding.cpp 
abiword-0.99.1/abi/src/af/util/xp/ut_Encoding.cpp
--- abiword-0.99.1~/abi/src/af/util/xp/ut_Encoding.cpp  Sat Aug 11 02:32:38 2001
+++ abiword-0.99.1/abi/src/af/util/xp/ut_Encoding.cpp   Thu Jan 17 12:55:13 2002
@@ -46,7 +46,7 @@
 // Another approach is to do these tests in an external program which
 // outputs the C++ code for the following table.
 //
-// TODO Note that certain operations in Abiword currently try to open or
+// TODO Note that certain operations in AbiWord currently try to open or
 // TODO compare certain encodings via hard-coded names.  This should be
 // TODO discouraged and replaced with names derived as in these tables.
 //
@@ -54,6 +54,7 @@
 
 static XML_Char * enc_armscii[]        = {"ARMSCII-8",0};
 static XML_Char * enc_big5[]   = {"BIG5","BIG-5","BIG-FIVE","BIGFIVE","CN-BIG5",0};
+static XML_Char * enc_big5hkscs[]      = {"BIG5-HKSCS","BIG5HKSCS",0};
 static XML_Char * enc_cp874[]  = {"CP874",0};
 static XML_Char * enc_cp932[]  = {"CP932",0};
 static XML_Char * enc_cp936[]  = {"CP936","GBK",0};
@@ -133,6 +134,7 @@
        //the property value, the localised translation, the numerical id
        {enc_armscii,                   NULL, XAP_STRING_ID_ENC_ARME_ARMSCII},
        {enc_big5,                              NULL, XAP_STRING_ID_ENC_CHTR_BIG5},
+       {enc_big5hkscs,                         NULL, XAP_STRING_ID_ENC_CHTR_BIG5HKSCS},
        {enc_cp874,                             NULL, XAP_STRING_ID_ENC_THAI_WIN},
        {enc_cp932,                             NULL, XAP_STRING_ID_ENC_JAPN_WIN},
        {enc_cp936,                             NULL, XAP_STRING_ID_ENC_CHSI_WIN},
diff -aur abiword-0.99.1~/abi/src/af/xap/xp/xap_EncodingManager.cpp 
abiword-0.99.1/abi/src/af/xap/xp/xap_EncodingManager.cpp
--- abiword-0.99.1~/abi/src/af/xap/xp/xap_EncodingManager.cpp   Wed Jan 16 10:00:07 2002
+++ abiword-0.99.1/abi/src/af/xap/xp/xap_EncodingManager.cpp    Thu Jan 17 15:03:38 2002
@@ -476,11 +476,11 @@
        {NULL,NULL},
        {"ru","english,russian"},
        
-       /* I'm not sure that this is correct, but my TeTex 0.9.17 works only 
+       /* I'm not sure that this is correct, but my teTeX 0.9.17 works only 
           this way (i.e. only with "russian" in the middle) - hvv */
        {"uk","english,russian,ukrainian"},
        
-       /* I'm not sure again - my TeTex 0.9.17 doesn't know 'byelorussian' 
+       /* I'm not sure again - my teTeX 0.9.17 doesn't know 'byelorussian' 
           language - hvv */
        {"be","english,russian"},
        {NULL,NULL}
@@ -493,7 +493,7 @@
  RUSSIAN_CHARSET).
 */
 static const char* wincharsetcode_ru[]= /* russian charset */
-{ "ru","be", "uk" , NULL };
+{ "ru", "be", "uk" , NULL };
 static const char* wincharsetcode_el[]=  /* greek charset*/
 { "el", NULL };
 
@@ -510,10 +510,10 @@
   Tested with GB2312 only.  
 */
 static const char* wincharsetcode_zh_GB2312[]= /* chinese*/
-{ "zh_CN.GB2312", "zh_TW.GB2312", NULL };
+{ "zh_CN.GB2312", "zh_CN.GBK", "zh_CN.GB18030", NULL };
 
 static const char* wincharsetcode_zh_BIG5[]= /* chinese*/
-{ "zh_CN.BIG5", "zh_TW.BIG5", NULL };
+{ "zh_TW.BIG5", "zh_HK.BIG5-HKSCS", NULL };
 
 static const _rmap langcode_to_wincharsetcode[]=
 {
@@ -528,7 +528,7 @@
        {NULL}
 };
 
-static const UT_Bijection::pair_data zh_CN_big5[]=
+static const UT_Bijection::pair_data zh_TW_big5[]=
 {
 /*
     This data was constructed from the HJ's patch for support  of Big5 to 
@@ -541,20 +541,20 @@
     {NULL,NULL}
 };
 
-static const char* zh_CN_big5_keys[]=
-{  "zh_CN.BIG5", NULL };
+static const char* zh_TW_big5_keys[]=
+{  "zh_TW.BIG5", NULL };
 
 static const _rmap cjk_word_fontname_mapping_data[]=
 {
     {NULL},
-    {(char*)zh_CN_big5,zh_CN_big5_keys},
+    {(char*)zh_TW_big5,zh_TW_big5_keys},
     {NULL}
 };
 
 
 /*all CJK language codes should be listed here to be marked as CJK*/
 static const char* cjk_languages[]=
-{ "zh","ja","ko",NULL}; 
+{ "zh", "ja", "ko", NULL }; 
 
 static const _rmap langcode_to_cjk[]=
 {
@@ -585,16 +585,16 @@
     {NULL,NULL},
        // libiconv also lists "SHIFT_JIS", "SHIFT-JIS", "MS_KANJI", "csShiftJIS"
        {"CP932","SJIS"},
-    {"CP936","GB2312"},
-    {"CP950","BIG5"},  
+    {"CP936","GBK"},
+    {"CP950","BIG5"},  
        {"CP1361","JOHAB"},
     {NULL,NULL}
 };
 
 /*
- This table is only concern CJK RTF part.It is a reverse table of
- MSCodepagename_to_charset_name_map.Iconv doesn't know some cpNNNN,
- but M$Word know.
+ This table is only concern CJK RTF part.  It is a reverse table of
+ MSCodepagename_to_charset_name_map.  Iconv doesn't know some cpNNNN,
+ but M$Word knows.
 */
 static const _map charset_name_to_MSCodepagename_map[]=
 {
@@ -603,7 +603,10 @@
        // libiconv also lists "SHIFT_JIS", "SHIFT-JIS", "MS_KANJI", "csShiftJIS"
        {"SJIS","CP932"},
     {"GB2312","CP936"},
+    {"GBK","CP936"},
+    {"GB18030","CP936"},
     {"BIG5","CP950"},
+    {"BIG5-HKSCS","CP950"},
        {"JOHAB","CP1361"},
     {NULL,NULL}
 };
@@ -613,10 +616,11 @@
 {
 /*key, value*/
     {NULL},
-   {"zh_CN.BIG5",      "0x404"},  
    {"zh_CN.GB2312",    "0x804"},     
+   {"zh_CN.GBK",       "0x804"}, 
+   {"zh_CN.GB18030",   "0x804"}, 
+   {"zh_HK.BIG5-HKSCS",        "0x404"},  
    {"zh_TW.BIG5",      "0x404"},  
-   {"zh_TW.GB2312",    "0x804"}, 
     {NULL}
 };
 
diff -aur abiword-0.99.1~/abi/src/af/xap/xp/xap_String_Id.h 
abiword-0.99.1/abi/src/af/xap/xp/xap_String_Id.h
--- abiword-0.99.1~/abi/src/af/xap/xp/xap_String_Id.h   Wed Jan 16 10:00:07 2002
+++ abiword-0.99.1/abi/src/af/xap/xp/xap_String_Id.h    Thu Jan 17 13:01:50 2002
@@ -339,12 +339,13 @@
 dcl(ENC_GEOR_PS,                                               "Georgian, PS")
 /* Multibyte CJK */
 /* Chinese Simplified */
-dcl(ENC_CHSI_EUC,                                              "Chinese Simplified, EUC-CN")
+dcl(ENC_CHSI_EUC,                                              "Chinese Simplified, EUC-CN (GB2312)")
 dcl(ENC_CHSI_GB,                                               "Chinese Simplified, GB_2312-80")       // Cf. EUC
 dcl(ENC_CHSI_HZ,                                               "Chinese Simplified, HZ")
 dcl(ENC_CHSI_WIN,                                              "Chinese Simplified, Windows Code Page 936")
 /* Chinese Traditional */
 dcl(ENC_CHTR_BIG5,                                             "Chinese Traditional, BIG5")
+dcl(ENC_CHTR_BIG5HKSCS,                                                "Chinese Traditional, BIG5-HKSCS")
 dcl(ENC_CHTR_EUC,                                              "Chinese Traditional, EUC-TW")
 dcl(ENC_CHTR_WIN,                                              "Chinese Traditional, Windows Code Page 950")
 /* Japanese */
diff -aur abiword-0.99.1~/abi/src/wp/impexp/xp/ie_imp_RTF.cpp 
abiword-0.99.1/abi/src/wp/impexp/xp/ie_imp_RTF.cpp
--- abiword-0.99.1~/abi/src/wp/impexp/xp/ie_imp_RTF.cpp Tue Jan  8 20:08:08 2002
+++ abiword-0.99.1/abi/src/wp/impexp/xp/ie_imp_RTF.cpp  Thu Jan 17 12:58:58 2002
@@ -920,7 +920,7 @@
                        break;
                        // 936  Chinese: Simplified
                case 936:
-                       CPNAME_OR_FALLBACK(m_szEncoding,"CP936","GB2312");
+                       CPNAME_OR_FALLBACK(m_szEncoding,"CP936","GBK");
                        break;
                        // 950  Chinese: Traditional
                case 950:
@@ -977,7 +977,7 @@
                                m_szEncoding = "CP1361";
                                break;
                        case 134:       // Chinese GB - undocumented?
-                               CPNAME_OR_FALLBACK(m_szEncoding,"CP936","GB2312");
+                               CPNAME_OR_FALLBACK(m_szEncoding,"CP936","GBK");
                                break;
                        case 136:       // Chinese BIG5 - undocumented?
                                CPNAME_OR_FALLBACK(m_szEncoding,"CP950","BIG5");
diff -aur abiword-0.99.1~/wv/text.c abiword-0.99.1/wv/text.c
--- abiword-0.99.1~/wv/text.c   Thu Dec 27 23:55:49 2001
+++ abiword-0.99.1/wv/text.c    Thu Jan 17 13:36:27 2002
@@ -592,11 +592,12 @@
       switch (lid)
                {
 #if 0
-       case 0x0c04:            /*Chinese (Hong Kong SAR, PRC) */
        case 0x1404:            /*Chinese (Macau SAR) */
 #endif
+       case 0x0c04:            /*Chinese (Hong Kong SAR, PRC) */
+               CPNAME_OR_FALLBACK ("CP950", "BIG5-HKSCS");
        case 0x0804:            /*Chinese (PRC) */
-               CPNAME_OR_FALLBACK ("CP936", "GB2312");
+               CPNAME_OR_FALLBACK ("CP936", "GBK");
 #if 0
        case 0x1004:            /*Chinese (Singapore) */
 #endif

Patch to fix Bug 1462: zh-CN copy&paste problem

Reply via email to