final bits for CJK patch - p5

Vlad Harchev Sat, 11 Nov 2000 12:21:17 -0600 (CST)


 Here is the latest version of the CJK bits.
 
 It contains everything p4 had, plus a patch to ie_exp_RTF*.cpp by ha shao and
 a fix that allows importing Word 8 format .doc files with CJK text inside
 (the included changes were already posted here as separate patches).

 There seems to be no reason not to commit it - it has been tested by our CJK
 hackers.

 Best regards,
  -Vlad

diff -ru abi-0.7.11-orig~/abi-0.7.11/CREDITS.TXT abi-0.7.11-orig-orig/abi-0.7.11/CREDITS.TXT
--- abi-0.7.11-orig~/abi-0.7.11/CREDITS.TXT     Thu Nov  9 19:38:46 2000
+++ abi-0.7.11-orig-orig/abi-0.7.11/CREDITS.TXT Fri Nov 10 15:05:41 2000
@@ -44,7 +44,7 @@
 Vlad Harchev <[EMAIL PROTECTED]>                     Support for non latin-1 Languages
 Stephen Hack <[EMAIL PROTECTED]>                   options dialog
 Martin Willemoes Hansen <[EMAIL PROTECTED]>
-hj <[EMAIL PROTECTED]>                           XIM, focus
+hj <[EMAIL PROTECTED]>                           XIM, focus, principal author of CJK support patch
 Roman Hodek <[EMAIL PROTECTED]>    m68k Endian patch
 Ming-I Hsieh <[EMAIL PROTECTED]>        FreeBSD
 Perry Ismangil <[EMAIL PROTECTED]>               AbiHello
@@ -106,6 +106,7 @@
 Robert G. Werner <[EMAIL PROTECTED]>    VI keybindings
 John Wood <[EMAIL PROTECTED]>                       NetBSD
 Alan Young <[EMAIL PROTECTED]>                Alpha/NT
+Belcon Zhao <[EMAIL PROTECTED]>                        Testing/fixing CJK support
 
 translators
 -----------
Only in abi-0.7.11-orig-orig/abi-0.7.11: l1.latex
diff -ru abi-0.7.11-orig~/abi-0.7.11/src/af/util/xp/ut_mbtowc.cpp abi-0.7.11-orig-orig/abi-0.7.11/src/af/util/xp/ut_mbtowc.cpp
--- abi-0.7.11-orig~/abi-0.7.11/src/af/util/xp/ut_mbtowc.cpp    Thu Nov  9 19:38:46 2000
+++ abi-0.7.11-orig-orig/abi-0.7.11/src/af/util/xp/ut_mbtowc.cpp        Thu Nov  9 22:09:50 2000
@@ -240,7 +240,9 @@
 
 UT_Mbtowc::~UT_Mbtowc()
 {
-    iconv_close(cd);
+    /*libiconv is stupid - we'll get segfault if we don't check  - VH */
+    if (cd!=(iconv_t)-1)
+           iconv_close(cd);
 };
 
 int UT_Mbtowc::mbtowc(wchar_t &wc,char mb)
diff -ru abi-0.7.11-orig~/abi-0.7.11/src/af/util/xp/ut_wctomb.cpp abi-0.7.11-orig-orig/abi-0.7.11/src/af/util/xp/ut_wctomb.cpp
--- abi-0.7.11-orig~/abi-0.7.11/src/af/util/xp/ut_wctomb.cpp    Thu Nov  9 19:38:47 2000
+++ abi-0.7.11-orig-orig/abi-0.7.11/src/af/util/xp/ut_wctomb.cpp        Thu Nov  9 22:09:30 2000
@@ -185,7 +185,9 @@
 
 UT_Wctomb::~UT_Wctomb()
 {
-    iconv_close(cd);
+    /*libiconv is stupid - we'll get segfault if we don't check  - VH */
+    if (cd!=(iconv_t)-1)
+           iconv_close(cd);
 };
 
 int UT_Wctomb::wctomb(char * pC,int &length,wchar_t wc)
diff -ru abi-0.7.11-orig~/abi-0.7.11/src/af/xap/xp/xap_EncodingManager.cpp abi-0.7.11-orig-orig/abi-0.7.11/src/af/xap/xp/xap_EncodingManager.cpp
--- abi-0.7.11-orig~/abi-0.7.11/src/af/xap/xp/xap_EncodingManager.cpp   Thu Nov  9 19:38:47 2000
+++ abi-0.7.11-orig-orig/abi-0.7.11/src/af/xap/xp/xap_EncodingManager.cpp       Fri Nov 10 15:08:48 2000
@@ -395,6 +395,15 @@
 static const char* wincharsetcode_th[]=  /* thai charset*/
 { "th", NULL };
 
+/*I'm not sure that charset code is the same for Big5 and GB2312.
+  Tested with GB2312 only.  
+*/
+static const char* wincharsetcode_zh_GB2312[]= /* chinese*/
+{ "zh_CN.GB2312", "zh_TW.GB2312", NULL };
+
+static const char* wincharsetcode_zh_BIG5[]= /* chinese*/
+{ "zh_CN.BIG5", "zh_TW.BIG5", NULL };
+
 static const _rmap langcode_to_wincharsetcode[]=
 {
        {"0"}, /* default value - ansi charset*/
@@ -403,6 +412,8 @@
        {"162",wincharsetcode_tr},
        {"163",wincharsetcode_vi},
        {"222",wincharsetcode_th},      
+       {"134",wincharsetcode_zh_GB2312},
+       {"136",wincharsetcode_zh_BIG5}, 
        {NULL}
 };
 
@@ -449,12 +460,16 @@
        {NULL}
 };
 
+/*
+ This table is needed because iconv implementations know some cpNNNN
+ charsets only under a different name.
+*/
 static const _map MSCodepagename_to_charset_name_map[]=
 {
 /*key, value*/
     {NULL,NULL},
-    {"CP936","BIG5"}, /* most probably it's correct  - VH*/
-    {"CP950","GB2312"},    /* 100% correct */
+    {"CP936","GB2312"},
+    {"CP950","BIG5"},  
     {NULL,NULL}
 };
 
@@ -463,7 +478,10 @@
 {
 /*key, value*/
     {NULL},
-/*   {"0x404","zh_CN"},*/  /*I guess - VH*/
+   {"zh_CN.BIG5",      "0x404"},  
+   {"zh_CN.GB2312",    "0x804"},     
+   {"zh_TW.BIG5",      "0x404"},  
+   {"zh_TW.GB2312",    "0x804"}, 
     {NULL}
 };
 
@@ -728,7 +746,7 @@
                    len += sprintf(buf+len,"\\usepackage[%s]{inputenc}\n",NativeTexEncodingName);
                if (NativeBabelArgument)
                    len += sprintf(buf+len,"\\usepackage[%s]{babel}\n",NativeBabelArgument);
-               TexPrologue = len ? UT_strdup(buf)  : "";
+               TexPrologue = len ? UT_strdup(buf)  : " ";
            };
        }
        if (cjk_locale()) {
@@ -815,7 +833,9 @@
 
 const char* XAP_EncodingManager::charsetFromCodepage(int lid) const
 {
-    char* cpname = wvLIDToCodePageConverter(lid);
+    static char buf[100];
+    sprintf(buf,"CP%d",lid);    
+    char* cpname = buf;
     UT_Bool is_default;
     const char* ret = search_map(MSCodepagename_to_charset_name_map,cpname,&is_default);
     return is_default ? cpname : ret;
@@ -823,7 +843,10 @@
 
 const char* XAP_EncodingManager::WindowsCharsetName() const
 {
-    return charsetFromCodepage( getWinLanguageCode() );
+    char* cpname = wvLIDToCodePageConverter(getWinLanguageCode());
+    UT_Bool is_default;
+    const char* ret = search_map(MSCodepagename_to_charset_name_map,cpname,&is_default);
+    return is_default ? cpname : ret;
 };
 
 UT_uint32  XAP_EncodingManager::getWinLanguageCode() const
diff -ru abi-0.7.11-orig~/abi-0.7.11/src/wp/impexp/xp/ie_exp_RTF_listenerWriteDoc.cpp abi-0.7.11-orig-orig/abi-0.7.11/src/wp/impexp/xp/ie_exp_RTF_listenerWriteDoc.cpp
--- abi-0.7.11-orig~/abi-0.7.11/src/wp/impexp/xp/ie_exp_RTF_listenerWriteDoc.cpp        Thu Nov  9 19:38:49 2000
+++ abi-0.7.11-orig-orig/abi-0.7.11/src/wp/impexp/xp/ie_exp_RTF_listenerWriteDoc.cpp    Sat Nov 11 21:54:27 2000
@@ -196,14 +196,20 @@
                        {
                                /*FIXME: can it happen that wctomb will fail under CJK locales? */
                                m_wctomb.wctomb_or_fallback(mbbuf,mblen,*pData++);
-                               for(int i=0;i<mblen;++i) {
-                                       unsigned char c = mbbuf[i];
-                                       if ( c > 0x007f)
+                               if (mbbuf[0] & 0x80)
+                               {
+                                       FlushBuffer();
+                                       for(int i=0;i<mblen;++i) {
+                                               unsigned char c = mbbuf[i];
                                                m_pie->_rtf_nonascii_hex2(c);
-                                       else
-                                               *pBuf++ = c;
-                                       
-                               };
+                                       }
+                               }
+                               else
+                               {
+                                       for(int i=0;i<mblen;++i) {
+                                               *pBuf++ = mbbuf[i];
+                                       }
+                               }
                        } else if (!m_pie->m_atticFormat) 
                        {
                                if (*pData > 0x00ff)            // emit unicode character
diff -ru abi-0.7.11-orig~/abi-0.7.11/src/wp/impexp/xp/ie_imp_MsWord_97.cpp abi-0.7.11-orig-orig/abi-0.7.11/src/wp/impexp/xp/ie_imp_MsWord_97.cpp
--- abi-0.7.11-orig~/abi-0.7.11/src/wp/impexp/xp/ie_imp_MsWord_97.cpp   Thu Nov  9 19:38:49 2000
+++ abi-0.7.11-orig-orig/abi-0.7.11/src/wp/impexp/xp/ie_imp_MsWord_97.cpp       Sat Nov 11 12:48:13 2000
@@ -629,7 +629,7 @@
                           else
                                 {
                                   FREEP(fname);
-                                  fname=UT_strdup(f);
+                                  fname=UT_strdup(f ? f : "helvetic");
                                 }                         
                      }
                   }
diff -ru abi-0.7.11-orig~/abi-0.7.11/src/wp/impexp/xp/ie_imp_RTF.cpp abi-0.7.11-orig-orig/abi-0.7.11/src/wp/impexp/xp/ie_imp_RTF.cpp
--- abi-0.7.11-orig~/abi-0.7.11/src/wp/impexp/xp/ie_imp_RTF.cpp Thu Nov  9 19:38:49 2000
+++ abi-0.7.11-orig-orig/abi-0.7.11/src/wp/impexp/xp/ie_imp_RTF.cpp     Fri Nov 10 15:11:25 2000
@@ -1447,6 +1447,10 @@
        //is seen
        // Now comes the font name, terminated by either a close brace or a slash or a semi-colon
        int count = 0;
+       /*
+           FIXME: CJK font names come in form \'aa\'cd\'ef - so we have to 
+           parse \'HH correctly (currently we ignore them!) - VH
+       */
        while ( ch != '}'  &&  ch != '\\'  &&  ch != ';' && ch!= '{')
        {
                keyword[count++] = ch;
@@ -1472,6 +1476,8 @@
                {
                        if (!ReadCharFromFile(&ch))
                                return UT_FALSE;
+                       if (ch=='{')
+                               ++nesting;
                }
                if (nesting>0 && i!=nesting) //we need to skip '}' we've just seen.
                        if (!ReadCharFromFile(&ch))

final bits for CJK patch - p5

Reply via email to