This patch enables the RTF importer to make use of the \fcharset and
\fcpg properties of the font table and switch between encodings
when it encounters \f.
There are a few edge cases and charsets I couldn't find information
on, so please contact me or implement these if you know about them.
Andrew Dunbar.
--
http://linguaphile.sourceforge.net
Index: src/wp/impexp/xp/ie_imp_RTF.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_RTF.cpp,v
retrieving revision 1.61
diff -u -r1.61 ie_imp_RTF.cpp
--- src/wp/impexp/xp/ie_imp_RTF.cpp 2001/05/08 04:19:38 1.61
+++ src/wp/impexp/xp/ie_imp_RTF.cpp 2001/05/21 08:16:57
@@ -149,10 +149,89 @@
m_family = fontFamily;
m_charSet = charSet;
m_codepage = codepage;
+ m_szCodepage = "MS-ANSI";
m_pitch = pitch;
memcpy(m_panose, panose, 10*sizeof(unsigned char));
m_pFontName = pFontName;
m_pAlternativeFontName = pAlternativeFontName;
+
+ // Set charset/codepage converter
+ if (m_codepage && m_charSet)
+ {
+ UT_DEBUGMSG(("RTF Font has codepage *and* charset\n"));
+ UT_ASSERT(UT_NOT_IMPLEMENTED);
+ }
+ else if (m_codepage)
+ {
+ m_szCodepage =
+XAP_EncodingManager::get_instance()->charsetFromCodepage(m_codepage);
+ }
+ else if (m_charSet)
+ {
+ switch (m_charSet)
+ {
+ case 0: // ANSI_CHARSET
+ m_szCodepage = "MS-ANSI"; // CP1252
+ break;
+ case 2: // SYMBOL_CHARSET
+ UT_DEBUGMSG(("RTF Font charset 'Symbol' not
+implemented\n"));
+ UT_ASSERT(UT_NOT_IMPLEMENTED);
+ break;
+ case 128: // SHIFTJIS_CHARSET
+ m_szCodepage = "SHIFT-JIS";
+ break;
+ case 161: // GREEK_CHARSET
+ m_szCodepage = "MS-GREEK"; // CP1253
+ break;
+ case 162: // TURKISH_CHARSET
+ m_szCodepage = "MS-TURK"; // CP1254
+ break;
+ // TODO What is different? Iconv only supports one MS Hebrew codepage.
+ case 181: // HEBREWUSER_CHARSET
+ UT_DEBUGMSG(("RTF Font charset 'HEBREWUSER'??\n"));
+ case 177: // HEBREW_CHARSET
+ m_szCodepage = "MS-HEBR"; // CP1255
+ break;
+ // TODO What is different? Iconv only supports one MS Arabic codepage.
+ case 178: // ARABICSIMPLIFIED_CHARSET
+ UT_DEBUGMSG(("RTF Font charset
+'ARABICSIMPLIFIED'??\n"));
+ m_szCodepage = "MS-ARAB"; // CP1256
+ break;
+ case 179: // ARABICTRADITIONAL_CHARSET
+ UT_DEBUGMSG(("RTF Font charset
+'ARABICTRADITIONAL'??\n"));
+ m_szCodepage = "MS-ARAB"; // CP1256
+ break;
+ case 180: // ARABICUSER_CHARSET
+ UT_DEBUGMSG(("RTF Font charset 'ARABICUSER'??\n"));
+ m_szCodepage = "MS-ARAB"; // CP1256
+ break;
+ case 204: // CYRILLIC_CHARSET
+ m_szCodepage = "MS-CYRL"; // CP1251
+ break;
+ case 238: // EASTERNEUROPE_CHARSET
+ m_szCodepage = "MS-EE"; // CP1250
+ break;
+ case 254: // PC437_CHARSET
+ // TODO What is this and can iconv do it?
+ UT_DEBUGMSG(("RTF Font charset 'PC437'??\n"));
+ UT_ASSERT(UT_NOT_IMPLEMENTED);
+ break;
+ case 255: // OEM_CHARSET
+ // TODO Can iconv do this?
+ UT_DEBUGMSG(("RTF Font charset 'OEM'??\n"));
+ UT_ASSERT(UT_NOT_IMPLEMENTED);
+ break;
+ default:
+ UT_DEBUGMSG(("RTF Font charset unknown: %d\n",
+m_charSet));
+ // TODO Unknown charset
+ UT_ASSERT(UT_NOT_IMPLEMENTED);
+ }
+ }
+ else
+ {
+ // TODO No codepage or charset - what do we do?
+ UT_DEBUGMSG(("RTF Font has neither codepage *nor* charset\n"));
+ // UT_ASSERT(UT_NOT_IMPLEMENTED);
+ }
}
@@ -3499,6 +3578,10 @@
bool IE_Imp_RTF::HandleFace(UT_uint32 fontNumber)
{
+ RTFFontTableItem* pFont = GetNthTableFont(fontNumber); // font-table entry for the font number being selected
+ if (pFont != NULL) // no entry returned: leave the converter's input charset unchanged
+ m_mbtowc.setInCharset(pFont->m_szCodepage); // switch the input charset to the name recorded for this font
+
return HandleU32CharacterProp(fontNumber,
&m_currentRTFState.m_charProps.m_fontNumber);
}
Index: src/wp/impexp/xp/ie_imp_RTF.h
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_RTF.h,v
retrieving revision 1.32
diff -u -r1.32 ie_imp_RTF.h
--- src/wp/impexp/xp/ie_imp_RTF.h 2001/05/09 12:34:18 1.32
+++ src/wp/impexp/xp/ie_imp_RTF.h 2001/05/21 08:16:59
@@ -51,6 +51,7 @@
FontFamilyEnum m_family;
int m_charSet;
int m_codepage;
+ const char* m_szCodepage;
FontPitch m_pitch;
unsigned char m_panose[10];
char* m_pFontName;