Patch: Multi-encoding Text import/export

Andrew Dunbar Sat, 19 May 2001 00:56:20 -0700

I consider this a pretty important change.

It allows you to import a text file regardless of
whether it uses an old 8-bit encoding, UTF-8, or
UCS-2 (as used on Windows and Mac OS X).

It also allows you to export to any of these text
formats - though changes are needed to the rest of
AbiWord to fully support this.

This also means we will no longer need separate
UTF-8 and UCS-2 importers and exporters and any
.txt file will "just work" - perfect for church
secretaries (:

Please somebody have a serious look at this!
Feedback much appreciated.

Andrew Dunbar.

-- 
http://linguaphile.sourceforge.net

Index: src/af/util/xp/ut_mbtowc.cpp
===================================================================
RCS file: /cvsroot/abi/src/af/util/xp/ut_mbtowc.cpp,v
retrieving revision 1.17
diff -u -r1.17 ut_mbtowc.cpp
--- src/af/util/xp/ut_mbtowc.cpp        2001/05/03 21:11:37     1.17
+++ src/af/util/xp/ut_mbtowc.cpp        2001/05/19 07:27:00
@@ -22,6 +22,8 @@
 #include <limits.h>
 #include "ut_mbtowc.h"
 
+// UTF-8 can use up to 6 bytes
+#define MY_MB_LEN_MAX 6
 
 #if 0 /* big if 0 */
 #if defined(__OpenBSD__) || defined(__FreeBSD__)
@@ -191,7 +193,7 @@
 
 int UT_Mbtowc::mbtowc(wchar_t &wc,char mb)
 {
-  if(++m_bufLen>MB_LEN_MAX)
+  if(++m_bufLen>MY_MB_LEN_MAX)
        {
          initialize();
          return 0;
@@ -202,7 +204,7 @@
 #else
   size_t thisLen=mbrtowc(&wc,m_buf,m_bufLen,&m_state);
 #endif
-  if(thisLen>MB_LEN_MAX)return 0;
+  if(thisLen>MY_MB_LEN_MAX)return 0;
   if(thisLen==0)thisLen=1;
   m_bufLen-=thisLen;
   return 1;
@@ -229,6 +231,12 @@
     cd = iconv_open("UCS-2", charset );
 };
 
+UT_Mbtowc::UT_Mbtowc(const char* from_charset): m_bufLen(0)
+{
+    cd = iconv_open("UCS-2", from_charset);
+    UT_ASSERT(cd != (iconv_t)-1);    
+};
+
 UT_Mbtowc::UT_Mbtowc(): m_bufLen(0)
 {
     cd = iconv_open("UCS-2", XAP_EncodingManager::get_instance()->getNativeEncodingName() );
@@ -250,7 +258,7 @@
 
 int UT_Mbtowc::mbtowc(wchar_t &wc,char mb)
 {
-    if(++m_bufLen>MB_LEN_MAX) {
+    if(++m_bufLen>MY_MB_LEN_MAX) {
       initialize();
       return 0;
     }
Index: src/af/util/xp/ut_mbtowc.h
===================================================================
RCS file: /cvsroot/abi/src/af/util/xp/ut_mbtowc.h,v
retrieving revision 1.9
diff -u -r1.9 ut_mbtowc.h
--- src/af/util/xp/ut_mbtowc.h  2000/11/04 04:54:56     1.9
+++ src/af/util/xp/ut_mbtowc.h  2001/05/19 07:27:00
@@ -49,6 +49,7 @@
 public:
   void initialize();
   UT_Mbtowc();
+  UT_Mbtowc(const char* from_charset);
   UT_Mbtowc(const UT_Mbtowc& v);
   ~UT_Mbtowc();  
   int mbtowc(wchar_t &wc,char mb);
Index: src/wp/impexp/xp/ie_exp.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp.cpp,v
retrieving revision 1.46
diff -u -r1.46 ie_exp.cpp
--- src/wp/impexp/xp/ie_exp.cpp 2001/05/05 20:08:13     1.46
+++ src/wp/impexp/xp/ie_exp.cpp 2001/05/19 07:27:31
@@ -109,7 +109,8 @@
        // TODO add code to make a backup of the original file, if it exists.
 
 #ifndef HAVE_GNOMEVFS
-       m_fp = fopen(szFilename,"w");
+       // Open file in binary mode or UCS-2 output will be mangled.
+       m_fp = fopen(szFilename,"wb");
        return (m_fp != 0);
 #else
        GnomeVFSResult result;
Index: src/wp/impexp/xp/ie_exp_Text.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp_Text.cpp,v
retrieving revision 1.23
diff -u -r1.23 ie_exp_Text.cpp
--- src/wp/impexp/xp/ie_exp_Text.cpp    2001/05/05 20:08:13     1.23
+++ src/wp/impexp/xp/ie_exp_Text.cpp    2001/05/19 07:27:39
@@ -34,6 +34,8 @@
 
 #include "ut_string_class.h"
 
+#define MY_MB_LEN_MAX 6
+
 //////////////////////////////////////////////////////////////////
 // a private listener class to help us translate the document
 // into a text stream.  code is at the bottom of this file.
@@ -70,12 +72,18 @@
 protected:
        void                            _closeBlock(void);
        void                            _outputData(const UT_UCSChar * p, UT_uint32 length);
+       void                            _output8BitData(const UT_UCSChar * , UT_uint32 length);
+       void                            _output16BitData(const UT_UCSChar * , UT_uint32 length);
        
        PD_Document *           m_pDocument;
        IE_Exp_Text *           m_pie;
        bool                            m_bInBlock;
        bool                            m_bToClipboard;
-       UT_Wctomb               m_wctomb;
+       bool                            m_bFirstWrite;
+       UT_Wctomb                       m_wctomb;
+       const char *            m_szEncoding;
+       bool                            m_bBigEndian;
+       bool                            m_bUseBOM;
 };
 
 /*****************************************************************/
@@ -144,34 +152,56 @@
        if (!m_bInBlock)
                return;
 
-#ifdef WIN32                                                   // we need to generate CRLFs on Win32
-       if (m_bToClipboard)                                     // when writing to the clipboard.  we
-               m_pie->write("\r");                             // use text mode when going to a file
-#endif                                                                 // so we don't need to then.
+       // TODO All writes should be re-routed via iconv since UCS-2
+       // TODO uses two bytes for each character.
+       // TODO Old Mac should use "\r".  Mac OSX should Use U+2028 or U+2029.
+#ifdef WIN32
+       m_pie->write("\r\n");
+#else
        m_pie->write("\n");
+#endif
        m_bInBlock = false;
        return;
 }
 
 void s_Text_Listener::_outputData(const UT_UCSChar * data, UT_uint32 length)
 {
+       if (m_szEncoding && !strncmp(m_szEncoding,"UCS-2",5))
+               _output16BitData(data, length);
+       else
+               _output8BitData(data, length);
+}
+
+void s_Text_Listener::_output8BitData(const UT_UCSChar * data, UT_uint32 length)
+{
        UT_String sBuf;
        const UT_UCSChar * pData;
        
        int mbLen;
-       char pC[MB_LEN_MAX];
-       
+       char pC[MY_MB_LEN_MAX];
+
        UT_ASSERT(sizeof(UT_Byte) == sizeof(char));
 
+       if (m_bFirstWrite)
+       {
+               if (m_szEncoding)
+                       m_wctomb.setOutCharset(m_szEncoding);
+               if (m_bUseBOM)
+               {
+                       // TODO There may be reason for using a BOM in UTF-8 text.
+                       // TODO I've seen MS software do it.
+               }
+               m_bFirstWrite = false;
+       }
+
        for (pData=data; (pData<data+length); /**/)
        {
                if(!m_wctomb.wctomb(pC,mbLen,(wchar_t)*pData))
                {
-                   mbLen=1;
-                   pC[0]='?';
-                   m_wctomb.initialize();
+                       mbLen=1;
+                       pC[0]='?';
+                       m_wctomb.initialize();
                }
-               pData++;                
                if (mbLen>1)            
                {
                        sBuf += pC;
@@ -179,17 +209,89 @@
                else
                {
                        // We let any UCS_LF's (forced line breaks) go out as is.
+                       // TODO Old Mac should use "\r".  Mac OSX should Use U+2028 or U+2029.
 #ifdef WIN32
-                       if (m_bToClipboard && pC[0]==UCS_LF)
+                       if (pC[0]==UCS_LF)
                                sBuf += "\r";
 #endif
                        sBuf += (char)pC[0];
                }
+               pData++;
        }
 
        m_pie->write(sBuf.c_str(),sBuf.size());
 }
 
+void s_Text_Listener::_output16BitData(const UT_UCSChar * data, UT_uint32 length)
+{
+       const UT_UCSChar * pInData;
+       char * pOutData;
+       
+       int mbLen;
+       unsigned char pC[MY_MB_LEN_MAX];
+       char * pConvertedData = 0;
+       
+       UT_ASSERT(sizeof(UT_Byte) == sizeof(char));
+
+       pConvertedData = new char[length * sizeof(UT_UCSChar)];
+       pOutData = pConvertedData;
+
+       UT_ASSERT(pConvertedData);
+
+       if (m_bFirstWrite)
+       {
+               if (m_szEncoding)
+                       m_wctomb.setOutCharset(m_szEncoding);
+               if (m_bUseBOM)
+               {
+                       if (m_bBigEndian)
+                               m_pie->write("\xfe\xff",2);
+                       else
+                               m_pie->write("\xff\xfe",2);
+               }
+               m_bFirstWrite = false;
+       }
+
+       for (pInData=data; (pInData<data+length); /**/)
+       {
+               if(!m_wctomb.wctomb(reinterpret_cast<char *>(pC),mbLen,(wchar_t)*pInData))
+               {
+                       // TODO U+FFFD "REPLACEMENT CHARACTER" is the
+                       // TODO correct unicode equivalent of '?' isn't it?
+                       mbLen=2;
+                       if (m_bBigEndian)
+                       {
+                               pC[0]=0xff;
+                               pC[1]=0xfd;
+                       }
+                       else
+                       {
+                               pC[0]=0xfd;
+                               pC[1]=0xff;
+                       }
+                       m_wctomb.initialize();
+               }
+               // We let any UCS_LF's (forced line breaks) go out as is.
+               if (*pInData == UCS_LF)
+               {
+                       // TODO Old Mac should use "\r".  Mac OSX should Use U+2028 or U+2029.
+#ifdef WIN32
+                       // TODO Win needs to *insert* an extra CR character before the LF.
+                       // TODO The old 8-bit code used UT_String which could grow dynamically
+                       // TODO but the 16-bit code uses a fixed size buffer.
+                       // TODO What is an appropriate solution?
+#endif
+               }
+               *pOutData++ = pC[0];
+               *pOutData++ = pC[1];
+               ++pInData;
+       }
+
+       m_pie->write(pConvertedData,length * sizeof(UT_UCSChar));
+
+       delete [] pConvertedData;
+}
+
 s_Text_Listener::s_Text_Listener(PD_Document * pDocument,
                                                                 IE_Exp_Text * pie,
                                                                 bool bToClipboard)
@@ -201,6 +303,10 @@
        // assume that we are starting in the middle of a block.
        // when going to a file we should not.
        m_bInBlock = m_bToClipboard;
+       m_bFirstWrite = true;
+       m_szEncoding = 0;
+       m_bBigEndian = true;
+       m_bUseBOM = false;
 }
 
 s_Text_Listener::~s_Text_Listener()
Index: src/wp/impexp/xp/ie_imp.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp.cpp,v
retrieving revision 1.40
diff -u -r1.40 ie_imp.cpp
--- src/wp/impexp/xp/ie_imp.cpp 2001/05/07 16:50:43     1.40
+++ src/wp/impexp/xp/ie_imp.cpp 2001/05/19 07:27:40
@@ -176,7 +176,8 @@
                char szBuf[4096];  // 4096 ought to be enough
                int iNumbytes;
                FILE *f;
-               if ( ( f = fopen( szFilename, "r" ) ) != (FILE *)0 )
+               // we must open in binary mode for UCS-2 compatibility
+               if ( ( f = fopen( szFilename, "rb" ) ) != (FILE *)0 )
                {
                        iNumbytes = fread(szBuf, 1, sizeof(szBuf), f);
                        fclose(f);
Index: src/wp/impexp/xp/ie_imp_Text.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.cpp,v
retrieving revision 1.24
diff -u -r1.24 ie_imp_Text.cpp
--- src/wp/impexp/xp/ie_imp_Text.cpp    2001/05/03 00:45:36     1.24
+++ src/wp/impexp/xp/ie_imp_Text.cpp    2001/05/19 07:28:01
@@ -29,21 +29,177 @@
 #include "ut_growbuf.h"
 #include "xap_EncodingManager.h"
 
+// TODO Can we make these members of the importer or the sniffer?
+enum UCS2_Endian { UE_BigEnd = -1, UE_NotUCS = 0, UE_LittleEnd };
+
+static bool _recognizeUTF8 (const char * szBuf,
+                                                       UT_uint32 iNumbytes);
+static UCS2_Endian _recognizeUCS2 (const char * szBuf,
+                                                                  UT_uint32 iNumbytes,
+                                                                  bool bDeep);
+
 /*****************************************************************/
 /*****************************************************************/
 
 bool IE_Imp_Text_Sniffer::recognizeContents(const char * szBuf, 
                                                                                       
 UT_uint32 iNumbytes)
 {
-       // We give the other guys a chance, since this
-       // importer is so generic.  
-       return false;
+       // TODO It may or may not be worthwhile trying to guess CJK encodings.
+       
+       bool bSuccess = false;
+
+       bSuccess = _recognizeUTF8(szBuf, iNumbytes);
+
+       if (bSuccess == false)
+       {
+               if (_recognizeUCS2(szBuf, iNumbytes, false) != UE_NotUCS)
+               {
+                       bSuccess = true;
+               }
+       }
+       
+       return bSuccess;
+}
+
+static bool _recognizeUTF8(const char * szBuf,
+                                                                               
+UT_uint32 iNumbytes)
+{
+       bool bSuccess = false;
+       const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf);
+
+       while (p < reinterpret_cast<const unsigned char *>(szBuf + iNumbytes))
+       {
+               UT_sint32 iLen;
+               
+               if ((*p & 0x80) == 0)                           // ASCII
+               {
+                       ++p;
+                       continue;
+               }
+               else if ((*p & 0xc0) == 0x80)                   // not UTF-8
+               {
+                       return false;
+               }
+               else if (*p == 0xfe || *p == 0xff)
+               {
+                       // BOM shouldn't occur in UTF-8 - file may be UCS-2
+                       return false;
+               }
+               else if ((*p & 0xfe) == 0xfc)                   // lead byte in 6-byte sequence
+                       iLen = 6;
+               else if ((*p & 0xfc) == 0xf8)                   // lead byte in 5-byte sequence
+                       iLen = 5;
+               else if ((*p & 0xf8) == 0xf0)                   // lead byte in 4-byte sequence
+                       iLen = 4;
+               else if ((*p & 0xf0) == 0xe0)                   // lead byte in 3-byte sequence
+                       iLen = 3;
+               else if ((*p & 0xe0) == 0xc0)                   // lead byte in 2-byte sequence
+                       iLen = 2;
+               else    
+               {
+                       // the above code covers all cases - if we reach here the logic is wrong
+                       UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+                       return false;
+               }
+       
+               while (--iLen)
+               {
+                       ++p;
+                       if (p >= reinterpret_cast<const unsigned char *>(szBuf + iNumbytes))
+                       {
+                               //UT_DEBUGMSG(("  out of data!\n"));
+                               break;
+                       }
+                       if ((*p & 0xc0) != 0x80)
+                               return false;
+               }
+               // all bytes in sequence were ok
+               bSuccess = true;
+               ++p;
+       }
+       
+       return bSuccess;
 }
 
+static UCS2_Endian _recognizeUCS2(const char * szBuf,
+                                                               UT_uint32 iNumbytes,
+                                                               bool bDeep)
+{
+       UCS2_Endian eResult = UE_NotUCS;
+       
+       if (iNumbytes >= 2)
+       {
+               const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf);
+
+               // Big endian ?
+               if (p[0] == 0xfe && p[1] == 0xff)
+                       eResult = UE_BigEnd;
+
+               // Little endian
+               else if (p[0] == 0xff && p[1] == 0xfe)
+                       eResult = UE_LittleEnd;
+
+               if (eResult == UE_NotUCS && bDeep)
+               {
+                       // If we know this is a text file, know it isn't UTF-8, and it doesn't
+                       // begin with a BOM, let's try a couple of heuristics to see if it
+                       // might be a UCS-2 file without a BOM.
+                       // Since CR and LF are very common and their endian-swapped counterparts
+                       // are reserved in Unicode, they should only exist in big endian or
+                       // little endian but not both.
+                       // If there are no CRs or LFs we fall back on counting how many characters
+                       // fall within the ASCII range for both endians.  The one with the higher
+                       // count wins.
+                       // Text files which contain NUL characters will be wrongly identified as
+                       // UCS-2 using this technique.
+
+                       UT_sint32 iLineEndBE = 0;
+                       UT_sint32 iLineEndLE = 0;
+                       UT_sint32 iAsciiBE = 0;
+                       UT_sint32 iAsciiLE = 0;
+
+                       // Count all CR, LF, and ASCII range characters.
+                       for (p = reinterpret_cast<const unsigned char *>(szBuf);
+                                p < reinterpret_cast<const unsigned char *>(szBuf + iNumbytes - 1);
+                                p += 2)
+                       {
+                               // A 16-bit null character probably won't exist in a UCS-2 file
+                               if (p[0] == 0 && p[1] == 0)
+                                       break;
+                               if (p[0] == 0)
+                               {
+                                       ++iAsciiBE;
+                                       if (p[1] == 0x0A || p[1] == 0x0D)
+                                               ++iLineEndBE;
+                               }
+                               if (p[1] == 0)
+                               {
+                                       ++iAsciiLE;
+                                       if (p[0] == 0x0A || p[0] == 0x0D)
+                                               ++iLineEndLE;
+                               }
+                       }
+
+                       // Take an educated guess.
+                       if (iLineEndBE && !iLineEndLE)
+                               eResult = UE_BigEnd;
+                       else if (iLineEndLE && !iLineEndBE)
+                               eResult = UE_LittleEnd;
+                       else if (!iLineEndBE && !iLineEndLE)
+                       {
+                               if (iAsciiBE > iAsciiLE)
+                                       eResult = UE_BigEnd;
+                               else if (iAsciiLE > iAsciiBE)
+                                       eResult = UE_LittleEnd;
+                       }
+               }
+       }
+
+       return eResult;
+}
+
 bool IE_Imp_Text_Sniffer::recognizeSuffix(const char * szSuffix)
 {
-       // We give the other guys a chance, since this
-       // importer is so generic.
        return (!UT_stricmp (szSuffix, ".txt") || !UT_stricmp(szSuffix, ".text"));
 }
 
@@ -69,10 +225,9 @@
 /*****************************************************************/
 
 /*
-  Import US-ASCII (actually Latin-1) data from a plain
-  text file.  We allow either LF or CR or CRLF line
-  termination.  Each line terminator is taken to be a
-  paragraph break.
+  Import data from a plain text file.  We allow either
+  LF or CR or CRLF line termination.  Each line
+  terminator is taken to be a paragraph break.
 */
 
 /*****************************************************************/
@@ -82,7 +237,8 @@
 
 UT_Error IE_Imp_Text::importFile(const char * szFilename)
 {
-       FILE *fp = fopen(szFilename, "r");
+       // We must open in binary mode for UCS-2 compatibility.
+       FILE *fp = fopen(szFilename, "rb");
        if (!fp)
        {
                UT_DEBUGMSG(("Could not open file %s\n",szFilename));
@@ -91,6 +247,9 @@
        
        UT_Error error;
 
+       // First we need to determine the encoding.
+       // TODO We might want to find a way to combine this with recognizeContents().
+       X_CleanupIfError(error,_recognizeEncoding(fp));
        X_CleanupIfError(error,_writeHeader(fp));
        X_CleanupIfError(error,_parseFile(fp));
 
@@ -113,6 +272,7 @@
 IE_Imp_Text::IE_Imp_Text(PD_Document * pDocument)
        : IE_Imp(pDocument)
 {
+       m_szEncoding = 0;
 }
 
 /*****************************************************************/
@@ -121,6 +281,37 @@
 #define X_ReturnIfFail(exp,error)              do { bool b = (exp); if (!b) return (error); } while (0)
 #define X_ReturnNoMemIfError(exp)      X_ReturnIfFail(exp,UT_IE_NOMEMORY)
 
+UT_Error IE_Imp_Text::_recognizeEncoding(FILE * fp)
+{
+       char szBuf[4096];  // 4096 ought to be enough
+       UT_sint32 iNumbytes;
+
+       iNumbytes = fread(szBuf, 1, sizeof(szBuf), fp);
+       fseek(fp, 0, SEEK_SET);
+
+       if (_recognizeUTF8(szBuf, iNumbytes))
+       {
+               m_szEncoding = "UTF-8";
+       }
+       else
+       {
+               UCS2_Endian eUcs2 = UE_NotUCS;
+
+               eUcs2 = _recognizeUCS2(szBuf, iNumbytes, true);
+               
+               if (eUcs2 == UE_BigEnd)
+               {
+                       m_szEncoding = "UCS-2-BE";
+               }
+               else if (eUcs2 == UE_LittleEnd)
+               {
+                       m_szEncoding = "UCS-2-LE";
+               }
+       }
+
+       return UT_OK;
+}
+
 UT_Error IE_Imp_Text::_writeHeader(FILE * /* fp */)
 {
        X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Section, NULL));
@@ -137,6 +328,9 @@
        UT_UCSChar c;
        wchar_t wc;
 
+       if (m_szEncoding)
+               m_Mbtowc.setInCharset(m_szEncoding);
+
        while (fread(&b, 1, sizeof(b), fp) > 0)
        {
                if(!m_Mbtowc.mbtowc(wc,b))
@@ -146,6 +340,8 @@
                {
                case (UT_UCSChar)'\r':
                case (UT_UCSChar)'\n':
+               case 0x2028:                    // Unicode line separator
+               case 0x2029:                    // Unicode paragraph separator
                        
                        if ((c == (UT_UCSChar)'\n') && bEatLF)
                        {
@@ -158,7 +354,9 @@
                                bEatLF = true;
                        }
                        
-                       // we interprete either CRLF, CR, or LF as a paragraph break.
+                       // we interpret either CRLF, CR, or LF as a paragraph break.
+                       // we also accept U+2028 (line separator) and U+2029 (para separator)
+                       // especially since these are recommended by Mac OS X.
                        
                        // start a paragraph and emit any text that we
                        // have accumulated.
@@ -224,6 +422,8 @@
                {
                case (UT_UCSChar)'\r':
                case (UT_UCSChar)'\n':
+               case 0x2028:                    // Unicode line separator
+               case 0x2029:                    // Unicode paragraph separator
                        if ((c == (UT_UCSChar)'\n') && bEatLF)
                        {
                                bEatLF = false;
@@ -235,7 +435,9 @@
                                bEatLF = true;
                        }
                        
-                       // we interprete either CRLF, CR, or LF as a paragraph break.
+                       // we interpret either CRLF, CR, or LF as a paragraph break.
+                       // we also accept U+2028 (line separator) and U+2029 (para separator)
+                       // especially since these are recommended by Mac OS X.
                        
                        if (gbBlock.getLength() > 0)
                        {
Index: src/wp/impexp/xp/ie_imp_Text.h
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.h,v
retrieving revision 1.13
diff -u -r1.13 ie_imp_Text.h
--- src/wp/impexp/xp/ie_imp_Text.h      2001/05/03 00:45:36     1.13
+++ src/wp/impexp/xp/ie_imp_Text.h      2001/05/19 07:28:02
@@ -58,9 +58,11 @@
                                                                                
unsigned char * pData, UT_uint32 lenData);
        
 protected:
+       UT_Error                        _recognizeEncoding(FILE * fp);
        UT_Error                        _parseFile(FILE * fp);
        UT_Error                        _writeHeader(FILE * fp);
        UT_Mbtowc               m_Mbtowc;
+       const char *    m_szEncoding;
 };
 
 #endif /* IE_IMP_TEXT_H */

Patch: Multi-encoding Text import/export

Reply via email to