I consider this a pretty important change.
It allows you to import a text file regardless of
whether it uses an old 8-bit encoding, UTF-8, or
UCS-2 (as used on Windows and Mac OS X).
It also allows you to export to any of these text
formats - though changes are needed to the rest of
AbiWord to fully support this.
This also means we will no longer need separate
UTF-8 and UCS-2 importers and exporters and any
.txt file will "just work" - perfect for church
secretaries (:
Would somebody please take a serious look at this?
Feedback much appreciated.
Andrew Dunbar.
--
http://linguaphile.sourceforge.net
Index: src/af/util/xp/ut_mbtowc.cpp
===================================================================
RCS file: /cvsroot/abi/src/af/util/xp/ut_mbtowc.cpp,v
retrieving revision 1.17
diff -u -r1.17 ut_mbtowc.cpp
--- src/af/util/xp/ut_mbtowc.cpp 2001/05/03 21:11:37 1.17
+++ src/af/util/xp/ut_mbtowc.cpp 2001/05/19 07:27:00
@@ -22,6 +22,8 @@
#include <limits.h>
#include "ut_mbtowc.h"
+// UTF-8 can use up to 6 bytes
+#define MY_MB_LEN_MAX 6
#if 0 /* big if 0 */
#if defined(__OpenBSD__) || defined(__FreeBSD__)
@@ -191,7 +193,7 @@
int UT_Mbtowc::mbtowc(wchar_t &wc,char mb)
{
- if(++m_bufLen>MB_LEN_MAX)
+ if(++m_bufLen>MY_MB_LEN_MAX)
{
initialize();
return 0;
@@ -202,7 +204,7 @@
#else
size_t thisLen=mbrtowc(&wc,m_buf,m_bufLen,&m_state);
#endif
- if(thisLen>MB_LEN_MAX)return 0;
+ if(thisLen>MY_MB_LEN_MAX)return 0;
if(thisLen==0)thisLen=1;
m_bufLen-=thisLen;
return 1;
@@ -229,6 +231,12 @@
cd = iconv_open("UCS-2", charset );
};
+// Construct a converter for text in the caller-named charset: the iconv
+// descriptor is opened to translate from `from_charset` to "UCS-2".
+// A failed iconv_open() is only caught by UT_ASSERT; no other error
+// handling is done here.
+UT_Mbtowc::UT_Mbtowc(const char* from_charset): m_bufLen(0)
+{
+ cd = iconv_open("UCS-2", from_charset);
+ UT_ASSERT(cd != (iconv_t)-1);
+};
+
UT_Mbtowc::UT_Mbtowc(): m_bufLen(0)
{
cd = iconv_open("UCS-2",
XAP_EncodingManager::get_instance()->getNativeEncodingName() );
@@ -250,7 +258,7 @@
int UT_Mbtowc::mbtowc(wchar_t &wc,char mb)
{
- if(++m_bufLen>MB_LEN_MAX) {
+ if(++m_bufLen>MY_MB_LEN_MAX) {
initialize();
return 0;
}
Index: src/af/util/xp/ut_mbtowc.h
===================================================================
RCS file: /cvsroot/abi/src/af/util/xp/ut_mbtowc.h,v
retrieving revision 1.9
diff -u -r1.9 ut_mbtowc.h
--- src/af/util/xp/ut_mbtowc.h 2000/11/04 04:54:56 1.9
+++ src/af/util/xp/ut_mbtowc.h 2001/05/19 07:27:00
@@ -49,6 +49,7 @@
public:
void initialize();
UT_Mbtowc();
+ UT_Mbtowc(const char* from_charset);
UT_Mbtowc(const UT_Mbtowc& v);
~UT_Mbtowc();
int mbtowc(wchar_t &wc,char mb);
Index: src/wp/impexp/xp/ie_exp.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp.cpp,v
retrieving revision 1.46
diff -u -r1.46 ie_exp.cpp
--- src/wp/impexp/xp/ie_exp.cpp 2001/05/05 20:08:13 1.46
+++ src/wp/impexp/xp/ie_exp.cpp 2001/05/19 07:27:31
@@ -109,7 +109,8 @@
// TODO add code to make a backup of the original file, if it exists.
#ifndef HAVE_GNOMEVFS
- m_fp = fopen(szFilename,"w");
+ // Open file in binary mode or UCS-2 output will be mangled.
+ m_fp = fopen(szFilename,"wb");
return (m_fp != 0);
#else
GnomeVFSResult result;
Index: src/wp/impexp/xp/ie_exp_Text.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp_Text.cpp,v
retrieving revision 1.23
diff -u -r1.23 ie_exp_Text.cpp
--- src/wp/impexp/xp/ie_exp_Text.cpp 2001/05/05 20:08:13 1.23
+++ src/wp/impexp/xp/ie_exp_Text.cpp 2001/05/19 07:27:39
@@ -34,6 +34,8 @@
#include "ut_string_class.h"
+#define MY_MB_LEN_MAX 6
+
//////////////////////////////////////////////////////////////////
// a private listener class to help us translate the document
// into a text stream. code is at the bottom of this file.
@@ -70,12 +72,18 @@
protected:
void _closeBlock(void);
void _outputData(const UT_UCSChar * p, UT_uint32
length);
+ void _output8BitData(const UT_UCSChar * , UT_uint32
+length);
+ void _output16BitData(const UT_UCSChar * ,
+UT_uint32 length);
PD_Document * m_pDocument;
IE_Exp_Text * m_pie;
bool m_bInBlock;
bool m_bToClipboard;
- UT_Wctomb m_wctomb;
+ bool m_bFirstWrite;
+ UT_Wctomb m_wctomb;
+ const char * m_szEncoding;
+ bool m_bBigEndian;
+ bool m_bUseBOM;
};
/*****************************************************************/
@@ -144,34 +152,56 @@
if (!m_bInBlock)
return;
-#ifdef WIN32 // we need to generate
CRLFs on Win32
- if (m_bToClipboard) // when writing to the
clipboard. we
- m_pie->write("\r"); // use text mode when
going to a file
-#endif // so we don't
need to then.
+ // TODO All writes should be re-routed via iconv since UCS-2
+ // TODO uses two bytes for each character.
+ // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029.
+#ifdef WIN32
+ m_pie->write("\r\n");
+#else
m_pie->write("\n");
+#endif
m_bInBlock = false;
return;
}
void s_Text_Listener::_outputData(const UT_UCSChar * data, UT_uint32 length)
{
+	// Route the text to the writer that matches the target encoding:
+	// any encoding name beginning with "UCS-2" uses the 16-bit writer,
+	// everything else (including no encoding set) uses the 8-bit writer.
+	const bool bWideOutput = (m_szEncoding != 0) && (strncmp(m_szEncoding, "UCS-2", 5) == 0);
+
+	if (bWideOutput)
+	{
+		_output16BitData(data, length);
+	}
+	else
+	{
+		_output8BitData(data, length);
+	}
+}
+
+void s_Text_Listener::_output8BitData(const UT_UCSChar * data, UT_uint32 length)
+{
UT_String sBuf;
const UT_UCSChar * pData;
int mbLen;
- char pC[MB_LEN_MAX];
-
+ char pC[MY_MB_LEN_MAX];
+
UT_ASSERT(sizeof(UT_Byte) == sizeof(char));
+ if (m_bFirstWrite)
+ {
+ if (m_szEncoding)
+ m_wctomb.setOutCharset(m_szEncoding);
+ if (m_bUseBOM)
+ {
+ // TODO There may be reason for using a BOM in UTF-8 text.
+ // TODO I've seen MS software do it.
+ }
+ m_bFirstWrite = false;
+ }
+
for (pData=data; (pData<data+length); /**/)
{
if(!m_wctomb.wctomb(pC,mbLen,(wchar_t)*pData))
{
- mbLen=1;
- pC[0]='?';
- m_wctomb.initialize();
+ mbLen=1;
+ pC[0]='?';
+ m_wctomb.initialize();
}
- pData++;
if (mbLen>1)
{
sBuf += pC;
@@ -179,17 +209,89 @@
else
{
// We let any UCS_LF's (forced line breaks) go out as is.
+ // TODO Old Mac should use "\r". Mac OS X should use U+2028 or U+2029.
#ifdef WIN32
- if (m_bToClipboard && pC[0]==UCS_LF)
+ if (pC[0]==UCS_LF)
sBuf += "\r";
#endif
sBuf += (char)pC[0];
}
+ pData++;
}
m_pie->write(sBuf.c_str(),sBuf.size());
}
+/*
+	Write `length` characters from `data` to the exporter as 16-bit
+	output, storing exactly two bytes per input character.
+
+	On the first write the output charset of m_wctomb is set from
+	m_szEncoding (when one is set) and, if m_bUseBOM is true, a byte
+	order mark matching m_bBigEndian is written first.  Characters
+	that m_wctomb cannot convert are replaced by U+FFFD in the
+	selected byte order.
+*/
+void s_Text_Listener::_output16BitData(const UT_UCSChar * data, UT_uint32 length)
+{
+	const UT_UCSChar * pInData;
+	char * pOutData;
+
+	int mbLen;
+	unsigned char pC[MY_MB_LEN_MAX];
+	char * pConvertedData = 0;
+
+	UT_ASSERT(sizeof(UT_Byte) == sizeof(char));
+
+	// The loop below stores exactly two bytes per input character, so the
+	// buffer is sized from that rather than from sizeof(UT_UCSChar).
+	pConvertedData = new char[length * 2];
+	UT_ASSERT(pConvertedData);
+	pOutData = pConvertedData;
+
+	if (m_bFirstWrite)
+	{
+		if (m_szEncoding)
+			m_wctomb.setOutCharset(m_szEncoding);
+		if (m_bUseBOM)
+		{
+			if (m_bBigEndian)
+				m_pie->write("\xfe\xff",2);
+			else
+				m_pie->write("\xff\xfe",2);
+		}
+		m_bFirstWrite = false;
+	}
+
+	for (pInData=data; (pInData<data+length); /**/)
+	{
+		// NOTE(review): mbLen is not consulted after a successful
+		// conversion; this assumes the converter yields exactly two
+		// bytes per character - confirm.
+		if(!m_wctomb.wctomb(reinterpret_cast<char *>(pC),mbLen,(wchar_t)*pInData))
+		{
+			// TODO U+FFFD "REPLACEMENT CHARACTER" is the
+			// TODO correct unicode equivalent of '?' isn't it?
+			mbLen=2;
+			if (m_bBigEndian)
+			{
+				pC[0]=0xff;
+				pC[1]=0xfd;
+			}
+			else
+			{
+				pC[0]=0xfd;
+				pC[1]=0xff;
+			}
+			m_wctomb.initialize();
+		}
+		// We let any UCS_LF's (forced line breaks) go out as is.
+		if (*pInData == UCS_LF)
+		{
+			// TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029.
+#ifdef WIN32
+			// TODO Win needs to *insert* an extra CR character before the LF.
+			// TODO The old 8-bit code used UT_String which could grow dynamically
+			// TODO but the 16-bit code uses a fixed size buffer.
+			// TODO What is an appropriate solution?
+#endif
+		}
+		*pOutData++ = pC[0];
+		*pOutData++ = pC[1];
+		++pInData;
+	}
+
+	// Write only the bytes actually produced by the loop above.
+	m_pie->write(pConvertedData, static_cast<UT_uint32>(pOutData - pConvertedData));
+
+	delete [] pConvertedData;
+}
+
s_Text_Listener::s_Text_Listener(PD_Document * pDocument,
IE_Exp_Text * pie,
bool bToClipboard)
@@ -201,6 +303,10 @@
// assume that we are starting in the middle of a block.
// when going to a file we should not.
m_bInBlock = m_bToClipboard;
+ m_bFirstWrite = true;
+ m_szEncoding = 0;
+ m_bBigEndian = true;
+ m_bUseBOM = false;
}
s_Text_Listener::~s_Text_Listener()
Index: src/wp/impexp/xp/ie_imp.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp.cpp,v
retrieving revision 1.40
diff -u -r1.40 ie_imp.cpp
--- src/wp/impexp/xp/ie_imp.cpp 2001/05/07 16:50:43 1.40
+++ src/wp/impexp/xp/ie_imp.cpp 2001/05/19 07:27:40
@@ -176,7 +176,8 @@
char szBuf[4096]; // 4096 ought to be enough
int iNumbytes;
FILE *f;
- if ( ( f = fopen( szFilename, "r" ) ) != (FILE *)0 )
+ // we must open in binary mode for UCS-2 compatibility
+ if ( ( f = fopen( szFilename, "rb" ) ) != (FILE *)0 )
{
iNumbytes = fread(szBuf, 1, sizeof(szBuf), f);
fclose(f);
Index: src/wp/impexp/xp/ie_imp_Text.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.cpp,v
retrieving revision 1.24
diff -u -r1.24 ie_imp_Text.cpp
--- src/wp/impexp/xp/ie_imp_Text.cpp 2001/05/03 00:45:36 1.24
+++ src/wp/impexp/xp/ie_imp_Text.cpp 2001/05/19 07:28:01
@@ -29,21 +29,177 @@
#include "ut_growbuf.h"
#include "xap_EncodingManager.h"
+// TODO Can we make these members of the importer or the sniffer?
+enum UCS2_Endian { UE_BigEnd = -1, UE_NotUCS = 0, UE_LittleEnd };
+
+static bool _recognizeUTF8 (const char * szBuf,
+ UT_uint32 iNumbytes);
+static UCS2_Endian _recognizeUCS2 (const char * szBuf,
+ UT_uint32 iNumbytes,
+ bool bDeep);
+
/*****************************************************************/
/*****************************************************************/
bool IE_Imp_Text_Sniffer::recognizeContents(const char * szBuf,
UT_uint32 iNumbytes)
{
- // We give the other guys a chance, since this
- // importer is so generic.
- return false;
+	// TODO It may or may not be worthwhile trying to guess CJK encodings.
+
+	// Claim the buffer when it looks like UTF-8 ...
+	if (_recognizeUTF8(szBuf, iNumbytes))
+		return true;
+
+	// ... or when it starts with a UCS-2 byte order mark (the shallow
+	// check only; the deeper heuristics are not used for sniffing).
+	return _recognizeUCS2(szBuf, iNumbytes, false) != UE_NotUCS;
+}
+
+/*
+	Heuristically decide whether a buffer holds UTF-8 text.
+
+	Returns true only when at least one multi-byte sequence is seen
+	and no byte breaks the UTF-8 lead/continuation structure, so pure
+	7-bit ASCII yields false.  A multi-byte sequence cut short by the
+	end of the buffer is accepted.  The bytes 0xFE and 0xFF cause an
+	immediate false since they suggest a UCS-2 byte order mark.
+*/
+static bool _recognizeUTF8(const char * szBuf,
+						   UT_uint32 iNumbytes)
+{
+	bool bSuccess = false;
+	const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf);
+	const unsigned char *pEnd = p + iNumbytes;
+
+	while (p < pEnd)
+	{
+		UT_sint32 iLen;
+
+		if ((*p & 0x80) == 0)				// ASCII
+		{
+			++p;
+			continue;
+		}
+		else if ((*p & 0xc0) == 0x80)		// not UTF-8
+		{
+			return false;
+		}
+		else if (*p == 0xfe || *p == 0xff)
+		{
+			// BOM shouldn't occur in UTF-8 - file may be UCS-2
+			return false;
+		}
+		else if ((*p & 0xfe) == 0xfc)		// lead byte in 6-byte sequence
+			iLen = 6;
+		else if ((*p & 0xfc) == 0xf8)		// lead byte in 5-byte sequence
+			iLen = 5;
+		else if ((*p & 0xf8) == 0xf0)		// lead byte in 4-byte sequence
+			iLen = 4;
+		else if ((*p & 0xf0) == 0xe0)		// lead byte in 3-byte sequence
+			iLen = 3;
+		else if ((*p & 0xe0) == 0xc0)		// lead byte in 2-byte sequence
+			iLen = 2;
+		else
+		{
+			// the above code covers all cases - if we reach here the logic is wrong
+			UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+			return false;
+		}
+
+		// Check the iLen-1 continuation bytes that must follow the lead byte.
+		while (--iLen)
+		{
+			++p;
+			if (p >= pEnd)
+			{
+				// Out of data in the middle of a sequence.  Everything up
+				// to here was well formed, so accept the buffer; returning
+				// directly avoids stepping the pointer further past the end.
+				return true;
+			}
+			if ((*p & 0xc0) != 0x80)
+				return false;
+		}
+		// all bytes in sequence were ok
+		bSuccess = true;
+		++p;
+	}
+
+	return bSuccess;
}
+/*
+	Guess whether a buffer is UCS-2 text and, if so, its byte order.
+
+	A leading byte order mark (FE FF or FF FE) decides the result
+	immediately.  Without one the result is UE_NotUCS unless bDeep is
+	set, in which case the line-ending and ASCII-range heuristics
+	described in the body are applied.  Buffers shorter than two
+	bytes always yield UE_NotUCS.
+*/
+static UCS2_Endian _recognizeUCS2(const char * szBuf,
+								  UT_uint32 iNumbytes,
+								  bool bDeep)
+{
+	UCS2_Endian eResult = UE_NotUCS;
+
+	if (iNumbytes >= 2)
+	{
+		const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf);
+
+		// Big endian ?
+		if (p[0] == 0xfe && p[1] == 0xff)
+			eResult = UE_BigEnd;
+
+		// Little endian
+		else if (p[0] == 0xff && p[1] == 0xfe)
+			eResult = UE_LittleEnd;
+
+		if (eResult == UE_NotUCS && bDeep)
+		{
+			// If we know this is a text file, know it isn't UTF-8, and it doesn't
+			// begin with a BOM, let's try a couple of heuristics to see if it
+			// might be a UCS-2 file without a BOM.
+			// Since CR and LF are very common and their endian-swapped counterparts
+			// are reserved in Unicode, they should only exist in big endian or
+			// little endian but not both.
+			// If there are no CRs or LFs we fall back on counting how many characters
+			// fall within the ASCII range for both endians.  The one with the higher
+			// count wins.
+			// Text files which contain NUL characters will be wrongly identified as
+			// UCS-2 using this technique.
+
+			UT_sint32 iLineEndBE = 0;
+			UT_sint32 iLineEndLE = 0;
+			UT_sint32 iAsciiBE = 0;
+			UT_sint32 iAsciiLE = 0;
+
+			// Count all CR, LF, and ASCII range characters, one 16-bit unit at a time.
+			for (p = reinterpret_cast<const unsigned char *>(szBuf);
+				 p < reinterpret_cast<const unsigned char *>(szBuf + iNumbytes - 1);
+				 p += 2)
+			{
+				// A 16-bit null character probably won't exist in a UCS-2 file
+				if (p[0] == 0 && p[1] == 0)
+					break;
+				if (p[0] == 0)
+				{
+					++iAsciiBE;
+					if (p[1] == 0x0A || p[1] == 0x0D)
+						++iLineEndBE;
+				}
+				if (p[1] == 0)
+				{
+					++iAsciiLE;
+					if (p[0] == 0x0A || p[0] == 0x0D)
+						++iLineEndLE;
+				}
+			}
+
+			// Take an educated guess.  Line endings seen in both byte
+			// orders, or a tie in the ASCII counts, leave the result
+			// as UE_NotUCS.
+			if (iLineEndBE && !iLineEndLE)
+				eResult = UE_BigEnd;
+			else if (iLineEndLE && !iLineEndBE)
+				eResult = UE_LittleEnd;
+			else if (!iLineEndBE && !iLineEndLE)
+			{
+				if (iAsciiBE > iAsciiLE)
+					eResult = UE_BigEnd;
+				else if (iAsciiLE > iAsciiBE)
+					eResult = UE_LittleEnd;
+			}
+		}
+	}
+
+	return eResult;
+}
+
bool IE_Imp_Text_Sniffer::recognizeSuffix(const char * szSuffix)
{
- // We give the other guys a chance, since this
- // importer is so generic.
return (!UT_stricmp (szSuffix, ".txt") || !UT_stricmp(szSuffix, ".text"));
}
@@ -69,10 +225,9 @@
/*****************************************************************/
/*
- Import US-ASCII (actually Latin-1) data from a plain
- text file. We allow either LF or CR or CRLF line
- termination. Each line terminator is taken to be a
- paragraph break.
+ Import data from a plain text file. We allow either
+ LF or CR or CRLF line termination. Each line
+ terminator is taken to be a paragraph break.
*/
/*****************************************************************/
@@ -82,7 +237,8 @@
UT_Error IE_Imp_Text::importFile(const char * szFilename)
{
- FILE *fp = fopen(szFilename, "r");
+ // We must open in binary mode for UCS-2 compatibility.
+ FILE *fp = fopen(szFilename, "rb");
if (!fp)
{
UT_DEBUGMSG(("Could not open file %s\n",szFilename));
@@ -91,6 +247,9 @@
UT_Error error;
+ // First we need to determine the encoding.
+ // TODO We might want to find a way to combine this with recognizeContents().
+ X_CleanupIfError(error,_recognizeEncoding(fp));
X_CleanupIfError(error,_writeHeader(fp));
X_CleanupIfError(error,_parseFile(fp));
@@ -113,6 +272,7 @@
IE_Imp_Text::IE_Imp_Text(PD_Document * pDocument)
: IE_Imp(pDocument)
{
+ m_szEncoding = 0;
}
/*****************************************************************/
@@ -121,6 +281,37 @@
#define X_ReturnIfFail(exp,error) do { bool b = (exp); if (!b) return (error); } while (0)
#define X_ReturnNoMemIfError(exp) X_ReturnIfFail(exp,UT_IE_NOMEMORY)
+/*
+	Sample the start of the file and record a guessed encoding name in
+	m_szEncoding ("UTF-8", "UCS-2-BE" or "UCS-2-LE").  When none of the
+	checks match, m_szEncoding is left as it was.  The file position is
+	set back to the start afterwards.  Always returns UT_OK.
+*/
+UT_Error IE_Imp_Text::_recognizeEncoding(FILE * fp)
+{
+	char szSample[4096];		// 4096 ought to be enough
+	const UT_sint32 iSampleLen = fread(szSample, 1, sizeof(szSample), fp);
+
+	// Put the stream back at the beginning for the parser that runs next.
+	fseek(fp, 0, SEEK_SET);
+
+	if (_recognizeUTF8(szSample, iSampleLen))
+	{
+		m_szEncoding = "UTF-8";
+		return UT_OK;
+	}
+
+	// Not UTF-8: try UCS-2, including the deeper no-BOM heuristics.
+	switch (_recognizeUCS2(szSample, iSampleLen, true))
+	{
+	case UE_BigEnd:
+		m_szEncoding = "UCS-2-BE";
+		break;
+	case UE_LittleEnd:
+		m_szEncoding = "UCS-2-LE";
+		break;
+	default:
+		break;
+	}
+
+	return UT_OK;
+}
+
UT_Error IE_Imp_Text::_writeHeader(FILE * /* fp */)
{
X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Section, NULL));
@@ -137,6 +328,9 @@
UT_UCSChar c;
wchar_t wc;
+ if (m_szEncoding)
+ m_Mbtowc.setInCharset(m_szEncoding);
+
while (fread(&b, 1, sizeof(b), fp) > 0)
{
if(!m_Mbtowc.mbtowc(wc,b))
@@ -146,6 +340,8 @@
{
case (UT_UCSChar)'\r':
case (UT_UCSChar)'\n':
+ case 0x2028: // Unicode line separator
+ case 0x2029: // Unicode paragraph separator
if ((c == (UT_UCSChar)'\n') && bEatLF)
{
@@ -158,7 +354,9 @@
bEatLF = true;
}
- // we interprete either CRLF, CR, or LF as a paragraph break.
+ // we interpret either CRLF, CR, or LF as a paragraph break.
+ // we also accept U+2028 (line separator) and U+2029 (paragraph separator)
+ // especially since these are recommended by Mac OS X.
// start a paragraph and emit any text that we
// have accumulated.
@@ -224,6 +422,8 @@
{
case (UT_UCSChar)'\r':
case (UT_UCSChar)'\n':
+ case 0x2028: // Unicode line separator
+ case 0x2029: // Unicode paragraph separator
if ((c == (UT_UCSChar)'\n') && bEatLF)
{
bEatLF = false;
@@ -235,7 +435,9 @@
bEatLF = true;
}
- // we interprete either CRLF, CR, or LF as a paragraph break.
+ // we interpret either CRLF, CR, or LF as a paragraph break.
+ // we also accept U+2028 (line separator) and U+2029 (paragraph separator)
+ // especially since these are recommended by Mac OS X.
if (gbBlock.getLength() > 0)
{
Index: src/wp/impexp/xp/ie_imp_Text.h
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.h,v
retrieving revision 1.13
diff -u -r1.13 ie_imp_Text.h
--- src/wp/impexp/xp/ie_imp_Text.h 2001/05/03 00:45:36 1.13
+++ src/wp/impexp/xp/ie_imp_Text.h 2001/05/19 07:28:02
@@ -58,9 +58,11 @@
unsigned char * pData, UT_uint32 lenData);
protected:
+ UT_Error _recognizeEncoding(FILE * fp);
UT_Error _parseFile(FILE * fp);
UT_Error _writeHeader(FILE * fp);
UT_Mbtowc m_Mbtowc;
+ const char * m_szEncoding;
};
#endif /* IE_IMP_TEXT_H */