Here is a much tidied-up text exporter. It removes a lot of special
cases, makes the Unicode clipboard work for non-Unicode locales, and
improves the coding style.
Andrew Dunbar.
--
http://linguaphile.sourceforge.net
Index: src/af/util/win/ut_Win32OS.h
===================================================================
RCS file: /cvsroot/abi/src/af/util/win/ut_Win32OS.h,v
retrieving revision 1.3
diff -u -r1.3 ut_Win32OS.h
--- src/af/util/win/ut_Win32OS.h 2001/02/06 22:54:10 1.3
+++ src/af/util/win/ut_Win32OS.h 2001/06/15 06:50:05
@@ -21,6 +21,7 @@
#ifndef UT_Win32OS_H
#define UT_Win32OS_H
+#include <windows.h>
#include "ut_types.h"
bool UT_IsWinNT(void);
Index: src/wp/ap/win/ap_Win32App.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/ap/win/ap_Win32App.cpp,v
retrieving revision 1.64
diff -u -r1.64 ap_Win32App.cpp
--- src/wp/ap/win/ap_Win32App.cpp 2001/06/07 15:52:36 1.64
+++ src/wp/ap/win/ap_Win32App.cpp 2001/06/15 06:50:44
@@ -58,6 +58,7 @@
#include "ap_EditMethods.h"
#include "fp_Run.h"
+#include "ut_Win32OS.h"
#include "ie_exp.h"
#include "ie_exp_AbiWord_1.h"
@@ -435,7 +436,8 @@
// put raw text on the clipboard
- if (XAP_EncodingManager::get_instance()->isUnicodeLocale())
+ // TODO Should use a finer-grain technique than IsWinNT() since Win98
+supports unicode clipboard.
+ if (UT_IsWinNT())
{
// put raw unicode text on the clipboard
// TODO On NT we should always put unicode text on the
clipboard regardless of locale.
Index: src/wp/impexp/xp/ie_exp_Text.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp_Text.cpp,v
retrieving revision 1.26
diff -u -r1.26 ie_exp_Text.cpp
--- src/wp/impexp/xp/ie_exp_Text.cpp 2001/06/11 12:04:18 1.26
+++ src/wp/impexp/xp/ie_exp_Text.cpp 2001/06/15 06:50:54
@@ -1,19 +1,19 @@
/* AbiWord
* Copyright (C) 1998 AbiSource, Inc.
- *
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
* 02111-1307, USA.
*/
@@ -33,14 +33,15 @@
#include "ut_iconv.h"
#include "ut_wctomb.h"
#include "xap_EncodingManager.h"
-
-#include "ut_string_class.h"
-
#include "ap_Dialog_Id.h"
#include "xap_App.h"
#include "xap_DialogFactory.h"
#include "xap_Dlg_Encoding.h"
+#ifdef WIN32
+#include "ut_Win32OS.h"
+#endif
+
#define MY_MB_LEN_MAX 6
//////////////////////////////////////////////////////////////////
@@ -58,7 +59,7 @@
bool bIs16Bit,
bool bUseBOM,
bool bBigEndian);
- virtual ~s_Text_Listener() {};
+ virtual ~s_Text_Listener() {}
virtual bool populate(PL_StruxFmtHandle sfh,
const PX_ChangeRecord
* pcr);
@@ -81,17 +82,21 @@
virtual bool signal(UT_uint32 iSignal);
protected:
- void _closeBlock(void);
+ void _genBOM(void);
+ void _genLineBreak(void);
void _outputData(const UT_UCSChar * p, UT_uint32
length);
- void _output8BitData(const UT_UCSChar * , UT_uint32
length);
- void _output16BitData(const UT_UCSChar * ,
UT_uint32 length);
-
+ void _closeBlock(void);
+
PD_Document * m_pDocument;
IE_Exp_Text * m_pie;
+ UT_Wctomb m_wctomb;
+ char m_mbBOM[MY_MB_LEN_MAX];
+ int m_iBOMLen;
+ char m_mbLineBreak[MY_MB_LEN_MAX*2];
+ int m_iLineBreakLen;
bool m_bInBlock;
bool m_bToClipboard;
bool m_bFirstWrite;
- UT_Wctomb m_wctomb;
const char * m_szEncoding;
bool m_bIs16Bit;
bool m_bBigEndian;
@@ -102,29 +107,21 @@
/*****************************************************************/
IE_Exp_Text::IE_Exp_Text(PD_Document * pDocument, bool bEncoded)
- : IE_Exp(pDocument)
+ : IE_Exp(pDocument),
+ m_pListener(NULL),
+ m_bIsEncoded(bEncoded)
{
UT_ASSERT(pDocument);
+ m_error = 0;
+
const char *szEncodingName = pDocument->getEncodingName();
if (!szEncodingName || !*szEncodingName)
szEncodingName =
XAP_EncodingManager::get_instance()->getNativeEncodingName();
- m_error = 0;
- m_pListener = NULL;
- m_bIsEncoded = bEncoded;
-
- // TODO Use persistent document encoding when it exists
_setEncoding(szEncodingName);
}
-/*!
- Destruct text exporter
- */
-IE_Exp_Text::~IE_Exp_Text()
-{
-}
-
/*****************************************************************/
/*****************************************************************/
@@ -198,6 +195,17 @@
UT_Error IE_Exp_Text::_writeDocument(void)
{
+ // TODO If we're going to the clipboard and the OS supports unicode, set
+encoding.
+ // TODO Only supports Windows so far.
+ // TODO Should use a finer-grain technique than IsWinNT() since Win98 supports
+unicode clipboard.
+ if (m_pDocRange)
+ {
+#ifdef WIN32
+ if (UT_IsWinNT())
+
+_setEncoding(XAP_EncodingManager::get_instance()->getNativeUnicodeEncodingName());
+#endif
+ }
+
m_pListener = new
s_Text_Listener(m_pDocument,this,(m_pDocRange!=NULL),m_szEncoding,m_bIs16Bit,m_bUseBOM,m_bBigEndian);
if (!m_pListener)
return UT_IE_NOMEMORY;
@@ -226,6 +234,7 @@
/*!
Request file encoding from user
+ \param szEncoding Encoding to export file into
This function should be identical to the one in ie_Imp_Text
*/
@@ -234,10 +243,10 @@
XAP_Dialog_Id id = XAP_DIALOG_ID_ENCODING;
XAP_DialogFactory * pDialogFactory
- = (XAP_DialogFactory *)(m_pDocument->getApp()->getDialogFactory());
+ = reinterpret_cast<XAP_DialogFactory
+*>(m_pDocument->getApp()->getDialogFactory());
XAP_Dialog_Encoding * pDialog
- = (XAP_Dialog_Encoding *)(pDialogFactory->requestDialog(id));
+ = reinterpret_cast<XAP_Dialog_Encoding
+*>(pDialogFactory->requestDialog(id));
UT_ASSERT(pDialog);
pDialog->setEncoding(szEncoding);
@@ -261,7 +270,7 @@
UT_ASSERT (s);
strcpy(szEnc,s);
- _setEncoding((const char *)szEnc);
+ _setEncoding(reinterpret_cast<const char *>(szEnc));
m_pDocument->setEncodingName(szEnc);
}
@@ -270,214 +279,191 @@
return bOK;
}
-/*****************************************************************/
-/*****************************************************************/
+/*!
+ Set exporter's encoding and related members
+ \param szEncoding Encoding to export file into
-void s_Text_Listener::_closeBlock(void)
+ Decides endian and BOM policy based on encoding
+ */
+void IE_Exp_Text::_setEncoding(const char *szEncoding)
{
- if (!m_bInBlock)
- return;
+ UT_ASSERT(szEncoding);
- UT_UCSChar wcLineBreak[3] = {0,0,0};
- char mbLineBreak[MY_MB_LEN_MAX*2];
- UT_UCSChar *pWC = wcLineBreak;
- char *pMB = mbLineBreak;
- int mbLen;
+ m_szEncoding = szEncoding;
- // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029.
+ // TODO some iconvs use a different string!
+ if (!strncmp(m_szEncoding,"UCS-2",5))
+ {
+ m_bIs16Bit = true;
+ if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "BE"))
+ m_bBigEndian = true;
+ else if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "LE"))
+ m_bBigEndian = false;
+ else
+ UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+
+ // TODO Should BOM use be a user pref?
+ // TODO Does Mac OSX prefer BOMs?
#ifdef WIN32
- wcLineBreak[0] = '\r';
- wcLineBreak[1] = '\n';
+ m_bUseBOM = true;
#else
- wcLineBreak[0] = '\n';
+ m_bUseBOM = false;
#endif
-
- while (*pWC)
+ }
+ else
{
- if (!m_wctomb.wctomb(pMB,mbLen,(wchar_t)*pWC))
- UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
- ++pWC;
- pMB += mbLen;
+ m_bIs16Bit = false;
+ // These are currently meaningless when not in a Unicode encoding
+ m_bBigEndian = false;
+ m_bUseBOM = false;
}
-
- m_pie->write(mbLineBreak,pMB-mbLineBreak);
- m_bInBlock = false;
- return;
}
+/*****************************************************************/
+/*****************************************************************/
+
/*!
- Output text buffer to file
- \param data Buffer to output
- \param length Size of buffer
+ Generate correct BOM
- Currently calls different functions for 8 bit and 16 bit encodings due to use
- of dynamically sized string classes which only support one character size.
- If we support UCS-4/UTF-32 in the future we'll need a third function.
+ Makes a Byte Order Mark correct for the encoding.
*/
-void s_Text_Listener::_outputData(const UT_UCSChar * data, UT_uint32 length)
+void s_Text_Listener::_genBOM(void)
{
- if (m_bFirstWrite)
- {
- UT_ASSERT(m_szEncoding);
- m_wctomb.setOutCharset(m_szEncoding);
+ // TODO iconv (at least libiconv) actually converts BOM to nothing at all ):
+#if 0
+ UT_UCSChar wcBOM[2] = {0,0};
+ UT_UCSChar *pWC = wcBOM;
+ char *pMB = reinterpret_cast<char *>(m_mbBOM);
+ int mbLen;
- // TODO BOMs need separate code for UTF-7, UCS-4, etc
- if (m_bUseBOM)
+ wcBOM[0] = UCS_BOM;
+
+ while (*pWC)
+ {
+ if (!m_wctomb.wctomb(pMB,mbLen,static_cast<wchar_t>(*pWC)))
+ UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+ ++pWC;
+ pMB += mbLen;
+ }
+ m_iBOMLen = pMB - m_mbBOM;
+#else
+ // Hard-coded BOM values
+ if (m_bIs16Bit)
+ {
+ // This code should cover UCS-2 and UTF-16, both endians
+ if (m_bBigEndian)
{
- if (m_bIs16Bit)
- {
- if (m_bBigEndian)
- m_pie->write("\xfe\xff",2);
- else
- m_pie->write("\xff\xfe",2);
- }
- else
- {
- // TODO There may be reason for using a BOM in UTF-8
text.
- // TODO I've seen MS software do it.
- m_pie->write("\xef\xbb\xbf",3);
- }
+ strcpy(m_mbBOM,"\xfe\xff");
+ m_iBOMLen = 2;
}
- m_bFirstWrite = false;
+ else
+ {
+ strcpy(m_mbBOM,"\xff\xfe");
+ m_iBOMLen = 2;
+ }
}
-
- // TODO some iconvs use a different string!
- if (m_bIs16Bit)
- _output16BitData(data, length);
else
- _output8BitData(data, length);
+ {
+ // This code covers UTF-8 only
+ strcpy(m_mbBOM,"\xef\xbb\xbf");
+ m_iBOMLen = 3;
+ }
+ // TODO UTF-7, UCS-4, UTF-32
+#endif
}
/*!
- Output 8-bit text buffer to file
- \param data Buffer to output
- \param length Size of buffer
+ Generate correct line break characters
- Single byte and multi byte encodings are supported.
- Wide character encodings are not supported.
- The buffer must not contain NULL bytes.
+ Makes a line break correct for the encoding and platform.
*/
-void s_Text_Listener::_output8BitData(const UT_UCSChar * data, UT_uint32 length)
+void s_Text_Listener::_genLineBreak(void)
{
- UT_String sBuf;
- const UT_UCSChar * pData;
-
+ UT_UCSChar wcLineBreak[3] = {0,0,0};
+ UT_UCSChar *pWC = wcLineBreak;
+ char *pMB = reinterpret_cast<char *>(m_mbLineBreak);
int mbLen;
- char pC[MY_MB_LEN_MAX];
- // TODO Do we still need this?
- UT_ASSERT(sizeof(UT_Byte) == sizeof(char));
-
- for (pData=data; (pData<data+length); /**/)
- {
- if(!m_wctomb.wctomb(pC,mbLen,(wchar_t)*pData))
- {
- mbLen=1;
- pC[0]='?';
- m_wctomb.initialize();
- }
- if (mbLen>1)
- {
- pC[mbLen]='\0';
- sBuf += pC;
- }
- else
- {
- // We let any UCS_LF's (forced line breaks) go out as is.
- // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or
U+2029.
+ // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029.
#ifdef WIN32
- if (pC[0]==UCS_LF)
- sBuf += '\r';
+ wcLineBreak[0] = '\r';
+ wcLineBreak[1] = '\n';
+#else
+ wcLineBreak[0] = '\n';
#endif
- sBuf += (char)pC[0];
- }
- pData++;
- }
- m_pie->write(sBuf.c_str(),sBuf.size());
+ while (*pWC)
+ {
+ if (!m_wctomb.wctomb(pMB,mbLen,static_cast<wchar_t>(*pWC)))
+ UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+ ++pWC;
+ pMB += mbLen;
+ }
+ m_iLineBreakLen = pMB - m_mbLineBreak;
}
/*!
- Output 16-bit text buffer to file
+ Output text buffer to stream
\param data Buffer to output
\param length Size of buffer
-
- Supports the UCS-2 encodings. UCS-2 streams include NULL bytes.
*/
-void s_Text_Listener::_output16BitData(const UT_UCSChar * data, UT_uint32 length)
+void s_Text_Listener::_outputData(const UT_UCSChar * data, UT_uint32 length)
{
- UT_UCS2String sBuf;
+ UT_ByteBuf bBuf;
const UT_UCSChar * pData;
-
+
int mbLen;
- UT_UCSChar c;
+ char pC[MY_MB_LEN_MAX];
+
+ if (m_bFirstWrite)
+ {
+ UT_ASSERT(m_szEncoding);
+ m_wctomb.setOutCharset(m_szEncoding);
- UT_ASSERT(sizeof(UT_Byte) == sizeof(char));
+ if (m_bUseBOM)
+ _genBOM();
+ _genLineBreak();
+
+ // TODO BOMs need separate code for UTF-7, UCS-4, etc
+ if (m_bUseBOM)
+ m_pie->write(reinterpret_cast<const char
+*>(m_mbBOM),m_iBOMLen);
+
+ m_bFirstWrite = false;
+ }
- for (pData=data; (pData<data+length); /**/)
+ for (pData=data; (pData<data+length); ++pData)
{
- if(!m_wctomb.wctomb(reinterpret_cast<char
*>(&c),mbLen,(wchar_t)*pData))
+ // We let any UCS_LF's (forced line breaks) go out as is.
+ if (*pData==UCS_LF)
+ bBuf.append(reinterpret_cast<UT_Byte
+*>(m_mbLineBreak),m_iLineBreakLen);
+ else
{
- mbLen=2;
- // TODO U+FFFD "REPLACEMENT CHARACTER" is the
- // TODO correct unicode equivalent of '?' isn't it?
- // TODO Or is it U+25A0 "BLACK SQUARE"?
- c=0xFFFD;
- m_wctomb.initialize();
+ if (!m_wctomb.wctomb(pC,mbLen,static_cast<wchar_t>(*pData)))
+ {
+ mbLen=1;
+ pC[0]='?';
+ m_wctomb.initialize();
+ }
+ UT_ASSERT(mbLen>=1);
+ bBuf.append(reinterpret_cast<const UT_Byte *>(pC),mbLen);
}
- UT_ASSERT(mbLen==2);
- // We let any UCS_LF's (forced line breaks) go out as is.
- // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029.
-#ifdef WIN32
- if (c==UCS_LF)
- sBuf += '\r';
-#endif
- sBuf += c;
-
- pData++;
}
- m_pie->write(reinterpret_cast<const char
*>(sBuf.ucs_str()),sBuf.size()*sizeof(UT_UCSChar));
+ m_pie->write(reinterpret_cast<const char
+*>(bBuf.getPointer(0)),bBuf.getLength());
}
-
-/*!
- Set exporter's encoding and related members
- \param szEncoding Encoding to export file into
- Decides endian and BOM policy based on encoding
- */
-void IE_Exp_Text::_setEncoding(const char *szEncoding)
+void s_Text_Listener::_closeBlock(void)
{
- UT_ASSERT(szEncoding);
+ if (!m_bInBlock)
+ return;
- m_szEncoding = szEncoding;
+ UT_ASSERT(!m_bFirstWrite);
+ UT_ASSERT(m_iLineBreakLen);
- // TODO some iconvs use a different string!
- if (!strncmp(m_szEncoding,"UCS-2",5))
- {
- m_bIs16Bit = true;
- if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "BE"))
- m_bBigEndian = true;
- else if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "LE"))
- m_bBigEndian = false;
- else
- UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+ m_pie->write(reinterpret_cast<const char *>(m_mbLineBreak),m_iLineBreakLen);
- // TODO Should BOM use be a user pref?
- // TODO Does Mac OSX prefer BOMs?
-#ifdef WIN32
- m_bUseBOM = true;
-#else
- m_bUseBOM = false;
-#endif
- }
- else
- {
- m_bIs16Bit = false;
- // These are currently meaningless when not in a Unicode encoding
- m_bBigEndian = false;
- m_bUseBOM = false;
- }
+ m_bInBlock = false;
+ return;
}
s_Text_Listener::s_Text_Listener(PD_Document * pDocument,
@@ -487,19 +473,19 @@
bool bIs16Bit,
bool bUseBOM,
bool bBigEndian)
+ : m_pDocument(pDocument),
+ m_pie(pie),
+ m_bToClipboard(bToClipboard),
+ // when we are going to the clipboard, we should implicitly
+ // assume that we are starting in the middle of a block.
+ // when going to a file we should not.
+ m_bInBlock(bToClipboard),
+ m_bFirstWrite(true),
+ m_szEncoding(szEncoding),
+ m_bIs16Bit(bIs16Bit),
+ m_bUseBOM(bToClipboard ? false : bUseBOM),
+ m_bBigEndian(bBigEndian)
{
- m_pDocument = pDocument;
- m_pie = pie;
- m_bToClipboard = bToClipboard;
- // when we are going to the clipboard, we should implicitly
- // assume that we are starting in the middle of a block.
- // when going to a file we should not.
- m_bInBlock = m_bToClipboard;
- m_bFirstWrite = true;
- m_szEncoding = szEncoding;
- m_bIs16Bit = bIs16Bit;
- m_bUseBOM = bToClipboard ? false : bUseBOM;
- m_bBigEndian = bBigEndian;
}
bool s_Text_Listener::populate(PL_StruxFmtHandle /*sfh*/,
@@ -509,7 +495,7 @@
{
case PX_ChangeRecord::PXT_InsertSpan:
{
- const PX_ChangeRecord_Span * pcrs = static_cast<const
PX_ChangeRecord_Span *> (pcr);
+ const PX_ChangeRecord_Span * pcrs = static_cast<const
+PX_ChangeRecord_Span *>(pcr);
PT_BufIndex bi = pcrs->getBufIndex();
_outputData(m_pDocument->getPointer(bi),pcrs->getLength());
@@ -521,8 +507,8 @@
{
#if 1
// TODO decide how to indicate objects in text output.
-
- const PX_ChangeRecord_Object * pcro = static_cast<const
PX_ChangeRecord_Object *> (pcr);
+
+ const PX_ChangeRecord_Object * pcro = static_cast<const
+PX_ChangeRecord_Object *>(pcr);
//PT_AttrPropIndex api = pcr->getIndexAP();
fd_Field* field;
switch (pcro->getObjectType())
@@ -540,7 +526,7 @@
//
if(field->getValue() != NULL)
m_pie->write(field->getValue());
-
+
return true;
default:
@@ -566,7 +552,7 @@
PL_StruxFmtHandle * psfh)
{
UT_ASSERT(pcr->getType() == PX_ChangeRecord::PXT_InsertStrux);
- const PX_ChangeRecord_Strux * pcrx = static_cast<const PX_ChangeRecord_Strux
*> (pcr);
+ const PX_ChangeRecord_Strux * pcrx = static_cast<const PX_ChangeRecord_Strux
+*>(pcr);
*psfh = 0; // we don't
need it.
switch (pcrx->getStruxType())
Index: src/wp/impexp/xp/ie_exp_Text.h
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp_Text.h,v
retrieving revision 1.9
diff -u -r1.9 ie_exp_Text.h
--- src/wp/impexp/xp/ie_exp_Text.h 2001/06/07 15:52:42 1.9
+++ src/wp/impexp/xp/ie_exp_Text.h 2001/06/15 06:50:54
@@ -1,19 +1,19 @@
/* AbiWord
* Copyright (C) 1998 AbiSource, Inc.
- *
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
* 02111-1307, USA.
*/
@@ -66,20 +66,20 @@
{
public:
IE_Exp_Text(PD_Document * pDocument, bool bEncoded=false);
- virtual ~IE_Exp_Text();
+ virtual ~IE_Exp_Text() {}
protected:
virtual UT_Error _writeDocument(void);
virtual bool _openFile(const char * szFilename);
bool _doEncodingDialog(const char *szEncoding);
void _setEncoding(const char *szEncoding);
-
+
s_Text_Listener * m_pListener;
bool m_bIsEncoded;
const char * m_szEncoding;
bool m_bIs16Bit;
- bool m_bUseBOM;
bool m_bBigEndian;
+ bool m_bUseBOM;
};
#endif /* IE_EXP_TEXT_H */