Text importer reworked

Andrew Dunbar Wed, 13 Jun 2001 11:39:45 -0700

Okay, here's my revamped version of the text importer.  I've made a
couple of helper classes so we can import from a file or the
clipboard using the same code.  They also handle the CR+LF
trickery and provide one character of lookahead, which makes parsing
easier - somebody may like to use them for other text-based importers.
I feel the code is cleaner and more manageable now.

This also fixes Bug 894.  Any comments / suggestions?

Andrew Dunbar.

-- 
http://linguaphile.sourceforge.net

Index: src/af/util/xp/ut_types.h
===================================================================
RCS file: /cvsroot/abi/src/af/util/xp/ut_types.h,v
retrieving revision 1.52
diff -u -r1.52 ut_types.h
--- src/af/util/xp/ut_types.h   2001/06/12 20:09:30     1.52
+++ src/af/util/xp/ut_types.h   2001/06/13 18:34:41
@@ -127,6 +127,11 @@
 #define UCS_SPACE              ((UT_UCSChar)0x0020)
 #define UCS_NBSP               ((UT_UCSChar)0x00a0)
 #define UCS_PILCROW            ((UT_UCSChar)0x00b6)
+#define UCS_LINESEP            ((UT_UCSChar)0x2028)                    /* Unicode 
+line separator */
+#define UCS_PARASEP            ((UT_UCSChar)0x2029)                    /* Unicode 
+paragraph separator */
+#define UCS_BOM                        ((UT_UCSChar)0xFEFF)                    /* 
+Byte order mark */
+
+/* Note: the following are our interpretations, not Unicode's */
 #define UCS_FIELDSTART         ((UT_UCSChar)0xFFFE)
 #define UCS_FIELDEND           ((UT_UCSChar)0xFFFD)
 
@@ -137,10 +142,13 @@
 #define UCS_EN_DASH            ((UT_UCSChar)0x2013)
 #define UCS_EM_DASH            ((UT_UCSChar)0x2014)
 #define UCS_BULLET             ((UT_UCSChar)0x2022)
+/* TODO Quote marks need to be localized - not hard-coded */
 #define UCS_LQUOTE             ((UT_UCSChar)0x2018)
 #define UCS_RQUOTE             ((UT_UCSChar)0x2019)
 #define UCS_LDBLQUOTE          ((UT_UCSChar)0x201c)
 #define UCS_RDBLQUOTE          ((UT_UCSChar)0x201d)
+
+/* Note: the following is our interpretation, not Unicode's */
 #define UCS_UNKPUNK            ((UT_UCSChar)0xFFFF)  /* "unknown punctuation" used 
with UT_isWordDelimiter() */
 
 #else /* see bug 512 */
Index: src/wp/impexp/xp/ie_imp_Text.cpp
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.cpp,v
retrieving revision 1.28
diff -u -r1.28 ie_imp_Text.cpp
--- src/wp/impexp/xp/ie_imp_Text.cpp    2001/06/11 12:23:50     1.28
+++ src/wp/impexp/xp/ie_imp_Text.cpp    2001/06/13 18:35:59
@@ -36,6 +36,197 @@
 #include "xap_DialogFactory.h"
 #include "xap_Dlg_Encoding.h"
 
+/*!
+  Construct ImportStream
+ */
+ImportStream::ImportStream()
+{
+       m_ucsLookAhead = 0;
+       m_bEOF = false;
+}
+
+/*!
+  Initialize ImportStream
+ \param szEncoding Text encoding to convert from
+
+ Sets encoding and prefetches lookahead character
+ */
+bool ImportStream::init(const char *szEncoding)
+{
+       UT_ASSERT(szEncoding);
+       m_Mbtowc.setInCharset(szEncoding);
+       UT_UCSChar dummy;
+       return getChar(dummy);
+}
+
+/*!
+  Get UCS-2 character from stream
+ \param ucs Reference to the character
+
+ Returns single character for CRLF combination
+ */
+bool ImportStream::getChar(UT_UCSChar &ucs)
+{
+       if (!getRawChar(ucs))
+               return false;
+       if (ucs == UCS_CR && peekChar() == UCS_LF)
+               getRawChar(ucs);
+       return true;
+}
+
+/*!
+  Get UCS-2 character from stream
+ \param ucs Reference to the character
+
+ Get the next UCS character, converting from file's encoding
+ */
+bool ImportStream::getRawChar(UT_UCSChar &ucs)
+{
+       wchar_t wc = 0;
+       unsigned char b;
+
+       if (m_bEOF)
+               return false;
+
+       do
+       {
+               if (!_getByte(b))
+               {
+                       m_bEOF = true;
+                       break;
+               }
+
+       } while (!m_Mbtowc.mbtowc(wc,b));
+
+       ucs = m_ucsLookAhead;
+       m_ucsLookAhead = wc;
+
+       return true;
+}
+
+/*!
+  Construct ImportStreamFile from FILE pointer
+ \param pFile File to read from
+ */
+ImportStreamFile::ImportStreamFile(FILE *pFile)
+{
+       m_pFile = pFile;
+}
+
+/*!
+  Get next byte from file
+ \param b Reference to the byte
+ */
+bool ImportStreamFile::_getByte(unsigned char &b)
+{
+       UT_ASSERT(m_pFile);
+
+       return fread(&b, 1, sizeof(b), m_pFile) > 0;
+}
+
+/*!
+  Construct ImportStreamClipboard from memory buffer
+ \param pClipboard Buffer to read from
+ \param iLength Length of buffer
+ */
+ImportStreamClipboard::ImportStreamClipboard(unsigned char *pClipboard, UT_uint32 
+iLength)
+{
+       m_p = pClipboard;
+       m_pEnd = pClipboard + iLength;
+}
+
+/*!
+  Get next byte from clipboard
+ \param b Reference to the byte
+ */
+bool ImportStreamClipboard::_getByte(unsigned char &b)
+{
+       if (m_p >= m_pEnd)
+               return false;
+       b = *m_p++;
+       return true;
+}
+
+// Helper class so we can parse files and clipboard with same code
+
+class Inserter
+{
+public:
+       Inserter(PD_Document * pDocument);
+       Inserter(PD_Document * pDocument, PT_DocPosition dPos);
+       bool insertBlock();
+       bool insertSpan(UT_GrowBuf &b);
+private:
+       PD_Document * m_pDocument;
+       bool m_bClipboard;
+       PT_DocPosition m_dPos;
+};
+
+/*!
+  Construct Inserter helper class
+ \param pDocument Document to insert data into
+ */
+Inserter::Inserter(PD_Document * pDocument)
+{
+       m_pDocument = pDocument;
+       m_bClipboard = false;
+}
+
+/*!
+  Construct Inserter helper class
+ \param pDocument Document to insert data into
+ \param dPos Position in document to begin inserting at
+ */
+Inserter::Inserter(PD_Document * pDocument, PT_DocPosition dPos)
+{
+       m_pDocument = pDocument;
+       m_bClipboard = true;
+       m_dPos = dPos;
+}
+
+/*!
+  Insert a Block into the document
+
+ Uses appropriate function for clipboard or file
+ */
+bool Inserter::insertBlock()
+{
+       bool bRes;
+       
+       if (m_bClipboard)
+       {
+               bRes = m_pDocument->insertStrux(m_dPos, PTX_Block);
+               m_dPos++;
+       }
+       else
+               bRes = m_pDocument->appendStrux(PTX_Block, NULL);
+
+       return bRes;
+}
+
+/*!
+  Insert a span of text into the document
+ \param b Buffer containing UCS text to insert
+
+ Uses appropriate function for clipboard or file
+ */
+bool Inserter::insertSpan(UT_GrowBuf &b)
+{
+       bool bRes;
+       
+       if (m_bClipboard)
+       {
+               bRes = m_pDocument->insertSpan(m_dPos, b.getPointer(0), b.getLength());
+               m_dPos += b.getLength();
+       }
+       else
+               bRes = m_pDocument->appendSpan(b.getPointer(0), b.getLength());
+
+       b.truncate(0);
+
+       return bRes;
+}
+
 /*****************************************************************/
 /*****************************************************************/
 
@@ -269,17 +460,14 @@
 /*****************************************************************/
 /*****************************************************************/
 
-/*
-  Import data from a plain text file.  We allow either
-  LF or CR or CRLF line termination.  Each line
-  terminator is taken to be a paragraph break.
-*/
-
-/*****************************************************************/
-/*****************************************************************/
-
 #define X_CleanupIfError(error,exp)    do { if (((error)=(exp)) != UT_OK) goto 
Cleanup; } while (0)
 
+/*!
+  Import data from a plain text file
+ \param szFilename Name of file to import
+  
+ Each line terminator is taken to be a paragraph break
+*/
 UT_Error IE_Imp_Text::importFile(const char * szFilename)
 {
        // We must open in binary mode for UCS-2 compatibility.
@@ -295,10 +483,18 @@
        // First we try to determine the encoding.
        if (_recognizeEncoding(fp) == UT_OK)
                m_pDocument->setEncodingName(m_szEncoding);
-       X_CleanupIfError(error,_writeHeader(fp));
-       X_CleanupIfError(error,_parseFile(fp));
 
-       error = UT_OK;
+       // Call encoding dialog
+       if (!m_bIsEncoded || _doEncodingDialog(m_szEncoding))
+       {
+               ImportStreamFile stream(fp);
+               Inserter ins(m_pDocument);
+               X_CleanupIfError(error,_writeHeader(fp));
+               X_CleanupIfError(error,_parseStream(stream,ins));
+               error = UT_OK;
+       }
+       else
+               error = UT_ERROR;
 
 Cleanup:
        fclose(fp);
@@ -310,13 +506,13 @@
 /*****************************************************************/
 /*****************************************************************/
 
-/*!
-  Destruct text importer
- */
-IE_Imp_Text::~IE_Imp_Text()
-{
-}
-
+/*!
+  Construct text importer
+ \param pDocument Document to import text into
+ \param bEncoded True if we should show encoding dialog
+  
+ Uses current document's encoding if it is set
+*/
 IE_Imp_Text::IE_Imp_Text(PD_Document * pDocument, bool bEncoded)
        : IE_Imp(pDocument)
 {
@@ -328,7 +524,6 @@
 
        m_bIsEncoded = bEncoded;
 
-       // TODO Use persistent document encoding when it exists
        _setEncoding(szEncodingName);
 }
 
@@ -353,6 +548,19 @@
        iNumbytes = fread(szBuf, 1, sizeof(szBuf), fp);
        fseek(fp, 0, SEEK_SET);
 
+       return _recognizeEncoding(szBuf, iNumbytes);
+}
+
+/*!
+  Detect encoding of text buffer
+ \param szBuf Buffer
+ \param iNumbytes Length of buffer
+
+ Supports UTF-8 and UCS-2 big and little endian
+ CJK encodings could be added
+ */
+UT_Error IE_Imp_Text::_recognizeEncoding(const char *szBuf, UT_uint32 iNumbytes)
+{
        if (IE_Imp_Text_Sniffer::_recognizeUTF8(szBuf, iNumbytes))
        {
                _setEncoding("UTF-8");
@@ -372,92 +580,63 @@
        return UT_OK;
 }
 
+/*!
+  Write header to document
+
+ Writes the minimum needed Section and Block before we begin import
+ */
 UT_Error IE_Imp_Text::_writeHeader(FILE * /* fp */)
 {
        X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Section, NULL));
+       X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Block, NULL));
 
        return UT_OK;
 }
 
-UT_Error IE_Imp_Text::_parseFile(FILE * fp)
+/*!
+  Parse stream contents into the document
+ \param stream Stream to import from
+ \param ins Inserter helper class
+
+ This code is used for both files and the clipboard
+ */
+UT_Error IE_Imp_Text::_parseStream(ImportStream & stream, Inserter & ins)
 {
        UT_GrowBuf gbBlock(1024);
-       bool bEatLF = false;
-       bool bEmptyFile = true;
-       unsigned char b;
        UT_UCSChar c;
-       wchar_t wc;
-
-       // Call encoding dialog
-       if (!m_bIsEncoded || _doEncodingDialog(m_szEncoding))
-       {
-               UT_ASSERT(m_szEncoding);
-               m_Mbtowc.setInCharset(m_szEncoding);
-
-               while (fread(&b, 1, sizeof(b), fp) > 0)
-               {
-                       if(!m_Mbtowc.mbtowc(wc,b))
-                               continue;
-                       c = (UT_UCSChar)wc;
-
-                       // TODO We should switch fonts when we encounter
-                       // TODO characters from different scripts
-                       switch (c)
-                       {
-                       case (UT_UCSChar)'\r':
-                       case (UT_UCSChar)'\n':
-                       case 0x2028:                    // Unicode line separator
-                       case 0x2029:                    // Unicode paragraph separator
-                               
-                               if ((c == (UT_UCSChar)'\n') && bEatLF)
-                               {
-                                       bEatLF = false;
-                                       break;
-                               }
-
-                               if (c == (UT_UCSChar)'\r')
-                               {
-                                       bEatLF = true;
-                               }
-                               
-                               // we interpret either CRLF, CR, or LF as a paragraph 
break.
-                               // we also accept U+2028 (line separator) and U+2029 
(para separator)
-                               // especially since these are recommended by Mac OS X.
-                               
-                               // start a paragraph and emit any text that we
-                               // have accumulated.
-                               
X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Block, NULL));
-                               bEmptyFile = false;
-                               if (gbBlock.getLength() > 0)
-                               {
-                                       
X_ReturnNoMemIfError(m_pDocument->appendSpan(gbBlock.getPointer(0), 
gbBlock.getLength()));
-                                       gbBlock.truncate(0);
-                               }
-                               break;
 
-                       default:
-                               bEatLF = false;
-                               
X_ReturnNoMemIfError(gbBlock.ins(gbBlock.getLength(),&c,1));
-                               break;
-                       }
-               } 
+       stream.init(m_szEncoding);
 
-               if (gbBlock.getLength() > 0 || bEmptyFile)
+       while (stream.getChar(c))
+       {
+               // TODO We should switch fonts when we encounter
+               // TODO characters from different scripts
+               switch (c)
                {
-                       // if we have text left over (without final CR/LF),
-                       // or if we read an empty file,
-                       // create a paragraph and emit the text now.
-                       X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Block, 
NULL));
+               case UCS_CR:
+               case UCS_LF:
+               case UCS_LINESEP:
+               case UCS_PARASEP:
+                       // we interpret either CRLF, CR, or LF as a paragraph break.
+                       // we also accept U+2028 (line separator) and U+2029 (para 
+separator)
+                       // especially since these are recommended by Mac OS X.
+                       
+                       // flush out what we have
                        if (gbBlock.getLength() > 0)
-                               
X_ReturnNoMemIfError(m_pDocument->appendSpan(gbBlock.getPointer(0), 
gbBlock.getLength()));
+                               X_ReturnNoMemIfError(ins.insertSpan(gbBlock));
+                       X_ReturnNoMemIfError(ins.insertBlock());
+                       break;
+
+               default:
+                       X_ReturnNoMemIfError(gbBlock.append(&c,1));
+                       break;
                }
-               return UT_OK;
-       }
+       } 
 
-       // TODO If the encoding dialog was cancelled we still get an empty new document
-       // TODO with an error dialog ):
+       if (gbBlock.getLength() > 0)
+               X_ReturnNoMemIfError(ins.insertSpan(gbBlock));
 
-       return UT_ERROR;
+       return UT_OK;
 }
 
 /*!
@@ -518,19 +697,22 @@
 
        m_szEncoding = szEncoding;
 
-       // TODO some iconvs use a different string!
-       if (!strncmp(m_szEncoding,"UCS-2",5))
+       // TODO Should BOM use be a user pref?
+       // TODO Does Mac OSX prefer BOMs?
+       if (!strcmp(m_szEncoding,XAP_EncodingManager::get_instance()->getUCS2LEName()))
        {
                m_bIs16Bit = true;
-               if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "BE"))
-                       m_bBigEndian = true;
-               else if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "LE"))
-                       m_bBigEndian = false;
-               else
-                       UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
-
-               // TODO Should BOM use be a user pref?
-               // TODO Does Mac OSX prefer BOMs?
+               m_bBigEndian = false;
+#ifdef WIN32
+               m_bUseBOM = true;
+#else
+               m_bUseBOM = false;
+#endif
+       }
+       else if 
+(!strcmp(m_szEncoding,XAP_EncodingManager::get_instance()->getUCS2BEName()))
+       {
+               m_bIs16Bit = true;
+               m_bBigEndian = true;
 #ifdef WIN32
                m_bUseBOM = true;
 #else
@@ -564,91 +746,11 @@
        UT_ASSERT(pDocRange->m_pos1 == pDocRange->m_pos2);
 
        // Attempt to guess whether we're pasting 8 bit or unicode text
-       IE_Imp_Text_Sniffer::UCS2_Endian eUcs2 = 
IE_Imp_Text_Sniffer::_recognizeUCS2((const char *)pData, lenData, true);
-       
-       if (eUcs2 == IE_Imp_Text_Sniffer::UE_BigEnd)
-               _setEncoding(XAP_EncodingManager::get_instance()->getUCS2BEName());
-       else if (eUcs2 == IE_Imp_Text_Sniffer::UE_LittleEnd)
-               _setEncoding(XAP_EncodingManager::get_instance()->getUCS2LEName());
-       else
-               
_setEncoding(XAP_EncodingManager::get_instance()->getNativeEncodingName());
+       _recognizeEncoding((const char *)pData, lenData);
 
-       m_Mbtowc.setInCharset(m_szEncoding);
-
-       UT_GrowBuf gbBlock(1024);
-       bool bEatLF = false;
-       bool bSuppressLeadingParagraph = true;
-       bool bInColumn1 = true;
-       unsigned char * pc;
-
-       PT_DocPosition dpos = pDocRange->m_pos1;
-       
-       for (pc=pData; (pc<pData+lenData); pc++)
-       {
-               unsigned char b = *pc;
-               UT_UCSChar c;
-               wchar_t wc;
-               if(!m_Mbtowc.mbtowc(wc,b))
-                   continue;
-               c = (UT_UCSChar)wc;
-               
-               // TODO We should switch fonts when we encounter
-               // TODO characters from different scripts
-               switch (c)
-               {
-               case (UT_UCSChar)'\r':
-               case (UT_UCSChar)'\n':
-               case 0x2028:                    // Unicode line separator
-               case 0x2029:                    // Unicode paragraph separator
-                       if ((c == (UT_UCSChar)'\n') && bEatLF)
-                       {
-                               bEatLF = false;
-                               break;
-                       }
+       ImportStreamClipboard stream(pData, lenData);
+       Inserter ins(m_pDocument, pDocRange->m_pos1);
 
-                       if (c == (UT_UCSChar)'\r')
-                       {
-                               bEatLF = true;
-                       }
-                       
-                       // we interpret either CRLF, CR, or LF as a paragraph break.
-                       // we also accept U+2028 (line separator) and U+2029 (para 
separator)
-                       // especially since these are recommended by Mac OS X.
-                       
-                       if (gbBlock.getLength() > 0)
-                       {
-                               // flush out what we have
-                               m_pDocument->insertSpan(dpos, gbBlock.getPointer(0), 
gbBlock.getLength());
-                               dpos += gbBlock.getLength();
-                               gbBlock.truncate(0);
-                       }
-                       bInColumn1 = true;
-                       break;
-
-               default:
-                       bEatLF = false;
-                       if (bInColumn1 && !bSuppressLeadingParagraph)
-                       {
-                               m_pDocument->insertStrux(dpos,PTX_Block);
-                               dpos++;
-                       }
-                       
-                       gbBlock.ins(gbBlock.getLength(),&c,1);
-
-                       bInColumn1 = false;
-                       bSuppressLeadingParagraph = false;
-                       break;
-               }
-       } 
-
-       if (gbBlock.getLength() > 0)
-       {
-               // if we have text left over (without final CR/LF),
-               m_pDocument->insertSpan(dpos, gbBlock.getPointer(0), 
gbBlock.getLength());
-               dpos += gbBlock.getLength();
-       }
-
-       return;
+       _parseStream(stream,ins);
 }
-
 
Index: src/wp/impexp/xp/ie_imp_Text.h
===================================================================
RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.h,v
retrieving revision 1.15
diff -u -r1.15 ie_imp_Text.h
--- src/wp/impexp/xp/ie_imp_Text.h      2001/06/07 15:52:42     1.15
+++ src/wp/impexp/xp/ie_imp_Text.h      2001/06/13 18:36:00
@@ -56,6 +56,51 @@
                                                                           bool bDeep);
 };
 
+// Stream class can be File or Clipboard
+
+class ImportStream
+{
+public:
+       ImportStream();
+       virtual ~ImportStream() {};
+       bool init(const char *szEncoding);
+       bool getChar(UT_UCSChar &b);
+       UT_UCSChar peekChar() { return m_ucsLookAhead; }
+private:
+       virtual bool _getByte(unsigned char &b) = 0;
+       bool getRawChar(UT_UCSChar &b);
+       UT_Mbtowc m_Mbtowc;
+       UT_UCSChar m_ucsLookAhead;
+       bool m_bEOF;
+};
+
+// File stream class
+
+class ImportStreamFile : public ImportStream
+{
+public:
+       ImportStreamFile(FILE *pFile);
+       ~ImportStreamFile() {};
+       bool getChar();
+private:
+       bool _getByte(unsigned char &b);
+       FILE *m_pFile;
+};
+
+// Clipboard stream class
+
+class ImportStreamClipboard : public ImportStream
+{
+public:
+       ImportStreamClipboard(unsigned char *pClipboard, UT_uint32 iLength);
+       ~ImportStreamClipboard() {};
+       bool getChar();
+private:
+       bool _getByte(unsigned char &b);
+       unsigned char *m_p;
+       unsigned char *m_pEnd;
+};
+
 // The importer/reader for Plain Text Files with selectable encoding.
 
 class IE_Imp_EncodedText_Sniffer : public IE_Imp_Text_Sniffer
@@ -81,7 +126,7 @@
 {
 public:
        IE_Imp_Text(PD_Document * pDocument, bool bEncoded=false);
-       ~IE_Imp_Text();
+       ~IE_Imp_Text() {}
 
        virtual UT_Error        importFile(const char * szFilename);
        virtual void            pasteFromBuffer(PD_DocumentRange * pDocRange,
@@ -89,12 +134,12 @@
        
 protected:
        UT_Error                        _recognizeEncoding(FILE * fp);
-       UT_Error                        _parseFile(FILE * fp);
+       UT_Error                        _recognizeEncoding(const char *szBuf, 
+UT_uint32 iNumbytes);
+       UT_Error                        _parseStream(ImportStream & stream, class 
+Inserter & ins);
        UT_Error                        _writeHeader(FILE * fp);
        bool                            _doEncodingDialog(const char *szEncoding);
        void                            _setEncoding(const char *szEncoding);
 
-       UT_Mbtowc               m_Mbtowc;
        const char *    m_szEncoding;
        bool                    m_bIsEncoded;
        bool                    m_bIs16Bit;

Text importer reworked

Reply via email to