sc/source/ui/dbgui/scuiasciiopt.cxx |   40 ++++++++++++++++++++--------
 sc/source/ui/docshell/impex.cxx     |   50 ++++++++++++++++++++++++++++--------
 sc/source/ui/inc/impex.hxx          |   18 ++++++++++--
 sc/source/ui/inc/scuiasciiopt.hxx   |    2 -
 4 files changed, 85 insertions(+), 25 deletions(-)

New commits:
commit ff62e0165a0add7c7e3cb606df5b24b20c822d8a
Author:     Eike Rathke <er...@redhat.com>
AuthorDate: Wed Aug 18 19:05:08 2021 +0200
Commit:     Eike Rathke <er...@redhat.com>
CommitDate: Thu Aug 19 17:27:51 2021 +0200

    Resolves: tdf#102846 CSV: Detect separator, limit preview line 
concatenations
    
    In CSV import preview, if a line starts with a quote character and
    none of the remembered last field separators used occur in data in
    conjunction with a closing quote, then reading data tried to
    concatenate line by line to form a data field to be presented in
    the preview, worst case the entire file..
    
    For the preview, detect one possible not yet selected separator if
    used with a quoted field (similar to commit
    c807e7ea7a0725a4d8375eda07d6f70870e0d50a for tdf#56910 space
    separator) and limit the number of source lines that are tried to
    be concatenated if no separator was encountered after a possibly
    closing field quote.
    
    Change-Id: Iefd37a8301161e72cb607cea88d4faadad47b4ae
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120690
    Reviewed-by: Eike Rathke <er...@redhat.com>
    Tested-by: Jenkins

diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx 
b/sc/source/ui/dbgui/scuiasciiopt.cxx
index b66403492961..a95d8bdfff7a 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -44,6 +44,14 @@
 //! TODO make dynamic
 const SCSIZE ASCIIDLG_MAXROWS                = MAXROWCOUNT;
 
+// Maximum number of source lines to concatenate while generating the preview
+// for one logical line. This may result in a wrong preview if the actual
+// number of embedded line feeds is greater, but a number too high would take
+// too much time (loop excessively if unlimited and large data) if none of the
+// selected separators are actually used in data but a field at start of line
+// is quoted.
+constexpr sal_uInt32 kMaxEmbeddedLinefeeds = 500;
+
 using namespace com::sun::star::uno;
 
 namespace {
@@ -293,7 +301,7 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, 
const OUString& aDatNa
     , mnRowPosCount(0)
     , mcTextSep(ScAsciiOptions::cDefaultTextSep)
     , meCall(eCall)
-    , mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS)
+    , mbDetectSep(eCall != SC_TEXTTOCOLUMNS)
     , mxFtCharSet(m_xBuilder->weld_label("textcharset"))
     , mxLbCharSet(new 
SvxTextEncodingBox(m_xBuilder->weld_combo_box("charset")))
     , mxFtCustomLang(m_xBuilder->weld_label("textlanguage"))
@@ -579,7 +587,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString 
&rText, sal_Unicode& r
                 break;
             }
             rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators,
-                    mcTextSep, rcDetectSep);
+                    mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
             mnStreamPos = mpDatStream->Tell();
             mpRowPosArray[++mnRowPosCount] = mnStreamPos;
         } while (nLine >= mnRowPosCount && mpDatStream->good());
@@ -594,7 +602,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString 
&rText, sal_Unicode& r
     else
     {
         Seek( mpRowPosArray[nLine]);
-        rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, 
mcTextSep, rcDetectSep);
+        rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, 
mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
         mnStreamPos = mpDatStream->Tell();
     }
 
@@ -793,7 +801,11 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, 
ScCsvTableBox&, void)
     // when the dialog wasn't already presented to the user.
     // As a side effect this has the benefit that the check is only done on the
     // first set of visible lines.
-    sal_Unicode cDetectSep = (mbDetectSpaceSep && !mxRbFixed->get_active() && 
!mxCkbSpace->get_active() ? 0 : 0xffff);
+    sal_Unicode cDetectSep = ((mbDetectSep && !mxRbFixed->get_active()
+                && (!mxCkbTab->get_active() || !mxCkbSemicolon->get_active()
+                    || !mxCkbComma->get_active() || 
!mxCkbSpace->get_active())) ? 0 : 0xffff);
+    if (cDetectSep == 0xffff)
+        mbDetectSep = false;
 
     sal_Int32 nBaseLine = mxTableBox->GetGrid().GetFirstVisLine();
     sal_Int32 nRead = mxTableBox->GetGrid().GetVisLineCount();
@@ -813,16 +825,22 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, 
ScCsvTableBox&, void)
     for (; i < CSV_PREVIEW_LINES; i++)
         maPreviewLine[i].clear();
 
-    if (mbDetectSpaceSep)
+    if (mbDetectSep)
     {
-        mbDetectSpaceSep = false;
-        if (cDetectSep == ' ')
+        mbDetectSep = false;
+        if (cDetectSep)
         {
-            // Expect space to be appended by now so all subsequent
+            // Expect separator to be appended by now so all subsequent
             // GetLine()/ReadCsvLine() actually used it.
-            assert(maFieldSeparators.endsWith(" "));
-            // Preselect Space in UI.
-            mxCkbSpace->set_active(true);
+            assert(maFieldSeparators.endsWith(OUStringChar(cDetectSep)));
+            // Preselect separator in UI.
+            switch (cDetectSep)
+            {
+                case '\t':  mxCkbTab->set_active(true);         break;
+                case ';':   mxCkbSemicolon->set_active(true);   break;
+                case ',':   mxCkbComma->set_active(true);       break;
+                case ' ':   mxCkbSpace->set_active(true);       break;
+            }
         }
     }
 
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 3a798b37e50e..3c2772619dac 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -607,17 +607,40 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* 
p, const sal_Unicode* p
     // Due to broken CSV generators that don't double embedded quotes check if
     // a field separator immediately or with trailing spaces follows the quote,
     // only then end the field, or at end of string.
-    const sal_Unicode cBlank = ' ';
+    constexpr sal_Unicode cBlank = ' ';
     if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
         return FIELDEND_QUOTE;
     // Detect a possible blank separator if it's not already in the list (which
     // was checked right above for p[1]==cBlank).
-    if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank)
-        rcDetectSep = cBlank;
+    const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != 
cBlank);
     while (p[1] == cBlank)
         ++p;
     if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
         return FIELDEND_QUOTE;
+    // Extended separator detection after a closing quote (with or without
+    // blanks). Note that nQuotes is incremented *after* the call so is not yet
+    // even here, and that with separator detection we reach here only if
+    // lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or
+    // SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have
+    // to be checked.
+    if (!rcDetectSep)
+    {
+        constexpr sal_Unicode vSep[] = { ',', '\t', ';' };
+        for (const sal_Unicode c : vSep)
+        {
+            if (p[1] == c)
+            {
+                rcDetectSep = c;
+                return FIELDEND_QUOTE;
+            }
+        }
+    }
+    // Blank separator is least significant, after others.
+    if (bBlankSep)
+    {
+        rcDetectSep = cBlank;
+        return FIELDEND_QUOTE;
+    }
     return DONTKNOW_QUOTE;
 }
 
@@ -645,7 +668,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, 
const sal_Unicode* p
 static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const 
sal_Unicode* p,
         const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
 {
-    if ((nQuotes % 2) == 0)
+    if ((nQuotes & 1) == 0)
     {
         if (p[-1] == cStr)
             return SECOND_QUOTE;
@@ -2481,7 +2504,7 @@ ScImportStringStream::ScImportStringStream( const 
OUString& rStr )
 }
 
 OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
-        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& 
rcDetectSep )
+        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& 
rcDetectSep, sal_uInt32 nMaxSourceLines )
 {
     enum RetryState
     {
@@ -2506,6 +2529,8 @@ Label_RetryWithNewSep:
 
     if (bEmbeddedLineBreak)
     {
+        sal_uInt32 nLine = 0;
+
         const sal_Unicode* pSeps = rFieldSeparators.getStr();
 
         QuoteType eQuoteState = FIELDEND_QUOTE;
@@ -2544,10 +2569,11 @@ Label_RetryWithNewSep:
                         {
                             eQuoteState = lcl_isEscapedOrFieldEndQuote( 
nQuotes, p, pSeps, cFieldQuote, rcDetectSep);
 
-                            if (eRetryState == RetryState::ALLOW && 
rcDetectSep == ' ')
+                            if (eRetryState == RetryState::ALLOW && 
rcDetectSep)
                             {
                                 eRetryState = RetryState::RETRY;
-                                rFieldSeparators += " ";
+                                rFieldSeparators += OUStringChar(rcDetectSep);
+                                pSeps = rFieldSeparators.getStr();
                                 goto Label_RetryWithNewSep;
                             }
 
@@ -2593,10 +2619,14 @@ Label_RetryWithNewSep:
                 ++p;
             }
 
-            if (nQuotes % 2 == 0)
+            if ((nQuotes & 1) == 0)
                 // We still have a (theoretical?) problem here if due to
-                // nArbitraryLineLengthLimit we split a string right between a
-                // doubled quote pair.
+                // nArbitraryLineLengthLimit (or nMaxSourceLines below) we
+                // split a string right between a doubled quote pair.
+                break;
+            else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0)
+                // Unconditionally increment nLine even if nMaxSourceLines==0
+                // so it can be observed in debugger.
                 break;
             else
             {
diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx
index b334755ca011..031a0ca3c571 100644
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -194,14 +194,25 @@ public:
     The quote character used.
 
     @param rcDetectSep
-    If 0 then attempt to detect a possible space (blank) separator if
+    If 0 then attempt to detect a possible separator if
     rFieldSeparators doesn't include it already. This can be necessary because
     of the "accept broken misquoted CSV fields" feature that tries to ignore
     trailing blanks after a quoted field and if no separator follows continues
     to add content to the field assuming the single double quote was in error.
-    If this blank separator is detected it is added to rFieldSeparators and the
+    It is also necessary if the only possible separator was not selected and
+    not included in rFieldSeparators and a line starts with a quoted field, in
+    which case appending lines is tried until end of file.
+    If a separator is detected it is added to rFieldSeparators and the
     line is reread with the new separators
 
+    @param nMaxSourceLines
+    Maximum source lines to read and combine into one logical line for embedded
+    new line purpose. Should be limited for the preview dialog because only
+    non-matching separators selected otherwise would lead to trying to
+    concatenate lines until file end.
+    If 0 no limit other than the internal arbitrary resulting line length
+    limit.
+
     check Stream::good() to detect IO problems during read
 
     @ATTENTION
@@ -222,6 +233,7 @@ public:
 
   */
 SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
-        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& 
rcDetectSep );
+        OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& 
rcDetectSep,
+        sal_uInt32 nMaxSourceLines = 0 );
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/ui/inc/scuiasciiopt.hxx 
b/sc/source/ui/inc/scuiasciiopt.hxx
index 140ca09122d0..eae2f2f06bc0 100644
--- a/sc/source/ui/inc/scuiasciiopt.hxx
+++ b/sc/source/ui/inc/scuiasciiopt.hxx
@@ -44,7 +44,7 @@ class ScImportAsciiDlg : public weld::GenericDialogController
     rtl_TextEncoding            meCharSet;          /// Selected char set.
     bool                        mbCharSetSystem;    /// Is System char set 
selected?
     ScImportAsciiCall           meCall;             /// How the dialog is 
called (see asciiopt.hxx)
-    bool                        mbDetectSpaceSep;   /// Whether to detect a 
possible space separator.
+    bool                        mbDetectSep;        /// Whether to detect a 
possible separator.
 
     std::unique_ptr<weld::Label> mxFtCharSet;
     std::unique_ptr<SvxTextEncodingBox> mxLbCharSet;

Reply via email to