sc/source/ui/dbgui/scuiasciiopt.cxx | 40 ++++++++++++++++++++-------- sc/source/ui/docshell/impex.cxx | 50 ++++++++++++++++++++++++++++-------- sc/source/ui/inc/impex.hxx | 18 ++++++++++-- sc/source/ui/inc/scuiasciiopt.hxx | 2 - 4 files changed, 85 insertions(+), 25 deletions(-)
New commits: commit 73356b6c678643f222b98e111a90f68c93694154 Author: Eike Rathke <er...@redhat.com> AuthorDate: Wed Aug 18 19:05:08 2021 +0200 Commit: Balazs Varga <balazs.varga.ext...@allotropia.de> CommitDate: Thu Apr 20 13:07:06 2023 +0200 Resolves: tdf#102846 CSV: Detect separator, limit preview line concatenations In CSV import preview, if a line starts with a quote character and none of the remembered last field separators used occur in data in conjunction with a closing quote, then reading data tried to concatenate line by line to form a data field to be presented in the preview, worst case the entire file.. For the preview, detect one possible not yet selected separator if used with a quoted field (similar to commit c807e7ea7a0725a4d8375eda07d6f70870e0d50a for tdf#56910 space separator) and limit the number of source lines that are tried to be concatenated if no separator was encountered after a possibly closing field quote. Change-Id: Iefd37a8301161e72cb607cea88d4faadad47b4ae Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120690 Reviewed-by: Eike Rathke <er...@redhat.com> Tested-by: Jenkins Reviewed-on: https://gerrit.libreoffice.org/c/core/+/150680 Tested-by: Balazs Varga <balazs.varga.ext...@allotropia.de> Reviewed-by: Balazs Varga <balazs.varga.ext...@allotropia.de> diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx index 75d915438f44..447563e1d8ae 100644 --- a/sc/source/ui/dbgui/scuiasciiopt.cxx +++ b/sc/source/ui/dbgui/scuiasciiopt.cxx @@ -40,6 +40,14 @@ //! TODO make dynamic const SCSIZE ASCIIDLG_MAXROWS = MAXROWCOUNT; +// Maximum number of source lines to concatenate while generating the preview +// for one logical line. This may result in a wrong preview if the actual +// number of embedded line feeds is greater, but a number too high would take +// too much time (loop excessively if unlimited and large data) if none of the +// selected separators are actually used in data but a field at start of line +// is quoted. +constexpr sal_uInt32 kMaxEmbeddedLinefeeds = 500; + using namespace com::sun::star::uno; // Defines - CSV Import Preserve Options @@ -285,7 +293,7 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, const OUString& aDatNa , mnRowPosCount(0) , mcTextSep(ScAsciiOptions::cDefaultTextSep) , meCall(eCall) - , mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS) + , mbDetectSep(eCall != SC_TEXTTOCOLUMNS) , mxFtCharSet(m_xBuilder->weld_label("textcharset")) , mxLbCharSet(new SvxTextEncodingBox(m_xBuilder->weld_combo_box("charset"))) , mxFtCustomLang(m_xBuilder->weld_label("textlanguage")) @@ -555,7 +563,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r break; } rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, - mcTextSep, rcDetectSep); + mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds); mnStreamPos = mpDatStream->Tell(); mpRowPosArray[++mnRowPosCount] = mnStreamPos; } while (nLine >= mnRowPosCount && mpDatStream->good()); @@ -570,7 +578,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r else { Seek( mpRowPosArray[nLine]); - rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep); + rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds); mnStreamPos = mpDatStream->Tell(); } @@ -769,7 +777,11 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void) // when the dialog wasn't already presented to the user. // As a side effect this has the benefit that the check is only done on the // first set of visible lines. - sal_Unicode cDetectSep = (mbDetectSpaceSep && !mxRbFixed->get_active() && !mxCkbSpace->get_active() ? 0 : 0xffff); + sal_Unicode cDetectSep = ((mbDetectSep && !mxRbFixed->get_active() + && (!mxCkbTab->get_active() || !mxCkbSemicolon->get_active() + || !mxCkbComma->get_active() || !mxCkbSpace->get_active())) ? 0 : 0xffff); + if (cDetectSep == 0xffff) + mbDetectSep = false; sal_Int32 nBaseLine = mxTableBox->GetGrid().GetFirstVisLine(); sal_Int32 nRead = mxTableBox->GetGrid().GetVisLineCount(); @@ -789,16 +801,22 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void) for (; i < CSV_PREVIEW_LINES; i++) maPreviewLine[i].clear(); - if (mbDetectSpaceSep) + if (mbDetectSep) { - mbDetectSpaceSep = false; - if (cDetectSep == ' ') + mbDetectSep = false; + if (cDetectSep) { - // Expect space to be appended by now so all subsequent + // Expect separator to be appended by now so all subsequent // GetLine()/ReadCsvLine() actually used it. - assert(maFieldSeparators.endsWith(" ")); - // Preselect Space in UI. - mxCkbSpace->set_active(true); + assert(maFieldSeparators.endsWith(OUStringChar(cDetectSep))); + // Preselect separator in UI. + switch (cDetectSep) + { + case '\t': mxCkbTab->set_active(true); break; + case ';': mxCkbSemicolon->set_active(true); break; + case ',': mxCkbComma->set_active(true); break; + case ' ': mxCkbSpace->set_active(true); break; + } } } diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx index 8db98e388603..5adc2093c185 100644 --- a/sc/source/ui/docshell/impex.cxx +++ b/sc/source/ui/docshell/impex.cxx @@ -566,17 +566,40 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p // Due to broken CSV generators that don't double embedded quotes check if // a field separator immediately or with trailing spaces follows the quote, // only then end the field, or at end of string. - const sal_Unicode cBlank = ' '; + constexpr sal_Unicode cBlank = ' '; if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank)) return FIELDEND_QUOTE; // Detect a possible blank separator if it's not already in the list (which // was checked right above for p[1]==cBlank). - if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank) - rcDetectSep = cBlank; + const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank); while (p[1] == cBlank) ++p; if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1])) return FIELDEND_QUOTE; + // Extended separator detection after a closing quote (with or without + // blanks). Note that nQuotes is incremented *after* the call so is not yet + // even here, and that with separator detection we reach here only if + // lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or + // SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have + // to be checked. + if (!rcDetectSep) + { + constexpr sal_Unicode vSep[] = { ',', '\t', ';' }; + for (const sal_Unicode c : vSep) + { + if (p[1] == c) + { + rcDetectSep = c; + return FIELDEND_QUOTE; + } + } + } + // Blank separator is least significant, after others. + if (bBlankSep) + { + rcDetectSep = cBlank; + return FIELDEND_QUOTE; + } return DONTKNOW_QUOTE; } @@ -604,7 +627,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p, const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep ) { - if ((nQuotes % 2) == 0) + if ((nQuotes & 1) == 0) { if (p[-1] == cStr) return SECOND_QUOTE; @@ -2388,7 +2411,7 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr ) } OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak, - OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep ) + OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep, sal_uInt32 nMaxSourceLines ) { enum RetryState { @@ -2413,6 +2436,8 @@ Label_RetryWithNewSep: if (bEmbeddedLineBreak) { + sal_uInt32 nLine = 0; + const sal_Unicode* pSeps = rFieldSeparators.getStr(); QuoteType eQuoteState = FIELDEND_QUOTE; @@ -2444,10 +2469,11 @@ Label_RetryWithNewSep: { eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep); - if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ') + if (eRetryState == RetryState::ALLOW && rcDetectSep) { eRetryState = RetryState::RETRY; - rFieldSeparators += OUString(' '); + rFieldSeparators += OUStringChar(rcDetectSep); + pSeps = rFieldSeparators.getStr(); goto Label_RetryWithNewSep; } @@ -2493,10 +2519,14 @@ Label_RetryWithNewSep: ++p; } - if (nQuotes % 2 == 0) + if ((nQuotes & 1) == 0) // We still have a (theoretical?) problem here if due to - // nArbitraryLineLengthLimit we split a string right between a - // doubled quote pair. + // nArbitraryLineLengthLimit (or nMaxSourceLines below) we + // split a string right between a doubled quote pair. + break; + else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0) + // Unconditionally increment nLine even if nMaxSourceLines==0 + // so it can be observed in debugger. break; else { diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx index 91163c4c34e8..25f795743731 100644 --- a/sc/source/ui/inc/impex.hxx +++ b/sc/source/ui/inc/impex.hxx @@ -180,14 +180,25 @@ public: The quote character used. @param rcDetectSep - If 0 then attempt to detect a possible space (blank) separator if + If 0 then attempt to detect a possible separator if rFieldSeparators doesn't include it already. This can be necessary because of the "accept broken misquoted CSV fields" feature that tries to ignore trailing blanks after a quoted field and if no separator follows continues to add content to the field assuming the single double quote was in error. - If this blank separator is detected it is added to rFieldSeparators and the + It is also necessary if the only possible separator was not selected and + not included in rFieldSeparators and a line starts with a quoted field, in + which case appending lines is tried until end of file. + If a separator is detected it is added to rFieldSeparators and the line is reread with the new separators + @param nMaxSourceLines + Maximum source lines to read and combine into one logical line for embedded + new line purpose. Should be limited for the preview dialog because only + non-matching separators selected otherwise would lead to trying to + concatenate lines until file end. + If 0 no limit other than the internal arbitrary resulting line length + limit. + check Stream::good() to detect IO problems during read @ATTENTION @@ -208,7 +219,8 @@ public: */ SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak, - OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep ); + OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep, + sal_uInt32 nMaxSourceLines = 0 ); #endif diff --git a/sc/source/ui/inc/scuiasciiopt.hxx b/sc/source/ui/inc/scuiasciiopt.hxx index 1b263931bc6f..472d1a498267 100644 --- a/sc/source/ui/inc/scuiasciiopt.hxx +++ b/sc/source/ui/inc/scuiasciiopt.hxx @@ -52,7 +52,7 @@ class ScImportAsciiDlg : public weld::GenericDialogController rtl_TextEncoding meCharSet; /// Selected char set. bool mbCharSetSystem; /// Is System char set selected? ScImportAsciiCall const meCall; /// How the dialog is called (see asciiopt.hxx) - bool mbDetectSpaceSep; /// Whether to detect a possible space separator. + bool mbDetectSep; /// Whether to detect a possible separator. std::unique_ptr<weld::Label> mxFtCharSet; std::unique_ptr<SvxTextEncodingBox> mxLbCharSet;