This is an automated email from the ASF dual-hosted git repository.
damjan pushed a commit to branch AOO41X
in repository https://gitbox.apache.org/repos/asf/openoffice.git
commit 2cbc43e98ea7b5f261a50824b6707a1bf176f0b7
Author: damjan <damjan@13f79535-47bb-0310-9956-ffa450edef68>
AuthorDate: Sun Apr 17 16:44:43 2016 +0000
Make CSV line parsers consistent with CSV field parsers.
Our CSV field parsing algorithm treats fields starting with a quote
(immediately at the beginning of the row, or after the field delimiter) as
quoted. A quoted field ends at the corresponding closing quote, and any
remaining text between the closing quote and the next field delimiter or end
of line is appended to the text already extracted from the field, but not
processed further. Any quotes in this extra text are taken verbatim - they
do not quote anything.
Our CSV line parsers were big hacks - they essentially read and concatenated
lines until an even number of quote characters was found, and then fed the
result through the CSV field parsers.
This patch rewrites the line parsers to work exactly how the field parsers
work. Text such as:
"another" ",something else
is now correctly parsed by both Calc and Base as:
[another "],[something else]
instead of breaking all further parsing.
Patch by: me
git-svn-id: https://svn.apache.org/repos/asf/openoffice/trunk@1739628
13f79535-47bb-0310-9956-ffa450edef68
(cherry picked from commit bc1fc15f4dddfc075a011a1203c162b446e72868)
---
main/connectivity/source/drivers/flat/ETable.cxx | 62 +++++++++++++++++++++---
main/tools/source/stream/stream.cxx | 59 +++++++++++++++-------
2 files changed, 97 insertions(+), 24 deletions(-)
diff --git a/main/connectivity/source/drivers/flat/ETable.cxx
b/main/connectivity/source/drivers/flat/ETable.cxx
index 396d9df6d7..af28717c07 100644
--- a/main/connectivity/source/drivers/flat/ETable.cxx
+++ b/main/connectivity/source/drivers/flat/ETable.cxx
@@ -907,14 +907,64 @@ sal_Bool OFlatTable::readLine(QuotedTokenizedString&
line, sal_Int32& _rnCurrent
return sal_False;
QuotedTokenizedString sLine = line; // check if the string continues on
next line
- while( (sLine.GetString().GetTokenCount(m_cStringDelimiter) % 2) != 1 )
+ xub_StrLen nLastOffset = 0;
+ bool isQuoted = false;
+ bool isFieldStarting = true;
+ while (true)
{
- m_pFileStream->ReadByteStringLine(sLine,nEncoding);
- if ( !m_pFileStream->IsEof() )
+ bool wasQuote = false;
+ const sal_Unicode *p;
+ p = sLine.GetString().GetBuffer();
+ p += nLastOffset;
+
+ while (*p)
{
- line.GetString().Append('\n');
- line.GetString() += sLine.GetString();
- sLine = line;
+ if (isQuoted)
+ {
+ if (*p == m_cStringDelimiter)
+ wasQuote = !wasQuote;
+ else
+ {
+ if (wasQuote)
+ {
+ wasQuote = false;
+ isQuoted = false;
+ if (*p == m_cFieldDelimiter)
+ isFieldStarting = true;
+ }
+ }
+ }
+ else
+ {
+ if (isFieldStarting)
+ {
+ isFieldStarting = false;
+ if (*p == m_cStringDelimiter)
+ isQuoted = true;
+ else if (*p == m_cFieldDelimiter)
+ isFieldStarting = true;
+ }
+ else if (*p == m_cFieldDelimiter)
+ isFieldStarting = true;
+ }
+ ++p;
+ }
+
+ if (wasQuote)
+ isQuoted = false;
+
+ if (isQuoted)
+ {
+ nLastOffset = sLine.Len();
+ m_pFileStream->ReadByteStringLine(sLine,nEncoding);
+ if ( !m_pFileStream->IsEof() )
+ {
+ line.GetString().Append('\n');
+ line.GetString() += sLine.GetString();
+ sLine = line;
+ }
+ else
+ break;
}
else
break;
diff --git a/main/tools/source/stream/stream.cxx
b/main/tools/source/stream/stream.cxx
index 67e5b370d0..8de4768ceb 100644
--- a/main/tools/source/stream/stream.cxx
+++ b/main/tools/source/stream/stream.cxx
@@ -1128,38 +1128,59 @@ sal_Bool SvStream::ReadCsvLine( String& rStr, sal_Bool
bEmbeddedLineBreak,
{
const sal_Unicode* pSeps = rFieldSeparators.GetBuffer();
xub_StrLen nLastOffset = 0;
- xub_StrLen nQuotes = 0;
+ bool isQuoted = false;
+ bool isFieldStarting = true;
while (!IsEof() && rStr.Len() < STRING_MAXLEN)
{
+ bool wasQuote = false;
bool bBackslashEscaped = false;
- const sal_Unicode *p, *pStart;
- p = pStart = rStr.GetBuffer();
+ const sal_Unicode *p;
+ p = rStr.GetBuffer();
p += nLastOffset;
while (*p)
{
- if (nQuotes)
+ if (isQuoted)
{
if (*p == cFieldQuote && !bBackslashEscaped)
- ++nQuotes;
- else if (bAllowBackslashEscape)
+ wasQuote = !wasQuote;
+ else
{
- if (*p == '\\')
- bBackslashEscaped = !bBackslashEscaped;
- else
- bBackslashEscaped = false;
+ if (bAllowBackslashEscape)
+ {
+ if (*p == '\\')
+ bBackslashEscaped = !bBackslashEscaped;
+ else
+ bBackslashEscaped = false;
+ }
+ if (wasQuote)
+ {
+ wasQuote = false;
+ isQuoted = false;
+ if (lcl_UnicodeStrChr( pSeps, *p ))
+ isFieldStarting = true;
+ }
}
}
- else if (*p == cFieldQuote && (p == pStart ||
- lcl_UnicodeStrChr( pSeps, p[-1])))
- nQuotes = 1;
- // A quote character inside a field content does not start
- // a quote.
+ else
+ {
+ if (isFieldStarting)
+ {
+ isFieldStarting = false;
+ if (*p == cFieldQuote)
+ isQuoted = true;
+ else if (lcl_UnicodeStrChr( pSeps, *p ))
+ isFieldStarting = true;
+ }
+ else if (lcl_UnicodeStrChr( pSeps, *p ))
+ isFieldStarting = true;
+ }
++p;
}
- if (nQuotes % 2 == 0)
- break;
- else
+ if (wasQuote)
+ isQuoted = false;
+
+ if (isQuoted)
{
nLastOffset = rStr.Len();
String aNext;
@@ -1167,6 +1188,8 @@ sal_Bool SvStream::ReadCsvLine( String& rStr, sal_Bool
bEmbeddedLineBreak,
rStr += sal_Unicode(_LF);
rStr += aNext;
}
+ else
+ break;
}
}
return nError == SVSTREAM_OK;