Hi, PFA a patch that attempts to fix the bug that \. on a line by itself is handled incorrectly by COPY FROM ... CSV. This issue has been discussed several times previously, for instance in [1] and [2], and mentioned in the doc for \copy in commit 42d3125.
There's one case that works today: when the line is part of a multi-line quoted section, and the data is read from a file, not from the client. In other situations, an error is raised or the data is cut at the point of \. without an error. The patch addresses that issue in the server and in psql, except for the case of inlined data, where \. cannot be both valid data and an EOF marker at the same time, so it keeps treating it as an EOF marker. [1] https://www.postgresql.org/message-id/10e3eff6-eb04-4b3f-aeb4-b920192b9...@manitou-mail.org [2] https://www.postgresql.org/message-id/8aeab305-5e94-4fa5-82bf-6da6baee6e05%40app.fastmail.com Best regards, -- Daniel Vérité https://postgresql.verite.pro/ Twitter: @DanielVerite
From 868b1e065cf714f315bab65129fd05a1d60fc81b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20V=C3=A9rit=C3=A9?= <dan...@manitou-mail.org> Date: Mon, 18 Dec 2023 20:47:02 +0100 Subject: [PATCH v1] Support backslash-dot on a line by itself as valid data in COPY FROM...CSV --- doc/src/sgml/ref/psql-ref.sgml | 4 ---- src/backend/commands/copyfromparse.c | 13 ++--------- src/bin/psql/copy.c | 32 ++++++++++++++++++++-------- src/test/regress/expected/copy.out | 15 +++++++++++++ src/test/regress/sql/copy.sql | 11 ++++++++++ 5 files changed, 51 insertions(+), 24 deletions(-) diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index cc7d797159..d94e3cacfc 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -1119,10 +1119,6 @@ INSERT INTO tbl1 VALUES ($1, $2) \bind 'first value' 'second value' \g destination, because all data must pass through the client/server connection. For large amounts of data the <acronym>SQL</acronym> command might be preferable. - Also, because of this pass-through method, <literal>\copy - ... from</literal> in <acronym>CSV</acronym> mode will erroneously - treat a <literal>\.</literal> data value alone on a line as an - end-of-input marker. </para> </tip> diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index f553734582..b4c291b25b 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -1137,7 +1137,6 @@ CopyReadLineText(CopyFromState cstate) bool result = false; /* CSV variables */ - bool first_char_in_line = true; bool in_quote = false, last_was_esc = false; char quotec = '\0'; @@ -1335,10 +1334,9 @@ CopyReadLineText(CopyFromState cstate) } /* - * In CSV mode, we only recognize \. alone on a line. This is because - * \. is a valid CSV data value. + * In CSV mode, \. is a valid CSV data value anywhere in the line. */ - if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line)) + if (c == '\\' && !cstate->opts.csv_mode) { char c2; @@ -1442,14 +1440,7 @@ CopyReadLineText(CopyFromState cstate) } } - /* - * This label is for CSV cases where \. appears at the start of a - * line, but there is more text after it, meaning it was a data value. - * We are more strict for \. in CSV mode because \. could be a data - * value, while in non-CSV mode, \. cannot be a data value. - */ not_end_of_copy: - first_char_in_line = false; } /* end of outer loop */ /* diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c index dbbbdb8898..f31a7acbb6 100644 --- a/src/bin/psql/copy.c +++ b/src/bin/psql/copy.c @@ -620,20 +620,34 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res) /* current line is done? */ if (buf[buflen - 1] == '\n') { - /* check for EOF marker, but not on a partial line */ - if (at_line_begin) + /* + * When at the beginning of the line, check for EOF marker. + * If the marker is found and the data is inlined, + * we must stop at this point. If not, the \. line can be + * sent to the server, and we let it decide whether + * it's an EOF or not depending on the format: in + * basic TEXT, \. is going to be interpreted as an EOF, in + * CSV, it will not. + */ + if (at_line_begin && copystream == pset.cur_cmd_source) { - /* - * This code erroneously assumes '\.' on a line alone - * inside a quoted CSV string terminates the \copy. - * https://www.postgresql.org/message-id/e1tdnvq-0001ju...@wrigleys.postgresql.org - * - * https://www.postgresql.org/message-id/bfcd57e4-8f23-4c3e-a5db-2571d0920...@beta.fastmail.com - */ if ((linelen == 3 && memcmp(fgresult, "\\.\n", 3) == 0) || (linelen == 4 && memcmp(fgresult, "\\.\r\n", 4) == 0)) { copydone = true; + /* + * Remove the EOF marker from the data sent, unless + * using the obsolete v2 protocol that needs it. + * In the case of CSV, the EOF marker must be + * removed, otherwise it would be interpreted by + * the server as valid data. + */ + if (copystream == pset.cur_cmd_source && + PQprotocolVersion(conn) >= 3) + { + *fgresult='\0'; + buflen -= linelen; + } } } diff --git a/src/test/regress/expected/copy.out b/src/test/regress/expected/copy.out index b48365ec98..fc1e6cfd7b 100644 --- a/src/test/regress/expected/copy.out +++ b/src/test/regress/expected/copy.out @@ -32,6 +32,21 @@ select * from copytest except select * from copytest2; -------+------+-------- (0 rows) +--- test unquoted .\ as data inside CSV +truncate copytest2; +insert into copytest2(test) values('line1'), ('\.'), ('line2'); +copy (select test from copytest2 order by test collate "C") to :'filename' csv; +-- get the data back in with copy +truncate copytest2; +copy copytest2(test) from :'filename' csv; +select test from copytest2 order by test collate "C"; + test +------- + \. + line1 + line2 +(3 rows) + -- test header line feature create temp table copytest3 ( c1 int, diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql index 43d2e906dd..24ecb0485a 100644 --- a/src/test/regress/sql/copy.sql +++ b/src/test/regress/sql/copy.sql @@ -38,6 +38,17 @@ copy copytest2 from :'filename' csv quote '''' escape E'\\'; select * from copytest except select * from copytest2; +--- test unquoted .\ as data inside CSV + +truncate copytest2; + +insert into copytest2(test) values('line1'), ('\.'), ('line2'); +copy (select test from copytest2 order by test collate "C") to :'filename' csv; +-- get the data back in with copy +truncate copytest2; +copy copytest2(test) from :'filename' csv; +select test from copytest2 order by test collate "C"; + -- test header line feature -- 2.34.1