Tom Lane wrote:
> I've looked over this patch and I generally agree that this is a
> reasonable solution.
Thanks for reviewing this!
> I'm also wondering why the patch adds a test for
> "PQprotocolVersion(conn) >= 3" in handleCopyIn.
I've removed this in the attached update.
> I concur with Robert's doubts about some of the doc changes though.
> In particular, since we're not changing the behavior for non-CSV
> mode, we shouldn't remove any statements that apply to non-CSV mode.
I don't quite understand the issues with the doc changes. Let me
detail the changes.
The first change is under
<refsect2>
<title>CSV Format</title>
so it does no concern non-CSV modes.
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -761,11 +761,7 @@ COPY <replaceable class="parameter">count</replaceable>
format, <literal>\.</literal>, the end-of-data marker, could also appear
as a data value. To avoid any misinterpretation, a <literal>\.</literal>
data value appearing as a lone entry on a line is automatically
- quoted on output, and on input, if quoted, is not interpreted as the
- end-of-data marker. If you are loading a file created by another
- application that has a single unquoted column and might have a
- value of <literal>\.</literal>, you might need to quote that value in
the
- input file.
+ quoted on output.
</para>
The part about quoting output is kept because the code still does that.
About this bit:
"and on input, if quoted, is not interpreted as the end-of-data marker."
So the patch removes that part. The reason is \. is now not interpreted as
EOD in both cases, quoted or unquoted, conforming to spec.
Previously, what the reader should have understood by "if quoted, ..."
is that it implies "if not quoted, then .\ will be interpreted as EOD
even though that behavior does not conform to the CSV spec".
If we documented the new behavior, it would be something like:
when quoted, it works as expected, and when unquoted, it works as
expected too. Isn't it simpler not to say anything?
About the next sentence:
"If you are loading a file created by another application
that has a single unquoted column and might have a value of \., you
might need to quote that value in the input file."
It's removed as well because it's not longer necessary
to do that.
The other hunk is in psql doc:
--- a/doc/src/sgml/ref/psql-ref.sgml
+++ b/doc/src/sgml/ref/psql-ref.sgml
@@ -1119,10 +1119,6 @@ INSERT INTO tbl1 VALUES ($1, $2) \bind 'first value'
'second value' \g
destination, because all data must pass through the client/server
connection. For large amounts of data the <acronym>SQL</acronym>
command might be preferable.
- Also, because of this pass-through method, <literal>\copy
- ... from</literal> in <acronym>CSV</acronym> mode will erroneously
- treat a <literal>\.</literal> data value alone on a line as an
- end-of-input marker.
That behavior no longer happens, so this gets removed as well.
Best regards,
--
Daniel Vérité
https://postgresql.verite.pro/
Twitter: @DanielVerite
From 5a16b81f10e47fe1ac3842abd34e8bef7e639e26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20V=C3=A9rit=C3=A9?= <[email protected]>
Date: Fri, 5 Apr 2024 14:22:49 +0200
Subject: [PATCH v4] Support backslash-dot on a line by itself as valid data in
COPY FROM...CSV
---
doc/src/sgml/ref/copy.sgml | 6 +--
doc/src/sgml/ref/psql-ref.sgml | 4 --
src/backend/commands/copyfromparse.c | 57 +++++++---------------------
src/bin/psql/copy.c | 30 ++++++++++-----
src/test/regress/expected/copy.out | 18 +++++++++
src/test/regress/sql/copy.sql | 12 ++++++
6 files changed, 65 insertions(+), 62 deletions(-)
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml
index 33ce7c4ea6..a63f073c1b 100644
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -814,11 +814,7 @@ COPY <replaceable class="parameter">count</replaceable>
format, <literal>\.</literal>, the end-of-data marker, could also appear
as a data value. To avoid any misinterpretation, a <literal>\.</literal>
data value appearing as a lone entry on a line is automatically
- quoted on output, and on input, if quoted, is not interpreted as the
- end-of-data marker. If you are loading a file created by another
- application that has a single unquoted column and might have a
- value of <literal>\.</literal>, you might need to quote that value in the
- input file.
+ quoted on output.
</para>
<note>
diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml
index cc7d797159..d94e3cacfc 100644
--- a/doc/src/sgml/ref/psql-ref.sgml
+++ b/doc/src/sgml/ref/psql-ref.sgml
@@ -1119,10 +1119,6 @@ INSERT INTO tbl1 VALUES ($1, $2) \bind 'first value'
'second value' \g
destination, because all data must pass through the client/server
connection. For large amounts of data the <acronym>SQL</acronym>
command might be preferable.
- Also, because of this pass-through method, <literal>\copy
- ... from</literal> in <acronym>CSV</acronym> mode will erroneously
- treat a <literal>\.</literal> data value alone on a line as an
- end-of-input marker.
</para>
</tip>
diff --git a/src/backend/commands/copyfromparse.c
b/src/backend/commands/copyfromparse.c
index 7ddd27f5c6..2c6ce5e4f8 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -136,14 +136,6 @@ if (1) \
} \
} else ((void) 0)
-/* Undo any read-ahead and jump out of the block. */
-#define NO_END_OF_COPY_GOTO \
-if (1) \
-{ \
- input_buf_ptr = prev_raw_ptr + 1; \
- goto not_end_of_copy; \
-} else ((void) 0)
-
/* NOTE: there's a copy of this in copyto.c */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
@@ -1182,7 +1174,6 @@ CopyReadLineText(CopyFromState cstate)
bool result = false;
/* CSV variables */
- bool first_char_in_line = true;
bool in_quote = false,
last_was_esc = false;
char quotec = '\0';
@@ -1380,10 +1371,9 @@ CopyReadLineText(CopyFromState cstate)
}
/*
- * In CSV mode, we only recognize \. alone on a line. This is
because
- * \. is a valid CSV data value.
+ * In CSV mode, backslash is a normal character.
*/
- if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line))
+ if (c == '\\' && !cstate->opts.csv_mode)
{
char c2;
@@ -1416,21 +1406,15 @@ CopyReadLineText(CopyFromState cstate)
if (c2 == '\n')
{
- if (!cstate->opts.csv_mode)
- ereport(ERROR,
-
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-
errmsg("end-of-copy marker does not match previous newline style")));
- else
- NO_END_OF_COPY_GOTO;
+ ereport(ERROR,
+
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+
errmsg("end-of-copy marker does not match previous newline style")));
}
else if (c2 != '\r')
{
- if (!cstate->opts.csv_mode)
- ereport(ERROR,
-
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-
errmsg("end-of-copy marker corrupt")));
- else
- NO_END_OF_COPY_GOTO;
+ ereport(ERROR,
+
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+
errmsg("end-of-copy marker corrupt")));
}
}
@@ -1441,12 +1425,9 @@ CopyReadLineText(CopyFromState cstate)
if (c2 != '\r' && c2 != '\n')
{
- if (!cstate->opts.csv_mode)
- ereport(ERROR,
-
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-
errmsg("end-of-copy marker corrupt")));
- else
- NO_END_OF_COPY_GOTO;
+ ereport(ERROR,
+
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy
marker corrupt")));
}
if ((cstate->eol_type == EOL_NL && c2 != '\n')
||
@@ -1470,7 +1451,7 @@ CopyReadLineText(CopyFromState cstate)
result = true; /* report EOF */
break;
}
- else if (!cstate->opts.csv_mode)
+ else
{
/*
* If we are here, it means we found a
backslash followed by
@@ -1478,23 +1459,11 @@ CopyReadLineText(CopyFromState cstate)
* after a backslash is special, so we skip
over that second
* character too. If we didn't do that \\.
would be
* considered an eof-of copy, while in non-CSV
mode it is a
- * literal backslash followed by a period. In
CSV mode,
- * backslashes are not special, so we want to
process the
- * character after the backslash just like a
normal character,
- * so we don't increment in those cases.
+ * literal backslash followed by a period.
*/
input_buf_ptr++;
}
}
-
- /*
- * This label is for CSV cases where \. appears at the start of
a
- * line, but there is more text after it, meaning it was a data
value.
- * We are more strict for \. in CSV mode because \. could be a
data
- * value, while in non-CSV mode, \. cannot be a data value.
- */
-not_end_of_copy:
- first_char_in_line = false;
} /* end of outer
loop */
/*
diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c
index 961ae32694..a39818b193 100644
--- a/src/bin/psql/copy.c
+++ b/src/bin/psql/copy.c
@@ -620,20 +620,32 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool
isbinary, PGresult **res)
/* current line is done? */
if (buf[buflen - 1] == '\n')
{
- /* check for EOF marker, but not on a
partial line */
- if (at_line_begin)
+ /*
+ * When at the beginning of the line,
check for EOF marker.
+ * If the marker is found and the data
is inlined,
+ * we must stop at this point. If not,
the \. line can be
+ * sent to the server, and we let it
decide whether
+ * it's an EOF or not depending on the
format: in
+ * basic TEXT, \. is going to be
interpreted as an EOF, in
+ * CSV, it will not.
+ */
+ if (at_line_begin && copystream ==
pset.cur_cmd_source)
{
- /*
- * This code erroneously
assumes '\.' on a line alone
- * inside a quoted CSV string
terminates the \copy.
- *
https://www.postgresql.org/message-id/[email protected]
- *
- *
https://www.postgresql.org/message-id/[email protected]
- */
if ((linelen == 3 &&
memcmp(fgresult, "\\.\n", 3) == 0) ||
(linelen == 4 &&
memcmp(fgresult, "\\.\r\n", 4) == 0))
{
copydone = true;
+ /*
+ * Remove the EOF
marker from the data sent.
+ * In the case of CSV,
the EOF marker must be
+ * removed, otherwise
it would be interpreted by
+ * the server as valid
data.
+ */
+ if (copystream ==
pset.cur_cmd_source)
+ {
+ *fgresult='\0';
+ buflen -=
linelen;
+ }
}
}
diff --git a/src/test/regress/expected/copy.out
b/src/test/regress/expected/copy.out
index b48365ec98..645e849bbd 100644
--- a/src/test/regress/expected/copy.out
+++ b/src/test/regress/expected/copy.out
@@ -32,6 +32,24 @@ select * from copytest except select * from copytest2;
-------+------+--------
(0 rows)
+--- test unquoted \. as data inside CSV
+-- do not use copy out to export the data, as it would quote \.
+\o :filename
+\qecho line1
+\qecho '\\.'
+\qecho line2
+\o
+-- get the data back in with copy
+truncate copytest2;
+copy copytest2(test) from :'filename' csv;
+select test from copytest2 order by test collate "C";
+ test
+-------
+ \.
+ line1
+ line2
+(3 rows)
+
-- test header line feature
create temp table copytest3 (
c1 int,
diff --git a/src/test/regress/sql/copy.sql b/src/test/regress/sql/copy.sql
index 43d2e906dd..68a58740cb 100644
--- a/src/test/regress/sql/copy.sql
+++ b/src/test/regress/sql/copy.sql
@@ -38,6 +38,18 @@ copy copytest2 from :'filename' csv quote '''' escape E'\\';
select * from copytest except select * from copytest2;
+--- test unquoted \. as data inside CSV
+-- do not use copy out to export the data, as it would quote \.
+\o :filename
+\qecho line1
+\qecho '\\.'
+\qecho line2
+\o
+-- get the data back in with copy
+truncate copytest2;
+copy copytest2(test) from :'filename' csv;
+select test from copytest2 order by test collate "C";
+
-- test header line feature
--
2.34.1