Hi,
I just noticed that if you load a file using psql:
\copy <table> from <local file>
it sends every line as a separate FE/BE protocol CopyData packet. That's
pretty wasteful if the lines are narrow. The overhead of each CopyData
packet is 5 bytes.
To demonstrate, I generated a simple test file with the string "foobar"
repeated 10 million times:
$ perl -le 'for (1..10000000) { print "foobar" }' > /tmp/testdata
and loaded that into a temp table with psql:
create temporary table copytest (t text) on commit delete rows;
\copy copytest from '/tmp/testdata';
I repeated and timed the \copy a few times; it takes about about 3
seconds on my laptop:
postgres=# \copy copytest from '/tmp/testdata';
COPY 10000000
Time: 3039.625 ms (00:03.040)
Wireshark says that that involved about 120 MB of network traffic. The
size of the file on disk is only 70 MB.
The attached patch modifies psql so that it buffers up 8 kB of data into
each CopyData message, instead of sending one per line. That makes the
operation faster:
postgres=# \copy copytest from '/tmp/testdata';
COPY 10000000
Time: 2490.268 ms (00:02.490)
And wireshark confirms that there's now only a bit over 70 MB of network
traffic.
I'll add this to the next commitfest. There's similar inefficiency in
the server side in COPY TO, but I'll leave that for another patch.
- Heikki
>From e49334de4eb51f5ba061bf48d1979a7ab8ef0de9 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Sun, 7 Feb 2021 00:10:05 +0200
Subject: [PATCH 1/1] In psql \copy from, send data to server in larger chunks.
Previously, we would send each line as a separate CopyData message.
That's pretty wasteful if the table is narrow, as each CopyData message
has 5 bytes of overhead. For efficiency, buffer up and pack 8 kB of input
data into each CopyData message.
---
src/bin/psql/copy.c | 114 +++++++++++++++++++++++++++-----------------
1 file changed, 69 insertions(+), 45 deletions(-)
diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c
index 78f0dc5a507..8c56f2470e2 100644
--- a/src/bin/psql/copy.c
+++ b/src/bin/psql/copy.c
@@ -581,77 +581,101 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res)
else
{
bool copydone = false;
+ int buflen;
+ bool at_line_begin = true;
+ if (showprompt)
+ {
+ const char *prompt = get_prompt(PROMPT_COPY, NULL);
+
+ fputs(prompt, stdout);
+ fflush(stdout);
+ }
+
+ /*
+ * In text mode, We have to read the input one line at a time, so that
+ * we can stop reading at the EOF marker (\.). We mustn't read beyond
+ * the EOF marker, because if the data is inlined in a SQL script, we
+ * would eat up the commands after the EOF marker.
+ */
+ buflen = 0;
while (!copydone)
- { /* for each input line ... */
- bool firstload;
- bool linedone;
+ {
+ int linelen;
+ char *fgresult;
- if (showprompt)
+ if (at_line_begin && showprompt)
{
- const char *prompt = get_prompt(PROMPT_COPY, NULL);
+ const char *prompt;
+ sigint_interrupt_enabled = false;
+
+ prompt = get_prompt(PROMPT_COPY, NULL);
fputs(prompt, stdout);
fflush(stdout);
- }
- firstload = true;
- linedone = false;
-
- while (!linedone)
- { /* for each bufferload in line ... */
- int linelen;
- char *fgresult;
-
- /* enable longjmp while waiting for input */
sigint_interrupt_enabled = true;
+ }
- fgresult = fgets(buf, sizeof(buf), copystream);
-
- sigint_interrupt_enabled = false;
+ /* enable longjmp while waiting for input */
+ sigint_interrupt_enabled = true;
- if (!fgresult)
- {
- copydone = true;
- break;
- }
+ fgresult = fgets(&buf[buflen], COPYBUFSIZ - buflen, copystream);
- linelen = strlen(buf);
+ sigint_interrupt_enabled = false;
- /* current line is done? */
- if (linelen > 0 && buf[linelen - 1] == '\n')
- linedone = true;
+ if (!fgresult)
+ copydone = true;
+ else
+ {
+ linelen = strlen(fgresult);
+ buflen += linelen;
- /* check for EOF marker, but not on a partial line */
- if (firstload)
+ if (buf[buflen - 1] == '\n')
{
- /*
- * This code erroneously assumes '\.' on a line alone
- * inside a quoted CSV string terminates the \copy.
- * https://www.postgresql.org/message-id/e1tdnvq-0001ju...@wrigleys.postgresql.org
- */
- if (strcmp(buf, "\\.\n") == 0 ||
- strcmp(buf, "\\.\r\n") == 0)
+ /* check for EOF marker, but not on a partial line */
+ if (at_line_begin)
{
- copydone = true;
- break;
+ /*
+ * This code erroneously assumes '\.' on a line alone
+ * inside a quoted CSV string terminates the \copy.
+ * https://www.postgresql.org/message-id/e1tdnvq-0001ju...@wrigleys.postgresql.org
+ */
+ if ((linelen == 3 && memcmp(fgresult, "\\.\n", 3) == 0) ||
+ (linelen == 4 && memcmp(fgresult, "\\.\r\n", 4) == 0))
+ {
+ copydone = true;
+ }
}
- firstload = false;
+ if (copystream == pset.cur_cmd_source)
+ {
+ pset.lineno++;
+ pset.stmt_lineno++;
+ }
+ at_line_begin = true;
}
+ else
+ at_line_begin = false;
+ }
- if (PQputCopyData(conn, buf, linelen) <= 0)
+ /*
+ * If the buffer is full, or we've reached the EOF, flush it.
+ *
+ * Make sure there's always space for four more bytes in the buffer,
+ * plus a NUL terminator. That way, an EOF marker is never split
+ * across two fgets() calls, which simplies the logic.
+ */
+ if (buflen >= COPYBUFSIZ - 5 || (copydone && buflen > 0))
+ {
+ if (PQputCopyData(conn, buf, buflen) <= 0)
{
OK = false;
copydone = true;
break;
}
- }
- if (copystream == pset.cur_cmd_source)
- {
- pset.lineno++;
- pset.stmt_lineno++;
+ buflen = 0;
}
}
}
--
2.30.0