Hi,

I just noticed that if you load a file using psql:

\copy <table> from <local file>

it sends every line as a separate FE/BE protocol CopyData packet. That's pretty wasteful if the lines are narrow. The overhead of each CopyData packet is 5 bytes.

To demonstrate, I generated a simple test file with the string "foobar" repeated 10 million times:

$ perl -le 'for (1..10000000) { print "foobar" }' > /tmp/testdata

and loaded that into a temp table with psql:

create temporary table copytest (t text) on commit delete rows;
\copy copytest from '/tmp/testdata';

I repeated and timed the \copy a few times; it takes about about 3 seconds on my laptop:

postgres=# \copy copytest from '/tmp/testdata';
COPY 10000000
Time: 3039.625 ms (00:03.040)

Wireshark says that that involved about 120 MB of network traffic. The size of the file on disk is only 70 MB.

The attached patch modifies psql so that it buffers up 8 kB of data into each CopyData message, instead of sending one per line. That makes the operation faster:

postgres=# \copy copytest from '/tmp/testdata';
COPY 10000000
Time: 2490.268 ms (00:02.490)

And wireshark confirms that there's now only a bit over 70 MB of network traffic.

I'll add this to the next commitfest. There's similar inefficiency in the server side in COPY TO, but I'll leave that for another patch.

- Heikki
>From e49334de4eb51f5ba061bf48d1979a7ab8ef0de9 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Sun, 7 Feb 2021 00:10:05 +0200
Subject: [PATCH 1/1] In psql \copy from, send data to server in larger chunks.

Previously, we would send each line as a separate CopyData message.
That's pretty wasteful if the table is narrow, as each CopyData message
has 5 bytes of overhead. For efficiency, buffer up and pack 8 kB of input
data into each CopyData message.
---
 src/bin/psql/copy.c | 114 +++++++++++++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 45 deletions(-)

diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c
index 78f0dc5a507..8c56f2470e2 100644
--- a/src/bin/psql/copy.c
+++ b/src/bin/psql/copy.c
@@ -581,77 +581,101 @@ handleCopyIn(PGconn *conn, FILE *copystream, bool isbinary, PGresult **res)
 	else
 	{
 		bool		copydone = false;
+		int			buflen;
+		bool		at_line_begin = true;
 
+		if (showprompt)
+		{
+			const char *prompt = get_prompt(PROMPT_COPY, NULL);
+
+			fputs(prompt, stdout);
+			fflush(stdout);
+		}
+
+		/*
+		 * In text mode, We have to read the input one line at a time, so that
+		 * we can stop reading at the EOF marker (\.).  We mustn't read beyond
+		 * the EOF marker, because if the data is inlined in a SQL script, we
+		 * would eat up the commands after the EOF marker.
+		 */
+		buflen = 0;
 		while (!copydone)
-		{						/* for each input line ... */
-			bool		firstload;
-			bool		linedone;
+		{
+			int			linelen;
+			char	   *fgresult;
 
-			if (showprompt)
+			if (at_line_begin && showprompt)
 			{
-				const char *prompt = get_prompt(PROMPT_COPY, NULL);
+				const char *prompt;
 
+				sigint_interrupt_enabled = false;
+
+				prompt = get_prompt(PROMPT_COPY, NULL);
 				fputs(prompt, stdout);
 				fflush(stdout);
-			}
 
-			firstload = true;
-			linedone = false;
-
-			while (!linedone)
-			{					/* for each bufferload in line ... */
-				int			linelen;
-				char	   *fgresult;
-
-				/* enable longjmp while waiting for input */
 				sigint_interrupt_enabled = true;
+			}
 
-				fgresult = fgets(buf, sizeof(buf), copystream);
-
-				sigint_interrupt_enabled = false;
+			/* enable longjmp while waiting for input */
+			sigint_interrupt_enabled = true;
 
-				if (!fgresult)
-				{
-					copydone = true;
-					break;
-				}
+			fgresult = fgets(&buf[buflen], COPYBUFSIZ - buflen, copystream);
 
-				linelen = strlen(buf);
+			sigint_interrupt_enabled = false;
 
-				/* current line is done? */
-				if (linelen > 0 && buf[linelen - 1] == '\n')
-					linedone = true;
+			if (!fgresult)
+				copydone = true;
+			else
+			{
+				linelen = strlen(fgresult);
+				buflen += linelen;
 
-				/* check for EOF marker, but not on a partial line */
-				if (firstload)
+				if (buf[buflen - 1] == '\n')
 				{
-					/*
-					 * This code erroneously assumes '\.' on a line alone
-					 * inside a quoted CSV string terminates the \copy.
-					 * https://www.postgresql.org/message-id/e1tdnvq-0001ju...@wrigleys.postgresql.org
-					 */
-					if (strcmp(buf, "\\.\n") == 0 ||
-						strcmp(buf, "\\.\r\n") == 0)
+					/* check for EOF marker, but not on a partial line */
+					if (at_line_begin)
 					{
-						copydone = true;
-						break;
+						/*
+						 * This code erroneously assumes '\.' on a line alone
+						 * inside a quoted CSV string terminates the \copy.
+						 * https://www.postgresql.org/message-id/e1tdnvq-0001ju...@wrigleys.postgresql.org
+						 */
+						if ((linelen == 3 && memcmp(fgresult, "\\.\n", 3) == 0) ||
+							(linelen == 4 && memcmp(fgresult, "\\.\r\n", 4) == 0))
+						{
+							copydone = true;
+						}
 					}
 
-					firstload = false;
+					if (copystream == pset.cur_cmd_source)
+					{
+						pset.lineno++;
+						pset.stmt_lineno++;
+					}
+					at_line_begin = true;
 				}
+				else
+					at_line_begin = false;
+			}
 
-				if (PQputCopyData(conn, buf, linelen) <= 0)
+			/*
+			 * If the buffer is full, or we've reached the EOF, flush it.
+			 *
+			 * Make sure there's always space for four more bytes in the buffer,
+			 * plus a NUL terminator. That way, an EOF marker is never split
+			 * across two fgets() calls, which simplies the logic.
+			 */
+			if (buflen >= COPYBUFSIZ - 5 || (copydone && buflen > 0))
+			{
+				if (PQputCopyData(conn, buf, buflen) <= 0)
 				{
 					OK = false;
 					copydone = true;
 					break;
 				}
-			}
 
-			if (copystream == pset.cur_cmd_source)
-			{
-				pset.lineno++;
-				pset.stmt_lineno++;
+				buflen = 0;
 			}
 		}
 	}
-- 
2.30.0

Reply via email to