Re: Parallel copy

Heikki Linnakangas Fri, 30 Oct 2020 09:53:39 -0700

On 30/10/2020 18:36, Heikki Linnakangas wrote:

Whether the leader process finds the EOLs or the worker processes, it's
pretty clear that it needs to be done ASAP, for a chunk at a time,
because that cannot be done in parallel. I think some refactoring in
CopyReadLine() and friends would be in order. It probably would be
faster, or at least not slower, to find all the EOLs in a block in one
tight loop, even when parallel copy is not used.

Something like the attached. It passes the regression tests, but it's quite incomplete. It's missing handling of "\." as end-of-file marker, and I haven't tested encoding conversions at all, for starters. Quick testing suggests that this is a little bit faster than the current code, but the difference is small; I had to use a "WHERE false" option to really see the difference.

The crucial thing here is that there's a new function, ParseLinesText(), to find all end-of-line characters in a buffer in one go. In this patch, it's used against 'raw_buf', but with parallel copy, you could point it at a block in shared memory instead.


- Heikki

>From af3be3bd4e77b66f4605393617da0d15ec21e15b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Fri, 30 Oct 2020 18:51:10 +0200
Subject: [PATCH 1/1] WIP: Find all line-endings in COPY in chunks.

Refactor CopyReadLine and friends to find all the line-endings in the
buffer in one go, before splitting the lines further.
---
 src/backend/commands/copy.c | 972 ++++++++++++++++++++----------------
 1 file changed, 536 insertions(+), 436 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 36ddcdccdb8..fbf11cb2550 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -95,6 +95,18 @@ typedef enum CopyInsertMethod
 	CIM_MULTI_CONDITIONAL		/* use table_multi_insert only if valid */
 } CopyInsertMethod;
 
+
+/*
+ * Parser state used when splitting raw COPY FROM input into lines.
+ */
+typedef enum ParseLinesState
+{
+	PLSTATE_NORMAL,
+	PLSTATE_ESCAPE,
+	PLSTATE_IN_QUOTE,
+	PLSTATE_ESCAPE_IN_QUOTE
+} ParseLinesState;
+
 /*
  * This struct contains all the state variables used throughout a COPY
  * operation. For simplicity, we use the same struct for all variants of COPY,
@@ -110,6 +122,24 @@ typedef enum CopyInsertMethod
  * it's faster to make useless comparisons to trailing bytes than it is to
  * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true
  * when we have to do it the hard way.
+ *
+ * COPY FROM buffers:
+ *
+ * In COPY FROM processing, there are three levels of buffers:
+ *
+ * raw_buf       - contains raw data read from file/client
+ * converted_buf - contains the data in 'raw_buf', but converted to server encoding
+ * line_buf      - contains "current" line of data, without the end-of-line char
+ *
+ *
+ * In simple cases, no encoding conversion is needed, and converted_buf always
+ * points to raw_buf. If encoding_embeds_ascii is true, encoding conversion is
+ * performed on the raw buffer, before splitting it to lines. converted_buf contains
+ * the converted version in that case.
+ *
+ * Usually, line_buf pointer points in the middle of converted_buf, but when a line
+ * is split by a raw-buffer boundary, the incomplete line is reassembled
+ * in a separate buffer (split_line_buf), and line_buf points to that.
  */
 typedef struct CopyStateData
 {
@@ -205,16 +235,34 @@ typedef struct CopyStateData
 	char	  **raw_fields;
 
 	/*
-	 * Similarly, line_buf holds the whole input line being processed. The
+	 * These variables are used to track state of parsing raw data into
+	 * lines in COPY FROM.
+	 */
+	bool		last_was_cr;
+	ParseLinesState parse_lines_state;
+
+	int			last_line_no; /* last line in 'endlines', -1 if EOF not reached yet */
+
+	int			nextline;
+	int		   *endlines; /* line ending positions within raw_buf */
+	int			numlines;
+
+	/* split_line_buf holds partial line carried over from previous buf */
+	StringInfoData split_line_buf;
+
+	/*
+	 * Similarly, line_buf holds the current input line being processed. The
 	 * input cycle is first to read the whole line into line_buf, convert it
 	 * to server encoding there, and then extract the individual attribute
 	 * fields into attribute_buf.  line_buf is preserved unmodified so that we
 	 * can display it in error messages if appropriate.  (In binary mode,
 	 * line_buf is not used.)
 	 */
-	StringInfoData line_buf;
+	char	   *line_buf;
+	int			line_len;
 	bool		line_buf_converted; /* converted to server encoding? */
 	bool		line_buf_valid; /* contains the row being processed? */
+	bool		line_buf_alloced;
 
 	/*
 	 * Finally, raw_buf holds raw data read from the data source (file or
@@ -230,6 +278,9 @@ typedef struct CopyStateData
 	int			raw_buf_len;	/* total # of bytes stored */
 	/* Shorthand for number of unconsumed bytes available in raw_buf */
 #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
+
+	char	   *converted_buf;
+	int			converted_buf_len;
 } CopyStateData;
 
 /* DestReceiver for COPY (query) TO */
@@ -288,72 +339,6 @@ typedef struct CopyMultiInsertInfo
 	int			ti_options;		/* table insert options */
 } CopyMultiInsertInfo;
 
-
-/*
- * These macros centralize code used to process line_buf and raw_buf buffers.
- * They are macros because they often do continue/break control and to avoid
- * function call overhead in tight COPY loops.
- *
- * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
- * prevent the continue/break processing from working.  We end the "if (1)"
- * with "else ((void) 0)" to ensure the "if" does not unintentionally match
- * any "else" in the calling code, and to avoid any compiler warnings about
- * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
- */
-
-/*
- * This keeps the character read at the top of the loop in the buffer
- * even if there is more than one read-ahead.
- */
-#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
-if (1) \
-{ \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
-	{ \
-		raw_buf_ptr = prev_raw_ptr; /* undo fetch */ \
-		need_data = true; \
-		continue; \
-	} \
-} else ((void) 0)
-
-/* This consumes the remainder of the buffer and breaks */
-#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
-if (1) \
-{ \
-	if (raw_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
-	{ \
-		if (extralen) \
-			raw_buf_ptr = copy_buf_len; /* consume the partial character */ \
-		/* backslash just before EOF, treat as data char */ \
-		result = true; \
-		break; \
-	} \
-} else ((void) 0)
-
-/*
- * Transfer any approved data to line_buf; must do this to be sure
- * there is some room in raw_buf.
- */
-#define REFILL_LINEBUF \
-if (1) \
-{ \
-	if (raw_buf_ptr > cstate->raw_buf_index) \
-	{ \
-		appendBinaryStringInfo(&cstate->line_buf, \
-							 cstate->raw_buf + cstate->raw_buf_index, \
-							   raw_buf_ptr - cstate->raw_buf_index); \
-		cstate->raw_buf_index = raw_buf_ptr; \
-	} \
-} else ((void) 0)
-
-/* Undo any read-ahead and jump out of the block. */
-#define NO_END_OF_COPY_GOTO \
-if (1) \
-{ \
-	raw_buf_ptr = prev_raw_ptr + 1; \
-	goto not_end_of_copy; \
-} else ((void) 0)
-
 static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
 
 
@@ -371,7 +356,8 @@ static uint64 DoCopyTo(CopyState cstate);
 static uint64 CopyTo(CopyState cstate);
 static void CopyOneRowTo(CopyState cstate, TupleTableSlot *slot);
 static bool CopyReadLine(CopyState cstate);
-static bool CopyReadLineText(CopyState cstate);
+static void ParseLinesText(CopyState cstate);
+static void ParseLinesCSV(CopyState cstate);
 static int	CopyReadAttributesText(CopyState cstate);
 static int	CopyReadAttributesCSV(CopyState cstate);
 static Datum CopyReadBinaryAttribute(CopyState cstate, FmgrInfo *flinfo,
@@ -382,7 +368,7 @@ static void CopyAttributeOutCSV(CopyState cstate, char *string,
 								bool use_quote, bool single_attr);
 static List *CopyGetAttnums(TupleDesc tupDesc, Relation rel,
 							List *attnamelist);
-static char *limit_printout_length(const char *str);
+static char *limit_printout_length(const char *str, int slen);
 
 /* Low-level communications functions */
 static void SendCopyBegin(CopyState cstate);
@@ -399,6 +385,7 @@ static bool CopyGetInt32(CopyState cstate, int32 *val);
 static void CopySendInt16(CopyState cstate, int16 val);
 static bool CopyGetInt16(CopyState cstate, int16 *val);
 static bool CopyLoadRawBuf(CopyState cstate);
+static bool CopyLoadAndConvertBuf(CopyState cstate);
 static int	CopyReadBinaryData(CopyState cstate, char *dest, int nbytes);
 
 
@@ -2311,7 +2298,7 @@ CopyFromErrorCallback(void *arg)
 			/* error is relevant to a particular column */
 			char	   *attval;
 
-			attval = limit_printout_length(cstate->cur_attval);
+			attval = limit_printout_length(cstate->cur_attval, strlen(cstate->cur_attval));
 			errcontext("COPY %s, line %s, column %s: \"%s\"",
 					   cstate->cur_relname, curlineno_str,
 					   cstate->cur_attname, attval);
@@ -2341,7 +2328,7 @@ CopyFromErrorCallback(void *arg)
 			{
 				char	   *lineval;
 
-				lineval = limit_printout_length(cstate->line_buf.data);
+				lineval = limit_printout_length(cstate->line_buf, cstate->line_len);
 				errcontext("COPY %s, line %s: \"%s\"",
 						   cstate->cur_relname, curlineno_str, lineval);
 				pfree(lineval);
@@ -2361,11 +2348,10 @@ CopyFromErrorCallback(void *arg)
  * Returns a pstrdup'd copy of the input.
  */
 static char *
-limit_printout_length(const char *str)
+limit_printout_length(const char *str, int slen)
 {
 #define MAX_COPY_DATA_DISPLAY 100
 
-	int			slen = strlen(str);
 	int			len;
 	char	   *res;
 
@@ -2819,7 +2805,6 @@ CopyFrom(CopyState cstate)
 			ereport(ERROR,
 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					 errmsg("cannot perform COPY FREEZE because the table was not created or truncated in the current subtransaction")));
-
 		ti_options |= TABLE_INSERT_FROZEN;
 	}
 
@@ -3224,7 +3209,7 @@ CopyFrom(CopyState cstate)
 					/* Add this tuple to the tuple buffer */
 					CopyMultiInsertInfoStore(&multiInsertInfo,
 											 resultRelInfo, myslot,
-											 cstate->line_buf.len,
+											 100, // FIXME cstate->line_buf.len,
 											 cstate->cur_lineno);
 
 					/*
@@ -3387,16 +3372,30 @@ BeginCopyFrom(ParseState *pstate,
 
 	/*
 	 * Set up variables to avoid per-attribute overhead.  attribute_buf and
-	 * raw_buf are used in both text and binary modes, but we use line_buf
-	 * only in text mode.
+	 * raw_buf are used in both text and binary modes, but text mode has
+	 * some extra state.
 	 */
 	initStringInfo(&cstate->attribute_buf);
 	cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
 	cstate->raw_buf_index = cstate->raw_buf_len = 0;
 	if (!cstate->binary)
 	{
-		initStringInfo(&cstate->line_buf);
+		cstate->last_was_cr = false;
+		cstate->parse_lines_state = PLSTATE_NORMAL;
+		cstate->last_line_no = -1;
+		cstate->nextline = 0;
+		cstate->endlines = palloc((RAW_BUF_SIZE + 1) * sizeof(int));
+		cstate->numlines = 0;
+
+		initStringInfo(&cstate->split_line_buf);
+
+		cstate->line_buf = NULL;
+		cstate->line_len = 0;
 		cstate->line_buf_converted = false;
+		cstate->line_buf_valid = false;
+		cstate->line_buf_alloced = false;
+
+		cstate->converted_buf = NULL;
 	}
 
 	/* Assign range table, we'll need it in CopyFrom. */
@@ -3634,7 +3633,7 @@ NextCopyFromRawFields(CopyState cstate, char ***fields, int *nfields)
 	 * characters, we act as though it was newline followed by EOF, ie,
 	 * process the line and then exit loop on next iteration.
 	 */
-	if (done && cstate->line_buf.len == 0)
+	if (done && cstate->line_len == 0)
 		return false;
 
 	/* Parse the line into de-escaped field values */
@@ -3863,451 +3862,550 @@ EndCopyFrom(CopyState cstate)
 static bool
 CopyReadLine(CopyState cstate)
 {
-	bool		result;
-
-	resetStringInfo(&cstate->line_buf);
-	cstate->line_buf_valid = true;
+	resetStringInfo(&cstate->split_line_buf);
 
 	/* Mark that encoding conversion hasn't occurred yet */
 	cstate->line_buf_converted = false;
+	cstate->line_buf_valid = false;
+
+	if (cstate->line_buf_alloced)
+		pfree(cstate->line_buf);
 
-	/* Parse data and transfer into line_buf */
-	result = CopyReadLineText(cstate);
+	if (cstate->split_line_buf.data > 0)
+		resetStringInfo(&cstate->split_line_buf);
 
-	if (result)
+	if (cstate->last_line_no != -1 && cstate->nextline > cstate->last_line_no)
+		return true;
+
+	/*
+	 * If we processed all lines from previous batch, load more
+	 */
+	if (cstate->nextline == cstate->numlines)
 	{
-		/*
-		 * Reached EOF.  In protocol version 3, we should ignore anything
-		 * after \. up to the protocol end of copy data.  (XXX maybe better
-		 * not to treat \. as special?)
-		 */
-		if (cstate->copy_dest == COPY_NEW_FE)
+		for (;;)
 		{
-			do
+			int			endpos;
+			bool		done;
+
+			cstate->nextline = 0;
+
+			/*
+			 * Transfer any remaining data from previous buffer to split_line_buf.
+			 */
+			if (cstate->numlines == 0)
+			{
+				/* this chunk contained no line-ends at all. */
+				endpos = 0;
+			}
+			else
+			{
+				endpos = cstate->endlines[cstate->numlines - 1];
+			}
+			appendBinaryStringInfo(&cstate->split_line_buf, cstate->raw_buf + endpos,
+								   cstate->raw_buf_len - endpos);
+
+			/* Get next raw (and possibly converted) buf */
+			done = !CopyLoadAndConvertBuf(cstate);
+
+			/* Detect line boundaries within the buffer */
+			if (cstate->csv_mode)
+				ParseLinesCSV(cstate);
+			else
+				ParseLinesText(cstate);
+
+			/*
+			 * If we reached the EOF, remember it, and add a sentinel end-of-line to
+			 * 'endlines' so that the logic below doesn't need to special case the
+			 * last line.
+			 */
+			if (done)
 			{
-				cstate->raw_buf_index = cstate->raw_buf_len;
-			} while (CopyLoadRawBuf(cstate));
+				cstate->last_line_no = cstate->numlines;
+				cstate->endlines[cstate->numlines] = cstate->converted_buf_len;
+				cstate->numlines++;
+				break;
+			}
+			else
+				cstate->last_line_no = -1;
+
+			if (cstate->numlines > 0)
+				break;
 		}
 	}
+
+	Assert(cstate->nextline < cstate->numlines);
+
+	/*
+	 * The first line in this buffer could be a continuation of a split line that
+	 * started on previous buffer. Treat it specially.
+	 */
+	if (cstate->nextline == 0)
+	{
+		if (cstate->split_line_buf.len > 0)
+		{
+			appendBinaryStringInfo(&cstate->split_line_buf, cstate->converted_buf,
+								   cstate->endlines[0]);
+			cstate->line_buf = cstate->split_line_buf.data;
+			cstate->line_len = cstate->split_line_buf.len;
+		}
+		else
+		{
+			cstate->line_buf = cstate->converted_buf;
+			cstate->line_len = cstate->endlines[0];
+		}
+	}
+	else
+	{
+		int startpos;
+		int endpos;
+
+		startpos = cstate->endlines[cstate->nextline - 1];
+		endpos = cstate->endlines[cstate->nextline];
+
+		cstate->line_buf = cstate->converted_buf + startpos;
+		cstate->line_len = endpos - startpos;
+	}
+
+	if (cstate->nextline == cstate->last_line_no)
+	{
+		/*
+		 * EOF at start of line means we're done.  If we see EOF after some
+		 * characters, we act as though it was newline followed by EOF, ie,
+		 * process the line and then exit loop on next iteration.
+		 */
+		if (cstate->line_len == 0)
+			return true;
+	}
 	else
 	{
 		/*
 		 * If we didn't hit EOF, then we must have transferred the EOL marker
 		 * to line_buf along with the data.  Get rid of it.
 		 */
-		switch (cstate->eol_type)
+		if (cstate->nextline != cstate->last_line_no)
 		{
-			case EOL_NL:
-				Assert(cstate->line_buf.len >= 1);
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
-				cstate->line_buf.len--;
-				cstate->line_buf.data[cstate->line_buf.len] = '\0';
-				break;
-			case EOL_CR:
-				Assert(cstate->line_buf.len >= 1);
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
-				cstate->line_buf.len--;
-				cstate->line_buf.data[cstate->line_buf.len] = '\0';
-				break;
-			case EOL_CRNL:
-				Assert(cstate->line_buf.len >= 2);
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
-				Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
-				cstate->line_buf.len -= 2;
-				cstate->line_buf.data[cstate->line_buf.len] = '\0';
-				break;
-			case EOL_UNKNOWN:
-				/* shouldn't get here */
-				Assert(false);
-				break;
+			switch (cstate->eol_type)
+			{
+				case EOL_NL:
+					Assert(cstate->line_len >= 1);
+					Assert(cstate->line_buf[cstate->line_len - 1] == '\n');
+					cstate->line_len--;
+					cstate->line_buf[cstate->line_len] = '\0';
+					break;
+				case EOL_CR:
+					Assert(cstate->line_len >= 1);
+					Assert(cstate->line_buf[cstate->line_len - 1] == '\r');
+					cstate->line_len--;
+					cstate->line_buf[cstate->line_len] = '\0';
+					break;
+				case EOL_CRNL:
+					Assert(cstate->line_len >= 2);
+					Assert(cstate->line_buf[cstate->line_len - 2] == '\r');
+					Assert(cstate->line_buf[cstate->line_len - 1] == '\n');
+					cstate->line_len -= 2;
+					cstate->line_buf[cstate->line_len] = '\0';
+					break;
+				case EOL_UNKNOWN:
+					/* shouldn't get here */
+					Assert(false);
+					break;
+			}
 		}
 	}
+	cstate->nextline++;
 
-	/* Done reading the line.  Convert it to server encoding. */
-	if (cstate->need_transcoding)
+	cstate->line_buf_valid = true;
+	cstate->line_buf_alloced = false;
+
+	/*
+	 * Done reading the line.  Convert it to server encoding. If the encoding was
+	 * one that embeds ASCII, we did it for the whole raw buffer already
+	 */
+	if (cstate->need_transcoding && !cstate->encoding_embeds_ascii)
 	{
 		char	   *cvt;
 
-		cvt = pg_any_to_server(cstate->line_buf.data,
-							   cstate->line_buf.len,
+		cvt = pg_any_to_server(cstate->line_buf, cstate->line_len,
 							   cstate->file_encoding);
-		if (cvt != cstate->line_buf.data)
+		if (cvt != cstate->line_buf)
 		{
 			/* transfer converted data back to line_buf */
-			resetStringInfo(&cstate->line_buf);
-			appendBinaryStringInfo(&cstate->line_buf, cvt, strlen(cvt));
-			pfree(cvt);
+			cstate->line_buf = cvt;
+			cstate->line_len = strlen(cvt);
+			cstate->line_buf_alloced = true;
 		}
 	}
 
 	/* Now it's safe to use the buffer in error messages */
 	cstate->line_buf_converted = true;
 
-	return result;
+	return false;
 }
 
-/*
- * CopyReadLineText - inner loop of CopyReadLine for text mode
- */
 static bool
-CopyReadLineText(CopyState cstate)
+CopyLoadAndConvertBuf(CopyState cstate)
 {
-	char	   *copy_raw_buf;
-	int			raw_buf_ptr;
-	int			copy_buf_len;
-	bool		need_data = false;
-	bool		hit_eof = false;
-	bool		result = false;
-	char		mblen_str[2];
-
-	/* CSV variables */
-	bool		first_char_in_line = true;
-	bool		in_quote = false,
-				last_was_esc = false;
-	char		quotec = '\0';
-	char		escapec = '\0';
+	bool		moredata;
 
-	if (cstate->csv_mode)
+	/* Get next raw buf */
+	moredata = CopyLoadRawBuf(cstate);
+
+	/* convert if necessary */
+	if (cstate->encoding_embeds_ascii)
 	{
-		quotec = cstate->quote[0];
-		escapec = cstate->escape[0];
-		/* ignore special escape processing if it's the same as quotec */
-		if (quotec == escapec)
-			escapec = '\0';
+		Assert(cstate->need_transcoding);
+				
+		if (cstate->converted_buf && cstate->converted_buf != cstate->raw_buf)
+			pfree(cstate->converted_buf);
+
+		while (moredata && cstate->raw_buf_len < MAX_CONVERSION_GROWTH)
+			moredata = CopyLoadRawBuf(cstate);
+
+		if (!moredata)
+		{
+			cstate->raw_buf_index = cstate->raw_buf_len;
+		}
+		else
+		{
+			/* Find length */
+			char	   *p;
+			char	   *pend;
+
+			p = cstate->raw_buf;
+			pend = cstate->raw_buf + cstate->raw_buf_len;
+			while (p < pend - MAX_CONVERSION_GROWTH)
+			{
+				if (IS_HIGHBIT_SET(*p))
+				{
+					int			mblen;
+
+					mblen = pg_encoding_mblen(cstate->file_encoding, p);
+					p += mblen;
+				}
+				else
+					p++;
+			}
+			cstate->raw_buf_index = pend - p;
+		}
+		cstate->converted_buf = pg_any_to_server(cstate->raw_buf,
+												 cstate->raw_buf_index,
+												 cstate->file_encoding);
+		if (cstate->converted_buf != cstate->raw_buf)
+			cstate->converted_buf_len = strlen(cstate->converted_buf);
+		else
+			cstate->converted_buf_len = cstate->raw_buf_index;
+	}
+	else
+	{
+		cstate->converted_buf = cstate->raw_buf;
+		cstate->converted_buf_len = cstate->raw_buf_len;
+		cstate->raw_buf_index = cstate->raw_buf_len;
 	}
 
-	mblen_str[1] = '\0';
+	return moredata;
+}
+
+/*
+ * Find all newlines (or CRs or CRNLs) in the buffer in cstate->converted_buf.
+ *
+ * The positions of the newlines are stored in cstate->endlines array.
+ * Each position points to the *next* character, after the newline.
+ *
+ * A position can also be 0, meaning that there was a newline immediately
+ * before the current buffer. That case can currently only arise when
+ * processing the first line in EOL_UNKNOWN mode, and we see a CR at the
+ * end of a buffer. In that case, we won't know until we see the first
+ * character of the next buffer, that the CR at the end of the previous
+ * buffer was really the end-of-line.
+ */
+static void
+ParseLinesText(CopyState cstate)
+{
+	/* pre-requisites: there is data in converted_buf */
+	char	   *startp;
+	char	   *p;
+	char	   *endp;
+	int		   *endlines;
+	int			nlines;
 
 	/*
-	 * The objective of this loop is to transfer the entire next input line
-	 * into line_buf.  Hence, we only care for detecting newlines (\r and/or
-	 * \n) and the end-of-copy marker (\.).
-	 *
-	 * In CSV mode, \r and \n inside a quoted field are just part of the data
-	 * value and are put in line_buf.  We keep just enough state to know if we
-	 * are currently in a quoted field or not.
-	 *
-	 * These four characters, and the CSV escape and quote characters, are
-	 * assumed the same in frontend and backend encodings.
+	 * TODO: support multibyte encodings. Plan:
 	 *
-	 * For speed, we try to move data from raw_buf to line_buf in chunks
-	 * rather than one character at a time.  raw_buf_ptr points to the next
-	 * character to examine; any characters from raw_buf_index to raw_buf_ptr
-	 * have been determined to be part of the line, but not yet transferred to
-	 * line_buf.
+	 * If encoding_embeds_ascii, the caller converts the raw buffer
+	 * before calling this function, scanning through the buffer with
+	 * pg_mblen() to find the multibyte character boundary. Stash any
+	 * remaining bytes for next call.
 	 *
-	 * For a little extra speed within the loop, we copy raw_buf and
-	 * raw_buf_len into local variables.
+	 * Otherwise, the conversion can be done separately on each line, after
+	 * calling this function.
 	 */
-	copy_raw_buf = cstate->raw_buf;
-	raw_buf_ptr = cstate->raw_buf_index;
-	copy_buf_len = cstate->raw_buf_len;
 
-	for (;;)
-	{
-		int			prev_raw_ptr;
-		char		c;
+	p = cstate->converted_buf;
+	startp = cstate->converted_buf;
+	endp = cstate->converted_buf + cstate->converted_buf_len;
 
-		/*
-		 * Load more data if needed.  Ideally we would just force four bytes
-		 * of read-ahead and avoid the many calls to
-		 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(), but the COPY_OLD_FE protocol
-		 * does not allow us to read too far ahead or we might read into the
-		 * next data, so we read-ahead only as far we know we can.  One
-		 * optimization would be to read-ahead four byte here if
-		 * cstate->copy_dest != COPY_OLD_FE, but it hardly seems worth it,
-		 * considering the size of the buffer.
-		 */
-		if (raw_buf_ptr >= copy_buf_len || need_data)
-		{
-			REFILL_LINEBUF;
+	endlines = cstate->endlines;
+	nlines = 0;
 
-			/*
-			 * Try to read some more data.  This will certainly reset
-			 * raw_buf_index to zero, and raw_buf_ptr must go with it.
-			 */
-			if (!CopyLoadRawBuf(cstate))
-				hit_eof = true;
-			raw_buf_ptr = 0;
-			copy_buf_len = cstate->raw_buf_len;
+	if (cstate->eol_type == EOL_UNKNOWN)
+	{
+		while (p < endp)
+		{
+			char		c = *(p++);
 
-			/*
-			 * If we are completely out of data, break out of the loop,
-			 * reporting EOF.
-			 */
-			if (copy_buf_len <= 0)
+			if (c == '\n')
+			{
+				if (cstate->last_was_cr)
+					cstate->eol_type = EOL_CRNL;
+				else
+					cstate->eol_type = EOL_NL;
+				endlines[nlines++] = p - startp;
+				break;
+			}
+			else if (cstate->last_was_cr)
 			{
-				result = true;
+				/*
+				 * The previous character was \r, and this character is the first
+				 * character of the next line. The line ended just *before* this
+				 * character.
+				 */
+				endlines[nlines++] = (p - 1) - startp;
+				cstate->eol_type = EOL_CR;
+				cstate->last_was_cr = false; /* not used in EOL_CR mode */
 				break;
 			}
-			need_data = false;
+			else if (c == '\r')
+			{
+				cstate->last_was_cr = true;
+			}
 		}
+		/* continue processing according to the new 'eol_type' */
+	}
 
-		/* OK to fetch a character */
-		prev_raw_ptr = raw_buf_ptr;
-		c = copy_raw_buf[raw_buf_ptr++];
-
-		if (cstate->csv_mode)
+	if (cstate->eol_type == EOL_NL)
+	{
+		while (p < endp)
 		{
-			/*
-			 * If character is '\\' or '\r', we may need to look ahead below.
-			 * Force fetch of the next character if we don't already have it.
-			 * We need to do this before changing CSV state, in case one of
-			 * these characters is also the quote or escape character.
-			 *
-			 * Note: old-protocol does not like forced prefetch, but it's OK
-			 * here since we cannot validly be at EOF.
-			 */
-			if (c == '\\' || c == '\r')
+			char		c = *(p++);
+
+			/* Process \n */
+			if (c == '\n')
 			{
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
+				endlines[nlines++] = p - startp;
 			}
 
-			/*
-			 * Dealing with quotes and escapes here is mildly tricky. If the
-			 * quote char is also the escape char, there's no problem - we
-			 * just use the char as a toggle. If they are different, we need
-			 * to ensure that we only take account of an escape inside a
-			 * quoted field and immediately preceding a quote char, and not
-			 * the second in an escape-escape sequence.
-			 */
-			if (in_quote && c == escapec)
-				last_was_esc = !last_was_esc;
-			if (c == quotec && !last_was_esc)
-				in_quote = !in_quote;
-			if (c != escapec)
-				last_was_esc = false;
-
-			/*
-			 * Updating the line count for embedded CR and/or LF chars is
-			 * necessarily a little fragile - this test is probably about the
-			 * best we can do.  (XXX it's arguable whether we should do this
-			 * at all --- is cur_lineno a physical or logical count?)
-			 */
-			if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
-				cstate->cur_lineno++;
+			/* Process \r */
+			if (c == '\r')
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						 errmsg("literal carriage return found in data"),
+						 errhint("Use \"\\r\" to represent carriage return.")));
 		}
-
-		/* Process \r */
-		if (c == '\r' && (!cstate->csv_mode || !in_quote))
+	}
+	else if (cstate->eol_type == EOL_CR)
+	{
+		while (p < endp)
 		{
-			/* Check for \r\n on first line, _and_ handle \r\n. */
-			if (cstate->eol_type == EOL_UNKNOWN ||
-				cstate->eol_type == EOL_CRNL)
-			{
-				/*
-				 * If need more data, go back to loop top to load it.
-				 *
-				 * Note that if we are at EOF, c will wind up as '\0' because
-				 * of the guaranteed pad of raw_buf.
-				 */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
+			char		c = *(p++);
 
-				/* get next char */
-				c = copy_raw_buf[raw_buf_ptr];
+			/* Process \r */
+			if (c == '\r')
+				endlines[nlines++] = p - startp;
 
-				if (c == '\n')
+			/* Process \n */
+			if (c == '\r')
+				ereport(ERROR,
+						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+						 errmsg("literal newline found in data"),
+						 errhint("Use \"\\n\" to represent carriage return.")));
+		}
+	}
+	else if (cstate->eol_type == EOL_CRNL)
+	{
+		while (p < endp)
+		{
+			char		c = *(p++);
+
+			if (c == '\n')
+			{
+				if (cstate->last_was_cr)
 				{
-					raw_buf_ptr++;	/* eat newline */
-					cstate->eol_type = EOL_CRNL;	/* in case not set yet */
+					endlines[nlines++] = p - startp;
+					cstate->last_was_cr = false;
 				}
 				else
-				{
-					/* found \r, but no \n */
-					if (cstate->eol_type == EOL_CRNL)
-						ereport(ERROR,
-								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-								 !cstate->csv_mode ?
-								 errmsg("literal carriage return found in data") :
-								 errmsg("unquoted carriage return found in data"),
-								 !cstate->csv_mode ?
-								 errhint("Use \"\\r\" to represent carriage return.") :
-								 errhint("Use quoted CSV field to represent carriage return.")));
-
-					/*
-					 * if we got here, it is the first line and we didn't find
-					 * \n, so don't consume the peeked character
-					 */
-					cstate->eol_type = EOL_CR;
-				}
+					ereport(ERROR,
+							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+							 errmsg("literal newline found in data"),
+							 errhint("Use \"\\n\" to represent carriage return.")));
 			}
-			else if (cstate->eol_type == EOL_NL)
+			else if (cstate->last_was_cr)
+			{
 				ereport(ERROR,
 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-						 !cstate->csv_mode ?
-						 errmsg("literal carriage return found in data") :
-						 errmsg("unquoted carriage return found in data"),
-						 !cstate->csv_mode ?
-						 errhint("Use \"\\r\" to represent carriage return.") :
-						 errhint("Use quoted CSV field to represent carriage return.")));
-			/* If reach here, we have found the line terminator */
-			break;
+						 errmsg("literal carriage return found in data"),
+						 errhint("Use \"\\r\" to represent carriage return.")));
+			}
+			else if (c == '\r')
+			{
+				cstate->last_was_cr = true;
+			}
 		}
+	}
 
-		/* Process \n */
-		if (c == '\n' && (!cstate->csv_mode || !in_quote))
-		{
-			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
-				ereport(ERROR,
-						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-						 !cstate->csv_mode ?
-						 errmsg("literal newline found in data") :
-						 errmsg("unquoted newline found in data"),
-						 !cstate->csv_mode ?
-						 errhint("Use \"\\n\" to represent newline.") :
-						 errhint("Use quoted CSV field to represent newline.")));
-			cstate->eol_type = EOL_NL;	/* in case not set yet */
-			/* If reach here, we have found the line terminator */
-			break;
-		}
+	cstate->numlines = nlines;
+}
 
-		/*
-		 * In CSV mode, we only recognize \. alone on a line.  This is because
-		 * \. is a valid CSV data value.
-		 */
-		if (c == '\\' && (!cstate->csv_mode || first_char_in_line))
-		{
-			char		c2;
+/*
+ * Like ParseLinesText, but in CSV mode.  Scans all of converted_buf (which
+ * must hold data) and stores in endlines[] the offset just past each line
+ * terminator it recognizes.  Quote/escape state and a CR that is still
+ * ambiguous at the end of the buffer persist in cstate for the next call.
+ */
+static void
+ParseLinesCSV(CopyState cstate)
+{
+	char	   *startp;
+	char	   *p;
+	char	   *endp;
+	int		   *endlines;
+	int			nlines = 0;
+	int			state = cstate->parse_lines_state;
+	char		quotec = cstate->quote[0];
+	char		escapec = cstate->escape[0];
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-			IF_NEED_REFILL_AND_EOF_BREAK(0);
+	/* ignore special escape processing if it's the same as quotec */
+	if (quotec == escapec)
+		escapec = '\0';
 
-			/* -----
-			 * get next character
-			 * Note: we do not change c so if it isn't \., we can fall
-			 * through and continue processing for file encoding.
-			 * -----
-			 */
-			c2 = copy_raw_buf[raw_buf_ptr];
+	p = cstate->converted_buf;
+	startp = cstate->converted_buf;
+	endp = cstate->converted_buf + cstate->converted_buf_len;
 
-			if (c2 == '.')
-			{
-				raw_buf_ptr++;	/* consume the '.' */
+	/* NOTE(review): nlines is never bounds-checked; confirm endlines[] size */
+	endlines = cstate->endlines;
 
-				/*
-				 * Note: if we loop back for more data here, it does not
-				 * matter that the CSV state change checks are re-executed; we
-				 * will come back here with no important state changed.
-				 */
-				if (cstate->eol_type == EOL_CRNL)
-				{
-					/* Get the next character */
-					IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-					/* if hit_eof, c2 will become '\0' */
-					c2 = copy_raw_buf[raw_buf_ptr++];
+	while (p < endp)
+	{
+		char		c = *(p++);
+		bool		last_was_cr = cstate->last_was_cr;
+
+		cstate->last_was_cr = false;	/* re-armed below if c is a pending CR */
 
-					if (c2 == '\n')
+		switch (state)
+		{
+			case PLSTATE_NORMAL:
+				if (c == '\n')
+				{
+					if (cstate->eol_type == EOL_NL)
+						endlines[nlines++] = p - startp;
+					else if (cstate->eol_type == EOL_CR)
+						ereport(ERROR,
+								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+								 errmsg("unquoted newline found in data"),
+								 errhint("Use quoted CSV field to represent newline.")));
+					else if (cstate->eol_type == EOL_CRNL)
 					{
-						if (!cstate->csv_mode)
+						if (last_was_cr)
+							endlines[nlines++] = p - startp;
+						else
 							ereport(ERROR,
 									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-									 errmsg("end-of-copy marker does not match previous newline style")));
+									 errmsg("unquoted newline found in data"),
+									 errhint("Use quoted CSV field to represent newline.")));
+					}
+					else if (cstate->eol_type == EOL_UNKNOWN)
+					{
+						if (last_was_cr)
+							cstate->eol_type = EOL_CRNL;
 						else
-							NO_END_OF_COPY_GOTO;
+							cstate->eol_type = EOL_NL;
+						endlines[nlines++] = p - startp;
 					}
-					else if (c2 != '\r')
+				}
+				else if (c == '\r')
+				{
+					if (cstate->eol_type == EOL_NL)
+						ereport(ERROR,
+								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+								 errmsg("unquoted carriage return found in data"),
+								 errhint("Use quoted CSV field to represent carriage return.")));
+					else if (cstate->eol_type == EOL_CR)
+						endlines[nlines++] = p - startp;
+					else if (cstate->eol_type == EOL_CRNL)
 					{
-						if (!cstate->csv_mode)
+						if (last_was_cr)
 							ereport(ERROR,
 									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-									 errmsg("end-of-copy marker corrupt")));
+									 errmsg("unquoted carriage return found in data"),
+									 errhint("Use quoted CSV field to represent carriage return.")));
+						cstate->last_was_cr = true;
+					}
+					else if (cstate->eol_type == EOL_UNKNOWN)
+					{
+						if (last_was_cr)
+						{
+							/* oops, the previous char was actually a line boundary already */
+							cstate->eol_type = EOL_CR;
+							endlines[nlines++] = (p - 1) - startp;
+							endlines[nlines++] = p - startp;
+						}
 						else
-							NO_END_OF_COPY_GOTO;
+							cstate->last_was_cr = true;
 					}
 				}
-
-				/* Get the next character */
-				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
-				/* if hit_eof, c2 will become '\0' */
-				c2 = copy_raw_buf[raw_buf_ptr++];
-
-				if (c2 != '\r' && c2 != '\n')
+				else if (last_was_cr)
 				{
-					if (!cstate->csv_mode)
+					if (cstate->eol_type == EOL_CRNL)
 						ereport(ERROR,
 								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-								 errmsg("end-of-copy marker corrupt")));
+								 errmsg("unquoted carriage return found in data"),
+								 errhint("Use quoted CSV field to represent carriage return.")));
 					else
-						NO_END_OF_COPY_GOTO;
+					{
+						Assert(cstate->eol_type == EOL_UNKNOWN);
+						cstate->eol_type = EOL_CR;
+						endlines[nlines++] = (p - 1) - startp;	/* line ended before c */
+						p--;	/* re-read c: it may open a quote or an escape */
+					}
 				}
+				else if (c == escapec)
+					state = PLSTATE_ESCAPE;	/* NOTE(review): hides next byte even outside quotes; intended? */
+				else if (c == quotec)
+					state = PLSTATE_IN_QUOTE;
+				break;
 
-				if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
-					(cstate->eol_type == EOL_CRNL && c2 != '\n') ||
-					(cstate->eol_type == EOL_CR && c2 != '\r'))
+			case PLSTATE_ESCAPE:
+				if (quotec == escapec && c != quotec)
 				{
-					ereport(ERROR,
-							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-							 errmsg("end-of-copy marker does not match previous newline style")));
+					/* the escape was actually a quote */
+					state = PLSTATE_IN_QUOTE;
 				}
-
-				/*
-				 * Transfer only the data before the \. into line_buf, then
-				 * discard the data and the \. sequence.
-				 */
-				if (prev_raw_ptr > cstate->raw_buf_index)
-					appendBinaryStringInfo(&cstate->line_buf,
-										   cstate->raw_buf + cstate->raw_buf_index,
-										   prev_raw_ptr - cstate->raw_buf_index);
-				cstate->raw_buf_index = raw_buf_ptr;
-				result = true;	/* report EOF */
+				else
+					state = PLSTATE_NORMAL;
 				break;
-			}
-			else if (!cstate->csv_mode)
 
-				/*
-				 * If we are here, it means we found a backslash followed by
-				 * something other than a period.  In non-CSV mode, anything
-				 * after a backslash is special, so we skip over that second
-				 * character too.  If we didn't do that \\. would be
-				 * considered an eof-of copy, while in non-CSV mode it is a
-				 * literal backslash followed by a period.  In CSV mode,
-				 * backslashes are not special, so we want to process the
-				 * character after the backslash just like a normal character,
-				 * so we don't increment in those cases.
-				 */
-				raw_buf_ptr++;
-		}
-
-		/*
-		 * This label is for CSV cases where \. appears at the start of a
-		 * line, but there is more text after it, meaning it was a data value.
-		 * We are more strict for \. in CSV mode because \. could be a data
-		 * value, while in non-CSV mode, \. cannot be a data value.
-		 */
-not_end_of_copy:
-
-		/*
-		 * Process all bytes of a multi-byte character as a group.
-		 *
-		 * We only support multi-byte sequences where the first byte has the
-		 * high-bit set, so as an optimization we can avoid this block
-		 * entirely if it is not set.
-		 */
-		if (cstate->encoding_embeds_ascii && IS_HIGHBIT_SET(c))
-		{
-			int			mblen;
-
-			/*
-			 * It is enough to look at the first byte in all our encodings, to
-			 * get the length.  (GB18030 is a bit special, but still works for
-			 * our purposes; see comment in pg_gb18030_mblen())
-			 */
-			mblen_str[0] = c;
-			mblen = pg_encoding_mblen(cstate->file_encoding, mblen_str);
+			case PLSTATE_IN_QUOTE:
+				if (c == escapec)
+					state = PLSTATE_ESCAPE_IN_QUOTE;
+				else if (c == quotec)
+					state = PLSTATE_NORMAL;
+				break;
 
-			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(mblen - 1);
-			IF_NEED_REFILL_AND_EOF_BREAK(mblen - 1);
-			raw_buf_ptr += mblen - 1;
+			case PLSTATE_ESCAPE_IN_QUOTE:
+				if (quotec == escapec && c != quotec)
+				{
+					/* the escape was the end quote; dead while escapec is zeroed above */
+					state = PLSTATE_NORMAL;
+					p--;		/* un-consume, so the byte is re-read as normal */
+					continue;
+				}
+				state = PLSTATE_IN_QUOTE;
+				break;
 		}
-		first_char_in_line = false;
-	}							/* end of outer loop */
-
-	/*
-	 * Transfer any still-uncopied data to line_buf.
-	 */
-	REFILL_LINEBUF;
-
-	return result;
+	}
+	cstate->numlines = nlines;
+	cstate->parse_lines_state = state;
 }
 
 /*
@@ -4344,6 +4442,8 @@ GetDecimalFromHex(char hex)
 static int
 CopyReadAttributesText(CopyState cstate)
 {
+	char	   *line_buf = cstate->line_buf;
+	int			len = cstate->line_len;
 	char		delimc = cstate->delim[0];
 	int			fieldno;
 	char	   *output_ptr;
@@ -4356,7 +4456,7 @@ CopyReadAttributesText(CopyState cstate)
 	 */
 	if (cstate->max_fields <= 0)
 	{
-		if (cstate->line_buf.len != 0)
+		if (len != 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 					 errmsg("extra data after last expected column")));
@@ -4372,13 +4472,13 @@ CopyReadAttributesText(CopyState cstate)
 	 * it this way because enlarging attribute_buf mid-stream would invalidate
 	 * pointers already stored into cstate->raw_fields[].
 	 */
-	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
-		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
+	if (cstate->attribute_buf.maxlen <= len)
+		enlargeStringInfo(&cstate->attribute_buf, len);
 	output_ptr = cstate->attribute_buf.data;
 
 	/* set pointer variables for loop */
-	cur_ptr = cstate->line_buf.data;
-	line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
+	cur_ptr = line_buf;
+	line_end_ptr = line_buf + len;
 
 	/* Outer loop iterates over fields */
 	fieldno = 0;
@@ -4586,7 +4686,7 @@ CopyReadAttributesCSV(CopyState cstate)
 	 */
 	if (cstate->max_fields <= 0)
 	{
-		if (cstate->line_buf.len != 0)
+		if (cstate->line_len != 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
 					 errmsg("extra data after last expected column")));
@@ -4602,13 +4702,13 @@ CopyReadAttributesCSV(CopyState cstate)
 	 * it this way because enlarging attribute_buf mid-stream would invalidate
 	 * pointers already stored into cstate->raw_fields[].
 	 */
-	if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
-		enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
+	if (cstate->attribute_buf.maxlen <= cstate->line_len)
+		enlargeStringInfo(&cstate->attribute_buf, cstate->line_len);
 	output_ptr = cstate->attribute_buf.data;
 
 	/* set pointer variables for loop */
-	cur_ptr = cstate->line_buf.data;
-	line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
+	cur_ptr = cstate->line_buf;
+	line_end_ptr = cstate->line_buf + cstate->line_len;
 
 	/* Outer loop iterates over fields */
 	fieldno = 0;
-- 
2.20.1

Re: Parallel copy

Reply via email to