[PATCHES] fix CSV multiline parsing - proof of concept

Andrew Dunstan Sun, 06 Feb 2005 08:24:35 -0800

Attached is a proof-of-concept patch (i.e. not intended for application just yet) to fix the problem of parsing CSV multiline fields.

Originally I indicated that the way to solve this IMHO was to the combine reading and parsing phases of COPY for CSV. However, there's a lot going on there and I adopted a somewhat less invasive approach, which detects if a CR and/orNL should be part of a data value and if so treats it as just another character. Also, it removes the escaping nature of backslash for NL and CR in CSV, which is clearly a bug.

One thing I noticed is that (unless I misread the code) our standard detection of the end marker \.<EOL> doesn't seem to require that it be at the beginning of a line, as the docs say it should. I didn't change that but did build a test for it into the special CSV code.

comments welcome.

cheers

andrew

*** copy.c.orig	Wed May 26 00:41:10 2004
--- copy.c	Sun Feb  6 10:36:29 2005
***************
*** 137,142 ****
--- 137,143 ----
  		 char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
  		 List *force_notnull_atts);
  static bool CopyReadLine(void);
+ static bool CopyReadLineCSV(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
  							   CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1683,1689 ****
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = CopyReadLine();
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF
--- 1684,1690 ----
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = csv_mode ? CopyReadLineCSV(quote, escape) : CopyReadLine();
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF
***************
*** 2159,2164 ****
--- 2160,2398 ----
  	return result;
  }
  
+ /*
+  * Read a line for CSV copy mode. Differences from standard mode:
+  * . CR an NL are not special inside quoted fields - they just get added
+  *   to the buffer.
+  * . \ is not magical except as the start of the end of data marker.
+  *
+  */
+ 
+ static bool
+ CopyReadLineCSV(char * quote, char * escape)
+ {
+ 	bool		result;
+ 	bool		change_encoding = (client_encoding != server_encoding);
+ 	int			c;
+ 	int			mblen;
+ 	int			j;
+ 	unsigned char s[2];
+ 	char	   *cvt;
+ 	bool        in_quote = false;
+ 	char        quotec = quote[0];
+ 	char        escapec = escape[0];
+ 
+ 	s[1] = 0;
+ 
+ 	/* reset line_buf to empty */
+ 	line_buf.len = 0;
+ 	line_buf.data[0] = '\0';
+ 	line_buf.cursor = 0;
+ 
+ 	/* mark that encoding conversion hasn't occurred yet */
+ 	line_buf_converted = false;
+ 
+ 	/* set default status */
+ 	result = false;
+ 
+ 	/*
+ 	 * In this loop we only care for detecting newlines (\r and/or \n)
+ 	 * and the end-of-copy marker (\.).  These four
+ 	 * characters, and only these four, are assumed the same in frontend
+ 	 * and backend encodings.  We do not assume that second and later bytes
+ 	 * of a frontend multibyte character couldn't look like ASCII characters.
+ 	 *
+ 	 * What about the encoding implications of the quote / excape chars?
+ 	 *
+ 	 * However, CR and NL characters that are inside a quoted field are
+ 	 * not special, and are simply a part of the data value. The rough
+ 	 * and ready rule we use to recognise this situation is to count
+ 	 * the number of quote and escape characters on the line - if it's odd
+ 	 * we are inside a quoted field (so these chars act like a toggle).
+ 	 */
+ 	for (;;)
+ 	{
+ 		c = CopyGetChar();
+ 		if (c == EOF)
+ 		{
+ 			result = true;
+ 			break;
+ 		}
+ 
+ 		/* not a complete parsing rule but good enough for here */
+ 		if (c == quotec || c == escapec)
+ 			in_quote = ! in_quote;
+ 
+ 		/* 
+ 		 * updating the line count for embedded CR and/or LF chars is
+ 		 * necessarily a little fragile - this test is probably about
+ 		 * the best we can do.
+ 		 */
+ 		if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
+ 			copy_lineno++;
+ 		
+ 		if (!in_quote && c == '\r')
+ 		{
+ 			if (eol_type == EOL_NL)
+ 				ereport(ERROR,
+ 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ 						 errmsg("unquoted carriage return found in CSV data"),
+ 				  errhint("Use quoted CSV field to represent carriage return.")));
+ 			/* Check for \r\n on first line, _and_ handle \r\n. */
+ 			if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
+ 			{
+ 				int			c2 = CopyPeekChar();
+ 
+ 				if (c2 == '\n')
+ 				{
+ 					CopyDonePeek(c2, true);		/* eat newline */
+ 					eol_type = EOL_CRNL;
+ 				}
+ 				else
+ 				{
+ 					/* found \r, but no \n */
+ 					if (eol_type == EOL_CRNL)
+ 						ereport(ERROR,
+ 								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ 						 errmsg("unquoted carriage return found in CSV data"),
+ 								 errhint("Use quoted CSV field to represent carriage return.")));
+ 
+ 					/*
+ 					 * if we got here, it is the first line and we didn't
+ 					 * get \n, so put it back
+ 					 */
+ 					CopyDonePeek(c2, false);
+ 					eol_type = EOL_CR;
+ 				}
+ 			}
+ 			break;
+ 		}
+ 		if (!in_quote && c == '\n')
+ 		{
+ 			if (eol_type == EOL_CR || eol_type == EOL_CRNL)
+ 				ereport(ERROR,
+ 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ 						 errmsg("unquoted newline found in CSV data"),
+ 						 errhint("Use quoted CSV field to represent newline.")));
+ 			eol_type = EOL_NL;
+ 			break;
+ 		}
+ 
+ 		/* \ is only potentially magical at the start of a line */
+ 		if (line_buf.len == 0 && c == '\\')
+ 		{
+ 			int			c2 = CopyPeekChar();
+ 
+ 			if (c2 == EOF)
+ 			{
+ 				result = true;
+ 
+ 				CopyDonePeek(c2, true); /* eat it - do we need to? */
+ 
+ 				break;
+ 			}
+ 			if (c2 == '.')
+ 			{
+ 
+ 				CopyDonePeek(c2, true); /* so we can keep calling GetChar() */
+ 
+ 				if (eol_type == EOL_CRNL)
+ 				{
+ 					c = CopyGetChar();
+ 					if (c == '\n')
+ 						ereport(ERROR,
+ 								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ 								 errmsg("end-of-copy marker does not match previous newline style")));
+ 					if (c != '\r')
+ 						ereport(ERROR,
+ 								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ 								 errmsg("end-of-copy marker corrupt")));
+ 				}
+ 				c = CopyGetChar();
+ 				if (c != '\r' && c != '\n')
+ 					ereport(ERROR,
+ 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ 							 errmsg("end-of-copy marker corrupt")));
+ 				if ((eol_type == EOL_NL && c != '\n') ||
+ 					(eol_type == EOL_CRNL && c != '\n') ||
+ 					(eol_type == EOL_CR && c != '\r'))
+ 					ereport(ERROR,
+ 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ 							 errmsg("end-of-copy marker does not match previous newline style")));
+ 
+ 				/*
+ 				 * In protocol version 3, we should ignore anything
+ 				 * after \. up to the protocol end of copy data.  (XXX
+ 				 * maybe better not to treat \. as special?)
+ 				 */
+ 				if (copy_dest == COPY_NEW_FE)
+ 				{
+ 					while (c != EOF)
+ 						c = CopyGetChar();
+ 				}
+ 				result = true;	/* report EOF */
+ 				break;
+ 			}
+ 
+ 			CopyDonePeek(c2, false); /* not a dot, so put it back */
+ 
+ 		}
+ 
+ 		appendStringInfoCharMacro(&line_buf, c);
+ 
+ 		/*
+ 		 * When client encoding != server, must be careful to read the
+ 		 * extra bytes of a multibyte character exactly, since the encoding
+ 		 * might not ensure they don't look like ASCII.  When the encodings
+ 		 * are the same, we need not do this, since no server encoding we
+ 		 * use has ASCII-like following bytes.
+ 		 */
+ 		if (change_encoding)
+ 		{
+ 			s[0] = c;
+ 			mblen = pg_encoding_mblen(client_encoding, s);
+ 			for (j = 1; j < mblen; j++)
+ 			{
+ 				c = CopyGetChar();
+ 				if (c == EOF)
+ 				{
+ 					result = true;
+ 					break;
+ 				}
+ 				appendStringInfoCharMacro(&line_buf, c);
+ 			}
+ 			if (result)
+ 				break;			/* out of outer loop */
+ 		}
+ 	} /* end of outer loop */
+ 
+ 	/*
+ 	 * Done reading the line.  Convert it to server encoding.
+ 	 *
+ 	 * Note: set line_buf_converted to true *before* attempting conversion;
+ 	 * this prevents infinite recursion during error reporting should
+ 	 * pg_client_to_server() issue an error, due to copy_in_error_callback
+ 	 * again attempting the same conversion.  We'll end up issuing the message
+ 	 * without conversion, which is bad but better than nothing ...
+ 	 */
+ 	line_buf_converted = true;
+ 
+ 	if (change_encoding)
+ 	{
+ 		cvt = (char *) pg_client_to_server((unsigned char *) line_buf.data,
+ 										   line_buf.len);
+ 		if (cvt != line_buf.data)
+ 		{
+ 			/* transfer converted data back to line_buf */
+ 			line_buf.len = 0;
+ 			line_buf.data[0] = '\0';
+ 			appendBinaryStringInfo(&line_buf, cvt, strlen(cvt));
+ 		}
+ 	}
+ 
+ 	return result;
+ }
+ 
  /*----------
   * Read the value of a single attribute, performing de-escaping as needed.
   *
***************
*** 2332,2366 ****
  
  	for (;;)
  	{
- 		/* handle multiline quoted fields */
- 		if (in_quote && line_buf.cursor >= line_buf.len)
- 		{
- 			bool done;
- 
- 			switch(eol_type)
- 			{
- 				case EOL_NL:
- 					appendStringInfoString(&attribute_buf,"\n");
- 					break;
- 				case EOL_CR:
- 					appendStringInfoString(&attribute_buf,"\r");
- 					break;
- 				case EOL_CRNL:
- 					appendStringInfoString(&attribute_buf,"\r\n");
- 					break;
- 				case EOL_UNKNOWN:
- 					/* shouldn't happen - just keep going */
- 					break;
- 			}
- 
- 			copy_lineno++;
- 			done = CopyReadLine();
- 			if (done && line_buf.len == 0)
- 				break;
- 			start_cursor = line_buf.cursor;
- 		}
- 
  		end_cursor = line_buf.cursor;
  		if (line_buf.cursor >= line_buf.len)
  			break;
  		c = line_buf.data[line_buf.cursor++];
--- 2566,2573 ----
  
  	for (;;)
  	{
  		end_cursor = line_buf.cursor;
+ 
  		if (line_buf.cursor >= line_buf.len)
  			break;
  		c = line_buf.data[line_buf.cursor++];

---------------------------(end of broadcast)---------------------------
TIP 2: you can get off all lists at once with the unregister command
    (send "unregister YourEmailAddressHere" to [EMAIL PROTECTED])

[PATCHES] fix CSV multiline parsing - proof of concept

Reply via email to