Re: [PATCHES] CSV multiline final fix

Andrew Dunstan Mon, 21 Feb 2005 21:06:18 -0800

Andrew Dunstan wrote:

Bruce Momjian said:

Shame we had to duplicate CopyReadLine() in a sense.

If you can find a clean way to merge them please do - I'll be very grateful.
My head started to spin when I tried. In general I dislike having more than
2 or 2 levels of logic in a given piece of code.

Previous comment courtesy of clumsy fingers and the Department of Redundancy Department (of course, I meant 2 or 3 levels).

Anyway, please review this patch for copy.c - it's possibly more to your taste. It's less redundant, but I'm not sure it's any clearer.

cheers

andrew

*** copy.c.orig	Mon Feb 21 23:12:41 2005
--- copy.c	Mon Feb 21 23:35:22 2005
***************
*** 98,104 ****
  static EolType eol_type;		/* EOL type of input */
  static int	client_encoding;	/* remote side's character encoding */
  static int	server_encoding;	/* local encoding */
- static bool embedded_line_warning;
  
  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;		/* is it a binary copy? */
--- 98,103 ----
***************
*** 139,145 ****
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
  		 List *force_notnull_atts);
! static bool CopyReadLine(void);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
  				  CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
--- 138,144 ----
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
  		 List *force_notnull_atts);
! static bool CopyReadLine(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
  				  CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1191,1197 ****
  	attr = tupDesc->attrs;
  	num_phys_attrs = tupDesc->natts;
  	attr_count = list_length(attnumlist);
- 	embedded_line_warning = false;
  
  	/*
  	 * Get info about the columns we need to process.
--- 1190,1195 ----
***************
*** 1718,1724 ****
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = CopyReadLine();
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF after
--- 1716,1723 ----
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = csv_mode ? 
! 				CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF after
***************
*** 2006,2012 ****
   * by newline.
   */
  static bool
! CopyReadLine(void)
  {
  	bool		result;
  	bool		change_encoding = (client_encoding != server_encoding);
--- 2005,2011 ----
   * by newline.
   */
  static bool
! CopyReadLine(char * quote, char * escape)
  {
  	bool		result;
  	bool		change_encoding = (client_encoding != server_encoding);
***************
*** 2015,2020 ****
--- 2014,2032 ----
  	int			j;
  	unsigned char s[2];
  	char	   *cvt;
+ 	bool        in_quote = false, last_was_esc = false, csv_mode = false;
+ 	char        quotec = '\0', escapec = '\0';
+ 
+ 	if (quote)
+ 	{
+ 		csv_mode = true;
+ 		quotec = quote[0];
+ 		escapec = escape[0];
+ 		/* ignore special escape processing if it's the same as quotec */
+ 		if (quotec == escapec)
+ 			escapec = '\0';
+ 	}
+ 
  
  	s[1] = 0;
  
***************
*** 2031,2041 ****
  
  	/*
  	 * In this loop we only care for detecting newlines (\r and/or \n) and
! 	 * the end-of-copy marker (\.).  For backwards compatibility we allow
  	 * backslashes to escape newline characters.  Backslashes other than
  	 * the end marker get put into the line_buf, since CopyReadAttribute
! 	 * does its own escape processing.	These four characters, and only
! 	 * these four, are assumed the same in frontend and backend encodings.
  	 * We do not assume that second and later bytes of a frontend
  	 * multibyte character couldn't look like ASCII characters.
  	 */
--- 2043,2062 ----
  
  	/*
  	 * In this loop we only care for detecting newlines (\r and/or \n) and
! 	 * the end-of-copy marker (\.).  
! 	 *
! 	 * In Text mode, for backwards compatibility we allow
  	 * backslashes to escape newline characters.  Backslashes other than
  	 * the end marker get put into the line_buf, since CopyReadAttribute
! 	 * does its own escape processing.	
! 	 *
! 	 * In CSV mode, CR and NL inside a quoted field are just part of the
! 	 * data value and are put in line_buf. We keep just enough state
! 	 * to know if we are currently in a quoted field or not.
! 	 *
! 	 * These four characters, and only these four, are assumed the same in 
! 	 * frontend and backend encodings.
! 	 *
  	 * We do not assume that second and later bytes of a frontend
  	 * multibyte character couldn't look like ASCII characters.
  	 */
***************
*** 2047,2059 ****
  			result = true;
  			break;
  		}
! 		if (c == '\r')
  		{
  			if (eol_type == EOL_NL)
! 				ereport(ERROR,
! 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 						 errmsg("literal carriage return found in data"),
! 				  errhint("Use \"\\r\" to represent carriage return.")));
  			/* Check for \r\n on first line, _and_ handle \r\n. */
  			if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
  			{
--- 2068,2116 ----
  			result = true;
  			break;
  		}
! 
! 		if (csv_mode)
! 		{
! 			/*  
! 			 * Dealing with quotes and escapes here is mildly tricky. If the
! 			 * quote char is also the escape char, there's no problem - we  
! 			 * just use the char as a toggle. If they are different, we need
! 			 * to ensure that we only take account of an escape inside a quoted
! 			 * field and immediately preceding a quote char, and not the
! 			 * second in an escape-escape sequence.
! 			 */ 
! 
! 			if (in_quote && c == escapec)
! 				last_was_esc = ! last_was_esc;
! 			if (c == quotec && ! last_was_esc)
! 				in_quote = ! in_quote;
! 			if (c != escapec)
! 				last_was_esc = false;
! 
! 			/*
! 			 * updating the line count for embedded CR and/or LF chars is 
! 			 * necessarily a little fragile - this test is probably about 
! 			 * the best we can do.
! 			 */ 
! 			if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) 
! 				copy_lineno++; 
! 		}
! 
! 		if (!in_quote && c == '\r')
  		{
  			if (eol_type == EOL_NL)
! 			{
! 				if (! csv_mode)
! 					ereport(ERROR,
! 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 							 errmsg("literal carriage return found in data"),
! 							 errhint("Use \"\\r\" to represent carriage return.")));
! 				else
! 					ereport(ERROR,
! 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 							 errmsg("unquoted carriage return found in CSV data"),
! 							 errhint("Use quoted CSV field to represent carriage return.")));
! 			}
  			/* Check for \r\n on first line, _and_ handle \r\n. */
  			if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
  			{
***************
*** 2068,2077 ****
  				{
  					/* found \r, but no \n */
  					if (eol_type == EOL_CRNL)
! 						ereport(ERROR,
! 								(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 						 errmsg("literal carriage return found in data"),
! 								 errhint("Use \"\\r\" to represent carriage return.")));
  
  					/*
  					 * if we got here, it is the first line and we didn't
--- 2125,2143 ----
  				{
  					/* found \r, but no \n */
  					if (eol_type == EOL_CRNL)
! 					{
! 						if (!csv_mode)
! 							ereport(ERROR,
! 									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 									 errmsg("literal carriage return found in data"),
! 									 errhint("Use \"\\r\" to represent carriage return.")));
! 						else
! 							ereport(ERROR,
! 									(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 									 errmsg("unquoted carriage return found in data"),
! 									 errhint("Use quoted CSV field to represent carriage return.")));
! 
! 					}
  
  					/*
  					 * if we got here, it is the first line and we didn't
***************
*** 2083,2108 ****
  			}
  			break;
  		}
! 		if (c == '\n')
  		{
  			if (eol_type == EOL_CR || eol_type == EOL_CRNL)
! 				ereport(ERROR,
! 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 						 errmsg("literal newline found in data"),
! 						 errhint("Use \"\\n\" to represent newline.")));
  			eol_type = EOL_NL;
  			break;
  		}
! 		if (c == '\\')
  		{
! 			c = CopyGetChar();
! 			if (c == EOF)
  			{
  				result = true;
  				break;
  			}
! 			if (c == '.')
  			{
  				if (eol_type == EOL_CRNL)
  				{
  					c = CopyGetChar();
--- 2149,2195 ----
  			}
  			break;
  		}
! 		if (!in_quote && c == '\n')
  		{
  			if (eol_type == EOL_CR || eol_type == EOL_CRNL)
! 			{
! 				if (!csv_mode)
! 					ereport(ERROR,
! 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 							 errmsg("literal newline found in data"),
! 							 errhint("Use \"\\n\" to represent newline.")));
! 				else
! 					ereport(ERROR,
! 							(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! 							 errmsg("unquoted newline found in data"),
! 							 errhint("Use quoted CSV field to represent newline.")));
! 					
! 			}
  			eol_type = EOL_NL;
  			break;
  		}
! 
! 		if ((line_buf.len == 0 || !csv_mode) && c == '\\')
  		{
! 			int c2;
! 			
! 			if (csv_mode)
! 				c2 = CopyPeekChar();
! 			else
! 				c2 = c = CopyGetChar();
! 
! 			if (c2 == EOF)
  			{
  				result = true;
+ 				if (csv_mode)
+ 					CopyDonePeek(c2, true);
  				break;
  			}
! 			if (c2 == '.')
  			{
+ 				if (csv_mode)
+ 					CopyDonePeek(c2, true); /* allow keep calling GetChar() */
+ 
  				if (eol_type == EOL_CRNL)
  				{
  					c = CopyGetChar();
***************
*** 2140,2147 ****
  				result = true;	/* report EOF */
  				break;
  			}
! 			/* not EOF mark, so emit \ and following char literally */
! 			appendStringInfoCharMacro(&line_buf, '\\');
  		}
  
  		appendStringInfoCharMacro(&line_buf, c);
--- 2227,2238 ----
  				result = true;	/* report EOF */
  				break;
  			}
! 			
! 			if (csv_mode)
! 				CopyDonePeek(c2, false); /* not a dot, so put it back */ 
! 			else
! 				/* not EOF mark, so emit \ and following char literally */
! 				appendStringInfoCharMacro(&line_buf, '\\');
  		}
  
  		appendStringInfoCharMacro(&line_buf, c);
***************
*** 2369,2402 ****
  
  	for (;;)
  	{
- 		/* handle multiline quoted fields */
- 		if (in_quote && line_buf.cursor >= line_buf.len)
- 		{
- 			bool		done;
- 
- 			switch (eol_type)
- 			{
- 				case EOL_NL:
- 					appendStringInfoString(&attribute_buf, "\n");
- 					break;
- 				case EOL_CR:
- 					appendStringInfoString(&attribute_buf, "\r");
- 					break;
- 				case EOL_CRNL:
- 					appendStringInfoString(&attribute_buf, "\r\n");
- 					break;
- 				case EOL_UNKNOWN:
- 					/* shouldn't happen - just keep going */
- 					break;
- 			}
- 
- 			copy_lineno++;
- 			done = CopyReadLine();
- 			if (done && line_buf.len == 0)
- 				break;
- 			start_cursor = line_buf.cursor;
- 		}
- 
  		end_cursor = line_buf.cursor;
  		if (line_buf.cursor >= line_buf.len)
  			break;
--- 2460,2465 ----
***************
*** 2629,2653 ****
  		 !use_quote && (c = *test_string) != '\0';
  		 test_string += mblen)
  	{
- 		/*
- 		 * We don't know here what the surrounding line end characters
- 		 * might be. It might not even be under postgres' control. So
- 		 * we simple warn on ANY embedded line ending character.
- 		 *
- 		 * This warning will disappear when we make line parsing field-aware,
- 		 * so that we can reliably read in embedded line ending characters
- 		 * regardless of the file's line-end context.
- 		 *
- 		 */
- 
- 		if (!embedded_line_warning  && (c == '\n' || c == '\r') )
- 		{
- 			embedded_line_warning = true;
- 			elog(WARNING,
- 				 "CSV fields with embedded linefeed or carriage return "
- 				 "characters might not be able to be reimported");
- 		}
- 
  		if (c == delimc || c == quotec || c == '\n' || c == '\r')
  			use_quote = true;
  		if (!same_encoding)
--- 2692,2697 ----

---------------------------(end of broadcast)---------------------------
TIP 3: if posting/reading through Usenet, please send an appropriate
      subscribe-nomail command to [EMAIL PROTECTED] so that your
      message can get through to the mailing list cleanly

Re: [PATCHES] CSV multiline final fix

Reply via email to