Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Bruce Momjian

Wow, that is significantly different.  Thanks.

Your patch has been added to the PostgreSQL unapplied patches list at:

http://momjian.postgresql.org/cgi-bin/pgpatches

It will be applied as soon as one of the PostgreSQL committers reviews
and approves it.

---


Andrew Dunstan wrote:
> 
> 
> Andrew Dunstan wrote:
> 
> >Bruce Momjian said:
> >  
> >
> >>Shame we had to duplicate CopyReadLine() in a sense.
> >>
> >>
> >>
> >>
> >
> >
> >If you can find a clean way to merge them please do - I'll be very grateful.
> > My head started to spin when I tried. In general I dislike having more than
> >2 or 2 levels of logic in a given piece of code.
> >
> >
> >  
> >
> 
> Previous comment courtesy clumsy fingers and the Department of 
> Redundancy Department (of course, I meant 2 or 3).
> 
> Anyway, please review this patch for copy.c - it's possibly more to your 
> taste. It's less redundant, but I'm not sure it's more clear.
> 
> cheers
> 
> andrew


-- 
  Bruce Momjian|  http://candle.pha.pa.us
  pgman@candle.pha.pa.us   |  (610) 359-1001
  +  If your life is a hard drive, |  13 Roberts Road
  +  Christ can be your backup.|  Newtown Square, Pennsylvania 19073

---(end of broadcast)---
TIP 4: Don't 'kill -9' the postmaster


Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Andrew Dunstan

Andrew Dunstan wrote:
Bruce Momjian said:
 

Shame we had to duplicate CopyReadLine() in a sense.
   


If you can find a clean way to merge them please do - I'll be very grateful.
My head started to spin when I tried. In general I dislike having more than
2 or 2 levels of logic in a given piece of code.
 

Previous comment courtesy clumsy fingers and the Department of 
Redundancy Department (of course, I meant 2 or 3).

Anyway, please review this patch for copy.c - it's possibly more to your 
taste. It's less redundant, but I'm not sure it's more clear.

cheers
andrew
*** copy.c.orig	Mon Feb 21 23:12:41 2005
--- copy.c	Mon Feb 21 23:35:22 2005
***
*** 98,104 
  static EolType eol_type;		/* EOL type of input */
  static int	client_encoding;	/* remote side's character encoding */
  static int	server_encoding;	/* local encoding */
- static bool embedded_line_warning;
  
  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;		/* is it a binary copy? */
--- 98,103 
***
*** 139,145 
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
  		 List *force_notnull_atts);
! static bool CopyReadLine(void);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
    CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
--- 138,144 
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
  		 List *force_notnull_atts);
! static bool CopyReadLine(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
    CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***
*** 1191,1197 
  	attr = tupDesc->attrs;
  	num_phys_attrs = tupDesc->natts;
  	attr_count = list_length(attnumlist);
- 	embedded_line_warning = false;
  
  	/*
  	 * Get info about the columns we need to process.
--- 1190,1195 
***
*** 1718,1724 
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = CopyReadLine();
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF after
--- 1716,1723 
  			ListCell   *cur;
  
  			/* Actually read the line into memory here */
! 			done = csv_mode ? 
! CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
  
  			/*
  			 * EOF at start of line means we're done.  If we see EOF after
***
*** 2006,2012 
   * by newline.
   */
  static bool
! CopyReadLine(void)
  {
  	bool		result;
  	bool		change_encoding = (client_encoding != server_encoding);
--- 2005,2011 
   * by newline.
   */
  static bool
! CopyReadLine(char * quote, char * escape)
  {
  	bool		result;
  	bool		change_encoding = (client_encoding != server_encoding);
***
*** 2015,2020 
--- 2014,2032 
  	int			j;
  	unsigned char s[2];
  	char	   *cvt;
+ 	boolin_quote = false, last_was_esc = false, csv_mode = false;
+ 	charquotec = '\0', escapec = '\0';
+ 
+ 	if (quote)
+ 	{
+ 		csv_mode = true;
+ 		quotec = quote[0];
+ 		escapec = escape[0];
+ 		/* ignore special escape processing if it's the same as quotec */
+ 		if (quotec == escapec)
+ 			escapec = '\0';
+ 	}
+ 
  
  	s[1] = 0;
  
***
*** 2031,2041 
  
  	/*
  	 * In this loop we only care for detecting newlines (\r and/or \n) and
! 	 * the end-of-copy marker (\.).  For backwards compatibility we allow
  	 * backslashes to escape newline characters.  Backslashes other than
  	 * the end marker get put into the line_buf, since CopyReadAttribute
! 	 * does its own escape processing.	These four characters, and only
! 	 * these four, are assumed the same in frontend and backend encodings.
  	 * We do not assume that second and later bytes of a frontend
  	 * multibyte character couldn't look like ASCII characters.
  	 */
--- 2043,2062 
  
  	/*
  	 * In this loop we only care for detecting newlines (\r and/or \n) and
! 	 * the end-of-copy marker (\.).  
! 	 *
! 	 * In Text mode, for backwards compatibility we allow
  	 * backslashes to escape newline characters.  Backslashes other than
  	 * the end marker get put into the line_buf, since CopyReadAttribute
! 	 * does its own escape processing.	
! 	 *
! 	 * In CSV mode, CR and NL inside q quoted field are just part of the
! 	 * data value and are put in line_buf. We keep just enough state
! 	 * to know if we are currently in a quoted field or not.
! 	 *
! 	 * These four characters, and only these four, are assumed the same in 
! 	 * frontend and backend encodings.
! 	 *
  	 * We do not assume that second and later bytes of a frontend
  	 * multibyte character couldn't look like ASCII characters.
  	 */
***
***

Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Bruce Momjian
Andrew Dunstan wrote:
> Bruce Momjian said:
> >
> > Shame we had to duplicate CopyReadLine() in a sense.
> >
> >
> 
> 
> If you can find a clean way to merge them please do - I'll be very grateful.
>  My head started to spin when I tried. In general I dislike having more than
> 2 or 2 levels of logic in a given piece of code.

OK, will look at that.  Thanks.

-- 
  Bruce Momjian|  http://candle.pha.pa.us
  pgman@candle.pha.pa.us   |  (610) 359-1001
  +  If your life is a hard drive, |  13 Roberts Road
  +  Christ can be your backup.|  Newtown Square, Pennsylvania 19073

---(end of broadcast)---
TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]


Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Andrew Dunstan
Bruce Momjian said:
>
> Shame we had to duplicate CopyReadLine() in a sense.
>
>


If you can find a clean way to merge them please do - I'll be very grateful.
 My head started to spin when I tried. In general I dislike having more than
2 or 2 levels of logic in a given piece of code.

cheers

andrew



---(end of broadcast)---
TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED]


Re: [PATCHES] CSV multiline final fix

2005-02-21 Thread Bruce Momjian

Shame we had to duplicate CopyReadLine() in a sense.


Your patch has been added to the PostgreSQL unapplied patches list at:

http://momjian.postgresql.org/cgi-bin/pgpatches

It will be applied as soon as one of the PostgreSQL committers reviews
and approves it.

---


Andrew Dunstan wrote:
> 
> Well, in response to the huge number (0) of comments on my POC patch to 
> fix this, I prepared the attached patch, which improves on my previous 
> effort a bit (there was one obscure failure case which is now handled).
> 
> Basically, all the required logic is in a new function CopyReadLineCSV() 
> which is almost but not quite like CopyReadLine(). The new function 
> keeps just enough state to know if a line ending sequence (CR, CRLF, or 
> LF) is part of a quoted field or not. This gets rid of the need for 
> special casing embedded line endings on input elsewhere, so that is 
> removed, as is the warning about them on output that we added back in 
> december (as we then thought just before release). Lastly, the docs are 
> also patched.
> 
> Also attached is my tiny test file - maybe we need to cover this in 
> regression tests?
> 
> cheers
> 
> andrew

> diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml 
> ./doc/src/sgml/ref/copy.sgml
> *** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgmlMon Jan 
>  3 19:39:53 2005
> --- ./doc/src/sgml/ref/copy.sgml  Sun Feb 20 19:18:54 2005
> ***
> *** 496,508 
>   
>CSV mode will both recognize and produce CSV files with quoted
>values containing embedded carriage returns and line feeds. Thus
> !  the files are not strictly one line per table row like text-mode
> !  files. However, PostgreSQL will reject
> !  COPY input if any fields contain embedded line
> !  end character sequences that do not match the line ending
> !  convention used in the CSV file itself. It is generally safer to
> !  import data containing embedded line end characters using the
> !  text or binary formats rather than CSV.
>   
>  
>   
> --- 496,503 
>   
>CSV mode will both recognize and produce CSV files with quoted
>values containing embedded carriage returns and line feeds. Thus
> !  the files are not strictly one line per table row as are text-mode
> !  files.
>   
>  
>   
> ***
> *** 513,518 
> --- 508,515 
>might encounter some files that cannot be imported using this
>mechanism, and COPY might produce files that other
>programs cannot process.
> +  It is generally safer to import data using the text or binary formats,
> +  if possible, rather than using CSV format.
>   
>  
>   
> diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c 
> ./src/backend/commands/copy.c
> *** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c   Fri Dec 
> 31 16:59:41 2004
> --- ./src/backend/commands/copy.c Sun Feb 20 13:40:56 2005
> ***
> *** 98,104 
>   static EolType eol_type;/* EOL type of input */
>   static int  client_encoding;/* remote side's character encoding */
>   static int  server_encoding;/* local encoding */
> - static bool embedded_line_warning;
>   
>   /* these are just for error messages, see copy_in_error_callback */
>   static bool copy_binary;/* is it a binary copy? */
> --- 98,103 
> ***
> *** 140,145 
> --- 139,145 
>char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
>List *force_notnull_atts);
>   static bool CopyReadLine(void);
> + static bool CopyReadLineCSV(char * quote, char * escape);
>   static char *CopyReadAttribute(const char *delim, const char *null_print,
> CopyReadResult *result, bool *isnull);
>   static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
> ***
> *** 1191,1197 
>   attr = tupDesc->attrs;
>   num_phys_attrs = tupDesc->natts;
>   attr_count = list_length(attnumlist);
> - embedded_line_warning = false;
>   
>   /*
>* Get info about the columns we need to process.
> --- 1191,1196 
> ***
> *** 1718,1724 
>   ListCell   *cur;
>   
>   /* Actually read the line into memory here */
> ! done = CopyReadLine();
>   
>   /*
>* EOF at start of line means we're done.  If we see 
> EOF after
> --- 1717,1723 
>   ListCell   *cur;
>   
>   /* Actually read the line into memory here */
> ! done = csv_mode ? CopyReadLineCSV(quote, escape) : 
> CopyReadLine();
>   
>   /*
>* EOF at start of line means