Re: [PATCHES] CSV multiline final fix

Bruce Momjian Mon, 21 Feb 2005 13:42:13 -0800

Shame we had to duplicate CopyReadLine() in a sense.


Your patch has been added to the PostgreSQL unapplied patches list at:

        http://momjian.postgresql.org/cgi-bin/pgpatches

It will be applied as soon as one of the PostgreSQL committers reviews
and approves it.

---------------------------------------------------------------------------


Andrew Dunstan wrote:
> 
> Well, in response to the huge number (0) of comments on my POC patch to 
> fix this, I prepared the attached patch, which improves on my previous 
> effort a bit (there was one obscure failure case which is now handled).
> 
> Basically, all the required logic is in a new function CopyReadLineCSV() 
> which is almost but not quite like CopyReadLine(). The new function 
> keeps just enough state to know if a line ending sequence (CR, CRLF, or 
> LF) is part of a quoted field or not. This gets rid of the need for 
> special casing embedded line endings on input elsewhere, so that is 
> removed, as is the warning about them on output that we added back in 
> december (as we then thought just before release). Lastly, the docs are 
> also patched.
> 
> Also attached is my tiny test file - maybe we need to cover this in 
> regression tests?
> 
> cheers
> 
> andrew

> diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml 
> ./doc/src/sgml/ref/copy.sgml
> *** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml    Mon Jan 
>  3 19:39:53 2005
> --- ./doc/src/sgml/ref/copy.sgml      Sun Feb 20 19:18:54 2005
> ***************
> *** 496,508 ****
>       <para>
>        CSV mode will both recognize and produce CSV files with quoted
>        values containing embedded carriage returns and line feeds. Thus
> !      the files are not strictly one line per table row like text-mode
> !      files. However, <productname>PostgreSQL</productname> will reject
> !      <command>COPY</command> input if any fields contain embedded line
> !      end character sequences that do not match the line ending
> !      convention used in the CSV file itself. It is generally safer to
> !      import data containing embedded line end characters using the
> !      text or binary formats rather than CSV.
>       </para>
>      </note>
>   
> --- 496,503 ----
>       <para>
>        CSV mode will both recognize and produce CSV files with quoted
>        values containing embedded carriage returns and line feeds. Thus
> !      the files are not strictly one line per table row as are text-mode
> !      files.
>       </para>
>      </note>
>   
> ***************
> *** 513,518 ****
> --- 508,515 ----
>        might encounter some files that cannot be imported using this
>        mechanism, and <command>COPY</> might produce files that other
>        programs cannot process.
> +      It is generally safer to import data using the text or binary formats,
> +      if possible, rather than using CSV format.
>       </para>
>      </note>
>       
> diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c 
> ./src/backend/commands/copy.c
> *** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c   Fri Dec 
> 31 16:59:41 2004
> --- ./src/backend/commands/copy.c     Sun Feb 20 13:40:56 2005
> ***************
> *** 98,104 ****
>   static EolType eol_type;            /* EOL type of input */
>   static int  client_encoding;        /* remote side's character encoding */
>   static int  server_encoding;        /* local encoding */
> - static bool embedded_line_warning;
>   
>   /* these are just for error messages, see copy_in_error_callback */
>   static bool copy_binary;            /* is it a binary copy? */
> --- 98,103 ----
> ***************
> *** 140,145 ****
> --- 139,145 ----
>    char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
>                List *force_notnull_atts);
>   static bool CopyReadLine(void);
> + static bool CopyReadLineCSV(char * quote, char * escape);
>   static char *CopyReadAttribute(const char *delim, const char *null_print,
>                                 CopyReadResult *result, bool *isnull);
>   static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
> ***************
> *** 1191,1197 ****
>       attr = tupDesc->attrs;
>       num_phys_attrs = tupDesc->natts;
>       attr_count = list_length(attnumlist);
> -     embedded_line_warning = false;
>   
>       /*
>        * Get info about the columns we need to process.
> --- 1191,1196 ----
> ***************
> *** 1718,1724 ****
>                       ListCell   *cur;
>   
>                       /* Actually read the line into memory here */
> !                     done = CopyReadLine();
>   
>                       /*
>                        * EOF at start of line means we're done.  If we see 
> EOF after
> --- 1717,1723 ----
>                       ListCell   *cur;
>   
>                       /* Actually read the line into memory here */
> !                     done = csv_mode ? CopyReadLineCSV(quote, escape) : 
> CopyReadLine();
>   
>                       /*
>                        * EOF at start of line means we're done.  If we see 
> EOF after
> ***************
> *** 2194,2199 ****
> --- 2193,2448 ----
>       return result;
>   }
>   
> + /*
> +  * Read a line for CSV copy mode. Differences from standard mode:
> +  * . CR an NL are not special inside quoted fields - they just get added
> +  *   to the buffer.
> +  * . \ is not magical except as the start of the end of data marker.
> +  *
> +  */
> + 
> + static bool
> + CopyReadLineCSV(char * quote, char * escape)
> + {
> +     bool            result;
> +     bool            change_encoding = (client_encoding != server_encoding);
> +     int                     c;
> +     int                     mblen;
> +     int                     j;
> +     unsigned char s[2];
> +     char       *cvt;
> +     bool        in_quote = false, last_was_esc = false;
> +     char        quotec = quote[0];
> +     char        escapec = escape[0];
> + 
> +     s[1] = 0;
> + 
> +     /* ignore special escape processing if it's the same as quote */
> +     if (quotec == escapec)
> +             escapec = '\0';
> + 
> +     /* reset line_buf to empty */
> +     line_buf.len = 0;
> +     line_buf.data[0] = '\0';
> +     line_buf.cursor = 0;
> + 
> +     /* mark that encoding conversion hasn't occurred yet */
> +     line_buf_converted = false;
> + 
> +     /* set default status */
> +     result = false;
> + 
> +     /*
> +      * In this loop we only care for detecting newlines (\r and/or \n)
> +      * and the end-of-copy marker (\.).  These four
> +      * characters, and only these four, are assumed the same in frontend
> +      * and backend encodings.  We do not assume that second and later bytes
> +      * of a frontend multibyte character couldn't look like ASCII 
> characters.
> +      *
> +      * What about the encoding implications of the quote / excape chars?
> +      *
> +      * However, CR and NL characters that are inside a quoted field are
> +      * not special, and are simply a part of the data value. The parsing 
> rule
> +      * used is a bit rough and ready, but probably adequate for our 
> purposes.
> +      */
> + 
> +     for (;;)
> +     {
> +             c = CopyGetChar();
> +             if (c == EOF)
> +             {
> +                     result = true;
> +                     break;
> +             }
> + 
> +             /*
> +              * Dealing with quotes and escapes here is mildly tricky. If the
> +              * quote char is also the escape char, there's no problem - we
> +              * just use the char as a toggle. If they are different, we need
> +              * to ensure that we only take account of an escape inside a 
> quoted
> +              * field and immediately preceding a quote char, and not the
> +              * second in a escape-escape sequence.
> +              */
> + 
> +             if (in_quote && c == escapec)
> +                     last_was_esc = ! last_was_esc;
> + 
> +             if (c == quotec && ! last_was_esc)
> +                     in_quote = ! in_quote;
> + 
> +             if (c != escapec)
> +                     last_was_esc = false;
> + 
> +             /* 
> +              * updating the line count for embedded CR and/or LF chars is
> +              * necessarily a little fragile - this test is probably about
> +              * the best we can do.
> +              */
> +             if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
> +                     copy_lineno++;
> +             
> +             if (!in_quote && c == '\r')
> +             {
> +                     if (eol_type == EOL_NL)
> +                             ereport(ERROR,
> +                                             
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> +                                              errmsg("unquoted carriage 
> return found in CSV data"),
> +                               errhint("Use quoted CSV field to represent 
> carriage return.")));
> +                     /* Check for \r\n on first line, _and_ handle \r\n. */
> +                     if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
> +                     {
> +                             int                     c2 = CopyPeekChar();
> + 
> +                             if (c2 == '\n')
> +                             {
> +                                     CopyDonePeek(c2, true);         /* eat 
> newline */
> +                                     eol_type = EOL_CRNL;
> +                             }
> +                             else
> +                             {
> +                                     /* found \r, but no \n */
> +                                     if (eol_type == EOL_CRNL)
> +                                             ereport(ERROR,
> +                                                             
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> +                                              errmsg("unquoted carriage 
> return found in CSV data"),
> +                                                              errhint("Use 
> quoted CSV field to represent carriage return.")));
> + 
> +                                     /*
> +                                      * if we got here, it is the first line 
> and we didn't
> +                                      * get \n, so put it back
> +                                      */
> +                                     CopyDonePeek(c2, false);
> +                                     eol_type = EOL_CR;
> +                             }
> +                     }
> +                     break;
> +             }
> +             if (!in_quote && c == '\n')
> +             {
> +                     if (eol_type == EOL_CR || eol_type == EOL_CRNL)
> +                             ereport(ERROR,
> +                                             
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> +                                              errmsg("unquoted newline found 
> in CSV data"),
> +                                              errhint("Use quoted CSV field 
> to represent newline.")));
> +                     eol_type = EOL_NL;
> +                     break;
> +             }
> + 
> +             /* \ is only potentially magical at the start of a line */
> +             if (line_buf.len == 0 && c == '\\')
> +             {
> +                     int                     c2 = CopyPeekChar();
> + 
> +                     if (c2 == EOF)
> +                     {
> +                             result = true;
> + 
> +                             CopyDonePeek(c2, true); /* eat it - do we need 
> to? */
> + 
> +                             break;
> +                     }
> +                     if (c2 == '.')
> +                     {
> + 
> +                             CopyDonePeek(c2, true); /* so we can keep 
> calling GetChar() */
> + 
> +                             if (eol_type == EOL_CRNL)
> +                             {
> +                                     c = CopyGetChar();
> +                                     if (c == '\n')
> +                                             ereport(ERROR,
> +                                                             
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> +                                                              
> errmsg("end-of-copy marker does not match previous newline style")));
> +                                     if (c != '\r')
> +                                             ereport(ERROR,
> +                                                             
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> +                                                              
> errmsg("end-of-copy marker corrupt")));
> +                             }
> +                             c = CopyGetChar();
> +                             if (c != '\r' && c != '\n')
> +                                     ereport(ERROR,
> +                                                     
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> +                                                      errmsg("end-of-copy 
> marker corrupt")));
> +                             if ((eol_type == EOL_NL && c != '\n') ||
> +                                     (eol_type == EOL_CRNL && c != '\n') ||
> +                                     (eol_type == EOL_CR && c != '\r'))
> +                                     ereport(ERROR,
> +                                                     
> (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
> +                                                      errmsg("end-of-copy 
> marker does not match previous newline style")));
> + 
> +                             /*
> +                              * In protocol version 3, we should ignore 
> anything
> +                              * after \. up to the protocol end of copy 
> data.  (XXX
> +                              * maybe better not to treat \. as special?)
> +                              */
> +                             if (copy_dest == COPY_NEW_FE)
> +                             {
> +                                     while (c != EOF)
> +                                             c = CopyGetChar();
> +                             }
> +                             result = true;  /* report EOF */
> +                             break;
> +                     }
> + 
> +                     CopyDonePeek(c2, false); /* not a dot, so put it back */
> + 
> +             }
> + 
> +             appendStringInfoCharMacro(&line_buf, c);
> + 
> +             /*
> +              * When client encoding != server, must be careful to read the
> +              * extra bytes of a multibyte character exactly, since the 
> encoding
> +              * might not ensure they don't look like ASCII.  When the 
> encodings
> +              * are the same, we need not do this, since no server encoding 
> we
> +              * use has ASCII-like following bytes.
> +              */
> +             if (change_encoding)
> +             {
> +                     s[0] = c;
> +                     mblen = pg_encoding_mblen(client_encoding, s);
> +                     for (j = 1; j < mblen; j++)
> +                     {
> +                             c = CopyGetChar();
> +                             if (c == EOF)
> +                             {
> +                                     result = true;
> +                                     break;
> +                             }
> +                             appendStringInfoCharMacro(&line_buf, c);
> +                     }
> +                     if (result)
> +                             break;                  /* out of outer loop */
> +             }
> +     } /* end of outer loop */
> + 
> +     /*
> +      * Done reading the line.  Convert it to server encoding.
> +      *
> +      * Note: set line_buf_converted to true *before* attempting conversion;
> +      * this prevents infinite recursion during error reporting should
> +      * pg_client_to_server() issue an error, due to copy_in_error_callback
> +      * again attempting the same conversion.  We'll end up issuing the 
> message
> +      * without conversion, which is bad but better than nothing ...
> +      */
> +     line_buf_converted = true;
> + 
> +     if (change_encoding)
> +     {
> +             cvt = (char *) pg_client_to_server((unsigned char *) 
> line_buf.data,
> +                                                                             
>    line_buf.len);
> +             if (cvt != line_buf.data)
> +             {
> +                     /* transfer converted data back to line_buf */
> +                     line_buf.len = 0;
> +                     line_buf.data[0] = '\0';
> +                     appendBinaryStringInfo(&line_buf, cvt, strlen(cvt));
> +             }
> +     }
> + 
> +     return result;
> + }
> + 
>   /*----------
>    * Read the value of a single attribute, performing de-escaping as needed.
>    *
> ***************
> *** 2369,2402 ****
>   
>       for (;;)
>       {
> -             /* handle multiline quoted fields */
> -             if (in_quote && line_buf.cursor >= line_buf.len)
> -             {
> -                     bool            done;
> - 
> -                     switch (eol_type)
> -                     {
> -                             case EOL_NL:
> -                                     appendStringInfoString(&attribute_buf, 
> "\n");
> -                                     break;
> -                             case EOL_CR:
> -                                     appendStringInfoString(&attribute_buf, 
> "\r");
> -                                     break;
> -                             case EOL_CRNL:
> -                                     appendStringInfoString(&attribute_buf, 
> "\r\n");
> -                                     break;
> -                             case EOL_UNKNOWN:
> -                                     /* shouldn't happen - just keep going */
> -                                     break;
> -                     }
> - 
> -                     copy_lineno++;
> -                     done = CopyReadLine();
> -                     if (done && line_buf.len == 0)
> -                             break;
> -                     start_cursor = line_buf.cursor;
> -             }
> - 
>               end_cursor = line_buf.cursor;
>               if (line_buf.cursor >= line_buf.len)
>                       break;
> --- 2618,2623 ----
> ***************
> *** 2629,2653 ****
>                !use_quote && (c = *test_string) != '\0';
>                test_string += mblen)
>       {
> -             /*
> -              * We don't know here what the surrounding line end characters
> -              * might be. It might not even be under postgres' control. So
> -              * we simple warn on ANY embedded line ending character.
> -              *
> -              * This warning will disappear when we make line parsing 
> field-aware,
> -              * so that we can reliably read in embedded line ending 
> characters
> -              * regardless of the file's line-end context.
> -              *
> -              */
> - 
> -             if (!embedded_line_warning  && (c == '\n' || c == '\r') )
> -             {
> -                     embedded_line_warning = true;
> -                     elog(WARNING,
> -                              "CSV fields with embedded linefeed or carriage 
> return "
> -                              "characters might not be able to be 
> reimported");
> -             }
> - 
>               if (c == delimc || c == quotec || c == '\n' || c == '\r')
>                       use_quote = true;
>               if (!same_encoding)
> --- 2850,2855 ----


> 
> ---------------------------(end of broadcast)---------------------------
> TIP 6: Have you searched our list archives?
> 
>                http://archives.postgresql.org

-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  [email protected]               |  (610) 359-1001
  +  If your life is a hard drive,     |  13 Roberts Road
  +  Christ can be your backup.        |  Newtown Square, Pennsylvania 19073

---------------------------(end of broadcast)---------------------------
TIP 5: Have you checked our extensive FAQ?

               http://www.postgresql.org/docs/faq

Re: [PATCHES] CSV multiline final fix

Reply via email to