Attached is an undocumented patch that implements COPY for CSVs according to the scheme recently discussed. I believe it handles all the straightforward and perverse cases that were discussed (including multiline fields and partially quoted fields).


I have done some light testing on it, but would appreciate some assistance with testing, as well as any useful review comments.

cheers

andrew
Index: src/backend/commands/copy.c
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/backend/commands/copy.c,v
retrieving revision 1.219
diff -c -r1.219 copy.c
*** src/backend/commands/copy.c 6 Apr 2004 13:21:33 -0000       1.219
--- src/backend/commands/copy.c 8 Apr 2004 18:50:38 -0000
***************
*** 70,76 ****
  typedef enum CopyReadResult
  {
        NORMAL_ATTR,
!       END_OF_LINE
  } CopyReadResult;
  
  /*
--- 70,77 ----
  typedef enum CopyReadResult
  {
        NORMAL_ATTR,
!       END_OF_LINE,
!       UNTERMINATED_FIELD
  } CopyReadResult;
  
  /*
***************
*** 136,141 ****
--- 137,144 ----
  static bool CopyReadLine(void);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                                                           CopyReadResult *result, 
bool *isnull);
+ static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
+                                                          CopyReadResult *result, 
bool *isnull);
  static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo,
                                                Oid typelem, bool *isnull);
  static void CopyAttributeOut(char *string, char *delim);
***************
*** 682,687 ****
--- 685,691 ----
        List       *attnumlist;
        bool            binary = false;
        bool            oids = false;
+       bool        csv_mode = false;
        char       *delim = NULL;
        char       *null_print = NULL;
        Relation        rel;
***************
*** 744,751 ****
        if (!delim)
                delim = "\t";
  
        if (!null_print)
!               null_print = "\\N";
  
        /*
         * Open and lock the relation, using the appropriate lock type.
--- 748,758 ----
        if (!delim)
                delim = "\t";
  
+       if (is_from && strlen(delim) > 1)
+               csv_mode = true;
+ 
        if (!null_print)
!               null_print = csv_mode ? "" : "\\N";
  
        /*
         * Open and lock the relation, using the appropriate lock type.
***************
*** 772,783 ****
                                           "psql's \\copy command also works for 
anyone.")));
  
        /*
!        * Presently, only single-character delimiter strings are supported.
         */
!       if (strlen(delim) != 1)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                 errmsg("COPY delimiter must be a single character")));
  
        /*
         * Don't allow the delimiter to appear in the null string.
--- 779,805 ----
                                           "psql's \\copy command also works for 
anyone.")));
  
        /*
!        * Only single-character delimiter strings are supported,
!        * except in CSV mode, where the string must be 
!        * delimiter-char quote-char [escape-char]
         */
!       if (!csv_mode && strlen(delim) != 1)
                ereport(ERROR,
                                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                 errmsg("COPY delimiter must be a single character")));
+       else if (csv_mode)
+       {
+               if(strlen(delim) > 3)
+                       ereport(ERROR,
+                               (errcode(ERRCODE_SYNTAX_ERROR),
+                                errmsg("COPY delimiters for CSV must be a 2 or 3 
characters")));
+               if (delim[0] == delim[1] || 
+                       (strlen(delim) == 3 && delim[0] == delim[2]))
+                       ereport(ERROR,
+                               (errcode(ERRCODE_SYNTAX_ERROR),
+                                errmsg("CSV delimiter character must not be same as 
quote character or escape character")));
+                       
+       }
  
        /*
         * Don't allow the delimiter to appear in the null string.
***************
*** 788,793 ****
--- 810,824 ----
                                 errmsg("COPY delimiter must not appear in the NULL 
specification")));
  
        /*
+        * Don't allow OIDs in CSV mode
+        */
+ 
+       if (csv_mode && oids)
+               ereport(ERROR,
+                               (errcode(ERRCODE_SYNTAX_ERROR),
+                                errmsg("Cannot specify OIDS in CSV mode ")));
+ 
+       /*
         * Don't allow COPY w/ OIDs to or from a table without them
         */
        if (oids && !rel->rd_rel->relhasoids)
***************
*** 1263,1268 ****
--- 1294,1300 ----
        Datum      *values;
        char       *nulls;
        bool            done = false;
+       bool        csv_mode;
        bool            isnull;
        ResultRelInfo *resultRelInfo;
        EState     *estate = CreateExecutorState(); /* for ExecConstraints() */
***************
*** 1280,1285 ****
--- 1312,1318 ----
        num_phys_attrs = tupDesc->natts;
        attr_count = length(attnumlist);
        num_defaults = 0;
+       csv_mode = (strlen(delim) > 1);
  
        /*
         * We need a ResultRelInfo so we can use the regular executor's
***************
*** 1499,1504 ****
--- 1532,1538 ----
  
                        if (file_has_oids)
                        {
+                               /* can't be in CSV mode here */
                                string = CopyReadAttribute(delim, null_print,
                                                                                   
&result, &isnull);
  
***************
*** 1537,1544 ****
                                                         errmsg("missing data for 
column \"%s\"",
                                                                        
NameStr(attr[m]->attname))));
  
!                               string = CopyReadAttribute(delim, null_print,
!                                                                                  
&result, &isnull);
  
                                if (isnull)
                                {
--- 1571,1591 ----
                                                         errmsg("missing data for 
column \"%s\"",
                                                                        
NameStr(attr[m]->attname))));
  
!                               if (csv_mode)
!                               {
!                                       string = CopyReadAttributeCSV(delim, 
null_print, 
!                                                                                      
           &result, &isnull);
!                                       if (result == UNTERMINATED_FIELD)
!                                               ereport(ERROR,
!                                                               
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                                                                errmsg("unterminated 
CSV quoted field")));
!                               }
!                               else
!                               {
!                                       string = CopyReadAttribute(delim, null_print, 
!                                                                                      
    &result, &isnull);
!                               }
!                                       
  
                                if (isnull)
                                {
***************
*** 2061,2066 ****
--- 2108,2263 ----
        /* check whether raw input matched null marker */
        input_len = end_cursor - start_cursor;
        if (input_len == strlen(null_print) &&
+               strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0)
+               *isnull = true;
+       else
+               *isnull = false;
+ 
+       return attribute_buf.data;
+ }
+ 
+ 
+ /*
+  * Read the value of a single attribute in CSV mode, 
+  * performing de-escaping as needed. Escaping does not follow the normal
+  * PostgreSQL text mode, but instead "standard" (i.e. common) CSV usage.
+  *
+  * Quoted fields can span lines, in which case the line end is embedded
+  * in the returned string.
+  *
+  * delim is a 2- or 3-character string. The first character is the
+  * field delimiter, the second the quote character, the third is the
+  * escape character inside quotes, and defaults to the quote character.
+  *
+  * null_print is the null marker string.  Note that this is compared to
+  * the pre-de-escaped input string (thus if it is quoted it is not a NULL).
+  *
+  * *result is set to indicate what terminated the read:
+  *            NORMAL_ATTR:    column delimiter
+  *            END_OF_LINE:    end of line
+  *            UNTERMINATED_FIELD:     no closing quote found for a quoted field
+  *
+  * In any case, the string read up to the terminator (or end of file)
+  * is returned.
+  *
+  * *isnull is set true or false depending on whether the input matched
+  * the null marker.  Note that the caller cannot check this since the
+  * returned string will be the post-de-escaping equivalent, which may
+  * look the same as some valid data string.
+  *----------
+  */
+ 
+ static char *
+ CopyReadAttributeCSV(const char *delim, const char *null_print,
+                                 CopyReadResult *result, bool *isnull)
+ {
+       char        delimc = delim[0];
+       char        quotec = delim[1];
+       char        escapec = delim[2] ? delim[2] : delim[1];
+       char            c;
+       int                     start_cursor = line_buf.cursor;
+       int                     end_cursor = start_cursor;;
+       int                     input_len;
+     bool        in_quote = false;
+       bool        saw_quote = false;
+ 
+       /* reset attribute_buf to empty */
+       attribute_buf.len = 0;
+       attribute_buf.data[0] = '\0';
+ 
+       /* set default status */
+       *result = END_OF_LINE;
+ 
+       for (;;)
+       {
+               /* handle multiline quoted fields */
+               if (in_quote && line_buf.cursor >= line_buf.len)
+               {
+                       bool done;
+ 
+                       switch(eol_type)
+                       {
+                               case EOL_NL:
+                                       appendStringInfoString(&attribute_buf,"\n");
+                                       break;
+                               case EOL_CR:
+                                       appendStringInfoString(&attribute_buf,"\r");
+                                       break;
+                               case EOL_CRNL:
+                                       appendStringInfoString(&attribute_buf,"\r\n");
+                                       break;
+                               case EOL_UNKNOWN:
+                                       /* shouldn't happen - just keep going */
+                                       break;
+                       }
+ 
+                       copy_lineno++;
+                       done = CopyReadLine();
+                       if (done && line_buf.len == 0)
+                               break;
+                       start_cursor = line_buf.cursor;
+               }
+ 
+               end_cursor = line_buf.cursor;
+               if (line_buf.cursor >= line_buf.len)
+                       break;
+               c = line_buf.data[line_buf.cursor++];
+               /* 
+                * unquoted field delimiter 
+                */
+               if (!in_quote && c == delimc)
+               {
+                       *result = NORMAL_ATTR;
+                       break;
+               }
+               /* 
+                * start of quoted field (or part of field) 
+                */
+               if (!in_quote && c == quotec)
+               {
+                       saw_quote = true;
+                       in_quote = true;
+                       continue;
+               }
+               /* 
+                * escape within a quoted field
+                */
+               if (in_quote && c == escapec)
+               {
+                       /* 
+                        * peek at the next char if available, and escape it if it
+                        * is an escape char or a quote char
+                        */
+                       if (line_buf.cursor <= line_buf.len)
+                       {
+                               char nextc = line_buf.data[line_buf.cursor];
+                               if (nextc == escapec || nextc == quotec)
+                               {
+                                       appendStringInfoCharMacro(&attribute_buf, 
nextc);
+                                       line_buf.cursor++;
+                                       continue;
+                               }
+                       }
+               } 
+               /*
+                * end of quoted field. 
+                * Must do this test after testing for escape in case quote char
+                * and escape char are the same (which is the common case).
+                */
+               if(in_quote && c == quotec)
+               {
+                       in_quote = false;
+                       continue;
+               }
+               appendStringInfoCharMacro(&attribute_buf, c);
+       }
+ 
+       if (in_quote)
+               *result = UNTERMINATED_FIELD;
+ 
+       /* check whether raw input matched null marker */
+       input_len = end_cursor - start_cursor;
+       if (!saw_quote && input_len == strlen(null_print) &&
                strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0)
                *isnull = true;
        else
---------------------------(end of broadcast)---------------------------
TIP 6: Have you searched our list archives?

               http://archives.postgresql.org

Reply via email to