Andrew Dunstan wrote:
Bruce Momjian said:
Shame we had to duplicate CopyReadLine() in a sense.
If you can find a clean way to merge them please do - I'll be very grateful. My head started to spin when I tried. In general I dislike having more than 2 or 2 levels of logic in a given piece of code.
Previous comment courtesy clumsy fingers and the Department of Redundancy Department (of course, I meant 2 or 3).
Anyway, please review this patch for copy.c - it's possibly more to your taste. It's less redundant, but I'm not sure it's clearer.
cheers
andrew
*** copy.c.orig Mon Feb 21 23:12:41 2005
--- copy.c Mon Feb 21 23:35:22 2005
***************
*** 98,104 ****
static EolType eol_type; /* EOL type of input */
static int client_encoding; /* remote side's character encoding */
static int server_encoding; /* local encoding */
- static bool embedded_line_warning;
/* these are just for error messages, see copy_in_error_callback */
static bool copy_binary; /* is it a binary copy? */
--- 98,103 ----
***************
*** 139,145 ****
static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
List *force_notnull_atts);
! static bool CopyReadLine(void);
static char *CopyReadAttribute(const char *delim, const char *null_print,
CopyReadResult *result, bool *isnull);
static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
--- 138,144 ----
static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
List *force_notnull_atts);
! static bool CopyReadLine(char * quote, char * escape);
static char *CopyReadAttribute(const char *delim, const char *null_print,
CopyReadResult *result, bool *isnull);
static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1191,1197 ****
attr = tupDesc->attrs;
num_phys_attrs = tupDesc->natts;
attr_count = list_length(attnumlist);
- embedded_line_warning = false;
/*
* Get info about the columns we need to process.
--- 1190,1195 ----
***************
*** 1718,1724 ****
ListCell *cur;
/* Actually read the line into memory here */
! done = CopyReadLine();
/*
* EOF at start of line means we're done. If we see EOF after
--- 1716,1723 ----
ListCell *cur;
/* Actually read the line into memory here */
! done = csv_mode ?
! CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);
/*
* EOF at start of line means we're done. If we see EOF after
***************
*** 2006,2012 ****
* by newline.
*/
static bool
! CopyReadLine(void)
{
bool result;
bool change_encoding = (client_encoding != server_encoding);
--- 2005,2011 ----
* by newline.
*/
static bool
! CopyReadLine(char * quote, char * escape)
{
bool result;
bool change_encoding = (client_encoding != server_encoding);
***************
*** 2015,2020 ****
--- 2014,2032 ----
int j;
unsigned char s[2];
char *cvt;
+ bool in_quote = false, last_was_esc = false, csv_mode = false;
+ char quotec = '\0', escapec = '\0';
+
+ if (quote)
+ {
+ csv_mode = true;
+ quotec = quote[0];
+ escapec = escape[0];
+ /* ignore special escape processing if it's the same as quotec */
+ if (quotec == escapec)
+ escapec = '\0';
+ }
+
s[1] = 0;
***************
*** 2031,2041 ****
/*
* In this loop we only care for detecting newlines (\r and/or \n) and
! * the end-of-copy marker (\.). For backwards compatibility we allow
* backslashes to escape newline characters. Backslashes other than
* the end marker get put into the line_buf, since CopyReadAttribute
! * does its own escape processing. These four characters, and only
! * these four, are assumed the same in frontend and backend encodings.
* We do not assume that second and later bytes of a frontend
* multibyte character couldn't look like ASCII characters.
*/
--- 2043,2062 ----
/*
* In this loop we only care for detecting newlines (\r and/or \n) and
! * the end-of-copy marker (\.).
! *
! * In Text mode, for backwards compatibility we allow
* backslashes to escape newline characters. Backslashes other than
* the end marker get put into the line_buf, since CopyReadAttribute
! * does its own escape processing.
! *
! * In CSV mode, CR and NL inside a quoted field are just part of the
! * data value and are put in line_buf. We keep just enough state
! * to know if we are currently in a quoted field or not.
! *
! * These four characters, and only these four, are assumed the same in
! * frontend and backend encodings.
! *
* We do not assume that second and later bytes of a frontend
* multibyte character couldn't look like ASCII characters.
*/
***************
*** 2047,2059 ****
result = true;
break;
}
! if (c == '\r')
{
if (eol_type == EOL_NL)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("literal carriage return found in data"),
! errhint("Use \"\\r\" to represent carriage return.")));
/* Check for \r\n on first line, _and_ handle \r\n. */
if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
{
--- 2068,2116 ----
result = true;
break;
}
!
! if (csv_mode)
! {
! /*
! * Dealing with quotes and escapes here is mildly tricky. If the
! * quote char is also the escape char, there's no problem - we
! * just use the char as a toggle. If they are different, we need
! * to ensure that we only take account of an escape inside a quoted
! * field and immediately preceding a quote char, and not the
! * second in an escape-escape sequence.
! */
!
! if (in_quote && c == escapec)
! last_was_esc = ! last_was_esc;
! if (c == quotec && ! last_was_esc)
! in_quote = ! in_quote;
! if (c != escapec)
! last_was_esc = false;
!
! /*
! * updating the line count for embedded CR and/or LF chars is
! * necessarily a little fragile - this test is probably about
! * the best we can do.
! */
! if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
! copy_lineno++;
! }
!
! if (!in_quote && c == '\r')
{
if (eol_type == EOL_NL)
! {
! if (! csv_mode)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("literal carriage return found in data"),
! errhint("Use \"\\r\" to represent carriage return.")));
! else
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("unquoted carriage return found in CSV data"),
! errhint("Use quoted CSV field to represent carriage return.")));
! }
/* Check for \r\n on first line, _and_ handle \r\n. */
if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
{
***************
*** 2068,2077 ****
{
/* found \r, but no \n */
if (eol_type == EOL_CRNL)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("literal carriage return found in data"),
! errhint("Use \"\\r\" to represent carriage return.")));
/*
* if we got here, it is the first line and we didn't
--- 2125,2143 ----
{
/* found \r, but no \n */
if (eol_type == EOL_CRNL)
! {
! if (!csv_mode)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("literal carriage return found in data"),
! errhint("Use \"\\r\" to represent carriage return.")));
! else
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("unquoted carriage return found in data"),
! errhint("Use quoted CSV field to represent carriage return.")));
!
! }
/*
* if we got here, it is the first line and we didn't
***************
*** 2083,2108 ****
}
break;
}
! if (c == '\n')
{
if (eol_type == EOL_CR || eol_type == EOL_CRNL)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("literal newline found in data"),
! errhint("Use \"\\n\" to represent newline.")));
eol_type = EOL_NL;
break;
}
! if (c == '\\')
{
! c = CopyGetChar();
! if (c == EOF)
{
result = true;
break;
}
! if (c == '.')
{
if (eol_type == EOL_CRNL)
{
c = CopyGetChar();
--- 2149,2195 ----
}
break;
}
! if (!in_quote && c == '\n')
{
if (eol_type == EOL_CR || eol_type == EOL_CRNL)
! {
! if (!csv_mode)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("literal newline found in data"),
! errhint("Use \"\\n\" to represent newline.")));
! else
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("unquoted newline found in data"),
! errhint("Use quoted CSV field to represent newline.")));
!
! }
eol_type = EOL_NL;
break;
}
!
! if ((line_buf.len == 0 || !csv_mode) && c == '\\')
{
! int c2;
!
! if (csv_mode)
! c2 = CopyPeekChar();
! else
! c2 = c = CopyGetChar();
!
! if (c2 == EOF)
{
result = true;
+ if (csv_mode)
+ CopyDonePeek(c2, true);
break;
}
! if (c2 == '.')
{
+ if (csv_mode)
CopyDonePeek(c2, true); /* allow us to keep calling GetChar() */
+
if (eol_type == EOL_CRNL)
{
c = CopyGetChar();
***************
*** 2140,2147 ****
result = true; /* report EOF */
break;
}
! /* not EOF mark, so emit \ and following char literally */
! appendStringInfoCharMacro(&line_buf, '\\');
}
appendStringInfoCharMacro(&line_buf, c);
--- 2227,2238 ----
result = true; /* report EOF */
break;
}
!
! if (csv_mode)
! CopyDonePeek(c2, false); /* not a dot, so put it back */
! else
! /* not EOF mark, so emit \ and following char literally */
! appendStringInfoCharMacro(&line_buf, '\\');
}
appendStringInfoCharMacro(&line_buf, c);
***************
*** 2369,2402 ****
for (;;)
{
- /* handle multiline quoted fields */
- if (in_quote && line_buf.cursor >= line_buf.len)
- {
- bool done;
-
- switch (eol_type)
- {
- case EOL_NL:
- appendStringInfoString(&attribute_buf, "\n");
- break;
- case EOL_CR:
- appendStringInfoString(&attribute_buf, "\r");
- break;
- case EOL_CRNL:
- appendStringInfoString(&attribute_buf, "\r\n");
- break;
- case EOL_UNKNOWN:
- /* shouldn't happen - just keep going */
- break;
- }
-
- copy_lineno++;
- done = CopyReadLine();
- if (done && line_buf.len == 0)
- break;
- start_cursor = line_buf.cursor;
- }
-
end_cursor = line_buf.cursor;
if (line_buf.cursor >= line_buf.len)
break;
--- 2460,2465 ----
***************
*** 2629,2653 ****
!use_quote && (c = *test_string) != '\0';
test_string += mblen)
{
- /*
- * We don't know here what the surrounding line end characters
- * might be. It might not even be under postgres' control. So
- * we simple warn on ANY embedded line ending character.
- *
- * This warning will disappear when we make line parsing field-aware,
- * so that we can reliably read in embedded line ending characters
- * regardless of the file's line-end context.
- *
- */
-
- if (!embedded_line_warning && (c == '\n' || c == '\r') )
- {
- embedded_line_warning = true;
- elog(WARNING,
- "CSV fields with embedded linefeed or carriage return "
- "characters might not be able to be reimported");
- }
-
if (c == delimc || c == quotec || c == '\n' || c == '\r')
use_quote = true;
if (!same_encoding)
--- 2692,2697 ----
---------------------------(end of broadcast)---------------------------
TIP 3: if posting/reading through Usenet, please send an appropriate
subscribe-nomail command to [EMAIL PROTECTED] so that your
message can get through to the mailing list cleanly
