Here's a patch to speed up CopyReadAttributesCSV. On the test case I've
been playing with, loading the TPC-H partsupp table, about 20%
CopyReadAttributesCSV (inlined into DoCopy, DoCopy itself is insignificant):
samples % image name symbol name
8136 25.8360 postgres CopyReadLine
6350 20.1645 postgres DoCopy
2181 6.9258 postgres pg_verify_mbstr_len
2157 6.8496 reiserfs (no symbols)
1668 5.2968 libc-2.7.so memcpy
1142 3.6264 libc-2.7.so ____strtod_l_internal
951 3.0199 postgres heap_formtuple
904 2.8707 libc-2.7.so ____strtol_l_internal
619 1.9656 libc-2.7.so memset
442 1.4036 libc-2.7.so strlen
341 1.0828 postgres hash_any
329 1.0447 postgres pg_atoi
300 0.9527 postgres AllocSetAlloc
With this patch, the usage of that function goes down to ~13%
samples % image name symbol name
7191 28.7778 postgres CopyReadLine
3257 13.0343 postgres DoCopy
2127 8.5121 reiserfs (no symbols)
1914 7.6597 postgres pg_verify_mbstr_len
1413 5.6547 libc-2.7.so memcpy
920 3.6818 libc-2.7.so ____strtod_l_internal
784 3.1375 libc-2.7.so ____strtol_l_internal
745 2.9814 postgres heap_formtuple
508 2.0330 libc-2.7.so memset
398 1.5928 libc-2.7.so strlen
315 1.2606 postgres hash_any
255 1.0205 postgres AllocSetAlloc
The trick is to split the loop in CopyReadAttributesCSV into two parts,
inside quotes, and outside quotes, saving some instructions in both
parts.
Your mileage may vary, but I'm quite happy with this. I haven't tested
it much yet, but I wouldn't expect it to be a loss in any interesting
scenario. The code also doesn't look much worse after the patch, perhaps
even better.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.295
diff -c -r1.295 copy.c
*** src/backend/commands/copy.c 1 Jan 2008 19:45:48 -0000 1.295
--- src/backend/commands/copy.c 29 Feb 2008 20:57:09 -0000
***************
*** 2913,2919 ****
for (;;)
{
bool found_delim = false;
- bool in_quote = false;
bool saw_quote = false;
char *start_ptr;
char *end_ptr;
--- 3146,3151 ----
***************
*** 2934,3000 ****
{
char c;
! end_ptr = cur_ptr;
! if (cur_ptr >= line_end_ptr)
! break;
! c = *cur_ptr++;
! /* unquoted field delimiter */
! if (c == delimc && !in_quote)
{
! found_delim = true;
! break;
! }
! /* start of quoted field (or part of field) */
! if (c == quotec && !in_quote)
! {
! saw_quote = true;
! in_quote = true;
! continue;
}
! /* escape within a quoted field */
! if (c == escapec && in_quote)
{
! /*
! * peek at the next char if available, and escape it if it is
! * an escape char or a quote char
! */
! if (cur_ptr < line_end_ptr)
! {
! char nextc = *cur_ptr;
! if (nextc == escapec || nextc == quotec)
{
! *output_ptr++ = nextc;
! cur_ptr++;
! continue;
}
}
! }
! /*
! * end of quoted field. Must do this test after testing for escape
! * in case quote char and escape char are the same (which is the
! * common case).
! */
! if (c == quotec && in_quote)
! {
! in_quote = false;
! continue;
}
-
- /* Add c to output string */
- *output_ptr++ = c;
}
/* Terminate attribute value in output area */
*output_ptr++ = '\0';
- /* Shouldn't still be in quote mode */
- if (in_quote)
- ereport(ERROR,
- (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
- errmsg("unterminated CSV quoted field")));
-
/* Check whether raw input matched null marker */
input_len = end_ptr - start_ptr;
if (!saw_quote && input_len == cstate->null_print_len &&
--- 3166,3241 ----
{
char c;
! /* Not in quote */
! for (;;)
{
! end_ptr = cur_ptr;
! if (cur_ptr >= line_end_ptr)
! goto endfield;
! c = *cur_ptr++;
! /* unquoted field delimiter */
! if (c == delimc)
! {
! found_delim = true;
! goto endfield;
! }
! /* start of quoted field (or part of field) */
! if (c == quotec)
! {
! saw_quote = true;
! break;
! }
! /* Add c to output string */
! *output_ptr++ = c;
}
!
! /* In quote */
! for (;;)
{
! end_ptr = cur_ptr;
! if (cur_ptr >= line_end_ptr)
! ereport(ERROR,
! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
! errmsg("unterminated CSV quoted field")));
! c = *cur_ptr++;
!
! /* escape within a quoted field */
! if (c == escapec)
! {
! /*
! * peek at the next char if available, and escape it if it is
! * an escape char or a quote char
! */
! if (cur_ptr < line_end_ptr)
{
! char nextc = *cur_ptr;
!
! if (nextc == escapec || nextc == quotec)
! {
! *output_ptr++ = nextc;
! cur_ptr++;
! continue;
! }
}
}
! /*
! * end of quoted field. Must do this test after testing for escape
! * in case quote char and escape char are the same (which is the
! * common case).
! */
! if (c == quotec)
! break;
! /* Add c to output string */
! *output_ptr++ = c;
}
}
+ endfield:
/* Terminate attribute value in output area */
*output_ptr++ = '\0';
/* Check whether raw input matched null marker */
input_len = end_ptr - start_ptr;
if (!saw_quote && input_len == cstate->null_print_len &&
---------------------------(end of broadcast)---------------------------
TIP 5: don't forget to increase your free space map settings