Andrew Dunstan wrote:
Another question that occurred to me - did you try using strpbrk() to look for the next interesting character rather than your homegrown searcher gadget? If so, how did that perform?
It looks like strpbrk() performs poorly: unpatched: testname | min duration ----------+----------------- all | 00:00:08.099656 1/2 | 00:00:06.734241 1/4 | 00:00:06.016946 1/8 | 00:00:05.622122 1/16 | 00:00:05.304252 none | 00:00:05.155755 (6 rows) strpbrk: testname | min duration ----------+----------------- all | 00:00:22.980997 1/2 | 00:00:13.724834 1/4 | 00:00:08.980246 1/8 | 00:00:06.582357 1/16 | 00:00:05.291485 none | 00:00:06.239468 (6 rows) memchr: testname | min duration ----------+----------------- all | 00:00:13.684479 1/2 | 00:00:09.509213 1/4 | 00:00:06.921959 1/8 | 00:00:05.654761 1/16 | 00:00:04.719027 none | 00:00:03.718361 (6 rows)Attached is the test script and patches I used, if someone wants to test this on another platform.
-- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.295
diff -c -r1.295 copy.c
*** src/backend/commands/copy.c 1 Jan 2008 19:45:48 -0000 1.295
--- src/backend/commands/copy.c 5 Mar 2008 14:44:58 -0000
***************
*** 67,72 ****
--- 67,275 ----
EOL_CRNL
} EolType;
+
+ /***** Fast byte searcher functions *****
+ *
+ * CopyReadLine needs to search for newlines, as well as quoting and escape
+ * characters to determine which newlines are real and which ones are escaped.
+ * Doing that in the naive way, looping through the input byte by byte and
+ * comparing against the interesting characters, can be slow.
+ *
+ * To speed that up, we provide a "searcher" interface, that can be used to
+ * search a piece of memory for up to 4 different bytes simultaneously. It's
+ * similar to memchr, but allows searching for multiple chars at the same
+ * time.
+ *
+ * To start a search on a new block of memory, call init_searcher. Then call
+ * next_searcher repeatedly to return all the occurrances of the bytes being
+ * searched for.
+ *
+ * The searcher implementation uses memchr to search for the bytes, and keeps
+ * track of where the next occurrance of each is. Using memchr allows us
+ * to take advantage of any platform-specific optimizations.
+ */
+
+ /*
+ * Struct used to store state of the current search between calls to
+ * next_searcher. Initialized by init_search.
+ */
+ typedef struct {
+ /* Each element in c corresponds the element in s. These are sorted
+ * by the pointers (ptr) */
+ int n; /* elements used in the arrays */
+ char c[4]; /* bytes we're looking for */
+ char *ptr[4]; /* pointers to next occurances of the bytes */
+ char *end; /* end of block being searched. Last byte to search is end-1 */
+ } searcher;
+
+ /* Functions internal to "searcher" */
+
+ #define swap_searcher_entries(searcher, a, b) do { \
+ char tmpc = (searcher)->c[a]; \
+ char *tmpptr = (searcher)->ptr[a]; \
+ (searcher)->c[a] = (searcher)->c[b]; \
+ (searcher)->ptr[a] = (searcher)->ptr[b]; \
+ (searcher)->c[b] = tmpc; \
+ (searcher)->ptr[b] = tmpptr; \
+ } while(0)
+
+ static void
+ sort_searcher(searcher *s)
+ {
+ switch(s->n)
+ {
+ case 0:
+ case 1:
+ /* Nothing to do */
+ break;
+ case 2:
+ if (s->ptr[0] > s->ptr[1])
+ swap_searcher_entries(s, 0, 1);
+ break;
+ case 3:
+ if (s->ptr[0] > s->ptr[2])
+ swap_searcher_entries(s, 0, 2);
+ if (s->ptr[0] > s->ptr[1])
+ swap_searcher_entries(s, 0, 1);
+ if (s->ptr[1] > s->ptr[2])
+ swap_searcher_entries(s, 1, 2);
+ break;
+ case 4:
+ if (s->ptr[0] > s->ptr[1])
+ swap_searcher_entries(s, 0, 1);
+ if (s->ptr[2] > s->ptr[3])
+ swap_searcher_entries(s, 2, 3);
+ if (s->ptr[1] > s->ptr[2])
+ swap_searcher_entries(s, 2, 3);
+ if (s->ptr[0] > s->ptr[1])
+ swap_searcher_entries(s, 0, 1);
+ if (s->ptr[2] > s->ptr[3])
+ swap_searcher_entries(s, 2, 3);
+ break;
+ }
+ }
+
+ /* Remove the topmost element */
+ static void
+ remove_top(searcher *searcher)
+ {
+ int i;
+
+ searcher->n--;
+
+ /* Move up the remaining items */
+ for (i = 0; i < searcher->n; i++)
+ {
+ searcher->c[i] = searcher->c[i + 1];
+ searcher->ptr[i] = searcher->ptr[i + 1];
+ }
+ }
+
+ /*
+ * The first element in the list has been replaced by the caller.
+ * Move it to the right place in the list, to keep it sorted.
+ */
+ static void
+ sift_down(searcher *searcher)
+ {
+ if (searcher->n < 2 || searcher->ptr[0] <= searcher->ptr[1])
+ return;
+ swap_searcher_entries(searcher, 0, 1);
+
+ if (searcher->n < 3 || searcher->ptr[1] <= searcher->ptr[2])
+ return;
+ swap_searcher_entries(searcher, 1, 2);
+
+ if (searcher->n < 4 || searcher->ptr[2] <= searcher->ptr[3])
+ return;
+ swap_searcher_entries(searcher, 2, 3);
+ }
+
+ /*
+ * Starts a new search in a block of memory.
+ *
+ * searcher - for storing state between calls. Opaque to caller,
+ * init_searcher will initialize this.
+ * str - block of memory to search
+ * len - length of str
+ * c - bytes to search for, max 4.
+ * n - number of bytes in c
+ */
+ static void
+ init_searcher(searcher *searcher, char *str, int len, char c[4], int n)
+ {
+ int i;
+
+ Assert(n <= 4);
+
+ searcher->n = 0;
+ searcher->end = str + len;
+
+ /* Find first occurances of the searched-for bytes */
+ for (i = 0; i < n; i++)
+ {
+ char *hit = memchr(str, c[i], len);
+ if (hit != NULL)
+ {
+ searcher->c[searcher->n] = c[i];
+ searcher->ptr[searcher->n] = hit;
+ searcher->n++;
+ }
+ /*
+ * If the byte doesn't occur in the string at all, we don't
+ * need to put it in the list.
+ */
+ }
+ sort_searcher(searcher);
+ }
+
+ /*
+ * Look for the next interesting character
+ *
+ * searcher - state, opaque to caller
+ * str - block to search.
+ *
+ * 'str' must be part of the block passed to init_searcher, and >=
+ * the return value of last call to next_searcher.
+ *
+ * Returns the offset of the next interesting location, relative to 'ss'
+ */
+ static char *
+ next_searcher(searcher *searcher, char *str)
+ {
+ Assert(str < searcher->end);
+
+ if (searcher->n == 0)
+ return NULL;
+
+ /*
+ * If the caller skipped over the next pointer we had memorized, we have to
+ * search for the one after that.
+ */
+ while (str > searcher->ptr[0])
+ {
+ char *s = memchr(str, searcher->c[0], searcher->end - str);
+
+ if (s == NULL)
+ {
+ remove_top(searcher);
+ if (searcher->n == 0)
+ return NULL;
+ }
+ else
+ {
+ searcher->ptr[0] = s;
+ sift_down(searcher);
+ }
+ }
+
+ /*
+ * Return the next interesting location we had memorized.
+ */
+ return searcher->ptr[0];
+ }
+
+
/*
* This struct contains all the state variables used throughout a COPY
* operation. For simplicity, we use the same struct for all variants of COPY,
***************
*** 159,164 ****
--- 362,369 ----
char *raw_buf;
int raw_buf_index; /* next byte to process */
int raw_buf_len; /* total # of bytes stored */
+
+ searcher searcher;
} CopyStateData;
typedef CopyStateData *CopyState;
***************
*** 2285,2299 ****
--- 2490,2516 ----
last_was_esc = false;
char quotec = '\0';
char escapec = '\0';
+ char search_chars[4];
+ int nsearch_chars = 0;
+
+ search_chars[nsearch_chars++] = '\n';
+ search_chars[nsearch_chars++] = '\r';
if (cstate->csv_mode)
{
quotec = cstate->quote[0];
escapec = cstate->escape[0];
+
+ search_chars[nsearch_chars++] = quotec;
+
/* ignore special escape processing if it's the same as quotec */
if (quotec == escapec)
escapec = '\0';
+ else
+ search_chars[nsearch_chars++] = escapec;
}
+ else
+ search_chars[nsearch_chars++] = '\\';
mblen_str[1] = '\0';
***************
*** 2360,2368 ****
--- 2577,2603 ----
break;
}
need_data = false;
+
+ if (!cstate->encoding_embeds_ascii)
+ init_searcher(&cstate->searcher, copy_raw_buf, copy_buf_len,
+ search_chars, nsearch_chars);
}
/* OK to fetch a character */
+ if ((!cstate->csv_mode || !first_char_in_line)
+ && !cstate->encoding_embeds_ascii
+ && (raw_buf_ptr + 4 <= copy_buf_len))
+ {
+ char *next = next_searcher(&cstate->searcher,
+ ©_raw_buf[raw_buf_ptr]);
+
+ if (next != NULL)
+ {
+ raw_buf_ptr = next - copy_raw_buf;
+ first_char_in_line = false;
+ }
+ }
+
prev_raw_ptr = raw_buf_ptr;
c = copy_raw_buf[raw_buf_ptr++];
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.296
diff -c -r1.296 copy.c
*** src/backend/commands/copy.c 8 Mar 2008 01:16:26 -0000 1.296
--- src/backend/commands/copy.c 9 Mar 2008 09:34:13 -0000
***************
*** 2285,2299 ****
--- 2285,2311 ----
last_was_esc = false;
char quotec = '\0';
char escapec = '\0';
+ char search_chars[5] = { 0, 0, 0, 0, 0 };
+ int nsearch_chars = 0;
+
+ search_chars[nsearch_chars++] = '\n';
+ search_chars[nsearch_chars++] = '\r';
if (cstate->csv_mode)
{
quotec = cstate->quote[0];
escapec = cstate->escape[0];
+
+ search_chars[nsearch_chars++] = quotec;
+
/* ignore special escape processing if it's the same as quotec */
if (quotec == escapec)
escapec = '\0';
+ else
+ search_chars[nsearch_chars++] = escapec;
}
+ else
+ search_chars[nsearch_chars++] = '\\';
mblen_str[1] = '\0';
***************
*** 2363,2368 ****
--- 2375,2394 ----
}
/* OK to fetch a character */
+ if ((!cstate->csv_mode || !first_char_in_line)
+ && !cstate->encoding_embeds_ascii
+ && (raw_buf_ptr + 4 <= copy_buf_len))
+ {
+
+ char *next = strpbrk(©_raw_buf[raw_buf_ptr], search_chars);
+
+ if (next != NULL)
+ {
+ raw_buf_ptr = next - copy_raw_buf;
+ first_char_in_line = false;
+ }
+ }
+
prev_raw_ptr = raw_buf_ptr;
c = copy_raw_buf[raw_buf_ptr++];
copy-script.sh
Description: application/shellscript
-- Sent via pgsql-patches mailing list ([email protected]) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-patches
