Re: [PATCHES] CopyReadLineText optimization

Heikki Linnakangas Mon, 10 Mar 2008 09:09:34 -0700

Andrew Dunstan wrote:

Another question that occurred to me - did you try using strpbrk() to look for the next interesting character rather than your homegrown searcher gadget? If so, how did that perform?


It looks like strpbrk() performs poorly:

unpatched:
 testname |  min duration
----------+-----------------
 all      | 00:00:08.099656
 1/2      | 00:00:06.734241
 1/4      | 00:00:06.016946
 1/8      | 00:00:05.622122
 1/16     | 00:00:05.304252
 none     | 00:00:05.155755
(6 rows)

strpbrk:

 testname |  min duration
----------+-----------------
 all      | 00:00:22.980997
 1/2      | 00:00:13.724834
 1/4      | 00:00:08.980246
 1/8      | 00:00:06.582357
 1/16     | 00:00:05.291485
 none     | 00:00:06.239468
(6 rows)

memchr:

 testname |  min duration
----------+-----------------
 all      | 00:00:13.684479
 1/2      | 00:00:09.509213
 1/4      | 00:00:06.921959
 1/8      | 00:00:05.654761
 1/16     | 00:00:04.719027
 none     | 00:00:03.718361
(6 rows)

Attached is the test script and patches I used, if someone wants to test this on another platform.


--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com

Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.295
diff -c -r1.295 copy.c
*** src/backend/commands/copy.c	1 Jan 2008 19:45:48 -0000	1.295
--- src/backend/commands/copy.c	5 Mar 2008 14:44:58 -0000
***************
*** 67,72 ****
--- 67,275 ----
  	EOL_CRNL
  } EolType;
  
+ 
+ /***** Fast byte searcher functions *****
+  *
+  * CopyReadLine needs to search for newlines, as well as quoting and escape
+  * characters to determine which newlines are real and which ones are escaped.
+  * Doing that in the naive way, looping through the input byte by byte and
+  * comparing against the interesting characters, can be slow.
+  *
+  * To speed that up, we provide a "searcher" interface, that can be used to
+  * search a piece of memory for up to 4 different bytes simultaneously. It's
+  * similar to memchr, but allows searching for multiple chars at the same 
+  * time.
+  * 
+  * To start a search on a new block of memory, call init_searcher. Then call
+  * next_searcher repeatedly to return all the occurrances of the bytes being
+  * searched for.
+  *
+  * The searcher implementation uses memchr to search for the bytes, and keeps
+  * track of where the next occurrance of each is. Using memchr allows us
+  * to take advantage of any platform-specific optimizations.
+  */
+ 
+ /*
+  * Struct used to store state of the current search between calls to 
+  * next_searcher. Initialized by init_search.
+  */
+ typedef struct {
+ 	/* Each element in c corresponds the element in s. These are sorted
+ 	 * by the pointers (ptr) */
+ 	int n;		/* elements used in the arrays */
+ 	char c[4];	/* bytes we're looking for */
+ 	char *ptr[4]; /* pointers to next occurances of the bytes */
+ 	char *end;	/* end of block being searched. Last byte to search is end-1 */
+ } searcher;
+ 
+ /* Functions internal to "searcher" */
+ 
+ #define swap_searcher_entries(searcher, a, b) do { \
+ 	char tmpc = (searcher)->c[a]; \
+ 	char *tmpptr = (searcher)->ptr[a]; \
+ 	(searcher)->c[a] = (searcher)->c[b]; \
+ 	(searcher)->ptr[a] = (searcher)->ptr[b]; \
+ 	(searcher)->c[b] = tmpc; \
+ 	(searcher)->ptr[b] = tmpptr; \
+ } while(0)
+ 
+ static void
+ sort_searcher(searcher *s)
+ {
+ 	switch(s->n)
+ 	{
+ 		case 0:
+ 		case 1:
+ 			/* Nothing to do */
+ 			break;
+ 		case 2:
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			break;
+ 		case 3:
+ 			if (s->ptr[0] > s->ptr[2])
+ 				swap_searcher_entries(s, 0, 2);
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			if (s->ptr[1] > s->ptr[2])
+ 				swap_searcher_entries(s, 1, 2);
+ 			break;
+ 		case 4:
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			if (s->ptr[2] > s->ptr[3])
+ 				swap_searcher_entries(s, 2, 3);
+ 			if (s->ptr[1] > s->ptr[2])
+ 				swap_searcher_entries(s, 2, 3);
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			if (s->ptr[2] > s->ptr[3])
+ 				swap_searcher_entries(s, 2, 3);
+ 			break;
+ 	}
+ }
+ 
+ /* Remove the topmost element */
+ static void
+ remove_top(searcher *searcher)
+ {
+ 	int i;
+ 
+ 	searcher->n--;
+ 
+ 	/* Move up the remaining items */
+ 	for (i = 0; i < searcher->n; i++)
+ 	{
+ 		searcher->c[i] = searcher->c[i + 1];
+ 		searcher->ptr[i] = searcher->ptr[i + 1];
+ 	}
+ }
+ 
+ /*
+  * The first element in the list has been replaced by the caller.
+  * Move it to the right place in the list, to keep it sorted.
+  */
+ static void
+ sift_down(searcher *searcher)
+ {
+ 	if (searcher->n < 2 || searcher->ptr[0] <= searcher->ptr[1])
+ 		return;
+ 	swap_searcher_entries(searcher, 0, 1);
+ 
+ 	if (searcher->n < 3 || searcher->ptr[1] <= searcher->ptr[2])
+ 		return;
+ 	swap_searcher_entries(searcher, 1, 2);
+ 
+ 	if (searcher->n < 4 || searcher->ptr[2] <= searcher->ptr[3])
+ 		return;
+ 	swap_searcher_entries(searcher, 2, 3);
+ }
+ 
+ /*
+  * Starts a new search in a block of memory.
+  *
+  *  searcher	- for storing state between calls. Opaque to caller, 
+  *				  init_searcher will initialize this.
+  *  str			- block of memory to search
+  *  len			- length of str
+  *  c			- bytes to search for, max 4.
+  *  n			- number of bytes in c
+  */
+ static void
+ init_searcher(searcher *searcher, char *str, int len, char c[4], int n)
+ {
+ 	int i;
+ 
+ 	Assert(n <= 4);
+ 
+ 	searcher->n = 0;
+ 	searcher->end = str + len;
+ 
+ 	/* Find first occurances of the searched-for bytes */
+ 	for (i = 0; i < n; i++)
+ 	{
+ 		char *hit = memchr(str, c[i], len);
+ 		if (hit != NULL)
+ 		{
+ 			searcher->c[searcher->n] = c[i];
+ 			searcher->ptr[searcher->n] = hit;
+ 			searcher->n++;
+ 		}
+ 		/*
+ 		 * If the byte doesn't occur in the string at all, we don't
+ 		 * need to put it in the list.
+ 		 */
+ 	}
+ 	sort_searcher(searcher);
+ }
+ 
+ /*
+  * Look for the next interesting character 
+  *
+  * searcher - state, opaque to caller
+  * str		- block to search.
+  *
+  * 'str' must be part of the block passed to init_searcher, and >=
+  * the return value of last call to next_searcher.
+  *
+  * Returns the offset of the next interesting location, relative to 'ss'
+  */
+ static char *
+ next_searcher(searcher *searcher, char *str)
+ {
+ 	Assert(str < searcher->end);
+ 
+ 	if (searcher->n == 0)
+ 		return NULL;
+ 
+ 	/*
+ 	 * If the caller skipped over the next pointer we had memorized, we have to
+ 	 * search for the one after that.
+ 	 */
+ 	while (str > searcher->ptr[0])
+ 	{
+ 		char *s = memchr(str, searcher->c[0], searcher->end - str);
+ 
+ 		if (s == NULL)
+ 		{
+ 			remove_top(searcher);
+ 			if (searcher->n == 0)
+ 				return NULL;
+ 		}
+ 		else
+ 		{
+ 			searcher->ptr[0] = s;
+ 			sift_down(searcher);
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Return the next interesting location we had memorized.
+ 	 */
+ 	return searcher->ptr[0];
+ }
+ 
+ 
  /*
   * This struct contains all the state variables used throughout a COPY
   * operation. For simplicity, we use the same struct for all variants of COPY,
***************
*** 159,164 ****
--- 362,369 ----
  	char	   *raw_buf;
  	int			raw_buf_index;	/* next byte to process */
  	int			raw_buf_len;	/* total # of bytes stored */
+ 
+ 	searcher	searcher;
  } CopyStateData;
  
  typedef CopyStateData *CopyState;
***************
*** 2285,2299 ****
--- 2490,2516 ----
  				last_was_esc = false;
  	char		quotec = '\0';
  	char		escapec = '\0';
+ 	char		search_chars[4];
+ 	int			nsearch_chars = 0;
+ 
+ 	search_chars[nsearch_chars++] = '\n';
+ 	search_chars[nsearch_chars++] = '\r';
  
  	if (cstate->csv_mode)
  	{
  		quotec = cstate->quote[0];
  		escapec = cstate->escape[0];
+ 
+ 		search_chars[nsearch_chars++] = quotec;
+ 
  		/* ignore special escape processing if it's the same as quotec */
  		if (quotec == escapec)
  			escapec = '\0';
+ 		else
+ 			search_chars[nsearch_chars++] = escapec;
  	}
+ 	else
+ 		search_chars[nsearch_chars++] = '\\';
  
  	mblen_str[1] = '\0';
  
***************
*** 2360,2368 ****
--- 2577,2603 ----
  				break;
  			}
  			need_data = false;
+ 
+ 			if (!cstate->encoding_embeds_ascii)
+ 				init_searcher(&cstate->searcher, copy_raw_buf, copy_buf_len,
+ 							  search_chars, nsearch_chars);
  		}
  
  		/* OK to fetch a character */
+ 		if ((!cstate->csv_mode || !first_char_in_line)
+ 			&& !cstate->encoding_embeds_ascii
+ 			&& (raw_buf_ptr + 4 <= copy_buf_len))
+ 		{
+ 			char *next = next_searcher(&cstate->searcher, 
+ 									   &copy_raw_buf[raw_buf_ptr]);
+ 
+ 			if (next != NULL)
+ 			{
+ 				raw_buf_ptr = next - copy_raw_buf;
+ 				first_char_in_line = false;
+ 			}
+ 		}
+ 
  		prev_raw_ptr = raw_buf_ptr;
  		c = copy_raw_buf[raw_buf_ptr++];

Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.296
diff -c -r1.296 copy.c
*** src/backend/commands/copy.c	8 Mar 2008 01:16:26 -0000	1.296
--- src/backend/commands/copy.c	9 Mar 2008 09:34:13 -0000
***************
*** 2285,2299 ****
--- 2285,2311 ----
  				last_was_esc = false;
  	char		quotec = '\0';
  	char		escapec = '\0';
+ 	char		search_chars[5] = { 0, 0, 0, 0, 0 };
+ 	int			nsearch_chars = 0;
+ 
+ 	search_chars[nsearch_chars++] = '\n';
+ 	search_chars[nsearch_chars++] = '\r';
  
  	if (cstate->csv_mode)
  	{
  		quotec = cstate->quote[0];
  		escapec = cstate->escape[0];
+ 
+ 		search_chars[nsearch_chars++] = quotec;
+ 
  		/* ignore special escape processing if it's the same as quotec */
  		if (quotec == escapec)
  			escapec = '\0';
+ 		else
+ 			search_chars[nsearch_chars++] = escapec;
  	}
+ 	else
+ 		search_chars[nsearch_chars++] = '\\';
  
  	mblen_str[1] = '\0';
  
***************
*** 2363,2368 ****
--- 2375,2394 ----
  		}
  
  		/* OK to fetch a character */
+ 		if ((!cstate->csv_mode || !first_char_in_line)
+ 			&& !cstate->encoding_embeds_ascii
+ 			&& (raw_buf_ptr + 4 <= copy_buf_len))
+ 		{
+ 			
+ 			char *next = strpbrk(&copy_raw_buf[raw_buf_ptr], search_chars);
+ 
+ 			if (next != NULL)
+ 			{
+ 				raw_buf_ptr = next - copy_raw_buf;
+ 				first_char_in_line = false;
+ 			}
+ 		}
+ 
  		prev_raw_ptr = raw_buf_ptr;
  		c = copy_raw_buf[raw_buf_ptr++];

copy-script.sh
Description: application/shellscript

-- 
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-patches

Re: [PATCHES] CopyReadLineText optimization

Reply via email to