Heikki Linnakangas wrote:
Heikki Linnakangas wrote:
Attached is a patch that modifies CopyReadLineText so that it uses memchr to speed up the scan. The nice thing about memchr is that we can take advantage of any clever optimizations that might be in libc or compiler.


Here's an updated version of the patch. The principle is the same, but the same optimization is now used for CSV input as well, and there's more comments.

Another update attached: It occurred to me that the memchr approach is only safe for server encodings, where the non-first bytes of a multi-byte character always have the hi-bit set.

--
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.295
diff -c -r1.295 copy.c
*** src/backend/commands/copy.c	1 Jan 2008 19:45:48 -0000	1.295
--- src/backend/commands/copy.c	5 Mar 2008 14:44:58 -0000
***************
*** 67,72 ****
--- 67,275 ----
  	EOL_CRNL
  } EolType;
  
+ 
+ /***** Fast byte searcher functions *****
+  *
+  * CopyReadLine needs to search for newlines, as well as quoting and escape
+  * characters to determine which newlines are real and which ones are escaped.
+  * Doing that in the naive way, looping through the input byte by byte and
+  * comparing against the interesting characters, can be slow.
+  *
+  * To speed that up, we provide a "searcher" interface, that can be used to
+  * search a piece of memory for up to 4 different bytes simultaneously. It's
+  * similar to memchr, but allows searching for multiple chars at the same 
+  * time.
+  * 
+  * To start a search on a new block of memory, call init_searcher. Then call
+  * next_searcher repeatedly to return all the occurrances of the bytes being
+  * searched for.
+  *
+  * The searcher implementation uses memchr to search for the bytes, and keeps
+  * track of where the next occurrance of each is. Using memchr allows us
+  * to take advantage of any platform-specific optimizations.
+  */
+ 
+ /*
+  * Struct used to store state of the current search between calls to 
+  * next_searcher. Initialized by init_search.
+  */
+ typedef struct {
+ 	/* Each element in c corresponds the element in s. These are sorted
+ 	 * by the pointers (ptr) */
+ 	int n;		/* elements used in the arrays */
+ 	char c[4];	/* bytes we're looking for */
+ 	char *ptr[4]; /* pointers to next occurances of the bytes */
+ 	char *end;	/* end of block being searched. Last byte to search is end-1 */
+ } searcher;
+ 
+ /* Functions internal to "searcher" */
+ 
+ #define swap_searcher_entries(searcher, a, b) do { \
+ 	char tmpc = (searcher)->c[a]; \
+ 	char *tmpptr = (searcher)->ptr[a]; \
+ 	(searcher)->c[a] = (searcher)->c[b]; \
+ 	(searcher)->ptr[a] = (searcher)->ptr[b]; \
+ 	(searcher)->c[b] = tmpc; \
+ 	(searcher)->ptr[b] = tmpptr; \
+ } while(0)
+ 
+ static void
+ sort_searcher(searcher *s)
+ {
+ 	switch(s->n)
+ 	{
+ 		case 0:
+ 		case 1:
+ 			/* Nothing to do */
+ 			break;
+ 		case 2:
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			break;
+ 		case 3:
+ 			if (s->ptr[0] > s->ptr[2])
+ 				swap_searcher_entries(s, 0, 2);
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			if (s->ptr[1] > s->ptr[2])
+ 				swap_searcher_entries(s, 1, 2);
+ 			break;
+ 		case 4:
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			if (s->ptr[2] > s->ptr[3])
+ 				swap_searcher_entries(s, 2, 3);
+ 			if (s->ptr[1] > s->ptr[2])
+ 				swap_searcher_entries(s, 2, 3);
+ 			if (s->ptr[0] > s->ptr[1])
+ 				swap_searcher_entries(s, 0, 1);
+ 			if (s->ptr[2] > s->ptr[3])
+ 				swap_searcher_entries(s, 2, 3);
+ 			break;
+ 	}
+ }
+ 
+ /* Remove the topmost element */
+ static void
+ remove_top(searcher *searcher)
+ {
+ 	int i;
+ 
+ 	searcher->n--;
+ 
+ 	/* Move up the remaining items */
+ 	for (i = 0; i < searcher->n; i++)
+ 	{
+ 		searcher->c[i] = searcher->c[i + 1];
+ 		searcher->ptr[i] = searcher->ptr[i + 1];
+ 	}
+ }
+ 
+ /*
+  * The first element in the list has been replaced by the caller.
+  * Move it to the right place in the list, to keep it sorted.
+  */
+ static void
+ sift_down(searcher *searcher)
+ {
+ 	if (searcher->n < 2 || searcher->ptr[0] <= searcher->ptr[1])
+ 		return;
+ 	swap_searcher_entries(searcher, 0, 1);
+ 
+ 	if (searcher->n < 3 || searcher->ptr[1] <= searcher->ptr[2])
+ 		return;
+ 	swap_searcher_entries(searcher, 1, 2);
+ 
+ 	if (searcher->n < 4 || searcher->ptr[2] <= searcher->ptr[3])
+ 		return;
+ 	swap_searcher_entries(searcher, 2, 3);
+ }
+ 
+ /*
+  * Starts a new search in a block of memory.
+  *
+  *  searcher	- for storing state between calls. Opaque to caller, 
+  *				  init_searcher will initialize this.
+  *  str			- block of memory to search
+  *  len			- length of str
+  *  c			- bytes to search for, max 4.
+  *  n			- number of bytes in c
+  */
+ static void
+ init_searcher(searcher *searcher, char *str, int len, char c[4], int n)
+ {
+ 	int i;
+ 
+ 	Assert(n <= 4);
+ 
+ 	searcher->n = 0;
+ 	searcher->end = str + len;
+ 
+ 	/* Find first occurances of the searched-for bytes */
+ 	for (i = 0; i < n; i++)
+ 	{
+ 		char *hit = memchr(str, c[i], len);
+ 		if (hit != NULL)
+ 		{
+ 			searcher->c[searcher->n] = c[i];
+ 			searcher->ptr[searcher->n] = hit;
+ 			searcher->n++;
+ 		}
+ 		/*
+ 		 * If the byte doesn't occur in the string at all, we don't
+ 		 * need to put it in the list.
+ 		 */
+ 	}
+ 	sort_searcher(searcher);
+ }
+ 
+ /*
+  * Look for the next interesting character 
+  *
+  * searcher - state, opaque to caller
+  * str		- block to search.
+  *
+  * 'str' must be part of the block passed to init_searcher, and >=
+  * the return value of last call to next_searcher.
+  *
+  * Returns the offset of the next interesting location, relative to 'ss'
+  */
+ static char *
+ next_searcher(searcher *searcher, char *str)
+ {
+ 	Assert(str < searcher->end);
+ 
+ 	if (searcher->n == 0)
+ 		return NULL;
+ 
+ 	/*
+ 	 * If the caller skipped over the next pointer we had memorized, we have to
+ 	 * search for the one after that.
+ 	 */
+ 	while (str > searcher->ptr[0])
+ 	{
+ 		char *s = memchr(str, searcher->c[0], searcher->end - str);
+ 
+ 		if (s == NULL)
+ 		{
+ 			remove_top(searcher);
+ 			if (searcher->n == 0)
+ 				return NULL;
+ 		}
+ 		else
+ 		{
+ 			searcher->ptr[0] = s;
+ 			sift_down(searcher);
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Return the next interesting location we had memorized.
+ 	 */
+ 	return searcher->ptr[0];
+ }
+ 
+ 
  /*
   * This struct contains all the state variables used throughout a COPY
   * operation. For simplicity, we use the same struct for all variants of COPY,
***************
*** 159,164 ****
--- 362,369 ----
  	char	   *raw_buf;
  	int			raw_buf_index;	/* next byte to process */
  	int			raw_buf_len;	/* total # of bytes stored */
+ 
+ 	searcher	searcher;
  } CopyStateData;
  
  typedef CopyStateData *CopyState;
***************
*** 2285,2299 ****
--- 2490,2516 ----
  				last_was_esc = false;
  	char		quotec = '\0';
  	char		escapec = '\0';
+ 	char		search_chars[4];
+ 	int			nsearch_chars = 0;
+ 
+ 	search_chars[nsearch_chars++] = '\n';
+ 	search_chars[nsearch_chars++] = '\r';
  
  	if (cstate->csv_mode)
  	{
  		quotec = cstate->quote[0];
  		escapec = cstate->escape[0];
+ 
+ 		search_chars[nsearch_chars++] = quotec;
+ 
  		/* ignore special escape processing if it's the same as quotec */
  		if (quotec == escapec)
  			escapec = '\0';
+ 		else
+ 			search_chars[nsearch_chars++] = escapec;
  	}
+ 	else
+ 		search_chars[nsearch_chars++] = '\\';
  
  	mblen_str[1] = '\0';
  
***************
*** 2360,2368 ****
--- 2577,2603 ----
  				break;
  			}
  			need_data = false;
+ 
+ 			if (!cstate->encoding_embeds_ascii)
+ 				init_searcher(&cstate->searcher, copy_raw_buf, copy_buf_len,
+ 							  search_chars, nsearch_chars);
  		}
  
  		/* OK to fetch a character */
+ 		if ((!cstate->csv_mode || !first_char_in_line)
+ 			&& !cstate->encoding_embeds_ascii
+ 			&& (raw_buf_ptr + 4 <= copy_buf_len))
+ 		{
+ 			char *next = next_searcher(&cstate->searcher, 
+ 									   &copy_raw_buf[raw_buf_ptr]);
+ 
+ 			if (next != NULL)
+ 			{
+ 				raw_buf_ptr = next - copy_raw_buf;
+ 				first_char_in_line = false;
+ 			}
+ 		}
+ 
  		prev_raw_ptr = raw_buf_ptr;
  		c = copy_raw_buf[raw_buf_ptr++];
  
--
Sent via pgsql-patches mailing list (pgsql-patches@postgresql.org)
To make changes to your subscription:
http://mail.postgresql.org/mj/mj_wwwusr?domain=postgresql.org&extra=pgsql-patches

Reply via email to