Re: [HACKERS] like/ilike improvements

Andrew Dunstan Thu, 20 Sep 2007 06:43:19 -0700


Guillaume Smet wrote:

app_hls

On 9/20/07, Andrew Dunstan <[EMAIL PROTECTED]> wrote:

Can you retry both sets of tests but this time in C locale? The lower()
code works differently in C locale, and it might be that we need to look
at tweaking just one case.

Please try the attached patch, which goes back to using a special case for single-byte ILIKE. I want to make sure that at the very least we don't cause a performance regression with the code done this release. I can't see an obvious way around the problem for multi-byte case - lower() then requires converting to and from wchar, and I don't see a way of avoiding calling lower(). If this is a major blocker I would suggest you look at an alternative to using ILIKE for your UTF8 data.


cheers

andrew

Index: src/backend/utils/adt/like.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like.c,v
retrieving revision 1.69
diff -c -r1.69 like.c
*** src/backend/utils/adt/like.c	2 Jun 2007 02:03:42 -0000	1.69
--- src/backend/utils/adt/like.c	20 Sep 2007 13:12:39 -0000
***************
*** 36,41 ****
--- 36,43 ----
  
  static int	UTF8_MatchText(char *t, int tlen, char *p, int plen);
  
+ static int	SB_IMatchText(char *t, int tlen, char *p, int plen);
+ 
  static int	GenericMatchText(char *s, int slen, char* p, int plen);
  static int	Generic_Text_IC_like(text *str, text *pat);
  
***************
*** 104,109 ****
--- 106,117 ----
  
  #include "like_match.c"
  
+ /* setup to compile like_match.c for single byte case insensitive matches */
+ #define MATCH_LOWER
+ #define NextChar(p, plen) NextByte((p), (plen))
+ #define MatchText SB_IMatchText
+ 
+ #include "like_match.c"
  
  /* setup to compile like_match.c for UTF8 encoding, using fast NextChar */
  
***************
*** 132,146 ****
  	int			slen,
  				plen;
  
! 	/* Force inputs to lower case to achieve case insensitivity */
! 	str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
! 	pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
! 	s = VARDATA(str);
! 	slen = (VARSIZE(str) - VARHDRSZ);
! 	p = VARDATA(pat);
! 	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	return GenericMatchText(s, slen, p, plen);
  }
  
  /*
--- 140,171 ----
  	int			slen,
  				plen;
  
! 	/* For efficiency reasons, in the single byte case we don't call
! 	 * lower() on the pattern and text, but instead call to_lower on each
! 	 * character.  In the multi-byte case we don't have much choice :-(
! 	 */
  
! 	if (pg_database_encoding_max_length() > 1)
! 	{
! 		pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		if (GetDatabaseEncoding() == PG_UTF8)
! 			return UTF8_MatchText(s, slen, p, plen);
! 		else
! 			return MB_MatchText(s, slen, p, plen);
! 	}
! 	else
! 	{
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		return SB_IMatchText(s, slen, p, plen);
! 	}
  }
  
  /*
Index: src/backend/utils/adt/like_match.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like_match.c,v
retrieving revision 1.16
diff -c -r1.16 like_match.c
*** src/backend/utils/adt/like_match.c	2 Jun 2007 02:03:42 -0000	1.16
--- src/backend/utils/adt/like_match.c	20 Sep 2007 13:12:39 -0000
***************
*** 13,18 ****
--- 13,19 ----
   * NextChar 
   * MatchText - to name of function wanted
   * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
+  * MATCH_LOWER - define iff using to_lower on text chars
   *
   * Copyright (c) 1996-2007, PostgreSQL Global Development Group
   *
***************
*** 68,73 ****
--- 69,80 ----
   *--------------------
   */
  
+ #ifdef MATCH_LOWER
+ #define TCHAR(t) tolower((t))
+ #else
+ #define TCHAR(t) (t)
+ #endif
+ 
  static int
  MatchText(char *t, int tlen, char *p, int plen)
  {
***************
*** 143,155 ****
  			else
  			{
  
! 				char firstpat = *p ;
  
  				if (*p == '\\')
  				{
  					if (plen < 2)
  						return LIKE_FALSE;
! 					firstpat = p[1];
  				}
  
  				while (tlen > 0)
--- 150,162 ----
  			else
  			{
  
! 				char firstpat = TCHAR(*p) ;
  
  				if (*p == '\\')
  				{
  					if (plen < 2)
  						return LIKE_FALSE;
! 					firstpat = TCHAR(p[1]);
  				}
  
  				while (tlen > 0)
***************
*** 158,164 ****
  					 * Optimization to prevent most recursion: don't recurse
  					 * unless first pattern byte matches first text byte.
  					 */
! 					if (*t == firstpat)
  					{
  						int			matched = MatchText(t, tlen, p, plen);
  						
--- 165,171 ----
  					 * Optimization to prevent most recursion: don't recurse
  					 * unless first pattern byte matches first text byte.
  					 */
! 					if (TCHAR(*t) == firstpat)
  					{
  						int			matched = MatchText(t, tlen, p, plen);
  						
***************
*** 183,189 ****
  			NextByte(p, plen);
  			continue;
  		}
! 		else if (*t != *p)
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
--- 190,196 ----
  			NextByte(p, plen);
  			continue;
  		}
! 		else if (TCHAR(*t) != TCHAR(*p))
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
***************
*** 338,340 ****
--- 345,352 ----
  #undef do_like_escape
  #endif
  
+ #undef TCHAR
+ 
+ #ifdef MATCH_LOWER
+ #undef MATCH_LOWER
+ #endif

---------------------------(end of broadcast)---------------------------
TIP 3: Have you checked our extensive FAQ?

               http://www.postgresql.org/docs/faq

Re: [HACKERS] like/ilike improvements

Reply via email to