Re: [PATCHES] UTF8MatchText

Andrew Dunstan Thu, 17 May 2007 20:08:15 -0700


Tom Lane wrote:

ITAGAKI Takahiro <[EMAIL PROTECTED]> writes:

Yes, I only used the 'disjoint representations for first-bytes and
not-first-bytes of MB characters' feature in UTF8. Other encodings
allow both [AB] and [BA] for MB character patterns. UTF8Match() does
not cope with those encodings; if we have '[AB][AB]' in a table and
search it with LIKE '%[BA]%', we mistakenly judge that they match.


AFAICS, the patch does *not* make that mistake because % will not
advance over a fractional character.



Yeah, I think that's right.

Attached is my current WIP patch. If we decide that this optimisation can in fact be applied to all backend encodings, that will be easily incorporated. It will simplify the code further. Note that all the common code in the MatchText and do_like_escape functions has been factored - and the bytea functions just call the single-byte text versions - AFAICS the effect will be identical to having the specialised versions. (I'm always happy when code volume can be reduced.)


cheers

andrew

Index: src/backend/utils/adt/like.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like.c,v
retrieving revision 1.68
diff -c -r1.68 like.c
*** src/backend/utils/adt/like.c	27 Feb 2007 23:48:08 -0000	1.68
--- src/backend/utils/adt/like.c	18 May 2007 02:47:41 -0000
***************
*** 28,48 ****
  #define LIKE_ABORT						(-1)
  
  
! static int	MatchText(char *t, int tlen, char *p, int plen);
! static int	MatchTextIC(char *t, int tlen, char *p, int plen);
! static int	MatchBytea(char *t, int tlen, char *p, int plen);
! static text *do_like_escape(text *, text *);
  
! static int	MBMatchText(char *t, int tlen, char *p, int plen);
! static int	MBMatchTextIC(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);
  
  /*--------------------
   * Support routine for MatchText. Compares given multibyte streams
   * as wide characters. If they match, returns 1 otherwise returns 0.
   *--------------------
   */
! static int
  wchareq(char *p1, char *p2)
  {
  	int			p1_len;
--- 28,50 ----
  #define LIKE_ABORT						(-1)
  
  
! static int	SB_MatchText(char *t, int tlen, char *p, int plen);
! static int	SB_MatchTextIC(char *t, int tlen, char *p, int plen);
! static text *SB_do_like_escape(text *, text *);
  
! static int	MB_MatchText(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);
  
+ static int	UTF8_MatchText(char *t, int tlen, char *p, int plen);
+ static int	GenericMatchText(char *s, int slen, char* p, int plen);
+ static int	mbtexticlike(text *str, text *pat);
+ 
  /*--------------------
   * Support routine for MatchText. Compares given multibyte streams
   * as wide characters. If they match, returns 1 otherwise returns 0.
   *--------------------
   */
! static __inline__ int
  wchareq(char *p1, char *p2)
  {
  	int			p1_len;
***************
*** 72,86 ****
   * of getting a single character transformed to the system's wchar_t format.
   * So now, we just downcase the strings using lower() and apply regular LIKE
   * comparison.	This should be revisited when we install better locale support.
-  *
-  * Note that MBMatchText and MBMatchTextIC do exactly the same thing now.
-  * Is it worth refactoring to avoid duplicated code?  They might become
-  * different again in the future.
   */
  
  /* Set up to compile like_match.c for multibyte characters */
  #define CHAREQ(p1, p2) wchareq(p1, p2)
- #define ICHAREQ(p1, p2) wchareq(p1, p2)
  #define NextChar(p, plen) \
  	do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
--- 74,86 ----
   * of getting a single character transformed to the system's wchar_t format.
   * So now, we just downcase the strings using lower() and apply regular LIKE
   * comparison.	This should be revisited when we install better locale support.
   */
  
+ #define NextByte(p, plen)	((p)++, (plen)--)
+ #define BYTEEQ(p1, p2)		(*(p1) == *(p2))
+ 
  /* Set up to compile like_match.c for multibyte characters */
  #define CHAREQ(p1, p2) wchareq(p1, p2)
  #define NextChar(p, plen) \
  	do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
***************
*** 90,122 ****
  			 *(dst)++ = *(src)++; \
  	   } while (0)
  
! #define MatchText	MBMatchText
! #define MatchTextIC MBMatchTextIC
  #define do_like_escape	MB_do_like_escape
  
  #include "like_match.c"
  
- #undef CHAREQ
- #undef ICHAREQ
- #undef NextChar
- #undef CopyAdvChar
- #undef MatchText
- #undef MatchTextIC
- #undef do_like_escape
- 
  /* Set up to compile like_match.c for single-byte characters */
! #define CHAREQ(p1, p2) (*(p1) == *(p2))
! #define ICHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2)))
! #define NextChar(p, plen) ((p)++, (plen)--)
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
  #include "like_match.c"
  
- /* And some support for BYTEA */
- #define BYTEA_CHAREQ(p1, p2) (*(p1) == *(p2))
- #define BYTEA_NextChar(p, plen) ((p)++, (plen)--)
- #define BYTEA_CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
  
  /*
   *	interface routines called by the function manager
--- 90,170 ----
  			 *(dst)++ = *(src)++; \
  	   } while (0)
  
! #define MatchText	MB_MatchText
  #define do_like_escape	MB_do_like_escape
  
  #include "like_match.c"
  
  /* Set up to compile like_match.c for single-byte characters */
! #define CHAREQ(p1, p2) BYTEEQ(p1, p2)
! #define NextChar(p, plen) NextByte(p, plen)
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
+ #define MatchText	SB_MatchText
+ #define do_like_escape	SB_do_like_escape
+ 
+ #include "like_match.c"
+ 
+ /* set up to compile like_match.c for single byte case insensitive matching */
+ 
+ #define CHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2)))
+ #define NextChar(p, plen) NextByte(p, plen)
+ #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
+ 
+ #define MatchText	SB_MatchTextIC
+ 
+ #include "like_match.c"
+ 
+ /* set up for UTF8 match optimisation */
+ 
+ #define CHAREQ(p1, p2) wchareq(p1, p2)
+ #define NextChar(p, plen) \
+ 	do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
+ #define CopyAdvChar(dst, src, srclen) \
+ 	do { int __l = pg_mblen(src); \
+ 		 (srclen) -= __l; \
+ 		 while (__l-- > 0) \
+ 			 *(dst)++ = *(src)++; \
+ 	   } while (0)
+ 
+ #define MatchText	UTF8_MatchText
+ #define UTF8_OPT
+ 
  #include "like_match.c"
  
  
+ static __inline__ int
+ GenericMatchText(char *s, int slen, char* p, int plen)
+ {
+ 	if (pg_database_encoding_max_length() == 1)
+ 		return SB_MatchText(s, slen, p, plen);
+ 	else if (GetDatabaseEncoding() == PG_UTF8)
+ 		return UTF8_MatchText(s, slen, p, plen);
+ 	else
+ 		return MB_MatchText(s, slen, p, plen);
+ }
+ 
+ static __inline__ int
+ mbtexticlike(text *str, text *pat)
+ {
+ 	char	   *s,
+ 			   *p;
+ 	int			slen,
+ 				plen;
+ 
+ 	/* Force inputs to lower case to achieve case insensitivity */
+ 	str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
+ 	pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
+ 	s = VARDATA(str);
+ 	slen = (VARSIZE(str) - VARHDRSZ);
+ 	p = VARDATA(pat);
+ 	plen = (VARSIZE(pat) - VARHDRSZ);
+ 
+ 	if (GetDatabaseEncoding() == PG_UTF8)
+ 		return UTF8_MatchText(s, slen, p, plen);
+ 	else
+ 		return MB_MatchText(s, slen, p, plen);
+ }
  
  /*
   *	interface routines called by the function manager
***************
*** 138,147 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 186,192 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 162,171 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 207,213 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 186,195 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 228,234 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 210,219 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 249,255 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 234,240 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (MatchBytea(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 270,276 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (SB_MatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 255,261 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (MatchBytea(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 291,297 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (SB_MatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 281,305 ****
  		slen = strlen(s);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
  	}
  	else
  	{
- 		/* Force inputs to lower case to achieve case insensitivity */
  		text	   *strtext;
  
  		strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 		strtext = DatumGetTextP(DirectFunctionCall1(lower,
! 												  PointerGetDatum(strtext)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 
! 		s = VARDATA(strtext);
! 		slen = (VARSIZE(strtext) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
  	}
  
  	PG_RETURN_BOOL(result);
--- 317,331 ----
  		slen = strlen(s);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (SB_MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
  	}
  	else
  	{
  		text	   *strtext;
  
  		strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 		result = (mbtexticlike(strtext, pat) == LIKE_TRUE);
  	}
  
  	PG_RETURN_BOOL(result);
***************
*** 322,346 ****
  		slen = strlen(s);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
  	}
  	else
  	{
- 		/* Force inputs to lower case to achieve case insensitivity */
  		text	   *strtext;
  
  		strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 		strtext = DatumGetTextP(DirectFunctionCall1(lower,
! 												  PointerGetDatum(strtext)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 
! 		s = VARDATA(strtext);
! 		slen = (VARSIZE(strtext) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
  	}
  
  	PG_RETURN_BOOL(result);
--- 348,362 ----
  		slen = strlen(s);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (SB_MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
  	}
  	else
  	{
  		text	   *strtext;
  
  		strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 		result = (mbtexticlike(strtext, pat) != LIKE_TRUE);
  	}
  
  	PG_RETURN_BOOL(result);
***************
*** 363,383 ****
  		slen = (VARSIZE(str) - VARHDRSZ);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
  	}
  	else
! 	{
! 		/* Force inputs to lower case to achieve case insensitivity */
! 		str = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(str)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
! 	}
  
  	PG_RETURN_BOOL(result);
  }
--- 379,388 ----
  		slen = (VARSIZE(str) - VARHDRSZ);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (SB_MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
  	}
  	else
! 		result = (mbtexticlike(str, pat) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 399,419 ****
  		slen = (VARSIZE(str) - VARHDRSZ);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
  	}
  	else
! 	{
! 		/* Force inputs to lower case to achieve case insensitivity */
! 		str = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(str)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
! 	}
  
  	PG_RETURN_BOOL(result);
  }
--- 404,413 ----
  		slen = (VARSIZE(str) - VARHDRSZ);
  		p = VARDATA(pat);
  		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (SB_MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
  	}
  	else
! 		result = (mbtexticlike(str, pat) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 430,436 ****
  	text	   *result;
  
  	if (pg_database_encoding_max_length() == 1)
! 		result = do_like_escape(pat, esc);
  	else
  		result = MB_do_like_escape(pat, esc);
  
--- 424,430 ----
  	text	   *result;
  
  	if (pg_database_encoding_max_length() == 1)
! 		result = SB_do_like_escape(pat, esc);
  	else
  		result = MB_do_like_escape(pat, esc);
  
***************
*** 446,624 ****
  {
  	bytea	   *pat = PG_GETARG_BYTEA_P(0);
  	bytea	   *esc = PG_GETARG_BYTEA_P(1);
! 	bytea	   *result;
! 	char	   *p,
! 			   *e,
! 			   *r;
! 	int			plen,
! 				elen;
! 	bool		afterescape;
! 
! 	p = VARDATA(pat);
! 	plen = (VARSIZE(pat) - VARHDRSZ);
! 	e = VARDATA(esc);
! 	elen = (VARSIZE(esc) - VARHDRSZ);
! 
! 	/*
! 	 * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
! 	 * trying to calculate the size more accurately than that.
! 	 */
! 	result = (text *) palloc(plen * 2 + VARHDRSZ);
! 	r = VARDATA(result);
! 
! 	if (elen == 0)
! 	{
! 		/*
! 		 * No escape character is wanted.  Double any backslashes in the
! 		 * pattern to make them act like ordinary characters.
! 		 */
! 		while (plen > 0)
! 		{
! 			if (*p == '\\')
! 				*r++ = '\\';
! 			BYTEA_CopyAdvChar(r, p, plen);
! 		}
! 	}
! 	else
! 	{
! 		/*
! 		 * The specified escape must be only a single character.
! 		 */
! 		BYTEA_NextChar(e, elen);
! 		if (elen != 0)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
! 					 errmsg("invalid escape string"),
! 				  errhint("Escape string must be empty or one character.")));
! 
! 		e = VARDATA(esc);
! 
! 		/*
! 		 * If specified escape is '\', just copy the pattern as-is.
! 		 */
! 		if (*e == '\\')
! 		{
! 			memcpy(result, pat, VARSIZE(pat));
! 			PG_RETURN_BYTEA_P(result);
! 		}
! 
! 		/*
! 		 * Otherwise, convert occurrences of the specified escape character to
! 		 * '\', and double occurrences of '\' --- unless they immediately
! 		 * follow an escape character!
! 		 */
! 		afterescape = false;
! 		while (plen > 0)
! 		{
! 			if (BYTEA_CHAREQ(p, e) && !afterescape)
! 			{
! 				*r++ = '\\';
! 				BYTEA_NextChar(p, plen);
! 				afterescape = true;
! 			}
! 			else if (*p == '\\')
! 			{
! 				*r++ = '\\';
! 				if (!afterescape)
! 					*r++ = '\\';
! 				BYTEA_NextChar(p, plen);
! 				afterescape = false;
! 			}
! 			else
! 			{
! 				BYTEA_CopyAdvChar(r, p, plen);
! 				afterescape = false;
! 			}
! 		}
! 	}
! 
! 	SET_VARSIZE(result, r - ((char *) result));
  
! 	PG_RETURN_BYTEA_P(result);
  }
  
- /*
-  * Same as above, but specifically for bytea (binary) datatype
-  */
- static int
- MatchBytea(char *t, int tlen, char *p, int plen)
- {
- 	/* Fast path for match-everything pattern */
- 	if ((plen == 1) && (*p == '%'))
- 		return LIKE_TRUE;
- 
- 	while ((tlen > 0) && (plen > 0))
- 	{
- 		if (*p == '\\')
- 		{
- 			/* Next pattern char must match literally, whatever it is */
- 			BYTEA_NextChar(p, plen);
- 			if ((plen <= 0) || !BYTEA_CHAREQ(t, p))
- 				return LIKE_FALSE;
- 		}
- 		else if (*p == '%')
- 		{
- 			/* %% is the same as % according to the SQL standard */
- 			/* Advance past all %'s */
- 			while ((plen > 0) && (*p == '%'))
- 				BYTEA_NextChar(p, plen);
- 			/* Trailing percent matches everything. */
- 			if (plen <= 0)
- 				return LIKE_TRUE;
- 
- 			/*
- 			 * Otherwise, scan for a text position at which we can match the
- 			 * rest of the pattern.
- 			 */
- 			while (tlen > 0)
- 			{
- 				/*
- 				 * Optimization to prevent most recursion: don't recurse
- 				 * unless first pattern char might match this text char.
- 				 */
- 				if (BYTEA_CHAREQ(t, p) || (*p == '\\') || (*p == '_'))
- 				{
- 					int			matched = MatchBytea(t, tlen, p, plen);
- 
- 					if (matched != LIKE_FALSE)
- 						return matched; /* TRUE or ABORT */
- 				}
- 
- 				BYTEA_NextChar(t, tlen);
- 			}
- 
- 			/*
- 			 * End of text with no match, so no point in trying later places
- 			 * to start matching this pattern.
- 			 */
- 			return LIKE_ABORT;
- 		}
- 		else if ((*p != '_') && !BYTEA_CHAREQ(t, p))
- 		{
- 			/*
- 			 * Not the single-character wildcard and no explicit match? Then
- 			 * time to quit...
- 			 */
- 			return LIKE_FALSE;
- 		}
- 
- 		BYTEA_NextChar(t, tlen);
- 		BYTEA_NextChar(p, plen);
- 	}
- 
- 	if (tlen > 0)
- 		return LIKE_FALSE;		/* end of pattern, but not of text */
- 
- 	/* End of input string.  Do we have matching pattern remaining? */
- 	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
- 										 * pattern */
- 		BYTEA_NextChar(p, plen);
- 	if (plen <= 0)
- 		return LIKE_TRUE;
- 
- 	/*
- 	 * End of text with no match, so no point in trying later places to start
- 	 * matching this pattern.
- 	 */
- 	return LIKE_ABORT;
- }	/* MatchBytea() */
--- 440,447 ----
  {
  	bytea	   *pat = PG_GETARG_BYTEA_P(0);
  	bytea	   *esc = PG_GETARG_BYTEA_P(1);
! 	bytea	   *result = SB_do_like_escape((text *)pat, (text *)esc);
  
! 	PG_RETURN_BYTEA_P((bytea *)result);
  }
  
Index: src/backend/utils/adt/like_match.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like_match.c,v
retrieving revision 1.15
diff -c -r1.15 like_match.c
*** src/backend/utils/adt/like_match.c	27 Feb 2007 23:48:08 -0000	1.15
--- src/backend/utils/adt/like_match.c	18 May 2007 02:47:41 -0000
***************
*** 9,19 ****
   * Before the inclusion, we need to define following macros:
   *
   * CHAREQ
-  * ICHAREQ
   * NextChar
   * CopyAdvChar
   * MatchText (MBMatchText)
-  * MatchTextIC (MBMatchTextIC)
   * do_like_escape (MB_do_like_escape)
   *
   * Copyright (c) 1996-2007, PostgreSQL Global Development Group
--- 9,17 ----
***************
*** 82,88 ****
  		if (*p == '\\')
  		{
  			/* Next pattern char must match literally, whatever it is */
! 			NextChar(p, plen);
  			if ((plen <= 0) || !CHAREQ(t, p))
  				return LIKE_FALSE;
  		}
--- 80,86 ----
  		if (*p == '\\')
  		{
  			/* Next pattern char must match literally, whatever it is */
! 			NextByte(p, plen);
  			if ((plen <= 0) || !CHAREQ(t, p))
  				return LIKE_FALSE;
  		}
***************
*** 91,97 ****
  			/* %% is the same as % according to the SQL standard */
  			/* Advance past all %'s */
  			while ((plen > 0) && (*p == '%'))
! 				NextChar(p, plen);
  			/* Trailing percent matches everything. */
  			if (plen <= 0)
  				return LIKE_TRUE;
--- 89,95 ----
  			/* %% is the same as % according to the SQL standard */
  			/* Advance past all %'s */
  			while ((plen > 0) && (*p == '%'))
! 				NextByte(p, plen);
  			/* Trailing percent matches everything. */
  			if (plen <= 0)
  				return LIKE_TRUE;
***************
*** 123,129 ****
  			 */
  			return LIKE_ABORT;
  		}
! 		else if ((*p != '_') && !CHAREQ(t, p))
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
--- 121,146 ----
  			 */
  			return LIKE_ABORT;
  		}
! #ifdef UTF8_OPT
! 		/* 
! 		 * UTF8 is optimised to do byte at a time matching in most cases, 
! 		 * thus saving expensive calls to NextChar.
! 		 *
! 		 * UTF8 has disjoint representations for first-bytes and 
! 		 * not-first-bytes of MB characters, and thus it is
! 		 * impossible to make a false match in which an MB pattern 
! 		 * character is matched to the end of one data character 
! 		 * plus the start of another.
! 		 * In character sets without that property, we have to use the 
! 		 * slow way to ensure we don't make out-of-sync matches.
! 		 */
! 		else if (*p == '_')
! 		{
! 			NextChar(t, tlen);
! 			NextByte(p, plen);
! 			continue;
! 		}
! 		else if (!BYTEEQ(t, p))
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
***************
*** 132,215 ****
  			return LIKE_FALSE;
  		}
  
! 		NextChar(t, tlen);
! 		NextChar(p, plen);
! 	}
! 
! 	if (tlen > 0)
! 		return LIKE_FALSE;		/* end of pattern, but not of text */
! 
! 	/* End of input string.  Do we have matching pattern remaining? */
! 	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
! 										 * pattern */
! 		NextChar(p, plen);
! 	if (plen <= 0)
! 		return LIKE_TRUE;
! 
! 	/*
! 	 * End of text with no match, so no point in trying later places to start
! 	 * matching this pattern.
! 	 */
! 	return LIKE_ABORT;
! }	/* MatchText() */
! 
! /*
!  * Same as above, but ignore case
!  */
! static int
! MatchTextIC(char *t, int tlen, char *p, int plen)
! {
! 	/* Fast path for match-everything pattern */
! 	if ((plen == 1) && (*p == '%'))
! 		return LIKE_TRUE;
! 
! 	while ((tlen > 0) && (plen > 0))
! 	{
! 		if (*p == '\\')
! 		{
! 			/* Next pattern char must match literally, whatever it is */
! 			NextChar(p, plen);
! 			if ((plen <= 0) || !ICHAREQ(t, p))
! 				return LIKE_FALSE;
! 		}
! 		else if (*p == '%')
! 		{
! 			/* %% is the same as % according to the SQL standard */
! 			/* Advance past all %'s */
! 			while ((plen > 0) && (*p == '%'))
! 				NextChar(p, plen);
! 			/* Trailing percent matches everything. */
! 			if (plen <= 0)
! 				return LIKE_TRUE;
! 
! 			/*
! 			 * Otherwise, scan for a text position at which we can match the
! 			 * rest of the pattern.
! 			 */
! 			while (tlen > 0)
! 			{
! 				/*
! 				 * Optimization to prevent most recursion: don't recurse
! 				 * unless first pattern char might match this text char.
! 				 */
! 				if (ICHAREQ(t, p) || (*p == '\\') || (*p == '_'))
! 				{
! 					int			matched = MatchTextIC(t, tlen, p, plen);
! 
! 					if (matched != LIKE_FALSE)
! 						return matched; /* TRUE or ABORT */
! 				}
! 
! 				NextChar(t, tlen);
! 			}
! 
! 			/*
! 			 * End of text with no match, so no point in trying later places
! 			 * to start matching this pattern.
! 			 */
! 			return LIKE_ABORT;
! 		}
! 		else if ((*p != '_') && !ICHAREQ(t, p))
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
--- 149,163 ----
  			return LIKE_FALSE;
  		}
  
! 		NextByte(t, tlen);
! 		NextByte(p, plen);
! #else
! 		/* 
! 		 * Branch for non-utf8 multi-byte charsets and also for single-byte
! 		 * charsets which don't gain any benefit from the above optimisation.
! 		 */
! 		   
! 		else if ((*p != '_') && !CHAREQ(t, p))
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
***************
*** 220,225 ****
--- 168,175 ----
  
  		NextChar(t, tlen);
  		NextChar(p, plen);
+ 
+ #endif /* UTF8_OPT */
  	}
  
  	if (tlen > 0)
***************
*** 228,234 ****
  	/* End of input string.  Do we have matching pattern remaining? */
  	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
  										 * pattern */
! 		NextChar(p, plen);
  	if (plen <= 0)
  		return LIKE_TRUE;
  
--- 178,184 ----
  	/* End of input string.  Do we have matching pattern remaining? */
  	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
  										 * pattern */
! 		NextByte(p, plen);
  	if (plen <= 0)
  		return LIKE_TRUE;
  
***************
*** 237,248 ****
  	 * matching this pattern.
  	 */
  	return LIKE_ABORT;
! }	/* MatchTextIC() */
  
  /*
   * like_escape() --- given a pattern and an ESCAPE string,
   * convert the pattern to use Postgres' standard backslash escape convention.
   */
  static text *
  do_like_escape(text *pat, text *esc)
  {
--- 187,200 ----
  	 * matching this pattern.
  	 */
  	return LIKE_ABORT;
! }	/* MatchText() */
  
  /*
   * like_escape() --- given a pattern and an ESCAPE string,
   * convert the pattern to use Postgres' standard backslash escape convention.
   */
+ #ifdef do_like_escape
+ 
  static text *
  do_like_escape(text *pat, text *esc)
  {
***************
*** 336,338 ****
--- 288,304 ----
  
  	return result;
  }
+ #endif /* do_like_escape */
+ 
+ #undef CHAREQ
+ #undef NextChar
+ #undef CopyAdvChar
+ #undef MatchText
+ 
+ #ifdef do_like_escape
+ #undef do_like_escape
+ #endif
+ 
+ #ifdef UTF8_OPT
+ #undef UTF8_OPT
+ #endif

---------------------------(end of broadcast)---------------------------
TIP 3: Have you checked our extensive FAQ?

               http://www.postgresql.org/docs/faq

Re: [PATCHES] UTF8MatchText

Reply via email to