Index: src/backend/utils/mb/conv.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/conv.c,v
retrieving revision 1.52
diff -c -r1.52 conv.c
*** src/backend/utils/mb/conv.c	7 Mar 2005 04:30:52 -0000	1.52
--- src/backend/utils/mb/conv.c	5 Jun 2005 04:40:53 -0000
***************
*** 361,372 ****
  			iutf = *utf++ << 8;
  			iutf |= *utf++;
  		}
! 		else
  		{
  			iutf = *utf++ << 16;
  			iutf |= *utf++ << 8;
  			iutf |= *utf++;
  		}
  		p = bsearch(&iutf, map, size,
  					sizeof(pg_utf_to_local), compare1);
  		if (p == NULL)
--- 361,379 ----
  			iutf = *utf++ << 8;
  			iutf |= *utf++;
  		}
! 		else if (l == 3)
  		{
  			iutf = *utf++ << 16;
  			iutf |= *utf++ << 8;
  			iutf |= *utf++;
  		}
+ 		else if (l == 4)
+ 		{
+ 			iutf = *utf++ << 24;
+ 			iutf |= *utf++ << 16;
+ 			iutf |= *utf++ << 8;
+ 			iutf |= *utf++;
+ 		}
  		p = bsearch(&iutf, map, size,
  					sizeof(pg_utf_to_local), compare1);
  		if (p == NULL)
Index: src/backend/utils/mb/wchar.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v
retrieving revision 1.43
diff -c -r1.43 wchar.c
*** src/backend/utils/mb/wchar.c	14 Mar 2005 18:31:20 -0000	1.43
--- src/backend/utils/mb/wchar.c	5 Jun 2005 04:40:54 -0000
***************
*** 406,413 ****
  		len = 1;
  	else if ((*s & 0xe0) == 0xc0)
  		len = 2;
! 	else if ((*s & 0xe0) == 0xe0)
! 		len = 3;
  	return (len);
  }
  
--- 406,419 ----
  		len = 1;
  	else if ((*s & 0xe0) == 0xc0)
  		len = 2;
!         else if ((*s & 0xf0) == 0xe0)
!                 len = 3;
!         else if ((*s & 0xf8) == 0xf0)
!                 len = 4;
!         else if ((*s & 0xfc) == 0xf8)
!                 len = 5;
!         else if ((*s & 0xfe) == 0xfc)
!                 len = 6;
  	return (len);
  }
  
***************
*** 721,727 ****
  	{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},		/* 3; PG_EUC_KR */
  	{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},		/* 4; PG_EUC_TW */
  	{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},		/* 5; PG_JOHAB */
! 	{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3},			/* 6; PG_UTF8 */
  	{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, 		/* 7; PG_MULE_INTERNAL */
  	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 8; PG_LATIN1 */
  	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 9; PG_LATIN2 */
--- 727,733 ----
  	{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},		/* 3; PG_EUC_KR */
  	{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},		/* 4; PG_EUC_TW */
  	{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},		/* 5; PG_JOHAB */
! 	{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4},		/* 6; PG_UTF8 */
  	{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, 		/* 7; PG_MULE_INTERNAL */
  	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 8; PG_LATIN1 */
  	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 9; PG_LATIN2 */
***************
*** 800,805 ****
--- 806,836 ----
  
  #ifndef FRONTEND
  
+ /* pg_utf8_islegal: return true iff source[0..length-1] is one well-formed UTF-8
+  * character of 1 to 4 bytes.  The caller supplies the length implied by the lead byte
+  * (it is not re-derived here) and must ensure that many bytes are readable. */
+ bool pg_utf8_islegal(const unsigned char *source, int length) {
+     unsigned char a;
+     const unsigned char *srcptr = source+length;  /* bytes are examined last to first */
+     switch (length) {
+        default: return false;  /* only lengths 1..4 are accepted */
+        case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;  /* FALLTHROUGH */
+        case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;  /* FALLTHROUGH */
+        case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;  /* 2nd byte must be a continuation byte for every lead byte */
+         switch (*source) {  /* tighter 2nd-byte range for four lead bytes; no fall-through inside */
+             case 0xE0: if (a < 0xA0) return false; break;  /* E0 80..9F is an overlong 3-byte form */
+             case 0xED: if (a > 0x9F) return false; break;  /* ED A0..BF would encode a surrogate */
+             case 0xF0: if (a < 0x90) return false; break;  /* F0 80..8F is an overlong 4-byte form */
+             case 0xF4: if (a > 0x8F) return false; break;  /* F4 90..BF would exceed U+10FFFF */
+             default:   break;  /* generic 0x80..0xBF range was already enforced above */
+         }  /* FALLTHROUGH to the lead-byte check */
+     case 1: if (*source >= 0x80 && *source < 0xC2) return false;  /* stray continuation byte, or C0/C1 overlong lead */
+     }
+     if (*source > 0xF4) return false;  /* lead bytes 0xF5..0xFF never start a well-formed sequence */
+     return true;
+ }
+ 
+ 
  /*
   * Verify mbstr to make sure that it has a valid character sequence.
   * mbstr is not necessarily NULL terminated; length of mbstr is
***************
*** 823,873 ****
  
  	while (len > 0 && *mbstr)
  	{
- 		/* special UTF8 check */
- 		if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
- 		{
- 			if (noError)
- 				return false;
- 			ereport(ERROR,
- 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
- 					 errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
- 		}
- 
  		l = pg_mblen(mbstr);
! 
! 		for (i = 1; i < l; i++)
! 		{
! 			/*
! 			 * we expect that every multibyte char consists of bytes
! 			 * having the 8th bit set
! 			 */
! 			if (i >= len || (mbstr[i] & 0x80) == 0)
  			{
! 				char		buf[8 * 2 + 1];
! 				char	   *p = buf;
! 				int			j,
  							jlimit;
  
! 				if (noError)
! 					return false;
  
! 				jlimit = Min(l, len);
! 				jlimit = Min(jlimit, 8);		/* prevent buffer overrun */
  
! 				for (j = 0; j < jlimit; j++)
! 					p += sprintf(p, "%02x", mbstr[j]);
  
! 				ereport(ERROR,
! 						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 				errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
! 					   GetDatabaseEncodingName(), buf)));
  			}
  		}
- 
  		len -= l;
  		mbstr += l;
  	}
- 
  	return true;
  }
  
--- 854,900 ----
  
  	while (len > 0 && *mbstr)
  	{
  		l = pg_mblen(mbstr);
! 		
! 		/* special UTF-8 check */
! 		if (encoding == PG_UTF8) {
!             		if(!pg_utf8_islegal(mbstr,l)) {
!                     		if (noError) return false;
! 				ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near byte %c",*mbstr)));
! 			}
! 		} else {
! 			for (i = 1; i < l; i++)
  			{
!                         	/*
!                     		* we expect that every multibyte char consists of bytes
!                                 * having the 8th bit set
!                                 */
!                     		if (i >= len || (mbstr[i] & 0x80) == 0)
!                         	{
!                             		char		buf[8 * 2 + 1];
!                                         char		*p = buf;
!                                         int		j,
  							jlimit;
  
! 					if (noError)
! 						return false;
  
! 					jlimit = Min(l, len);
! 					jlimit = Min(jlimit, 8);		/* prevent buffer overrun */
  
! 					for (j = 0; j < jlimit; j++)
! 						p += sprintf(p, "%02x", mbstr[j]);
  
! 					ereport(ERROR,
! 							(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
! 					errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
! 						GetDatabaseEncodingName(), buf)));
! 				}
  			}
  		}
  		len -= l;
  		mbstr += l;
  	}
  	return true;
  }
  
Index: src/include/mb/pg_wchar.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v
retrieving revision 1.58
diff -c -r1.58 pg_wchar.h
*** src/include/mb/pg_wchar.h	14 Mar 2005 18:31:24 -0000	1.58
--- src/include/mb/pg_wchar.h	5 Jun 2005 04:41:08 -0000
***************
*** 340,343 ****
--- 340,345 ----
  extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
  extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
  
+ extern bool pg_utf8_islegal(const unsigned char *source, int length);  /* checks whether the length bytes at source form one legal UTF-8 character */
+ 
  #endif   /* PG_WCHAR_H */
