The attached patch is intended to ensure that chr() does not produce invalidly encoded data, as recently discussed on -hackers. For UTF8, we treat its argument as a Unicode code point; for all other multi-byte encodings, we raise an error on any argument greater than 127. For all encodings we raise an error if the argument is 0 (we don't allow null bytes in text data). The ascii() function is adjusted so that it remains the inverse of chr() - i.e. for UTF8 it returns the Unicode code point, and it raises an error for any other multi-byte encoding if the aregument is outside the ASCII range. I have tested thius inverse property across the entire Unicode code point range, 0x01 .. 0x1ffff.

TODO: alter docs to reflect changes.

cheers

andrew


Index: src/backend/utils/adt/oracle_compat.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/oracle_compat.c,v
retrieving revision 1.70
diff -c -r1.70 oracle_compat.c
*** src/backend/utils/adt/oracle_compat.c	27 Feb 2007 23:48:08 -0000	1.70
--- src/backend/utils/adt/oracle_compat.c	14 Sep 2007 15:44:41 -0000
***************
*** 1246,1251 ****
--- 1246,1258 ----
   *
   *	 Returns the decimal representation of the first character from
   *	 string.
+  *   If the string is empty we return 0.
+  *   If the database encoding is UTF8, we return the Unicode codepoint. 
+  *   If the database encoding is any other multi-byte encoding, we
+  *   return the value of the first byte if it is an ASCII character
+  *   (range 1 .. 127), or raise an error.
+  *   For all other encodings we return the value of the first byte,
+  *   (range 1..255).
   *
   ********************************************************************/
  
***************
*** 1253,1263 ****
  ascii(PG_FUNCTION_ARGS)
  {
  	text	   *string = PG_GETARG_TEXT_P(0);
  
  	if (VARSIZE(string) <= VARHDRSZ)
  		PG_RETURN_INT32(0);
  
! 	PG_RETURN_INT32((int32) *((unsigned char *) VARDATA(string)));
  }
  
  /********************************************************************
--- 1260,1316 ----
  ascii(PG_FUNCTION_ARGS)
  {
  	text	   *string = PG_GETARG_TEXT_P(0);
+ 	int encoding = GetDatabaseEncoding();
+ 	unsigned char *data;
  
  	if (VARSIZE(string) <= VARHDRSZ)
  		PG_RETURN_INT32(0);
  
! 	data = (unsigned char *) VARDATA(string);
! 
! 	if (encoding == PG_UTF8 && *data > 127)
! 	{
! 		/* return the code point for Unicode */
! 
! 		int result = 0, tbytes = 0, i;
! 
! 		if (*data >= 0xF0)
! 		{
! 			result = *data & 0x07;
! 			tbytes = 3;
! 		}
! 		else if (*data >= 0xE0)
! 		{
! 			result = *data & 0x0F;
! 			tbytes = 2;
! 		}
! 		else
! 		{
! 			Assert (*data > 0xC0);
! 			result = *data & 0x1f;
! 			tbytes = 1;
! 		}
! 
! 		Assert (tbytes > 0);
! 
! 		for (i = 1; i <= tbytes; i++)
! 		{
! 			Assert ((data[i] & 0xC0) == 0x80);
! 			result = (result << 6) + (data[i] & 0x3f);
! 		}
! 
! 		PG_RETURN_INT32(result);
! 	}
! 	else
! 	{
! 		if (pg_encoding_max_length(encoding) > 1 && *data > 127)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("requested character too large")));
! 
! 
! 		PG_RETURN_INT32((int32) *data);
! 	}
  }
  
  /********************************************************************
***************
*** 1270,1288 ****
   *
   * Purpose:
   *
!  *	Returns the character having the binary equivalent to val
   *
   ********************************************************************/
  
  Datum
  chr(PG_FUNCTION_ARGS)
  {
! 	int32		cvalue = PG_GETARG_INT32(0);
  	text	   *result;
  
! 	result = (text *) palloc(VARHDRSZ + 1);
! 	SET_VARSIZE(result, VARHDRSZ + 1);
! 	*VARDATA(result) = (char) cvalue;
  
  	PG_RETURN_TEXT_P(result);
  }
--- 1323,1418 ----
   *
   * Purpose:
   *
!  *	Returns the character having the binary equivalent to val.
!  *
!  * For UTF8 we treat the argumwent as a Unicode code point.
!  * For other multi-byte encodings we raise an error for arguments
!  * outside the strict ASCII range (1..127).
!  *
!  * It's important that we don't ever return a value that is not valid
!  * in the database encoding, so that this doesn't become a way for
!  * invalid data to enter the database.
   *
   ********************************************************************/
  
  Datum
  chr(PG_FUNCTION_ARGS)
  {
! 	uint32		cvalue = PG_GETARG_UINT32(0);
  	text	   *result;
+ 	int encoding = GetDatabaseEncoding();
+ 
+ 	if (encoding == PG_UTF8 && cvalue > 127)
+ 	{
+ 		/* for Unicode we treat the argument as a code point */
+ 		int bytes ;
+ 		char *wch;
  
! 		/* We only allow valid Unicode code points */
! 		if (cvalue > 0x001fffff)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("requested character too large for encoding: %d", 
! 							cvalue)));
! 
! 		if (cvalue > 0xffff)
! 			bytes = 4;
! 		else if (cvalue > 0x07ff)
! 			bytes = 3;
! 		else
! 			bytes = 2;
! 
! 		result = (text *) palloc(VARHDRSZ + bytes);
! 		SET_VARSIZE(result, VARHDRSZ + bytes);
! 		wch = VARDATA(result);
! 
! 		if (bytes == 2)
! 		{
! 			wch[0] = 0xC0 | ((cvalue >> 6) & 0x1F);
! 			wch[1] = 0x80 | (cvalue & 0x3F);;
! 		}
! 		else if (bytes == 3)
! 		{
! 			wch[0] = 0xE0 | ((cvalue >> 12) & 0x0F);
! 			wch[1] = 0x80 | ((cvalue >> 6) & 0x3F);
! 			wch[2] = 0x80 | (cvalue & 0x3F);
! 		}
! 		else
! 		{
! 			wch[0] = 0xF0 | ((cvalue >> 18) & 0x07);
! 			wch[1] = 0x80 | ((cvalue >> 12) & 0x3F);
! 			wch[2] = 0x80 | ((cvalue >> 6) & 0x3F);
! 			wch[3] = 0x80 | (cvalue & 0x3F);
! 		}
! 		
! 	}
! 
! 	else
! 	{
! 		bool is_mb;
! 
! 		/* Error out on arguments that make no sense or that we
! 		 * can't validly represent in the encoding.
! 		 */
! 
! 		if (cvalue == 0)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("null character not permitted")));
! 
! 		is_mb = pg_encoding_max_length(encoding) > 1;
! 
! 		if ((is_mb && (cvalue > 255)) || (! is_mb && (cvalue > 127)))
! 			ereport(ERROR,
! 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
! 					 errmsg("requested character too large for encoding: %d",
! 							cvalue)));
! 		
! 
! 		result = (text *) palloc(VARHDRSZ + 1);
! 		SET_VARSIZE(result, VARHDRSZ + 1);
! 		*VARDATA(result) = (char) cvalue;
! 	}
  
  	PG_RETURN_TEXT_P(result);
  }
---------------------------(end of broadcast)---------------------------
TIP 4: Have you searched our list archives?

               http://archives.postgresql.org

Reply via email to