Re: [HACKERS] [rfc] unicode escapes for extended strings

Marko Kreen Wed, 09 Sep 2009 08:27:22 -0700

Unicode escapes for extended strings.

On 4/16/09, Marko Kreen <mark...@gmail.com> wrote:
> Reasons:
>
>  - More people are familiar with \u escaping, as it's standard
>   in Java/C#/Python, probably more..
>  - U& strings will not work when stdstr=off.
>
>  Syntax:
>
>   \uXXXX      - 16-bit value
>   \UXXXXXXXX  - 32-bit value
>
>  Additionally, both \u and \U can be used to specify UTF-16 surrogate
>  pairs to encode characters with value > 0xFFFF.  This is the exact behaviour
>  used by Java/C#/Python (except that Java does not have \U).


v3 of the patch:

    - convert to new reentrant lexer API
    - add lexer targets to avoid fallback to default
    - completely disallow \u and \U without the proper number of hex digits
    - fix logic bug in surrogate pair handling

-- 
marko

diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml
index 7637eab..b6f26cc 100644
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -394,6 +394,14 @@ SELECT 'foo'      'bar';
         </entry>
         <entry>hexadecimal byte value</entry>
        </row>
+       <row>
+        <entry>
+         <literal>\u<replaceable>xxxx</replaceable></literal>,
+         <literal>\U<replaceable>xxxxxxxx</replaceable></literal>
+         (<replaceable>x</replaceable> = 0 - 9, A - F)
+        </entry>
+        <entry>16- or 32-bit hexadecimal Unicode character value.</entry>
+       </row>
       </tbody>
       </tgroup>
      </table>
@@ -407,6 +415,14 @@ SELECT 'foo'      'bar';
     </para>
 
     <para>
+	 The Unicode escape syntax works fully only when the server encoding is UTF8.
+	 When other server encodings are used, only code points in the ASCII range
+	 (up to <literal>\u007F</>) can be specified.  Both <literal>\u</> and <literal>\U</>
+	 can also be used to specify a UTF-16 surrogate pair, which encodes a character
+	 with a value larger than <literal>\uFFFF</>.
+	</para>
+
+    <para>
      It is your responsibility that the byte sequences you create are
      valid characters in the server character set encoding.  When the
      server encoding is UTF-8, then the alternative Unicode escape
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index f404f9d..8ca3007 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -97,6 +97,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
 extern int	base_yyget_column(yyscan_t yyscanner);
 extern void base_yyset_column(int column_no, yyscan_t yyscanner);
 
+static void addunicode(pg_wchar c, base_yyscan_t yyscanner);
+
 %}
 
 %option reentrant
@@ -134,6 +136,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
  *  <xus> quoted string with Unicode escapes
+ *  <xeu> Unicode surrogate escape in extended string
  */
 
 %x xb
@@ -145,6 +148,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
 %x xdolq
 %x xui
 %x xus
+%x xeu
 
 /*
  * In order to make the world safe for Windows and Mac clients as well as
@@ -223,6 +227,8 @@ xeinside		[^\\']+
 xeescape		[\\][^0-7]
 xeoctesc		[\\][0-7]{1,3}
 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
+xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
+xeunicodebad	[\\]([uU])
 
 /* Extended quote
  * xqdouble implements embedded quote, ''''
@@ -535,6 +541,51 @@ other			.
 <xe>{xeinside}  {
 					addlit(yytext, yyleng, yyscanner);
 				}
+<xe>{xeunicode} {
+					pg_wchar c = strtoul(yytext+2, NULL, 16);	/* +2 skips the backslash and the u/U */
+
+					check_escape_warning(yyscanner);
+
+					/*
+					 * UTF-16 surrogates need special handling:
+					 *   [0xD800..0xDC00) - first elem.; saved, second is read in <xeu>
+					 *   [0xDC00..0xE000) - second elem.; an error here, nothing precedes it
+					 */
+					if (c >= 0xD800 && c < 0xE000)
+					{
+						if (c >= 0xDC00)
+							yyerror("invalid Unicode surrogate pair");
+						/* stash the upper 10 bits (offset by 0x10000) for the <xeu> rule */
+						yyextra->utf16_top_part = ((c & 0x3FF) << 10) + 0x10000;
+						BEGIN(xeu);
+					}
+					else
+						addunicode(c, yyscanner);
+				}
+<xeu>{xeunicode} {
+					pg_wchar c = strtoul(yytext+2, NULL, 16);
+					/* <xeu> is entered after a first surrogate; only a second may follow */
+					if (c < 0xDC00 || c >= 0xE000)
+						yyerror("invalid Unicode surrogate pair");
+					/* lower 10 bits complete the value started in utf16_top_part */
+					c = (c & 0x3FF) + yyextra->utf16_top_part;
+
+					addunicode(c, yyscanner);
+					/* pair consumed: switch back to the <xe> state */
+					BEGIN(xe);
+				}
+<xeu>.			|
+<xeu>\n			|
+<xeu><<EOF>>	{ /* no full-length Unicode escape followed the first surrogate */ yyerror("invalid Unicode surrogate pair"); }
+
+<xe>{xeunicodebad}	{	/* flex prefers the longest match, so this fires only when {xeunicode} does not match */
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+								 errmsg("invalid Unicode escape"),
+								 errhint("Unicode escapes must be full-length: \\uXXXX or \\UXXXXXXXX."),
+								 lexer_errposition()));
+					}
+
 <xe>{xeescape}  {
 					if (yytext[1] == '\'')
 					{
@@ -1263,3 +1314,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
 	if (ptr)
 		pfree(ptr);
 }
+
+static void
+addunicode(pg_wchar c, base_yyscan_t yyscanner)
+{
+	char buf[8];	/* output buffer for unicode_to_utf8() */
+	/* Check code point c, convert it with unicode_to_utf8(), and pass the bytes to addlit(). */
+	if (c == 0 || c > 0x10FFFF)	/* zero and values above 0x10FFFF are refused */
+		yyerror("invalid Unicode escape value");
+	if (c > 0x7F)	/* beyond ASCII: accepted only when the database encoding is UTF8 */
+	{
+		if (GetDatabaseEncoding() != PG_UTF8)
+			yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+		yyextra->saw_non_ascii = true;
+	}
+	unicode_to_utf8(c, (unsigned char *)buf);
+	addlit(buf, pg_mblen(buf), yyscanner);	/* presumably pg_mblen() gives the byte length of the character in buf -- confirm */
+}
+
diff --git a/src/include/parser/gramparse.h b/src/include/parser/gramparse.h
index a54a1b1..0ef9bf4 100644
--- a/src/include/parser/gramparse.h
+++ b/src/include/parser/gramparse.h
@@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
 	int			xcdepth;		/* depth of nesting in slash-star comments */
 	char	   *dolqstart;		/* current $foo$ quote start string */
 
+	/* first part of UTF16 surrogate for unicode escapes */
+	uint32		utf16_top_part;
+
 	/* state variables for literal-lexing warnings */
 	bool		warn_on_first_escape;
 	bool		saw_non_ascii;

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] [rfc] unicode escapes for extended strings

Reply via email to