[HACKERS] [rfc] unicode escapes for extended strings

Marko Kreen Thu, 16 Apr 2009 10:49:37 -0700

It seems I'm bad at communicating in English, so here is the C version of
my proposal to bring \u escaping into extended strings.  Reasons:


- More people are familiar with \u escaping, as it's standard
  in Java/C#/Python, and probably other languages.
- U& strings will not work when standard_conforming_strings is off (stdstr=off).

Syntax:

  \uXXXX      - 16-bit value
  \UXXXXXXXX  - 32-bit value

Additionally, both \u and \U can be used to specify UTF-16 surrogate
pairs to encode characters with value > 0xFFFF.  This is exactly the behaviour
of Java/C#/Python (except that Java does not have \U).


I'm OK with this patch being left for 8.5.

-- 
marko

diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml
index a559d75..fdb0cc5 100644
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -394,6 +394,14 @@ SELECT 'foo'      'bar';
         </entry>
         <entry>hexadecimal byte value</entry>
        </row>
+       <row>
+        <entry>
+         <literal>\u<replaceable>xxxx</replaceable></literal>,
+         <literal>\U<replaceable>xxxxxxxx</replaceable></literal>
+         (<replaceable>x</replaceable> = 0 - 9, A - F)
+        </entry>
+        <entry>16 or 32-bit hexadecimal Unicode character value.</entry>
+       </row>
       </tbody>
       </tgroup>
      </table>
@@ -407,6 +415,14 @@ SELECT 'foo'      'bar';
     </para>
 
     <para>
+	 The Unicode escape syntax works fully only when the server encoding is UTF8.
+	 When other server encodings are used, only code points in the ASCII range
+	 (up to <literal>\u007F</>) can be specified.  Both <literal>\u</> and <literal>\U</>
+	 can also be used to specify a UTF-16 surrogate pair, which encodes a character
+	 with a code point larger than <literal>\uFFFF</>.
+	</para>
+
+    <para>
      It is your responsibility that the byte sequences you create are
      valid characters in the server character set encoding.  When the
      server encoding is UTF-8, then the alternative Unicode escape
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index a070e85..c0695f1 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -98,6 +98,11 @@ static char *scanbuf;
 
 static unsigned char unescape_single_char(unsigned char c);
 
+/* first (high) half of a UTF-16 surrogate pair; set in <xe>, consumed in <xeu> */
+static unsigned long xeu_surrogate1;
+
+static void addunicode(pg_wchar c);
+
 %}
 
 %option 8bit
@@ -128,6 +133,7 @@ static unsigned char unescape_single_char(unsigned char c);
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
  *  <xus> quoted string with Unicode escapes
+ *  <xeu> Unicode surrogate escape in extended string
  */
 
 %x xb
@@ -139,6 +145,7 @@ static unsigned char unescape_single_char(unsigned char c);
 %x xdolq
 %x xui
 %x xus
+%x xeu
 
 /*
  * In order to make the world safe for Windows and Mac clients as well as
@@ -217,6 +224,7 @@ xeinside		[^\\']+
 xeescape		[\\][^0-7]
 xeoctesc		[\\][0-7]{1,3}
 xehexesc		[\\]x[0-9A-Fa-f]{1,2}
+xeunicode		[\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
 
 /* Extended quote
  * xqdouble implements embedded quote, ''''
@@ -506,6 +514,37 @@ other			.
 <xe>{xeinside}  {
 					addlit(yytext, yyleng);
 				}
+<xe>{xeunicode} {
+					pg_wchar c = strtoul(yytext+2, NULL, 16);	/* skip the "\u" or "\U" prefix */
+
+					check_escape_warning();
+
+					if (c >= 0xD800 && c < 0xDC00)
+					{
+						xeu_surrogate1 = c;	/* first half: emit nothing until <xeu> sees the second */
+						BEGIN(xeu);
+					}
+					else if (c >= 0xDC00 && c < 0xE000)
+						yyerror("invalid Unicode escape value");	/* second half with no first half */
+					else
+						addunicode(c);
+				}
+<xeu>{xeunicode} {
+					pg_wchar c = strtoul(yytext+2, NULL, 16);
+					/* only a second-half surrogate (DC00..DFFF) may follow a first half */
+					if (c < 0xDC00 || c >= 0xE000)
+						yyerror("invalid Unicode surrogate pair");
+					/* merge the 10 payload bits of each half; 0x10000 is added below */
+					c = ((xeu_surrogate1 & 0x3FF) << 10) | (c & 0x3FF);
+					addunicode(c + 0x10000);
+					BEGIN(xe);
+				}
+<xeu>.			|
+<xeu>\n			|
+<xeu><<EOF>>	{
+					yyerror("invalid Unicode surrogate pair");	/* anything else, including newline or end of input */
+				}
+
 <xe>{xeescape}  {
 					if (yytext[1] == '\'')
 					{
@@ -1153,3 +1192,18 @@ check_escape_warning(void)
 				 lexer_errposition()));
 	warn_on_first_escape = false;	/* warn only once per string */
 }
+
+static void
+addunicode(pg_wchar c)
+{
+	unsigned char utf8[8];		/* receives the encoded bytes of c */
+
+	if (c == 0 || 0x10FFFF < c)
+		yyerror("invalid Unicode escape value");
+	if (c > 0x7F && PG_UTF8 != GetDatabaseEncoding())
+		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+	/* c passed both checks: encode it and append the result to the literal */
+	unicode_to_utf8(c, utf8);
+	addlit((char *) utf8, pg_mblen((char *) utf8));
+}
+

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] [rfc] unicode escapes for extended strings

Reply via email to