It seems I'm bad at communicating in English, so here is a C variant of
my proposal to bring \u escaping into extended strings. Reasons:
- More people are familiar with \u escaping, as it's standard
in Java/C#/Python, and probably other languages.
- U& strings will not work when standard_conforming_strings (stdstr) is off.
Syntax:
\uXXXX - 16-bit value
\UXXXXXXXX - 32-bit value
Additionally, both \u and \U can be used to specify UTF-16 surrogate
pairs to encode characters with values > 0xFFFF. This is the exact behaviour
used by Java/C#/Python (except that Java does not have \U).
I'm OK with this patch being left for 8.5.
--
marko
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml
index a559d75..fdb0cc5 100644
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -394,6 +394,14 @@ SELECT 'foo' 'bar';
</entry>
<entry>hexadecimal byte value</entry>
</row>
+ <row>
+ <entry>
+ <literal>\u<replaceable>xxxx</replaceable></literal>,
+ <literal>\U<replaceable>xxxxxxxx</replaceable></literal>
+ (<replaceable>x</replaceable> = 0 - 9, A - F)
+ </entry>
+ <entry>16- or 32-bit hexadecimal Unicode character value</entry>
+ </row>
</tbody>
</tgroup>
</table>
@@ -407,6 +415,14 @@ SELECT 'foo' 'bar';
</para>
<para>
+ The Unicode escape syntax works fully only when the server encoding is UTF8.
+ When other server encodings are used, only code points in the ASCII range
+ (up to <literal>\u007F</>) can be specified. Both <literal>\u</> and <literal>\U</>
+ can also be used to specify UTF-16 surrogate pairs to encode characters
+ with values larger than <literal>\uFFFF</>.
+ </para>
+
+ <para>
It is your responsibility that the byte sequences you create are
valid characters in the server character set encoding. When the
server encoding is UTF-8, then the alternative Unicode escape
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index a070e85..c0695f1 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -98,6 +98,11 @@ static char *scanbuf;
static unsigned char unescape_single_char(unsigned char c);
+/* first (high) half of a UTF-16 surrogate pair, saved on entering <xeu> */
+static unsigned long xeu_surrogate1;
+
+static void addunicode(pg_wchar c);
+
%}
%option 8bit
@@ -128,6 +133,7 @@ static unsigned char unescape_single_char(unsigned char c);
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
* <xus> quoted string with Unicode escapes
+ * <xeu> Unicode surrogate escape in extended string
*/
%x xb
@@ -139,6 +145,7 @@ static unsigned char unescape_single_char(unsigned char c);
%x xdolq
%x xui
%x xus
+%x xeu
/*
* In order to make the world safe for Windows and Mac clients as well as
@@ -217,6 +224,7 @@ xeinside [^\\']+
xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2}
+xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
/* Extended quote
* xqdouble implements embedded quote, ''''
@@ -506,6 +514,37 @@ other .
<xe>{xeinside} {
addlit(yytext, yyleng);
}
+<xe>{xeunicode} {
+					pg_wchar c = strtoul(yytext+2, NULL, 16);
+
+					check_escape_warning();
+
+					if (c >= 0xD800 && c < 0xDC00)
+					{
+						/* High surrogate: emit nothing yet; <xeu> demands the low half next. */
+						xeu_surrogate1 = c;
+						BEGIN(xeu);
+					}
+					else if (c >= 0xDC00 && c < 0xE000)
+						yyerror("invalid Unicode escape value");
+					else
+						addunicode(c);
+				}
+<xeu>{xeunicode} {
+					pg_wchar c = strtoul(yytext+2, NULL, 16);
+
+					if (c < 0xDC00 || c >= 0xE000)
+						yyerror("invalid Unicode surrogate pair");
+
+					/* Join the two 10-bit halves; the pair encodes (code point - 0x10000). */
+					c = ((xeu_surrogate1 & 0x3FF) << 10) | (c & 0x3FF);
+					addunicode(c + 0x10000);
+					BEGIN(xe);
+				}
+<xeu>.			|
+<xeu>\n			|
+<xeu><<EOF>>	yyerror("invalid Unicode surrogate pair");
+
<xe>{xeescape} {
if (yytext[1] == '\'')
{
@@ -1153,3 +1192,18 @@ check_escape_warning(void)
lexer_errposition()));
warn_on_first_escape = false; /* warn only once per string */
}
+
+/* Encode code point c as UTF-8 via unicode_to_utf8() and hand it to addlit(); calls
+ * yyerror() for 0, > 0x10FFFF, or > 0x7F when the encoding is not PG_UTF8. */
+static void
+addunicode(pg_wchar c)
+{
+	unsigned char utf8[8];
+
+	if (c == 0 || c > 0x10FFFF)
+		yyerror("invalid Unicode escape value");
+	if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8)
+		yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+	unicode_to_utf8(c, utf8);
+	addlit((char *) utf8, pg_mblen((const char *) utf8));
+}
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers