Unicode escapes for extended strings. On 4/16/09, Marko Kreen <mark...@gmail.com> wrote: > Reasons: > > - More people are familiar with \u escaping, as it's standard > in Java/C#/Python, and probably other languages. > - U& strings will not work when stdstr (standard_conforming_strings) is off. > > Syntax: > > \uXXXX - 16-bit value > \UXXXXXXXX - 32-bit value > > Additionally, both \u and \U can be used to specify UTF-16 surrogate > pairs to encode characters with values above 0xFFFF. This is the exact behaviour > used by Java/C#/Python (except that Java does not have \U).
v3 of the patch: - convert to the new reentrant lexer API - add lexer rules (targets) for the surrogate state to avoid falling back to the default rule - completely disallow \u and \U without the proper number of hex digits - fix a logic bug in surrogate pair handling -- marko
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index 7637eab..b6f26cc 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -394,6 +394,14 @@ SELECT 'foo' 'bar'; </entry> <entry>hexadecimal byte value</entry> </row> + <row> + <entry> + <literal>\u<replaceable>xxxx</replaceable></literal>, + <literal>\U<replaceable>xxxxxxxx</replaceable></literal> + (<replaceable>x</replaceable> = 0 - 9, A - F) + </entry> + <entry>16 or 32-bit hexadecimal Unicode character value.</entry> + </row> </tbody> </tgroup> </table> @@ -407,6 +415,14 @@ SELECT 'foo' 'bar'; </para> <para> + The Unicode escape syntax works fully only when the server encoding is UTF8. + When other server encodings are used, only code points in the ASCII range + (up to <literal>\u007F</>) can be specified. Both <literal>\u</> and <literal>\U</> + can also be used to specify UTF-16 surrogate pair to escape characters + with value larger than <literal>\uFFFF</>. + </para> + + <para> It is your responsibility that the byte sequences you create are valid characters in the server character set encoding. 
When the server encoding is UTF-8, then the alternative Unicode escape diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index f404f9d..8ca3007 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -97,6 +97,8 @@ static void check_escape_warning(base_yyscan_t yyscanner); extern int base_yyget_column(yyscan_t yyscanner); extern void base_yyset_column(int column_no, yyscan_t yyscanner); +static void addunicode(pg_wchar c, yyscan_t yyscanner); + %} %option reentrant @@ -134,6 +136,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner); * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes * <xus> quoted string with Unicode escapes + * <xeu> Unicode surrogate escape in extended string */ %x xb @@ -145,6 +148,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner); %x xdolq %x xui %x xus +%x xeu /* * In order to make the world safe for Windows and Mac clients as well as @@ -223,6 +227,8 @@ xeinside [^\\']+ xeescape [\\][^0-7] xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} +xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) +xeunicodebad [\\]([uU]) /* Extended quote * xqdouble implements embedded quote, '''' @@ -535,6 +541,51 @@ other . <xe>{xeinside} { addlit(yytext, yyleng, yyscanner); } +<xe>{xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + check_escape_warning(yyscanner); + + /* + * handle UTF-16 surrogates: + * [0xD800..0xDC00) - first elem. + * [0xDC00..0xE000) - second elem. + */ + if (c >= 0xD800 && c < 0xE000) + { + if (c >= 0xDC00) + yyerror("invalid Unicode surrogate pair"); + + yyextra->utf16_top_part = ((c & 0x3FF) << 10) + 0x10000; + BEGIN(xeu); + } + else + addunicode(c, yyscanner); + } +<xeu>{xeunicode} { + pg_wchar c = strtoul(yytext+2, NULL, 16); + + if (c < 0xDC00 || c >= 0xE000) + yyerror("invalid Unicode surrogate pair"); + + c = (c & 0x3FF) + yyextra->utf16_top_part; + + addunicode(c, yyscanner); + + BEGIN(xe); + } +<xeu>. 
| +<xeu>\n | +<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); } + +<xe>{xeunicodebad} { + ereport(ERROR, + (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), + errmsg("invalid Unicode escape"), + errhint("Unicode escapes must be full-length: \\uXXXX or \\UXXXXXXXX."), + lexer_errposition())); + } + <xe>{xeescape} { if (yytext[1] == '\'') { @@ -1263,3 +1314,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner) if (ptr) pfree(ptr); } + +static void +addunicode(pg_wchar c, base_yyscan_t yyscanner) +{ + char buf[8]; + + if (c == 0 || c > 0x10FFFF) + yyerror("invalid Unicode escape value"); + if (c > 0x7F) + { + if (GetDatabaseEncoding() != PG_UTF8) + yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); + yyextra->saw_non_ascii = true; + } + unicode_to_utf8(c, (unsigned char *)buf); + addlit(buf, pg_mblen(buf), yyscanner); +} + diff --git a/src/include/parser/gramparse.h b/src/include/parser/gramparse.h index a54a1b1..0ef9bf4 100644 --- a/src/include/parser/gramparse.h +++ b/src/include/parser/gramparse.h @@ -71,6 +71,9 @@ typedef struct base_yy_extra_type int xcdepth; /* depth of nesting in slash-star comments */ char *dolqstart; /* current $foo$ quote start string */ + /* first part of UTF16 surrogate for unicode escapes */ + uint32 utf16_top_part; + /* state variables for literal-lexing warnings */ bool warn_on_first_escape; bool saw_non_ascii;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers