Unicode escapes for extended strings.
On 4/16/09, Marko Kreen <[email protected]> wrote:
> Reasons:
>
> - More people are familiar with \u escaping, as it's standard
> in Java/C#/Python and probably other languages.
> - U& strings will not work when stdstr=off.
>
> Syntax:
>
> \uXXXX - 16-bit value
> \UXXXXXXXX - 32-bit value
>
> Additionally, both \u and \U can be used to specify UTF-16 surrogate
> pairs to encode characters with value > 0xFFFF. This is the exact behaviour
> used by Java/C#/Python (except that Java does not have \U).
v3 of the patch:
- convert to new reentrant lexer API
- add lexer targets to avoid fallback to default
- completely disallow \u and \U without the proper number of hex digits
- fix logic bug in surrogate pair handling
--
marko
diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml
index 7637eab..b6f26cc 100644
--- a/doc/src/sgml/syntax.sgml
+++ b/doc/src/sgml/syntax.sgml
@@ -394,6 +394,14 @@ SELECT 'foo' 'bar';
</entry>
<entry>hexadecimal byte value</entry>
</row>
+ <row>
+ <entry>
+ <literal>\u<replaceable>xxxx</replaceable></literal>,
+ <literal>\U<replaceable>xxxxxxxx</replaceable></literal>
+ (<replaceable>x</replaceable> = 0 - 9, A - F)
+ </entry>
+ <entry>16- or 32-bit hexadecimal Unicode character value</entry>
+ </row>
</tbody>
</tgroup>
</table>
@@ -407,6 +415,14 @@ SELECT 'foo' 'bar';
</para>
<para>
+ The Unicode escape syntax works fully only when the server encoding is UTF8.
+ When other server encodings are used, only code points in the ASCII range
+ (up to <literal>\u007F</>) can be specified. Both <literal>\u</> and <literal>\U</>
+ can also be used to specify a UTF-16 surrogate pair to escape a character
+ with a value larger than <literal>\uFFFF</>.
+ </para>
+
+ <para>
It is your responsibility that the byte sequences you create are
valid characters in the server character set encoding. When the
server encoding is UTF-8, then the alternative Unicode escape
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index f404f9d..8ca3007 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -97,6 +97,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
extern int base_yyget_column(yyscan_t yyscanner);
extern void base_yyset_column(int column_no, yyscan_t yyscanner);
+static void addunicode(pg_wchar c, yyscan_t yyscanner);
+
%}
%option reentrant
@@ -134,6 +136,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
* <xdolq> $foo$ quoted strings
* <xui> quoted identifier with Unicode escapes
* <xus> quoted string with Unicode escapes
+ * <xeu> Unicode surrogate escape in extended string
*/
%x xb
@@ -145,6 +148,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
%x xdolq
%x xui
%x xus
+%x xeu
/*
* In order to make the world safe for Windows and Mac clients as well as
@@ -223,6 +227,8 @@ xeinside [^\\']+
xeescape [\\][^0-7]
xeoctesc [\\][0-7]{1,3}
xehexesc [\\]x[0-9A-Fa-f]{1,2}
+xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
+xeunicodebad [\\]([uU])
/* Extended quote
* xqdouble implements embedded quote, ''''
@@ -535,6 +541,51 @@ other .
<xe>{xeinside} {
addlit(yytext, yyleng, yyscanner);
}
+<xe>{xeunicode} {
+ pg_wchar c = strtoul(yytext+2, NULL, 16); /* hex digits start after the 2-character escape prefix */
+
+ check_escape_warning(yyscanner);
+
+ /*
+ * Handle UTF-16 surrogates:
+ * [0xD800..0xDC00) - first elem; stashed here, completed by the <xeu> rule.
+ * [0xDC00..0xE000) - second elem; reported as an error, no first elem seen.
+ */
+ if (c >= 0xD800 && c < 0xE000)
+ {
+ if (c >= 0xDC00)
+ yyerror("invalid Unicode surrogate pair");
+
+ yyextra->utf16_top_part = ((c & 0x3FF) << 10) + 0x10000; /* low 10 bits shifted up, plus the 0x10000 offset */
+ BEGIN(xeu); /* the next token must supply the second elem. */
+ }
+ else
+ addunicode(c, yyscanner);
+ }
+<xeu>{xeunicode} {
+ pg_wchar c = strtoul(yytext+2, NULL, 16); /* hex digits start after the 2-character escape prefix */
+
+ if (c < 0xDC00 || c >= 0xE000) /* only a value in [0xDC00..0xE000) is accepted here */
+ yyerror("invalid Unicode surrogate pair");
+
+ c = (c & 0x3FF) + yyextra->utf16_top_part; /* low 10 bits added to the part saved by the <xe> rule */
+
+ addunicode(c, yyscanner);
+
+ BEGIN(xe); /* pair consumed; switch back to the <xe> state */
+ }
+<xeu>. |
+<xeu>\n |
+<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
+
+<xe>{xeunicodebad} { /* backslash-u or backslash-U that is not followed by the full run of hex digits */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+ errmsg("invalid Unicode escape"),
+ errhint("Unicode escapes must be full-length: \\uXXXX or \\UXXXXXXXX."),
+ lexer_errposition()));
+ }
+
<xe>{xeescape} {
if (yytext[1] == '\'')
{
@@ -1263,3 +1314,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
if (ptr)
pfree(ptr);
}
+
+static void
+addunicode(pg_wchar c, base_yyscan_t yyscanner) /* encode code point c via unicode_to_utf8() and pass the bytes to addlit() */
+{
+ char buf[8]; /* receives the bytes written by unicode_to_utf8() */
+
+ if (c == 0 || c > 0x10FFFF) /* reject zero and values above 0x10FFFF */
+ yyerror("invalid Unicode escape value");
+ if (c > 0x7F) /* beyond ASCII: accepted only when the database encoding is PG_UTF8 */
+ {
+ if (GetDatabaseEncoding() != PG_UTF8)
+ yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
+ yyextra->saw_non_ascii = true; /* flag consulted elsewhere; see "literal-lexing warnings" state in the scanner extra struct */
+ }
+ unicode_to_utf8(c, (unsigned char *)buf);
+ addlit(buf, pg_mblen(buf), yyscanner); /* byte count comes from pg_mblen() applied to the encoded buffer */
+}
+
diff --git a/src/include/parser/gramparse.h b/src/include/parser/gramparse.h
index a54a1b1..0ef9bf4 100644
--- a/src/include/parser/gramparse.h
+++ b/src/include/parser/gramparse.h
@@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
int xcdepth; /* depth of nesting in slash-star comments */
char *dolqstart; /* current $foo$ quote start string */
+ /* first part of UTF16 surrogate for unicode escapes */
+ uint32 utf16_top_part;
+
/* state variables for literal-lexing warnings */
bool warn_on_first_escape;
bool saw_non_ascii;
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers