Re: proposal: unescape_text function

Pavel Stehule Tue, 23 Jun 2020 02:52:50 -0700

po 22. 6. 2020 v 5:48 odesílatel Pavel Stehule <pavel.steh...@gmail.com>
napsal:


> Hi
>
> There is one user request for unescape function in core.
>
>
> https://stackoverflow.com/questions/20124393/convert-escaped-unicode-character-back-to-actual-character-in-postgresql/20125412?noredirect=1#comment110502526_20125412
>
> This request is about making what we can already do with string literals
> available through a functional interface, instead of via string literals only
>
> I wrote plpgsql function, but built in function can be simpler:
>
> CREATE OR REPLACE FUNCTION public.unescape(text, text)
>  RETURNS text
>  LANGUAGE plpgsql
>  AS $function$
>  DECLARE result text;
>  BEGIN
>    EXECUTE format('SELECT U&%s UESCAPE %s',
>                          quote_literal(replace($1, '\u','^')),
>                          quote_literal($2)) INTO result;
>    RETURN result;
>  END;
>  $function$
>
> postgres=# select unescape('Odpov\u011Bdn\u00E1 osoba','^');
>     unescape
> -----------------
>  Odpovědná osoba
> (1 row)
>
> What do you think about this?
>

I changed the name to the more accurate "unicode_unescape". The patch is attached.

Regards

Pavel


> Regards
>
> Pavel
>

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b7c450ea29..365ea17946 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3533,6 +3533,24 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>unicode_unescape</primary>
+        </indexterm>
+        <function>unicode_unescape</function> ( <parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>escape_char</parameter> <type>text</type> </optional> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Evaluates escaped Unicode characters (4 or 6 hexadecimal digits) to the characters they represent.
+       </para>
+       <para>
+        <literal>unicode_unescape('\0441\043B\043E\043D')</literal>
+        <returnvalue>слон</returnvalue>
+       </para></entry>
+      </row>
+
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..c7f94298c1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -26,7 +26,6 @@
 #include "parser/parser.h"
 #include "parser/scansup.h"
 
-static bool check_uescapechar(unsigned char escape);
 static char *str_udeescape(const char *str, char escape,
 						   int position, core_yyscan_t yyscanner);
 
@@ -278,44 +277,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
 	return cur_token;
 }
 
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
-	if (c >= '0' && c <= '9')
-		return c - '0';
-	if (c >= 'a' && c <= 'f')
-		return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F')
-		return c - 'A' + 0xA;
-	elog(ERROR, "invalid hexadecimal digit");
-	return 0;					/* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
-	if (!is_valid_unicode_codepoint(c))
-		ereport(ERROR,
-				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("invalid Unicode escape value")));
-}
-
-/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
-static bool
-check_uescapechar(unsigned char escape)
-{
-	if (isxdigit(escape)
-		|| escape == '+'
-		|| escape == '\''
-		|| escape == '"'
-		|| scanner_isspace(escape))
-		return false;
-	else
-		return true;
-}
-
 /*
  * Process Unicode escapes in "str", producing a palloc'd plain string
  *
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 18169ec4f4..5a39edf450 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -228,3 +228,41 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert one hex digit to its numeric value; elog(ERROR) on any other byte */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+Oid
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
+
+/* true unless 'escape' is a hex digit, '+', a quote, or scanner whitespace */
+bool
+check_uescapechar(unsigned char escape)
+{
+	if (isxdigit(escape)
+		|| escape == '+'
+		|| escape == '\''
+		|| escape == '"'
+		|| scanner_isspace(escape))
+		return false;
+	else
+		return true;
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2eaabd6231..2934a1d9da 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6139,3 +6139,202 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * Append to "str" the first "len" bytes of "instr" with Unicode escapes
+ * (<escape>XXXX or <escape>+XXXXXX) decoded; a doubled escape character
+ * yields one literal escape.  Raises ERROR on malformed escapes or pairs.
+ */
+static void
+udeescape(StringInfo str, const char *instr, size_t len, char escape)
+{
+	pg_wchar	pair_first = 0;
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	while (len > 0)
+	{
+		if (instr[0] == escape)
+		{
+			if (len >= 2 &&
+				instr[1] == escape)	/* doubled escape => literal escape char */
+			{
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(str, escape);
+				instr += 2;
+				len -= 2;
+			}
+			else if (len >= 5 &&	/* <escape>XXXX form */
+					 isxdigit((unsigned char) instr[1]) &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[1]) << 12) +
+					(hexval(instr[2]) << 8) +
+					(hexval(instr[3]) << 4) +
+					hexval(instr[4]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;	/* hold until the second half arrives */
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 5;
+				len -= 5;
+			}
+			else if (len >= 8 &&	/* <escape>+XXXXXX form */
+					 instr[1] == '+' &&
+					 isxdigit((unsigned char) instr[2]) &&
+					 isxdigit((unsigned char) instr[3]) &&
+					 isxdigit((unsigned char) instr[4]) &&
+					 isxdigit((unsigned char) instr[5]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				pg_wchar	unicode;
+
+				unicode = (hexval(instr[2]) << 20) +
+					(hexval(instr[3]) << 16) +
+					(hexval(instr[4]) << 12) +
+					(hexval(instr[5]) << 8) +
+					(hexval(instr[6]) << 4) +
+					hexval(instr[7]);
+				check_unicode_value(unicode);
+				if (pair_first)
+				{
+					if (is_utf16_surrogate_second(unicode))
+					{
+						unicode = surrogate_pair_to_codepoint(pair_first, unicode);
+						pair_first = 0;
+					}
+					else
+						goto invalid_pair;
+				}
+				else if (is_utf16_surrogate_second(unicode))
+					goto invalid_pair;
+
+				if (is_utf16_surrogate_first(unicode))
+					pair_first = unicode;	/* hold until the second half arrives */
+				else
+				{
+					pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+					appendStringInfoString(str, cbuf);
+				}
+				instr += 8;
+				len -= 8;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
+		}
+		else
+		{
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	return;
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+}
+
+/*
+ * SQL function unicode_unescape(text, text): decode Unicode escapes in arg 0
+ * using the one-byte escape character in arg 1.  NULL arg 0 returns NULL.
+ */
+Datum
+unicode_uescape_with_escape_char(PG_FUNCTION_ARGS)
+{
+	StringInfoData		str;
+	text	   *input_text;
+	text	   *escchr_text;
+	text	   *result;
+	const char *escchr_ptr;
+
+	/* when input string is NULL, then result is NULL too */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	input_text = PG_GETARG_TEXT_PP(0);
+
+	if (PG_ARGISNULL(1))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("null value not allowed for escape char")));
+
+	escchr_text = PG_GETARG_TEXT_PP(1);
+	escchr_ptr = VARDATA_ANY(escchr_text);
+
+	/* reject empty or multi-byte values before dereferencing escchr_ptr */
+	if (VARSIZE_ANY_EXHDR(escchr_text) != 1 || !check_uescapechar(*escchr_ptr))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape character")));
+
+	initStringInfo(&str);
+
+	udeescape(&str, VARDATA_ANY(input_text),
+			  VARSIZE_ANY_EXHDR(input_text), *escchr_ptr);
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * SQL-callable unicode_unescape(text): decode Unicode escapes in the
+ * argument, using backslash as the escape character.
+ */
+Datum
+unicode_uescape(PG_FUNCTION_ARGS)
+{
+	StringInfoData		buf;
+	text	   *src;
+	text	   *unescaped;
+
+	/* a NULL input string produces a NULL result */
+	if (PG_ARGISNULL(0))
+		PG_RETURN_NULL();
+
+	src = PG_GETARG_TEXT_PP(0);
+	initStringInfo(&buf);
+
+	udeescape(&buf, VARDATA_ANY(src), VARSIZE_ANY_EXHDR(src), '\\');
+
+	unescaped = cstring_to_text_with_len(buf.data, buf.len);
+	pfree(buf.data);
+
+	PG_RETURN_TEXT_P(unescaped);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 61f2c2f5b4..42792fca3c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10936,4 +10936,11 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
 
+{ oid => '1136', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text text',
+  proisstrict => 'f', prosrc => 'unicode_uescape_with_escape_char' },
+
+{ oid => '1137', descr => 'unescape Unicode chars in strings',
+  proname => 'unicode_unescape', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unicode_uescape' }
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 7a6ee529ae..e1dc7b8a2a 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -15,6 +15,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *scanstr(const char *s);
 
 extern char *downcase_truncate_identifier(const char *ident, int len,
@@ -27,4 +29,10 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern Oid check_unicode_value(pg_wchar c);
+
+extern bool check_uescapechar(unsigned char escape);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..deb67b566b 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,15 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unicode_unescape('\0441\043B\043E\043D');
+ unicode_unescape 
+------------------
+ слон
+(1 row)
+
+SELECT unicode_unescape('d!0061t!+000061', '!');
+ unicode_unescape 
+------------------
+ data
+(1 row)
+
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..fd99031a1a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,6 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unicode_unescape('\0441\043B\043E\043D');
+SELECT unicode_unescape('d!0061t!+000061', '!');

Re: proposal: unescape_text function

Reply via email to