(cloudberry) 01/06: Add pg_encoding_set_invalid()

maxyang Tue, 04 Mar 2025 22:05:02 -0800

This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


commit 57e45eaa25a354527e778b6d244ee653553cdd7c
Author: Andres Freund <[email protected]>
AuthorDate: Mon Feb 10 10:03:40 2025 -0500

    Add pg_encoding_set_invalid()
    
    There are cases where we cannot / do not want to error out for invalidly
    encoded input. In such cases it can be useful to replace e.g. an incomplete
    multi-byte character with bytes that will trigger an error when getting
    validated as part of a larger string.
    
    Unfortunately, until now, for some encodings no such sequence existed. For
    those encodings this commit removes one previously accepted input combination
    - we consider that to be ok, as the chosen bytes are outside of the valid
    ranges for the encodings, we just previously failed to detect that.
    
    As we cannot add a new field to pg_wchar_table without breaking ABI, this is
    implemented "in-line" in the newly added function.
    
    Author: Noah Misch <[email protected]>
    Reviewed-by: Andres Freund <[email protected]>
    Backpatch-through: 13
    Security: CVE-2025-1094
---
 src/common/wchar.c                               | 55 +++++++++++++++++++++++-
 src/include/mb/pg_wchar.h                        |  3 +-
 src/test/regress/expected/conversion.out         |  4 ++
 src/test/regress/input/create_function_0.source  |  5 +++
 src/test/regress/output/create_function_0.source |  3 ++
 src/test/regress/regress.c                       | 50 +++++++++++++++++++++
 src/test/regress/sql/conversion.sql              |  3 ++
 7 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..35885fb6de 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -15,6 +15,25 @@
 #include "mb/pg_wchar.h"
 
 
+/*
+ * In today's multibyte encodings other than UTF8, this two-byte sequence
+ * ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
+ *
+ * For historical reasons, several verifychar implementations opt to reject
+ * this pair specifically.  Byte pair range constraints, in encoding
+ * originator documentation, always excluded this pair.  No core conversion
+ * could translate it.  However, longstanding verifychar implementations
+ * accepted any non-NUL byte.  big5_to_euc_tw and big5_to_mic even translate
+ * pairs not valid per encoding originator documentation.  To avoid tightening
+ * core or non-core conversions in a security patch, we sought this one pair.
+ *
+ * PQescapeString() historically used spaces for BYTE1; many other values
+ * could suffice for BYTE1.
+ */
+#define NONUTF8_INVALID_BYTE0 (0x8d)
+#define NONUTF8_INVALID_BYTE1 (' ')
+
+
 /*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
@@ -1532,6 +1551,11 @@ pg_big5_verifychar(const unsigned char *s, int len)
        if (len < l)
                return -1;
 
+       if (l == 2 &&
+               s[0] == NONUTF8_INVALID_BYTE0 &&
+               s[1] == NONUTF8_INVALID_BYTE1)
+               return -1;
+
        while (--l > 0)
        {
                if (*++s == '\0')
@@ -1581,6 +1605,11 @@ pg_gbk_verifychar(const unsigned char *s, int len)
        if (len < l)
                return -1;
 
+       if (l == 2 &&
+               s[0] == NONUTF8_INVALID_BYTE0 &&
+               s[1] == NONUTF8_INVALID_BYTE1)
+               return -1;
+
        while (--l > 0)
        {
                if (*++s == '\0')
@@ -1630,6 +1659,11 @@ pg_uhc_verifychar(const unsigned char *s, int len)
        if (len < l)
                return -1;
 
+       if (l == 2 &&
+               s[0] == NONUTF8_INVALID_BYTE0 &&
+               s[1] == NONUTF8_INVALID_BYTE1)
+               return -1;
+
        while (--l > 0)
        {
                if (*++s == '\0')
@@ -1858,6 +1892,19 @@ pg_utf8_islegal(const unsigned char *source, int length)
 }
 
 
+/*
+ * Writes exactly two bytes to dst (no NUL terminator is added) such that:
+ *   pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
+ */
+void
+pg_encoding_set_invalid(int encoding, char *dst)
+{
+       Assert(pg_encoding_max_length(encoding) > 1); /* only multibyte encodings are expected here */
+
+       dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0); /* UTF8 gets 0xc0; all other encodings share BYTE0 */
+       dst[1] = NONUTF8_INVALID_BYTE1; /* second byte is the same for every encoding */
+}
+
 /*
  *-------------------------------------------------------------------
  * encoding info table
@@ -1980,5 +2027,11 @@ pg_encoding_max_length(int encoding)
 {
        Assert(PG_VALID_ENCODING(encoding));
 
-       return pg_wchar_table[encoding].maxmblen;
+       /*
+        * Check for the encoding despite the assert, due to some mingw versions
+        * otherwise issuing bogus warnings.
+        */
+       return PG_VALID_ENCODING(encoding) ?
+               pg_wchar_table[encoding].maxmblen :
+               pg_wchar_table[PG_SQL_ASCII].maxmblen;
 }
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index b6f22f6b30..7c0645b9c0 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -359,7 +359,7 @@ typedef struct pg_enc2name
 #endif
 } pg_enc2name;
 
-extern const pg_enc2name pg_enc2name_tbl[];
+extern PGDLLIMPORT const pg_enc2name pg_enc2name_tbl[];
 
 /*
  * Encoding names for gettext
@@ -573,6 +573,7 @@ extern int  pg_valid_server_encoding_id(int encoding);
  * (in addition to the ones just above).  The constant tables declared
  * earlier in this file are also available from libpgcommon.
  */
+extern void pg_encoding_set_invalid(int encoding, char *dst);
 extern int     pg_encoding_mblen(int encoding, const char *mbstr);
 extern int     pg_encoding_mblen_bounded(int encoding, const char *mbstr);
 extern int     pg_encoding_dsplen(int encoding, const char *mbstr);
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index c4eb0e74b0..88e599109d 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -1,6 +1,10 @@
 --
 -- create user defined conversion
 --
+SELECT FROM test_enc_setup();
+--
+(1 row)
+
 CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
 SET SESSION AUTHORIZATION regress_conversion_user;
 CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;
diff --git a/src/test/regress/input/create_function_0.source b/src/test/regress/input/create_function_0.source
index f47f635789..54c76f9a8e 100644
--- a/src/test/regress/input/create_function_0.source
+++ b/src/test/regress/input/create_function_0.source
@@ -59,6 +59,11 @@ CREATE FUNCTION test_opclass_options_func(internal)
     AS '@libdir@/regress@DLSUFFIX@', 'test_opclass_options_func'
     LANGUAGE C;
 
+
+CREATE FUNCTION test_enc_setup() RETURNS void
+    AS '@libdir@/regress@DLSUFFIX@', 'test_enc_setup'
+    LANGUAGE C STRICT;
+
 CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
     AS '@libdir@/regress@DLSUFFIX@', 'test_enc_conversion'
     LANGUAGE C STRICT;
diff --git a/src/test/regress/output/create_function_0.source 
b/src/test/regress/output/create_function_0.source
index 342bc40e11..7d3908967a 100644
--- a/src/test/regress/output/create_function_0.source
+++ b/src/test/regress/output/create_function_0.source
@@ -46,6 +46,9 @@ CREATE FUNCTION test_opclass_options_func(internal)
     RETURNS void
     AS '@libdir@/regress@DLSUFFIX@', 'test_opclass_options_func'
     LANGUAGE C;
+CREATE FUNCTION test_enc_setup() RETURNS void
+    AS '@libdir@/regress@DLSUFFIX@', 'test_enc_setup'
+    LANGUAGE C STRICT;
 CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
     AS '@libdir@/regress@DLSUFFIX@', 'test_enc_conversion'
     LANGUAGE C STRICT;
diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c
index 351d79e1f0..d8fb52ebdd 100644
--- a/src/test/regress/regress.c
+++ b/src/test/regress/regress.c
@@ -1065,6 +1065,56 @@ test_opclass_options_func(PG_FUNCTION_ARGS)
        PG_RETURN_NULL();
 }
 
+/* one-time tests for encoding infrastructure */
+PG_FUNCTION_INFO_V1(test_enc_setup);
+Datum
+test_enc_setup(PG_FUNCTION_ARGS)
+{
+       /* Test pg_encoding_set_invalid(); every failed check is reported as a WARNING */
+       for (int i = 0; i < _PG_LAST_ENCODING_; i++)
+       {
+               char            buf[2],
+                                       bigbuf[16];
+               int                     len,
+                                       mblen,
+                                       valid;
+
+               if (pg_encoding_max_length(i) == 1)
+                       continue; /* skip single-byte encodings */
+               pg_encoding_set_invalid(i, buf);
+               len = strnlen(buf, 2); /* buf has no NUL terminator, so bound the scan */
+               if (len != 2)
+                       elog(WARNING,
+                                "official invalid string for encoding \"%s\" has length %d",
+                                pg_enc2name_tbl[i].name, len);
+               mblen = pg_encoding_mblen(i, buf);
+               if (mblen != 2)
+                       elog(WARNING,
+                                "official invalid string for encoding \"%s\" has mblen %d",
+                                pg_enc2name_tbl[i].name, mblen);
+               valid = pg_encoding_verifymbstr(i, buf, len);
+               if (valid != 0)
+                       elog(WARNING,
+                                "official invalid string for encoding \"%s\" has valid prefix of length %d",
+                                pg_enc2name_tbl[i].name, valid);
+               valid = pg_encoding_verifymbstr(i, buf, 1); /* the first byte alone must not verify either */
+               if (valid != 0)
+                       elog(WARNING,
+                                "first byte of official invalid string for encoding \"%s\" has valid prefix of length %d",
+                                pg_enc2name_tbl[i].name, valid);
+               memset(bigbuf, ' ', sizeof(bigbuf)); /* pad with spaces: trailing data must not change the verdict */
+               bigbuf[0] = buf[0];
+               bigbuf[1] = buf[1];
+               valid = pg_encoding_verifymbstr(i, bigbuf, sizeof(bigbuf));
+               if (valid != 0)
+                       elog(WARNING,
+                                "trailing data changed official invalid string for encoding \"%s\" to have valid prefix of length %d",
+                                pg_enc2name_tbl[i].name, valid);
+       }
+
+       PG_RETURN_VOID();
+}
+
 /*
  * Call an encoding conversion or verification function.
  *
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index c12580d675..89de66299f 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -1,6 +1,9 @@
 --
 -- create user defined conversion
 --
+
+SELECT FROM test_enc_setup();
+
 CREATE USER regress_conversion_user WITH NOCREATEDB NOCREATEROLE;
 SET SESSION AUTHORIZATION regress_conversion_user;
 CREATE CONVERSION myconv FOR 'LATIN1' TO 'UTF8' FROM iso8859_1_to_utf8;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(cloudberry) 01/06: Add pg_encoding_set_invalid()

Reply via email to