This is an automated email from the ASF dual-hosted git repository.

maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git

commit d04a61da06025cbc4fd0cc396523cf6b17b3449c
Author: Andres Freund <[email protected]>
AuthorDate: Mon Feb 10 10:03:40 2025 -0500

    Fix handling of invalidly encoded data in escaping functions
    
    Previously invalidly encoded input to various escaping functions could lead 
to
    the escaped string getting incorrectly parsed by psql.  To be safe, escaping
    functions need to ensure that neither invalid nor incomplete multi-byte
    characters can be used to "escape" from being quoted.
    
    Functions which can report errors now return an error in more cases than
    before. Functions that cannot report errors now replace invalid input bytes
    with a byte sequence that cannot be used to escape the quotes and that is
    guaranteed to error out when a query is sent to the server.
    
    The following functions are fixed by this commit:
    - PQescapeLiteral()
    - PQescapeIdentifier()
    - PQescapeString()
    - PQescapeStringConn()
    - fmtId()
    - appendStringLiteral()
    
    Reported-by: Stephen Fewer <[email protected]>
    Reviewed-by: Noah Misch <[email protected]>
    Reviewed-by: Tom Lane <[email protected]>
    Backpatch-through: 13
    Security: CVE-2025-1094
---
 src/fe_utils/string_utils.c    | 170 ++++++++++++++++++++++++++++++++---------
 src/interfaces/libpq/fe-exec.c | 135 +++++++++++++++++++++++---------
 2 files changed, 237 insertions(+), 68 deletions(-)

diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index ae769012e8..9131b471cd 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -104,6 +104,7 @@ fmtIdEnc(const char *rawid, int encoding)
 
        const char *cp;
        bool            need_quotes = false;
+       size_t          remaining = strlen(rawid);
 
        /*
         * These checks need to match the identifier production in scan.l. Don't
@@ -117,7 +118,8 @@ fmtIdEnc(const char *rawid, int encoding)
        else
        {
                /* otherwise check the entire string */
-               for (cp = rawid; *cp; cp++)
+               cp = rawid;
+               for (size_t i = 0; i < remaining; i++, cp++)
                {
                        if (!((*cp >= 'a' && *cp <= 'z')
                                  || (*cp >= '0' && *cp <= '9')
@@ -153,17 +155,90 @@ fmtIdEnc(const char *rawid, int encoding)
        else
        {
                appendPQExpBufferChar(id_return, '"');
-               for (cp = rawid; *cp; cp++)
+
+               cp = &rawid[0];
+               while (remaining > 0)
                {
-                       /*
-                        * Did we find a double-quote in the string? Then make 
this a
-                        * double double-quote per SQL99. Before, we put in a
-                        * backslash/double-quote pair. - thomas 2000-08-05
-                        */
-                       if (*cp == '"')
-                               appendPQExpBufferChar(id_return, '"');
-                       appendPQExpBufferChar(id_return, *cp);
+                       int                     charlen;
+
+                       /* Fast path for plain ASCII */
+                       if (!IS_HIGHBIT_SET(*cp))
+                       {
+                               /*
+                                * Did we find a double-quote in the string? 
Then make this a
+                                * double double-quote per SQL99. Before, we 
put in a
+                                * backslash/double-quote pair. - thomas 
2000-08-05
+                                */
+                               if (*cp == '"')
+                                       appendPQExpBufferChar(id_return, '"');
+                               appendPQExpBufferChar(id_return, *cp);
+                               remaining--;
+                               cp++;
+                               continue;
+                       }
+
+                       /* Slow path for possible multibyte characters */
+                       charlen = pg_encoding_mblen(encoding, cp);
+
+                       if (remaining < charlen)
+                       {
+                               /*
+                                * If the character is longer than the 
available input,
+                                * replace the string with an invalid sequence. 
The invalid
+                                * sequence ensures that the escaped string 
will trigger an
+                                * error on the server-side, even if we can't 
directly report
+                                * an error here.
+                                */
+                               enlargePQExpBuffer(id_return, 2);
+                               pg_encoding_set_invalid(encoding,
+                                                                               
id_return->data + id_return->len);
+                               id_return->len += 2;
+                               id_return->data[id_return->len] = '\0';
+
+                               /* there's no more input data, so we can stop */
+                               break;
+                       }
+                       else if (pg_encoding_verifymbchar(encoding, cp, 
charlen) == -1)
+                       {
+                               /*
+                                * Multibyte character is invalid.  It's 
important to verify
+                                * that as invalid multi-byte characters could 
e.g. be used to
+                                * "skip" over quote characters, e.g. when 
parsing
+                                * character-by-character.
+                                *
+                                * Replace the bytes corresponding to the 
invalid character
+                                * with an invalid sequence, for the same 
reason as above.
+                                *
+                                * It would be a bit faster to verify the whole 
string the
+                                * first time we encounter a set highbit, but 
this way we can
+                                * replace just the invalid characters, which 
probably makes
+                                * it easier for users to find the invalidly 
encoded portion
+                                * of a larger string.
+                                */
+                               enlargePQExpBuffer(id_return, 2);
+                               pg_encoding_set_invalid(encoding,
+                                                                               
id_return->data + id_return->len);
+                               id_return->len += 2;
+                               id_return->data[id_return->len] = '\0';
+
+                               /*
+                                * Copy the rest of the string after the 
invalid multi-byte
+                                * character.
+                                */
+                               remaining -= charlen;
+                               cp += charlen;
+                       }
+                       else
+                       {
+                               for (int i = 0; i < charlen; i++)
+                               {
+                                       appendPQExpBufferChar(id_return, *cp);
+                                       remaining--;
+                                       cp++;
+                               }
+                       }
                }
+
                appendPQExpBufferChar(id_return, '"');
        }
 
@@ -290,6 +365,7 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
        size_t          length = strlen(str);
        const char *source = str;
        char       *target;
+       size_t          remaining = length;
 
        if (!enlargePQExpBuffer(buf, 2 * length + 2))
                return;
@@ -297,10 +373,10 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
        target = buf->data + buf->len;
        *target++ = '\'';
 
-       while (*source != '\0')
+       while (remaining > 0)
        {
                char            c = *source;
-               int                     len;
+               int                     charlen;
                int                     i;
 
                /* Fast path for plain ASCII */
@@ -312,39 +388,65 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
                        /* Copy the character */
                        *target++ = c;
                        source++;
+                       remaining--;
                        continue;
                }
 
                /* Slow path for possible multibyte characters */
-               len = PQmblen(source, encoding);
+               charlen = PQmblen(source, encoding);
 
-               /* Copy the character */
-               for (i = 0; i < len; i++)
+               if (remaining < charlen)
                {
-                       if (*source == '\0')
-                               break;
-                       *target++ = *source++;
-               }
+                       /*
+                        * If the character is longer than the available input, 
replace
+                        * the string with an invalid sequence. The invalid 
sequence
+                        * ensures that the escaped string will trigger an 
error on the
+                        * server-side, even if we can't directly report an 
error here.
+                        *
+                        * We know there's enough space for the invalid 
sequence because
+                        * the "target" buffer is 2 * length + 2 long, and at 
worst we're
+                        * replacing a single input byte with two invalid bytes.
+                        */
+                       pg_encoding_set_invalid(encoding, target);
+                       target += 2;
 
-               /*
-                * If we hit premature end of string (ie, incomplete multibyte
-                * character), try to pad out to the correct length with 
spaces. We
-                * may not be able to pad completely, but we will always be 
able to
-                * insert at least one pad space (since we'd not have quoted a
-                * multibyte character).  This should be enough to make a 
string that
-                * the server will error out on.
-                */
-               if (i < len)
+                       /* there's no more valid input data, so we can stop */
+                       break;
+               }
+               else if (pg_encoding_verifymbchar(encoding, source, charlen) == 
-1)
                {
-                       char       *stop = buf->data + buf->maxlen - 2;
+                       /*
+                        * Multibyte character is invalid.  It's important to 
verify that
+                        * as invalid multi-byte characters could e.g. be used 
to "skip"
+                        * over quote characters, e.g. when parsing
+                        * character-by-character.
+                        *
+                        * Replace the bytes corresponding to the invalid 
character with
+                        * an invalid sequence, for the same reason as above.
+                        *
+                        * It would be a bit faster to verify the whole string 
the first
+                        * time we encounter a set highbit, but this way we can 
replace
+                        * just the invalid characters, which probably makes it 
easier for
+                        * users to find the invalidly encoded portion of a 
larger string.
+                        */
+                       pg_encoding_set_invalid(encoding, target);
+                       target += 2;
+                       remaining -= charlen;
 
-                       for (; i < len; i++)
+                       /*
+                        * Copy the rest of the string after the invalid 
multi-byte
+                        * character.
+                        */
+                       source += charlen;
+               }
+               else
+               {
+                       /* Copy the character */
+                       for (i = 0; i < charlen; i++)
                        {
-                               if (target >= stop)
-                                       break;
-                               *target++ = ' ';
+                               *target++ = *source++;
+                               remaining--;
                        }
-                       break;
                }
        }
 
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 596013596b..85696712b3 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -3878,15 +3878,15 @@ PQescapeStringInternal(PGconn *conn,
 {
        const char *source = from;
        char       *target = to;
-       size_t          remaining = length;
+       size_t          remaining = strnlen(from, length);
 
        if (error)
                *error = 0;
 
-       while (remaining > 0 && *source != '\0')
+       while (remaining > 0)
        {
                char            c = *source;
-               int                     len;
+               int                     charlen;
                int                     i;
 
                /* Fast path for plain ASCII */
@@ -3903,39 +3903,78 @@ PQescapeStringInternal(PGconn *conn,
                }
 
                /* Slow path for possible multibyte characters */
-               len = pg_encoding_mblen(encoding, source);
+               charlen = pg_encoding_mblen(encoding, source);
 
-               /* Copy the character */
-               for (i = 0; i < len; i++)
+               if (remaining < charlen)
                {
-                       if (remaining == 0 || *source == '\0')
-                               break;
-                       *target++ = *source++;
-                       remaining--;
-               }
+                       /*
+                        * If the character is longer than the available input, 
report an
+                        * error if possible, and replace the string with an 
invalid
+                        * sequence. The invalid sequence ensures that the 
escaped string
+                        * will trigger an error on the server-side, even if we 
can't
+                        * directly report an error here.
+                        *
+                        * This isn't *that* crucial when we can report an 
error to the
+                        * caller, but if we can't, the caller will use this 
string
+                        * unmodified and it needs to be safe for parsing.
+                        *
+                        * We know there's enough space for the invalid 
sequence because
+                        * the "to" buffer needs to be at least 2 * length + 1 
long, and
+                        * at worst we're replacing a single input byte with 
two invalid
+                        * bytes.
+                        */
+                       if (error)
+                               *error = 1;
+                       if (conn)
+                               appendPQExpBufferStr(&conn->errorMessage,
+                                                                        
libpq_gettext("incomplete multibyte character\n"));
 
-               /*
-                * If we hit premature end of string (ie, incomplete multibyte
-                * character), try to pad out to the correct length with 
spaces. We
-                * may not be able to pad completely, but we will always be 
able to
-                * insert at least one pad space (since we'd not have quoted a
-                * multibyte character).  This should be enough to make a 
string that
-                * the server will error out on.
-                */
-               if (i < len)
+                       pg_encoding_set_invalid(encoding, target);
+                       target += 2;
+
+                       /* there's no more input data, so we can stop */
+                       break;
+               }
+               else if (pg_encoding_verifymbchar(encoding, source, charlen) == 
-1)
                {
+                       /*
+                        * Multibyte character is invalid.  It's important to 
verify that
+                        * as invalid multi-byte characters could e.g. be used 
to "skip"
+                        * over quote characters, e.g. when parsing
+                        * character-by-character.
+                        *
+                        * Replace the bytes corresponding to the invalid 
character with
+                        * an invalid sequence, for the same reason as above.
+                        *
+                        * It would be a bit faster to verify the whole string 
the first
+                        * time we encounter a set highbit, but this way we can 
replace
+                        * just the invalid characters, which probably makes it 
easier for
+                        * users to find the invalidly encoded portion of a 
larger string.
+                        */
                        if (error)
                                *error = 1;
                        if (conn)
                                appendPQExpBufferStr(&conn->errorMessage,
-                                                                        
libpq_gettext("incomplete multibyte character\n"));
-                       for (; i < len; i++)
+                                                                        
libpq_gettext("invalid multibyte character\n"));
+
+                       pg_encoding_set_invalid(encoding, target);
+                       target += 2;
+                       remaining -= charlen;
+
+                       /*
+                        * Copy the rest of the string after the invalid 
multi-byte
+                        * character.
+                        */
+                       source += charlen;
+               }
+               else
+               {
+                       /* Copy the character */
+                       for (i = 0; i < charlen; i++)
                        {
-                               if (((size_t) (target - to)) / 2 >= length)
-                                       break;
-                               *target++ = ' ';
+                               *target++ = *source++;
+                               remaining--;
                        }
-                       break;
                }
        }
 
@@ -3989,9 +4028,10 @@ PQescapeInternal(PGconn *conn, const char *str, size_t 
len, bool as_ident)
        char       *rp;
        int                     num_quotes = 0; /* single or double, depending 
on as_ident */
        int                     num_backslashes = 0;
-       int                     input_len;
-       int                     result_size;
+       size_t          input_len = strlen(str);
+       size_t          result_size;
        char            quote_char = as_ident ? '"' : '\'';
+       bool            validated_mb = false;
 
        /* We must have a connection, else fail immediately. */
        if (!conn)
@@ -3999,8 +4039,12 @@ PQescapeInternal(PGconn *conn, const char *str, size_t 
len, bool as_ident)
 
        resetPQExpBuffer(&conn->errorMessage);
 
-       /* Scan the string for characters that must be escaped. */
-       for (s = str; (s - str) < len && *s != '\0'; ++s)
+       /*
+        * Scan the string for characters that must be escaped and for invalidly
+        * encoded data.
+        */
+       s = str;
+       for (size_t remaining = input_len; remaining > 0; remaining--, s++)
        {
                if (*s == quote_char)
                        ++num_quotes;
@@ -4013,21 +4057,42 @@ PQescapeInternal(PGconn *conn, const char *str, size_t 
len, bool as_ident)
                        /* Slow path for possible multibyte characters */
                        charlen = pg_encoding_mblen(conn->client_encoding, s);
 
-                       /* Multibyte character overruns allowable length. */
-                       if ((s - str) + charlen > len || memchr(s, 0, charlen) 
!= NULL)
+                       if (charlen > remaining)
                        {
                                appendPQExpBufferStr(&conn->errorMessage,
                                                                         
libpq_gettext("incomplete multibyte character\n"));
                                return NULL;
                        }
 
+                       /*
+                        * If we haven't already, check that multibyte 
characters are
+                        * valid. It's important to verify that as invalid 
multi-byte
+                        * characters could e.g. be used to "skip" over quote 
characters,
+                        * e.g. when parsing character-by-character.
+                        *
+                        * We check validity once, for the whole remainder of 
the string,
+                        * when we first encounter any multi-byte character. 
Some
+                        * encodings have optimized implementations for longer 
strings.
+                        */
+                       if (!validated_mb)
+                       {
+                               if 
(pg_encoding_verifymbstr(conn->client_encoding, s, remaining)
+                                       != strlen(s))
+                               {
+                                       
appendPQExpBufferStr(&conn->errorMessage,
+                                                                               
 libpq_gettext("invalid multibyte character\n"));
+                                       return NULL;
+                               }
+                               validated_mb = true;
+                       }
+
                        /* Adjust s, bearing in mind that for loop will 
increment it. */
                        s += charlen - 1;
+                       remaining -= charlen - 1;
                }
        }
 
        /* Allocate output buffer. */
-       input_len = s - str;
        result_size = input_len + num_quotes + 3;       /* two quotes, plus a 
NUL */
        if (!as_ident && num_backslashes > 0)
                result_size += num_backslashes + 2;
@@ -4073,7 +4138,8 @@ PQescapeInternal(PGconn *conn, const char *str, size_t 
len, bool as_ident)
        }
        else
        {
-               for (s = str; s - str < input_len; ++s)
+               s = str;
+               for (size_t remaining = input_len; remaining > 0; remaining--, 
s++)
                {
                        if (*s == quote_char || (!as_ident && *s == '\\'))
                        {
@@ -4091,6 +4157,7 @@ PQescapeInternal(PGconn *conn, const char *str, size_t 
len, bool as_ident)
                                        *rp++ = *s;
                                        if (--i == 0)
                                                break;
+                                       remaining--;
                                        ++s;            /* for loop will 
provide the final increment */
                                }
                        }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to