Like ICU, allow a length of -1 to mean that the input string is
NUL-terminated in pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().
This simplifies the API and code a bit.
Along with some other refactoring in this area, we are getting close to
the point where the collation provider can just be a table of methods,
which means we can add an extension hook to provide a different method
table. That still requires more work; I'm just mentioning it here for
context.
Regards,
Jeff Davis
From 6f0c0a9e05039cd295c6c090b3d98d381244b35c Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 21 Aug 2024 10:59:28 -0700
Subject: [PATCH v1] Allow length=-1 for NUL-terminated input to pg_strncoll(),
etc.
Like ICU, allow a length of -1 to be specified for NUL-terminated
arguments to pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().
Simplifies the code and comments.
---
src/backend/utils/adt/pg_locale.c | 186 ++++++++++--------------------
1 file changed, 64 insertions(+), 122 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 48b7e16d81b..26b0f4577f0 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1809,6 +1809,8 @@ get_collation_actual_version(char collprovider, const char *collcollate)
*
* Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
* invoke wcscoll_l().
+ *
+ * An input string length of -1 means that the string is NUL-terminated.
*/
#ifdef WIN32
static int
@@ -1819,8 +1821,8 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
char *buf = sbuf;
char *a1p,
*a2p;
- int a1len = len1 * 2 + 2;
- int a2len = len2 * 2 + 2;
+ int a1len;
+ int a2len;
int r;
int result;
@@ -1830,6 +1832,14 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
Assert(false);
#endif
+ if (len1 == -1)
+ len1 = strlen(arg1);
+ if (len2 == -1)
+ len2 = strlen(arg2);
+
+ a1len = len1 * 2 + 2;
+ a2len = len2 * 2 + 2;
+
if (a1len + a2len > TEXTBUFLEN)
buf = palloc(a1len + a2len);
@@ -1876,40 +1886,10 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
}
#endif /* WIN32 */
-/*
- * pg_strcoll_libc
- *
- * Call strcoll_l() or wcscoll_l() as appropriate for the given locale,
- * platform, and database encoding. If the locale is NULL, use the database
- * collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- */
-static int
-pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
-{
- int result;
-
- Assert(locale->provider == COLLPROVIDER_LIBC);
-#ifdef WIN32
- if (GetDatabaseEncoding() == PG_UTF8)
- {
- size_t len1 = strlen(arg1);
- size_t len2 = strlen(arg2);
-
- result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
- }
- else
-#endif /* WIN32 */
- result = strcoll_l(arg1, arg2, locale->info.lt);
-
- return result;
-}
-
/*
* pg_strncoll_libc
*
- * Nul-terminate the arguments and call pg_strcoll_libc().
+ * An input string length of -1 means that the string is NUL-terminated.
*/
static int
pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
@@ -1917,10 +1897,10 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
- size_t bufsize1 = len1 + 1;
- size_t bufsize2 = len2 + 1;
- char *arg1n;
- char *arg2n;
+ size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
+ size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
+ const char *arg1n;
+ const char *arg2n;
int result;
Assert(locale->provider == COLLPROVIDER_LIBC);
@@ -1934,16 +1914,32 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
if (bufsize1 + bufsize2 > TEXTBUFLEN)
buf = palloc(bufsize1 + bufsize2);
- arg1n = buf;
- arg2n = buf + bufsize1;
+ /* nul-terminate arguments if necessary */
+ if (len1 == -1)
+ {
+ arg1n = arg1;
+ }
+ else
+ {
+ char *buf1 = buf;
+ memcpy(buf1, arg1, len1);
+ buf1[len1] = '\0';
+ arg1n = buf1;
+ }
- /* nul-terminate arguments */
- memcpy(arg1n, arg1, len1);
- arg1n[len1] = '\0';
- memcpy(arg2n, arg2, len2);
- arg2n[len2] = '\0';
+ if (len2 == -1)
+ {
+ arg2n = arg2;
+ }
+ else
+ {
+ char *buf2 = buf + bufsize1;
+ memcpy(buf2, arg2, len2);
+ buf2[len2] = '\0';
+ arg2n = buf2;
+ }
- result = pg_strcoll_libc(arg1n, arg2n, locale);
+ result = strcoll_l(arg1n, arg2n, locale->info.lt);
if (buf != sbuf)
pfree(buf);
@@ -2015,8 +2011,6 @@ pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
* Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
* database encoding. An argument length of -1 means the string is
* NUL-terminated.
- *
- * Arguments must be encoded in the database encoding.
*/
static int
pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
@@ -2054,15 +2048,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
/*
* pg_strcoll
*
- * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
- * appropriate for the given locale, platform, and database encoding. If the
- * locale is not specified, use the database collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- *
- * The caller is responsible for breaking ties if the collation is
- * deterministic; this maintains consistency with pg_strxfrm(), which cannot
- * easily account for deterministic collations.
+ * Like pg_strncoll for NUL-terminated input strings.
*/
int
pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
@@ -2070,7 +2056,7 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
int result;
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strcoll_libc(arg1, arg2, locale);
+ result = pg_strncoll_libc(arg1, -1, arg2, -1, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
@@ -2089,11 +2075,8 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
* appropriate for the given locale, platform, and database encoding. If the
* locale is not specified, use the database collation.
*
- * Arguments must be encoded in the database encoding.
- *
- * This function may need to nul-terminate the arguments for libc functions;
- * so if the caller already has nul-terminated strings, it should call
- * pg_strcoll() instead.
+ * The input strings must be encoded in the database encoding. If an input
+ * string is NUL-terminated, its length may be specified as -1.
*
* The caller is responsible for breaking ties if the collation is
* deterministic; this maintains consistency with pg_strnxfrm(), which cannot
@@ -2119,14 +2102,6 @@ pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
}
-static size_t
-pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
- pg_locale_t locale)
-{
- Assert(locale->provider == COLLPROVIDER_LIBC);
- return strxfrm_l(dest, src, destsize, locale->info.lt);
-}
-
static size_t
pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
pg_locale_t locale)
@@ -2138,14 +2113,17 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
Assert(locale->provider == COLLPROVIDER_LIBC);
+ if (srclen == -1)
+ return strxfrm_l(dest, src, destsize, locale->info.lt);
+
if (bufsize > TEXTBUFLEN)
buf = palloc(bufsize);
- /* nul-terminate arguments */
+ /* nul-terminate argument */
memcpy(buf, src, srclen);
buf[srclen] = '\0';
- result = pg_strxfrm_libc(dest, buf, destsize, locale);
+ result = strxfrm_l(dest, buf, destsize, locale->info.lt);
if (buf != sbuf)
pfree(buf);
@@ -2326,20 +2304,7 @@ pg_strxfrm_enabled(pg_locale_t locale)
/*
* pg_strxfrm
*
- * Transforms 'src' to a nul-terminated string stored in 'dest' such that
- * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
- * untransformed strings.
- *
- * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
- * may be NULL.
- *
- * Not all providers support pg_strxfrm() safely. The caller should check
- * pg_strxfrm_enabled() first, otherwise this function may return wrong
- * results or an error.
- *
- * Returns the number of bytes needed (or more) to store the transformed
- * string, excluding the terminating nul byte. If the value returned is
- * 'destsize' or greater, the resulting contents of 'dest' are undefined.
+ * Like pg_strnxfrm for a NUL-terminated input string.
*/
size_t
pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
@@ -2347,7 +2312,7 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
size_t result = 0; /* keep compiler quiet */
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strxfrm_libc(dest, src, destsize, locale);
+ result = pg_strnxfrm_libc(dest, src, -1, destsize, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
@@ -2366,8 +2331,9 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
* ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
* untransformed strings.
*
- * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
- * be NULL.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1. If 'destsize'
+ * is zero, 'dest' may be NULL.
*
* Not all providers support pg_strnxfrm() safely. The caller should check
* pg_strxfrm_enabled() first, otherwise this function may return wrong
@@ -2376,10 +2342,6 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
* Returns the number of bytes needed (or more) to store the transformed
* string, excluding the terminating nul byte. If the value returned is
* 'destsize' or greater, the resulting contents of 'dest' are undefined.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm() instead.
*/
size_t
pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
@@ -2421,44 +2383,24 @@ pg_strxfrm_prefix_enabled(pg_locale_t locale)
/*
* pg_strxfrm_prefix
*
- * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
- * untransformed strings. The result is not nul-terminated.
- *
- * The provided 'src' must be nul-terminated.
- *
- * Not all providers support pg_strxfrm_prefix() safely. The caller should
- * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
- * wrong results or an error.
- *
- * If destsize is not large enough to hold the resulting byte sequence, stores
- * only the first destsize bytes in 'dest'. Returns the number of bytes
- * actually copied to 'dest'.
+ * Like pg_strnxfrm_prefix for a NUL-terminated input string.
*/
size_t
pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
pg_locale_t locale)
{
- size_t result = 0; /* keep compiler quiet */
-
-#ifdef USE_ICU
- if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
- else
-#endif
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
+ return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
}
/*
* pg_strnxfrm_prefix
*
* Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * memcmp() on the byte sequence is equivalent to pg_strncoll() on
* untransformed strings. The result is not nul-terminated.
*
- * The provided 'src' must be nul-terminated.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1.
*
* Not all providers support pg_strnxfrm_prefix() safely. The caller should
* check pg_strxfrm_prefix_enabled() first, otherwise this function may return
@@ -2467,10 +2409,6 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
* If destsize is not large enough to hold the resulting byte sequence, stores
* only the first destsize bytes in 'dest'. Returns the number of bytes
* actually copied to 'dest'.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm_prefix() instead.
*/
size_t
pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
@@ -2661,6 +2599,8 @@ init_icu_converter(void)
/*
* Find length, in UChars, of given string if converted to UChar string.
+ *
+ * A length of -1 indicates that the input string is NUL-terminated.
*/
static size_t
uchar_length(UConverter *converter, const char *str, int32_t len)
@@ -2678,6 +2618,8 @@ uchar_length(UConverter *converter, const char *str, int32_t len)
/*
* Convert the given source string into a UChar string, stored in dest, and
* return the length (in UChars).
+ *
+ * A srclen of -1 indicates that the input string is NUL-terminated.
*/
static int32_t
uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
--
2.34.1