On Thu, 2023-03-23 at 10:16 -0700, Jeff Davis wrote:
> I could get rid of the SQL-callable function and move the rest of the
> changes into 0006. I'll see if that arrangement works better, and
> that
> way we can add the SQL-callable function later (or perhaps not at all
> if it's not desired).

Attached a new series that doesn't include the SQL-callable function.
It's probably better to just wait and see what functions seem actually
useful to users.

I included a new small patch to fix a potential UCollator leak and make
the errors more consistent.


-- 
Jeff Davis
PostgreSQL Contributor Team - AWS


From 7046e684b89fdae92fd5ea83fab12c5634cf3ba3 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Thu, 23 Mar 2023 21:50:47 -0700
Subject: [PATCH v8 1/4] Avoid potential UCollator leak for older ICU versions.

ICU versions 53 and earlier rely on icu_set_collation_attributes() to
process the attributes in the locale string. Refactor slightly to
avoid leaking the already-opened UCollator object if an error is
encountered.

Also centralize error reporting in pg_ucol_open() and make consistent
between versions.
---
 src/backend/utils/adt/pg_locale.c | 59 ++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 24 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 42b6ad45cb..f9bc30b85f 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -147,7 +147,8 @@ static size_t uchar_length(UConverter *converter,
 static int32_t uchar_convert(UConverter *converter,
 							 UChar *dest, int32_t destlen,
 							 const char *str, int32_t srclen);
-static void icu_set_collation_attributes(UCollator *collator, const char *loc);
+static void icu_set_collation_attributes(UCollator *collator, const char *loc,
+										 UErrorCode *status);
 #endif
 
 /*
@@ -2503,6 +2504,7 @@ pg_ucol_open(const char *loc_str)
 {
 	UCollator  *collator;
 	UErrorCode	status;
+	const char *orig_str = loc_str;
 	char	   *fixed_str = NULL;
 
 	/*
@@ -2552,11 +2554,27 @@ pg_ucol_open(const char *loc_str)
 	collator = ucol_open(loc_str, &status);
 	if (U_FAILURE(status))
 		ereport(ERROR,
+				/* use original string for error report */
 				(errmsg("could not open collator for locale \"%s\": %s",
-						loc_str, u_errorName(status))));
+						orig_str, u_errorName(status))));
 
 	if (U_ICU_VERSION_MAJOR_NUM < 54)
-		icu_set_collation_attributes(collator, loc_str);
+	{
+		status = U_ZERO_ERROR;
+		icu_set_collation_attributes(collator, loc_str, &status);
+
+		/*
+		 * Pretend the error came from ucol_open(), for consistent error
+		 * message across ICU versions.
+		 */
+		if (U_FAILURE(status))
+		{
+			ucol_close(collator);
+			ereport(ERROR,
+					(errmsg("could not open collator for locale \"%s\": %s",
+							orig_str, u_errorName(status))));
+		}
+	}
 
 	if (fixed_str != NULL)
 		pfree(fixed_str);
@@ -2706,9 +2724,9 @@ icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
  */
 pg_attribute_unused()
 static void
-icu_set_collation_attributes(UCollator *collator, const char *loc)
+icu_set_collation_attributes(UCollator *collator, const char *loc,
+							 UErrorCode *status)
 {
-	UErrorCode	status;
 	int32_t		len;
 	char	   *icu_locale_id;
 	char	   *lower_str;
@@ -2721,15 +2739,15 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 	 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
 	 * uloc_canonicalize().
 	 */
-	status = U_ZERO_ERROR;
-	len = uloc_canonicalize(loc, NULL, 0, &status);
+	*status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, NULL, 0, status);
 	icu_locale_id = palloc(len + 1);
-	status = U_ZERO_ERROR;
-	len = uloc_canonicalize(loc, icu_locale_id, len + 1, &status);
-	if (U_FAILURE(status))
+	*status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
+	if (U_FAILURE(*status))
 		ereport(ERROR,
 				(errmsg("canonicalization failed for locale string \"%s\": %s",
-						loc, u_errorName(status))));
+						loc, u_errorName(*status))));
 
 	lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
 
@@ -2751,7 +2769,7 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 			UColAttribute uattr;
 			UColAttributeValue uvalue;
 
-			status = U_ZERO_ERROR;
+			*status = U_ZERO_ERROR;
 
 			*e = '\0';
 			name = token;
@@ -2801,19 +2819,12 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 			else if (strcmp(value, "upper") == 0)
 				uvalue = UCOL_UPPER_FIRST;
 			else
-				status = U_ILLEGAL_ARGUMENT_ERROR;
-
-			if (status == U_ZERO_ERROR)
-				ucol_setAttribute(collator, uattr, uvalue, &status);
+			{
+				*status = U_ILLEGAL_ARGUMENT_ERROR;
+				break;
+			}
 
-			/*
-			 * Pretend the error came from ucol_open(), for consistent error
-			 * message across ICU versions.
-			 */
-			if (U_FAILURE(status))
-				ereport(ERROR,
-						(errmsg("could not open collator for locale \"%s\": %s",
-								loc, u_errorName(status))));
+			ucol_setAttribute(collator, uattr, uvalue, status);
 		}
 	}
 
-- 
2.34.1

From 90e37d306320c0d4c33d12e89b28039570d05b30 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Wed, 22 Mar 2023 10:06:23 -0700
Subject: [PATCH v8 2/4] initdb: emit message when using default ICU locale.

Also, minor cleanup to separate the code that chooses the default ICU
locale from the code that verifies that a specified locale can be
opened with ucol_open(). This cleanup creates a better place for an
important comment.
---
 src/bin/initdb/initdb.c | 77 ++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 25 deletions(-)

diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index bae97539fc..9bf094500d 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2242,46 +2242,73 @@ check_icu_locale_encoding(int user_enc)
 	return true;
 }
 
+#ifdef USE_ICU
+
 /*
- * Check that ICU accepts the locale name; or if not specified, retrieve the
- * default ICU locale.
+ * Determine default ICU locale by opening the default collator and reading
+ * its locale.
+ *
+ * NB: The default collator (opened using NULL) is different from the collator
+ * for the root locale (opened with "", "und", or "root"). The former depends
+ * on the environment (useful at initdb time) and the latter does not.
  */
-static void
-check_icu_locale(void)
+static char *
+default_icu_locale(void)
 {
-#ifdef USE_ICU
 	UCollator	*collator;
 	UErrorCode   status;
+	const char	*valid_locale;
+	char		*default_locale;
 
 	status = U_ZERO_ERROR;
-	collator = ucol_open(icu_locale, &status);
+	collator = ucol_open(NULL, &status);
+	if (U_FAILURE(status))
+		pg_fatal("could not open collator for default locale: %s",
+				 u_errorName(status));
+
+	status = U_ZERO_ERROR;
+	valid_locale = ucol_getLocaleByType(collator, ULOC_VALID_LOCALE,
+										&status);
 	if (U_FAILURE(status))
 	{
-		if (icu_locale)
-			pg_fatal("could not open collator for locale \"%s\": %s",
-					 icu_locale, u_errorName(status));
-		else
-			pg_fatal("could not open collator for default locale: %s",
-					 u_errorName(status));
+		ucol_close(collator);
+		pg_fatal("could not determine default ICU locale");
 	}
 
-	/* if not specified, get locale from default collator */
-	if (icu_locale == NULL)
-	{
-		const char	*default_locale;
+	default_locale = pg_strdup(valid_locale);
 
-		status = U_ZERO_ERROR;
-		default_locale = ucol_getLocaleByType(collator, ULOC_VALID_LOCALE,
-											  &status);
-		if (U_FAILURE(status))
-		{
-			ucol_close(collator);
-			pg_fatal("could not determine default ICU locale");
-		}
+	ucol_close(collator);
 
-		icu_locale = pg_strdup(default_locale);
+	return default_locale;
+}
+
+#endif
+
+/*
+ * If not specified, assign the default locale. Then check that ICU accepts
+ * the locale.
+ */
+static void
+check_icu_locale(void)
+{
+#ifdef USE_ICU
+	UCollator	*collator;
+	UErrorCode   status;
+
+	/* acquire default locale from the environment, if not specified */
+	if (icu_locale == NULL)
+	{
+		icu_locale = default_icu_locale();
+		printf(_("Using default ICU locale \"%s\".\n"), icu_locale);
 	}
 
+	/* check that the resulting locale can be opened */
+	status = U_ZERO_ERROR;
+	collator = ucol_open(icu_locale, &status);
+	if (U_FAILURE(status))
+		pg_fatal("could not open collator for locale \"%s\": %s",
+				 icu_locale, u_errorName(status));
+
 	ucol_close(collator);
 #endif
 }
-- 
2.34.1

From c52d0196aa437ed82a03acf5fb7e94eb6fe41bb6 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Wed, 15 Mar 2023 12:37:06 -0700
Subject: [PATCH v8 3/4] Canonicalize ICU locale names to language tags.

Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.

Canonicalization is important, because it's able to handle more kinds
of locale strings than ucol_open(). Without canonicalizing first, a
locale string like "fr_CA.UTF-8" will be misinterpreted by
ucol_open().

The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.ca...@j-davis.com
---
 src/backend/commands/collationcmds.c          | 60 ++++++++-----
 src/backend/commands/dbcommands.c             | 32 +++++++
 src/backend/utils/adt/pg_locale.c             | 87 +++++++++++++++++--
 src/bin/initdb/initdb.c                       | 79 ++++++++++++++++-
 src/bin/pg_dump/t/002_pg_dump.pl              |  4 +-
 src/include/utils/pg_locale.h                 |  1 +
 .../regress/expected/collate.icu.utf8.out     | 28 +++++-
 src/test/regress/sql/collate.icu.utf8.sql     | 13 +++
 8 files changed, 269 insertions(+), 35 deletions(-)

diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index 3d0aea0568..fe811229b3 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -165,6 +165,11 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 		else
 			colliculocale = NULL;
 
+		/*
+		 * When the ICU locale comes from an existing collation, do not
+		 * canonicalize to a language tag.
+		 */
+
 		datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
 		if (!isnull)
 			collicurules = TextDatumGetCString(datum);
@@ -254,10 +259,43 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 		}
 		else if (collprovider == COLLPROVIDER_ICU)
 		{
+#ifdef USE_ICU
+			char	*langtag;
+
 			if (!colliculocale)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 						 errmsg("parameter \"locale\" must be specified")));
+
+			/*
+			 * During binary upgrade, preserve the locale string. Otherwise,
+			 * canonicalize to a language tag.
+			 */
+			if (!IsBinaryUpgrade)
+			{
+				langtag = icu_language_tag(colliculocale, WARNING);
+				if (langtag)
+				{
+					ereport(NOTICE,
+							(errmsg("using language tag \"%s\" for locale \"%s\"",
+									langtag, colliculocale)));
+
+					colliculocale = langtag;
+				}
+				else
+				{
+					ereport(WARNING,
+							(errmsg("could not convert locale \"%s\" to language tag",
+									colliculocale)));
+				}
+			}
+
+			check_icu_locale(colliculocale);
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("ICU is not supported in this build")));
+#endif
 		}
 
 		/*
@@ -576,26 +614,6 @@ cmpaliases(const void *a, const void *b)
 
 
 #ifdef USE_ICU
-/*
- * Get the ICU language tag for a locale name.
- * The result is a palloc'd string.
- */
-static char *
-get_icu_language_tag(const char *localename)
-{
-	char		buf[ULOC_FULLNAME_CAPACITY];
-	UErrorCode	status;
-
-	status = U_ZERO_ERROR;
-	uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("could not convert locale name \"%s\" to language tag: %s",
-						localename, u_errorName(status))));
-
-	return pstrdup(buf);
-}
-
 /*
  * Get a comment (specifically, the display name) for an ICU locale.
  * The result is a palloc'd string, or NULL if we can't get a comment
@@ -957,7 +975,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 			else
 				name = uloc_getAvailable(i);
 
-			langtag = get_icu_language_tag(name);
+			langtag = icu_language_tag(name, ERROR);
 
 			/*
 			 * Be paranoid about not allowing any non-ASCII strings into
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 4d5d5d6866..436f6ec60d 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1043,6 +1043,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 	if (dblocprovider == COLLPROVIDER_ICU)
 	{
+#ifdef USE_ICU
+		char	*langtag;
+
 		if (!(is_encoding_supported_by_icu(encoding)))
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1058,7 +1061,36 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("ICU locale must be specified")));
 
+		/*
+		 * During binary upgrade, or when the locale came from the template
+		 * database, preserve locale string. Otherwise, canonicalize to a
+		 * language tag.
+		 */
+		if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
+		{
+			langtag = icu_language_tag(dbiculocale, WARNING);
+			if (langtag)
+			{
+				ereport(NOTICE,
+						(errmsg("using language tag \"%s\" for locale \"%s\"",
+								langtag, dbiculocale)));
+
+				dbiculocale = langtag;
+			}
+			else
+			{
+				ereport(WARNING,
+						(errmsg("could not convert locale \"%s\" to language tag",
+								dbiculocale)));
+			}
+		}
+
 		check_icu_locale(dbiculocale);
+#else
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("ICU is not supported in this build")));
+#endif
 	}
 	else
 	{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index f9bc30b85f..366ec71b9f 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2831,26 +2831,97 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,
 	pfree(lower_str);
 }
 
-#endif							/* USE_ICU */
-
 /*
  * Check if the given locale ID is valid, and ereport(ERROR) if it isn't.
  */
 void
 check_icu_locale(const char *icu_locale)
 {
-#ifdef USE_ICU
 	UCollator  *collator;
 
 	collator = pg_ucol_open(icu_locale);
 	ucol_close(collator);
-#else
-	ereport(ERROR,
-			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			 errmsg("ICU is not supported in this build")));
-#endif
 }
 
+/*
+ * Return the BCP47 language tag representation of the requested locale.
+ *
+ * This function should be called before passing the string to ucol_open(),
+ * because conversion to a language tag also performs "level 2
+ * canonicalization". In addition to producing a consistent format, level 2
+ * canonicalization is able to more accurately interpret different input
+ * locale string formats, such as POSIX and .NET IDs.
+ */
+char *
+icu_language_tag(const char *loc_str, int elevel)
+{
+	UErrorCode	 status;
+	char		 lang[ULOC_LANG_CAPACITY];
+	char		*langtag;
+	size_t		 buflen = 32;	/* arbitrary starting buffer size */
+	const bool	 strict = true;
+
+	status = U_ZERO_ERROR;
+	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+	if (U_FAILURE(status))
+	{
+		ereport(elevel,
+				(errmsg("could not get language from locale \"%s\": %s",
+						loc_str, u_errorName(status))));
+		return NULL;
+	}
+
+	/* C/POSIX locales aren't handled by uloc_getLanguageTag() */
+	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		return pstrdup("en-US-u-va-posix");
+
+	/*
+	 * A BCP47 language tag doesn't have a clearly-defined upper limit
+	 * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
+	 * uloc_toLanguageTag() doesn't always return the ultimate length on the
+	 * first call, necessitating a loop.
+	 */
+	langtag = palloc(buflen);
+	while (true)
+	{
+		int32_t		len;
+
+		status = U_ZERO_ERROR;
+		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
+
+		/*
+		 * If the result fits in the buffer exactly (len == buflen),
+		 * uloc_toLanguageTag() will return success without nul-terminating
+		 * the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
+		 * buflen and try again.
+		 */
+		if ((status == U_BUFFER_OVERFLOW_ERROR ||
+			 (U_SUCCESS(status) && len >= buflen)) &&
+			buflen < MaxAllocSize)
+		{
+			buflen = Min(buflen * 2, MaxAllocSize);
+			langtag = repalloc(langtag, buflen);
+			continue;
+		}
+
+		break;
+	}
+
+	if (U_FAILURE(status))
+	{
+		pfree(langtag);
+
+		ereport(elevel,
+				(errmsg("could not convert locale name \"%s\" to language tag: %s",
+						loc_str, u_errorName(status))));
+		return NULL;
+	}
+
+	return langtag;
+}
+
+#endif							/* USE_ICU */
+
 /*
  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
  * Therefore we keep them here rather than with the mbutils code.
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 9bf094500d..5cd146f1d0 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2282,11 +2282,78 @@ default_icu_locale(void)
 	return default_locale;
 }
 
+/*
+ * Convert to canonical BCP47 language tag. Must be consistent with
+ * icu_language_tag().
+ */
+static char *
+icu_language_tag(const char *loc_str)
+{
+	UErrorCode	 status;
+	char		 lang[ULOC_LANG_CAPACITY];
+	char		*langtag;
+	size_t		 buflen = 32;	/* arbitrary starting buffer size */
+	const bool	 strict = true;
+
+	status = U_ZERO_ERROR;
+	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+	if (U_FAILURE(status))
+	{
+		pg_fatal("could not get language from locale \"%s\": %s",
+				 loc_str, u_errorName(status));
+	}
+
+	/* C/POSIX locales aren't handled by uloc_getLanguageTag() */
+	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		return pstrdup("en-US-u-va-posix");
+
+	/*
+	 * A BCP47 language tag doesn't have a clearly-defined upper limit
+	 * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
+	 * uloc_toLanguageTag() doesn't always return the ultimate length on the
+	 * first call, necessitating a loop.
+	 */
+	langtag = pg_malloc(buflen);
+	while (true)
+	{
+		int32_t		len;
+
+		status = U_ZERO_ERROR;
+		len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
+
+		/*
+		 * If the result fits in the buffer exactly (len == buflen),
+		 * uloc_toLanguageTag() will return success without nul-terminating
+		 * the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
+		 * buflen and try again.
+		 */
+		if (status == U_BUFFER_OVERFLOW_ERROR ||
+			(U_SUCCESS(status) && len >= buflen))
+		{
+			buflen = buflen * 2;
+			langtag = pg_realloc(langtag, buflen);
+			continue;
+		}
+
+		break;
+	}
+
+	if (U_FAILURE(status))
+	{
+		pg_free(langtag);
+
+		pg_fatal("could not convert locale name \"%s\" to language tag: %s",
+				 loc_str, u_errorName(status));
+	}
+
+	return langtag;
+}
+
 #endif
 
 /*
- * If not specified, assign the default locale. Then check that ICU accepts
- * the locale.
+ * If not specified, assign the default locale. Then convert to a language
+ * tag, and check that ICU accepts it.
  */
 static void
 check_icu_locale(void)
@@ -2294,6 +2361,7 @@ check_icu_locale(void)
 #ifdef USE_ICU
 	UCollator	*collator;
 	UErrorCode   status;
+	char		*langtag;
 
 	/* acquire default locale from the environment, if not specified */
 	if (icu_locale == NULL)
@@ -2302,6 +2370,13 @@ check_icu_locale(void)
 		printf(_("Using default ICU locale \"%s\".\n"), icu_locale);
 	}
 
+	/* canonicalize to a language tag */
+	langtag = icu_language_tag(icu_locale);
+	printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
+		   langtag, icu_locale);
+	pg_free(icu_locale);
+	icu_locale = langtag;
+
 	/* check that the resulting locale can be opened */
 	status = U_ZERO_ERROR;
 	collator = ucol_open(icu_locale, &status);
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index a22f27f300..0b38c0537b 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -1837,9 +1837,9 @@ my %tests = (
 
 	'CREATE COLLATION icu_collation' => {
 		create_order => 76,
-		create_sql   => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'C');",
+		create_sql   => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'en-US-u-va-posix');",
 		regexp =>
-		  qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'C'(, version = '[^']*')?\);/m,
+		  qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'en-US-u-va-posix'(, version = '[^']*')?\);/m,
 		icu => 1,
 		like      => { %full_runs, section_pre_data => 1, },
 	},
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index dd822a68be..471b111650 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -121,6 +121,7 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 #ifdef USE_ICU
 extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
 extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
+extern char *icu_language_tag(const char *loc_str, int elevel);
 #endif
 extern void check_icu_locale(const char *icu_locale);
 
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index f135200c99..0d00df165a 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1019,6 +1019,7 @@ reset enable_seqscan;
 CREATE ROLE regress_test_role;
 CREATE SCHEMA test_schema;
 -- We need to do this this way to cope with varying names for encodings:
+SET client_min_messages TO WARNING;
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
@@ -1033,9 +1034,11 @@ BEGIN
           quote_literal(current_setting('lc_collate')) || ');';
 END
 $$;
+RESET client_min_messages;
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
 ERROR:  parameter "locale" must be specified
 CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */  DROP COLLATION testx;
+NOTICE:  using language tag "nonsense" for locale "nonsense"
 CREATE COLLATION test4 FROM nonsense;
 ERROR:  collation "nonsense" for encoding "UTF8" does not exist
 CREATE COLLATION test5 FROM test0;
@@ -1162,14 +1165,18 @@ SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;
 
 -- test ICU collation customization
 -- test the attributes handled by icu_set_collation_attributes()
+SET client_min_messages=WARNING;
 CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
+RESET client_min_messages;
 SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
  ?column? | ?column? 
 ----------+----------
  t        | t
 (1 row)
 
+SET client_min_messages=WARNING;
 CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
+RESET client_min_messages;
 SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
  ?column? | ?column? 
 ----------+----------
@@ -1177,7 +1184,9 @@ SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll
 (1 row)
 
 CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
+NOTICE:  using language tag "und-u-kf-lower" for locale "@colCaseFirst=lower"
 CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
+NOTICE:  using language tag "und-u-kf-upper" for locale "@colCaseFirst=upper"
 SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
  ?column? | ?column? 
 ----------+----------
@@ -1185,13 +1194,16 @@ SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcol
 (1 row)
 
 CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
+NOTICE:  using language tag "und-u-ka-shifted" for locale "@colAlternate=shifted"
 SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
  ?column? | ?column? 
 ----------+----------
  t        | t
 (1 row)
 
+SET client_min_messages=WARNING;
 CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
+RESET client_min_messages;
 SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
  ?column? | ?column? 
 ----------+----------
@@ -1199,10 +1211,12 @@ SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_n
 (1 row)
 
 CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
-ERROR:  could not open collator for locale "@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR
+NOTICE:  using language tag "und-u-kn-lower" for locale "@colNumeric=lower"
+ERROR:  could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR
 -- test that attributes not handled by icu_set_collation_attributes()
 -- (handled by ucol_open() directly) also work
 CREATE COLLATION testcoll_de_phonebook (provider = icu, locale = 'de@collation=phonebook');
+NOTICE:  using language tag "de-u-co-phonebk" for locale "de@collation=phonebook"
 SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE testcoll_de_phonebook;
  ?column? | ?column? 
 ----------+----------
@@ -1211,6 +1225,7 @@ SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE tes
 
 -- rules
 CREATE COLLATION testcoll_rules1 (provider = icu, locale = '', rules = '&a < g');
+NOTICE:  using language tag "und" for locale ""
 CREATE TABLE test7 (a text);
 -- example from https://unicode-org.github.io/icu/userguide/collation/customization/#syntax
 INSERT INTO test7 VALUES ('Abernathy'), ('apple'), ('bird'), ('Boston'), ('Graham'), ('green');
@@ -1238,10 +1253,13 @@ SELECT * FROM test7 ORDER BY a COLLATE testcoll_rules1;
 
 DROP TABLE test7;
 CREATE COLLATION testcoll_rulesx (provider = icu, locale = '', rules = '!!wrong!!');
-ERROR:  could not open collator for locale "" with rules "!!wrong!!": U_INVALID_FORMAT_ERROR
+NOTICE:  using language tag "und" for locale ""
+ERROR:  could not open collator for locale "und" with rules "!!wrong!!": U_INVALID_FORMAT_ERROR
 -- nondeterministic collations
 CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true);
+NOTICE:  using language tag "und" for locale ""
 CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false);
+NOTICE:  using language tag "und" for locale ""
 CREATE TABLE test6 (a int, b text);
 -- same string in different normal forms
 INSERT INTO test6 VALUES (1, U&'\00E4bc');
@@ -1291,7 +1309,9 @@ SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_nondet;
 (2 rows)
 
 CREATE COLLATION case_sensitive (provider = icu, locale = '');
+NOTICE:  using language tag "und" for locale ""
 CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false);
+NOTICE:  using language tag "und-u-ks-level2" for locale "@colStrength=secondary"
 SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
  ?column? | ?column? 
 ----------+----------
@@ -1306,6 +1326,7 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
 
 -- test language tags
 CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
+NOTICE:  using language tag "en-u-ks-level1" for locale "en-u-ks-level1"
 SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
  ?column? 
 ----------
@@ -1313,6 +1334,7 @@ SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
 (1 row)
 
 CREATE COLLATION lt_upperfirst (provider = icu, locale = 'und-u-kf-upper');
+NOTICE:  using language tag "und-u-kf-upper" for locale "und-u-kf-upper"
 SELECT 'Z' COLLATE lt_upperfirst < 'z' COLLATE lt_upperfirst;
  ?column? 
 ----------
@@ -1773,7 +1795,9 @@ SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text);
 (2 rows)
 
 -- accents
+SET client_min_messages=WARNING;
 CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false);
+RESET client_min_messages;
 CREATE TABLE test4 (a int, b text);
 INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
 SELECT * FROM test4 WHERE b = 'cote';
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 8105ebc8ae..3bd98868b4 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -357,6 +357,8 @@ CREATE ROLE regress_test_role;
 CREATE SCHEMA test_schema;
 
 -- We need to do this this way to cope with varying names for encodings:
+SET client_min_messages TO WARNING;
+
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
@@ -370,6 +372,9 @@ BEGIN
           quote_literal(current_setting('lc_collate')) || ');';
 END
 $$;
+
+RESET client_min_messages;
+
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
 CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */  DROP COLLATION testx;
 
@@ -454,10 +459,14 @@ SELECT * FROM collate_test2 ORDER BY b COLLATE UNICODE;
 
 -- test the attributes handled by icu_set_collation_attributes()
 
+SET client_min_messages=WARNING;
 CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
+RESET client_min_messages;
 SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
 
+SET client_min_messages=WARNING;
 CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
+RESET client_min_messages;
 SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
 
 CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
@@ -467,7 +476,9 @@ SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcol
 CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
 SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
 
+SET client_min_messages=WARNING;
 CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
+RESET client_min_messages;
 SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
 
 CREATE COLLATION testcoll_error1 (provider = icu, locale = '@colNumeric=lower');
@@ -656,7 +667,9 @@ INSERT INTO inner_text VALUES ('a', NULL);
 SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text);
 
 -- accents
+SET client_min_messages=WARNING;
 CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false);
+RESET client_min_messages;
 
 CREATE TABLE test4 (a int, b text);
 INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
-- 
2.34.1

From 1279aa6b7eb92a01cf9a7eaa3f5f15d595f674ed Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Fri, 17 Mar 2023 09:55:31 -0700
Subject: [PATCH v8 4/4] Validate ICU locales.

Ensure that it can be transformed into a language tag in "strict" mode
(which validates the attributes), that the language exists in ICU, and
that it can be opened.

Basic validation helps avoid minor mistakes and misspellings, which
often fall back to the root locale instead of the intended
locale. It's even more important to avoid such mistakes in ICU
versions 54 and earlier, where the same (misspelled) locale string
could fall back to different locales depending on the environment.

Discussion: https://postgr.es/m/11b1eeb7e7667fdd4178497aeb796c48d26e69b9.ca...@j-davis.com
Discussion: https://postgr.es/m/df2efad0cae7c65180df8e5ebb709e5eb4f2a82b.ca...@j-davis.com
---
 doc/src/sgml/config.sgml                      | 17 ++++
 src/backend/commands/collationcmds.c          | 14 ++--
 src/backend/commands/dbcommands.c             | 14 ++--
 src/backend/utils/adt/pg_locale.c             | 79 ++++++++++++++++---
 src/backend/utils/misc/guc_tables.c           |  9 +++
 src/backend/utils/misc/postgresql.conf.sample |  2 +
 src/include/utils/pg_locale.h                 |  2 +
 .../regress/expected/collate.icu.utf8.out     | 12 ++-
 src/test/regress/sql/collate.icu.utf8.sql     |  6 +-
 9 files changed, 121 insertions(+), 34 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 481f93cea1..78eae3ca65 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9784,6 +9784,23 @@ SET XML OPTION { DOCUMENT | CONTENT };
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-icu-locale-validation" xreflabel="icu_locale_validation">
+      <term><varname>icu_locale_validation</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>icu_locale_validation</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Validation is performed on an ICU locale specified for a new collation
+        or database. If this parameter is set to <literal>true</literal>, an
+        error is raised for a validation failure; if set to
+        <literal>false</literal>, a warning is issued. The default is
+        <literal>false</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-default-text-search-config" xreflabel="default_text_search_config">
       <term><varname>default_text_search_config</varname> (<type>string</type>)
       <indexterm>
diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index fe811229b3..2505ed559d 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -273,7 +273,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 			 */
 			if (!IsBinaryUpgrade)
 			{
-				langtag = icu_language_tag(colliculocale, WARNING);
+				int	elevel = icu_locale_validation ? ERROR : WARNING;
+
+				langtag = icu_language_tag(colliculocale, elevel);
 				if (langtag)
 				{
 					ereport(NOTICE,
@@ -282,15 +284,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 
 					colliculocale = langtag;
 				}
-				else
-				{
-					ereport(WARNING,
-							(errmsg("could not convert locale \"%s\" to language tag",
-									colliculocale)));
-				}
-			}
 
-			check_icu_locale(colliculocale);
+				icu_validate_locale(colliculocale);
+			}
 #else
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 436f6ec60d..40276af143 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1068,7 +1068,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 		 */
 		if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
 		{
-			langtag = icu_language_tag(dbiculocale, WARNING);
+			int	elevel = icu_locale_validation ? ERROR : WARNING;
+
+			langtag = icu_language_tag(dbiculocale, elevel);
 			if (langtag)
 			{
 				ereport(NOTICE,
@@ -1077,15 +1079,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 				dbiculocale = langtag;
 			}
-			else
-			{
-				ereport(WARNING,
-						(errmsg("could not convert locale \"%s\" to language tag",
-								dbiculocale)));
-			}
-		}
 
-		check_icu_locale(dbiculocale);
+			icu_validate_locale(dbiculocale);
+		}
 #else
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 366ec71b9f..7b962d31bc 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -88,13 +88,14 @@
 
 #define		MAX_L10N_DATA		80
 
-
 /* GUC settings */
 char	   *locale_messages;
 char	   *locale_monetary;
 char	   *locale_numeric;
 char	   *locale_time;
 
+bool		icu_locale_validation = false;
+
 /*
  * lc_time localization cache.
  *
@@ -2831,18 +2832,6 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,
 	pfree(lower_str);
 }
 
-/*
- * Check if the given locale ID is valid, and ereport(ERROR) if it isn't.
- */
-void
-check_icu_locale(const char *icu_locale)
-{
-	UCollator  *collator;
-
-	collator = pg_ucol_open(icu_locale);
-	ucol_close(collator);
-}
-
 /*
  * Return the BCP47 language tag representation of the requested locale.
  *
@@ -2920,6 +2909,70 @@ icu_language_tag(const char *loc_str, int elevel)
 	return langtag;
 }
 
+/*
+ * Perform best-effort check that the locale is a valid one.
+ */
+void
+icu_validate_locale(const char *loc_str)
+{
+	UCollator	*collator;
+	UErrorCode	 status;
+	int			 elevel	 = icu_locale_validation ? ERROR : WARNING;
+	char		*langtag;
+	char		 lang[ULOC_LANG_CAPACITY];
+	bool		 found	 = false;
+
+	/* check that it can be converted to a language tag */
+	langtag = icu_language_tag(loc_str, elevel);
+	pfree(langtag);
+
+	/* validate that we can extract the language */
+	status = U_ZERO_ERROR;
+	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+	if (U_FAILURE(status))
+	{
+		ereport(elevel,
+				(errmsg("could not get language from locale \"%s\": %s",
+						loc_str, u_errorName(status))));
+		return;
+	}
+
+	/* check for special language name */
+	if (strcmp(lang, "") == 0 ||
+		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 ||
+		strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		found = true;
+
+	/* search for matching language within ICU */
+	for (int32_t i = 0; !found && i < uloc_countAvailable(); i++)
+	{
+		const char	*otherloc = uloc_getAvailable(i);
+		char		 otherlang[ULOC_LANG_CAPACITY];
+
+		status = U_ZERO_ERROR;
+		uloc_getLanguage(otherloc, otherlang, ULOC_LANG_CAPACITY, &status);
+		if (U_FAILURE(status))
+		{
+			ereport(elevel,
+					(errmsg("could not get language from locale \"%s\": %s",
+							loc_str, u_errorName(status))));
+			continue;
+		}
+
+		if (strcmp(lang, otherlang) == 0)
+			found = true;
+	}
+
+	if (!found)
+		ereport(elevel,
+				(errmsg("locale \"%s\" has unknown language \"%s\"",
+						loc_str, lang)));
+
+	/* check that it can be opened */
+	collator = pg_ucol_open(loc_str);
+	ucol_close(collator);
+}
+
 #endif							/* USE_ICU */
 
 /*
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 1c0583fe26..930bbc1877 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1586,6 +1586,15 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"icu_locale_validation", PGC_USERSET, CLIENT_CONN_LOCALE,
+		 gettext_noop("Raise an error for invalid ICU locale strings."),
+		 NULL
+		},
+		&icu_locale_validation,
+		false,
+		NULL, NULL, NULL
+	},
 	{
 		{"array_nulls", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS,
 			gettext_noop("Enable input of NULL elements in arrays."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d06074b86f..cff927e8be 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -730,6 +730,8 @@
 #lc_numeric = 'C'			# locale for number formatting
 #lc_time = 'C'				# locale for time formatting
 
+#icu_locale_validation = off		# validate ICU locale strings
+
 # default configuration for text search
 #default_text_search_config = 'pg_catalog.simple'
 
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 471b111650..a97735185a 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -40,6 +40,7 @@ extern PGDLLIMPORT char *locale_messages;
 extern PGDLLIMPORT char *locale_monetary;
 extern PGDLLIMPORT char *locale_numeric;
 extern PGDLLIMPORT char *locale_time;
+extern PGDLLIMPORT bool	 icu_locale_validation;
 
 /* lc_time localization cache */
 extern PGDLLIMPORT char *localized_abbrev_days[];
@@ -122,6 +123,7 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
 extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
 extern char *icu_language_tag(const char *loc_str, int elevel);
+extern void icu_validate_locale(const char *loc_str);
 #endif
 extern void check_icu_locale(const char *icu_locale);
 
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 0d00df165a..b6b806d753 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1037,8 +1037,16 @@ $$;
 RESET client_min_messages;
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
 ERROR:  parameter "locale" must be specified
-CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */  DROP COLLATION testx;
-NOTICE:  using language tag "nonsense" for locale "nonsense"
+SET icu_locale_validation = true;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails
+NOTICE:  using language tag "nonsense-nowhere" for locale "nonsense-nowhere"
+ERROR:  locale "nonsense-nowhere" has unknown language "nonsense"
+CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
+ERROR:  could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
+RESET icu_locale_validation;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
+NOTICE:  using language tag "nonsense-nowhere" for locale "nonsense-nowhere"
+WARNING:  locale "nonsense-nowhere" has unknown language "nonsense"
 CREATE COLLATION test4 FROM nonsense;
 ERROR:  collation "nonsense" for encoding "UTF8" does not exist
 CREATE COLLATION test5 FROM test0;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 3bd98868b4..7871bf5386 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -376,7 +376,11 @@ $$;
 RESET client_min_messages;
 
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
-CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */  DROP COLLATION testx;
+SET icu_locale_validation = true;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails
+CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
+RESET icu_locale_validation;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
 
 CREATE COLLATION test4 FROM nonsense;
 CREATE COLLATION test5 FROM test0;
-- 
2.34.1

Reply via email to