Re: ICU locale validation / canonicalization

Jeff Davis Mon, 27 Feb 2023 21:57:48 -0800

On Mon, 2023-02-20 at 15:23 -0800, Jeff Davis wrote:
> 
> New patch attached. The new patch also includes a GUC that (when
> enabled) validates that the collator is actually found.


New patch attached.

Now it always preserves the exact locale string during pg_upgrade, and
does not attempt to canonicalize it. Before it was trying to be clever
by determining if the language tag was finding the same collator as the
original string -- I didn't find a problem with that, but it just
seemed a bit too clever. So, only newly-created locales and databases
have the ICU locale string canonicalized to a language tag.

Also, I added a SQL function pg_icu_language_tag() that can convert
locale strings to language tags, and check whether they exist or not.


-- 
Jeff Davis
PostgreSQL Contributor Team - AWS

From e7b67f0410a18c32cf271532f7a4719cf8c1c560 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Wed, 15 Feb 2023 23:05:08 -0800
Subject: [PATCH v3] ICU locale string canonicalization and validation.

Before storing the locale name in the catalog, convert to a BCP47
language tag. The language tag is an unambiguous representation of the
locale that holds all of the necessary information.

Also, add a new GUC icu_locale_validation. When set to true, it raises
an ERROR if the locale string is malformed or if it is not a valid
locale in ICU.

During pg_upgrade, the previous locale string is preserved verbatim.

Discussion: https://postgr.es/m/11b1eeb7e7667fdd4178497aeb796c48d26e69b9.ca...@j-davis.com
---
 doc/src/sgml/config.sgml                      |  16 +++
 doc/src/sgml/func.sgml                        |  17 +++
 src/backend/commands/collationcmds.c          | 101 +++++++++++---
 src/backend/commands/dbcommands.c             |  39 ++++++
 src/backend/utils/adt/pg_locale.c             | 128 ++++++++++++++++--
 src/backend/utils/misc/guc_tables.c           |  10 ++
 src/backend/utils/misc/postgresql.conf.sample |   2 +
 src/bin/pg_dump/t/002_pg_dump.pl              |   4 +-
 src/include/catalog/pg_proc.dat               |   5 +
 src/include/commands/dbcommands.h             |   1 +
 src/include/utils/pg_locale.h                 |   4 +
 .../regress/expected/collate.icu.utf8.out     |  87 +++++++++++-
 src/test/regress/sql/collate.icu.utf8.sql     |  24 +++-
 13 files changed, 405 insertions(+), 33 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e5c41cc6c6..f7fdb54a1b 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9776,6 +9776,22 @@ SET XML OPTION { DOCUMENT | CONTENT };
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-icu-locale-validation" xreflabel="icu_locale_validation">
+      <term><varname>icu_locale_validation</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>icu_locale_validation</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        If set to <literal>true</literal>, validates that ICU locale strings
+        are well-formed, and that they represent valid locale in ICU. Does not
+        cause any locale string to be rejected during <xref
+        linkend="pgupgrade"/>. The default is <literal>false</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-default-text-search-config" xreflabel="default_text_search_config">
       <term><varname>default_text_search_config</varname> (<type>string</type>)
       <indexterm>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 0cbdf63632..e2604c41ad 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -27406,6 +27406,23 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
         Use of this function is restricted to superusers.
        </para></entry>
       </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_icu_language_tag</primary>
+        </indexterm>
+        <function>pg_icu_language_tag</function> ( <parameter>locale</parameter> <type>text</type>, <parameter>validate</parameter> <type>boolean</type> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Canonicalizes the given <parameter>locale</parameter> string into a
+        BCP 47 language tag (see <xref
+        linkend="collation-managing-create-icu"/>). If
+        <parameter>validate</parameter> is <literal>true</literal>, check that
+        the resulting language tag represents a valid locale in ICU.
+       </para></entry>
+      </row>
      </tbody>
     </tgroup>
    </table>
diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index eb62d285ea..8edc22f579 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -47,6 +47,8 @@ typedef struct
 	int			enc;			/* encoding */
 } CollAliasData;
 
+extern bool icu_locale_validation;
+
 
 /*
  * CREATE COLLATION
@@ -240,10 +242,50 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 		}
 		else if (collprovider == COLLPROVIDER_ICU)
 		{
+#ifdef USE_ICU
+			char	*langtag;
+
 			if (!colliculocale)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 						 errmsg("parameter \"locale\" must be specified")));
+
+			check_icu_locale(colliculocale);
+
+			/*
+			 * During binary upgrade, preserve locale string verbatim.
+			 * Otherwise, canonicalize to a language tag.
+			 */
+			if (!IsBinaryUpgrade)
+			{
+				int elevel = icu_locale_validation ? ERROR : WARNING;
+
+				langtag = icu_language_tag(colliculocale);
+				if (langtag)
+				{
+					ereport(NOTICE,
+							(errmsg("using language tag \"%s\" for locale \"%s\"",
+									langtag, colliculocale)));
+
+					if (!icu_collator_exists(langtag))
+						ereport(elevel,
+								(errmsg("ICU collator for language tag \"%s\" not found",
+										langtag)));
+
+					colliculocale = langtag;
+				}
+				else
+				{
+					ereport(elevel,
+							(errmsg("could not convert locale \"%s\" to language tag",
+									colliculocale)));
+				}
+			}
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("ICU is not supported in this build")));
+#endif
 		}
 
 		/*
@@ -556,26 +598,6 @@ cmpaliases(const void *a, const void *b)
 
 
 #ifdef USE_ICU
-/*
- * Get the ICU language tag for a locale name.
- * The result is a palloc'd string.
- */
-static char *
-get_icu_language_tag(const char *localename)
-{
-	char		buf[ULOC_FULLNAME_CAPACITY];
-	UErrorCode	status;
-
-	status = U_ZERO_ERROR;
-	uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("could not convert locale name \"%s\" to language tag: %s",
-						localename, u_errorName(status))));
-
-	return pstrdup(buf);
-}
-
 /*
  * Get a comment (specifically, the display name) for an ICU locale.
  * The result is a palloc'd string, or NULL if we can't get a comment
@@ -938,7 +960,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 			else
 				name = uloc_getAvailable(i);
 
-			langtag = get_icu_language_tag(name);
+			langtag = icu_language_tag(name);
+			if (langtag == NULL)
+				ereport(ERROR,
+						(errmsg("could not convert locale name \"%s\" to language tag",
+								name)));
 			iculocstr = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name;
 
 			/*
@@ -996,3 +1022,36 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 
 	PG_RETURN_INT32(ncreated);
 }
+
+/*
+ * pg_icu_language_tag
+ *
+ * Return the BCP47 language tag representation of the given locale string.
+ */
+Datum
+pg_icu_language_tag(PG_FUNCTION_ARGS)
+{
+#ifdef USE_ICU
+	text	*locale_text = PG_GETARG_TEXT_PP(0);
+	bool	 validate	 = PG_GETARG_BOOL(1);
+	char	*locale_cstr = text_to_cstring(locale_text);
+	char	*langtag	 = icu_language_tag(locale_cstr);
+
+	if (langtag == NULL)
+		ereport(ERROR,
+				(errmsg("could not convert locale \"%s\" to language tag",
+						locale_cstr)));
+
+	if (validate && !icu_collator_exists(langtag))
+		ereport(ERROR,
+				(errmsg("ICU collator for language tag \"%s\" not found",
+						langtag)));
+
+	PG_RETURN_TEXT_P(cstring_to_text(langtag));
+#else
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("ICU is not supported in this build")));
+	PG_RETURN_NULL();
+#endif
+}
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index a0259cc593..a19881ad9c 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -109,6 +109,7 @@ typedef struct CreateDBRelInfo
 	bool		permanent;		/* relation is permanent or unlogged */
 } CreateDBRelInfo;
 
+extern bool icu_locale_validation;
 
 /* non-export function prototypes */
 static void createdb_failure_callback(int code, Datum arg);
@@ -1029,6 +1030,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 	if (dblocprovider == COLLPROVIDER_ICU)
 	{
+#ifdef USE_ICU
+		char	*langtag;
+
 		if (!(is_encoding_supported_by_icu(encoding)))
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1045,6 +1049,41 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 					 errmsg("ICU locale must be specified")));
 
 		check_icu_locale(dbiculocale);
+
+		/*
+		 * During binary upgrade, preserve locale string verbatim. Otherwise,
+		 * canonicalize to a language tag.
+		 */
+		if (!IsBinaryUpgrade)
+		{
+			int elevel = icu_locale_validation ? ERROR : WARNING;
+
+			langtag = icu_language_tag(dbiculocale);
+			if (langtag)
+			{
+				ereport(NOTICE,
+						(errmsg("using language tag \"%s\" for locale \"%s\"",
+								langtag, dbiculocale)));
+
+				if (!icu_collator_exists(langtag))
+					ereport(elevel,
+							(errmsg("ICU collator for language tag \"%s\" not found",
+									langtag)));
+
+				dbiculocale = langtag;
+			}
+			else
+			{
+				ereport(elevel,
+						(errmsg("could not convert locale \"%s\" to language tag",
+								dbiculocale)));
+			}
+		}
+#else
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("ICU is not supported in this build")));
+#endif
 	}
 	else
 	{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 4aa5eaa984..6378a6d5ca 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2690,15 +2690,12 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 	}
 }
 
-#endif							/* USE_ICU */
-
 /*
  * Check if the given locale ID is valid, and ereport(ERROR) if it isn't.
  */
 void
 check_icu_locale(const char *icu_locale)
 {
-#ifdef USE_ICU
 	UCollator  *collator;
 	UErrorCode	status;
 
@@ -2712,13 +2709,128 @@ check_icu_locale(const char *icu_locale)
 	if (U_ICU_VERSION_MAJOR_NUM < 54)
 		icu_set_collation_attributes(collator, icu_locale);
 	ucol_close(collator);
-#else
-	ereport(ERROR,
-			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			 errmsg("ICU is not supported in this build")));
-#endif
 }
 
+/*
+ * Check if the locale string represents the root locale. It represents the
+ * root locale if the language part is "und", "root", or the empty string.
+ */
+static bool
+icu_is_root_locale(const char *locale)
+{
+	UErrorCode	 status;
+	char		*lang_part;
+	int32_t		 len;
+	bool		 result	   = false;
+
+	status = U_ZERO_ERROR;
+	len  = uloc_getLanguage(locale, NULL, 0, &status);
+	lang_part = palloc(len + 1);
+	status = U_ZERO_ERROR;
+	uloc_getLanguage(locale, lang_part, len + 1, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("could not get language name from locale string \"%s\": %s",
+						locale, u_errorName(status))));
+
+	if (pg_strcasecmp(lang_part, "root") == 0 ||
+		pg_strcasecmp(lang_part, "und") == 0 ||
+		pg_strcasecmp(lang_part, "") == 0)
+		result = true;
+
+	pfree(lang_part);
+	return result;
+}
+
+/*
+ * Special case to check for locales like "POSIX" or "C.UTF-8". These are not
+ * handled by ICU level 2 canonicalization.
+ */
+static bool
+icu_is_c_posix(const char *locale)
+{
+	if (pg_strcasecmp(locale, "c") == 0 ||
+		pg_strncasecmp(locale, "c.", sizeof("c.") - 1) == 0 ||
+		pg_strcasecmp(locale, "posix") == 0 ||
+		pg_strncasecmp(locale, "posix.", sizeof("posix.") - 1) == 0)
+		return true;
+
+	return false;
+}
+
+/*
+ * Check if the given language tag resolves to a valid locale in ICU.
+ *
+ * If the resulting collator falls back to the root locale, and the root
+ * locale was not explicitly requested, return false.
+ */
+bool
+icu_collator_exists(const char *langtag)
+{
+	UCollator	*collator;
+	const char	*valid_locale = NULL;
+	UErrorCode	 status;
+	bool		 result		  = false;
+
+	status = U_ZERO_ERROR;
+	collator = ucol_open(langtag, &status);
+	if (U_FAILURE(status))
+		return false;
+
+	status = U_ZERO_ERROR;
+	valid_locale = ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	if (icu_is_root_locale(langtag) ||
+		!icu_is_root_locale(valid_locale))
+		result = true;
+
+cleanup:
+	ucol_close(collator);
+	return result;
+}
+
+/*
+ * Return the BCP47 language tag representation of the requested locale; or
+ * NULL if a problem is encountered.
+ *
+ * This function should be called before passing the string to ucol_open(),
+ * because conversion to a language tag also performs "level 2
+ * canonicalization". In addition to producing a consistent result format,
+ * level 2 canonicalization is able to more accurately interpret different
+ * input locale string formats, such as POSIX and .NET IDs.
+ */
+char *
+icu_language_tag(const char *requested_locale)
+{
+	UErrorCode	 status;
+	char		*result;
+	int32_t		 len;
+	const bool	 strict = true;
+
+	/* c/posix locales aren't handled by uloc_getLanguageTag() */
+	if (icu_is_c_posix(requested_locale))
+		return pstrdup("en-US-u-va-posix");
+
+	status = U_ZERO_ERROR;
+	len = uloc_toLanguageTag(requested_locale, NULL, 0, strict, &status);
+
+	result = palloc(len + 1);
+
+	status = U_ZERO_ERROR;
+	uloc_toLanguageTag(requested_locale, result, len + 1, strict, &status);
+	if (U_FAILURE(status))
+	{
+		pfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+#endif							/* USE_ICU */
+
 /*
  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
  * Therefore we keep them here rather than with the mbutils code.
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 1c0583fe26..4aa53259dc 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -481,6 +481,7 @@ char	   *event_source;
 
 bool		row_security;
 bool		check_function_bodies = true;
+bool		icu_locale_validation = false;
 
 /*
  * This GUC exists solely for backward compatibility, check its definition for
@@ -1586,6 +1587,15 @@ struct config_bool ConfigureNamesBool[] =
 		true,
 		NULL, NULL, NULL
 	},
+	{
+		{"icu_locale_validation", PGC_SUSET, CLIENT_CONN_LOCALE,
+			gettext_noop("Validate ICU locale strings."),
+			NULL
+		},
+		&icu_locale_validation,
+		false,
+		NULL, NULL, NULL
+	},
 	{
 		{"array_nulls", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS,
 			gettext_noop("Enable input of NULL elements in arrays."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d06074b86f..cff927e8be 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -730,6 +730,8 @@
 #lc_numeric = 'C'			# locale for number formatting
 #lc_time = 'C'				# locale for time formatting
 
+#icu_locale_validation = off		# validate ICU locale strings
+
 # default configuration for text search
 #default_text_search_config = 'pg_catalog.simple'
 
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl
index 72b19ee6cd..cb34f4af7f 100644
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -1721,9 +1721,9 @@ my %tests = (
 
 	'CREATE COLLATION icu_collation' => {
 		create_order => 76,
-		create_sql   => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'C');",
+		create_sql   => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'en-US-u-va-posix');",
 		regexp =>
-		  qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'C'(, version = '[^']*')?\);/m,
+		  qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'en-US-u-va-posix'(, version = '[^']*')?\);/m,
 		icu => 1,
 		like      => { %full_runs, section_pre_data => 1, },
 	},
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index e2a7642a2b..7d5641572e 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11806,6 +11806,11 @@
   proname => 'pg_database_collation_actual_version', procost => '100',
   provolatile => 'v', prorettype => 'text', proargtypes => 'oid',
   prosrc => 'pg_database_collation_actual_version' },
+{ oid => '6273',
+  descr => 'get BCP47 language tag representation of locale',
+  proname => 'pg_icu_language_tag', procost => '100',
+  provolatile => 's', prorettype => 'text', proargtypes => 'text bool',
+  prosrc => 'pg_icu_language_tag' },
 
 # system management/monitoring related functions
 { oid => '3353', descr => 'list files in the log directory',
diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h
index 5fbc3ca752..0f0e827ff2 100644
--- a/src/include/commands/dbcommands.h
+++ b/src/include/commands/dbcommands.h
@@ -33,5 +33,6 @@ extern char *get_database_name(Oid dbid);
 extern bool have_createdb_privilege(void);
 
 extern void check_encoding_locale_matches(int encoding, const char *collate, const char *ctype);
+extern char *get_icu_locale(const char *requested_locale);
 
 #endif							/* DBCOMMANDS_H */
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index b8f22875a8..58cd4297bb 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -118,8 +118,12 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 #ifdef USE_ICU
 extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
 extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
+extern bool check_equivalent_icu_locales(const char *locale1,
+										 const char *locale2);
 #endif
 extern void check_icu_locale(const char *icu_locale);
+extern bool icu_collator_exists(const char *requested_locale);
+extern char *icu_language_tag(const char *requested_locale);
 
 /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
 extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 4354dc07b8..927718a937 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -11,6 +11,63 @@ SELECT getdatabaseencoding() <> 'UTF8' OR
 SET client_encoding TO UTF8;
 CREATE SCHEMA collate_tests;
 SET search_path = collate_tests;
+-- test language tag canonicalization
+SELECT pg_icu_language_tag('en_US', true);
+ pg_icu_language_tag 
+---------------------
+ en-US
+(1 row)
+
+SELECT pg_icu_language_tag('nonsense', false);
+ pg_icu_language_tag 
+---------------------
+ nonsense
+(1 row)
+
+SELECT pg_icu_language_tag('nonsense', true); -- error
+ERROR:  ICU collator for language tag "nonsense" not found
+SELECT pg_icu_language_tag('C.UTF-8', true);
+ pg_icu_language_tag 
+---------------------
+ en-US-u-va-posix
+(1 row)
+
+SELECT pg_icu_language_tag('POSIX', true);
+ pg_icu_language_tag 
+---------------------
+ en-US-u-va-posix
+(1 row)
+
+SELECT pg_icu_language_tag('en_US_POSIX', true);
+ pg_icu_language_tag 
+---------------------
+ en-US-u-va-posix
+(1 row)
+
+SELECT pg_icu_language_tag('@colStrength=secondary', true);
+ pg_icu_language_tag 
+---------------------
+ und-u-ks-level2
+(1 row)
+
+SELECT pg_icu_language_tag('', true);
+ pg_icu_language_tag 
+---------------------
+ und
+(1 row)
+
+SELECT pg_icu_language_tag('fr_CA.UTF-8', true);
+ pg_icu_language_tag 
+---------------------
+ fr-CA
+(1 row)
+
+SELECT pg_icu_language_tag('en_US@colStrength=primary', true);
+ pg_icu_language_tag 
+---------------------
+ en-US-u-ks-level1
+(1 row)
+
 CREATE TABLE collate_test1 (
     a int,
     b text COLLATE "en-x-icu" NOT NULL
@@ -1019,6 +1076,7 @@ reset enable_seqscan;
 CREATE ROLE regress_test_role;
 CREATE SCHEMA test_schema;
 -- We need to do this this way to cope with varying names for encodings:
+SET client_min_messages TO WARNING;
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
@@ -1033,9 +1091,24 @@ BEGIN
           quote_literal(current_setting('lc_collate')) || ');';
 END
 $$;
+RESET client_min_messages;
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
 ERROR:  parameter "locale" must be specified
-CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */  DROP COLLATION testx;
+SET icu_locale_validation = TRUE;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense'); -- fails
+NOTICE:  using language tag "nonsense" for locale "nonsense"
+ERROR:  ICU collator for language tag "nonsense" not found
+RESET icu_locale_validation;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense@colStrength=primary');
+NOTICE:  using language tag "nonsense-u-ks-level1" for locale "nonsense@colStrength=primary"
+WARNING:  ICU collator for language tag "nonsense-u-ks-level1" not found
+SELECT colliculocale FROM pg_collation WHERE collname='testx';
+    colliculocale     
+----------------------
+ nonsense-u-ks-level1
+(1 row)
+
+DROP COLLATION testx;
 CREATE COLLATION test4 FROM nonsense;
 ERROR:  collation "nonsense" for encoding "UTF8" does not exist
 CREATE COLLATION test5 FROM test0;
@@ -1144,6 +1217,7 @@ drop type textrange_en_us;
 -- test ICU collation customization
 -- test the attributes handled by icu_set_collation_attributes()
 CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes');
+NOTICE:  using language tag "und-u-kc-ks-level1" for locale "@colStrength=primary;colCaseLevel=yes"
 SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents;
  ?column? | ?column? 
 ----------+----------
@@ -1151,6 +1225,7 @@ SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignor
 (1 row)
 
 CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes');
+NOTICE:  using language tag "und-u-kb" for locale "@colBackwards=yes"
 SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards;
  ?column? | ?column? 
 ----------+----------
@@ -1158,7 +1233,9 @@ SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll
 (1 row)
 
 CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower');
+NOTICE:  using language tag "und-u-kf-lower" for locale "@colCaseFirst=lower"
 CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper');
+NOTICE:  using language tag "und-u-kf-upper" for locale "@colCaseFirst=upper"
 SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first;
  ?column? | ?column? 
 ----------+----------
@@ -1166,6 +1243,7 @@ SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcol
 (1 row)
 
 CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted');
+NOTICE:  using language tag "und-u-ka-shifted" for locale "@colAlternate=shifted"
 SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted;
  ?column? | ?column? 
 ----------+----------
@@ -1173,6 +1251,7 @@ SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE te
 (1 row)
 
 CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
+NOTICE:  using language tag "und-u-kn" for locale "@colNumeric=yes"
 SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric;
  ?column? | ?column? 
 ----------+----------
@@ -1184,6 +1263,7 @@ ERROR:  could not open collator for locale "@colNumeric=lower": U_ILLEGAL_ARGUME
 -- test that attributes not handled by icu_set_collation_attributes()
 -- (handled by ucol_open() directly) also work
 CREATE COLLATION testcoll_de_phonebook (provider = icu, locale = 'de@collation=phonebook');
+NOTICE:  using language tag "de-u-co-phonebk" for locale "de@collation=phonebook"
 SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE testcoll_de_phonebook;
  ?column? | ?column? 
 ----------+----------
@@ -1192,7 +1272,9 @@ SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE tes
 
 -- nondeterministic collations
 CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true);
+NOTICE:  using language tag "und" for locale ""
 CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false);
+NOTICE:  using language tag "und" for locale ""
 CREATE TABLE test6 (a int, b text);
 -- same string in different normal forms
 INSERT INTO test6 VALUES (1, U&'\00E4bc');
@@ -1242,7 +1324,9 @@ SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_nondet;
 (2 rows)
 
 CREATE COLLATION case_sensitive (provider = icu, locale = '');
+NOTICE:  using language tag "und" for locale ""
 CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false);
+NOTICE:  using language tag "und-u-ks-level2" for locale "@colStrength=secondary"
 SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
  ?column? | ?column? 
 ----------+----------
@@ -1710,6 +1794,7 @@ SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text);
 
 -- accents
 CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false);
+NOTICE:  using language tag "und-u-kc-ks-level1" for locale "@colStrength=primary;colCaseLevel=yes"
 CREATE TABLE test4 (a int, b text);
 INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté');
 SELECT * FROM test4 WHERE b = 'cote';
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index b0ddc7db44..26096f0627 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -15,6 +15,17 @@ SET client_encoding TO UTF8;
 CREATE SCHEMA collate_tests;
 SET search_path = collate_tests;
 
+-- test language tag canonicalization
+SELECT pg_icu_language_tag('en_US', true);
+SELECT pg_icu_language_tag('nonsense', false);
+SELECT pg_icu_language_tag('nonsense', true); -- error
+SELECT pg_icu_language_tag('C.UTF-8', true);
+SELECT pg_icu_language_tag('POSIX', true);
+SELECT pg_icu_language_tag('en_US_POSIX', true);
+SELECT pg_icu_language_tag('@colStrength=secondary', true);
+SELECT pg_icu_language_tag('', true);
+SELECT pg_icu_language_tag('fr_CA.UTF-8', true);
+SELECT pg_icu_language_tag('en_US@colStrength=primary', true);
 
 CREATE TABLE collate_test1 (
     a int,
@@ -357,6 +368,8 @@ CREATE ROLE regress_test_role;
 CREATE SCHEMA test_schema;
 
 -- We need to do this this way to cope with varying names for encodings:
+SET client_min_messages TO WARNING;
+
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
@@ -370,8 +383,17 @@ BEGIN
           quote_literal(current_setting('lc_collate')) || ');';
 END
 $$;
+
+RESET client_min_messages;
+
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
-CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */  DROP COLLATION testx;
+
+SET icu_locale_validation = TRUE;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense'); -- fails
+RESET icu_locale_validation;
+CREATE COLLATION testx (provider = icu, locale = 'nonsense@colStrength=primary');
+SELECT colliculocale FROM pg_collation WHERE collname='testx';
+DROP COLLATION testx;
 
 CREATE COLLATION test4 FROM nonsense;
 CREATE COLLATION test5 FROM test0;
-- 
2.34.1

Re: ICU locale validation / canonicalization

Reply via email to