On Mon, 2023-02-20 at 15:23 -0800, Jeff Davis wrote: > > New patch attached. The new patch also includes a GUC that (when > enabled) validates that the collator is actually found.
New patch attached. Now it always preserves the exact locale string during pg_upgrade, and does not attempt to canonicalize it. Before it was trying to be clever by determining if the language tag was finding the same collator as the original string -- I didn't find a problem with that, but it just seemed a bit too clever. So, only newly-created locales and databases have the ICU locale string canonicalized to a language tag. Also, I added a SQL function pg_icu_language_tag() that can convert locale strings to language tags, and check whether they exist or not. -- Jeff Davis PostgreSQL Contributor Team - AWS
From e7b67f0410a18c32cf271532f7a4719cf8c1c560 Mon Sep 17 00:00:00 2001 From: Jeff Davis <j...@j-davis.com> Date: Wed, 15 Feb 2023 23:05:08 -0800 Subject: [PATCH v3] ICU locale string canonicalization and validation. Before storing the locale name in the catalog, convert to a BCP47 language tag. The language tag is an unambiguous representation of the locale that holds all of the necessary information. Also, add a new GUC icu_locale_validation. When set to true, it raises an ERROR if the locale string is malformed or if it is not a valid locale in ICU. During pg_upgrade, the previous locale string is preserved verbatim. Discussion: https://postgr.es/m/11b1eeb7e7667fdd4178497aeb796c48d26e69b9.ca...@j-davis.com --- doc/src/sgml/config.sgml | 16 +++ doc/src/sgml/func.sgml | 17 +++ src/backend/commands/collationcmds.c | 101 +++++++++++--- src/backend/commands/dbcommands.c | 39 ++++++ src/backend/utils/adt/pg_locale.c | 128 ++++++++++++++++-- src/backend/utils/misc/guc_tables.c | 10 ++ src/backend/utils/misc/postgresql.conf.sample | 2 + src/bin/pg_dump/t/002_pg_dump.pl | 4 +- src/include/catalog/pg_proc.dat | 5 + src/include/commands/dbcommands.h | 1 + src/include/utils/pg_locale.h | 4 + .../regress/expected/collate.icu.utf8.out | 87 +++++++++++- src/test/regress/sql/collate.icu.utf8.sql | 24 +++- 13 files changed, 405 insertions(+), 33 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e5c41cc6c6..f7fdb54a1b 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9776,6 +9776,22 @@ SET XML OPTION { DOCUMENT | CONTENT }; </listitem> </varlistentry> + <varlistentry id="guc-icu-locale-validation" xreflabel="icu_locale_validation"> + <term><varname>icu_locale_validation</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>icu_locale_validation</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + If set to <literal>true</literal>, validates that ICU locale strings + are well-formed, and that they represent valid locale in ICU. Does not + cause any locale string to be rejected during <xref + linkend="pgupgrade"/>. The default is <literal>false</literal>. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-default-text-search-config" xreflabel="default_text_search_config"> <term><varname>default_text_search_config</varname> (<type>string</type>) <indexterm> diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 0cbdf63632..e2604c41ad 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -27406,6 +27406,23 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset Use of this function is restricted to superusers. </para></entry> </row> + + <row> + <entry role="func_table_entry"><para role="func_signature"> + <indexterm> + <primary>pg_icu_language_tag</primary> + </indexterm> + <function>pg_icu_language_tag</function> ( <parameter>locale</parameter> <type>text</type>, <parameter>validate</parameter> <type>boolean</type> ) + <returnvalue>text</returnvalue> + </para> + <para> + Canonicalizes the given <parameter>locale</parameter> string into a + BCP 47 language tag (see <xref + linkend="collation-managing-create-icu"/>). If + <parameter>validate</parameter> is <literal>true</literal>, check that + the resulting language tag represents a valid locale in ICU. + </para></entry> + </row> </tbody> </tgroup> </table> diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index eb62d285ea..8edc22f579 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -47,6 +47,8 @@ typedef struct int enc; /* encoding */ } CollAliasData; +extern bool icu_locale_validation; + /* * CREATE COLLATION @@ -240,10 +242,50 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e } else if (collprovider == COLLPROVIDER_ICU) { +#ifdef USE_ICU + char *langtag; + if (!colliculocale) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("parameter \"locale\" must be specified"))); + + check_icu_locale(colliculocale); + + /* + * During binary upgrade, preserve locale string verbatim. + * Otherwise, canonicalize to a language tag. + */ + if (!IsBinaryUpgrade) + { + int elevel = icu_locale_validation ? ERROR : WARNING; + + langtag = icu_language_tag(colliculocale); + if (langtag) + { + ereport(NOTICE, + (errmsg("using language tag \"%s\" for locale \"%s\"", + langtag, colliculocale))); + + if (!icu_collator_exists(langtag)) + ereport(elevel, + (errmsg("ICU collator for language tag \"%s\" not found", + langtag))); + + colliculocale = langtag; + } + else + { + ereport(elevel, + (errmsg("could not convert locale \"%s\" to language tag", + colliculocale))); + } + } +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#endif } /* @@ -556,26 +598,6 @@ cmpaliases(const void *a, const void *b) #ifdef USE_ICU -/* - * Get the ICU language tag for a locale name. - * The result is a palloc'd string. - */ -static char * -get_icu_language_tag(const char *localename) -{ - char buf[ULOC_FULLNAME_CAPACITY]; - UErrorCode status; - - status = U_ZERO_ERROR; - uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not convert locale name \"%s\" to language tag: %s", - localename, u_errorName(status)))); - - return pstrdup(buf); -} - /* * Get a comment (specifically, the display name) for an ICU locale. * The result is a palloc'd string, or NULL if we can't get a comment @@ -938,7 +960,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS) else name = uloc_getAvailable(i); - langtag = get_icu_language_tag(name); + langtag = icu_language_tag(name); + if (langtag == NULL) + ereport(ERROR, + (errmsg("could not convert locale name \"%s\" to language tag", + name))); iculocstr = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name; /* @@ -996,3 +1022,36 @@ pg_import_system_collations(PG_FUNCTION_ARGS) PG_RETURN_INT32(ncreated); } + +/* + * pg_icu_language_tag + * + * Return the BCP47 language tag representation of the given locale string. + */ +Datum +pg_icu_language_tag(PG_FUNCTION_ARGS) +{ +#ifdef USE_ICU + text *locale_text = PG_GETARG_TEXT_PP(0); + bool validate = PG_GETARG_BOOL(1); + char *locale_cstr = text_to_cstring(locale_text); + char *langtag = icu_language_tag(locale_cstr); + + if (langtag == NULL) + ereport(ERROR, + (errmsg("could not convert locale \"%s\" to language tag", + locale_cstr))); + + if (validate && !icu_collator_exists(langtag)) + ereport(ERROR, + (errmsg("ICU collator for language tag \"%s\" not found", + langtag))); + + PG_RETURN_TEXT_P(cstring_to_text(langtag)); +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); + PG_RETURN_NULL(); +#endif +} diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index a0259cc593..a19881ad9c 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -109,6 +109,7 @@ typedef struct CreateDBRelInfo bool permanent; /* relation is permanent or unlogged */ } CreateDBRelInfo; +extern bool icu_locale_validation; /* non-export function prototypes */ static void createdb_failure_callback(int code, Datum arg); @@ -1029,6 +1030,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) if (dblocprovider == COLLPROVIDER_ICU) { +#ifdef USE_ICU + char *langtag; + if (!(is_encoding_supported_by_icu(encoding))) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -1045,6 +1049,41 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) errmsg("ICU locale must be specified"))); check_icu_locale(dbiculocale); + + /* + * During binary upgrade, preserve locale string verbatim. Otherwise, + * canonicalize to a language tag. + */ + if (!IsBinaryUpgrade) + { + int elevel = icu_locale_validation ? ERROR : WARNING; + + langtag = icu_language_tag(dbiculocale); + if (langtag) + { + ereport(NOTICE, + (errmsg("using language tag \"%s\" for locale \"%s\"", + langtag, dbiculocale))); + + if (!icu_collator_exists(langtag)) + ereport(elevel, + (errmsg("ICU collator for language tag \"%s\" not found", + langtag))); + + dbiculocale = langtag; + } + else + { + ereport(elevel, + (errmsg("could not convert locale \"%s\" to language tag", + dbiculocale))); + } + } +#else + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#endif } else { diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 4aa5eaa984..6378a6d5ca 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -2690,15 +2690,12 @@ icu_set_collation_attributes(UCollator *collator, const char *loc) } } -#endif /* USE_ICU */ - /* * Check if the given locale ID is valid, and ereport(ERROR) if it isn't. */ void check_icu_locale(const char *icu_locale) { -#ifdef USE_ICU UCollator *collator; UErrorCode status; @@ -2712,13 +2709,128 @@ check_icu_locale(const char *icu_locale) if (U_ICU_VERSION_MAJOR_NUM < 54) icu_set_collation_attributes(collator, icu_locale); ucol_close(collator); -#else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ICU is not supported in this build"))); -#endif } +/* + * Check if the locale string represents the root locale. It represents the + * root locale if the language part is "und", "root", or the empty string. + */ +static bool +icu_is_root_locale(const char *locale) +{ + UErrorCode status; + char *lang_part; + int32_t len; + bool result = false; + + status = U_ZERO_ERROR; + len = uloc_getLanguage(locale, NULL, 0, &status); + lang_part = palloc(len + 1); + status = U_ZERO_ERROR; + uloc_getLanguage(locale, lang_part, len + 1, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not get language name from locale string \"%s\": %s", + locale, u_errorName(status)))); + + if (pg_strcasecmp(lang_part, "root") == 0 || + pg_strcasecmp(lang_part, "und") == 0 || + pg_strcasecmp(lang_part, "") == 0) + result = true; + + pfree(lang_part); + return result; +} + +/* + * Special case to check for locales like "POSIX" or "C.UTF-8". These are not + * handled by ICU level 2 canonicalization. + */ +static bool +icu_is_c_posix(const char *locale) +{ + if (pg_strcasecmp(locale, "c") == 0 || + pg_strncasecmp(locale, "c.", sizeof("c.") - 1) == 0 || + pg_strcasecmp(locale, "posix") == 0 || + pg_strncasecmp(locale, "posix.", sizeof("posix.") - 1) == 0) + return true; + + return false; +} + +/* + * Check if the given language tag resolves to a valid locale in ICU. + * + * If the resulting collator falls back to the root locale, and the root + * locale was not explicitly requested, return false. + */ +bool +icu_collator_exists(const char *langtag) +{ + UCollator *collator; + const char *valid_locale = NULL; + UErrorCode status; + bool result = false; + + status = U_ZERO_ERROR; + collator = ucol_open(langtag, &status); + if (U_FAILURE(status)) + return false; + + status = U_ZERO_ERROR; + valid_locale = ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, &status); + if (U_FAILURE(status)) + goto cleanup; + + if (icu_is_root_locale(langtag) || + !icu_is_root_locale(valid_locale)) + result = true; + +cleanup: + ucol_close(collator); + return result; +} + +/* + * Return the BCP47 language tag representation of the requested locale; or + * NULL if a problem is encountered. + * + * This function should be called before passing the string to ucol_open(), + * because conversion to a language tag also performs "level 2 + * canonicalization". In addition to producing a consistent result format, + * level 2 canonicalization is able to more accurately interpret different + * input locale string formats, such as POSIX and .NET IDs. + */ +char * +icu_language_tag(const char *requested_locale) +{ + UErrorCode status; + char *result; + int32_t len; + const bool strict = true; + + /* c/posix locales aren't handled by uloc_getLanguageTag() */ + if (icu_is_c_posix(requested_locale)) + return pstrdup("en-US-u-va-posix"); + + status = U_ZERO_ERROR; + len = uloc_toLanguageTag(requested_locale, NULL, 0, strict, &status); + + result = palloc(len + 1); + + status = U_ZERO_ERROR; + uloc_toLanguageTag(requested_locale, result, len + 1, strict, &status); + if (U_FAILURE(status)) + { + pfree(result); + return NULL; + } + + return result; +} + +#endif /* USE_ICU */ + /* * These functions convert from/to libc's wchar_t, *not* pg_wchar_t. * Therefore we keep them here rather than with the mbutils code. diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 1c0583fe26..4aa53259dc 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -481,6 +481,7 @@ char *event_source; bool row_security; bool check_function_bodies = true; +bool icu_locale_validation = false; /* * This GUC exists solely for backward compatibility, check its definition for @@ -1586,6 +1587,15 @@ struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"icu_locale_validation", PGC_SUSET, CLIENT_CONN_LOCALE, + gettext_noop("Validate ICU locale strings."), + NULL + }, + &icu_locale_validation, + false, + NULL, NULL, NULL + }, { {"array_nulls", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS, gettext_noop("Enable input of NULL elements in arrays."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index d06074b86f..cff927e8be 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -730,6 +730,8 @@ #lc_numeric = 'C' # locale for number formatting #lc_time = 'C' # locale for time formatting +#icu_locale_validation = off # validate ICU locale strings + # default configuration for text search #default_text_search_config = 'pg_catalog.simple' diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 72b19ee6cd..cb34f4af7f 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -1721,9 +1721,9 @@ my %tests = ( 'CREATE COLLATION icu_collation' => { create_order => 76, - create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'C');", + create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'en-US-u-va-posix');", regexp => - qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'C'(, version = '[^']*')?\);/m, + qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'en-US-u-va-posix'(, version = '[^']*')?\);/m, icu => 1, like => { %full_runs, section_pre_data => 1, }, }, diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index e2a7642a2b..7d5641572e 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -11806,6 +11806,11 @@ proname => 'pg_database_collation_actual_version', procost => '100', provolatile => 'v', prorettype => 'text', proargtypes => 'oid', prosrc => 'pg_database_collation_actual_version' }, +{ oid => '6273', + descr => 'get BCP47 language tag representation of locale', + proname => 'pg_icu_language_tag', procost => '100', + provolatile => 's', prorettype => 'text', proargtypes => 'text bool', + prosrc => 'pg_icu_language_tag' }, # system management/monitoring related functions { oid => '3353', descr => 'list files in the log directory', diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index 5fbc3ca752..0f0e827ff2 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ -33,5 +33,6 @@ extern char *get_database_name(Oid dbid); extern bool have_createdb_privilege(void); extern void check_encoding_locale_matches(int encoding, const char *collate, const char *ctype); +extern char *get_icu_locale(const char *requested_locale); #endif /* DBCOMMANDS_H */ diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index b8f22875a8..58cd4297bb 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -118,8 +118,12 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, #ifdef USE_ICU extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes); extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar); +extern bool check_equivalent_icu_locales(const char *locale1, + const char *locale2); #endif extern void check_icu_locale(const char *icu_locale); +extern bool icu_collator_exists(const char *requested_locale); +extern char *icu_language_tag(const char *requested_locale); /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */ extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out index 4354dc07b8..927718a937 100644 --- a/src/test/regress/expected/collate.icu.utf8.out +++ b/src/test/regress/expected/collate.icu.utf8.out @@ -11,6 +11,63 @@ SELECT getdatabaseencoding() <> 'UTF8' OR SET client_encoding TO UTF8; CREATE SCHEMA collate_tests; SET search_path = collate_tests; +-- test language tag canonicalization +SELECT pg_icu_language_tag('en_US', true); + pg_icu_language_tag +--------------------- + en-US +(1 row) + +SELECT pg_icu_language_tag('nonsense', false); + pg_icu_language_tag +--------------------- + nonsense +(1 row) + +SELECT pg_icu_language_tag('nonsense', true); -- error +ERROR: ICU collator for language tag "nonsense" not found +SELECT pg_icu_language_tag('C.UTF-8', true); + pg_icu_language_tag +--------------------- + en-US-u-va-posix +(1 row) + +SELECT pg_icu_language_tag('POSIX', true); + pg_icu_language_tag +--------------------- + en-US-u-va-posix +(1 row) + +SELECT pg_icu_language_tag('en_US_POSIX', true); + pg_icu_language_tag +--------------------- + en-US-u-va-posix +(1 row) + +SELECT pg_icu_language_tag('@colStrength=secondary', true); + pg_icu_language_tag +--------------------- + und-u-ks-level2 +(1 row) + +SELECT pg_icu_language_tag('', true); + pg_icu_language_tag +--------------------- + und +(1 row) + +SELECT pg_icu_language_tag('fr_CA.UTF-8', true); + pg_icu_language_tag +--------------------- + fr-CA +(1 row) + +SELECT pg_icu_language_tag('en_US@colStrength=primary', true); + pg_icu_language_tag +--------------------- + en-US-u-ks-level1 +(1 row) + CREATE TABLE collate_test1 ( a int, b text COLLATE "en-x-icu" NOT NULL @@ -1019,6 +1076,7 @@ reset enable_seqscan; CREATE ROLE regress_test_role; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: +SET client_min_messages TO WARNING; do $$ BEGIN EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' || @@ -1033,9 +1091,24 @@ BEGIN quote_literal(current_setting('lc_collate')) || ');'; END $$; +RESET client_min_messages; CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" ERROR: parameter "locale" must be specified -CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */ DROP COLLATION testx; +SET icu_locale_validation = TRUE; +CREATE COLLATION testx (provider = icu, locale = 'nonsense'); -- fails +NOTICE: using language tag "nonsense" for locale "nonsense" +ERROR: ICU collator for language tag "nonsense" not found +RESET icu_locale_validation; +CREATE COLLATION testx (provider = icu, locale = 'nonsense@colStrength=primary'); +NOTICE: using language tag "nonsense-u-ks-level1" for locale "nonsense@colStrength=primary" +WARNING: ICU collator for language tag "nonsense-u-ks-level1" not found +SELECT colliculocale FROM pg_collation WHERE collname='testx'; + colliculocale +---------------------- + nonsense-u-ks-level1 +(1 row) + +DROP COLLATION testx; CREATE COLLATION test4 FROM nonsense; ERROR: collation "nonsense" for encoding "UTF8" does not exist CREATE COLLATION test5 FROM test0; @@ -1144,6 +1217,7 @@ drop type textrange_en_us; -- test ICU collation customization -- test the attributes handled by icu_set_collation_attributes() CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes'); +NOTICE: using language tag "und-u-kc-ks-level1" for locale "@colStrength=primary;colCaseLevel=yes" SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignore_accents; ?column? | ?column? ----------+---------- @@ -1151,6 +1225,7 @@ SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE testcoll_ignor (1 row) CREATE COLLATION testcoll_backwards (provider = icu, locale = '@colBackwards=yes'); +NOTICE: using language tag "und-u-kb" for locale "@colBackwards=yes" SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll_backwards; ?column? | ?column? ----------+---------- @@ -1158,7 +1233,9 @@ SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE testcoll (1 row) CREATE COLLATION testcoll_lower_first (provider = icu, locale = '@colCaseFirst=lower'); +NOTICE: using language tag "und-u-kf-lower" for locale "@colCaseFirst=lower" CREATE COLLATION testcoll_upper_first (provider = icu, locale = '@colCaseFirst=upper'); +NOTICE: using language tag "und-u-kf-upper" for locale "@colCaseFirst=upper" SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcoll_upper_first; ?column? | ?column? ----------+---------- @@ -1166,6 +1243,7 @@ SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE testcol (1 row) CREATE COLLATION testcoll_shifted (provider = icu, locale = '@colAlternate=shifted'); +NOTICE: using language tag "und-u-ka-shifted" for locale "@colAlternate=shifted" SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE testcoll_shifted; ?column? | ?column? ----------+---------- @@ -1173,6 +1251,7 @@ SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE te (1 row) CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes'); +NOTICE: using language tag "und-u-kn" for locale "@colNumeric=yes" SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE testcoll_numeric; ?column? | ?column? ----------+---------- @@ -1184,6 +1263,7 @@ ERROR: could not open collator for locale "@colNumeric=lower": U_ILLEGAL_ARGUME -- test that attributes not handled by icu_set_collation_attributes() -- (handled by ucol_open() directly) also work CREATE COLLATION testcoll_de_phonebook (provider = icu, locale = 'de@collation=phonebook'); +NOTICE: using language tag "de-u-co-phonebk" for locale "de@collation=phonebook" SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE testcoll_de_phonebook; ?column? | ?column? ----------+---------- @@ -1192,7 +1272,9 @@ SELECT 'Goldmann' < 'Götz' COLLATE "de-x-icu", 'Goldmann' > 'Götz' COLLATE tes -- nondeterministic collations CREATE COLLATION ctest_det (provider = icu, locale = '', deterministic = true); +NOTICE: using language tag "und" for locale "" CREATE COLLATION ctest_nondet (provider = icu, locale = '', deterministic = false); +NOTICE: using language tag "und" for locale "" CREATE TABLE test6 (a int, b text); -- same string in different normal forms INSERT INTO test6 VALUES (1, U&'\00E4bc'); @@ -1242,7 +1324,9 @@ SELECT * FROM test6a WHERE b = ARRAY['äbc'] COLLATE ctest_nondet; (2 rows) CREATE COLLATION case_sensitive (provider = icu, locale = ''); +NOTICE: using language tag "und" for locale "" CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=secondary', deterministic = false); +NOTICE: using language tag "und-u-ks-level2" for locale "@colStrength=secondary" SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive; ?column? | ?column? ----------+---------- @@ -1710,6 +1794,7 @@ SELECT * FROM outer_text WHERE (f1, f2) NOT IN (SELECT * FROM inner_text); -- accents CREATE COLLATION ignore_accents (provider = icu, locale = '@colStrength=primary;colCaseLevel=yes', deterministic = false); +NOTICE: using language tag "und-u-kc-ks-level1" for locale "@colStrength=primary;colCaseLevel=yes" CREATE TABLE test4 (a int, b text); INSERT INTO test4 VALUES (1, 'cote'), (2, 'côte'), (3, 'coté'), (4, 'côté'); SELECT * FROM test4 WHERE b = 'cote'; diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql index b0ddc7db44..26096f0627 100644 --- a/src/test/regress/sql/collate.icu.utf8.sql +++ b/src/test/regress/sql/collate.icu.utf8.sql @@ -15,6 +15,17 @@ SET client_encoding TO UTF8; CREATE SCHEMA collate_tests; SET search_path = collate_tests; +-- test language tag canonicalization +SELECT pg_icu_language_tag('en_US', true); +SELECT pg_icu_language_tag('nonsense', false); +SELECT pg_icu_language_tag('nonsense', true); -- error +SELECT pg_icu_language_tag('C.UTF-8', true); +SELECT pg_icu_language_tag('POSIX', true); +SELECT pg_icu_language_tag('en_US_POSIX', true); +SELECT pg_icu_language_tag('@colStrength=secondary', true); +SELECT pg_icu_language_tag('', true); +SELECT pg_icu_language_tag('fr_CA.UTF-8', true); +SELECT pg_icu_language_tag('en_US@colStrength=primary', true); CREATE TABLE collate_test1 ( a int, @@ -357,6 +368,8 @@ CREATE ROLE regress_test_role; CREATE SCHEMA test_schema; -- We need to do this this way to cope with varying names for encodings: +SET client_min_messages TO WARNING; + do $$ BEGIN EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' || @@ -370,8 +383,17 @@ BEGIN quote_literal(current_setting('lc_collate')) || ');'; END $$; + +RESET client_min_messages; + CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale" -CREATE COLLATION testx (provider = icu, locale = 'nonsense'); /* never fails with ICU */ DROP COLLATION testx; + +SET icu_locale_validation = TRUE; +CREATE COLLATION testx (provider = icu, locale = 'nonsense'); -- fails +RESET icu_locale_validation; +CREATE COLLATION testx (provider = icu, locale = 'nonsense@colStrength=primary'); +SELECT colliculocale FROM pg_collation WHERE collname='testx'; +DROP COLLATION testx; CREATE COLLATION test4 FROM nonsense; CREATE COLLATION test5 FROM test0; -- 2.34.1