On 8/19/17 19:15, Peter Geoghegan wrote: > Noah Misch <n...@leadboat.com> wrote: >> I think you're contending that, as formulated, this is not a valid v10 open >> item. Are you? > > As the person that came up with this formulation, I'd like to give a > quick summary of my current understanding of the item's status: > > * We're in agreement that we ought to have initdb create initial > collations based on ICU locales, not based on distinct ICU > collations [1]. > > * We're in agreement that variant keywords should not be > created for each base locale/collation [2]. > > Once these two changes are made, I think that everything will be in good > shape as far as pg_collation name stability goes. It shouldn't take > Peter E. long to write the patch. I'm happy to write the patch on his > behalf if that saves time. > > We're also going to work on the documentation, to make keyword variants > like -emoji and -traditional at least somewhat discoverable, and to > explain the capabilities of custom ICU collations more generally.
Here are my patches to address this. -- Peter Eisentraut http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
From 5a70c7e97758bf06fd717b391b66f3cc0366f063 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut <pete...@gmx.net> Date: Mon, 21 Aug 2017 09:17:06 -0400 Subject: [PATCH 1/2] Expand set of predefined ICU locales Install language+region combinations even if they are not distinct from the language's base locale. This gives better long-term stability of the set of predefined locales and makes the predefined locales less implementation-dependent and more practical for users. --- doc/src/sgml/charset.sgml | 13 ++++++------- src/backend/commands/collationcmds.c | 15 ++++++++++++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 48ecfc5f48..f2a4acc115 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -653,9 +653,8 @@ <title>ICU collations</title> string will be accepted as a locale name.) See <ulink url="http://userguide.icu-project.org/locale"></ulink> for information on ICU locale naming. <command>initdb</command> uses the ICU - APIs to extract a set of locales with distinct collation rules to populate - the initial set of collations. Here are some example collations that - might be created: + APIs to extract a set of distinct locales to populate the initial set of + collations. Here are some example collations that might be created: <variablelist> <varlistentry> @@ -677,9 +676,9 @@ <title>ICU collations</title> <listitem> <para>German collation for Austria, default variant</para> <para> - (As of this writing, there is no, - say, <literal>de-DE-x-icu</literal> or <literal>de-CH-x-icu</literal>, - because those are equivalent to <literal>de-x-icu</literal>.) + (There are also, say, <literal>de-DE-x-icu</literal> + or <literal>de-CH-x-icu</literal>, but as of this writing, they are + equivalent to <literal>de-x-icu</literal>.) 
</para> </listitem> </varlistentry> @@ -690,6 +689,7 @@ <title>ICU collations</title> <para>German collation for Austria, phone book variant</para> </listitem> </varlistentry> + <varlistentry> <term><literal>und-x-icu</literal> (for <quote>undefined</quote>)</term> <listitem> @@ -724,7 +724,6 @@ <title>Copying Collations</title> <programlisting> CREATE COLLATION german FROM "de_DE"; CREATE COLLATION french FROM "fr-x-icu"; -CREATE COLLATION "de-DE-x-icu" FROM "de-x-icu"; </programlisting> </para> diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index 8572b2dedc..d36ce53560 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -667,7 +667,16 @@ pg_import_system_collations(PG_FUNCTION_ARGS) } #endif /* READ_LOCALE_A_OUTPUT */ - /* Load collations known to ICU */ + /* + * Load collations known to ICU + * + * We use uloc_countAvailable()/uloc_getAvailable() rather than + * ucol_countAvailable()/ucol_getAvailable(). The former returns a full + * set of language+region combinations, whereas the latter only returns + * language+region combinations if they are distinct from the language's + * base collation. So there might not be a de-DE or en-GB, which would be + * confusing. + */ #ifdef USE_ICU { int i; @@ -676,7 +685,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS) * Start the loop at -1 to sneak in the root locale without too much * code duplication. */ - for (i = -1; i < ucol_countAvailable(); i++) + for (i = -1; i < uloc_countAvailable(); i++) { /* * In ICU 4.2, ucol_getKeywordValuesForLocale() sometimes returns @@ -706,7 +715,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS) if (i == -1) name = ""; /* ICU root locale */ else - name = ucol_getAvailable(i); + name = uloc_getAvailable(i); langtag = get_icu_language_tag(name); collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name; -- 2.14.1
From 84c2fe583807369215f2d39ebf9b010862feee18 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut <pete...@gmx.net> Date: Mon, 21 Aug 2017 11:22:00 -0400 Subject: [PATCH 2/2] Don't install ICU collation keyword variants Users can still create them themselves. Instead, document Unicode TR 35 collation options for ICU, so users can create all this themselves. --- doc/src/sgml/charset.sgml | 97 ++++++++++++++++++++++++++++++------ src/backend/commands/collationcmds.c | 71 -------------------------- 2 files changed, 83 insertions(+), 85 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index f2a4acc115..30f2da2115 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -664,13 +664,6 @@ <title>ICU collations</title> </listitem> </varlistentry> - <varlistentry> - <term><literal>de-u-co-phonebk-x-icu</literal></term> - <listitem> - <para>German collation, phone book variant</para> - </listitem> - </varlistentry> - <varlistentry> <term><literal>de-AT-x-icu</literal></term> <listitem> @@ -683,13 +676,6 @@ <title>ICU collations</title> </listitem> </varlistentry> - <varlistentry> - <term><literal>de-AT-u-co-phonebk-x-icu</literal></term> - <listitem> - <para>German collation for Austria, phone book variant</para> - </listitem> - </varlistentry> - <varlistentry> <term><literal>und-x-icu</literal> (for <quote>undefined</quote>)</term> <listitem> @@ -709,6 +695,89 @@ <title>ICU collations</title> will draw an error along the lines of <quote>collation "de-x-icu" for encoding "WIN874" does not exist</>. </para> + + <para> + ICU allows collations to be customized beyond the basic language+country + set that is preloaded by <command>initdb</command>. Users are encouraged + to define their own collation objects that make use of these facilities to + suit the sorting behavior to their requirements. 
Here are some examples: + + <variablelist> + <varlistentry> + <term><literal>CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de-u-co-phonebk')</literal></term> + <listitem> + <para>German collation with phone book collation type</para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = 'und-u-co-emoji')</literal></term> + <listitem> + <para> + Root collation with Emoji collation type, per Unicode Technical Standard #51 + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION digitslast (provider = icu, locale = 'en-u-kr-latn-digit')</literal></term> + <listitem> + <para> + Sort digits after Latin letters. (The default is digits before letters.) + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper')</literal></term> + <listitem> + <para> + Sort upper-case letters before lower-case letters. (The default is + lower-case letters first.) + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION special (provider = icu, locale = 'en-u-kf-upper-kr-latn-digit')</literal></term> + <listitem> + <para> + Combines both of the above options. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CREATE COLLATION numeric (provider = icu, locale = 'en-u-kn-true')</literal></term> + <listitem> + <para> + Numeric ordering, sorts sequences of digits by their numeric value, + for example: <literal>A-21</literal> &lt; <literal>A-123</literal>. + </para> + </listitem> + </varlistentry> + </variablelist> + + See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode + Technical Standard #35</ulink> + and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for + details. 
The list of possible collation types (<literal>co</literal> + subtag) can be found in + the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR + repository</ulink>. + The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale + Explorer</ulink> can be used to check the details of a particular locale + definition. + </para> + + <para> + Note that while this system allows creating collations that <quote>ignore + case</quote> or <quote>ignore accents</quote> or similar (using + the <literal>ks</literal> key), PostgreSQL does not at the moment allow + such collations to act in a truly case- or accent-insensitive manner. Any + strings that compare equal according to the collation but are not + byte-wise equal will be sorted according to their byte values. + </para> </sect4> </sect3> diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c index d36ce53560..9437731276 100644 --- a/src/backend/commands/collationcmds.c +++ b/src/backend/commands/collationcmds.c @@ -687,30 +687,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS) */ for (i = -1; i < uloc_countAvailable(); i++) { - /* - * In ICU 4.2, ucol_getKeywordValuesForLocale() sometimes returns - * values that will not be accepted by uloc_toLanguageTag(). Skip - * loading keyword variants in that version. (Both - * ucol_getKeywordValuesForLocale() and uloc_toLanguageTag() are - * new in ICU 4.2, so older versions are not supported at all.) - * - * XXX We have no information about ICU 4.3 through 4.7, but we - * know the code below works with 4.8. 
- */ -#if U_ICU_VERSION_MAJOR_NUM > 4 || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM > 2) -#define LOAD_ICU_KEYWORD_VARIANTS -#endif - const char *name; char *langtag; char *icucomment; const char *collcollate; Oid collid; -#ifdef LOAD_ICU_KEYWORD_VARIANTS - UEnumeration *en; - UErrorCode status; - const char *val; -#endif if (i == -1) name = ""; /* ICU root locale */ @@ -744,58 +725,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS) CreateComments(collid, CollationRelationId, 0, icucomment); } - - /* - * Add keyword variants, if enabled. - */ -#ifdef LOAD_ICU_KEYWORD_VARIANTS - status = U_ZERO_ERROR; - en = ucol_getKeywordValuesForLocale("collation", name, TRUE, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not get keyword values for locale \"%s\": %s", - name, u_errorName(status)))); - - status = U_ZERO_ERROR; - uenum_reset(en, &status); - while ((val = uenum_next(en, NULL, &status))) - { - char *localeid = psprintf("%s@collation=%s", name, val); - - langtag = get_icu_language_tag(localeid); - collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : localeid; - - /* - * Be paranoid about not allowing any non-ASCII strings into - * pg_collation - */ - if (!is_all_ascii(langtag) || !is_all_ascii(collcollate)) - continue; - - collid = CollationCreate(psprintf("%s-x-icu", langtag), - nspid, GetUserId(), - COLLPROVIDER_ICU, -1, - collcollate, collcollate, - get_collation_actual_version(COLLPROVIDER_ICU, collcollate), - true, true); - if (OidIsValid(collid)) - { - ncreated++; - - CommandCounterIncrement(); - - icucomment = get_icu_locale_comment(localeid); - if (icucomment) - CreateComments(collid, CollationRelationId, 0, - icucomment); - } - } - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not get keyword values for locale \"%s\": %s", - name, u_errorName(status)))); - uenum_close(en); -#endif /* LOAD_ICU_KEYWORD_VARIANTS */ } } #endif /* USE_ICU */ -- 2.14.1
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers