On 2019-03-08 11:09, Peter Eisentraut wrote:
> On 2019-03-07 20:04, Daniel Verite wrote:
>> With previous versions, we'd need to call ucol_setAttribute(),
>> with the attributes and values defined here:
>> http://icu-project.org/apiref/icu4c/ucol_8h.html
>> for instance to get colStrength=secondary:
>>   ucol_setAttribute(coll, UCOL_STRENGTH , UCOL_SECONDARY, &status);
>> which I've just checked gives the expected result with ICU-4.2.
> 
> I see.  I'm thinking about adding some ad hoc code to
> pg_newlocale_from_collation() to parse these keywords ourselves, so we
> can provide the same interface for old ICU versions.  I'll send a
> separate patch for that.

Patches here.  This will allow all the existing collation customization
options as well as the ones being proposed in this thread to work in
older ICU versions.

-- 
Peter Eisentraut              http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
>From dc21791adee43a854247ff007b3757ab480e509d Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Mon, 11 Mar 2019 16:10:59 +0100
Subject: [PATCH 1/2] Add tests for ICU collation customization

---
 .../regress/expected/collate.icu.utf8.out     | 39 +++++++++++++++++++
 src/test/regress/sql/collate.icu.utf8.sql     | 21 ++++++++++
 2 files changed, 60 insertions(+)

diff --git a/src/test/regress/expected/collate.icu.utf8.out 
b/src/test/regress/expected/collate.icu.utf8.out
index f95d165288..4b94921cf8 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1100,6 +1100,45 @@ select textrange_en_us('A','Z') @> 'b'::text;
 
 drop type textrange_c;
 drop type textrange_en_us;
+-- test ICU collation customization
+CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = 
'@colStrength=primary;colCaseLevel=yes');
+SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE 
testcoll_ignore_accents;
+ ?column? | ?column? 
+----------+----------
+ t        | t
+(1 row)
+
+CREATE COLLATION testcoll_backwards (provider = icu, locale = 
'@colBackwards=yes');
+SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE 
testcoll_backwards;
+ ?column? | ?column? 
+----------+----------
+ t        | t
+(1 row)
+
+CREATE COLLATION testcoll_lower_first (provider = icu, locale = 
'@colCaseFirst=lower');
+CREATE COLLATION testcoll_upper_first (provider = icu, locale = 
'@colCaseFirst=upper');
+SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE 
testcoll_upper_first;
+ ?column? | ?column? 
+----------+----------
+ t        | t
+(1 row)
+
+CREATE COLLATION testcoll_shifted (provider = icu, locale = 
'@colAlternate=shifted');
+SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE 
testcoll_shifted;
+ ?column? | ?column? 
+----------+----------
+ t        | t
+(1 row)
+
+CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
+SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE 
testcoll_numeric;
+ ?column? | ?column? 
+----------+----------
+ t        | t
+(1 row)
+
+CREATE COLLATION testcoll_error1 (provider = icu, locale = 
'@colNumeric=lower');
+ERROR:  could not open collator for locale "@colNumeric=lower": 
U_ILLEGAL_ARGUMENT_ERROR
 -- cleanup
 SET client_min_messages TO warning;
 DROP SCHEMA collate_tests CASCADE;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql 
b/src/test/regress/sql/collate.icu.utf8.sql
index 0aeba3e202..73fb1232a7 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -425,6 +425,27 @@ CREATE INDEX collate_dep_test4i ON collate_dep_test4t (b 
COLLATE test0);
 drop type textrange_en_us;
 
 
+-- test ICU collation customization
+
+CREATE COLLATION testcoll_ignore_accents (provider = icu, locale = 
'@colStrength=primary;colCaseLevel=yes');
+SELECT 'aaá' > 'AAA' COLLATE "und-x-icu", 'aaá' < 'AAA' COLLATE 
testcoll_ignore_accents;
+
+CREATE COLLATION testcoll_backwards (provider = icu, locale = 
'@colBackwards=yes');
+SELECT 'coté' < 'côte' COLLATE "und-x-icu", 'coté' > 'côte' COLLATE 
testcoll_backwards;
+
+CREATE COLLATION testcoll_lower_first (provider = icu, locale = 
'@colCaseFirst=lower');
+CREATE COLLATION testcoll_upper_first (provider = icu, locale = 
'@colCaseFirst=upper');
+SELECT 'aaa' < 'AAA' COLLATE testcoll_lower_first, 'aaa' > 'AAA' COLLATE 
testcoll_upper_first;
+
+CREATE COLLATION testcoll_shifted (provider = icu, locale = 
'@colAlternate=shifted');
+SELECT 'de-luge' < 'deanza' COLLATE "und-x-icu", 'de-luge' > 'deanza' COLLATE 
testcoll_shifted;
+
+CREATE COLLATION testcoll_numeric (provider = icu, locale = '@colNumeric=yes');
+SELECT 'A-21' > 'A-123' COLLATE "und-x-icu", 'A-21' < 'A-123' COLLATE 
testcoll_numeric;
+
+CREATE COLLATION testcoll_error1 (provider = icu, locale = 
'@colNumeric=lower');
+
+
 -- cleanup
 SET client_min_messages TO warning;
 DROP SCHEMA collate_tests CASCADE;
-- 
2.21.0

>From 711ec5f45e83abab93d06cc1d65647d04c6e97d0 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <pe...@eisentraut.org>
Date: Mon, 11 Mar 2019 21:20:03 +0100
Subject: [PATCH 2/2] Add support for collation attributes on older ICU
 versions

Starting in ICU 54, collation customization attributes can be
specified in the locale string, for example
"@colStrength=primary;colCaseLevel=yes".  Add support for this for
older ICU versions as well, by adding some minimal parsing of the
attributes in the locale string and calling ucol_setAttribute() on
them.  This is essentially what newer ICU versions do internally in
ucol_open().  This way we can offer this functionality in a consistent
way in all ICU versions supported by PostgreSQL.
---
 src/backend/utils/adt/pg_locale.c | 104 ++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/src/backend/utils/adt/pg_locale.c 
b/src/backend/utils/adt/pg_locale.c
index 50b8b31645..c6fedab8f4 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -58,6 +58,7 @@
 #include "catalog/pg_control.h"
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
+#include "utils/formatting.h"
 #include "utils/hsearch.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -132,6 +133,9 @@ static HTAB *collation_cache = NULL;
 static char *IsoLocaleName(const char *);      /* MSVC specific */
 #endif
 
+#ifdef USE_ICU
+static void icu_set_collation_attributes(UCollator *collator, const char *loc);
+#endif
 
 /*
  * pg_perm_setlocale
@@ -1380,6 +1384,9 @@ pg_newlocale_from_collation(Oid collid)
                                                (errmsg("could not open 
collator for locale \"%s\": %s",
                                                                collcollate, 
u_errorName(status))));
 
+                       if (U_ICU_VERSION_MAJOR_NUM < 54)
+                               icu_set_collation_attributes(collator, 
collcollate);
+
                        /* We will leak this string if we get an error below 
:-( */
                        result.info.icu.locale = 
MemoryContextStrdup(TopMemoryContext,
                                                                                
                                 collcollate);
@@ -1588,6 +1595,103 @@ icu_from_uchar(char **result, const UChar *buff_uchar, 
int32_t len_uchar)
        return len_result;
 }
 
+/*
+ * Parse collation attributes and apply them to the open collator.  This takes
+ * a string like "und@colStrength=primary;colCaseLevel=yes" and applies the
+ * key-value arguments: unknown keys are skipped, invalid values are errors.
+ *
+ * Starting with ICU version 54, the attributes are processed automatically by
+ * ucol_open(), so this is only necessary for emulating this behavior on older
+ * versions.
+ */
+pg_attribute_unused()
+static void
+icu_set_collation_attributes(UCollator *collator, const char *loc)
+{
+       char       *str = asc_tolower(loc, strlen(loc));
+
+       str = strchr(str, '@');
+       if (!str)
+               return;
+       str++;
+
+       for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
+       {
+               char       *e = strchr(token, '=');
+
+               if (e)
+               {
+                       char       *name;
+                       char       *value;
+                       UColAttribute uattr;
+                       UColAttributeValue uvalue;
+                       UErrorCode      status;
+
+                       status = U_ZERO_ERROR;
+
+                       *e = '\0';      /* split "name=value" in place */
+                       name = token;
+                       value = e + 1;
+
+                       /*
+                        * See attribute name and value lists in ICU i18n/coll.cpp
+                        */
+                       if (strcmp(name, "colstrength") == 0)
+                               uattr = UCOL_STRENGTH;
+                       else if (strcmp(name, "colbackwards") == 0)
+                               uattr = UCOL_FRENCH_COLLATION;
+                       else if (strcmp(name, "colcaselevel") == 0)
+                               uattr = UCOL_CASE_LEVEL;
+                       else if (strcmp(name, "colcasefirst") == 0)
+                               uattr = UCOL_CASE_FIRST;
+                       else if (strcmp(name, "colalternate") == 0)
+                               uattr = UCOL_ALTERNATE_HANDLING;
+                       else if (strcmp(name, "colnormalization") == 0)
+                               uattr = UCOL_NORMALIZATION_MODE;
+                       else if (strcmp(name, "colnumeric") == 0)
+                               uattr = UCOL_NUMERIC_COLLATION;
+                       else
+                               continue;       /* not ours: skip key and its value */
+                       if (strcmp(value, "primary") == 0)
+                               uvalue = UCOL_PRIMARY;
+                       else if (strcmp(value, "secondary") == 0)
+                               uvalue = UCOL_SECONDARY;
+                       else if (strcmp(value, "tertiary") == 0)
+                               uvalue = UCOL_TERTIARY;
+                       else if (strcmp(value, "quaternary") == 0)
+                               uvalue = UCOL_QUATERNARY;
+                       else if (strcmp(value, "identical") == 0)
+                               uvalue = UCOL_IDENTICAL;
+                       else if (strcmp(value, "no") == 0)
+                               uvalue = UCOL_OFF;
+                       else if (strcmp(value, "yes") == 0)
+                               uvalue = UCOL_ON;
+                       else if (strcmp(value, "shifted") == 0)
+                               uvalue = UCOL_SHIFTED;
+                       else if (strcmp(value, "non-ignorable") == 0)
+                               uvalue = UCOL_NON_IGNORABLE;
+                       else if (strcmp(value, "lower") == 0)
+                               uvalue = UCOL_LOWER_FIRST;
+                       else if (strcmp(value, "upper") == 0)
+                               uvalue = UCOL_UPPER_FIRST;
+                       else
+                               status = U_ILLEGAL_ARGUMENT_ERROR;
+
+                       if (status == U_ZERO_ERROR)
+                               ucol_setAttribute(collator, uattr, uvalue, &status);
+
+                       /*
+                        * Pretend the error came from ucol_open(), for consistent error
+                        * message across ICU versions.
+                        */
+                       if (U_FAILURE(status))
+                               ereport(ERROR,
+                                               (errmsg("could not open collator for locale \"%s\": %s",
+                                                               loc, u_errorName(status))));
+               }
+       }
+}
+
 #endif                                                 /* USE_ICU */
 
 /*
-- 
2.21.0

Reply via email to