Re: ICU locale validation / canonicalization

Jeff Davis Tue, 14 Mar 2023 10:11:01 -0700

On Tue, 2023-03-14 at 08:08 +0100, Peter Eisentraut wrote:
> Another issue that came to mind:  Right now, you can, say, develop
> SQL 
> schemas on a newer ICU version, say, your laptop, and then deploy
> them 
> on a server running an older ICU version.  If we have a cutoff beyond
> which we convert ICU locale IDs to language tags, then this won't
> work 
> anymore for certain combinations.  And RHEL/CentOS 7 is still pretty 
> popular.


If we just uloc_canonicalize() in icu_set_collation_attributes() then
versions 50-53 can support language tags. Patch attached.

One loose end is that we really should support language tags like "und"
in those older versions (54 and earlier). Your commit d72900bded
avoided the problem, but perhaps we should fix it by looking for "und"
and replacing it with "root" while opening, or something.


-- 
Jeff Davis
PostgreSQL Contributor Team - AWS

From ba1f0794d3be3fe7a9a365f4b6312bef9c215728 Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Tue, 14 Mar 2023 09:58:29 -0700
Subject: [PATCH v1] Support language tags in older ICU versions (53 and
 earlier).

By calling uloc_canonicalize() before parsing the attributes, the
existing locale attribute parsing logic works on language tags as
well.

Fix a small memory leak, too.
---
 src/backend/commands/collationcmds.c          |  8 +++---
 src/backend/utils/adt/pg_locale.c             | 26 ++++++++++++++++---
 .../regress/expected/collate.icu.utf8.out     |  8 ++++++
 src/test/regress/sql/collate.icu.utf8.sql     |  4 +++
 4 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index 8949684afe..b8f2e7059f 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -950,7 +950,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 			const char *name;
 			char	   *langtag;
 			char	   *icucomment;
-			const char *iculocstr;
 			Oid			collid;
 
 			if (i == -1)
@@ -959,20 +958,19 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 				name = uloc_getAvailable(i);
 
 			langtag = get_icu_language_tag(name);
-			iculocstr = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name;
 
 			/*
 			 * Be paranoid about not allowing any non-ASCII strings into
 			 * pg_collation
 			 */
-			if (!pg_is_ascii(langtag) || !pg_is_ascii(iculocstr))
+			if (!pg_is_ascii(langtag) || !pg_is_ascii(langtag))
 				continue;
 
 			collid = CollationCreate(psprintf("%s-x-icu", langtag),
 									 nspid, GetUserId(),
 									 COLLPROVIDER_ICU, true, -1,
-									 NULL, NULL, iculocstr, NULL,
-									 get_collation_actual_version(COLLPROVIDER_ICU, iculocstr),
+									 NULL, NULL, langtag, NULL,
+									 get_collation_actual_version(COLLPROVIDER_ICU, langtag),
 									 true, true);
 			if (OidIsValid(collid))
 			{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 1d3d4d86d3..b9c7fbd511 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2643,9 +2643,28 @@ pg_attribute_unused()
 static void
 icu_set_collation_attributes(UCollator *collator, const char *loc)
 {
-	char	   *str = asc_tolower(loc, strlen(loc));
+	UErrorCode	status;
+	int32_t		len;
+	char	   *icu_locale_id;
+	char	   *lower_str;
+	char	   *str;
 
-	str = strchr(str, '@');
+	/* first, make sure the string is an ICU format locale ID */
+	status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, NULL, 0, &status);
+	icu_locale_id = palloc(len + 1);
+	status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, icu_locale_id, len + 1, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("canonicalization failed for locale string \"%s\": %s",
+						loc, u_errorName(status))));
+
+	lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
+
+	pfree(icu_locale_id);
+
+	str = strchr(lower_str, '@');
 	if (!str)
 		return;
 	str++;
@@ -2660,7 +2679,6 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 			char	   *value;
 			UColAttribute uattr;
 			UColAttributeValue uvalue;
-			UErrorCode	status;
 
 			status = U_ZERO_ERROR;
 
@@ -2727,6 +2745,8 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 								loc, u_errorName(status))));
 		}
 	}
+
+	pfree(lower_str);
 }
 
 #endif							/* USE_ICU */
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index 9a3e12e42d..057c1d1bf6 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1304,6 +1304,14 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
  t        | t
 (1 row)
 
+-- test language tags
+CREATE COLLATION langtag (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
+SELECT 'aBcD' COLLATE langtag = 'AbCd' COLLATE langtag;
+ ?column? 
+----------
+ t
+(1 row)
+
 CREATE TABLE test1cs (x text COLLATE case_sensitive);
 CREATE TABLE test2cs (x text COLLATE case_sensitive);
 CREATE TABLE test3cs (x text COLLATE case_sensitive);
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 0790068f31..b9f5798715 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -518,6 +518,10 @@ CREATE COLLATION case_insensitive (provider = icu, locale = '@colStrength=second
 SELECT 'abc' <= 'ABC' COLLATE case_sensitive, 'abc' >= 'ABC' COLLATE case_sensitive;
 SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_insensitive;
 
+-- test language tags
+CREATE COLLATION langtag (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
+SELECT 'aBcD' COLLATE langtag = 'AbCd' COLLATE langtag;
+
 CREATE TABLE test1cs (x text COLLATE case_sensitive);
 CREATE TABLE test2cs (x text COLLATE case_sensitive);
 CREATE TABLE test3cs (x text COLLATE case_sensitive);
-- 
2.34.1

Re: ICU locale validation / canonicalization

Reply via email to