On Wed, 2025-12-17 at 11:39 +0100, Peter Eisentraut wrote:
> For Metaphone, I found the reference implementation linked from its 
> Wikipedia page, and it looks like our implementation is pretty
> closely 
> aligned to that.  That reference implementation also contains the 
> C-with-cedilla case explicitly.  The correct fix here would probably
> be 
> to change the implementation to work on wide characters.  But I think
> for the moment you could try a shortcut like, use pg_ascii_toupper(),
> but if the encoding is LATIN1 (or LATIN9 or whichever other encodings
> also contain C-with-cedilla at that code point), then explicitly 
> uppercase that one as well.  This would preserve the existing
> behavior.

Done, attached new patches.

Interestingly, WIN1256 encodes only the SMALL LETTER C WITH CEDILLA. I
think, for the purposes here, we can still consider it to "uppercase"
to \xc7, so that it can still be treated as the same sound. Technically
I think that would be an improvement over the current code in this edge
case, and suggests that case folding would be a better approach than
uppercasing.

Regards,
        Jeff Davis

From 8161ca49ae2044e004d3f36c04f60b03e97f4071 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 13:24:38 -0800
Subject: [PATCH v13 1/2] fuzzystrmatch: use pg_ascii_toupper().

fuzzystrmatch is designed for ASCII, so no need to rely on the global
LC_CTYPE setting.

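The C WITH CEDILLA (\xc7) case in dmetaphone.c is preserved for the
single-byte encodings that encode the lowercase form as \xe7.

TODO: what should the behavior be for soundex()?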

Discussion: https://postgr.es/m/[email protected]
---
 contrib/fuzzystrmatch/dmetaphone.c    | 45 +++++++++++++++++++++++++--
 contrib/fuzzystrmatch/fuzzystrmatch.c | 43 ++++++++++++++-----------
 2 files changed, 67 insertions(+), 21 deletions(-)

diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 227d8b11ddc..9a4e5ae7e0e 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -98,6 +98,7 @@ The remaining code is authored by Andrew Dunstan <[email protected]> and
 
 #include "postgres.h"
 
+#include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 
 /* turn off assertions for embedded function */
@@ -116,6 +117,9 @@ The remaining code is authored by Andrew Dunstan <[email protected]> and
 #include <assert.h>
 #include <ctype.h>
 
+#define SMALL_LETTER_C_WITH_CEDILLA		'\xe7'
+#define CAPITAL_LETTER_C_WITH_CEDILLA	'\xc7'
+
 /* prototype for the main function we got from the perl module */
 static void DoubleMetaphone(char *str, char **codes);
 
@@ -282,9 +286,46 @@ static void
 MakeUpper(metastring *s)
 {
 	char	   *i;
+	bool		c_with_cedilla;
+
+	/*
+	 * C WITH CEDILLA should be uppercased, as well.
+	 *
+	 * XXX: Only works in single-byte encodings that encode lowercase C WITH
+	 * CEDILLA as \xe7. Should have proper multibyte support.
+	 *
+	 * NB: WIN1256 encodes only the lowercase C WITH CEDILLA, but for the
+	 * purposes of metaphone, we can still "uppercase" it to \xc7 here so that
+	 * it's recognized later.
+	 */
+	switch (GetDatabaseEncoding())
+	{
+		case PG_LATIN1:
+		case PG_LATIN2:
+		case PG_LATIN3:
+		case PG_LATIN5:
+		case PG_LATIN8:
+		case PG_LATIN9:
+		case PG_LATIN10:
+		case PG_WIN1250:
+		case PG_WIN1252:
+		case PG_WIN1254:
+		case PG_WIN1256:
+		case PG_WIN1258:
+			c_with_cedilla = true;
+			break;
+		default:
+			c_with_cedilla = false;
+			break;
+	}
 
 	for (i = s->str; *i; i++)
-		*i = toupper((unsigned char) *i);
+	{
+		if (c_with_cedilla && *i == SMALL_LETTER_C_WITH_CEDILLA)
+			*i = CAPITAL_LETTER_C_WITH_CEDILLA;
+		else
+			*i = pg_ascii_toupper((unsigned char) *i);
+	}
 }
 
 
@@ -463,7 +504,7 @@ DoubleMetaphone(char *str, char **codes)
 					current += 1;
 				break;
 
-			case '\xc7':		/* C with cedilla */
+			case CAPITAL_LETTER_C_WITH_CEDILLA:
 				MetaphAdd(primary, "S");
 				MetaphAdd(secondary, "S");
 				current += 1;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
index e7cc314b763..319302af0e4 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -62,7 +62,7 @@ static const char *const soundex_table = "01230120022455012623010202";
 static char
 soundex_code(char letter)
 {
-	letter = toupper((unsigned char) letter);
+	letter = pg_ascii_toupper((unsigned char) letter);
 	/* Defend against non-ASCII letters */
 	if (letter >= 'A' && letter <= 'Z')
 		return soundex_table[letter - 'A'];
@@ -122,16 +122,21 @@ static const char _codes[26] = {
 static int
 getcode(char c)
 {
-	if (isalpha((unsigned char) c))
-	{
-		c = toupper((unsigned char) c);
-		/* Defend against non-ASCII letters */
-		if (c >= 'A' && c <= 'Z')
-			return _codes[c - 'A'];
-	}
+	c = pg_ascii_toupper((unsigned char) c);
+	/* Only 'A'..'Z' index into _codes[]; any other byte yields 0 below */
+	if (c >= 'A' && c <= 'Z')
+		return _codes[c - 'A'];
+
 	return 0;
 }
 
+static bool
+ascii_isalpha(char c)			/* true only for the ASCII letters A-Z and a-z */
+{
+	return (c >= 'A' && c <= 'Z') ||
+		(c >= 'a' && c <= 'z');	/* plain range checks, no <ctype.h> call */
+}
+
 #define isvowel(c)	(getcode(c) & 1)	/* AEIOU */
 
 /* These letters are passed through unchanged */
@@ -301,18 +306,18 @@ metaphone(PG_FUNCTION_ARGS)
  * accessing the array directly... */
 
 /* Look at the next letter in the word */
-#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
+#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
 /* Look at the current letter in the word */
-#define Curr_Letter (toupper((unsigned char) word[w_idx]))
+#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
 /* Go N letters back. */
 #define Look_Back_Letter(n) \
-	(w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
+	(w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
 /* Previous letter.  I dunno, should this return null on failure? */
 #define Prev_Letter (Look_Back_Letter(1))
 /* Look two letters down.  It makes sure you don't walk off the string. */
 #define After_Next_Letter \
-	(Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
-#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
+	(Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
+#define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))
 
 
 /* Allows us to safely look ahead an arbitrary # of letters */
@@ -340,7 +345,7 @@ Lookahead(char *word, int how_far)
 #define Phone_Len	(p_idx)
 
 /* Note is a letter is a 'break' in the word */
-#define Isbreak(c)	(!isalpha((unsigned char) (c)))
+#define Isbreak(c)	(!ascii_isalpha((unsigned char) (c)))
 
 
 static void
@@ -379,7 +384,7 @@ _metaphone(char *word,			/* IN */
 
 	/*-- The first phoneme has to be processed specially. --*/
 	/* Find our first letter */
-	for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
+	for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++)
 	{
 		/* On the off chance we were given nothing but crap... */
 		if (Curr_Letter == '\0')
@@ -478,7 +483,7 @@ _metaphone(char *word,			/* IN */
 		 */
 
 		/* Ignore non-alphas */
-		if (!isalpha((unsigned char) (Curr_Letter)))
+		if (!ascii_isalpha((unsigned char) (Curr_Letter)))
 			continue;
 
 		/* Drop duplicates, except CC */
@@ -731,7 +736,7 @@ _soundex(const char *instr, char *outstr)
 	Assert(outstr);
 
 	/* Skip leading non-alphabetic characters */
-	while (*instr && !isalpha((unsigned char) *instr))
+	while (*instr && !ascii_isalpha((unsigned char) *instr))
 		++instr;
 
 	/* If no string left, return all-zeroes buffer */
@@ -742,12 +747,12 @@ _soundex(const char *instr, char *outstr)
 	}
 
 	/* Take the first letter as is */
-	*outstr++ = (char) toupper((unsigned char) *instr++);
+	*outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
 
 	count = 1;
 	while (*instr && count < SOUNDEX_LEN)
 	{
-		if (isalpha((unsigned char) *instr) &&
+		if (ascii_isalpha((unsigned char) *instr) &&
 			soundex_code(*instr) != soundex_code(*(instr - 1)))
 		{
 			*outstr = soundex_code(*instr);
-- 
2.43.0

From 5d8d22077aaa6b7365c52b016ad0e22296b68b05 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 24 Nov 2025 14:00:52 -0800
Subject: [PATCH v13 2/2] Control LC_COLLATE with GUC.

Now that the global LC_COLLATE setting is not used for any in-core
purpose at all (see commit 5e6e42e44f), allow it to be set with a
GUC. This may be useful for extensions or procedural languages that
still depend on the global LC_COLLATE setting.

TODO: needs discussion

Discussion: https://postgr.es/m/[email protected]
---
 src/backend/utils/adt/pg_locale.c             | 59 +++++++++++++++++++
 src/backend/utils/init/postinit.c             |  2 +
 src/backend/utils/misc/guc_parameters.dat     |  9 +++
 src/backend/utils/misc/postgresql.conf.sample |  2 +
 src/bin/initdb/initdb.c                       |  3 +
 src/include/utils/guc_hooks.h                 |  2 +
 src/include/utils/pg_locale.h                 |  1 +
 7 files changed, 78 insertions(+)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index ee08ac045b7..6dfbe8af47b 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -81,6 +81,7 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
 extern char *get_collation_actual_version_libc(const char *collcollate);
 
 /* GUC settings */
+char	   *locale_collate;
 char	   *locale_messages;
 char	   *locale_monetary;
 char	   *locale_numeric;
@@ -369,6 +370,64 @@ assign_locale_time(const char *newval, void *extra)
 	CurrentLCTimeValid = false;
 }
 
+/*
+ * We allow LC_COLLATE to actually be set globally.
+ *
+ * Note: we normally disallow value = "" because it wouldn't have consistent
+ * semantics (it'd effectively just use the previous value).  However, this
+ * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
+ * not even if the attempted setting fails due to invalid environment value.
+ * The idea there is just to accept the environment setting *if possible*
+ * during startup, until we can read the proper value from postgresql.conf.
+ */
+bool
+check_locale_collate(char **newval, void **extra, GucSource source)
+{
+	int			locale_enc;
+	int			db_enc;
+
+	if (**newval == '\0')
+	{
+		if (source == PGC_S_DEFAULT)
+			return true;
+		else
+			return false;
+	}
+	/* Look up the encoding of the requested locale and of the database. */
+	locale_enc = pg_get_encoding_from_locale(*newval, true);
+	db_enc = GetDatabaseEncoding();
+	/* OK if equal, if either side is SQL_ASCII, or if locale_enc is -1. */
+	if (!(locale_enc == db_enc ||
+		  locale_enc == PG_SQL_ASCII ||
+		  db_enc == PG_SQL_ASCII ||
+		  locale_enc == -1))
+	{
+		if (source == PGC_S_FILE)
+		{
+			guc_free(*newval);
+			*newval = guc_strdup(LOG, "C");
+			if (!*newval)
+				return false;
+		}
+		else if (source != PGC_S_TEST)
+		{
+			ereport(WARNING,
+					(errmsg("encoding mismatch"),
+					 errdetail("Locale \"%s\" uses encoding \"%s\", which does not match database encoding \"%s\".",
+							   *newval, pg_encoding_to_char(locale_enc), pg_encoding_to_char(db_enc))));
+			return false;
+		}
+	}
+	/* Not rejected above; a mismatching PGC_S_FILE value is "C" by now. */
+	return check_locale(LC_COLLATE, *newval, NULL);
+}
+
+void
+assign_locale_collate(const char *newval, void *extra)
+{
+	(void) pg_perm_setlocale(LC_COLLATE, newval);	/* result is ignored */
+}
+
 /*
  * We allow LC_MESSAGES to actually be set globally.
  *
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index b7e94ca45bd..eee0b971590 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -404,6 +404,8 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect
 	 * the pg_database tuple.
 	 */
 	SetDatabaseEncoding(dbform->encoding);
+	/* Reset lc_collate to check encoding, and fall back to C if necessary */
+	SetConfigOption("lc_collate", locale_collate, PGC_POSTMASTER, PGC_S_FILE);
 	/* Record it as a GUC internal option, too */
 	SetConfigOption("server_encoding", GetDatabaseEncodingName(),
 					PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index ac0c7c36c56..cf7675aa2bb 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1466,6 +1466,15 @@
   boot_val => 'PG_KRB_SRVTAB',
 },
 
+{ name => 'lc_collate', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_LOCALE',
+  short_desc => 'Sets the locale for text ordering in extensions.',
+  long_desc => 'An empty string means use the operating system setting.',
+  variable => 'locale_collate',
+  boot_val => '""',
+  check_hook => 'check_locale_collate',
+  assign_hook => 'assign_locale_collate',
+},
+
 { name => 'lc_messages', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_LOCALE',
   short_desc => 'Sets the language in which messages are displayed.',
   long_desc => 'An empty string means use the operating system setting.',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index dc9e2255f8a..19332e39e82 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -798,6 +798,8 @@
                                         # encoding
 
 # These settings are initialized by initdb, but they can be changed.
+#lc_collate = ''                        # locale for text ordering (only affects
+                                        # extensions)
 #lc_messages = ''                       # locale for system error message
                                         # strings
 #lc_monetary = 'C'                      # locale for monetary formatting
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 92fe2f531f7..8b2e7bfab6f 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -1312,6 +1312,9 @@ setup_config(void)
 	conflines = replace_guc_value(conflines, "shared_buffers",
 								  repltok, false);
 
+	conflines = replace_guc_value(conflines, "lc_collate",
+								  lc_collate, false);
+
 	conflines = replace_guc_value(conflines, "lc_messages",
 								  lc_messages, false);
 
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index fbe0b1e2e3d..f3bfc8dfb7e 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -66,6 +66,8 @@ extern bool check_huge_page_size(int *newval, void **extra, GucSource source);
 extern void assign_io_method(int newval, void *extra);
 extern bool check_io_max_concurrency(int *newval, void **extra, GucSource source);
 extern const char *show_in_hot_standby(void);
+extern bool check_locale_collate(char **newval, void **extra, GucSource source);
+extern void assign_locale_collate(const char *newval, void *extra);
 extern bool check_locale_messages(char **newval, void **extra, GucSource source);
 extern void assign_locale_messages(const char *newval, void *extra);
 extern bool check_locale_monetary(char **newval, void **extra, GucSource source);
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 86016b9344e..096ea1e4963 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -41,6 +41,7 @@
 #define UNICODE_CASEMAP_BUFSZ	(UNICODE_CASEMAP_LEN * MAX_MULTIBYTE_CHAR_LEN)
 
 /* GUC settings */
+extern PGDLLIMPORT char *locale_collate;
 extern PGDLLIMPORT char *locale_messages;
 extern PGDLLIMPORT char *locale_monetary;
 extern PGDLLIMPORT char *locale_numeric;
-- 
2.43.0

Reply via email to