On Wed, 2025-12-17 at 11:39 +0100, Peter Eisentraut wrote:
> For Metaphone, I found the reference implementation linked from its
> Wikipedia page, and it looks like our implementation is pretty closely
> aligned to that. That reference implementation also contains the
> C-with-cedilla case explicitly. The correct fix here would probably be
> to change the implementation to work on wide characters. But I think
> for the moment you could try a shortcut like, use pg_ascii_toupper(),
> but if the encoding is LATIN1 (or LATIN9 or whichever other encodings
> also contain C-with-cedilla at that code point), then explicitly
> uppercase that one as well. This would preserve the existing
> behavior.
Done, attached new patches.
Interestingly, WIN1256 encodes only the SMALL LETTER C WITH CEDILLA. I
think, for the purposes here, we can still consider it to "uppercase"
to \xc7, so that it can still be treated as the same sound. Technically
I think that would be an improvement over the current code in this edge
case, and suggests that case folding would be a better approach than
uppercasing.
Regards,
Jeff Davis
From 8161ca49ae2044e004d3f36c04f60b03e97f4071 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Wed, 19 Nov 2025 13:24:38 -0800
Subject: [PATCH v13 1/2] fuzzystrmatch: use pg_ascii_toupper().
fuzzystrmatch is designed for ASCII, so no need to rely on the global
LC_CTYPE setting.
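As a special case, dmetaphone.c still maps SMALL LETTER C WITH CEDILLA
(\xe7) to \xc7 in the single-byte encodings that encode it at that code
point, so that it continues to be recognized later.
TODO: what should the behavior be for soundex()?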
Discussion: https://postgr.es/m/[email protected]
---
contrib/fuzzystrmatch/dmetaphone.c | 45 +++++++++++++++++++++++++--
contrib/fuzzystrmatch/fuzzystrmatch.c | 43 ++++++++++++++-----------
2 files changed, 67 insertions(+), 21 deletions(-)
diff --git a/contrib/fuzzystrmatch/dmetaphone.c b/contrib/fuzzystrmatch/dmetaphone.c
index 227d8b11ddc..9a4e5ae7e0e 100644
--- a/contrib/fuzzystrmatch/dmetaphone.c
+++ b/contrib/fuzzystrmatch/dmetaphone.c
@@ -98,6 +98,7 @@ The remaining code is authored by Andrew Dunstan <[email protected]> and
#include "postgres.h"
+#include "mb/pg_wchar.h"
#include "utils/builtins.h"
/* turn off assertions for embedded function */
@@ -116,6 +117,9 @@ The remaining code is authored by Andrew Dunstan <[email protected]> and
#include <assert.h>
#include <ctype.h>
+#define SMALL_LETTER_C_WITH_CEDILLA '\xe7'
+#define CAPITAL_LETTER_C_WITH_CEDILLA '\xc7'
+
/* prototype for the main function we got from the perl module */
static void DoubleMetaphone(char *str, char **codes);
@@ -282,9 +286,46 @@ static void
MakeUpper(metastring *s)
{
char *i;
+ bool c_with_cedilla;
+
+ /*
+ * C WITH CEDILLA should be uppercased, as well.
+ *
+ * XXX: Only works in single-byte encodings that encode lowercase C WITH
+ * CEDILLA as \xe7. Should have proper multibyte support.
+ *
+ * NB: WIN1256 encodes only the lowercase C WITH CEDILLA, but for the
+ * purposes of metaphone, we can still "uppercase" it to \xc7 here so that
+ * it's recognized later.
+ */
+ switch (GetDatabaseEncoding())
+ {
+ case PG_LATIN1:
+ case PG_LATIN2:
+ case PG_LATIN3:
+ case PG_LATIN5:
+ case PG_LATIN8:
+ case PG_LATIN9:
+ case PG_LATIN10:
+ case PG_WIN1250:
+ case PG_WIN1252:
+ case PG_WIN1254:
+ case PG_WIN1256:
+ case PG_WIN1258:
+ c_with_cedilla = true;
+ break;
+ default:
+ c_with_cedilla = false;
+ break;
+ }
for (i = s->str; *i; i++)
- *i = toupper((unsigned char) *i);
+ {
+ if (c_with_cedilla && *i == SMALL_LETTER_C_WITH_CEDILLA)
+ *i = CAPITAL_LETTER_C_WITH_CEDILLA;
+ else
+ *i = pg_ascii_toupper((unsigned char) *i);
+ }
}
@@ -463,7 +504,7 @@ DoubleMetaphone(char *str, char **codes)
current += 1;
break;
- case '\xc7': /* C with cedilla */
+ case CAPITAL_LETTER_C_WITH_CEDILLA:
MetaphAdd(primary, "S");
MetaphAdd(secondary, "S");
current += 1;
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
index e7cc314b763..319302af0e4 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -62,7 +62,7 @@ static const char *const soundex_table = "01230120022455012623010202";
static char
soundex_code(char letter)
{
- letter = toupper((unsigned char) letter);
+ letter = pg_ascii_toupper((unsigned char) letter);
/* Defend against non-ASCII letters */
if (letter >= 'A' && letter <= 'Z')
return soundex_table[letter - 'A'];
@@ -122,16 +122,21 @@ static const char _codes[26] = {
static int
getcode(char c)
{
- if (isalpha((unsigned char) c))
- {
- c = toupper((unsigned char) c);
- /* Defend against non-ASCII letters */
- if (c >= 'A' && c <= 'Z')
- return _codes[c - 'A'];
- }
+ c = pg_ascii_toupper((unsigned char) c);
+ /* Defend against non-ASCII letters */
+ if (c >= 'A' && c <= 'Z')
+ return _codes[c - 'A'];
+
return 0;
}
+static bool
+ascii_isalpha(char c)
+{
+ return (c >= 'A' && c <= 'Z') ||
+ (c >= 'a' && c <= 'z');
+}
+
#define isvowel(c) (getcode(c) & 1) /* AEIOU */
/* These letters are passed through unchanged */
@@ -301,18 +306,18 @@ metaphone(PG_FUNCTION_ARGS)
* accessing the array directly... */
/* Look at the next letter in the word */
-#define Next_Letter (toupper((unsigned char) word[w_idx+1]))
+#define Next_Letter (pg_ascii_toupper((unsigned char) word[w_idx+1]))
/* Look at the current letter in the word */
-#define Curr_Letter (toupper((unsigned char) word[w_idx]))
+#define Curr_Letter (pg_ascii_toupper((unsigned char) word[w_idx]))
/* Go N letters back. */
#define Look_Back_Letter(n) \
- (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
+ (w_idx >= (n) ? pg_ascii_toupper((unsigned char) word[w_idx-(n)]) : '\0')
/* Previous letter. I dunno, should this return null on failure? */
#define Prev_Letter (Look_Back_Letter(1))
/* Look two letters down. It makes sure you don't walk off the string. */
#define After_Next_Letter \
- (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
-#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
+ (Next_Letter != '\0' ? pg_ascii_toupper((unsigned char) word[w_idx+2]) : '\0')
+#define Look_Ahead_Letter(n) pg_ascii_toupper((unsigned char) Lookahead(word+w_idx, n))
/* Allows us to safely look ahead an arbitrary # of letters */
@@ -340,7 +345,7 @@ Lookahead(char *word, int how_far)
#define Phone_Len (p_idx)
/* Note is a letter is a 'break' in the word */
-#define Isbreak(c) (!isalpha((unsigned char) (c)))
+#define Isbreak(c) (!ascii_isalpha((unsigned char) (c)))
static void
@@ -379,7 +384,7 @@ _metaphone(char *word, /* IN */
/*-- The first phoneme has to be processed specially. --*/
/* Find our first letter */
- for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
+ for (; !ascii_isalpha((unsigned char) (Curr_Letter)); w_idx++)
{
/* On the off chance we were given nothing but crap... */
if (Curr_Letter == '\0')
@@ -478,7 +483,7 @@ _metaphone(char *word, /* IN */
*/
/* Ignore non-alphas */
- if (!isalpha((unsigned char) (Curr_Letter)))
+ if (!ascii_isalpha((unsigned char) (Curr_Letter)))
continue;
/* Drop duplicates, except CC */
@@ -731,7 +736,7 @@ _soundex(const char *instr, char *outstr)
Assert(outstr);
/* Skip leading non-alphabetic characters */
- while (*instr && !isalpha((unsigned char) *instr))
+ while (*instr && !ascii_isalpha((unsigned char) *instr))
++instr;
/* If no string left, return all-zeroes buffer */
@@ -742,12 +747,12 @@ _soundex(const char *instr, char *outstr)
}
/* Take the first letter as is */
- *outstr++ = (char) toupper((unsigned char) *instr++);
+ *outstr++ = (char) pg_ascii_toupper((unsigned char) *instr++);
count = 1;
while (*instr && count < SOUNDEX_LEN)
{
- if (isalpha((unsigned char) *instr) &&
+ if (ascii_isalpha((unsigned char) *instr) &&
soundex_code(*instr) != soundex_code(*(instr - 1)))
{
*outstr = soundex_code(*instr);
--
2.43.0
From 5d8d22077aaa6b7365c52b016ad0e22296b68b05 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Mon, 24 Nov 2025 14:00:52 -0800
Subject: [PATCH v13 2/2] Control LC_COLLATE with GUC.
Now that the global LC_COLLATE setting is not used for any in-core
purpose at all (see commit 5e6e42e44f), allow it to be set with a
GUC. This may be useful for extensions or procedural languages that
still depend on the global LC_COLLATE setting.
TODO: needs discussion
Discussion: https://postgr.es/m/[email protected]
---
src/backend/utils/adt/pg_locale.c | 59 +++++++++++++++++++
src/backend/utils/init/postinit.c | 2 +
src/backend/utils/misc/guc_parameters.dat | 9 +++
src/backend/utils/misc/postgresql.conf.sample | 2 +
src/bin/initdb/initdb.c | 3 +
src/include/utils/guc_hooks.h | 2 +
src/include/utils/pg_locale.h | 1 +
7 files changed, 78 insertions(+)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index ee08ac045b7..6dfbe8af47b 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -81,6 +81,7 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context);
extern char *get_collation_actual_version_libc(const char *collcollate);
/* GUC settings */
+char *locale_collate;
char *locale_messages;
char *locale_monetary;
char *locale_numeric;
@@ -369,6 +370,64 @@ assign_locale_time(const char *newval, void *extra)
CurrentLCTimeValid = false;
}
+/*
+ * We allow LC_COLLATE to actually be set globally.
+ *
+ * Note: we normally disallow value = "" because it wouldn't have consistent
+ * semantics (it'd effectively just use the previous value). However, this
+ * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
+ * not even if the attempted setting fails due to invalid environment value.
+ * The idea there is just to accept the environment setting *if possible*
+ * during startup, until we can read the proper value from postgresql.conf.
+ */
+bool
+check_locale_collate(char **newval, void **extra, GucSource source)
+{
+ int locale_enc;
+ int db_enc;
+
+ if (**newval == '\0')
+ {
+ if (source == PGC_S_DEFAULT)
+ return true;
+ else
+ return false;
+ }
+
+ locale_enc = pg_get_encoding_from_locale(*newval, true);
+ db_enc = GetDatabaseEncoding();
+
+ if (!(locale_enc == db_enc ||
+ locale_enc == PG_SQL_ASCII ||
+ db_enc == PG_SQL_ASCII ||
+ locale_enc == -1))
+ {
+ if (source == PGC_S_FILE)
+ {
+ guc_free(*newval);
+ *newval = guc_strdup(LOG, "C");
+ if (!*newval)
+ return false;
+ }
+ else if (source != PGC_S_TEST)
+ {
+ ereport(WARNING,
+ (errmsg("encoding mismatch"),
+ errdetail("Locale \"%s\" uses encoding \"%s\", which does not match database encoding \"%s\".",
+ *newval, pg_encoding_to_char(locale_enc), pg_encoding_to_char(db_enc))));
+ return false;
+ }
+ }
+
+ return check_locale(LC_COLLATE, *newval, NULL);
+}
+
+void
+assign_locale_collate(const char *newval, void *extra)
+{
+ (void) pg_perm_setlocale(LC_COLLATE, newval);
+}
+
/*
* We allow LC_MESSAGES to actually be set globally.
*
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index b7e94ca45bd..eee0b971590 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -404,6 +404,8 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect
* the pg_database tuple.
*/
SetDatabaseEncoding(dbform->encoding);
+ /* Reset lc_collate to check encoding, and fall back to C if necessary */
+ SetConfigOption("lc_collate", locale_collate, PGC_POSTMASTER, PGC_S_FILE);
/* Record it as a GUC internal option, too */
SetConfigOption("server_encoding", GetDatabaseEncodingName(),
PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index ac0c7c36c56..cf7675aa2bb 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1466,6 +1466,15 @@
boot_val => 'PG_KRB_SRVTAB',
},
+{ name => 'lc_collate', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_LOCALE',
+ short_desc => 'Sets the locale for text ordering in extensions.',
+ long_desc => 'An empty string means use the operating system setting.',
+ variable => 'locale_collate',
+ boot_val => '""',
+ check_hook => 'check_locale_collate',
+ assign_hook => 'assign_locale_collate',
+},
+
{ name => 'lc_messages', type => 'string', context => 'PGC_SUSET', group => 'CLIENT_CONN_LOCALE',
short_desc => 'Sets the language in which messages are displayed.',
long_desc => 'An empty string means use the operating system setting.',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index dc9e2255f8a..19332e39e82 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -798,6 +798,8 @@
# encoding
# These settings are initialized by initdb, but they can be changed.
+#lc_collate = '' # locale for text ordering (only affects
+ # extensions)
#lc_messages = '' # locale for system error message
# strings
#lc_monetary = 'C' # locale for monetary formatting
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 92fe2f531f7..8b2e7bfab6f 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -1312,6 +1312,9 @@ setup_config(void)
conflines = replace_guc_value(conflines, "shared_buffers",
repltok, false);
+ conflines = replace_guc_value(conflines, "lc_collate",
+ lc_collate, false);
+
conflines = replace_guc_value(conflines, "lc_messages",
lc_messages, false);
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index fbe0b1e2e3d..f3bfc8dfb7e 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -66,6 +66,8 @@ extern bool check_huge_page_size(int *newval, void **extra, GucSource source);
extern void assign_io_method(int newval, void *extra);
extern bool check_io_max_concurrency(int *newval, void **extra, GucSource source);
extern const char *show_in_hot_standby(void);
+extern bool check_locale_collate(char **newval, void **extra, GucSource source);
+extern void assign_locale_collate(const char *newval, void *extra);
extern bool check_locale_messages(char **newval, void **extra, GucSource source);
extern void assign_locale_messages(const char *newval, void *extra);
extern bool check_locale_monetary(char **newval, void **extra, GucSource source);
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 86016b9344e..096ea1e4963 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -41,6 +41,7 @@
#define UNICODE_CASEMAP_BUFSZ (UNICODE_CASEMAP_LEN * MAX_MULTIBYTE_CHAR_LEN)
/* GUC settings */
+extern PGDLLIMPORT char *locale_collate;
extern PGDLLIMPORT char *locale_messages;
extern PGDLLIMPORT char *locale_monetary;
extern PGDLLIMPORT char *locale_numeric;
--
2.43.0