On Wed, 2025-07-23 at 13:53 +0200, Daniel Verite wrote:
> > * The libc C.UTF-8 locale was a reasonable default (though not a
> > natural language collation). But now that we have C.UTF-8 available
> > from the builtin provider, then we should encourage that instead of
> > relying on the slower, platform-specific libc implementation.
> 
> Yes. In particular, we should encourage the ecosystem to support
> the new collation features so that they're widely available to
> end users.

Then I propose that we change the initdb default to builtin C.UTF-8.
Patch attached.

To get the old initdb behavior, use --locale-provider=libc; all the
other defaults will work as before.

The change would not disrupt upgrades (see commit 9637badd9f).

One annoyance: if LC_CTYPE in your environment is set to a non-UTF-8
locale, then initdb forces LC_CTYPE=C and emits a warning.

I had previously tried, and failed, to change the default to ICU for
v16, so it's worth mentioning why I don't believe this proposal will
run into the same problems:

* ICU, while better than libc, didn't completely solve any of the
problems. This proposal completely solves the inconsistent primary key
problem, and is much faster than libc or ICU.

* In the version 16 change, we were still attempting to map environment
variables to ICU locales, which was never going to work very well. In
particular, as you pointed out, ICU has nothing to approximate the
C.UTF-8 locale. The current proposal doesn't attempt that kind of
cleverness.

Comments?

Regards,
        Jeff Davis

From 8ba8f74d28a64bfb006a76fbec64638f55f3660c Mon Sep 17 00:00:00 2001
From: Jeff Davis <j...@j-davis.com>
Date: Thu, 17 Jul 2025 13:07:50 -0700
Subject: [PATCH] initdb: default to builtin C.UTF-8

Discussion: https://postgr.es/m/918773d0-886b-4ce0-8b74-ae23496ad...@manitou-mail.org
---
 src/backend/commands/dbcommands.c             |  2 +-
 src/bin/initdb/initdb.c                       | 50 ++++++++++++++-----
 src/bin/initdb/t/001_initdb.pl                |  6 ++-
 src/bin/scripts/t/020_createdb.pl             | 23 +++++----
 .../modules/test_escape/t/001_test_escape.pl  |  2 +-
 5 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 502a45163c8..92a396b8406 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1052,7 +1052,7 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 		dbctype = src_ctype;
 	if (dblocprovider == '\0')
 		dblocprovider = src_locprovider;
-	if (dblocale == NULL)
+	if (dblocale == NULL && dblocprovider == src_locprovider)
 		dblocale = src_locale;
 	if (dbicurules == NULL)
 		dbicurules = src_icurules;
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 62bbd08d9f6..ea2a3299737 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -82,6 +82,7 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 
+#define DEFAULT_BUILTIN_LOCALE		"C.UTF-8"
 
 /* Ideally this would be in a .h file, but it hardly seems worth the trouble */
 extern const char *select_default_timezone(const char *share_path);
@@ -144,7 +145,7 @@ static char *lc_monetary = NULL;
 static char *lc_numeric = NULL;
 static char *lc_time = NULL;
 static char *lc_messages = NULL;
-static char locale_provider = COLLPROVIDER_LIBC;
+static char locale_provider = COLLPROVIDER_BUILTIN;
 static bool builtin_locale_specified = false;
 static char *datlocale = NULL;
 static bool icu_locale_specified = false;
@@ -2468,12 +2469,11 @@ setlocales(void)
 	lc_messages = canonname;
 #endif
 
-	if (locale_provider != COLLPROVIDER_LIBC && datlocale == NULL)
-		pg_fatal("locale must be specified if provider is %s",
-				 collprovider_name(locale_provider));
-
 	if (locale_provider == COLLPROVIDER_BUILTIN)
 	{
+		if (!datlocale)
+			datlocale = DEFAULT_BUILTIN_LOCALE;
+
 		if (strcmp(datlocale, "C") == 0)
 			canonname = "C";
 		else if (strcmp(datlocale, "C.UTF-8") == 0 ||
@@ -2491,6 +2491,9 @@ setlocales(void)
 	{
 		char	   *langtag;
 
+		if (!datlocale)
+			pg_fatal("locale must be specified if provider is icu");
+
 		/* canonicalize to a language tag */
 		langtag = icu_language_tag(datlocale);
 		printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
@@ -2686,6 +2689,29 @@ setup_locale_encoding(void)
 {
 	setlocales();
 
+	/*
+	 * For the builtin provider (other than the "C" locale), default encoding
+	 * to UTF-8. If lc_ctype is not compatible with UTF-8, also force lc_ctype
+	 * to "C". On Windows, all locales are compatible with UTF-8.
+	 */
+	if (!encoding && locale_provider == COLLPROVIDER_BUILTIN &&
+		strcmp(datlocale, "C") != 0)
+	{
+#ifndef WIN32
+		int ctype_enc = pg_get_encoding_from_locale(lc_ctype, false);
+		if (!(ctype_enc == PG_UTF8 ||
+			  ctype_enc == PG_SQL_ASCII))
+		{
+			pg_log_warning("setting LC_CTYPE to \"C\"");
+			pg_log_warning_detail("Encoding of LC_CTYPE locale \"%s\" does not match encoding required by builtin locale \"%s\".",
+								  lc_ctype, datlocale);
+			pg_log_warning_hint("Specify a UTF-8 compatible locale with --lc-ctype, or choose a different locale provider.");
+			lc_ctype = "C";
+		}
+#endif
+		encoding = "UTF-8";
+	}
+
 	if (locale_provider == COLLPROVIDER_LIBC &&
 		strcmp(lc_ctype, lc_collate) == 0 &&
 		strcmp(lc_ctype, lc_time) == 0 &&
@@ -2721,10 +2747,11 @@ setup_locale_encoding(void)
 		ctype_enc = pg_get_encoding_from_locale(lc_ctype, true);
 
 		/*
-		 * If ctype_enc=SQL_ASCII, it's compatible with any encoding. ICU does
-		 * not support SQL_ASCII, so select UTF-8 instead.
+		 * If ctype_enc=SQL_ASCII, it's compatible with any encoding. Neither
+		 * ICU nor the builtin provider supports SQL_ASCII, so select UTF-8
+		 * instead.
 		 */
-		if (locale_provider == COLLPROVIDER_ICU && ctype_enc == PG_SQL_ASCII)
+		if (locale_provider != COLLPROVIDER_LIBC && ctype_enc == PG_SQL_ASCII)
 			ctype_enc = PG_UTF8;
 
 		if (ctype_enc == -1)
@@ -2773,11 +2800,10 @@ setup_locale_encoding(void)
 		!check_locale_encoding(lc_collate, encodingid))
 		exit(1);				/* check_locale_encoding printed the error */
 
-	if (locale_provider == COLLPROVIDER_BUILTIN)
+	if (locale_provider == COLLPROVIDER_BUILTIN &&
+		strcmp(datlocale, "C") != 0)
 	{
-		if ((strcmp(datlocale, "C.UTF-8") == 0 ||
-			 strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
-			encodingid != PG_UTF8)
+		if(encodingid != PG_UTF8)
 			pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
 					 datlocale, "UTF-8");
 	}
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index b7ef7ed8d06..b73e5054108 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -200,13 +200,15 @@ else
 		'locale provider ICU fails since no ICU support');
 }
 
-command_fails(
+command_like(
 	[
 		'initdb', '--no-sync',
+		'--auth' => 'trust',
 		'--locale-provider' => 'builtin',
 		"$tempdir/data6"
 	],
-	'locale provider builtin fails without --locale');
+	qr/^\s+default collation:\s+C.UTF-8\n/ms,
+	'locale provider builtin defaults to C.UTF-8');
 
 command_ok(
 	[
diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl
index a8293390ede..4112acedb17 100644
--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@@ -130,15 +130,6 @@ else
 		'create database with ICU fails since no ICU support');
 }
 
-$node->command_fails(
-	[
-		'createdb',
-		'--template' => 'template0',
-		'--locale-provider' => 'builtin',
-		'tbuiltin1',
-	],
-	'create database with provider "builtin" fails without --locale');
-
 $node->command_ok(
 	[
 		'createdb',
@@ -223,11 +214,21 @@ $node->command_fails(
 	[
 		'createdb',
 		'--template' => 'template1',
-		'--locale-provider' => 'builtin',
+		'--locale-provider' => 'icu',
+		'--locale' => 'C',
+		'tbuiltin9',
+	],
+	'create database with provider "icu" not matching template');
+
+$node->command_fails(
+	[
+		'createdb',
+		'--template' => 'template1',
+		'--locale-provider' => 'libc',
 		'--locale' => 'C',
 		'tbuiltin9',
 	],
-	'create database with provider "builtin" not matching template');
+	'create database with provider "libc" not matching template');
 
 $node->command_fails([ 'createdb', 'foobar1' ],
 	'fails if database already exists');
diff --git a/src/test/modules/test_escape/t/001_test_escape.pl b/src/test/modules/test_escape/t/001_test_escape.pl
index 0d5aec3ed74..1de3224a04c 100644
--- a/src/test/modules/test_escape/t/001_test_escape.pl
+++ b/src/test/modules/test_escape/t/001_test_escape.pl
@@ -12,7 +12,7 @@ $node->init();
 $node->start();
 
 $node->safe_psql('postgres',
-	q(CREATE DATABASE db_sql_ascii ENCODING "sql_ascii" TEMPLATE template0;));
+	q(CREATE DATABASE db_sql_ascii LOCALE "C" ENCODING "sql_ascii" TEMPLATE template0;));
 
 my $cmd =
   [ 'test_escape', '--conninfo', $node->connstr . " dbname=db_sql_ascii" ];
-- 
2.43.0

Reply via email to