On Tue, 2025-07-29 at 16:55 -0700, Jeff Davis wrote:
> $SUBJECT makes it easier to test other providers, especially the
> regression tests.
Rebased.
Changes:
* Use environment variable name PG_LOCALE_PROVIDER, which seems more
consistent.
* Updated doc.
* If the provider is builtin and the LC_CTYPE or LC_COLLATE environment
variables aren't compatible with UTF-8, initdb overrides them to "C"
(with a warning). But if --locale, --lc-ctype, or --lc-collate is
specified and incompatible, initdb throws an error instead.
Note: when the provider is builtin, the overriding of LC_CTYPE and
LC_COLLATE doesn't matter a lot. LC_CTYPE affects the translation of
messages from the OS (but not Postgres messages), as well as a few
other places that are likely to be fixed soon (e.g. [1]). LC_COLLATE
has no effect when the provider is builtin. In any case, it only
happens when those environment variables aren't compatible with UTF-8,
and the user hasn't specified any locale settings on the command line.
I see this as more of a detail about how the defaults work together
that can easily be corrected if the user specifies something different.
Also note: if PG_LOCALE_PROVIDER=libc (or is unset), there should be no
behavior change with this patch.
I am planning to commit this soon.
Regards,
Jeff Davis
[1]
https://www.postgresql.org/message-id/[email protected]
From 59fb79143bc2e254cbc892480802c32178d32e02 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 17 Jul 2025 13:07:50 -0700
Subject: [PATCH v2] initdb: new environment variable PG_LOCALE_PROVIDER
Controls the locale provider used by initdb. Useful for testing.
Also specifies defaults for both the builtin provider and ICU, so that
plain initdb (without locale arguments specified) will succeed for any
provider. For the builtin provider's UTF-8 locales, if LC_CTYPE or
LC_COLLATE taken from the environment is not compatible with UTF-8,
forces it to C to avoid an encoding-mismatch error.
Discussion: https://postgr.es/m/[email protected]
---
doc/src/sgml/ref/initdb.sgml | 11 ++
src/bin/initdb/initdb.c | 103 ++++++++++++++++--
src/bin/initdb/t/001_initdb.pl | 11 +-
src/bin/scripts/t/020_createdb.pl | 69 +++++++-----
.../modules/test_escape/t/001_test_escape.pl | 2 +-
5 files changed, 150 insertions(+), 46 deletions(-)
diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml
index 7613174c18b..e0437357d27 100644
--- a/doc/src/sgml/ref/initdb.sgml
+++ b/doc/src/sgml/ref/initdb.sgml
@@ -662,6 +662,17 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry id="app-initdb-environment-pg-locale-provider">
+ <term><envar>PG_LOCALE_PROVIDER</envar></term>
+
+ <listitem>
+ <para>
+ Sets the locale provider; can be overridden using the
+ <option>--locale-provider</option> option.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="app-initdb-environment-pg-color">
<term><envar>PG_COLOR</envar></term>
<listitem>
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 92fe2f531f7..3b3ca9c377f 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -82,6 +82,9 @@
#include "mb/pg_wchar.h"
#include "miscadmin.h"
+#define DEFAULT_LOCALE_PROVIDER COLLPROVIDER_LIBC
+#define DEFAULT_BUILTIN_LOCALE "C.UTF-8"
+#define DEFAULT_ICU_LOCALE "und"
/* Ideally this would be in a .h file, but it hardly seems worth the trouble */
extern const char *select_default_timezone(const char *share_path);
@@ -144,7 +147,7 @@ static char *lc_monetary = NULL;
static char *lc_numeric = NULL;
static char *lc_time = NULL;
static char *lc_messages = NULL;
-static char locale_provider = COLLPROVIDER_LIBC;
+static char locale_provider = '\0';
static bool builtin_locale_specified = false;
static char *datlocale = NULL;
static bool icu_locale_specified = false;
@@ -2412,6 +2415,25 @@ icu_validate_locale(const char *loc_str)
#endif
}
+/*
+ * Is the given locale name UTF-8 compatible?
+ */
+static bool
+utf8_compatible(const char *localename)
+{
+#ifndef WIN32
+ int ctype_enc;
+
+ Assert(localename != NULL);
+ ctype_enc = pg_get_encoding_from_locale(localename, false);
+
+ return (ctype_enc == PG_UTF8 || ctype_enc == PG_SQL_ASCII);
+#else
+ /* on windows, all locales are compatible with UTF-8 */
+ return true;
+#endif
+}
+
/*
* set up the locale variables
*
@@ -2420,6 +2442,8 @@ icu_validate_locale(const char *loc_str)
static void
setlocales(void)
{
+ bool ctype_from_env;
+ bool collate_from_env;
char *canonname;
/* set empty lc_* and datlocale values to locale config if set */
@@ -2442,6 +2466,9 @@ setlocales(void)
datlocale = locale;
}
+ ctype_from_env = (lc_ctype == NULL);
+ collate_from_env = (lc_collate == NULL);
+
/*
* canonicalize locale names, and obtain any missing values from our
* current environment
@@ -2465,12 +2492,11 @@ setlocales(void)
lc_messages = canonname;
#endif
- if (locale_provider != COLLPROVIDER_LIBC && datlocale == NULL)
- pg_fatal("locale must be specified if provider is %s",
- collprovider_name(locale_provider));
-
if (locale_provider == COLLPROVIDER_BUILTIN)
{
+ if (!datlocale)
+ datlocale = DEFAULT_BUILTIN_LOCALE;
+
if (strcmp(datlocale, "C") == 0)
canonname = "C";
else if (strcmp(datlocale, "C.UTF-8") == 0 ||
@@ -2488,11 +2514,13 @@ setlocales(void)
{
char *langtag;
+ if (!datlocale)
+ datlocale = DEFAULT_ICU_LOCALE;
+
/* canonicalize to a language tag */
langtag = icu_language_tag(datlocale);
printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
langtag, datlocale);
- pg_free(datlocale);
datlocale = langtag;
icu_validate_locale(datlocale);
@@ -2505,6 +2533,46 @@ setlocales(void)
pg_fatal("ICU is not supported in this build");
#endif
}
+
+ /*
+ * If using the builtin provider with a locale requiring UTF-8, avoid
+ * taking incompatible settings from the environment.
+ */
+ if (locale_provider == COLLPROVIDER_BUILTIN &&
+ strcmp(datlocale, "C") != 0)
+ {
+ if (!encoding)
+ encoding = "UTF-8";
+
+ /*
+ * LC_CTYPE has little effect unless using the libc provider, but does
+ * still affect some places, such as translation of error messages from
+ * the OS. Overriding it here may be an inconvenience, but in the
+ * absence of specified locale options, it's the best choice.
+ *
+ * XXX: minimize the effects of LC_CTYPE when not using libc.
+ */
+ if (ctype_from_env && !utf8_compatible(lc_ctype))
+ {
+ pg_log_warning("setting LC_CTYPE to \"C\"");
+ pg_log_warning_detail("Encoding of LC_CTYPE locale \"%s\" does not match encoding required by builtin locale \"%s\".",
+ lc_ctype, datlocale);
+ pg_log_warning_hint("Specify a UTF-8 compatible locale with --lc-ctype, or choose a different locale provider.");
+ lc_ctype = "C";
+ }
+
+ /*
+ * LC_COLLATE has no effect unless using the libc provider.
+ */
+ if (collate_from_env && !utf8_compatible(lc_collate))
+ {
+ pg_log_warning("setting LC_COLLATE to \"C\"");
+ pg_log_warning_detail("Encoding of LC_COLLATE locale \"%s\" does not match encoding required by builtin locale \"%s\".",
+ lc_collate, datlocale);
+ pg_log_warning_hint("Specify a UTF-8 compatible locale with --lc-collate, or choose a different locale provider.");
+ lc_collate = "C";
+ }
+ }
}
/*
@@ -2770,11 +2838,10 @@ setup_locale_encoding(void)
!check_locale_encoding(lc_collate, encodingid))
exit(1); /* check_locale_encoding printed the error */
- if (locale_provider == COLLPROVIDER_BUILTIN)
+ if (locale_provider == COLLPROVIDER_BUILTIN &&
+ strcmp(datlocale, "C") != 0)
{
- if ((strcmp(datlocale, "C.UTF-8") == 0 ||
- strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
- encodingid != PG_UTF8)
+ if (encodingid != PG_UTF8)
pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
datlocale, "UTF-8");
}
@@ -3418,6 +3485,22 @@ main(int argc, char *argv[])
exit(1);
}
+ if (locale_provider == '\0')
+ {
+ char *provider_name = getenv("PG_LOCALE_PROVIDER");
+
+ if (!provider_name)
+ locale_provider = DEFAULT_LOCALE_PROVIDER;
+ else if (strcmp(provider_name, "builtin") == 0)
+ locale_provider = COLLPROVIDER_BUILTIN;
+ else if (strcmp(provider_name, "icu") == 0)
+ locale_provider = COLLPROVIDER_ICU;
+ else if (strcmp(provider_name, "libc") == 0)
+ locale_provider = COLLPROVIDER_LIBC;
+ else
+ pg_fatal("unrecognized locale provider: %s", provider_name);
+ }
+
if (builtin_locale_specified && locale_provider != COLLPROVIDER_BUILTIN)
pg_fatal("%s cannot be specified unless locale provider \"%s\" is chosen",
"--builtin-locale", "builtin");
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index b7ef7ed8d06..ba3211a4aa6 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -113,14 +113,13 @@ SKIP:
if ($ENV{with_icu} eq 'yes')
{
- command_fails_like(
+ command_ok(
[
'initdb', '--no-sync',
'--locale-provider' => 'icu',
"$tempdir/data2"
],
- qr/initdb: error: locale must be specified if provider is icu/,
- 'locale provider ICU requires --icu-locale');
+ 'locale provider ICU default locale');
command_ok(
[
@@ -200,13 +199,15 @@ else
'locale provider ICU fails since no ICU support');
}
-command_fails(
+command_like(
[
'initdb', '--no-sync',
+ '--auth' => 'trust',
'--locale-provider' => 'builtin',
"$tempdir/data6"
],
- 'locale provider builtin fails without --locale');
+ qr/^\s+default collation:\s+C.UTF-8\n/ms,
+ 'locale provider builtin defaults to C.UTF-8');
command_ok(
[
diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl
index a8293390ede..6003d213e89 100644
--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@@ -16,6 +16,9 @@ my $node = PostgreSQL::Test::Cluster->new('main');
$node->init;
$node->start;
+my $datlocprovider = $node->safe_psql('postgres',
+ "SELECT datlocprovider FROM pg_database WHERE datname='template1'");
+
$node->issues_sql_like(
[ 'createdb', 'foobar1' ],
qr/statement: CREATE DATABASE foobar1/,
@@ -33,19 +36,6 @@ $node->issues_sql_like(
if ($ENV{with_icu} eq 'yes')
{
- # This fails because template0 uses libc provider and has no ICU
- # locale set. It would succeed if template0 used the icu
- # provider. XXX Maybe split into multiple tests?
- $node->command_fails(
- [
- 'createdb',
- '--template' => 'template0',
- '--encoding' => 'UTF8',
- '--locale-provider' => 'icu',
- 'foobar4',
- ],
- 'create database with ICU fails without ICU locale specified');
-
$node->issues_sql_like(
[
'createdb',
@@ -130,14 +120,18 @@ else
'create database with ICU fails since no ICU support');
}
-$node->command_fails(
- [
- 'createdb',
- '--template' => 'template0',
- '--locale-provider' => 'builtin',
- 'tbuiltin1',
- ],
- 'create database with provider "builtin" fails without --locale');
+if ($datlocprovider eq 'c')
+{
+ $node->command_fails(
+ [
+ 'createdb',
+ '--template' => 'template0',
+ '--encoding' => 'UTF8',
+ '--locale-provider' => 'builtin',
+ 'foobar4',
+ ],
+ 'create database with builtin provider fails without locale specified');
+}
$node->command_ok(
[
@@ -219,15 +213,30 @@ $node->command_fails(
],
'create database with provider "builtin" and ICU_RULES=""');
-$node->command_fails(
- [
- 'createdb',
- '--template' => 'template1',
- '--locale-provider' => 'builtin',
- '--locale' => 'C',
- 'tbuiltin9',
- ],
- 'create database with provider "builtin" not matching template');
+if ($datlocprovider eq 'b')
+{
+ $node->command_fails(
+ [
+ 'createdb',
+ '--template' => 'template1',
+ '--locale-provider' => 'libc',
+ '--locale' => 'C',
+ 'tbuiltin9',
+ ],
+ 'create database with provider "libc" not matching template');
+}
+else
+{
+ $node->command_fails(
+ [
+ 'createdb',
+ '--template' => 'template1',
+ '--locale-provider' => 'builtin',
+ '--locale' => 'C',
+ 'tbuiltin9',
+ ],
+ 'create database with provider "builtin" not matching template');
+}
$node->command_fails([ 'createdb', 'foobar1' ],
'fails if database already exists');
diff --git a/src/test/modules/test_escape/t/001_test_escape.pl b/src/test/modules/test_escape/t/001_test_escape.pl
index 0d5aec3ed74..b29f093db28 100644
--- a/src/test/modules/test_escape/t/001_test_escape.pl
+++ b/src/test/modules/test_escape/t/001_test_escape.pl
@@ -12,7 +12,7 @@ $node->init();
$node->start();
$node->safe_psql('postgres',
- q(CREATE DATABASE db_sql_ascii ENCODING "sql_ascii" TEMPLATE template0;));
+ q(CREATE DATABASE db_sql_ascii LOCALE_PROVIDER "builtin" LOCALE "C" ENCODING "sql_ascii" TEMPLATE template0;));
my $cmd =
[ 'test_escape', '--conninfo', $node->connstr . " dbname=db_sql_ascii" ];
--
2.43.0