https://github.com/python/cpython/commit/c89a66feb12110e68e63a6293e3ed9c9fd180412
commit: c89a66feb12110e68e63a6293e3ed9c9fd180412
branch: main
author: Adam Turner <9087854+aa-tur...@users.noreply.github.com>
committer: AA-Turner <9087854+aa-tur...@users.noreply.github.com>
date: 2025-07-15T10:45:41+01:00
summary:

GH-133711: Enable UTF-8 mode by default (PEP 686) (#133712)

Co-authored-by: Victor Stinner <vstin...@python.org>

files:
A Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst
M Doc/c-api/init_config.rst
M Doc/library/os.rst
M Doc/using/windows.rst
M Doc/whatsnew/3.15.rst
M Include/cpython/initconfig.h
M Lib/locale.py
M Lib/subprocess.py
M Lib/test/test_cmd_line.py
M Lib/test/test_embed.py
M Lib/test/test_utf8_mode.py
M Programs/_testembed.c
M Python/initconfig.c
M Python/preconfig.c

diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst
index 4fd10224262488..24be9ead3874d1 100644
--- a/Doc/c-api/init_config.rst
+++ b/Doc/c-api/init_config.rst
@@ -975,9 +975,7 @@ PyPreConfig
       Set to ``0`` or ``1`` by the :option:`-X utf8 <-X>` command line option
       and the :envvar:`PYTHONUTF8` environment variable.
 
-      Also set to ``1`` if the ``LC_CTYPE`` locale is ``C`` or ``POSIX``.
-
-      Default: ``-1`` in Python config and ``0`` in isolated config.
+      Default: ``1``.
 
 
 .. _c-preinit:
diff --git a/Doc/library/os.rst b/Doc/library/os.rst
index 1e54cfec609bd2..45ec6c7a51b7b0 100644
--- a/Doc/library/os.rst
+++ b/Doc/library/os.rst
@@ -108,6 +108,12 @@ Python UTF-8 Mode
 .. versionadded:: 3.7
    See :pep:`540` for more details.
 
+.. versionchanged:: next
+
+   Python UTF-8 mode is now enabled by default (:pep:`686`).
+   It may be disabled by setting :envvar:`PYTHONUTF8=0 <PYTHONUTF8>` as
+   an environment variable or by using the :option:`-X utf8=0 <-X>` command line option.
+
 The Python UTF-8 Mode ignores the :term:`locale encoding` and forces the usage
 of the UTF-8 encoding:
 
@@ -139,31 +145,22 @@ level APIs also exhibit different default behaviours:
   default so that attempting to open a binary file in text mode is likely
   to raise an exception rather than producing nonsense data.
 
-The :ref:`Python UTF-8 Mode <utf8-mode>` is enabled if the LC_CTYPE locale is
-``C`` or ``POSIX`` at Python startup (see the :c:func:`PyConfig_Read`
-function).
-
-It can be enabled or disabled using the :option:`-X utf8 <-X>` command line
-option and the :envvar:`PYTHONUTF8` environment variable.
-
-If the :envvar:`PYTHONUTF8` environment variable is not set at all, then the
-interpreter defaults to using the current locale settings, *unless* the current
-locale is identified as a legacy ASCII-based locale (as described for
-:envvar:`PYTHONCOERCECLOCALE`), and locale coercion is either disabled or
-fails. In such legacy locales, the interpreter will default to enabling UTF-8
-mode unless explicitly instructed not to do so.
-
-The Python UTF-8 Mode can only be enabled at the Python startup. Its value
+The :ref:`Python UTF-8 Mode <utf8-mode>` is enabled by default.
+It can be disabled using the :option:`-X utf8=0 <-X>` command line
+option or the :envvar:`PYTHONUTF8=0 <PYTHONUTF8>` environment variable.
+The Python UTF-8 Mode can only be disabled at Python startup. Its value
 can be read from :data:`sys.flags.utf8_mode <sys.flags>`.
 
+If the UTF-8 mode is disabled, the interpreter defaults to using
+the current locale settings, *unless* the current locale is identified
+as a legacy ASCII-based locale (as described for :envvar:`PYTHONCOERCECLOCALE`),
+and locale coercion is either disabled or fails.
+In such legacy locales, the interpreter will default to enabling UTF-8 mode
+unless explicitly instructed not to do so.
+
 See also the :ref:`UTF-8 mode on Windows <win-utf8-mode>`
 and the :term:`filesystem encoding and error handler`.
 
-.. seealso::
-
-   :pep:`686`
-      Python 3.15 will make :ref:`utf8-mode` default.
-
 
 .. _os-procinfo:
 
diff --git a/Doc/using/windows.rst b/Doc/using/windows.rst
index 9628da3d2f6b12..7cc50bccb3724a 100644
--- a/Doc/using/windows.rst
+++ b/Doc/using/windows.rst
@@ -1006,6 +1006,9 @@ UTF-8 mode
 ==========
 
 .. versionadded:: 3.7
+.. versionchanged:: next
+
+   Python UTF-8 mode is now enabled by default (:pep:`686`).
 
 Windows still uses legacy encodings for the system encoding (the ANSI Code
 Page).  Python uses it for the default encoding of text files (e.g.
@@ -1014,20 +1017,22 @@ Page).  Python uses it for the default encoding of text files (e.g.
 This may cause issues because UTF-8 is widely used on the internet
 and most Unix systems, including WSL (Windows Subsystem for Linux).
 
-You can use the :ref:`Python UTF-8 Mode <utf8-mode>` to change the default text
-encoding to UTF-8. You can enable the :ref:`Python UTF-8 Mode <utf8-mode>` via
-the ``-X utf8`` command line option, or the ``PYTHONUTF8=1`` environment
-variable.  See :envvar:`PYTHONUTF8` for enabling UTF-8 mode, and
-:ref:`setting-envvars` for how to modify environment variables.
-
-When the :ref:`Python UTF-8 Mode <utf8-mode>` is enabled, you can still use the
+The :ref:`Python UTF-8 Mode <utf8-mode>`, enabled by default, can help by
+changing the default text encoding to UTF-8.
+When the :ref:`UTF-8 mode <utf8-mode>` is enabled, you can still use the
 system encoding (the ANSI Code Page) via the "mbcs" codec.
 
-Note that adding ``PYTHONUTF8=1`` to the default environment variables
-will affect all Python 3.7+ applications on your system.
-If you have any Python 3.7+ applications which rely on the legacy
-system encoding, it is recommended to set the environment variable
-temporarily or use the ``-X utf8`` command line option.
+You can disable the :ref:`Python UTF-8 Mode <utf8-mode>` via
+the ``-X utf8=0`` command line option, or the ``PYTHONUTF8=0`` environment
+variable.  See :envvar:`PYTHONUTF8` for disabling UTF-8 mode, and
+:ref:`setting-envvars` for how to modify environment variables.
+
+.. hint::
+   Adding ``PYTHONUTF8={0,1}`` to the default environment variables
+   will affect all Python 3.7+ applications on your system.
+   If you have any Python 3.7+ applications which rely on the legacy
+   system encoding, it is recommended to set the environment variable
+   temporarily or use the ``-X utf8`` command line option.
 
 .. note::
    Even when UTF-8 mode is disabled, Python uses UTF-8 by default
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index dd0bb6bd5b86b3..fe3d45b83a512e 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -172,11 +172,35 @@ production systems where traditional profiling approaches would be too intrusive
 Other language changes
 ======================
 
+* Python now uses UTF-8_ as the default encoding, independent of the system's
+  environment. This means that I/O operations without an explicit encoding,
+  e.g. ``open('flying-circus.txt')``, will use UTF-8.
+  UTF-8 is a widely-supported Unicode_ character encoding that has become a
+  *de facto* standard for representing text, including nearly every webpage
+  on the internet, many common file formats, programming languages, and more.
+
+  This only applies when no ``encoding`` argument is given. For best
+  compatibility between versions of Python, ensure that an explicit ``encoding``
+  argument is always provided. The :ref:`opt-in encoding warning <io-encoding-warning>`
+  can be used to identify code that may be affected by this change.
+  The special ``encoding='locale'`` argument uses the current locale
+  encoding, and has been supported since Python 3.10.
+
+  To retain the previous behaviour, Python's UTF-8 mode may be disabled with
+  the :envvar:`PYTHONUTF8=0 <PYTHONUTF8>` environment variable or the
+  :option:`-X utf8=0 <-X>` command line option.
+
+  .. seealso:: :pep:`686` for further details.
+
+  .. _UTF-8: https://en.wikipedia.org/wiki/UTF-8
+  .. _Unicode: https://home.unicode.org/
+
+  (Contributed by Adam Turner in :gh:`133711`; PEP 686 written by Inada Naoki.)
+
 * Several error messages incorrectly using the term "argument" have been corrected.
   (Contributed by Stan Ulbrych in :gh:`133382`.)
 
 
-
 New modules
 ===========
 
diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h
index 7ce4acfeb7177d..1c979d91a40850 100644
--- a/Include/cpython/initconfig.h
+++ b/Include/cpython/initconfig.h
@@ -102,15 +102,14 @@ typedef struct PyPreConfig {
 
     /* Enable UTF-8 mode? (PEP 540)
 
-       Disabled by default (equals to 0).
+      If equal to 1, use the UTF-8 encoding and use "surrogateescape" for the
+      stdin & stdout error handlers.
 
-       Set to 1 by "-X utf8" and "-X utf8=1" command line options.
-       Set to 1 by PYTHONUTF8=1 environment variable.
+      Enabled by default (equal to 1; PEP 686), or if Py_UTF8Mode=1,
+      or if "-X utf8=1" or PYTHONUTF8=1.
 
-       Set to 0 by "-X utf8=0" and PYTHONUTF8=0.
-
-       If equals to -1, it is set to 1 if the LC_CTYPE locale is "C" or
-       "POSIX", otherwise it is set to 0. Inherit Py_UTF8Mode value value. */
+       Set to 0 by "-X utf8=0" or PYTHONUTF8=0.
+    */
     int utf8_mode;
 
     /* If non-zero, enable the Python Development Mode.
diff --git a/Lib/locale.py b/Lib/locale.py
index dfedc6386cb891..0bde7ed51c66c1 100644
--- a/Lib/locale.py
+++ b/Lib/locale.py
@@ -651,7 +651,8 @@ def getpreferredencoding(do_setlocale=True):
         if sys.flags.warn_default_encoding:
             import warnings
             warnings.warn(
-                "UTF-8 Mode affects locale.getpreferredencoding(). Consider locale.getencoding() instead.",
+                "UTF-8 Mode affects locale.getpreferredencoding(). "
+                "Consider locale.getencoding() instead.",
                 EncodingWarning, 2)
         if sys.flags.utf8_mode:
             return 'utf-8'
diff --git a/Lib/subprocess.py b/Lib/subprocess.py
index 54c2eb515b60da..79251bd5310223 100644
--- a/Lib/subprocess.py
+++ b/Lib/subprocess.py
@@ -380,8 +380,7 @@ def _text_encoding():
 
     if sys.flags.utf8_mode:
         return "utf-8"
-    else:
-        return locale.getencoding()
+    return locale.getencoding()
 
 
 def call(*popenargs, timeout=None, **kwargs):
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index c17d749d4a17ed..f30a1874ab96d4 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -300,6 +300,10 @@ def run_utf8_mode(arg):
             cmd = [sys.executable, '-X', 'utf8', '-c', code, arg]
             return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
 
+        def run_no_utf8_mode(arg):
+            cmd = [sys.executable, '-X', 'utf8=0', '-c', code, arg]
+            return subprocess.run(cmd, stdout=subprocess.PIPE, text=True)
+
         valid_utf8 = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
         # invalid UTF-8 byte sequences with a valid UTF-8 sequence
         # in the middle.
@@ -312,7 +316,8 @@ def run_utf8_mode(arg):
         )
         test_args = [valid_utf8, invalid_utf8]
 
-        for run_cmd in (run_default, run_c_locale, run_utf8_mode):
+        for run_cmd in (run_default, run_c_locale, run_utf8_mode,
+                        run_no_utf8_mode):
             with self.subTest(run_cmd=run_cmd):
                 for arg in test_args:
                     proc = run_cmd(arg)
diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py
index 89f4aebe28f4a1..22dfdb6bb6f138 100644
--- a/Lib/test/test_embed.py
+++ b/Lib/test/test_embed.py
@@ -543,7 +543,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         'configure_locale': True,
         'coerce_c_locale': False,
         'coerce_c_locale_warn': False,
-        'utf8_mode': False,
+        'utf8_mode': True,
     }
     if MS_WINDOWS:
         PRE_CONFIG_COMPAT.update({
@@ -560,7 +560,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
         configure_locale=False,
         isolated=True,
         use_environment=False,
-        utf8_mode=False,
+        utf8_mode=True,
         dev_mode=False,
         coerce_c_locale=False,
     )
@@ -805,12 +805,6 @@ def get_expected_config(self, expected_preconfig, expected,
                         'stdio_encoding', 'stdio_errors'):
                 expected[key] = self.IGNORE_CONFIG
 
-        if not expected_preconfig['configure_locale']:
-            # UTF-8 Mode depends on the locale. There is no easy way
-            # to guess if UTF-8 Mode will be enabled or not if the locale
-            # is not configured.
-            expected_preconfig['utf8_mode'] = self.IGNORE_CONFIG
-
         if expected_preconfig['utf8_mode'] == 1:
             if expected['filesystem_encoding'] is self.GET_DEFAULT_CONFIG:
                 expected['filesystem_encoding'] = 'utf-8'
diff --git a/Lib/test/test_utf8_mode.py b/Lib/test/test_utf8_mode.py
index f66881044e16df..b8e49440c9f7da 100644
--- a/Lib/test/test_utf8_mode.py
+++ b/Lib/test/test_utf8_mode.py
@@ -89,8 +89,8 @@ def test_env_var(self):
         # the UTF-8 mode
         if not self.posix_locale():
             # PYTHONUTF8 should be ignored if -E is used
-            out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
-            self.assertEqual(out, '0')
+            out = self.get_output('-E', '-c', code, PYTHONUTF8='0')
+            self.assertEqual(out, '1')
 
         # invalid mode
         out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
@@ -116,7 +116,7 @@ def test_filesystemencoding(self):
             # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
             # and has the priority over -X utf8 and PYTHONUTF8
             out = self.get_output('-X', 'utf8', '-c', code,
-                                  PYTHONUTF8='strict',
+                                  PYTHONUTF8='xxx',
                                   PYTHONLEGACYWINDOWSFSENCODING='1')
             self.assertEqual(out, 'mbcs/replace')
 
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst
new file mode 100644
index 00000000000000..c8d3d62763dc12
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-08-22-19-10.gh-issue-133711.e91wUy.rst
@@ -0,0 +1,2 @@
+Implement :pep:`686`: Enable :ref:`Python UTF-8 Mode <utf8-mode>` by
+default. Patch by Adam Turner.
diff --git a/Programs/_testembed.c b/Programs/_testembed.c
index 577da65c7cdafa..88936bbc699c30 100644
--- a/Programs/_testembed.c
+++ b/Programs/_testembed.c
@@ -1854,9 +1854,9 @@ static int test_initconfig_get_api(void)
     assert(initconfig_getint(config, "dev_mode") == 1);
 
     // test PyInitConfig_GetInt() on a PyPreConfig option
-    assert(initconfig_getint(config, "utf8_mode") == 0);
-    assert(PyInitConfig_SetInt(config, "utf8_mode", 1) == 0);
     assert(initconfig_getint(config, "utf8_mode") == 1);
+    assert(PyInitConfig_SetInt(config, "utf8_mode", 0) == 0);
+    assert(initconfig_getint(config, "utf8_mode") == 0);
 
     // test PyInitConfig_GetStr()
     char *str;
diff --git a/Python/initconfig.c b/Python/initconfig.c
index 73a9a9bf1ca460..cc0db19d416058 100644
--- a/Python/initconfig.c
+++ b/Python/initconfig.c
@@ -459,7 +459,7 @@ static const char usage_envvars[] =
 
 /* --- Global configuration variables ----------------------------- */
 
-/* UTF-8 mode (PEP 540): if equals to 1, use the UTF-8 encoding, and change
+/* UTF-8 mode (PEP 540): if equal to 1, use the UTF-8 encoding, and change
    stdin and stdout error handler to "surrogateescape". */
 int Py_UTF8Mode = 0;
 int Py_DebugFlag = 0; /* Needed by parser.c */
diff --git a/Python/preconfig.c b/Python/preconfig.c
index 67b2d2f2dc186d..e4cd10d9e3d40d 100644
--- a/Python/preconfig.c
+++ b/Python/preconfig.c
@@ -291,12 +291,12 @@ _PyPreConfig_InitCompatConfig(PyPreConfig *config)
     config->use_environment = -1;
     config->configure_locale = 1;
 
-    /* bpo-36443: C locale coercion (PEP 538) and UTF-8 Mode (PEP 540)
-       are disabled by default using the Compat configuration.
+    /* gh-80624: C locale coercion (PEP 538) is disabled by default using
+       the Compat configuration.
 
-       Py_UTF8Mode=1 enables the UTF-8 mode. PYTHONUTF8 environment variable
+       Py_UTF8Mode=0 disables the UTF-8 mode. PYTHONUTF8 environment variable
        is ignored (even if use_environment=1). */
-    config->utf8_mode = 0;
+    config->utf8_mode = 1;
     config->coerce_c_locale = 0;
     config->coerce_c_locale_warn = 0;
 
@@ -317,8 +317,8 @@ PyPreConfig_InitPythonConfig(PyPreConfig *config)
     config->isolated = 0;
     config->parse_argv = 1;
     config->use_environment = 1;
-    /* Set to -1 to enable C locale coercion (PEP 538) and UTF-8 Mode (PEP 540)
-       depending on the LC_CTYPE locale, PYTHONUTF8 and PYTHONCOERCECLOCALE
+    /* Set to -1 to enable C locale coercion (PEP 538) depending on
+       the LC_CTYPE locale, PYTHONUTF8 and PYTHONCOERCECLOCALE
        environment variables. */
     config->coerce_c_locale = -1;
     config->coerce_c_locale_warn = -1;
@@ -338,7 +338,7 @@ PyPreConfig_InitIsolatedConfig(PyPreConfig *config)
     config->configure_locale = 0;
     config->isolated = 1;
     config->use_environment = 0;
-    config->utf8_mode = 0;
+    config->utf8_mode = 1;
     config->dev_mode = 0;
 #ifdef MS_WINDOWS
     config->legacy_windows_fs_encoding = 0;
@@ -649,23 +649,7 @@ preconfig_init_utf8_mode(PyPreConfig *config, const _PyPreCmdline *cmdline)
         return _PyStatus_OK();
     }
 
-
-#ifndef MS_WINDOWS
-    if (config->utf8_mode < 0) {
-        /* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
-        const char *ctype_loc = setlocale(LC_CTYPE, NULL);
-        if (ctype_loc != NULL
-           && (strcmp(ctype_loc, "C") == 0
-               || strcmp(ctype_loc, "POSIX") == 0))
-        {
-            config->utf8_mode = 1;
-        }
-    }
-#endif
-
-    if (config->utf8_mode < 0) {
-        config->utf8_mode = 0;
-    }
+    config->utf8_mode = 1;
     return _PyStatus_OK();
 }
 

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: arch...@mail-archive.com

Reply via email to