https://github.com/python/cpython/commit/6dc661bc9f65e9923eafbcdbf18bcc57eebbf6a4
commit: 6dc661bc9f65e9923eafbcdbf18bcc57eebbf6a4
branch: main
author: Inada Naoki <songofaca...@gmail.com>
committer: methane <songofaca...@gmail.com>
date: 2024-04-16T12:56:16+09:00
summary:

gh-77102: site: try utf-8 and fallback to locale encoding when reading .pth file (#117802)

files:
A Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst
M Doc/library/site.rst
M Doc/whatsnew/3.13.rst
M Lib/site.py

diff --git a/Doc/library/site.rst b/Doc/library/site.rst
index 2dc9fb09d727e2..e52bbd32d4d493 100644
--- a/Doc/library/site.rst
+++ b/Doc/library/site.rst
@@ -74,6 +74,10 @@ with ``import`` (followed by space or tab) are executed.
    Limiting a code chunk to a single line is a deliberate measure
    to discourage putting anything more complex here.
 
+.. versionchanged:: 3.13
+   The :file:`.pth` files are now decoded with UTF-8 first, falling back to
+   the :term:`locale encoding` if UTF-8 decoding fails.
+
 .. index::
    single: package
    triple: path; configuration; file
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
index 83c528814c967e..f957698ecb06d8 100644
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -630,6 +630,13 @@ re
 * Rename :exc:`!re.error` to :exc:`re.PatternError` for improved clarity.
   :exc:`!re.error` is kept for backward compatibility.
 
+site
+----
+
+* :file:`.pth` files are now decoded by UTF-8 first, and then by the
+  :term:`locale encoding` if the UTF-8 decoding fails.
+  (Contributed by Inada Naoki in :gh:`117802`.)
+
 sqlite3
 -------
 
diff --git a/Lib/site.py b/Lib/site.py
index 162bbec4f8f41b..93af9c453ac7bb 100644
--- a/Lib/site.py
+++ b/Lib/site.py
@@ -179,35 +179,44 @@ def addpackage(sitedir, name, known_paths):
         return
     _trace(f"Processing .pth file: {fullname!r}")
     try:
-        # locale encoding is not ideal especially on Windows. But we have used
-        # it for a long time. setuptools uses the locale encoding too.
-        f = io.TextIOWrapper(io.open_code(fullname), encoding="locale")
+        with io.open_code(fullname) as f:
+            pth_content = f.read()
     except OSError:
         return
-    with f:
-        for n, line in enumerate(f):
-            if line.startswith("#"):
-                continue
-            if line.strip() == "":
+
+    try:
+        pth_content = pth_content.decode()
+    except UnicodeDecodeError:
+        # Fallback to locale encoding for backward compatibility.
+        # We will deprecate this fallback in the future.
+        import locale
+        pth_content = pth_content.decode(locale.getencoding())
+        _trace(f"Cannot read {fullname!r} as UTF-8. "
+               f"Using fallback encoding {locale.getencoding()!r}")
+
+    for n, line in enumerate(pth_content.splitlines(), 1):
+        if line.startswith("#"):
+            continue
+        if line.strip() == "":
+            continue
+        try:
+            if line.startswith(("import ", "import\t")):
+                exec(line)
                 continue
-            try:
-                if line.startswith(("import ", "import\t")):
-                    exec(line)
-                    continue
-                line = line.rstrip()
-                dir, dircase = makepath(sitedir, line)
-                if not dircase in known_paths and os.path.exists(dir):
-                    sys.path.append(dir)
-                    known_paths.add(dircase)
-            except Exception as exc:
-                print("Error processing line {:d} of {}:\n".format(n+1, fullname),
-                      file=sys.stderr)
-                import traceback
-                for record in traceback.format_exception(exc):
-                    for line in record.splitlines():
-                        print('  '+line, file=sys.stderr)
-                print("\nRemainder of file ignored", file=sys.stderr)
-                break
+            line = line.rstrip()
+            dir, dircase = makepath(sitedir, line)
+            if dircase not in known_paths and os.path.exists(dir):
+                sys.path.append(dir)
+                known_paths.add(dircase)
+        except Exception as exc:
+            print(f"Error processing line {n:d} of {fullname}:\n",
+                  file=sys.stderr)
+            import traceback
+            for record in traceback.format_exception(exc):
+                for line in record.splitlines():
+                    print('  '+line, file=sys.stderr)
+            print("\nRemainder of file ignored", file=sys.stderr)
+            break
     if reset:
         known_paths = None
     return known_paths
diff --git a/Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst b/Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst
new file mode 100644
index 00000000000000..6f91251126dc7b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-04-12-17-37-11.gh-issue-77102.Mk6X_E.rst
@@ -0,0 +1,3 @@
+The :mod:`site` module now decodes ``.pth`` files as UTF-8 first, and falls
+back to the :term:`locale encoding` if a ``UnicodeDecodeError`` occurs.
+Previously, only the locale encoding was supported.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-le...@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: arch...@mail-archive.com

Reply via email to