https://github.com/python/cpython/commit/7ad793e5dbdf07e51a71b70d20f3e6e3ab60244d
commit: 7ad793e5dbdf07e51a71b70d20f3e6e3ab60244d
branch: main
author: Tomas R. <[email protected]>
committer: pablogsal <[email protected]>
date: 2025-01-21T19:58:44Z
summary:
gh-125553: Fix backslash continuation in `untokenize` (#126010)
files:
A Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
M Lib/test/test_tokenize.py
M Lib/tokenize.py
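
A quick illustration of the bug being fixed: `untokenize` used to drop the
whitespace that preceded a backslash continuation, so an exact round trip
failed. A minimal sketch, with a made-up sample source:

    import io
    import tokenize

    # A continuation with one space before the backslash.
    source = b"a = 1 + \\\n    2\n"

    readline = io.BytesIO(source).readline
    roundtripped = tokenize.untokenize(tokenize.tokenize(readline))

    # Before this commit the space before the backslash was lost and this
    # printed False (the result was b'a = 1 +\\\n    2\n'); with the fix
    # the original bytes come back unchanged.
    print(roundtripped == source)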
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 75710db7d05375..480bff743a9f8a 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,4 +1,5 @@
import os
+import re
import token
import tokenize
import unittest
@@ -1819,6 +1820,22 @@ def test_iter_compat(self):
self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
+def contains_ambiguous_backslash(source):
+ """Return `True` if the source contains a backslash on a
+ line by itself. For example:
+
+ a = (1
+ \\
+ )
+
+ Code like this cannot be untokenized exactly. This is because
+ the tokenizer does not produce any tokens for the line containing
+ the backslash and so there is no way to know its indent.
+ """
+ pattern = re.compile(br'\n\s*\\\r?\n')
+ return pattern.search(source) is not None
+
+
class TestRoundtrip(TestCase):
def check_roundtrip(self, f):
@@ -1829,6 +1846,9 @@ def check_roundtrip(self, f):
tokenize.untokenize(), and the latter tokenized again to 2-tuples.
The test fails if the 3 pair tokenizations do not match.
+ If the source code can be untokenized unambiguously, the
+ untokenized code must match the original code exactly.
+
When untokenize bugs are fixed, untokenize with 5-tuples should
reproduce code that does not contain a backslash continuation
following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@ def check_roundtrip(self, f):
tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
self.assertEqual(tokens2_from5, tokens2)
+ if not contains_ambiguous_backslash(code):
+ # The BOM does not produce a token so there is no way to preserve it.
+ code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+ readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+ untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+ self.assertEqual(code_without_bom, untokenized_code)
+
def check_line_extraction(self, f):
if isinstance(f, str):
code = f.encode('utf-8')
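
For reference, how the new `contains_ambiguous_backslash` helper classifies
sources (both sample snippets below are made up):

    import re

    pattern = re.compile(br'\n\s*\\\r?\n')

    # Backslash alone on its line: the tokenizer emits no token for that
    # line, so its indentation cannot be recovered.
    ambiguous = b"a = (1\n    \\\n)\n"

    # Backslash at the end of a code line: the preceding whitespace can
    # be read back off the previous line.
    ordinary = b"a = 1 + \\\n    2\n"

    print(pattern.search(ambiguous) is not None)  # True
    print(pattern.search(ordinary) is not None)   # False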
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 1a60fd32a77ea4..9ce95a62d961ba 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -169,6 +169,7 @@ def __init__(self):
self.prev_row = 1
self.prev_col = 0
self.prev_type = None
+ self.prev_line = ""
self.encoding = None
def add_whitespace(self, start):
@@ -176,14 +177,28 @@ def add_whitespace(self, start):
if row < self.prev_row or row == self.prev_row and col < self.prev_col:
raise ValueError("start ({},{}) precedes previous end ({},{})"
.format(row, col, self.prev_row, self.prev_col))
- row_offset = row - self.prev_row
- if row_offset:
- self.tokens.append("\\\n" * row_offset)
- self.prev_col = 0
+ self.add_backslash_continuation(start)
col_offset = col - self.prev_col
if col_offset:
self.tokens.append(" " * col_offset)
+ def add_backslash_continuation(self, start):
+ """Add backslash continuation characters if the row has increased
+ without encountering a newline token.
+
+ This also inserts the correct amount of whitespace before the backslash.
+ """
+ row = start[0]
+ row_offset = row - self.prev_row
+ if row_offset == 0:
+ return
+
+ newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+ line = self.prev_line.rstrip('\\\r\n')
+ ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+ self.tokens.append(ws + f"\\{newline}" * row_offset)
+ self.prev_col = 0
+
def escape_brackets(self, token):
characters = []
consume_until_next_bracket = False
@@ -243,8 +258,6 @@ def untokenize(self, iterable):
end_line, end_col = end
extra_chars = last_line.count("{{") + last_line.count("}}")
end = (end_line, end_col + extra_chars)
- elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
- self.tokens.append(" ")
self.add_whitespace(start)
self.tokens.append(token)
@@ -253,6 +266,7 @@ def untokenize(self, iterable):
self.prev_row += 1
self.prev_col = 0
self.prev_type = tok_type
+ self.prev_line = line
return "".join(self.tokens)
def compat(self, token, iterable):
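
The core of the fix above, extracted as a standalone sketch: recover the
whitespace that sat just before the backslash from the previous physical
line (`prev_line` stands in for the `Untokenizer` attribute of the same
name; plain `itertools` replaces the module's `_itertools` alias):

    import itertools

    # The last physical line the untokenizer saw: code, one space, backslash.
    prev_line = "a = 1 + \\\n"

    # Preserve \r\n line endings when the source used them.
    newline = '\r\n' if prev_line.endswith('\r\n') else '\n'

    # Drop the backslash and the line ending, then collect the trailing
    # whitespace that preceded the backslash.
    line = prev_line.rstrip('\\\r\n')
    ws = ''.join(itertools.takewhile(str.isspace, reversed(line)))

    print(repr(ws + f"\\{newline}"))  # ' \\\n'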
diff --git a/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
new file mode 100644
index 00000000000000..291c5e6f6f2181
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
@@ -0,0 +1,2 @@
+Fix round-trip invariance for backslash continuations in
+:func:`tokenize.untokenize`.