https://github.com/python/cpython/commit/b8f2ff08f1a2cf5bd08c84e998527551197e9a6d
commit: b8f2ff08f1a2cf5bd08c84e998527551197e9a6d
branch: 3.12
author: Hugo van Kemenade <1324225+hug...@users.noreply.github.com>
committer: pablogsal <pablog...@gmail.com>
date: 2025-02-27T21:57:13Z
summary:
[3.12] gh-125553: Fix backslash continuation in `untokenize` (GH-126010) (#130579)

(cherry picked from commit 7ad793e5dbdf07e51a71b70d20f3e6e3ab60244d)

Co-authored-by: Tomas R. <tomas.ro...@gmail.com>

files:
A Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
M Lib/test/test_tokenize.py
M Lib/tokenize.py

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 2c4e7b960f7273..4c027372291e64 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,20 +1,20 @@
-from test import support
-from test.support import os_helper
+import os
+import re
+import token
+import unittest
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP, STRING,
                       ENDMARKER, ENCODING, tok_name, detect_encoding,
                       open as tokenize_open, Untokenizer, generate_tokens,
                       NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
                       TokenError)
 from io import BytesIO, StringIO
-import unittest
 from textwrap import dedent
 from unittest import TestCase, mock
+from test import support
 from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                                INVALID_UNDERSCORE_LITERALS)
 from test.support import os_helper
 from test.support.script_helper import run_test_script, make_script, run_python_until_end
-import os
-import token
 
 # Converts a source string into a list of textual representation
 # of the tokens such as:
@@ -1816,6 +1816,22 @@ def test_iter_compat(self):
         self.assertEqual(untokenize(iter(tokens)), b'Hello ')
 
 
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+         \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):
 
     def check_roundtrip(self, f):
@@ -1826,6 +1842,9 @@ def check_roundtrip(self, f):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces.  A proper test should test this.
@@ -1849,6 +1868,13 @@ def check_roundtrip(self, f):
         tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = untokenize(tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 553c1ca4388974..7cf92acb192deb 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -171,6 +171,7 @@ def __init__(self):
         self.prev_row = 1
         self.prev_col = 0
         self.prev_type = None
+        self.prev_line = ""
         self.encoding = None
 
     def add_whitespace(self, start):
@@ -178,14 +179,28 @@ def add_whitespace(self, start):
         if row < self.prev_row or row == self.prev_row and col < self.prev_col:
             raise ValueError("start ({},{}) precedes previous end ({},{})"
                              .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
+        self.add_backslash_continuation(start)
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)
 
+    def add_backslash_continuation(self, start):
+        """Add backslash continuation characters if the row has increased
+        without encountering a newline token.
+
+        This also inserts the correct amount of whitespace before the backslash.
+        """
+        row = start[0]
+        row_offset = row - self.prev_row
+        if row_offset == 0:
+            return
+
+        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+        line = self.prev_line.rstrip('\\\r\n')
+        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+        self.tokens.append(ws + f"\\{newline}" * row_offset)
+        self.prev_col = 0
+
     def escape_brackets(self, token):
         characters = []
         consume_until_next_bracket = False
@@ -245,8 +260,6 @@ def untokenize(self, iterable):
                     end_line, end_col = end
                     extra_chars = last_line.count("{{") + last_line.count("}}")
                     end = (end_line, end_col + extra_chars)
-                elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
-                    self.tokens.append(" ")
 
             self.add_whitespace(start)
             self.tokens.append(token)
@@ -255,6 +268,7 @@ def untokenize(self, iterable):
                 self.prev_row += 1
                 self.prev_col = 0
             self.prev_type = tok_type
+            self.prev_line = line
         return "".join(self.tokens)
 
     def compat(self, token, iterable):

diff --git a/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
new file mode 100644
index 00000000000000..291c5e6f6f2181
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
@@ -0,0 +1,2 @@
+Fix round-trip invariance for backslash continuations in
+:func:`tokenize.untokenize`.
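
For illustration (not part of the commit): a minimal sketch of the round-trip
invariant this change establishes, using only the public tokenize.tokenize and
tokenize.untokenize APIs. The helper name roundtrip and the sample sources are
made up for the demo; the asserts hold on an interpreter with this fix applied.

    # Demo: with this fix, untokenize reproduces backslash continuations
    # exactly, including the whitespace preceding the backslash.
    import io
    import tokenize

    def roundtrip(source: bytes) -> bytes:
        # tokenize.tokenize() wants a readline callable over bytes; because
        # the token stream starts with an ENCODING token, untokenize()
        # returns bytes as well.
        tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
        return tokenize.untokenize(tokens)

    source = b"x = 1 + \\\n    2\n"
    assert roundtrip(source) == source  # exact, including " \\" after "+"

    # A backslash on a line by itself stays ambiguous: the tokenizer emits no
    # token for that line, so its indentation cannot be recovered; the new
    # test deliberately skips such sources via contains_ambiguous_backslash().
    ambiguous = b"a = (1\n     \\\n)\n"
    assert roundtrip(ambiguous) != ambiguous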