https://github.com/python/cpython/commit/6aa88a2cb36240fe2b587f2e82043873270a27cf
commit: 6aa88a2cb36240fe2b587f2e82043873270a27cf
branch: main
author: Adam Turner <9087854+aa-tur...@users.noreply.github.com>
committer: AA-Turner <9087854+aa-tur...@users.noreply.github.com>
date: 2025-03-31T00:35:12Z
summary:
gh-130167: Optimise ``textwrap.dedent()`` (#131919) Co-authored-by: Marius Juston <marius.jus...@hotmail.fr> Co-authored-by: Pieter Eendebak <pieter.eende...@gmail.com> Co-authored-by: Bénédikt Tran <10796600+picn...@users.noreply.github.com> files: A Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst M Lib/test/test_textwrap.py M Lib/textwrap.py diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py index dfbc2b93dfc0d6..77366988b57fa7 100644 --- a/Lib/test/test_textwrap.py +++ b/Lib/test/test_textwrap.py @@ -769,6 +769,56 @@ def assertUnchanged(self, text): """assert that dedent() has no effect on 'text'""" self.assertEqual(text, dedent(text)) + def test_dedent_whitespace(self): + # The empty string. + text = "" + self.assertUnchanged(text) + + # Only spaces. + text = " " + expect = "" + self.assertEqual(expect, dedent(text)) + + # Only tabs. + text = "\t\t\t\t" + expect = "" + self.assertEqual(expect, dedent(text)) + + # A mixture. + text = " \t \t\t \t " + expect = "" + self.assertEqual(expect, dedent(text)) + + # ASCII whitespace. + text = "\f\n\r\t\v " + expect = "\n" + self.assertEqual(expect, dedent(text)) + + # One newline. + text = "\n" + expect = "\n" + self.assertEqual(expect, dedent(text)) + + # Windows-style newlines. + text = "\r\n" * 5 + expect = "\n" * 5 + self.assertEqual(expect, dedent(text)) + + # Whitespace mixture. + text = " \n\t\n \n\t\t\n\n\n " + expect = "\n\n\n\n\n\n" + self.assertEqual(expect, dedent(text)) + + # Lines consisting only of whitespace are always normalised + text = "a\n \n\t\n" + expect = "a\n\n\n" + self.assertEqual(expect, dedent(text)) + + # Whitespace characters on non-empty lines are retained + text = "a\r\n\r\n\r\n" + expect = "a\r\n\n\n" + self.assertEqual(expect, dedent(text)) + def test_dedent_nomargin(self): # No lines indented. text = "Hello there.\nHow are you?\nOh good, I'm glad." 
diff --git a/Lib/textwrap.py b/Lib/textwrap.py index 1bf07aa46cad99..bb6a1186316275 100644 --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -413,9 +413,6 @@ def shorten(text, width, **kwargs): # -- Loosely related functionality ------------------------------------- -_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE) -_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE) - def dedent(text): """Remove any common leading whitespace from every line in `text`. @@ -429,42 +426,21 @@ def dedent(text): Entirely blank lines are normalized to a newline character. """ - # Look for the longest leading string of spaces and tabs common to - # all lines. - margin = None - text = _whitespace_only_re.sub('', text) - indents = _leading_whitespace_re.findall(text) - for indent in indents: - if margin is None: - margin = indent - - # Current line more deeply indented than previous winner: - # no change (previous winner is still on top). - elif indent.startswith(margin): - pass - - # Current line consistent with and no deeper than previous winner: - # it's the new winner. - elif margin.startswith(indent): - margin = indent - - # Find the largest common whitespace between current line and previous - # winner. - else: - for i, (x, y) in enumerate(zip(margin, indent)): - if x != y: - margin = margin[:i] - break + if not text: + return text + + lines = text.split('\n') - # sanity check (testing/debugging only) - if 0 and margin: - for line in text.split("\n"): - assert not line or line.startswith(margin), \ - "line = %r, margin = %r" % (line, margin) + # Get length of leading whitespace, inspired by ``os.path.commonprefix()``. 
+ non_blank_lines = [l for l in lines if l and not l.isspace()] + l1 = min(non_blank_lines, default='') + l2 = max(non_blank_lines, default='') + margin = 0 + for margin, c in enumerate(l1): + if c != l2[margin] or c not in ' \t': + break - if margin: - text = re.sub(r'(?m)^' + margin, '', text) - return text + return '\n'.join([l[margin:] if not l.isspace() else '' for l in lines]) def indent(text, prefix, predicate=None): diff --git a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst new file mode 100644 index 00000000000000..62b619c0d80f4d --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst @@ -0,0 +1,5 @@ +Improved performance of :func:`textwrap.dedent` by an average of ~2.4x, +(with improvements of up to 4x for large inputs), +and fixed a bug where blank lines with whitespace characters other than space +or horizontal tab were not normalised to the newline. +Patch by Adam Turner, Marius Juston, and Pieter Eendebak. _______________________________________________ Python-checkins mailing list -- python-checkins@python.org To unsubscribe send an email to python-checkins-le...@python.org https://mail.python.org/mailman3/lists/python-checkins.python.org/ Member address: arch...@mail-archive.com