https://github.com/naveen-seth updated https://github.com/llvm/llvm-project/pull/145243
>From 8cee6b33c54aca0ea69b3ad291287464523a3d7a Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig <naveen.ha...@outlook.com> Date: Sun, 22 Jun 2025 18:06:07 +0200 Subject: [PATCH 1/4] [clang-format] Handle Trailing Whitespace After Line Continuation (P2223R2) Fixes #145226. Implement P2223R2 in clang-format to correctly handle cases where a backslash '\' is followed by trailing whitespace before the newline. Previously, clang-format failed to properly detect and handle such cases, leading to misformatted code. With this, clang-format matches the behavior already implemented in Clang's lexer and DependencyDirectivesScanner.cpp, which allow trailing whitespace after a line continuation in any C++ standard. --- clang/docs/ReleaseNotes.rst | 3 ++ clang/lib/Format/FormatTokenLexer.cpp | 31 ++++++++++++++----- .../splice-trailing-whitespace-p2223r2.cpp | 14 +++++++++ 3 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 clang/test/Format/splice-trailing-whitespace-p2223r2.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 96477ef6ddc9a..7c8e231655e86 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1025,6 +1025,9 @@ clang-format ``enum`` enumerator lists. - Add ``OneLineFormatOffRegex`` option for turning formatting off for one line. - Add ``SpaceAfterOperatorKeyword`` option. +- Support trailing whitespace in line splicing. + (P2223R2 <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p2223r2.pdf>_, #GH145226) + clang-refactor -------------- diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 4cc4f5f22db0d..bcc8e6ffd91ab 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -14,6 +14,7 @@ #include "FormatTokenLexer.h" #include "FormatToken.h" +#include "clang/Basic/CharInfo.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" #include "clang/Format/Format.h" @@ -1205,14 +1206,23 @@ static size_t countLeadingWhitespace(StringRef Text) { while (Cur < End) { if (isspace(Cur[0])) { ++Cur; - } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) { - // A '\' followed by a newline always escapes the newline, regardless - // of whether there is another '\' before it. + } else if (Cur[0] == '\\') { + // A '\' followed by a optional horizontal whitespace (P22232R2) and then + // newline always escapes the newline, regardless of whether there is + // another '\' before it. // The source has a null byte at the end. So the end of the entire input // isn't reached yet. Also the lexer doesn't break apart an escaped // newline. - assert(End - Cur >= 2); - Cur += 2; + const unsigned char *Lookahead = Cur + 1; + while (isHorizontalWhitespace(*Lookahead)) + ++Lookahead; + if (*Lookahead == '\n' || *Lookahead == '\r') { + // Splice found, consume it. + Cur = Lookahead + 1; + continue; + } + // No line splice found; the '\' is a token. + break; } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' && (Cur[3] == '\n' || Cur[3] == '\r')) { // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the @@ -1295,13 +1305,18 @@ FormatToken *FormatTokenLexer::getNextToken() { case '/': // The text was entirely whitespace when this loop was entered. Thus // this has to be an escape sequence. - assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" || - Text.substr(i, 4) == "\?\?/\r" || + assert(Text.substr(i, 4) == "\?\?/\r" || Text.substr(i, 4) == "\?\?/\n" || (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" || Text.substr(i - 1, 4) == "\?\?/\n")) || (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" || - Text.substr(i - 2, 4) == "\?\?/\n"))); + Text.substr(i - 2, 4) == "\?\?/\n")) || + (Text[i] == '\\' && [&]() -> bool { + size_t j = i + 1; + while (j < Text.size() && isHorizontalWhitespace(Text[j])) + ++j; + return j < Text.size() && (Text[j] == '\n' || Text[j] == '\r'); + }())); InEscape = true; break; default: diff --git a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp new file mode 100644 index 0000000000000..3cc08cc631965 --- /dev/null +++ b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp @@ -0,0 +1,14 @@ +// RUN: grep -Ev "// *[A-Z-]+:" %s \ +// RUN: | clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' \ +// RUN: | FileCheck -strict-whitespace %s + +// CHECK: {{^#define TAG\(\.\.\.\) \\}} +// CHECK: {{^ struct a \{\};}} +// There is whitespace following v this backslash! +#define TAG(...) struct a { \ + }; + +// CHECK: {{^int i;}} +// The comment below eats its following line because of the line splice. +// I also have trailing whitespace. Nom nom nom \ +int i; >From 26d525e3ff75885390f5d46ccb5c044fdbd8ddde Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig <naveen.ha...@outlook.com> Date: Sun, 22 Jun 2025 20:23:51 +0200 Subject: [PATCH 2/4] Remove deprecated usage of grep from regression test --- clang/test/Format/splice-trailing-whitespace-p2223r2.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp index 3cc08cc631965..4e9e2612af78a 100644 --- a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp +++ b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp @@ -1,5 +1,4 @@ -// RUN: grep -Ev "// *[A-Z-]+:" %s \ -// RUN: | clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' \ +// RUN: clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' %s \ // RUN: | FileCheck -strict-whitespace %s // CHECK: {{^#define TAG\(\.\.\.\) \\}} >From 1f54d834d94267a1e23896ef0f4addfb86f8a143 Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig <naveen.ha...@outlook.com> Date: Mon, 23 Jun 2025 02:26:26 +0200 Subject: [PATCH 3/4] Use unit testing instead of regression testing --- .../splice-trailing-whitespace-p2223r2.cpp | 13 ----------- clang/unittests/Format/FormatTest.cpp | 23 +++++++++++++++++++ 2 files changed, 23 insertions(+), 13 deletions(-) delete mode 100644 clang/test/Format/splice-trailing-whitespace-p2223r2.cpp diff --git a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp b/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp deleted file mode 100644 index 4e9e2612af78a..0000000000000 --- a/clang/test/Format/splice-trailing-whitespace-p2223r2.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// RUN: clang-format -style='{BasedOnStyle: LLVM, AlignEscapedNewlines: DontAlign}' %s \ -// RUN: | FileCheck -strict-whitespace %s - -// CHECK: {{^#define TAG\(\.\.\.\) \\}} -// CHECK: {{^ struct a \{\};}} -// There is whitespace following v this backslash! -#define TAG(...) struct a { \ - }; - -// CHECK: {{^int i;}} -// The comment below eats its following line because of the line splice. -// I also have trailing whitespace. Nom nom nom \ -int i; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index c0633ba3c29b3..23d2bc7b0c442 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -25768,6 +25768,29 @@ TEST_F(FormatTest, OperatorPassedAsAFunctionPtr) { verifyFormat("foo(operator, , -42);", Style); } +TEST_F(FormatTest, LineSpliceWithTrailingWhitespace) { + // Test that each sequence of a backslash (\) immediately followed by zero or + // more horizontal whitespace characters and then a new-line character is + // treated as a single logical line while formatting (as per P2223R2). + FormatStyle Style = getLLVMStyle(); + Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign; + Style.UseTab = FormatStyle::UT_Never; + + verifyFormat("int i;", + " \\ \n" + " int i;", + Style); + verifyFormat("#define FOO(args) \\\n struct a {};\n", + "#define FOO( args ) \\ \n" + "struct a{\\\t\t\t\n" + " };\n", + Style); + verifyFormat("comment here", + "comment \\ \n" + "here", + Style); +} + TEST_F(FormatTest, WhitespaceSensitiveMacros) { FormatStyle Style = getLLVMStyle(); Style.WhitespaceSensitiveMacros.push_back("FOO"); >From ed80aca9eb4c1d02d4ef48ea08995159f3832444 Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig <naveen.ha...@outlook.com> Date: Tue, 24 Jun 2025 16:11:29 +0200 Subject: [PATCH 4/4] Address review feedback --- clang/docs/ReleaseNotes.rst | 3 --- clang/lib/Format/FormatTokenLexer.cpp | 21 +++++++++------------ clang/unittests/Format/FormatTest.cpp | 20 ++++++-------------- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7c8e231655e86..96477ef6ddc9a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1025,9 +1025,6 @@ clang-format ``enum`` enumerator lists. - Add ``OneLineFormatOffRegex`` option for turning formatting off for one line. - Add ``SpaceAfterOperatorKeyword`` option. -- Support trailing whitespace in line splicing. - (P2223R2 <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p2223r2.pdf>_, #GH145226) - clang-refactor -------------- diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index bcc8e6ffd91ab..06f68ec8b0fc1 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -1204,25 +1204,22 @@ static size_t countLeadingWhitespace(StringRef Text) { const unsigned char *const End = Text.bytes_end(); const unsigned char *Cur = Begin; while (Cur < End) { - if (isspace(Cur[0])) { + if (isWhitespace(Cur[0])) { ++Cur; } else if (Cur[0] == '\\') { - // A '\' followed by a optional horizontal whitespace (P22232R2) and then - // newline always escapes the newline, regardless of whether there is - // another '\' before it. + // A backslash followed by optional horizontal whitespaces (P22232R2) and + // then a newline always escapes the newline. // The source has a null byte at the end. So the end of the entire input // isn't reached yet. Also the lexer doesn't break apart an escaped // newline. - const unsigned char *Lookahead = Cur + 1; + const auto *Lookahead = Cur + 1; while (isHorizontalWhitespace(*Lookahead)) ++Lookahead; - if (*Lookahead == '\n' || *Lookahead == '\r') { - // Splice found, consume it. - Cur = Lookahead + 1; - continue; - } - // No line splice found; the '\' is a token. - break; + // No line splice found; the backslash is a token. + if (!isVerticalWhitespace(*Lookahead)) + break; + // Splice found, consume it. + Cur = Lookahead + 1; } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' && (Cur[3] == '\n' || Cur[3] == '\r')) { // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 23d2bc7b0c442..a05bf8305716b 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -25769,25 +25769,17 @@ TEST_F(FormatTest, OperatorPassedAsAFunctionPtr) { } TEST_F(FormatTest, LineSpliceWithTrailingWhitespace) { - // Test that each sequence of a backslash (\) immediately followed by zero or - // more horizontal whitespace characters and then a new-line character is - // treated as a single logical line while formatting (as per P2223R2). - FormatStyle Style = getLLVMStyle(); + auto Style = getLLVMStyle(); Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign; Style.UseTab = FormatStyle::UT_Never; - verifyFormat("int i;", - " \\ \n" - " int i;", - Style); - verifyFormat("#define FOO(args) \\\n struct a {};\n", + verifyFormat("int i;", " \\ \n" + " int i;"); + verifyFormat("#define FOO(args) \\\n" + " struct a {};", "#define FOO( args ) \\ \n" "struct a{\\\t\t\t\n" - " };\n", - Style); - verifyFormat("comment here", - "comment \\ \n" - "here", + " };", Style); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits