https://github.com/Neil-N4 updated https://github.com/llvm/llvm-project/pull/202991
>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Wed, 10 Jun 2026 09:51:48 -0400 Subject: [PATCH 1/5] [clang-doc] Add standalone Markdown parsing library --- .../clang-doc/support/CMakeLists.txt | 3 +- .../clang-doc/support/Markdown.cpp | 145 ++++++++++++++++++ .../clang-doc/support/Markdown.h | 72 +++++++++ .../unittests/clang-doc/CMakeLists.txt | 4 +- .../clang-doc/MarkdownParserTest.cpp | 94 ++++++++++++ 5 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt index 8ac913ffbe998..acff865190ff9 100644 --- a/clang-tools-extra/clang-doc/support/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt @@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangDocSupport STATIC File.cpp + Markdown.cpp Utils.cpp - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp new file mode 100644 index 0000000000000..776150b939d27 --- /dev/null +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -0,0 +1,145 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Markdown.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/DebugLog.h" + +#define DEBUG_TYPE "clang-doc-markdown" + +using namespace llvm; + +namespace clang::doc::markdown { + +static MDNode makeText(StringRef S) { + return {NodeKind::NK_Text, S, {}}; +} + +// A line is a table separator if it only contains |, -, :, and spaces, +// and has at least one -. +static bool isSepRow(StringRef Line) { + return Line.contains('-') && + Line.find_first_not_of("|-: ") == StringRef::npos; +} + +// Returns true if Line begins with a bullet list marker (-, *, or +) +// followed by a space. +static bool isListItem(StringRef Line) { + return Line.starts_with("- ") || Line.starts_with("* ") || + Line.starts_with("+ "); +} + +static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes, + BumpPtrAllocator &Arena) { + if (Nodes.empty()) + return {}; + MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size()); + std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated); + return ArrayRef<MDNode>(Allocated, Nodes.size()); +} + +ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, + BumpPtrAllocator &Arena) { + if (ParagraphText.trim().empty()) + return {}; + + SmallVector<StringRef, 16> Lines; + ParagraphText.split(Lines, '\n'); + + SmallVector<MDNode> Nodes; + size_t I = 0, E = Lines.size(); + + while (I < E) { + StringRef Line = Lines[I].trim(); + + if (Line.empty()) { + ++I; + continue; + } + + // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be + // indented up to 3 spaces, the closing fence must use the same character + // and be at least as long as the opening fence, and the closing fence may + // only be followed by spaces. Doxygen specifics should be handled on a + // case-by-case basis. 
+ if (Line.starts_with("```") || Line.starts_with("~~~")) { + char Fence = Line[0]; + StringRef Lang = Line.drop_front(3).trim(); + SmallVector<MDNode> CodeLines; + ++I; + while (I < E) { + StringRef CodeLine = Lines[I].trim(); + if (CodeLine.size() >= 3 && + all_of(CodeLine.take_front(3), + [Fence](char C) { return C == Fence; })) + break; + CodeLines.push_back(makeText(Lines[I])); + ++I; + } + ++I; // skip closing fence + MDNode Code; + Code.Kind = NodeKind::NK_FencedCode; + Code.Content = Lang; + Code.Children = allocateNodes(CodeLines, Arena); + LDBG() << "emitting NK_FencedCode lang='" << Lang + << "' lines=" << CodeLines.size(); + Nodes.push_back(Code); + continue; + } + + // Pipe table: current line has | and next line is a separator row. + if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) { + SmallVector<MDNode> Rows; + while (I < E && Lines[I].trim().contains('|')) { + Rows.push_back(makeText(Lines[I].trim())); + ++I; + } + MDNode Table; + Table.Kind = NodeKind::NK_Table; + Table.Content = {}; + Table.Children = allocateNodes(Rows, Arena); + LDBG() << "emitting NK_Table rows=" << Rows.size(); + Nodes.push_back(Table); + continue; + } + + // Unordered list item. + if (isListItem(Line)) { + SmallVector<MDNode> Items; + while (I < E) { + StringRef L = Lines[I].trim(); + if (!isListItem(L)) + break; + MDNode Item; + Item.Kind = NodeKind::NK_ListItem; + Item.Content = L.drop_front(2).trim(); + Item.Children = {}; + Items.push_back(Item); + ++I; + } + MDNode List; + List.Kind = NodeKind::NK_UnorderedList; + List.Content = {}; + List.Children = allocateNodes(Items, Arena); + LDBG() << "emitting NK_UnorderedList items=" << Items.size(); + Nodes.push_back(List); + continue; + } + + // Plain text fallback. 
+ Nodes.push_back(makeText(Line)); + ++I; + } + + LDBG() << "parseMarkdown done nodes=" << Nodes.size(); + return allocateNodes(Nodes, Arena); +} + +} // namespace clang::doc::markdown \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h new file mode 100644 index 0000000000000..890f764f937b1 --- /dev/null +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines a standalone Markdown parsing library for the LLVM +/// ecosystem. The parser takes plain text and returns a tree of typed nodes +/// with no knowledge of comments, Doxygen, or Clang-Doc internals. +/// +/// This is a simple Markdown parser for use inside Clang-Doc's comment +/// pipeline. You give it a paragraph of text and an arena allocator, and it +/// gives back a list of typed nodes describing the Markdown structure it found. +/// +/// The main entry point is parseMarkdown(). If the text has no Markdown in it, +/// you get back an empty list and can fall back to plain-text output. If it +/// does, you get a tree of MDNode structs where each node has a kind, optional +/// content (like the language tag on a code fence), and optional children. +/// +/// All nodes are allocated in the arena you pass in. You own the arena and are +/// responsible for keeping it alive as long as you use the nodes. +/// +/// The parser handles fenced code blocks, pipe tables, and unordered lists. +/// Anything it does not recognize comes back as a plain text node. It will +/// never crash on bad input. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" + +namespace clang::doc::markdown { + +enum class NodeKind { + // Block nodes + NK_Paragraph, + NK_FencedCode, + NK_Table, + NK_UnorderedList, + NK_OrderedList, + NK_ListItem, + NK_ThematicBreak, + // Inline nodes + NK_Text, + NK_InlineCode, + NK_Emphasis, + NK_Strong, + NK_SoftBreak, +}; + +struct MDNode { + NodeKind Kind; + llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text + llvm::ArrayRef<MDNode> Children; // arena allocated +}; + +/// Parses Markdown from a single comment paragraph's text. +/// Returns an empty ArrayRef if no Markdown constructs are found, +/// so generators can fall back to plain-text rendering at zero cost. +llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena); + +} // namespace clang::doc::markdown + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H \ No newline at end of file diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt index 01b34ec9a791e..b74207ac88fa7 100644 --- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt @@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests ClangDocTest.cpp GeneratorTest.cpp HTMLGeneratorTest.cpp + MarkdownParserTest.cpp MDGeneratorTest.cpp MergeTest.cpp SerializeTest.cpp @@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests target_link_libraries(ClangDocTests PRIVATE clangDoc + clangDocSupport LLVMTestingSupport - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp new file mode 100644 
index 0000000000000..8df5efc7f1d5f --- /dev/null +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "support/Markdown.h" +#include "llvm/Support/Allocator.h" +#include "gtest/gtest.h" + +using namespace clang::doc::markdown; + +namespace { + +TEST(MarkdownParserTest, EmptyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, WhitespaceOnlyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown(" \n \n", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("hello world", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text); + EXPECT_EQ(Nodes[0].Content, "hello world"); +} + +TEST(MarkdownParserTest, FencedCodeBlock) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(Nodes[0].Content, "cpp"); + ASSERT_EQ(Nodes[0].Children.size(), 1u); +} + +TEST(MarkdownParserTest, FencedCodeBlockNoLang) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```\nsome code\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(Nodes[0].Content.empty()); +} + +TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena); + // Unterminated fence should not 
crash and should produce a code node + // with whatever lines were found. + EXPECT_FALSE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PipeTable) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("a | b\nc | d", Arena); + // No separator row so should not be parsed as a table + for (const auto &Node : Nodes) + EXPECT_NE(Node.Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, UnorderedList) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList); + ASSERT_EQ(Nodes[0].Children.size(), 3u); + EXPECT_EQ(Nodes[0].Children[0].Content, "foo"); + EXPECT_EQ(Nodes[0].Children[1].Content, "bar"); + EXPECT_EQ(Nodes[0].Children[2].Content, "baz"); +} + +TEST(MarkdownParserTest, MixedContent) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena); + EXPECT_EQ(Nodes.size(), 3u); +} + +} // namespace \ No newline at end of file >From f4cb4a28630e0f91289bfd4416c59114c5654ff7 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Wed, 10 Jun 2026 11:35:54 -0400 Subject: [PATCH 2/5] [clang-doc] Address review feedback: test fixture, raw strings, DEBUG_TYPE, EOF newlines --- .../clang-doc/support/Markdown.cpp | 4 +- .../clang-doc/support/Markdown.h | 2 +- .../clang-doc/MarkdownParserTest.cpp | 97 +++++++++++-------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index 776150b939d27..9e008abf8b08d 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ 
-12,7 +12,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/DebugLog.h" -#define DEBUG_TYPE "clang-doc-markdown" +#define DEBUG_TYPE "clang-doc" using namespace llvm; @@ -142,4 +142,4 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, return allocateNodes(Nodes, Arena); } -} // namespace clang::doc::markdown \ No newline at end of file +} // namespace clang::doc::markdown diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index 890f764f937b1..09b79cc8f2437 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -69,4 +69,4 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, } // namespace clang::doc::markdown -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H \ No newline at end of file +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index 8df5efc7f1d5f..ff9bad88da136 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -14,80 +14,99 @@ using namespace clang::doc::markdown; namespace { -TEST(MarkdownParserTest, EmptyInput) { +struct MarkdownParserTest : public ::testing::Test { llvm::BumpPtrAllocator Arena; +}; + +TEST_F(MarkdownParserTest, EmptyInput) { auto Nodes = parseMarkdown("", Arena); EXPECT_TRUE(Nodes.empty()); } -TEST(MarkdownParserTest, WhitespaceOnlyInput) { - llvm::BumpPtrAllocator Arena; +TEST_F(MarkdownParserTest, WhitespaceOnlyInput) { auto Nodes = parseMarkdown(" \n \n", Arena); EXPECT_TRUE(Nodes.empty()); } -TEST(MarkdownParserTest, PlainText) { - llvm::BumpPtrAllocator Arena; +TEST_F(MarkdownParserTest, PlainText) { auto Nodes = parseMarkdown("hello world", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text); - 
EXPECT_EQ(Nodes[0].Content, "hello world"); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_Text); + EXPECT_EQ(N.Content, "hello world"); } -TEST(MarkdownParserTest, FencedCodeBlock) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena); +TEST_F(MarkdownParserTest, FencedCodeBlock) { + auto Nodes = parseMarkdown(R"(```cpp +int x = 0; +````)", + Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(Nodes[0].Content, "cpp"); - ASSERT_EQ(Nodes[0].Children.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "cpp"); + ASSERT_EQ(N.Children.size(), 1u); } -TEST(MarkdownParserTest, FencedCodeBlockNoLang) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("```\nsome code\n```", Arena); +TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) { + auto Nodes = parseMarkdown(R"(``` +some code +```)", + Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(Nodes[0].Content.empty()); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(N.Content.empty()); } -TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena); +TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { + auto Nodes = parseMarkdown(R"(```cpp +int x = 0;)", + Arena); // Unterminated fence should not crash and should produce a code node // with whatever lines were found. 
EXPECT_FALSE(Nodes.empty()); } -TEST(MarkdownParserTest, PipeTable) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena); +TEST_F(MarkdownParserTest, PipeTable) { + auto Nodes = parseMarkdown(R"(| A | B | +|---|---| +| 1 | 2 |)", + Arena); ASSERT_EQ(Nodes.size(), 1u); EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); } -TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("a | b\nc | d", Arena); - // No separator row so should not be parsed as a table +TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { + auto Nodes = parseMarkdown(R"(a | b +c | d)", + Arena); + // No separator row so should not be parsed as a table. for (const auto &Node : Nodes) EXPECT_NE(Node.Kind, NodeKind::NK_Table); } -TEST(MarkdownParserTest, UnorderedList) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena); +TEST_F(MarkdownParserTest, UnorderedList) { + auto Nodes = parseMarkdown(R"(- foo +- bar +- baz)", + Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList); - ASSERT_EQ(Nodes[0].Children.size(), 3u); - EXPECT_EQ(Nodes[0].Children[0].Content, "foo"); - EXPECT_EQ(Nodes[0].Children[1].Content, "bar"); - EXPECT_EQ(Nodes[0].Children[2].Content, "baz"); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList); + ASSERT_EQ(N.Children.size(), 3u); + EXPECT_EQ(N.Children[0].Content, "foo"); + EXPECT_EQ(N.Children[1].Content, "bar"); + EXPECT_EQ(N.Children[2].Content, "baz"); } -TEST(MarkdownParserTest, MixedContent) { - llvm::BumpPtrAllocator Arena; - auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena); +TEST_F(MarkdownParserTest, MixedContent) { + auto Nodes = parseMarkdown(R"(some text +``` +code +```` +- item)", + Arena); EXPECT_EQ(Nodes.size(), 3u); } >From 3ef8f62edab311caff0907ab2b9a0c3aaeb14353 Mon Sep 17 00:00:00 2001 From: Neil-N4 
<[email protected]> Date: Wed, 10 Jun 2026 13:45:44 -0400 Subject: [PATCH 3/5] [clang-doc] Add CommonMark spec tests for fenced code blocks --- .../clang-doc/MarkdownParserTest.cpp | 112 +++++++++++++++++- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index ff9bad88da136..4ca979c1f1d24 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -39,7 +39,7 @@ TEST_F(MarkdownParserTest, PlainText) { TEST_F(MarkdownParserTest, FencedCodeBlock) { auto Nodes = parseMarkdown(R"(```cpp int x = 0; -````)", +````````)", Arena); ASSERT_EQ(Nodes.size(), 1u); const auto &N = Nodes[0]; @@ -51,7 +51,7 @@ int x = 0; TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) { auto Nodes = parseMarkdown(R"(``` some code -```)", +```````)", Arena); ASSERT_EQ(Nodes.size(), 1u); const auto &N = Nodes[0]; @@ -102,12 +102,116 @@ TEST_F(MarkdownParserTest, UnorderedList) { TEST_F(MarkdownParserTest, MixedContent) { auto Nodes = parseMarkdown(R"(some text -``` +``````` code -```` +```````` - item)", Arena); EXPECT_EQ(Nodes.size(), 3u); } +// CommonMark §4.5 example 120: tilde fences work the same as backtick fences. +TEST_F(MarkdownParserTest, TildeFence) { + auto Nodes = parseMarkdown(R"(~~~ +int x = 0; +~~~)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(N.Content.empty()); + ASSERT_EQ(N.Children.size(), 1u); +} + +// CommonMark §4.5 example 120: tilde fence with a language tag. 
+TEST_F(MarkdownParserTest, TildeFenceWithLang) { + auto Nodes = parseMarkdown(R"(~~~cpp +int x = 0; +~~~)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "cpp"); + ASSERT_EQ(N.Children.size(), 1u); +} + +// CommonMark §4.5 example 122: a tilde line does not close a backtick fence. +TEST_F(MarkdownParserTest, ClosingFenceMustMatchOpeningChar) { + auto Nodes = parseMarkdown(R"(``` +aaa +~~~ +````````)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + // ~~~ is content, not a closing fence. + ASSERT_EQ(N.Children.size(), 2u); +} + +// CommonMark §4.5 example 130: a code block can be empty. +TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) { + auto Nodes = parseMarkdown(R"(``` +```````)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(N.Children.empty()); +} + +// CommonMark §4.5 example 129: a code block may contain only blank lines. +TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) { + auto Nodes = parseMarkdown("```\n\n \n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + ASSERT_EQ(N.Children.size(), 2u); +} + +// CommonMark §4.5 example 142: lang tag is captured from the info string. +TEST_F(MarkdownParserTest, InfoStringLangTag) { + auto Nodes = parseMarkdown(R"(```ruby +def foo(x) + return 3 +end +``````)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "ruby"); + ASSERT_EQ(N.Children.size(), 3u); +} + +// CommonMark §4.5 example 146: tilde fence info string may contain backticks. 
+TEST_F(MarkdownParserTest, TildeFenceInfoStringWithBackticks) { + auto Nodes = parseMarkdown(R"(~~~ aa ``` ~~~ +foo +~~~)", + Arena); + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(N.Content, "aa ``` ~~~"); + ASSERT_EQ(N.Children.size(), 1u); +} + +// CommonMark §4.5 example 124: closing fence must be at least as long as the +// opening fence. +// TODO: our parser currently closes on the first line with 3 matching fence +// chars regardless of opening fence length. Fix as part of the CommonMark +// TODO in parseMarkdown(). +TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) { + auto Nodes = parseMarkdown("````\naaa\n```", Arena); + // The ``` line should not close the ```` fence per CommonMark, but our + // parser currently treats it as a closing fence. This test documents the + // current (non-conformant) behavior. + ASSERT_EQ(Nodes.size(), 1u); + const auto &N = Nodes[0]; + EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + ASSERT_EQ(N.Children.size(), 1u); +} + } // namespace \ No newline at end of file >From ffb56028d83a542a775119a7b0c2f88271b2df84 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Wed, 10 Jun 2026 13:59:52 -0400 Subject: [PATCH 4/5] [clang-doc] Replace flat MDNode with typed node hierarchy using LLVM RTTI --- .../clang-doc/support/Markdown.cpp | 84 +++--- .../clang-doc/support/Markdown.h | 264 ++++++++++++++++-- .../clang-doc/MarkdownParserTest.cpp | 84 +++--- 3 files changed, 312 insertions(+), 120 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index 9e008abf8b08d..bee15c3e23ec3 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -18,8 +18,24 @@ using namespace llvm; namespace clang::doc::markdown { -static MDNode makeText(StringRef S) { - return {NodeKind::NK_Text, S, {}}; +// Allocates a contiguous array of T in the 
arena and returns an ArrayRef. +template <typename T> +static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec, + BumpPtrAllocator &Arena) { + if (Vec.empty()) + return {}; + T *Allocated = Arena.Allocate<T>(Vec.size()); + std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated); + return ArrayRef<T>(Allocated, Vec.size()); +} + +// Interns a StringRef into the arena so it outlives the parse loop. +static StringRef internString(StringRef S, BumpPtrAllocator &Arena) { + if (S.empty()) + return {}; + char *Buf = Arena.Allocate<char>(S.size()); + std::copy(S.begin(), S.end(), Buf); + return StringRef(Buf, S.size()); } // A line is a table separator if it only contains |, -, :, and spaces, @@ -36,24 +52,15 @@ static bool isListItem(StringRef Line) { Line.starts_with("+ "); } -static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes, - BumpPtrAllocator &Arena) { - if (Nodes.empty()) - return {}; - MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size()); - std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated); - return ArrayRef<MDNode>(Allocated, Nodes.size()); -} - -ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, - BumpPtrAllocator &Arena) { +ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, + BumpPtrAllocator &Arena) { if (ParagraphText.trim().empty()) return {}; SmallVector<StringRef, 16> Lines; ParagraphText.split(Lines, '\n'); - SmallVector<MDNode> Nodes; + SmallVector<MDNode *> Nodes; size_t I = 0, E = Lines.size(); while (I < E) { @@ -71,8 +78,8 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, // case-by-case basis. 
if (Line.starts_with("```") || Line.starts_with("~~~")) { char Fence = Line[0]; - StringRef Lang = Line.drop_front(3).trim(); - SmallVector<MDNode> CodeLines; + StringRef Lang = internString(Line.drop_front(3).trim(), Arena); + SmallVector<StringRef> CodeLines; ++I; while (I < E) { StringRef CodeLine = Lines[I].trim(); @@ -80,15 +87,13 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; })) break; - CodeLines.push_back(makeText(Lines[I])); + CodeLines.push_back(internString(Lines[I], Arena)); ++I; } ++I; // skip closing fence - MDNode Code; - Code.Kind = NodeKind::NK_FencedCode; - Code.Content = Lang; - Code.Children = allocateNodes(CodeLines, Arena); - LDBG() << "emitting NK_FencedCode lang='" << Lang + auto *Code = + new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena)); + LDBG() << "emitting FencedCodeNode lang='" << Lang << "' lines=" << CodeLines.size(); Nodes.push_back(Code); continue; @@ -96,50 +101,45 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, // Pipe table: current line has | and next line is a separator row. if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) { - SmallVector<MDNode> Rows; + SmallVector<StringRef> Rows; while (I < E && Lines[I].trim().contains('|')) { - Rows.push_back(makeText(Lines[I].trim())); + Rows.push_back(internString(Lines[I].trim(), Arena)); ++I; } - MDNode Table; - Table.Kind = NodeKind::NK_Table; - Table.Content = {}; - Table.Children = allocateNodes(Rows, Arena); - LDBG() << "emitting NK_Table rows=" << Rows.size(); + auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena)); + LDBG() << "emitting TableNode rows=" << Rows.size(); Nodes.push_back(Table); continue; } // Unordered list item. 
if (isListItem(Line)) { - SmallVector<MDNode> Items; + SmallVector<ListItemNode *> Items; while (I < E) { StringRef L = Lines[I].trim(); if (!isListItem(L)) break; - MDNode Item; - Item.Kind = NodeKind::NK_ListItem; - Item.Content = L.drop_front(2).trim(); - Item.Children = {}; + StringRef ItemText = internString(L.drop_front(2).trim(), Arena); + SmallVector<MDNode *> ItemChildren; + ItemChildren.push_back(new (Arena) TextNode(ItemText)); + auto *Item = + new (Arena) ListItemNode(allocateArray(ItemChildren, Arena)); Items.push_back(Item); ++I; } - MDNode List; - List.Kind = NodeKind::NK_UnorderedList; - List.Content = {}; - List.Children = allocateNodes(Items, Arena); - LDBG() << "emitting NK_UnorderedList items=" << Items.size(); + auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena)); + LDBG() << "emitting UnorderedListNode items=" << Items.size(); Nodes.push_back(List); continue; } // Plain text fallback. - Nodes.push_back(makeText(Line)); + Nodes.push_back(new (Arena) TextNode(internString(Line, Arena))); ++I; } LDBG() << "parseMarkdown done nodes=" << Nodes.size(); - return allocateNodes(Nodes, Arena); + return allocateArray(Nodes, Arena); } -} // namespace clang::doc::markdown +} // namespace clang::doc::markdown \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index 09b79cc8f2437..3d457bcddfac6 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -7,30 +7,50 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines a standalone Markdown parsing library for the LLVM -/// ecosystem. The parser takes plain text and returns a tree of typed nodes -/// with no knowledge of comments, Doxygen, or Clang-Doc internals. +/// Standalone Markdown parsing library for the LLVM ecosystem. 
/// -/// This is a simple Markdown parser for use inside Clang-Doc's comment -/// pipeline. You give it a paragraph of text and an arena allocator, and it -/// gives back a list of typed nodes describing the Markdown structure it found. +/// The parser takes plain paragraph text and returns a polymorphic tree of +/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator. +/// Node types form a closed class hierarchy rooted at MDNode. Each concrete +/// type carries exactly the fields it needs -- no overloaded Content field, +/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe +/// downcasting; each concrete type provides classof() for this purpose. /// -/// The main entry point is parseMarkdown(). If the text has no Markdown in it, -/// you get back an empty list and can fall back to plain-text output. If it -/// does, you get a tree of MDNode structs where each node has a kind, optional -/// content (like the language tag on a code fence), and optional children. +/// See +/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates /// -/// All nodes are allocated in the arena you pass in. You own the arena and are -/// responsible for keeping it alive as long as you use the nodes. +/// Field ordering in each derived struct is chosen to minimize padding: +/// 4-byte fields (like Level or Start) are declared before 16-byte fields +/// (ArrayRef, StringRef) so that no implicit padding is inserted between the +/// base class's 4-byte Kind and the first derived field. /// -/// The parser handles fenced code blocks, pipe tables, and unordered lists. -/// Anything it does not recognize comes back as a plain text node. It will -/// never crash on bad input. 
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.): +/// TextNode -- plain text run +/// SoftBreakNode -- soft line break +/// HardBreakNode -- hard line break (trailing spaces or backslash) +/// InlineCodeNode -- inline code span (`code`) +/// EmphasisNode -- emphasis (*text* or _text_) +/// StrongNode -- strong emphasis (**text** or __text__) +/// +/// Block nodes: +/// ParagraphNode -- sequence of inline nodes +/// HeadingNode -- ATX heading (# through ######), level 1-6 +/// FencedCodeNode -- fenced code block (``` or ~~~) +/// TableNode -- pipe table (raw row text; TODO: structured cells) +/// UnorderedListNode -- bullet list (-, *, +) +/// OrderedListNode -- numbered list with explicit start number +/// ListItemNode -- single item inside a list +/// BlockQuoteNode -- block quote (>) +/// ThematicBreakNode -- horizontal rule (---, ***, ___) +/// +/// All nodes are arena-allocated. The caller owns the arena and must keep it +/// alive for the lifetime of any returned nodes. The parser never crashes on +/// malformed input; unrecognized text falls back to TextNode. /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" @@ -38,35 +58,217 @@ namespace clang::doc::markdown { +/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before +/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable +/// cheap range-based checks in classof() implementations. 
enum class NodeKind { + // Inline nodes + NK_Text, + NK_SoftBreak, + NK_HardBreak, + NK_InlineCode, + NK_Emphasis, + NK_Strong, + NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this + // Block nodes NK_Paragraph, + NK_Heading, NK_FencedCode, NK_Table, NK_UnorderedList, NK_OrderedList, NK_ListItem, + NK_BlockQuote, NK_ThematicBreak, - // Inline nodes - NK_Text, - NK_InlineCode, - NK_Emphasis, - NK_Strong, - NK_SoftBreak, + NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this }; +/// Base type for all Markdown AST nodes. Carries only the kind discriminator. +/// Nodes are arena-allocated and have no virtual destructor; use +/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting. struct MDNode { NodeKind Kind; - llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text - llvm::ArrayRef<MDNode> Children; // arena allocated + explicit MDNode(NodeKind K) : Kind(K) {} +}; + +//===----------------------------------------------------------------------===// +// Inline nodes +//===----------------------------------------------------------------------===// + +/// Plain text run. +struct TextNode : MDNode { + llvm::StringRef Text; + explicit TextNode(llvm::StringRef Text) + : MDNode(NodeKind::NK_Text), Text(Text) {} + static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; } +}; + +/// Soft line break -- a newline that does not end the paragraph. +struct SoftBreakNode : MDNode { + SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_SoftBreak; + } +}; + +/// Hard line break -- two trailing spaces or a backslash before a newline. +struct HardBreakNode : MDNode { + HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_HardBreak; + } +}; + +/// Inline code span: `code`. Code does not include the surrounding backticks. 
+struct InlineCodeNode : MDNode { + llvm::StringRef Code; + explicit InlineCodeNode(llvm::StringRef Code) + : MDNode(NodeKind::NK_InlineCode), Code(Code) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_InlineCode; + } +}; + +/// Emphasized text: *text* or _text_. +struct EmphasisNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Emphasis), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Emphasis; + } }; -/// Parses Markdown from a single comment paragraph's text. -/// Returns an empty ArrayRef if no Markdown constructs are found, -/// so generators can fall back to plain-text rendering at zero cost. -llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, - llvm::BumpPtrAllocator &Arena); +/// Strongly emphasized text: **text** or __text__. +struct StrongNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit StrongNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Strong), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Strong; + } +}; + +//===----------------------------------------------------------------------===// +// Block nodes +//===----------------------------------------------------------------------===// + +/// A paragraph -- sequence of inline nodes separated from other blocks by +/// blank lines. +struct ParagraphNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Paragraph), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Paragraph; + } +}; + +/// ATX heading: one to six leading # characters. Level is declared before +/// Children to avoid padding between the base class's 4-byte Kind and the +/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes. 
+struct HeadingNode : MDNode { + unsigned Level; // 1-6 + llvm::ArrayRef<MDNode *> Children; // inline content + HeadingNode(unsigned Level, llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_Heading), Level(Level), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_Heading; + } +}; + +/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g. +/// "cpp"); empty when no language was specified. Lines contains the raw text +/// of each interior line, without the opening or closing fence. +/// +/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up +/// to 3 spaces; the closing fence must use the same character and be at least +/// as long as the opening fence; only spaces may follow the closing fence. +struct FencedCodeNode : MDNode { + llvm::StringRef Lang; + llvm::ArrayRef<llvm::StringRef> Lines; + FencedCodeNode(llvm::StringRef Lang, llvm::ArrayRef<llvm::StringRef> Lines) + : MDNode(NodeKind::NK_FencedCode), Lang(Lang), Lines(Lines) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_FencedCode; + } +}; + +/// Pipe table. Rows contains the raw text of each row line including the +/// header and separator rows. +/// TODO: replace with a structured header/body/cell representation. +struct TableNode : MDNode { + llvm::ArrayRef<llvm::StringRef> Rows; + explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows) + : MDNode(NodeKind::NK_Table), Rows(Rows) {} + static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; } +}; + +/// A single list item. Children may contain block-level nodes for loose +/// lists, or a single inline sequence for tight lists. 
+struct ListItemNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit ListItemNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_ListItem), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_ListItem; + } +}; + +/// Unordered (bullet) list. Markers are -, *, or +. +struct UnorderedListNode : MDNode { + llvm::ArrayRef<ListItemNode *> Items; + explicit UnorderedListNode(llvm::ArrayRef<ListItemNode *> Items) + : MDNode(NodeKind::NK_UnorderedList), Items(Items) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_UnorderedList; + } +}; + +/// Ordered (numbered) list. Start is the number on the first item. Start is +/// declared before Items to avoid padding, keeping sizeof at 24 bytes. +struct OrderedListNode : MDNode { + unsigned Start; + llvm::ArrayRef<ListItemNode *> Items; + OrderedListNode(unsigned Start, llvm::ArrayRef<ListItemNode *> Items) + : MDNode(NodeKind::NK_OrderedList), Start(Start), Items(Items) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_OrderedList; + } +}; + +/// Block quote (> ...). Children are block-level nodes inside the quote. +struct BlockQuoteNode : MDNode { + llvm::ArrayRef<MDNode *> Children; + explicit BlockQuoteNode(llvm::ArrayRef<MDNode *> Children) + : MDNode(NodeKind::NK_BlockQuote), Children(Children) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_BlockQuote; + } +}; + +/// Thematic break: a line of three or more matching -, *, or _ characters. +struct ThematicBreakNode : MDNode { + ThematicBreakNode() : MDNode(NodeKind::NK_ThematicBreak) {} + static bool classof(const MDNode *N) { + return N->Kind == NodeKind::NK_ThematicBreak; + } +}; + +//===----------------------------------------------------------------------===// +// Parser entry point +//===----------------------------------------------------------------------===// + +/// Parse Markdown from a single paragraph of plain text.
Returns a list of +/// top-level block nodes allocated in Arena. Returns an empty ArrayRef for +/// blank input; text with no recognized construct falls back to TextNode. +/// The parser never crashes on malformed input. +/// +/// The caller must keep Arena alive for the lifetime of any returned nodes. +llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena); } // namespace clang::doc::markdown -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp index 4ca979c1f1d24..b61094f034375 100644 --- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -8,9 +8,11 @@ #include "support/Markdown.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" #include "gtest/gtest.h" using namespace clang::doc::markdown; +using namespace llvm; namespace { @@ -31,9 +33,8 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) { TEST_F(MarkdownParserTest, PlainText) { auto Nodes = parseMarkdown("hello world", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_Text); - EXPECT_EQ(N.Content, "hello world"); + auto *N = cast<TextNode>(Nodes[0]); + EXPECT_EQ(N->Text, "hello world"); } TEST_F(MarkdownParserTest, FencedCodeBlock) { @@ -42,10 +43,9 @@ int x = 0; ````````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "cpp"); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "cpp"); + ASSERT_EQ(N->Lines.size(), 1u); } TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) { @@ -54,9 +54,8 @@ some code ```````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const
auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(N.Content.empty()); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_TRUE(N->Lang.empty()); } TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { @@ -74,7 +73,7 @@ TEST_F(MarkdownParserTest, PipeTable) { | 1 | 2 |)", Arena); ASSERT_EQ(Nodes.size(), 1u); - EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); + EXPECT_TRUE(isa<TableNode>(Nodes[0])); } TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { @@ -82,8 +81,8 @@ TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { c | d)", Arena); // No separator row so should not be parsed as a table. - for (const auto &Node : Nodes) - EXPECT_NE(Node.Kind, NodeKind::NK_Table); + for (const auto *Node : Nodes) + EXPECT_FALSE(isa<TableNode>(Node)); } TEST_F(MarkdownParserTest, UnorderedList) { @@ -92,12 +91,11 @@ TEST_F(MarkdownParserTest, UnorderedList) { - baz)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList); - ASSERT_EQ(N.Children.size(), 3u); - EXPECT_EQ(N.Children[0].Content, "foo"); - EXPECT_EQ(N.Children[1].Content, "bar"); - EXPECT_EQ(N.Children[2].Content, "baz"); + auto *N = cast<UnorderedListNode>(Nodes[0]); + ASSERT_EQ(N->Items.size(), 3u); + EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo"); + EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar"); + EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz"); } TEST_F(MarkdownParserTest, MixedContent) { @@ -117,10 +115,9 @@ int x = 0; ~~~)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(N.Content.empty()); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_TRUE(N->Lang.empty()); + ASSERT_EQ(N->Lines.size(), 1u); } // CommonMark §4.5 example 120: tilde fence with a language tag. 
@@ -130,10 +127,9 @@ int x = 0; ~~~)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "cpp"); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "cpp"); + ASSERT_EQ(N->Lines.size(), 1u); } // CommonMark §4.5 example 122: a tilde line does not close a backtick fence. @@ -144,10 +140,9 @@ aaa ````````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); + auto *N = cast<FencedCodeNode>(Nodes[0]); // ~~~ is content, not a closing fence. - ASSERT_EQ(N.Children.size(), 2u); + ASSERT_EQ(N->Lines.size(), 2u); } // CommonMark §4.5 example 130: a code block can be empty. @@ -156,18 +151,16 @@ TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) { ```````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_TRUE(N.Children.empty()); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_TRUE(N->Lines.empty()); } // CommonMark §4.5 example 129: a code block may contain only blank lines. TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) { auto Nodes = parseMarkdown("```\n\n \n```", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - ASSERT_EQ(N.Children.size(), 2u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + ASSERT_EQ(N->Lines.size(), 2u); } // CommonMark §4.5 example 142: lang tag is captured from the info string. @@ -179,10 +172,9 @@ end ``````)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "ruby"); - ASSERT_EQ(N.Children.size(), 3u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "ruby"); + ASSERT_EQ(N->Lines.size(), 3u); } // CommonMark §4.5 example 146: tilde fence info string may contain backticks. 
@@ -192,10 +184,9 @@ foo ~~~)", Arena); ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - EXPECT_EQ(N.Content, "aa ``` ~~~"); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + EXPECT_EQ(N->Lang, "aa ``` ~~~"); + ASSERT_EQ(N->Lines.size(), 1u); } // CommonMark §4.5 example 124: closing fence must be at least as long as the @@ -209,9 +200,8 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) { // parser currently treats it as a closing fence. This test documents the // current (non-conformant) behavior. ASSERT_EQ(Nodes.size(), 1u); - const auto &N = Nodes[0]; - EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode); - ASSERT_EQ(N.Children.size(), 1u); + auto *N = cast<FencedCodeNode>(Nodes[0]); + ASSERT_EQ(N->Lines.size(), 1u); } } // namespace \ No newline at end of file >From 25fe7daff183f51a8b31ed0d8481b9a2f1fbdbd8 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 12 Jun 2026 02:33:44 -0400 Subject: [PATCH 5/5] [clang-doc] Introduce LineReader cursor for the Markdown parse loop Replace the raw size_t I = 0, E = Lines.size() index arithmetic in parseMarkdown() with a LineReader cursor that encapsulates the position and exposes peek(), peek(Offset), advance(), and atEnd(). The parse logic and emitted nodes are unchanged; this only removes manual index bookkeeping. All 18 MarkdownParserTest cases still pass. 
Co-Authored-By: Claude Opus 4.8 <[email protected]> --- .../clang-doc/support/Markdown.cpp | 73 ++++++++++++++----- 1 file changed, 54 insertions(+), 19 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index bee15c3e23ec3..f171457e73046 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/DebugLog.h" +#include <cassert> #define DEBUG_TYPE "clang-doc" @@ -52,6 +53,42 @@ static bool isListItem(StringRef Line) { Line.starts_with("+ "); } +// A forward cursor over the lines of a paragraph. Encapsulates the parse +// position so the loop can inspect the current or an upcoming line and consume +// lines without manual index arithmetic. Lines are stored untrimmed; callers +// trim where they need a normalized view. +class LineReader { +public: + explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {} + + // True once every line has been consumed. + bool atEnd() const { return Pos >= Lines.size(); } + + // The current line, untrimmed. Must not be called when atEnd(). + StringRef peek() const { + assert(!atEnd() && "peek past end of input"); + return Lines[Pos]; + } + + // The line Offset positions ahead of the cursor, or an empty StringRef when + // that position is past the end. peek(0) is the current line. + StringRef peek(size_t Offset) const { + size_t Target = Pos + Offset; + return Target < Lines.size() ? Lines[Target] : StringRef(); + } + + // Consume the current line and return it, untrimmed. Must not be called when + // atEnd(). 
+ StringRef advance() { + assert(!atEnd() && "advance past end of input"); + return Lines[Pos++]; + } + +private: + ArrayRef<StringRef> Lines; + size_t Pos = 0; +}; + ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, BumpPtrAllocator &Arena) { if (ParagraphText.trim().empty()) @@ -61,13 +98,13 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, ParagraphText.split(Lines, '\n'); SmallVector<MDNode *> Nodes; - size_t I = 0, E = Lines.size(); + LineReader Reader(Lines); - while (I < E) { - StringRef Line = Lines[I].trim(); + while (!Reader.atEnd()) { + StringRef Line = Reader.peek().trim(); if (Line.empty()) { - ++I; + Reader.advance(); continue; } @@ -79,18 +116,18 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, if (Line.starts_with("```") || Line.starts_with("~~~")) { char Fence = Line[0]; StringRef Lang = internString(Line.drop_front(3).trim(), Arena); + Reader.advance(); // consume opening fence SmallVector<StringRef> CodeLines; - ++I; - while (I < E) { - StringRef CodeLine = Lines[I].trim(); + while (!Reader.atEnd()) { + StringRef CodeLine = Reader.peek().trim(); if (CodeLine.size() >= 3 && all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; })) break; - CodeLines.push_back(internString(Lines[I], Arena)); - ++I; + CodeLines.push_back(internString(Reader.advance(), Arena)); } - ++I; // skip closing fence + if (!Reader.atEnd()) + Reader.advance(); // consume closing fence auto *Code = new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena)); LDBG() << "emitting FencedCodeNode lang='" << Lang @@ -100,12 +137,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, } // Pipe table: current line has | and next line is a separator row. 
- if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) { + if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) { SmallVector<StringRef> Rows; - while (I < E && Lines[I].trim().contains('|')) { - Rows.push_back(internString(Lines[I].trim(), Arena)); - ++I; - } + while (!Reader.atEnd() && Reader.peek().trim().contains('|')) + Rows.push_back(internString(Reader.advance().trim(), Arena)); auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena)); LDBG() << "emitting TableNode rows=" << Rows.size(); Nodes.push_back(Table); @@ -115,8 +150,8 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, // Unordered list item. if (isListItem(Line)) { SmallVector<ListItemNode *> Items; - while (I < E) { - StringRef L = Lines[I].trim(); + while (!Reader.atEnd()) { + StringRef L = Reader.peek().trim(); if (!isListItem(L)) break; StringRef ItemText = internString(L.drop_front(2).trim(), Arena); @@ -125,7 +160,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena)); Items.push_back(Item); - ++I; + Reader.advance(); } auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena)); LDBG() << "emitting UnorderedListNode items=" << Items.size(); @@ -135,7 +170,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText, // Plain text fallback. Nodes.push_back(new (Arena) TextNode(internString(Line, Arena))); - ++I; + Reader.advance(); } LDBG() << "parseMarkdown done nodes=" << Nodes.size(); _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
