https://github.com/Neil-N4 created https://github.com/llvm/llvm-project/pull/202991
Adds a standalone Markdown parsing library under clang-doc/support. The parser takes plain paragraph text and returns a flat list of typed nodes (fenced code blocks, pipe tables, unordered lists, plain text fallback). No knowledge of Doxygen or Clang-Doc internals. All nodes are arena-allocated. Returns an empty ArrayRef if no Markdown constructs are found so generators can fall back to plain text at zero cost. Currently handles: fenced code blocks (``` or ~~~), pipe tables, unordered lists, plain text. 10 unit tests included. Integration into the JSON generator is tracked separately. Assisted-by: Claude >From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Wed, 10 Jun 2026 09:51:48 -0400 Subject: [PATCH] [clang-doc] Add standalone Markdown parsing library --- .../clang-doc/support/CMakeLists.txt | 3 +- .../clang-doc/support/Markdown.cpp | 145 ++++++++++++++++++ .../clang-doc/support/Markdown.h | 72 +++++++++ .../unittests/clang-doc/CMakeLists.txt | 4 +- .../clang-doc/MarkdownParserTest.cpp | 94 ++++++++++++ 5 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt index 8ac913ffbe998..acff865190ff9 100644 --- a/clang-tools-extra/clang-doc/support/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt @@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangDocSupport STATIC File.cpp + Markdown.cpp Utils.cpp - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp new file mode 100644 index 0000000000000..776150b939d27 --- /dev/null +++ 
b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -0,0 +1,145 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Markdown.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/DebugLog.h" + +#define DEBUG_TYPE "clang-doc-markdown" + +using namespace llvm; + +namespace clang::doc::markdown { + +static MDNode makeText(StringRef S) { + return {NodeKind::NK_Text, S, {}}; +} + +// A line is a table separator if it only contains |, -, :, and spaces, +// and has at least one -. +static bool isSepRow(StringRef Line) { + return Line.contains('-') && + Line.find_first_not_of("|-: ") == StringRef::npos; +} + +// Returns true if Line begins with a bullet list marker (-, *, or +) +// followed by a space. 
+static bool isListItem(StringRef Line) { + return Line.starts_with("- ") || Line.starts_with("* ") || + Line.starts_with("+ "); +} + +static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes, + BumpPtrAllocator &Arena) { + if (Nodes.empty()) + return {}; + MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size()); + std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated); + return ArrayRef<MDNode>(Allocated, Nodes.size()); +} + +ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText, + BumpPtrAllocator &Arena) { + if (ParagraphText.trim().empty()) + return {}; + + SmallVector<StringRef, 16> Lines; + ParagraphText.split(Lines, '\n'); + + SmallVector<MDNode> Nodes; + size_t I = 0, E = Lines.size(); + + while (I < E) { + StringRef Line = Lines[I].trim(); + + if (Line.empty()) { + ++I; + continue; + } + + // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be + // indented up to 3 spaces, the closing fence must use the same character + // and be at least as long as the opening fence, and the closing fence may + // only be followed by spaces. Doxygen specifics should be handled on a + // case-by-case basis. + if (Line.starts_with("```") || Line.starts_with("~~~")) { + char Fence = Line[0]; + StringRef Lang = Line.drop_front(3).trim(); + SmallVector<MDNode> CodeLines; + ++I; + while (I < E) { + StringRef CodeLine = Lines[I].trim(); + if (CodeLine.size() >= 3 && + all_of(CodeLine.take_front(3), + [Fence](char C) { return C == Fence; })) + break; + CodeLines.push_back(makeText(Lines[I])); + ++I; + } + ++I; // skip closing fence + MDNode Code; + Code.Kind = NodeKind::NK_FencedCode; + Code.Content = Lang; + Code.Children = allocateNodes(CodeLines, Arena); + LDBG() << "emitting NK_FencedCode lang='" << Lang + << "' lines=" << CodeLines.size(); + Nodes.push_back(Code); + continue; + } + + // Pipe table: current line has | and next line is a separator row. 
+ if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) { + SmallVector<MDNode> Rows; + while (I < E && Lines[I].trim().contains('|')) { + Rows.push_back(makeText(Lines[I].trim())); + ++I; + } + MDNode Table; + Table.Kind = NodeKind::NK_Table; + Table.Content = {}; + Table.Children = allocateNodes(Rows, Arena); + LDBG() << "emitting NK_Table rows=" << Rows.size(); + Nodes.push_back(Table); + continue; + } + + // Unordered list item. + if (isListItem(Line)) { + SmallVector<MDNode> Items; + while (I < E) { + StringRef L = Lines[I].trim(); + if (!isListItem(L)) + break; + MDNode Item; + Item.Kind = NodeKind::NK_ListItem; + Item.Content = L.drop_front(2).trim(); + Item.Children = {}; + Items.push_back(Item); + ++I; + } + MDNode List; + List.Kind = NodeKind::NK_UnorderedList; + List.Content = {}; + List.Children = allocateNodes(Items, Arena); + LDBG() << "emitting NK_UnorderedList items=" << Items.size(); + Nodes.push_back(List); + continue; + } + + // Plain text fallback. + Nodes.push_back(makeText(Line)); + ++I; + } + + LDBG() << "parseMarkdown done nodes=" << Nodes.size(); + return allocateNodes(Nodes, Arena); +} + +} // namespace clang::doc::markdown \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h new file mode 100644 index 0000000000000..890f764f937b1 --- /dev/null +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines a standalone Markdown parsing library for the LLVM +/// ecosystem. 
The parser takes plain text and returns a tree of typed nodes +/// with no knowledge of comments, Doxygen, or Clang-Doc internals. +/// +/// This is a simple Markdown parser for use inside Clang-Doc's comment +/// pipeline. You give it a paragraph of text and an arena allocator, and it +/// gives back a list of typed nodes describing the Markdown structure it found. +/// +/// The main entry point is parseMarkdown(). If the text has no Markdown in it, +/// you get back an empty list and can fall back to plain-text output. If it +/// does, you get a tree of MDNode structs where each node has a kind, optional +/// content (like the language tag on a code fence), and optional children. +/// +/// All nodes are allocated in the arena you pass in. You own the arena and are +/// responsible for keeping it alive as long as you use the nodes. +/// +/// The parser handles fenced code blocks, pipe tables, and unordered lists. +/// Anything it does not recognize comes back as a plain text node. It will +/// never crash on bad input. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" + +namespace clang::doc::markdown { + +enum class NodeKind { + // Block nodes + NK_Paragraph, + NK_FencedCode, + NK_Table, + NK_UnorderedList, + NK_OrderedList, + NK_ListItem, + NK_ThematicBreak, + // Inline nodes + NK_Text, + NK_InlineCode, + NK_Emphasis, + NK_Strong, + NK_SoftBreak, +}; + +struct MDNode { + NodeKind Kind; + llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text + llvm::ArrayRef<MDNode> Children; // arena allocated +}; + +/// Parses Markdown from a single comment paragraph's text. +/// Returns an empty ArrayRef if no Markdown constructs are found, +/// so generators can fall back to plain-text rendering at zero cost. 
+llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena); + +} // namespace clang::doc::markdown + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H \ No newline at end of file diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt index 01b34ec9a791e..b74207ac88fa7 100644 --- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt @@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests ClangDocTest.cpp GeneratorTest.cpp HTMLGeneratorTest.cpp + MarkdownParserTest.cpp MDGeneratorTest.cpp MergeTest.cpp SerializeTest.cpp @@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests target_link_libraries(ClangDocTests PRIVATE clangDoc + clangDocSupport LLVMTestingSupport - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp new file mode 100644 index 0000000000000..8df5efc7f1d5f --- /dev/null +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "support/Markdown.h" +#include "llvm/Support/Allocator.h" +#include "gtest/gtest.h" + +using namespace clang::doc::markdown; + +namespace { + +TEST(MarkdownParserTest, EmptyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, WhitespaceOnlyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown(" \n \n", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("hello world", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text); + EXPECT_EQ(Nodes[0].Content, "hello world"); +} + +TEST(MarkdownParserTest, FencedCodeBlock) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(Nodes[0].Content, "cpp"); + ASSERT_EQ(Nodes[0].Children.size(), 1u); +} + +TEST(MarkdownParserTest, FencedCodeBlockNoLang) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```\nsome code\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(Nodes[0].Content.empty()); +} + +TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena); + // Unterminated fence should not crash and should produce a code node + // with whatever lines were found. 
+ EXPECT_FALSE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PipeTable) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("a | b\nc | d", Arena); + // No separator row so should not be parsed as a table + for (const auto &Node : Nodes) + EXPECT_NE(Node.Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, UnorderedList) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList); + ASSERT_EQ(Nodes[0].Children.size(), 3u); + EXPECT_EQ(Nodes[0].Children[0].Content, "foo"); + EXPECT_EQ(Nodes[0].Children[1].Content, "bar"); + EXPECT_EQ(Nodes[0].Children[2].Content, "baz"); +} + +TEST(MarkdownParserTest, MixedContent) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena); + EXPECT_EQ(Nodes.size(), 3u); +} + +} // namespace \ No newline at end of file _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
