https://github.com/Neil-N4 updated https://github.com/llvm/llvm-project/pull/200302
>From 87ed388807b9239da05c1433ae253456f44fcf1f Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Thu, 28 May 2026 19:23:48 -0400 Subject: [PATCH 1/8] [clang-doc] Add standalone Markdown parsing library --- clang-tools-extra/clang-doc/CMakeLists.txt | 1 + clang-tools-extra/clang-doc/Markdown.cpp | 133 +++++++++++++++++++++ clang-tools-extra/clang-doc/Markdown.h | 59 +++++++++ 3 files changed, 193 insertions(+) create mode 100644 clang-tools-extra/clang-doc/Markdown.cpp create mode 100644 clang-tools-extra/clang-doc/Markdown.h diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt index 22e2c8159e9f6..4f69385bdccc3 100644 --- a/clang-tools-extra/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_library(clangDoc STATIC Generators.cpp HTMLGenerator.cpp Mapper.cpp + Markdown.cpp MDGenerator.cpp Representation.cpp Serialize.cpp diff --git a/clang-tools-extra/clang-doc/Markdown.cpp b/clang-tools-extra/clang-doc/Markdown.cpp new file mode 100644 index 0000000000000..87053c94b0566 --- /dev/null +++ b/clang-tools-extra/clang-doc/Markdown.cpp @@ -0,0 +1,133 @@ +//===-- Markdown.cpp - Markdown Parser --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Markdown.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" + +namespace clang { +namespace doc { +namespace markdown { + +static MDNode makeText(llvm::StringRef S) { + return {NodeKind::Text, S, {}}; +} + +// A line is a table separator if it only contains |, -, :, and spaces, +// and has at least one -. +static bool isSepRow(llvm::StringRef Line) { + return llvm::all_of(Line, [](char C) { + return C == '|' || C == '-' || C == ':' || C == ' '; + }) && Line.contains('-'); +} + +static llvm::ArrayRef<MDNode> +allocateNodes(llvm::SmallVectorImpl<MDNode> &Nodes, + llvm::BumpPtrAllocator &Arena) { + if (Nodes.empty()) + return {}; + MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size()); + std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated); + return llvm::ArrayRef<MDNode>(Allocated, Nodes.size()); +} + +llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena) { + if (ParagraphText.trim().empty()) + return {}; + + llvm::SmallVector<llvm::StringRef, 16> Lines; + ParagraphText.split(Lines, '\n'); + + llvm::SmallVector<MDNode, 8> Nodes; + unsigned I = 0; + + while (I < Lines.size()) { + llvm::StringRef Line = Lines[I].trim(); + + if (Line.empty()) { + ++I; + continue; + } + + // Fenced code block: ``` or ~~~ + if (Line.starts_with("```") || Line.starts_with("~~~")) { + char Fence = Line[0]; + llvm::StringRef Lang = Line.drop_front(3).trim(); + llvm::SmallVector<MDNode, 4> CodeLines; + ++I; + while (I < Lines.size()) { + llvm::StringRef CodeLine = Lines[I].trim(); + if (CodeLine.size() >= 3 && + llvm::all_of(CodeLine.take_front(3), + [Fence](char C) { return C == Fence; })) + break; + CodeLines.push_back(makeText(Lines[I])); + ++I; + } + ++I; // skip closing fence + MDNode Code; + Code.Kind = NodeKind::FencedCode; + Code.Content = Lang; + Code.Children = allocateNodes(CodeLines, Arena); + Nodes.push_back(Code); + continue; + } + + // Pipe table: current line has | and next line is a separator row + if (Line.contains('|') && I + 1 < Lines.size() && + isSepRow(Lines[I + 1].trim())) { + llvm::SmallVector<MDNode, 4> Rows; + while (I < Lines.size() && Lines[I].trim().contains('|')) { + Rows.push_back(makeText(Lines[I].trim())); + ++I; + } + MDNode Table; + Table.Kind = NodeKind::Table; + Table.Content = {}; + Table.Children = allocateNodes(Rows, Arena); + Nodes.push_back(Table); + continue; + } + + // Unordered list item + if (Line.starts_with("- ") || Line.starts_with("* ") || + Line.starts_with("+ ")) { + llvm::SmallVector<MDNode, 4> Items; + while (I < Lines.size()) { + llvm::StringRef L = Lines[I].trim(); + if (!L.starts_with("- ") && !L.starts_with("* ") && + !L.starts_with("+ ")) + break; + MDNode Item; + Item.Kind = NodeKind::ListItem; + Item.Content = L.drop_front(2).trim(); + Item.Children = {}; + Items.push_back(Item); + ++I; + } + MDNode List; + List.Kind = NodeKind::UnorderedList; + List.Content = {}; + List.Children = allocateNodes(Items, Arena); + Nodes.push_back(List); + continue; + } + + // Plain text fallback + Nodes.push_back(makeText(Line)); + ++I; + } + + return allocateNodes(Nodes, Arena); +} + +} // namespace markdown +} // namespace doc +} // namespace clang \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/Markdown.h b/clang-tools-extra/clang-doc/Markdown.h new file mode 100644 index 0000000000000..c3374f06e2278 --- /dev/null +++ b/clang-tools-extra/clang-doc/Markdown.h @@ -0,0 +1,59 @@ +//===-- Markdown.h - Markdown Parser ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a standalone Markdown parsing library for the LLVM +// ecosystem. The parser takes plain text and returns a tree of typed nodes +// with no knowledge of comments, Doxygen, or Clang-Doc internals. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" + +namespace clang { +namespace doc { +namespace markdown { + +enum class NodeKind : uint8_t { + // Block nodes + Paragraph, + FencedCode, + Table, + UnorderedList, + OrderedList, + ListItem, + ThematicBreak, + // Inline nodes + Text, + InlineCode, + Emphasis, + Strong, + SoftBreak, +}; + +struct MDNode { + NodeKind Kind; + llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text + llvm::ArrayRef<MDNode> Children; // arena allocated +}; + +// Parses Markdown from a single comment paragraph's text. +// Returns an empty ArrayRef if no Markdown constructs are found, +// so generators can fall back to plain-text rendering at zero cost. +llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena); + +} // namespace markdown +} // namespace doc +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H \ No newline at end of file >From d350bf1e3fb4090bc65bcd7e5666e87c7b319b18 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Thu, 28 May 2026 19:34:38 -0400 Subject: [PATCH 2/8] [clang-doc] Fix formatting --- clang-tools-extra/clang-doc/Markdown.cpp | 8 +++++--- clang-tools-extra/clang-doc/Markdown.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-doc/Markdown.cpp b/clang-tools-extra/clang-doc/Markdown.cpp index 87053c94b0566..904d8e92dff17 100644 --- a/clang-tools-extra/clang-doc/Markdown.cpp +++ b/clang-tools-extra/clang-doc/Markdown.cpp @@ -22,9 +22,11 @@ static MDNode makeText(llvm::StringRef S) { // A line is a table separator if it only contains |, -, :, and spaces, // and has at least one -. static bool isSepRow(llvm::StringRef Line) { - return llvm::all_of(Line, [](char C) { - return C == '|' || C == '-' || C == ':' || C == ' '; - }) && Line.contains('-'); + return llvm::all_of(Line, + [](char C) { + return C == '|' || C == '-' || C == ':' || C == ' '; + }) && + Line.contains('-'); } static llvm::ArrayRef<MDNode> diff --git a/clang-tools-extra/clang-doc/Markdown.h b/clang-tools-extra/clang-doc/Markdown.h index c3374f06e2278..bf4815e068b53 100644 --- a/clang-tools-extra/clang-doc/Markdown.h +++ b/clang-tools-extra/clang-doc/Markdown.h @@ -42,7 +42,7 @@ enum class NodeKind : uint8_t { struct MDNode { NodeKind Kind; - llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text + llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text llvm::ArrayRef<MDNode> Children; // arena allocated }; >From 73a9197525f835ccd3ebee0fc89dad83d19bb0ad Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Thu, 28 May 2026 19:42:16 -0400 Subject: [PATCH 3/8] [clang-doc] Fix formatting --- clang-tools-extra/clang-doc/Markdown.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-doc/Markdown.cpp b/clang-tools-extra/clang-doc/Markdown.cpp index 904d8e92dff17..17ee61369fb6b 100644 --- a/clang-tools-extra/clang-doc/Markdown.cpp +++ b/clang-tools-extra/clang-doc/Markdown.cpp @@ -15,9 +15,7 @@ namespace clang { namespace doc { namespace markdown { -static MDNode makeText(llvm::StringRef S) { - return {NodeKind::Text, S, {}}; -} +static MDNode makeText(llvm::StringRef S) { return {NodeKind::Text, S, {}}; } // A line is a table separator if it only contains |, -, :, and spaces, // and has at least one -. >From 1a899ed0122a7a15787ebe1760225b46289d50ba Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 29 May 2026 14:21:18 -0400 Subject: [PATCH 4/8] [clang-doc] Move Markdown library to support folder, fix headers and enum prefixes --- clang-tools-extra/clang-doc/CMakeLists.txt | 1 - .../clang-doc/support/CMakeLists.txt | 3 +- .../clang-doc/support/Markdown.cpp | 133 ++++++++++++++++++ .../clang-doc/support/Markdown.h | 59 ++++++++ 4 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt index 4f69385bdccc3..22e2c8159e9f6 100644 --- a/clang-tools-extra/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/CMakeLists.txt @@ -12,7 +12,6 @@ add_clang_library(clangDoc STATIC Generators.cpp HTMLGenerator.cpp Mapper.cpp - Markdown.cpp MDGenerator.cpp Representation.cpp Serialize.cpp diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt index 8ac913ffbe998..acff865190ff9 100644 --- a/clang-tools-extra/clang-doc/support/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt @@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangDocSupport STATIC File.cpp + Markdown.cpp Utils.cpp - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp new file mode 100644 index 0000000000000..17ee61369fb6b --- /dev/null +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -0,0 +1,133 @@ +//===-- Markdown.cpp - Markdown Parser --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Markdown.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" + +namespace clang { +namespace doc { +namespace markdown { + +static MDNode makeText(llvm::StringRef S) { return {NodeKind::Text, S, {}}; } + +// A line is a table separator if it only contains |, -, :, and spaces, +// and has at least one -. +static bool isSepRow(llvm::StringRef Line) { + return llvm::all_of(Line, + [](char C) { + return C == '|' || C == '-' || C == ':' || C == ' '; + }) && + Line.contains('-'); +} + +static llvm::ArrayRef<MDNode> +allocateNodes(llvm::SmallVectorImpl<MDNode> &Nodes, + llvm::BumpPtrAllocator &Arena) { + if (Nodes.empty()) + return {}; + MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size()); + std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated); + return llvm::ArrayRef<MDNode>(Allocated, Nodes.size()); +} + +llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena) { + if (ParagraphText.trim().empty()) + return {}; + + llvm::SmallVector<llvm::StringRef, 16> Lines; + ParagraphText.split(Lines, '\n'); + + llvm::SmallVector<MDNode, 8> Nodes; + unsigned I = 0; + + while (I < Lines.size()) { + llvm::StringRef Line = Lines[I].trim(); + + if (Line.empty()) { + ++I; + continue; + } + + // Fenced code block: ``` or ~~~ + if (Line.starts_with("```") || Line.starts_with("~~~")) { + char Fence = Line[0]; + llvm::StringRef Lang = Line.drop_front(3).trim(); + llvm::SmallVector<MDNode, 4> CodeLines; + ++I; + while (I < Lines.size()) { + llvm::StringRef CodeLine = Lines[I].trim(); + if (CodeLine.size() >= 3 && + llvm::all_of(CodeLine.take_front(3), + [Fence](char C) { return C == Fence; })) + break; + CodeLines.push_back(makeText(Lines[I])); + ++I; + } + ++I; // skip closing fence + MDNode Code; + Code.Kind = NodeKind::FencedCode; + Code.Content = Lang; + Code.Children = allocateNodes(CodeLines, Arena); + Nodes.push_back(Code); + continue; + } + + // Pipe table: current line has | and next line is a separator row + if (Line.contains('|') && I + 1 < Lines.size() && + isSepRow(Lines[I + 1].trim())) { + llvm::SmallVector<MDNode, 4> Rows; + while (I < Lines.size() && Lines[I].trim().contains('|')) { + Rows.push_back(makeText(Lines[I].trim())); + ++I; + } + MDNode Table; + Table.Kind = NodeKind::Table; + Table.Content = {}; + Table.Children = allocateNodes(Rows, Arena); + Nodes.push_back(Table); + continue; + } + + // Unordered list item + if (Line.starts_with("- ") || Line.starts_with("* ") || + Line.starts_with("+ ")) { + llvm::SmallVector<MDNode, 4> Items; + while (I < Lines.size()) { + llvm::StringRef L = Lines[I].trim(); + if (!L.starts_with("- ") && !L.starts_with("* ") && + !L.starts_with("+ ")) + break; + MDNode Item; + Item.Kind = NodeKind::ListItem; + Item.Content = L.drop_front(2).trim(); + Item.Children = {}; + Items.push_back(Item); + ++I; + } + MDNode List; + List.Kind = NodeKind::UnorderedList; + List.Content = {}; + List.Children = allocateNodes(Items, Arena); + Nodes.push_back(List); + continue; + } + + // Plain text fallback + Nodes.push_back(makeText(Line)); + ++I; + } + + return allocateNodes(Nodes, Arena); +} + +} // namespace markdown +} // namespace doc +} // namespace clang \ No newline at end of file diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h new file mode 100644 index 0000000000000..bf4815e068b53 --- /dev/null +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -0,0 +1,59 @@ +//===-- Markdown.h - Markdown Parser ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines a standalone Markdown parsing library for the LLVM +// ecosystem. The parser takes plain text and returns a tree of typed nodes +// with no knowledge of comments, Doxygen, or Clang-Doc internals. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" + +namespace clang { +namespace doc { +namespace markdown { + +enum class NodeKind : uint8_t { + // Block nodes + Paragraph, + FencedCode, + Table, + UnorderedList, + OrderedList, + ListItem, + ThematicBreak, + // Inline nodes + Text, + InlineCode, + Emphasis, + Strong, + SoftBreak, +}; + +struct MDNode { + NodeKind Kind; + llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text + llvm::ArrayRef<MDNode> Children; // arena allocated +}; + +// Parses Markdown from a single comment paragraph's text. +// Returns an empty ArrayRef if no Markdown constructs are found, +// so generators can fall back to plain-text rendering at zero cost. +llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, + llvm::BumpPtrAllocator &Arena); + +} // namespace markdown +} // namespace doc +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H \ No newline at end of file >From 750b43aacf1705b707ae736e58deb1e55e5d169a Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 29 May 2026 14:27:09 -0400 Subject: [PATCH 5/8] [clang-doc] Fix enum prefixes and file headers --- .../clang-doc/support/Markdown.cpp | 21 +++++---- .../clang-doc/support/Markdown.h | 45 ++++++++++--------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index 17ee61369fb6b..bbce53fa17156 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -1,4 +1,4 @@ -//===-- Markdown.cpp - Markdown Parser --------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,16 +15,15 @@ namespace clang { namespace doc { namespace markdown { -static MDNode makeText(llvm::StringRef S) { return {NodeKind::Text, S, {}}; } +static MDNode makeText(llvm::StringRef S) { + return {NodeKind::NK_Text, S, {}}; +} // A line is a table separator if it only contains |, -, :, and spaces, // and has at least one -. static bool isSepRow(llvm::StringRef Line) { - return llvm::all_of(Line, - [](char C) { - return C == '|' || C == '-' || C == ':' || C == ' '; - }) && - Line.contains('-'); + return Line.contains('-') && + Line.find_first_not_of("|-: ") == llvm::StringRef::npos; } static llvm::ArrayRef<MDNode> @@ -73,7 +72,7 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, } ++I; // skip closing fence MDNode Code; - Code.Kind = NodeKind::FencedCode; + Code.Kind = NodeKind::NK_FencedCode; Code.Content = Lang; Code.Children = allocateNodes(CodeLines, Arena); Nodes.push_back(Code); @@ -89,7 +88,7 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, ++I; } MDNode Table; - Table.Kind = NodeKind::Table; + Table.Kind = NodeKind::NK_Table; Table.Content = {}; Table.Children = allocateNodes(Rows, Arena); Nodes.push_back(Table); @@ -106,14 +105,14 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, !L.starts_with("+ ")) break; MDNode Item; - Item.Kind = NodeKind::ListItem; + Item.Kind = NodeKind::NK_ListItem; Item.Content = L.drop_front(2).trim(); Item.Children = {}; Items.push_back(Item); ++I; } MDNode List; - List.Kind = NodeKind::UnorderedList; + List.Kind = NodeKind::NK_UnorderedList; List.Content = {}; List.Children = allocateNodes(Items, Arena); Nodes.push_back(List); diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index bf4815e068b53..e665170473601 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -1,15 +1,16 @@ -//===-- Markdown.h - Markdown Parser ----------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// This file defines a standalone Markdown parsing library for the LLVM -// ecosystem. The parser takes plain text and returns a tree of typed nodes -// with no knowledge of comments, Doxygen, or Clang-Doc internals. -// +/// +/// \file +/// This file defines a standalone Markdown parsing library for the LLVM +/// ecosystem. The parser takes plain text and returns a tree of typed nodes +/// with no knowledge of comments, Doxygen, or Clang-Doc internals. +/// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H @@ -23,21 +24,21 @@ namespace clang { namespace doc { namespace markdown { -enum class NodeKind : uint8_t { +enum class NodeKind { // Block nodes - Paragraph, - FencedCode, - Table, - UnorderedList, - OrderedList, - ListItem, - ThematicBreak, + NK_Paragraph, + NK_FencedCode, + NK_Table, + NK_UnorderedList, + NK_OrderedList, + NK_ListItem, + NK_ThematicBreak, // Inline nodes - Text, - InlineCode, - Emphasis, - Strong, - SoftBreak, + NK_Text, + NK_InlineCode, + NK_Emphasis, + NK_Strong, + NK_SoftBreak, }; struct MDNode { @@ -46,9 +47,9 @@ struct MDNode { llvm::ArrayRef<MDNode> Children; // arena allocated }; -// Parses Markdown from a single comment paragraph's text. -// Returns an empty ArrayRef if no Markdown constructs are found, -// so generators can fall back to plain-text rendering at zero cost. +/// Parses Markdown from a single comment paragraph's text. +/// Returns an empty ArrayRef if no Markdown constructs are found, +/// so generators can fall back to plain-text rendering at zero cost. llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, llvm::BumpPtrAllocator &Arena); >From 434e6328c40d4f10ab70b5a4e28ca70bc1e7edd5 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 29 May 2026 16:17:59 -0400 Subject: [PATCH 6/8] [clang-doc] Add unit tests for Markdown parser --- .../unittests/clang-doc/CMakeLists.txt | 4 +- .../clang-doc/MarkdownParserTest.cpp | 94 +++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt index 01b34ec9a791e..b74207ac88fa7 100644 --- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt @@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests ClangDocTest.cpp GeneratorTest.cpp HTMLGeneratorTest.cpp + MarkdownParserTest.cpp MDGeneratorTest.cpp MergeTest.cpp SerializeTest.cpp @@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests target_link_libraries(ClangDocTests PRIVATE clangDoc + clangDocSupport LLVMTestingSupport - ) + ) \ No newline at end of file diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp new file mode 100644 index 0000000000000..8df5efc7f1d5f --- /dev/null +++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "support/Markdown.h" +#include "llvm/Support/Allocator.h" +#include "gtest/gtest.h" + +using namespace clang::doc::markdown; + +namespace { + +TEST(MarkdownParserTest, EmptyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, WhitespaceOnlyInput) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown(" \n \n", Arena); + EXPECT_TRUE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("hello world", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text); + EXPECT_EQ(Nodes[0].Content, "hello world"); +} + +TEST(MarkdownParserTest, FencedCodeBlock) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_EQ(Nodes[0].Content, "cpp"); + ASSERT_EQ(Nodes[0].Children.size(), 1u); +} + +TEST(MarkdownParserTest, FencedCodeBlockNoLang) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```\nsome code\n```", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode); + EXPECT_TRUE(Nodes[0].Content.empty()); +} + +TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena); + // Unterminated fence should not crash and should produce a code node + // with whatever lines were found. + EXPECT_FALSE(Nodes.empty()); +} + +TEST(MarkdownParserTest, PipeTable) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("a | b\nc | d", Arena); + // No separator row so should not be parsed as a table + for (const auto &Node : Nodes) + EXPECT_NE(Node.Kind, NodeKind::NK_Table); +} + +TEST(MarkdownParserTest, UnorderedList) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena); + ASSERT_EQ(Nodes.size(), 1u); + EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList); + ASSERT_EQ(Nodes[0].Children.size(), 3u); + EXPECT_EQ(Nodes[0].Children[0].Content, "foo"); + EXPECT_EQ(Nodes[0].Children[1].Content, "bar"); + EXPECT_EQ(Nodes[0].Children[2].Content, "baz"); +} + +TEST(MarkdownParserTest, MixedContent) { + llvm::BumpPtrAllocator Arena; + auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena); + EXPECT_EQ(Nodes.size(), 3u); +} + +} // namespace \ No newline at end of file >From fb907ac64844b5aa7c0679e32884438dc454949a Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 29 May 2026 16:21:43 -0400 Subject: [PATCH 7/8] [clang-doc] Add design documentation to Markdown.h --- clang-tools-extra/clang-doc/support/Markdown.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h index e665170473601..0ae33e33e7eba 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.h +++ b/clang-tools-extra/clang-doc/support/Markdown.h @@ -11,6 +11,22 @@ /// ecosystem. The parser takes plain text and returns a tree of typed nodes /// with no knowledge of comments, Doxygen, or Clang-Doc internals. /// +/// This is a simple Markdown parser for use inside Clang-Doc's comment +/// pipeline. You give it a paragraph of text and an arena allocator, and it +/// gives back a list of typed nodes describing the Markdown structure it found. +/// +/// The main entry point is parseMarkdown(). If the text has no Markdown in it, +/// you get back an empty list and can fall back to plain-text output. If it +/// does, you get a tree of MDNode structs where each node has a kind, optional +/// content (like the language tag on a code fence), and optional children. +/// +/// All nodes are allocated in the arena you pass in. You own the arena and are +/// responsible for keeping it alive as long as you use the nodes. +/// +/// The parser handles fenced code blocks, pipe tables, and unordered lists. +/// Anything it does not recognize comes back as a plain text node. It will +/// never crash on bad input. +/// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H >From ddf1f20bb0af4ee6494cd3d5e53be965a5802430 Mon Sep 17 00:00:00 2001 From: Neil-N4 <[email protected]> Date: Fri, 5 Jun 2026 01:20:15 -0400 Subject: [PATCH 8/8] [clang-doc] Add LLVM_DEBUG macros to Markdown parser --- .../clang-doc/support/Markdown.cpp | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp index bbce53fa17156..49c141bc2baed 100644 --- a/clang-tools-extra/clang-doc/support/Markdown.cpp +++ b/clang-tools-extra/clang-doc/support/Markdown.cpp @@ -10,6 +10,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "clang-doc-markdown" namespace clang { namespace doc { @@ -38,12 +42,17 @@ allocateNodes(llvm::SmallVectorImpl<MDNode> &Nodes, llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, llvm::BumpPtrAllocator &Arena) { - if (ParagraphText.trim().empty()) + if (ParagraphText.trim().empty()) { + LLVM_DEBUG(llvm::dbgs() << "[md] empty input, returning nothing\n"); return {}; + } llvm::SmallVector<llvm::StringRef, 16> Lines; ParagraphText.split(Lines, '\n'); + LLVM_DEBUG(llvm::dbgs() << "[md] parsing " << Lines.size() + << " line(s)\n"); + llvm::SmallVector<MDNode, 8> Nodes; unsigned I = 0; @@ -59,14 +68,19 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, if (Line.starts_with("```") || Line.starts_with("~~~")) { char Fence = Line[0]; llvm::StringRef Lang = Line.drop_front(3).trim(); + LLVM_DEBUG(llvm::dbgs() << "[md] fenced code block, lang='" + << Lang << "'\n"); llvm::SmallVector<MDNode, 4> CodeLines; ++I; while (I < Lines.size()) { llvm::StringRef CodeLine = Lines[I].trim(); if (CodeLine.size() >= 3 && llvm::all_of(CodeLine.take_front(3), - [Fence](char C) { return C == Fence; })) + [Fence](char C) { return C == Fence; })) { + LLVM_DEBUG(llvm::dbgs() << "[md] closing fence found at line " + << I << "\n"); break; + } CodeLines.push_back(makeText(Lines[I])); ++I; } @@ -75,6 +89,8 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, Code.Kind = NodeKind::NK_FencedCode; Code.Content = Lang; Code.Children = allocateNodes(CodeLines, Arena); + LLVM_DEBUG(llvm::dbgs() << "[md] emitting NK_FencedCode with " + << CodeLines.size() << " line(s)\n"); Nodes.push_back(Code); continue; } @@ -82,6 +98,8 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, // Pipe table: current line has | and next line is a separator row if (Line.contains('|') && I + 1 < Lines.size() && isSepRow(Lines[I + 1].trim())) { + LLVM_DEBUG(llvm::dbgs() << "[md] pipe table detected at line " + << I << "\n"); llvm::SmallVector<MDNode, 4> Rows; while (I < Lines.size() && Lines[I].trim().contains('|')) { Rows.push_back(makeText(Lines[I].trim())); @@ -91,6 +109,8 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, Table.Kind = NodeKind::NK_Table; Table.Content = {}; Table.Children = allocateNodes(Rows, Arena); + LLVM_DEBUG(llvm::dbgs() << "[md] emitting NK_Table with " + << Rows.size() << " row(s)\n"); Nodes.push_back(Table); continue; } @@ -98,6 +118,8 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, // Unordered list item if (Line.starts_with("- ") || Line.starts_with("* ") || Line.starts_with("+ ")) { + LLVM_DEBUG(llvm::dbgs() << "[md] unordered list at line " << I + << "\n"); llvm::SmallVector<MDNode, 4> Items; while (I < Lines.size()) { llvm::StringRef L = Lines[I].trim(); @@ -115,15 +137,20 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText, List.Kind = NodeKind::NK_UnorderedList; List.Content = {}; List.Children = allocateNodes(Items, Arena); + LLVM_DEBUG(llvm::dbgs() << "[md] emitting NK_UnorderedList with " + << Items.size() << " item(s)\n"); Nodes.push_back(List); continue; } // Plain text fallback + LLVM_DEBUG(llvm::dbgs() << "[md] plain text: '" << Line << "'\n"); Nodes.push_back(makeText(Line)); ++I; } + LLVM_DEBUG(llvm::dbgs() << "[md] done, " << Nodes.size() + << " top-level node(s)\n"); return allocateNodes(Nodes, Arena); } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
