================ @@ -0,0 +1,665 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Markdown.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/DebugLog.h" +#include "llvm/Support/StringSaver.h" +#include <cassert> +#include <memory> +#include <string> + +#define DEBUG_TYPE "clang-doc" + +using namespace llvm; + +namespace clang::doc::markdown { + +// Allocates a contiguous array of T in the arena and returns an ArrayRef. +template <typename T> +static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec, + BumpPtrAllocator &Arena) { + if (Vec.empty()) + return {}; + T *Allocated = Arena.Allocate<T>(Vec.size()); + std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated); + return ArrayRef<T>(Allocated, Vec.size()); +} + +// A line is a table separator if it only contains |, -, :, and spaces, +// and has at least one -. +static bool isSepRow(StringRef Line) { + return Line.contains('-') && + Line.find_first_not_of("|-: ") == StringRef::npos; +} + +// Returns true if Line begins with a bullet list marker (-, *, or +) +// followed by a space. +static bool isListItem(StringRef Line) { + return Line.starts_with("- ") || Line.starts_with("* ") || + Line.starts_with("+ "); +} + +// Returns true if Line begins with an ordered list marker: one or more digits +// followed by a period and a space (e.g. "1. ", "42. "). +static bool isOrderedListItem(StringRef Line) { + size_t Dot = Line.find_first_not_of("0123456789"); + return Dot != StringRef::npos && Dot > 0 && Line[Dot] == '.' && + Dot + 1 < Line.size() && Line[Dot + 1] == ' '; +} + +// Returns true if Line is a thematic break: three or more matching -, *, or _ +// characters, optionally separated by spaces, with nothing else. Line is +// expected to be trimmed. +static bool isThematicBreak(StringRef Line) { + char Marker = Line.empty() ? '\0' : Line[0]; + if (Marker != '-' && Marker != '*' && Marker != '_') + return false; + unsigned Count = 0; + for (char C : Line) { + if (C == Marker) + ++Count; + else if (C != ' ') + return false; + } + return Count >= 3; +} + +// Returns true if Line is a block quote line: it starts with "> ", or is a bare +// ">" marking an empty quote line. +static bool isBlockQuote(StringRef Line) { + return Line.starts_with("> ") || Line == ">"; +} + +// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to +// six leading # characters followed by a space. Returns 0 otherwise, so seven +// or more # characters fall back to plain text. +static unsigned atxHeadingLevel(StringRef Line) { + size_t Level = Line.find_first_not_of('#'); + if (Level == StringRef::npos || Level < 1 || Level > 6 || Line[Level] != ' ') + return 0; + return Level; +} + +// A forward cursor over the lines of a paragraph. Lines are stored untrimmed; +// callers trim where they need a normalized view. +class LineReader { +public: + explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {} + + // True once every line has been consumed. + bool atEnd() const { return Pos >= Lines.size(); } + + // The current line, untrimmed. Must not be called when atEnd(). + StringRef peek() const { + assert(!atEnd() && "peek past end of input"); + return Lines[Pos]; + } + + // The line Offset positions ahead of the cursor, or an empty StringRef when + // that position is past the end. peek(0) is the current line. + StringRef peek(size_t Offset) const { + size_t Target = Pos + Offset; + return Target < Lines.size() ? Lines[Target] : StringRef(); + } + + // Consume the current line and return it, untrimmed. Must not be called when + // atEnd(). + StringRef advance() { + assert(!atEnd() && "advance past end of input"); + return Lines[Pos++]; + } + +private: + ArrayRef<StringRef> Lines; + size_t Pos = 0; +}; + +// A forward cursor over the characters of a string. position() and seek() let +// it interoperate with the index-based run and delimiter helpers below. +class CharReader { +public: + explicit CharReader(StringRef S) : S(S) {} + + // True once every character has been consumed. + bool atEnd() const { return Pos >= S.size(); } + + // The current character. Must not be called when atEnd(). + char peek() const { + assert(!atEnd() && "peek past end of input"); + return S[Pos]; + } + + // Consume the current character and return it. Must not be called when + // atEnd(). + char advance() { + assert(!atEnd() && "advance past end of input"); + return S[Pos++]; + } + + // The current scan position, for substring, run, and delimiter computations. + size_t position() const { return Pos; } + + // Move the cursor to an absolute position, used to skip past a matched span. + void seek(size_t NewPos) { Pos = NewPos; } + +private: + StringRef S; + size_t Pos = 0; +}; + +// Returns the number of consecutive copies of C starting at S[Start]. +static size_t countRun(StringRef S, size_t Start, char C) { + size_t I = Start; + while (I < S.size() && S[I] == C) + ++I; + return I - Start; +} + +// Strips one leading and one trailing space from a code span's content when +// both are present and the content is not all spaces, per CommonMark §6.1. +static StringRef trimCodeSpan(StringRef Code) { + if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' && + Code.find_first_not_of(' ') != StringRef::npos) + return Code.drop_front().drop_back(); + return Code; +} + +// Treats the start and end of the string (passed as '\0') as whitespace for the +// CommonMark flanking rules. +static bool isFlankWhitespace(char C) { return C == '\0' || isSpace(C); } + +// Computes whether a delimiter run can open or close emphasis, from the +// characters immediately before and after the run, per the CommonMark §6.2 +// flanking rules. Before and After are '\0' at the string boundaries. +static void computeFlanking(char Before, char Marker, char After, bool &CanOpen, + bool &CanClose) { + bool AfterWS = isFlankWhitespace(After); + bool BeforeWS = isFlankWhitespace(Before); + bool AfterPunct = isPunct(After); + bool BeforePunct = isPunct(Before); + bool LeftFlanking = !AfterWS && (!AfterPunct || BeforeWS || BeforePunct); + bool RightFlanking = !BeforeWS && (!BeforePunct || AfterWS || AfterPunct); + if (Marker == '_') { + // Underscore does not open or close emphasis intraword. + CanOpen = LeftFlanking && (!RightFlanking || BeforePunct); + CanClose = RightFlanking && (!LeftFlanking || AfterPunct); + } else { + CanOpen = LeftFlanking; + CanClose = RightFlanking; + } +} + +namespace { +// One piece of inline content while emphasis is being resolved. A piece is +// either a finished content node (text, code span, or a built emphasis or +// strong node) or a run of delimiter characters that may still open or close +// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs +// can be spliced out without shifting the others. +struct InlinePiece { + MDNode *Node = nullptr; // content node, or null while this is a delimiter run + char Ch = 0; // '*' or '_' for a delimiter run + size_t Len = 0; // delimiters still available in the run + unsigned OrigLen = 0; // original run length, for the multiple-of-three rule + bool CanOpen = false; + bool CanClose = false; + int Prev = -1; + int Next = -1; +}; +} // namespace + +// Parses the inline content of a single line into a sequence of inline nodes: +// inline code (`code`), emphasis (*text* or _text_), and strong (**text** or +// __text__). Emphasis is resolved with a CommonMark-style delimiter stack: a +// first pass tokenizes the line into text, code spans, and delimiter runs (each +// tagged with its flanking flags), then a second pass walks closers back to +// openers, honoring the multiple-of-three rule. Unmatched runs stay as text. +// +// TODO: This does not yet handle links, autolinks, or backslash escapes. +static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena, + StringSaver &Saver) { + SmallVector<InlinePiece> Pool; + int Head = -1, Tail = -1; + + auto makePiece = [&]() -> int { + Pool.emplace_back(); + return Pool.size() - 1; + }; + auto linkAtTail = [&](int Idx) { + Pool[Idx].Prev = Tail; + (Tail != -1 ? Pool[Tail].Next : Head) = Idx; + Tail = Idx; + }; + auto appendNode = [&](MDNode *N) { + int Idx = makePiece(); + Pool[Idx].Node = N; + linkAtTail(Idx); + }; + // Content nodes pass through; a leftover delimiter run becomes a TextNode of + // its remaining characters. + auto pieceNode = [&](int P) -> MDNode * { + if (Pool[P].Node) + return Pool[P].Node; + return new (Arena) + TextNode(Saver.save(std::string(Pool[P].Len, Pool[P].Ch))); + }; + // Merges adjacent TextNodes so unmatched delimiters coalesce with neighboring + // text, then copies the result into the arena. + auto finalize = [&](SmallVectorImpl<MDNode *> &Nodes) -> ArrayRef<MDNode *> { + SmallVector<MDNode *> Merged; + for (MDNode *Nd : Nodes) { + if (isa<TextNode>(Nd) && !Merged.empty() && + isa<TextNode>(Merged.back())) { + StringRef Prev = cast<TextNode>(Merged.back())->Text; + StringRef Cur = cast<TextNode>(Nd)->Text; + Merged.back() = + new (Arena) TextNode(Saver.save(Prev.str() + Cur.str())); + } else { + Merged.push_back(Nd); + } + } + return allocateArray(Merged, Arena); + }; + + // Phase 1: tokenize the line into text, code spans, and delimiter runs. ---------------- ilovepi wrote:
I think phase 1 and 2 should have a more clear split than just a section w/in this long function. https://github.com/llvm/llvm-project/pull/202991 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
