https://github.com/ilovepi updated https://github.com/llvm/llvm-project/pull/159196
>From b929e27245223a126e2835a3440f1ae4f0a5aa57 Mon Sep 17 00:00:00 2001 From: Paul Kirth <paulki...@google.com> Date: Mon, 15 Sep 2025 23:27:50 -0700 Subject: [PATCH] [llvm][mustache] Use single pass when tokenizing The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins. Metric | Baseline | Single-Pass | Change -------------- | -------- | ----------- | ------- Time (ms) | 36.09 | 35.78 | -0.86% Cycles | 35.3M | 35.0M | -0.79% Instructions | 86.7M | 85.8M | -1.03% Branch Misses | 116K | 114K | -1.91% Cache Misses | 244K | 232K | -4.98% --- llvm/lib/Support/Mustache.cpp | 186 +++++++++++++--------------------- 1 file changed, 73 insertions(+), 113 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 4328f04a36891..080611edcb3fd 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -371,141 +371,101 @@ static const char *jsonKindToString(json::Value::Kind K) { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - auto [NewOpen, NewClose] = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen - << ", NewClose: " << NewClose << "\n"); - return std::make_pair(NewOpen, NewClose); -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; + + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start=" << Start << ", Open='" << Open - << "', Close='" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; + } - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + // Found a tag, first add the preceding text. + if (Cursor > TextStart) { + Tokens.emplace_back(Template.slice(TextStart, Cursor)); } - // Add the text before the tag. - if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. + Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. - // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) { + Tokens.emplace_back(Template.substr(TextStart)); + } + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. - // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits