https://github.com/localspook created 
https://github.com/llvm/llvm-project/pull/172508

We have several checks that want to relex source code, but right now, doing so 
is annoying; the `Lexer` API is difficult to use. This PR introduces a 
ranges-style wrapper for it and converts some checks to use the new API.

>From 84cd50de596444f93f785d27d89d946748eb4d29 Mon Sep 17 00:00:00 2001
From: Victor Chernyakin <[email protected]>
Date: Tue, 16 Dec 2025 07:27:53 -0800
Subject: [PATCH] [clang-tidy] Add ranges-style view for tokenizing source code

---
 .../bugprone/ArgumentCommentCheck.cpp         | 15 +---
 .../clang-tidy/modernize/MacroToEnumCheck.cpp | 35 ++++-----
 .../modernize/RedundantVoidArgCheck.cpp       | 11 +--
 .../clang-tidy/modernize/UseOverrideCheck.cpp | 13 +---
 .../modernize/UseTrailingReturnTypeCheck.cpp  | 55 ++++++--------
 .../readability/SimplifyBooleanExprCheck.cpp  | 23 ++----
 .../clang-tidy/utils/LexerUtils.h             | 74 +++++++++++++++++++
 7 files changed, 124 insertions(+), 102 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
index ed30d01e645d1..235c68eea08b4 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ArgumentCommentCheck.cpp
@@ -94,19 +94,8 @@ getCommentsInRange(ASTContext *Ctx, CharSourceRange Range) {
   if (Invalid)
     return Comments;
 
-  const char *StrData = Buffer.data() + BeginLoc.second;
-
-  Lexer TheLexer(SM.getLocForStartOfFile(BeginLoc.first), Ctx->getLangOpts(),
-                 Buffer.begin(), StrData, Buffer.end());
-  TheLexer.SetCommentRetentionState(true);
-
-  while (true) {
-    Token Tok;
-    if (TheLexer.LexFromRawLexer(Tok))
-      break;
-    if (Tok.getLocation() == Range.getEnd() || Tok.is(tok::eof))
-      break;
-
+  for (const Token Tok :
+       utils::lexer::tokensIncludingComments(Range, SM, Ctx->getLangOpts())) {
     if (Tok.is(tok::comment)) {
       const std::pair<FileID, unsigned> CommentLoc =
           SM.getDecomposedLoc(Tok.getLocation());
diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
index 098d46cae5df4..8f88daf1ea7cc 100644
--- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "MacroToEnumCheck.h"
+#include "../utils/LexerUtils.h"
 #include "IntegralLiteralExpressionMatcher.h"
-
 #include "clang/AST/ASTContext.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/Lex/Preprocessor.h"
@@ -19,17 +19,14 @@
 
 namespace clang::tidy::modernize {
 
-static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options,
-                            StringRef Text) {
+static bool hasOnlyComments(SourceLocation Loc, const SourceManager &SM,
+                            const LangOptions &Options,
+                            CharSourceRange CharRange) {
   // Use a lexer to look for tokens; if we find something other than a single
   // hash, then there were intervening tokens between macro definitions.
-  const std::string Buffer{Text};
-  Lexer Lex(Loc, Options, Buffer.c_str(), Buffer.c_str(),
-            Buffer.c_str() + Buffer.size());
-  Token Tok;
   bool SeenHash = false;
-  while (!Lex.LexFromRawLexer(Tok)) {
-    if (Tok.getKind() == tok::hash && !SeenHash) {
+  for (const Token Tok : utils::lexer::tokens(CharRange, SM, Options)) {
+    if (Tok.is(tok::hash) && !SeenHash) {
       SeenHash = true;
       continue;
     }
@@ -46,6 +43,7 @@ static bool hasOnlyComments(SourceLocation Loc, const LangOptions &Options,
     CRLFCR,
   };
 
+  const StringRef Text = Lexer::getSourceText(CharRange, SM, Options);
   WhiteSpace State = WhiteSpace::Nothing;
   for (const char C : Text) {
     switch (C) {
@@ -237,8 +235,7 @@ bool MacroToEnumCallbacks::isConsecutiveMacro(const MacroDirective *MD) const {
       SourceRange{CurrentFile->LastMacroLocation, Define}, true};
   const CharSourceRange CharRange =
       Lexer::makeFileCharRange(BetweenMacros, SM, LangOpts);
-  const StringRef BetweenText = Lexer::getSourceText(CharRange, SM, LangOpts);
-  return hasOnlyComments(Define, LangOpts, BetweenText);
+  return hasOnlyComments(Define, SM, LangOpts, CharRange);
 }
 
 void MacroToEnumCallbacks::clearCurrentEnum(SourceLocation Loc) {
@@ -258,17 +255,11 @@ void MacroToEnumCallbacks::conditionStart(const SourceLocation &Loc) {
 }
 
 void MacroToEnumCallbacks::checkCondition(SourceRange Range) {
-  const CharSourceRange CharRange = Lexer::makeFileCharRange(
-      CharSourceRange::getTokenRange(Range), SM, LangOpts);
-  std::string Text = Lexer::getSourceText(CharRange, SM, LangOpts).str();
-  Lexer Lex(CharRange.getBegin(), LangOpts, Text.data(), Text.data(),
-            Text.data() + Text.size());
-  Token Tok;
-  bool End = false;
-  while (!End) {
-    End = Lex.LexFromRawLexer(Tok);
-    if (Tok.is(tok::raw_identifier) &&
-        Tok.getRawIdentifier().str() != "defined")
+  for (const Token Tok : utils::lexer::tokens(
+           Lexer::makeFileCharRange(CharSourceRange::getTokenRange(Range), SM,
+                                    LangOpts),
+           SM, LangOpts)) {
+    if (Tok.is(tok::raw_identifier) && Tok.getRawIdentifier() != "defined")
       checkName(Tok);
   }
 }
diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
index aa2db2146475b..d3125711b89c3 100644
--- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "RedundantVoidArgCheck.h"
+#include "../utils/LexerUtils.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Lex/Lexer.h"
 
@@ -127,12 +128,6 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens(
   const CharSourceRange CharRange =
       Lexer::makeFileCharRange(CharSourceRange::getTokenRange(Range),
                                *Result.SourceManager, getLangOpts());
-
-  std::string DeclText =
-      Lexer::getSourceText(CharRange, *Result.SourceManager, getLangOpts())
-          .str();
-  Lexer PrototypeLexer(CharRange.getBegin(), getLangOpts(), DeclText.data(),
-                       DeclText.data(), DeclText.data() + DeclText.size());
   enum class TokenState {
     Start,
     MacroId,
@@ -149,7 +144,9 @@ void RedundantVoidArgCheck::removeVoidArgumentTokens(
   const std::string Diagnostic =
       ("redundant void argument list in " + GrammarLocation).str();
 
-  while (!PrototypeLexer.LexFromRawLexer(ProtoToken)) {
+  for (const Token Tok :
+       utils::lexer::tokens(CharRange, *Result.SourceManager, getLangOpts())) {
+    ProtoToken = Tok;
     switch (State) {
     case TokenState::Start:
       if (ProtoToken.is(tok::TokenKind::l_paren))
diff --git a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp
index dd516f8e51264..6de465afcca84 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseOverrideCheck.cpp
@@ -54,21 +54,12 @@ void UseOverrideCheck::registerMatchers(MatchFinder *Finder) {
 static SmallVector<Token, 16>
 parseTokens(CharSourceRange Range, const MatchFinder::MatchResult &Result) {
   const SourceManager &Sources = *Result.SourceManager;
-  const std::pair<FileID, unsigned> LocInfo =
-      Sources.getDecomposedLoc(Range.getBegin());
-  const StringRef File = Sources.getBufferData(LocInfo.first);
-  const char *TokenBegin = File.data() + LocInfo.second;
-  Lexer RawLexer(Sources.getLocForStartOfFile(LocInfo.first),
-                 Result.Context->getLangOpts(), File.begin(), TokenBegin,
-                 File.end());
   SmallVector<Token, 16> Tokens;
-  Token Tok;
   int NestedParens = 0;
-  while (!RawLexer.LexFromRawLexer(Tok)) {
+  for (Token Tok :
+       utils::lexer::tokens(Range, Sources, Result.Context->getLangOpts())) {
     if ((Tok.is(tok::semi) || Tok.is(tok::l_brace)) && NestedParens == 0)
       break;
-    if (Sources.isBeforeInTranslationUnit(Range.getEnd(), Tok.getLocation()))
-      break;
     if (Tok.is(tok::l_paren))
       ++NestedParens;
     else if (Tok.is(tok::r_paren))
diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
index 02865b65a9ec2..054213ea542b0 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "UseTrailingReturnTypeCheck.h"
+#include "../utils/LexerUtils.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
@@ -14,7 +15,6 @@
 #include "clang/Tooling/FixIt.h"
 #include "llvm/ADT/StringExtras.h"
 
-#include <cctype>
 #include <optional>
 
 namespace clang::tidy {
@@ -173,13 +173,11 @@ static SourceLocation findTrailingReturnTypeSourceLocation(
       Lexer::getLocForEndOfToken(ClosingParen, 0, SM, LangOpts);
 
   // Skip subsequent CV and ref qualifiers.
-  const std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(Result);
-  const StringRef File = SM.getBufferData(Loc.first);
-  const char *TokenBegin = File.data() + Loc.second;
-  Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(),
-              TokenBegin, File.end());
-  Token T;
-  while (!Lexer.LexFromRawLexer(T)) {
+  for (Token T : utils::lexer::tokens(
+           Lexer::makeFileCharRange(
+               CharSourceRange::getTokenRange(Result, F.getEndLoc()), SM,
+               LangOpts),
+           SM, LangOpts)) {
     if (T.is(tok::raw_identifier)) {
       IdentifierInfo &Info = Ctx.Idents.get(
           StringRef(SM.getCharacterData(T.getLocation()), T.getLength()));
@@ -255,15 +253,11 @@ classifyTokensBeforeFunctionName(const FunctionDecl &F, const ASTContext &Ctx,
   const SourceLocation BeginNameF = expandIfMacroId(F.getLocation(), SM);
 
   // Create tokens for everything before the name of the function.
-  const std::pair<FileID, unsigned> Loc = SM.getDecomposedLoc(BeginF);
-  const StringRef File = SM.getBufferData(Loc.first);
-  const char *TokenBegin = File.data() + Loc.second;
-  Lexer Lexer(SM.getLocForStartOfFile(Loc.first), LangOpts, File.begin(),
-              TokenBegin, File.end());
-  Token T;
   SmallVector<ClassifiedToken, 8> ClassifiedTokens;
-  while (!Lexer.LexFromRawLexer(T) &&
-         SM.isBeforeInTranslationUnit(T.getLocation(), BeginNameF)) {
+  for (Token T : utils::lexer::tokens(
+           Lexer::makeFileCharRange(
+               CharSourceRange::getCharRange(BeginF, BeginNameF), SM, LangOpts),
+           SM, LangOpts)) {
     if (T.is(tok::raw_identifier)) {
       IdentifierInfo &Info = Ctx.Idents.get(
           StringRef(SM.getCharacterData(T.getLocation()), T.getLength()));
@@ -367,25 +361,20 @@ static SourceLocation findLambdaTrailingReturnInsertLoc(
     else
       ParamEndLoc = Method->getParametersSourceRange().getEnd();
 
-    const std::pair<FileID, unsigned> ParamEndLocInfo =
-        SM.getDecomposedLoc(ParamEndLoc);
-    const StringRef Buffer = SM.getBufferData(ParamEndLocInfo.first);
-
-    Lexer Lexer(SM.getLocForStartOfFile(ParamEndLocInfo.first), LangOpts,
-                Buffer.begin(), Buffer.data() + ParamEndLocInfo.second,
-                Buffer.end());
-
-    Token Token;
-    while (!Lexer.LexFromRawLexer(Token)) {
-      if (Token.is(tok::raw_identifier)) {
-        IdentifierInfo &Info = Ctx.Idents.get(StringRef(
-            SM.getCharacterData(Token.getLocation()), Token.getLength()));
-        Token.setIdentifierInfo(&Info);
-        Token.setKind(Info.getTokenID());
+    for (Token T : utils::lexer::tokens(
+             Lexer::makeFileCharRange(CharSourceRange::getTokenRange(
+                                          ParamEndLoc, Method->getEndLoc()),
+                                      SM, LangOpts),
+             SM, LangOpts)) {
+      if (T.is(tok::raw_identifier)) {
+        IdentifierInfo &Info = Ctx.Idents.get(
+            StringRef(SM.getCharacterData(T.getLocation()), T.getLength()));
+        T.setIdentifierInfo(&Info);
+        T.setKind(Info.getTokenID());
       }
 
-      if (Token.is(tok::kw_requires))
-        return Token.getLocation().getLocWithOffset(-1);
+      if (T.is(tok::kw_requires))
+        return T.getLocation().getLocWithOffset(-1);
     }
 
     return {};
diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
index 1a9c161068030..baf77e6774061 100644
--- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SimplifyBooleanExprCheck.h"
+#include "../utils/LexerUtils.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/RecursiveASTVisitor.h"
 #include "clang/Basic/DiagnosticIDs.h"
@@ -237,22 +238,12 @@ static std::string replacementExpression(const ASTContext &Context,
 
 static bool containsDiscardedTokens(const ASTContext &Context,
                                     CharSourceRange CharRange) {
-  std::string ReplacementText =
-      Lexer::getSourceText(CharRange, Context.getSourceManager(),
-                           Context.getLangOpts())
-          .str();
-  Lexer Lex(CharRange.getBegin(), Context.getLangOpts(), ReplacementText.data(),
-            ReplacementText.data(),
-            ReplacementText.data() + ReplacementText.size());
-  Lex.SetCommentRetentionState(true);
-
-  Token Tok;
-  while (!Lex.LexFromRawLexer(Tok)) {
-    if (Tok.is(tok::TokenKind::comment) || Tok.is(tok::TokenKind::hash))
-      return true;
-  }
-
-  return false;
+  return llvm::any_of(
+      utils::lexer::tokensIncludingComments(
+          CharRange, Context.getSourceManager(), Context.getLangOpts()),
+      [](Token Tok) {
+        return Tok.isOneOf(tok::TokenKind::comment, tok::TokenKind::hash);
+      });
 }
 
 class SimplifyBooleanExprCheck::Visitor : public RecursiveASTVisitor<Visitor> {
diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.h b/clang-tools-extra/clang-tidy/utils/LexerUtils.h
index c5fb646c0efd9..9daf005a6cb00 100644
--- a/clang-tools-extra/clang-tidy/utils/LexerUtils.h
+++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.h
@@ -12,6 +12,8 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Basic/SourceManager.h"
+#include <iterator>
 #include <optional>
 #include <utility>
 
@@ -127,6 +129,78 @@ SourceLocation getUnifiedEndLoc(const Stmt &S, const SourceManager &SM,
 SourceLocation getLocationForNoexceptSpecifier(const FunctionDecl *FuncDecl,
                                                const SourceManager &SM);
 
+class TokenView {
+public:
+  class iterator { // NOLINT(readability-identifier-naming)
+  public:
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+    using iterator_category = std::input_iterator_tag;
+
+    iterator &operator++() {
+      if (View->RawLexer.getBufferLocation() < View->EndOfLexedRange)
+        View->RawLexer.LexFromRawLexer(View->Tok);
+      else
+        View = nullptr; // No more tokens.
+      return *this;
+    }
+
+    void operator++(int) { operator++(); }
+
+    friend bool operator==(iterator LHS, iterator RHS) {
+      return LHS.View == RHS.View;
+    }
+
+    friend bool operator!=(iterator LHS, iterator RHS) { return !(LHS == RHS); }
+
+    const Token &operator*() const { return View->Tok; }
+    const Token *operator->() const { return &View->Tok; }
+
+  private:
+    friend class TokenView;
+    iterator(TokenView *V) : View(V) {}
+    TokenView *View;
+  };
+
+  iterator begin() {
+    iterator It(this);
+    ++It;
+    return It;
+  }
+  iterator end() { return {nullptr}; }
+
+  TokenView(CharSourceRange Range, const SourceManager &SM,
+            const LangOptions &LangOpts, bool RetainComments)
+      : RawLexer([&]() -> Lexer {
+          const auto [FID, BeginOffset] = SM.getDecomposedLoc(Range.getBegin());
+          const auto [_, EndOffset] = SM.getDecomposedLoc(Range.getEnd());
+          const StringRef FileContents = SM.getBufferData(FID);
+          const StringRef LexedRange = {FileContents.begin() + BeginOffset,
+                                        EndOffset - BeginOffset};
+          EndOfLexedRange = LexedRange.end();
+          return {Range.getBegin(), LangOpts, LexedRange.begin(),
+                  LexedRange.begin(), FileContents.end()};
+        }()) {
+    RawLexer.SetCommentRetentionState(RetainComments);
+  }
+
+private:
+  Lexer RawLexer;
+  const char *EndOfLexedRange;
+  Token Tok;
+};
+
+inline TokenView tokens(CharSourceRange Range, const SourceManager &SM,
+                        const LangOptions &LangOpts) {
+  return {Range, SM, LangOpts, false};
+}
+
+inline TokenView tokensIncludingComments(CharSourceRange Range,
+                                         const SourceManager &SM,
+                                         const LangOptions &LangOpts) {
+  return {Range, SM, LangOpts, true};
+}
+
 } // namespace tidy::utils::lexer
 } // namespace clang
 

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to