[PATCH] D59887: [Syntax] Introduce TokenBuffer, start clangToolingSyntax library

Ilya Biryukov via Phabricator via cfe-commits Tue, 16 Apr 2019 11:55:40 -0700

ilya-biryukov updated this revision to Diff 195425.
ilya-biryukov added a comment.


- Simplify rawByExpanded by using a helper function.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D59887/new/

https://reviews.llvm.org/D59887

Files:
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/lib/Tooling/CMakeLists.txt
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/Tokens.cpp
  clang/unittests/Tooling/CMakeLists.txt
  clang/unittests/Tooling/Syntax/CMakeLists.txt
  clang/unittests/Tooling/Syntax/TokensTest.cpp

Index: clang/unittests/Tooling/Syntax/TokensTest.cpp
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -0,0 +1,629 @@
+//===- TokensTest.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Tokens.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/Expr.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/FileSystemOptions.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendAction.h"
+#include "clang/Frontend/Utils.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/Token.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Testing/Support/Annotations.h"
+#include "gmock/gmock-more-matchers.h"
+#include <cassert>
+#include <cstdlib>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <ostream>
+#include <string>
+
+using namespace clang;
+using namespace clang::syntax;
+
+using ::testing::AllOf;
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::Matcher;
+using ::testing::Pointwise;
+
+namespace {
+// Matchers for syntax::Token.
+MATCHER_P(Kind, K, "") { return arg.kind() == K; }
+MATCHER_P2(HasText, Text, SourceMgr, "") {
+  return arg.text(*SourceMgr) == Text;
+}
+MATCHER_P2(IsIdent, Text, SourceMgr, "") {
+  return arg.kind() == tok::identifier && arg.text(*SourceMgr) == Text;
+}
+/// Checks the start and end location of a token are equal to SourceRng.
+MATCHER_P(RangeIs, SourceRng, "") {
+  return arg.location() == SourceRng.first &&
+         arg.endLocation() == SourceRng.second;
+}
+/// Checks the passed tuple has two similar tokens, i.e. both fall into the same
+/// kind equivalence class (see below) and both have the same text.
+/// Ignores differences in kind between the raw and non-raw mode.
+MATCHER_P(IsSameToken, SourceMgr, "") {
+  auto ToEquivalenceClass = [](tok::TokenKind Kind) {
+    if (Kind == tok::identifier || Kind == tok::raw_identifier ||
+        tok::getKeywordSpelling(Kind) != nullptr)
+      return tok::identifier;
+    if (Kind == tok::string_literal || Kind == tok::header_name)
+      return tok::string_literal;
+    return Kind;
+  };
+
+  const auto &L = std::get<0>(arg);
+  const auto &R = std::get<1>(arg);
+  if (ToEquivalenceClass(L.kind()) != ToEquivalenceClass(R.kind()))
+    return false;
+  return L.text(*SourceMgr) == R.text(*SourceMgr); // compare both sides.
+}
+} // namespace
+
+// Actual test fixture lives in the syntax namespace as it's a friend of
+// TokenBuffer.
+class syntax::TokensTest : public ::testing::Test {
+public:
+  /// Run the clang frontend, collect the preprocessed tokens from the frontend
+  /// invocation and store them in this->Buffer.
+  /// This also clears SourceManager before running the compiler.
+  void recordTokens(llvm::StringRef Code) {
+    class RecordTokens : public ASTFrontendAction {
+    public:
+      explicit RecordTokens(TokenBuffer &Result) : Result(Result) {}
+
+      bool BeginSourceFileAction(CompilerInstance &CI) override {
+        assert(!Collector && "expected only a single call to BeginSourceFile");
+        Collector.emplace(CI.getPreprocessor());
+        return true;
+      }
+      void EndSourceFileAction() override {
+        assert(Collector && "BeginSourceFileAction was never called");
+        Result = std::move(*Collector).consume();
+      }
+
+      std::unique_ptr<ASTConsumer>
+      CreateASTConsumer(CompilerInstance &CI, StringRef InFile) override {
+        return llvm::make_unique<ASTConsumer>();
+      }
+
+    private:
+      TokenBuffer &Result;
+      llvm::Optional<TokenCollector> Collector;
+    };
+
+    constexpr const char *FileName = "./input.cpp";
+    FS->addFile(FileName, time_t(), llvm::MemoryBuffer::getMemBufferCopy(""));
+    // Prepare to run a compiler.
+    std::vector<const char *> Args = {"tok-test", "-std=c++03", "-fsyntax-only",
+                                      FileName};
+    auto CI = createInvocationFromCommandLine(Args, Diags, FS);
+    assert(CI);
+    CI->getFrontendOpts().DisableFree = false;
+    CI->getPreprocessorOpts().addRemappedFile(
+        FileName, llvm::MemoryBuffer::getMemBufferCopy(Code).release());
+    LangOpts = *CI->getLangOpts();
+    CompilerInstance Compiler;
+    Compiler.setInvocation(std::move(CI));
+    if (!Diags->getClient())
+      Diags->setClient(new IgnoringDiagConsumer);
+    Compiler.setDiagnostics(Diags.get());
+    Compiler.setFileManager(FileMgr.get());
+    Compiler.setSourceManager(SourceMgr.get());
+
+    this->Buffer = TokenBuffer(*SourceMgr);
+    RecordTokens Recorder(this->Buffer);
+    ASSERT_TRUE(Compiler.ExecuteAction(Recorder))
+        << "failed to run the frontend";
+
+    DEBUG_WITH_TYPE("syntax-tokens-test", {
+      llvm::dbgs() << "=== Recorded token stream:\n";
+      this->Buffer.dump(llvm::dbgs());
+    });
+  }
+
+  /// Run syntax::tokenize() and return the results.
+  std::vector<syntax::Token> tokenize(llvm::StringRef Text) {
+    // Null-terminate so that we always see 'tok::eof' at the end.
+    std::string NullTerminated = Text.str();
+    auto FID = SourceMgr->createFileID(llvm::MemoryBuffer::getMemBufferCopy(
+        StringRef(NullTerminated.data(), NullTerminated.size() + 1)));
+    return syntax::tokenize(FID, *SourceMgr, LangOpts);
+  }
+
+  /// Checks that lexing \p ExpectedText in raw mode would produce the same
+  /// token stream as the one stored in this->Buffer.expandedTokens().
+  void expectTokens(llvm::StringRef ExpectedText) {
+    std::vector<syntax::Token> ExpectedTokens = tokenize(ExpectedText);
+    EXPECT_THAT(std::vector<syntax::Token>(Buffer.expandedTokens()),
+                Pointwise(IsSameToken(), ExpectedTokens))
+        << "\texpected tokens: " << ExpectedText;
+  }
+
+  void expectSameTokens(llvm::ArrayRef<syntax::Token> Actual,
+                        llvm::ArrayRef<syntax::Token> Expected) {
+    EXPECT_THAT(std::vector<syntax::Token>(Actual),
+                Pointwise(IsSameToken(), std::vector<syntax::Token>(Expected)));
+  }
+
+  struct ExpectedInvocation {
+    ExpectedInvocation(
+        std::string From, std::string To,
+        llvm::Optional<llvm::Annotations::Range> Range = llvm::None)
+        : From(std::move(From)), To(std::move(To)), Range(Range) {}
+    /// A textual representation of the macro tokens.
+    std::string From;
+    /// A textual representation of the tokens after macro replacement.
+    std::string To;
+    /// Optional text range of the macro invocation in the source code.
+    llvm::Optional<llvm::Annotations::Range> Range;
+  };
+
+  // FIXME: use a vocabulary range type instead.
+  std::pair<unsigned, unsigned>
+  mappingTextRange(const TokenBuffer::Mapping &M,
+                   const TokenBuffer::MarkedFile &F) {
+    assert(M.BeginRawToken < M.EndRawToken && "Invalid mapping");
+    return {
+        SourceMgr->getFileOffset(F.RawTokens.at(M.BeginRawToken).location()),
+        SourceMgr->getFileOffset(
+            F.RawTokens.at(M.EndRawToken - 1).endLocation())};
+  }
+
+  FileID findFile(llvm::StringRef Name) const {
+    const FileEntry *Entry = FileMgr->getFile(Name);
+    FileID Found = SourceMgr->translateFile(Entry);
+    if (!Found.isValid()) {
+      ADD_FAILURE() << "SourceManager does not track " << Name;
+      std::abort();
+    }
+    return Found;
+  }
+  /// Checks that the mappings recorded in this->Buffer for \p FID (the main
+  /// file if \p FID is invalid) match the \p Expected ones.
+  void expectMacroInvocations(llvm::ArrayRef<ExpectedInvocation> Expected,
+                              FileID FID = FileID()) {
+    if (!FID.isValid())
+      FID = SourceMgr->getMainFileID();
+    // Bail out on failure; Files[FID] below would otherwise add a new entry.
+    ASSERT_TRUE(Buffer.Files.count(FID)) << "tokens for file were not recorded";
+    TokenBuffer::MarkedFile &File = Buffer.Files[FID];
+    llvm::ArrayRef<TokenBuffer::Mapping> Actual = File.Mappings;
+    ASSERT_EQ(Actual.size(), Expected.size());
+
+    for (unsigned I = 0; I < Actual.size(); ++I) {
+      const auto &A = Actual[I];
+      const auto &E = Expected[I];
+
+      if (E.Range)
+        ASSERT_EQ(mappingTextRange(A, File),
+                  (std::pair<unsigned, unsigned>(E.Range->Begin, E.Range->End)))
+            << "\trange does not match";
+
+      auto DropEOF = [](std::vector<syntax::Token> Tokens) {
+        if (Tokens.empty() || Tokens.back().kind() != tok::eof) {
+          ADD_FAILURE() << "expected 'eof' at the end of the tokens";
+          return Tokens;
+        }
+        Tokens.pop_back();
+        return Tokens;
+      };
+
+      std::vector<syntax::Token> ActualRaw(
+          File.RawTokens.begin() + A.BeginRawToken,
+          File.RawTokens.begin() + A.EndRawToken);
+      ASSERT_THAT(ActualRaw,
+                  Pointwise(IsSameToken(), DropEOF(tokenize(E.From))))
+          << "\tmacro tokens do not match, expected " << E.From;
+
+      std::vector<syntax::Token> ActualExpanded(
+          Buffer.ExpandedTokens.begin() + A.BeginExpandedToken,
+          Buffer.ExpandedTokens.begin() + A.EndExpandedToken);
+      ASSERT_THAT(ActualExpanded,
+                  Pointwise(IsSameToken(), DropEOF(tokenize(E.To))))
+          << "\ttokens after macro replacements do not match, expected "
+          << E.To;
+    }
+  }
+
+  // Specialized versions of matchers that rely on SourceManager.
+  Matcher<syntax::Token> IsIdent(std::string Text) const {
+    return ::IsIdent(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> HasText(std::string Text) const {
+    return ::HasText(Text, SourceMgr.get());
+  }
+  Matcher<syntax::Token> RangeIs(llvm::Annotations::Range R) const {
+    std::pair<SourceLocation, SourceLocation> Ls;
+    Ls.first = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                   .getLocWithOffset(R.Begin);
+    Ls.second = SourceMgr->getLocForStartOfFile(SourceMgr->getMainFileID())
+                    .getLocWithOffset(R.End);
+    return ::RangeIs(Ls);
+  }
+
+  Matcher<std::tuple<const syntax::Token &, const syntax::Token &>>
+  IsSameToken() const {
+    return ::IsSameToken(SourceMgr.get());
+  }
+
+  void addFile(llvm::StringRef Path, llvm::StringRef Contents) {
+    if (!FS->addFile(Path, time_t(),
+                     llvm::MemoryBuffer::getMemBufferCopy(Contents))) {
+      ADD_FAILURE() << "could not add a file to VFS: " << Path;
+    }
+  }
+
+  // Data fields.
+  llvm::IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      new DiagnosticsEngine(new DiagnosticIDs, new DiagnosticOptions);
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS =
+      new llvm::vfs::InMemoryFileSystem;
+  llvm::IntrusiveRefCntPtr<FileManager> FileMgr =
+      new FileManager(FileSystemOptions(), FS);
+  llvm::IntrusiveRefCntPtr<SourceManager> SourceMgr =
+      new SourceManager(*Diags, *FileMgr);
+  /// Contains last result of calling recordTokens().
+  TokenBuffer Buffer = TokenBuffer(*SourceMgr);
+  /// Contains options from last run of recordTokens().
+  LangOptions LangOpts;
+};
+
+namespace {
+TEST_F(TokensTest, RawMode) {
+  EXPECT_THAT(tokenize("int main() {}"),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // Comments are ignored for now.
+  EXPECT_THAT(tokenize("/* foo */int a; // more comments"),
+              ElementsAre(Kind(tok::kw_int), IsIdent("a"), Kind(tok::semi),
+                          Kind(tok::eof)));
+}
+
+TEST_F(TokensTest, Basic) {
+  recordTokens("int main() {}");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+  // All kinds of whitespace are ignored.
+  recordTokens("\t\n  int\t\n  main\t\n  (\t\n  )\t\n{\t\n  }\t\n");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(Kind(tok::kw_int), IsIdent("main"),
+                          Kind(tok::l_paren), Kind(tok::r_paren),
+                          Kind(tok::l_brace), Kind(tok::r_brace),
+                          Kind(tok::eof)));
+
+  llvm::Annotations Code(R"cpp(
+    $r1[[int]] $r2[[a]] $r3[[=]] $r4[["foo bar baz"]] $r5[[;]]
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(
+      Buffer.expandedTokens(),
+      ElementsAre(AllOf(Kind(tok::kw_int), RangeIs(Code.range("r1"))),
+                  AllOf(Kind(tok::identifier), RangeIs(Code.range("r2"))),
+                  AllOf(Kind(tok::equal), RangeIs(Code.range("r3"))),
+                  AllOf(Kind(tok::string_literal), RangeIs(Code.range("r4"))),
+                  AllOf(Kind(tok::semi), RangeIs(Code.range("r5"))),
+                  Kind(tok::eof)));
+}
+
+TEST_F(TokensTest, MacroDirectives) {
+  // Macro directives are not stored anywhere at the moment.
+  llvm::StringLiteral Code = R"cpp(
+    #define FOO a
+    #include "unresolved_file.h"
+    #undef FOO
+    #ifdef X
+    #else
+    #endif
+    #ifndef Y
+    #endif
+    #if 1
+    #elif 2
+    #else
+    #endif
+    #pragma once
+    #pragma something lalala
+
+    int a;
+  )cpp";
+  recordTokens(Code);
+
+  expectTokens("int a;");
+  expectMacroInvocations({});
+
+  expectSameTokens(Buffer.rawTokens(SourceMgr->getMainFileID()),
+                   tokenize(Code));
+}
+
+TEST_F(TokensTest, MacroReplacements) {
+  // A simple object-like macro.
+  llvm::Annotations Code(R"cpp(
+    #define INT int const
+    [[INT]] a;
+    )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int const a;");
+  expectMacroInvocations({{"INT", "int const", Code.range()}});
+
+  // A simple function-like macro.
+  Code = llvm::Annotations(R"cpp(
+    #define INT(a) const int
+    [[INT(10+10)]] a;
+    )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("const int a;");
+  expectMacroInvocations({{"INT(10+10)", "const int", Code.range()}});
+
+  // Recursive macro replacements.
+  Code = llvm::Annotations(R"cpp(
+    #define ID(X) X
+    #define INT int const
+    [[ID(ID(INT))]] a;
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int const a;");
+  expectMacroInvocations({{"ID(ID(INT))", "int const", Code.range()}});
+
+  // A little more complicated recursive macro replacements.
+  Code = llvm::Annotations(R"cpp(
+    #define ADD(X, Y) X+Y
+    #define MULT(X, Y) X*Y
+
+    int a = [[ADD(MULT(1,2), MULT(3,ADD(4,5)))]];
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int a = 1*2+3*4+5;");
+  expectMacroInvocations(
+      {{"ADD(MULT(1,2), MULT(3,ADD(4,5)))", "1*2+3*4+5", Code.range()}});
+
+  // Empty macro replacement.
+  Code = llvm::Annotations(R"cpp(
+    #define EMPTY
+    #define EMPTY_FUNC(X)
+    $m[[EMPTY]]
+    $f[[EMPTY_FUNC(1+2+3)]]
+  )cpp");
+  recordTokens(Code.code());
+
+  expectTokens("");
+  expectMacroInvocations({{"EMPTY", "", Code.range("m")},
+                          {"EMPTY_FUNC(1+2+3)", "", Code.range("f")}});
+
+  // File ends with a macro replacement.
+  Code = llvm::Annotations(R"cpp(
+    #define FOO 10+10;
+    int a = [[FOO]])cpp");
+  recordTokens(Code.code());
+
+  expectTokens("int a = 10+10;");
+  expectMacroInvocations({{"FOO", "10+10;", Code.range()}});
+}
+
+TEST_F(TokensTest, SpecialTokens) {
+  // Tokens coming from concatenations.
+  recordTokens(R"cpp(
+    #define CONCAT(a, b) a ## b
+    int a = CONCAT(1, 2);
+  )cpp");
+  expectTokens("int a = 12;");
+  // Multi-line tokens with slashes at the end.
+  recordTokens("i\\\nn\\\nt");
+  EXPECT_THAT(Buffer.expandedTokens(),
+              ElementsAre(AllOf(Kind(tok::kw_int), HasText("i\\\nn\\\nt")),
+                          Kind(tok::eof)));
+  // FIXME: test tokens with digraphs and UCN identifiers.
+}
+
+TEST_F(TokensTest, LateBoundTokens) {
+  // The parser eventually breaks the first '>>' into two tokens ('>' and '>'),
+  // but we choose to record them as a single token (for now).
+  llvm::Annotations Code(R"cpp(
+    template <class T>
+    struct foo { int a; };
+    int bar = foo<foo<int$br[[>>]]().a;
+    int baz = 10 $op[[>>]] 2;
+  )cpp");
+  recordTokens(Code.code());
+  EXPECT_THAT(std::vector<syntax::Token>(Buffer.expandedTokens()),
+              AllOf(Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("br")))),
+                    Contains(AllOf(Kind(tok::greatergreater),
+                                   RangeIs(Code.range("op"))))));
+}
+
+TEST_F(TokensTest, DelayedParsing) {
+  llvm::StringLiteral Code = R"cpp(
+    struct Foo {
+      int method() {
+        // Parser will visit method bodies and initializers multiple times, but
+        // TokenBuffer should only record the first walk over the tokens;
+        return 100;
+      }
+      int a = 10;
+      int b = 20;
+
+      struct Subclass {
+        void foo() {
+          Foo().method();
+        }
+      };
+    };
+  )cpp";
+  recordTokens(Code);
+  // Checks that lexing in raw mode produces the same results, hence we're not
+  // recording any tokens twice and the order is the same.
+  expectTokens(Code);
+}
+
+TEST_F(TokensTest, Offsets) {
+  llvm::Annotations Code("");
+  /// Finds a token with the specified text.
+  auto Find = [this](llvm::StringRef Text) {
+    llvm::ArrayRef<syntax::Token> Tokens = Buffer.expandedTokens();
+    auto TokenMatches = [=](const syntax::Token &T) {
+      return T.text(*SourceMgr) == Text;
+    };
+    auto It = llvm::find_if(Tokens, TokenMatches);
+    if (It == Tokens.end()) {
+      ADD_FAILURE() << "could not find the token for " << Text;
+      std::abort();
+    }
+    if (std::find_if(std::next(It), Tokens.end(), TokenMatches) !=
+        Tokens.end()) {
+      ADD_FAILURE() << "token is not unique: " << Text;
+      std::abort();
+    };
+    return It;
+  };
+  auto Range = [&](llvm::StringRef Name) {
+    auto R = Code.range(Name);
+    syntax::FileRange FR;
+    FR.File = SourceMgr->getMainFileID();
+    FR.Begin = R.Begin;
+    FR.End = R.End;
+    return FR;
+  };
+
+  Code = llvm::Annotations(R"cpp(
+    $all[[$first[[a1 a2 a3]] FIRST $second[[b1 b2]] LAST]]
+  )cpp");
+
+  recordTokens(Code.code());
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), std::next(Find("LAST")))),
+            Range("all"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("FIRST"))),
+            Range("first"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b1"), Find("LAST"))),
+            Range("second"));
+
+  Code = llvm::Annotations(R"cpp(
+    #define A a1 a2 a3
+    #define B b1 b2
+
+    $all[[$first[[A]] FIRST $second[[B]] LAST]]
+  )cpp");
+  recordTokens(Code.code());
+
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), std::next(Find("LAST")))),
+            Range("all"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("FIRST"))),
+            Range("first"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b1"), Find("LAST"))),
+            Range("second"));
+  // Ranges not fully covering macro invocations should fail.
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a1"), Find("a3"))),
+      llvm::None);
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b2"), Find("LAST"))),
+            llvm::None);
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a2"), Find("b2"))),
+      llvm::None);
+
+  Code = llvm::Annotations(R"cpp(
+    #define ID(x) x
+    #define B b1 b2
+
+    $both[[$first[[ID(ID(ID(a1) a2 a3))]] FIRST $second[[ID(B)]]]] LAST
+  )cpp");
+  recordTokens(Code.code());
+
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("FIRST"))),
+            Range("first"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("b1"), Find("LAST"))),
+            Range("second"));
+  EXPECT_EQ(Buffer.findOffsetsByExpanded(
+                llvm::makeArrayRef(Find("a1"), Find("LAST"))),
+            Range("both"));
+
+  // Ranges crossing macro call boundaries.
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a1"), Find("b2"))),
+      llvm::None);
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a2"), Find("b2"))),
+      llvm::None);
+  // FIXME: next two examples should map to macro arguments, but currently they
+  //        fail.
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a2"), Find("a3"))),
+      llvm::None);
+  EXPECT_EQ(
+      Buffer.findOffsetsByExpanded(llvm::makeArrayRef(Find("a1"), Find("a3"))),
+      llvm::None);
+}
+
+TEST_F(TokensTest, MultiFile) {
+  addFile("./foo.h", R"cpp(
+    #define ADD(X, Y) X+Y
+    int a = 100;
+    #include "bar.h"
+  )cpp");
+  addFile("./bar.h", R"cpp(
+    int b = ADD(1, 2);
+    #define MULT(X, Y) X*Y
+  )cpp");
+  recordTokens(R"cpp(
+    #include "foo.h"
+    int c = ADD(1, MULT(2,3));
+  )cpp");
+
+  expectTokens(R"cpp(
+    int a = 100;
+    int b = 1+2;
+    int c = 1+2*3;
+  )cpp");
+  expectMacroInvocations({{"ADD(1,MULT(2,3))", "1+2*3"}});
+  expectMacroInvocations({}, findFile("./foo.h")); // no invocations in foo.h
+  expectMacroInvocations({{"ADD(1,2)", "1+2"}}, findFile("./bar.h"));
+}
+} // namespace
Index: clang/unittests/Tooling/Syntax/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  Support
+  )
+
+add_clang_unittest(TokensTest
+  TokensTest.cpp
+)
+
+target_link_libraries(TokensTest
+  PRIVATE
+  clangAST
+  clangBasic
+  clangFrontend
+  clangLex
+  clangSerialization
+  clangTooling
+  clangToolingSyntax
+  LLVMTestingSupport
+  )
Index: clang/unittests/Tooling/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/CMakeLists.txt
+++ clang/unittests/Tooling/CMakeLists.txt
@@ -69,3 +69,6 @@
   clangToolingInclusions
   clangToolingRefactor
   )
+
+
+add_subdirectory(Syntax)
Index: clang/lib/Tooling/Syntax/Tokens.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Tokens.cpp
@@ -0,0 +1,369 @@
+//===- Tokens.cpp - collect tokens from preprocessing ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Tooling/Syntax/Tokens.h"
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.def"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace clang;
+using namespace clang::syntax;
+
+syntax::Token::Token(const clang::Token &T)
+    : Token(T.getLocation(), T.getLength(), T.getKind()) {
+  assert(!T.isAnnotation());
+}
+
+llvm::StringRef syntax::Token::text(const SourceManager &SM) const {
+  bool Invalid = false;
+  const char *Start = SM.getCharacterData(location(), &Invalid);
+  assert(!Invalid);
+  return llvm::StringRef(Start, length());
+}
+
+std::string syntax::Token::str() const {
+  return llvm::formatv("Token({0}, length = {1})", tok::getTokenName(kind()),
+                       length());
+}
+
+std::string syntax::Token::str(const SourceManager &SM) const {
+  return llvm::formatv("Token({0}, length = {1}, location = {2}, text = {3})",
+                       tok::getTokenName(kind()), length(),
+                       location().printToString(SM), text(SM));
+}
+
+llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, const Token &T) {
+  return OS << T.str();
+}
+
+llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS,
+                                      const FileRange &R) {
+  return OS << llvm::formatv("FileRange(file = {0}, offsets = {1}-{2})",
+                             R.File.getHashValue(), R.Begin, R.End);
+}
+
+void TokenBuffer::dump(llvm::raw_ostream &OS) const {
+  OS << "expanded tokens:\n";
+  for (unsigned I = 0; I < ExpandedTokens.size(); ++I)
+    OS << llvm::formatv("  {0}: {1}\n", I, ExpandedTokens[I].str(*SourceMgr));
+
+  std::vector<FileID> Keys;
+  for (const auto &F : Files) // by reference: only the key is needed here.
+    Keys.push_back(F.first);
+  llvm::sort(Keys);
+
+  for (FileID ID : Keys) {
+    const MarkedFile &File = Files.find(ID)->second;
+
+    auto *Entry = SourceMgr->getFileEntryForID(ID);
+    OS << "  file " << (Entry ? Entry->getName() : "<<virtual file>>") << "\n";
+    OS << "   raw tokens:\n";
+    for (unsigned I = 0; I < File.RawTokens.size(); ++I)
+      OS << llvm::formatv("     {0}: {1}\n", I,
+                          File.RawTokens[I].str(*SourceMgr));
+    OS << "   mappings:\n";
+    for (auto &M : File.Mappings)
+      OS << "     " << M.str() << "\n";
+  }
+}
+
+std::pair<const syntax::Token *, const TokenBuffer::Mapping *>
+TokenBuffer::expandedToRaw(const syntax::Token *Expanded) const {
+  assert(Expanded);
+  assert(ExpandedTokens.data() <= Expanded &&
+         Expanded < ExpandedTokens.data() + ExpandedTokens.size());
+  // Look up the per-file data via the token's expansion location.
+  auto FileIt = Files.find(
+      SourceMgr->getFileID(SourceMgr->getExpansionLoc(Expanded->location())));
+  assert(FileIt != Files.end() && "no file for an expanded token");
+
+  const MarkedFile &File = FileIt->second;
+
+  unsigned ExpandedIndex = Expanded - ExpandedTokens.data();
+  // Find the first mapping that produced tokens after \p Expanded.
+  auto It = llvm::upper_bound(
+      File.Mappings, ExpandedIndex,
+      [](unsigned L, const Mapping &R) { return L < R.BeginExpandedToken; });
+  // Our token could only be produced by the previous mapping.
+  if (It == File.Mappings.begin()) {
+    // No mapping could produce this token, pick the corresponding raw token.
+    return {&File.RawTokens[ExpandedIndex - File.BeginExpanded], nullptr};
+  }
+  --It; // It now points to the last mapping starting at or before our token.
+
+  // Check if the token is part of the mapping.
+  if (ExpandedIndex < It->EndExpandedToken)
+    return {&File.RawTokens[It->BeginRawToken], /*Mapping*/ &*It};
+
+  // Not part of the mapping, use the index from previous mapping to compute the
+  // corresponding raw token.
+  return {
+      &File.RawTokens[It->EndRawToken + (ExpandedIndex - It->EndExpandedToken)],
+      /*Mapping*/ nullptr};
+}
+
+llvm::ArrayRef<syntax::Token> TokenBuffer::rawTokens(FileID FID) const {
+  auto It = Files.find(FID);
+  assert(It != Files.end());
+  return It->second.RawTokens;
+}
+
+std::string TokenBuffer::Mapping::str() const {
+  return llvm::formatv("raw tokens: [{0},{1}), expanded "
+                       "tokens: [{2},{3})",
+                       BeginRawToken, EndRawToken, BeginExpandedToken,
+                       EndExpandedToken);
+}
+
+llvm::Optional<FileRange> TokenBuffer::findOffsetsByExpanded(
+    llvm::ArrayRef<syntax::Token> Expanded) const {
+  auto Tokens = findRawByExpanded(Expanded);
+  if (!Tokens)
+    return llvm::None;
+  assert(!Tokens->empty());
+
+  FileRange R;
+  std::tie(R.File, R.Begin) =
+      SourceMgr->getDecomposedLoc(Tokens->front().location());
+  R.End = SourceMgr->getFileOffset(Tokens->back().endLocation());
+  return R;
+}
+
+llvm::Optional<llvm::ArrayRef<syntax::Token>>
+TokenBuffer::findRawByExpanded(llvm::ArrayRef<syntax::Token> Expanded) const {
+  // Mapping an empty range is not well-defined, bail out in that case.
+  if (Expanded.empty())
+    return llvm::None;
+
+  // FIXME: also allow changes uniquely mapping to macro arguments.
+
+  const syntax::Token *BeginRawToken;
+  const Mapping *BeginMapping;
+  std::tie(BeginRawToken, BeginMapping) = expandedToRaw(&Expanded.front());
+
+  const syntax::Token *LastRawToken;
+  const Mapping *LastMapping;
+  std::tie(LastRawToken, LastMapping) = expandedToRaw(&Expanded.back());
+
+  FileID FID = SourceMgr->getFileID(BeginRawToken->location());
+  // FIXME: Handle multi-file changes by trying to map onto a common root.
+  if (FID != SourceMgr->getFileID(LastRawToken->location()))
+    return llvm::None;
+
+  const MarkedFile &File = Files.find(FID)->second;
+
+  // Do not allow changes that cross macro expansion boundaries.
+  unsigned BeginExpanded = Expanded.begin() - ExpandedTokens.data();
+  unsigned EndExpanded = Expanded.end() - ExpandedTokens.data();
+  if (BeginMapping && BeginMapping->BeginExpandedToken < BeginExpanded)
+    return llvm::None;
+  if (LastMapping && EndExpanded < LastMapping->EndExpandedToken)
+    return llvm::None;
+  // All is good, return the result.
+  return llvm::makeArrayRef(
+      BeginMapping ? File.RawTokens.data() + BeginMapping->BeginRawToken
+                   : BeginRawToken,
+      LastMapping ? File.RawTokens.data() + LastMapping->EndRawToken
+                  : LastRawToken + 1);
+}
+
+std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
+                                            const LangOptions &LO) {
+  IdentifierTable IdentTable(LO);
+  std::vector<syntax::Token> Result;
+  // Records a token, giving plain raw identifiers their proper token kind.
+  auto Record = [&Result, &IdentTable](clang::Token Tok) {
+    // FIXME: support needsCleaning and hasUCN cases.
+    if (Tok.is(tok::raw_identifier) && !Tok.needsCleaning() && !Tok.hasUCN()) {
+      clang::IdentifierInfo &Info = IdentTable.get(Tok.getRawIdentifier());
+      Tok.setIdentifierInfo(&Info);
+      Tok.setKind(Info.getTokenID());
+    }
+    Result.push_back(syntax::Token(Tok));
+  };
+
+  // The token for which LexFromRawLexer() returns true is recorded as well.
+  Lexer RawLexer(FID, SM.getBuffer(FID), SM, LO);
+  clang::Token Tok;
+  for (bool Done = false; !Done;) {
+    Done = RawLexer.LexFromRawLexer(Tok);
+    Record(Tok);
+  }
+  return Result;
+}
+
+/// Fills in the TokenBuffer by tracing the run of a preprocessor. The
+/// implementation tracks the tokens, macro expansions and directives coming
+/// from the preprocessor and:
+/// - for each token, figures out if it is a part of an expanded token stream,
+///   raw token stream or both. Stores the tokens appropriately.
+/// - records mappings from the raw to expanded token ranges, e.g. for macro
+///   expansions.
+class TokenCollector::Callbacks : public PPCallbacks {
+public:
+  Callbacks(const SourceManager &SM, TokenBuffer &Result)
+      : Result(Result), SM(SM) {}
+
+  void FileChanged(SourceLocation Loc, FileChangeReason Reason,
+                   SrcMgr::CharacteristicKind FileType,
+                   FileID PrevFID) override {
+    assert(Loc.isFileID());
+    // Finish a pending expansion first: its mapping lives in the previous file.
+    flushMacroExpansion();
+    File = &Result.Files.try_emplace(SM.getFileID(Loc)).first->second;
+  }
+
+  void tokenLexed(const clang::Token &T, TokenSource S) {
+    if (S == TokenSource::Precached)
+      return; // the cached tokens are reported multiple times, we have already
+              // recorded these.
+
+    auto L = T.getLocation();
+    flushCurrentExpansion(L);
+
+    if (ExpansionStart.isValid() && SM.getExpansionLoc(L) != ExpansionStart) {
+      // The token comes from intermediate replacements while processing macro
+      // arguments. These are not part of the expanded token and we only record
+      // the top-level macro expansions, so skip this token.
+      return;
+    }
+
+    // 'eod' is a control token that we don't capture.
+    if (T.getKind() == tok::eod)
+      return;
+
+    DEBUG_WITH_TYPE("collect-tokens", {
+      llvm::dbgs() << "$[token] " << syntax::Token(T).str(SM) << "\n";
+    });
+
+    // Depending on where the token comes from, put it into an expanded token
+    // stream, a raw token stream, or both.
+    switch (S) {
+    case TokenSource::File:
+      assert(T.getLocation().isFileID());
+      Result.ExpandedTokens.push_back(syntax::Token(T));
+      File->RawTokens.push_back(syntax::Token(T));
+      break;
+    case clang::TokenSource::MacroExpansion:
+      assert(T.getLocation().isMacroID());
+      Result.ExpandedTokens.push_back(syntax::Token(T));
+      break;
+    case clang::TokenSource::MacroNameOrArg:
+    case TokenSource::MacroDirective:
+    case TokenSource::SkippedPPBranch:
+      assert(T.getLocation().isFileID());
+      File->RawTokens.push_back(syntax::Token(T));
+      break;
+    case TokenSource::Precached:
+      llvm_unreachable("cached tokens should be handled before");
+    case TokenSource::AfterModuleImport:
+      llvm_unreachable("not implemented yet");
+    }
+  }
+
+  void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD,
+                    SourceRange Range, const MacroArgs *Args) override {
+    auto MacroNameLoc = MacroNameTok.getLocation();
+    flushCurrentExpansion(MacroNameLoc);
+
+    // We do not record recursive invocations.
+    if (isMacroExpanding())
+      return;
+
+    // Find the first raw token of the macro invocation, i.e. the name of the
+    // macro.
+    auto InvocationStart = llvm::find_if(
+        llvm::reverse(File->RawTokens),
+        [&](const syntax::Token &T) { return T.location() == MacroNameLoc; });
+    assert(InvocationStart != File->RawTokens.rend() &&
+           "macro name must be recorded.");
+
+    // Add a raw-to-expanded mapping for this macro invocation.
+    TokenBuffer::Mapping M;
+    M.BeginRawToken =
+        std::prev(InvocationStart.base()) - File->RawTokens.begin();
+    M.EndRawToken = File->RawTokens.size();
+
+    M.BeginExpandedToken = Result.ExpandedTokens.size();
+    // M.EndExpandedToken is filled by flushMacroExpansion() when macro
+    // expansion finishes.
+
+    File->Mappings.push_back(M);
+
+    // We have to record where invocation ends in order to track it properly.
+    std::tie(MacroInvocationFile, ExpansionEndOffset) =
+        SM.getDecomposedLoc(Range.getEnd());
+    this->ExpansionStart = Range.getBegin();
+  }
+
+private:
+  bool isMacroExpanding() const { return MacroInvocationFile.isValid(); }
+
+  void flushMacroExpansion() {
+    if (!MacroInvocationFile.isValid())
+      return;
+    assert(!File->Mappings.empty());
+    assert(File->Mappings.back().EndExpandedToken == 0);
+    File->Mappings.back().EndExpandedToken = Result.ExpandedTokens.size();
+
+    MacroInvocationFile = FileID();
+    ExpansionStart = SourceLocation();
+    ExpansionEndOffset = 0;
+  }
+
+  void flushCurrentExpansion(SourceLocation L) {
+    assert(L.isValid());
+    if (!MacroInvocationFile.isValid())
+      return;
+    FileID FID;
+    unsigned Offset;
+    std::tie(FID, Offset) = SM.getDecomposedLoc(L);
+    // Tokens of other files or inside the invocation (e.g. macro arguments) do
+    // not end it; only a token of the same file past the invocation end does.
+    if (FID != MacroInvocationFile || Offset <= ExpansionEndOffset)
+      return;
+    flushMacroExpansion();
+  }
+
+  TokenBuffer::MarkedFile *File = nullptr;
+  /// When valid, the file of the last active top-level macro invocation.
+  FileID MacroInvocationFile;
+  SourceLocation ExpansionStart;
+  unsigned ExpansionEndOffset = 0;
+  TokenBuffer &Result;
+  const SourceManager &SM;
+};
+
+TokenCollector::TokenCollector(Preprocessor &PP)
+    : Tokens(PP.getSourceManager()) {
+  // PP owns the callbacks; the token watcher reaches them via a raw pointer.
+  auto Owned = llvm::make_unique<Callbacks>(PP.getSourceManager(), Tokens);
+  Callbacks *Sink = Owned.get();
+  PP.addPPCallbacks(std::move(Owned));
+  PP.setTokenWatcher(
+      [Sink](const clang::Token &T, TokenSource S) { Sink->tokenLexed(T, S); });
+}
+
+TokenBuffer TokenCollector::consume() && { return std::move(Tokens); }
Index: clang/lib/Tooling/Syntax/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangToolingSyntax
+  Tokens.cpp
+
+  LINK_LIBS
+  clangBasic
+  clangFrontend
+  clangLex
+  )
Index: clang/lib/Tooling/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/CMakeLists.txt
+++ clang/lib/Tooling/CMakeLists.txt
@@ -7,6 +7,7 @@
 add_subdirectory(Inclusions)
 add_subdirectory(Refactoring)
 add_subdirectory(ASTDiff)
+add_subdirectory(Syntax)
 
 add_clang_library(clangTooling
   AllTUsExecution.cpp
Index: clang/include/clang/Tooling/Syntax/Tokens.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Tokens.h
@@ -0,0 +1,267 @@
+//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Record tokens that a preprocessor emits and define operations to map between
+// the tokens written in a file and tokens produced by the preprocessor.
+//
+// When running the compiler, there are two token streams we are interested in:
+//   - "raw" tokens directly correspond to a substring written in some source
+//     file.
+//   - "expanded" tokens represent the result of preprocessing; the parser
+//     consumes this token stream to produce the AST.
+//
+// Expanded tokens correspond directly to locations found in the AST, allowing
+// to find subranges of the token stream covered by various AST nodes. Raw
+// tokens correspond directly to the source code written by the user.
+//
+// To allow composing these two use-cases, we also define operations that map
+// between expanded and raw tokens that produced them (macro calls, directives,
+// etc).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
+#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
+
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <tuple>
+
+namespace clang {
+class Preprocessor;
+
+namespace syntax {
+class TokenBuffer;
+
+/// A token coming directly from a file or from a macro invocation. Has just
+/// enough information to locate the token in the source code.
+/// Used to represent both expanded and raw tokens.
+class Token {
+public:
+  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind)
+      : Location(Location), Length(Length), Kind(Kind) {}
+  /// EXPECTS: clang::Token is not an annotation token.
+  explicit Token(const clang::Token &T);
+
+  tok::TokenKind kind() const { return Kind; }
+  SourceLocation location() const { return Location; }
+  SourceLocation endLocation() const {
+    return Location.getLocWithOffset(Length);
+  }
+  unsigned length() const { return Length; }
+
+  /// Get the substring covered by the token. Note that it will include all
+  /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
+  ///    in\
+  ///    t
+  /// both have the same kind tok::kw_int, but results of text() are different.
+  llvm::StringRef text(const SourceManager &SM) const;
+
+  /// For debugging purposes. More verbose than the other overload, but requires
+  /// a source manager.
+  std::string str(const SourceManager &SM) const;
+  /// For debugging purposes.
+  std::string str() const;
+
+private:
+  SourceLocation Location;
+  unsigned Length;
+  tok::TokenKind Kind;
+};
+/// For debugging purposes. Equivalent to a call to Token::str().
+llvm::raw_ostream& operator<<(llvm::raw_ostream &OS, const Token &T);
+
+/// A half-open range inside a particular file, the start offset is included and
+/// the end offset is excluded from the range.
+struct FileRange {
+  FileID File;
+  /// Start offset (inclusive) in a corresponding file.
+  unsigned Begin = 0;
+  /// End offset (exclusive) in a corresponding file.
+  unsigned End = 0;
+};
+inline bool operator==(const FileRange &L, const FileRange &R) {
+  return L.File == R.File && L.Begin == R.Begin && L.End == R.End;
+}
+inline bool operator!=(const FileRange &L, const FileRange &R) {
+  return !(L == R);
+}
+/// For debugging purposes.
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
+
+/// A list of tokens obtained by preprocessing a text buffer and operations to
+/// map between the expanded and raw tokens, i.e. TokenBuffer has information
+/// about two token streams:
+///    1. Expanded tokens: tokens produced by the preprocessor after all macro
+///       replacements,
+///    2. Raw tokens: corresponding directly to the source code of a file before
+///       any macro replacements occurred.
+/// Here's an example to illustrate a difference between those two:
+///     #define FOO 10
+///     int a = FOO;
+///
+/// Raw tokens are {'#', 'define', 'FOO', '10', 'int', 'a', '=', 'FOO', ';'}.
+/// Expanded tokens are {'int', 'a', '=', '10', ';'}.
+///
+/// The full list of expanded tokens can be obtained with expandedTokens(). Raw
+/// tokens for each of the files can be obtained via rawTokens(FileID).
+///
+/// To map between the expanded and raw token streams, see findRawByExpanded().
+///
+/// To build a token buffer use the TokenCollector class. You can also compute
+/// the raw tokens of a file using the tokenize() helper.
+class TokenBuffer {
+public:
+  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
+  /// All tokens produced by the preprocessor after all macro replacements,
+  /// directives, etc. Source locations found in the clang AST will always
+  /// point to one of these tokens.
+  /// FIXME: the notable exception is '>>' being split into two '>'. figure out
+  ///        how to deal with it.
+  llvm::ArrayRef<syntax::Token> expandedTokens() const {
+    return ExpandedTokens;
+  }
+
+  /// Attempt to find the subrange of raw tokens that produced the corresponding
+  /// \p Expanded tokens. Will fail if the raw tokens cannot be determined
+  /// unambiguously. E.g. for the following example:
+  ///
+  ///   #define FIRST f1 f2 f3
+  ///   #define SECOND s1 s2 s3
+  ///
+  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
+  ///
+  /// the results would be:
+  ///   expanded   => raw
+  ///   ------------------------
+  ///            a => a
+  ///     s1 s2 s3 => SECOND
+  ///   a f1 f2 f3 => a FIRST
+  ///         a f1 => can't map
+  ///        s1 s2 => can't map
+  ///
+  /// If \p Expanded is empty, the returned value is llvm::None.
+  /// Complexity is logarithmic.
+  llvm::Optional<llvm::ArrayRef<syntax::Token>>
+  findRawByExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
+
+  /// Obtain the text offsets corresponding to the tokens returned by
+  /// findRawByExpanded.
+  llvm::Optional<FileRange>
+  findOffsetsByExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
+
+  /// Lexed tokens of a file before preprocessing. E.g. for the following input
+  ///     #define DECL(name) int name = 10
+  ///     DECL(a);
+  /// rawTokens() returns {"#", "define", "DECL", "(", "name", ")"}.
+  /// FIXME: we do not yet store tokens of directives, like #include, #define,
+  ///        #pragma, etc.
+  llvm::ArrayRef<syntax::Token> rawTokens(FileID FID) const;
+
+  /// For debugging purposes.
+  void dump(llvm::raw_ostream &OS) const;
+
+private:
+  /// Describes a mapping between a continuous subrange of raw tokens and the
+  /// expanded tokens. Represents macro expansions, preprocessor directives,
+  /// conditionally disabled pp regions, etc.
+  ///   #define FOO 1+2
+  ///   #define BAR(a) a + 1
+  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
+  ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
+  ///                            macroTokens = {'BAR', '(', '1', ')'}.
+  struct Mapping {
+    // Positions in the corresponding raw token stream. The corresponding range
+    // is never empty.
+    unsigned BeginRawToken = 0;
+    unsigned EndRawToken = 0;
+    // Positions in the expanded token stream. The corresponding range can be
+    // empty.
+    unsigned BeginExpandedToken = 0;
+    unsigned EndExpandedToken = 0;
+
+    /// For debugging purposes.
+    std::string str() const;
+  };
+  /// Raw tokens of the file with information about the subranges.
+  struct MarkedFile {
+    /// Lexed, but not preprocessed, tokens of the file. These map directly to
+    /// text in the corresponding files and include tokens of all preprocessor
+    /// directives.
+    /// FIXME: raw tokens don't change across FileID that map to the same
+    ///        FileEntry. We could consider deduplicating them to save memory.
+    std::vector<syntax::Token> RawTokens;
+    /// A sorted list to convert between the raw and expanded token streams.
+    std::vector<Mapping> Mappings;
+    /// The first expanded token produced for this FileID.
+    unsigned BeginExpanded = 0;
+    unsigned EndExpanded = 0;
+  };
+
+  friend class TokenCollector;
+  // Testing code has access to internal mapping.
+  friend class TokensTest;
+
+  /// Maps a single expanded token to its raw counterpart or a mapping that
+  /// produced it.
+  std::pair<const syntax::Token *, const Mapping *>
+  expandedToRaw(const syntax::Token *Expanded) const;
+
+  /// Token stream produced after preprocessing; conceptually this captures the
+  /// same stream as 'clang -E' (excluding the preprocessor directives like
+  /// #file, etc.).
+  std::vector<syntax::Token> ExpandedTokens;
+  llvm::DenseMap<FileID, MarkedFile> Files;
+  // The value is never null, pointer instead of reference to avoid disabling
+  // implicit assignment operator.
+  const SourceManager *SourceMgr;
+};
+
+/// Lex the text buffer, corresponding to \p FID, in raw mode and record the
+/// resulting tokens. Does minimal post-processing on raw identifiers, setting
+/// the appropriate token kind (instead of the raw_identifier reported by lexer
+/// in raw mode). This is a very low-level function, most users should prefer to
+/// use TokenCollector. Lexing in raw mode produces wildly different results
+/// from what one might expect when running a C++ frontend, e.g. preprocessor
+/// does not run at all.
+std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
+                                    const LangOptions &LO);
+
+/// Collects tokens for the main file while running the frontend action. An
+/// instance of this object should be created on
+/// FrontendAction::BeginSourceFile() and the results should be consumed after
+/// FrontendAction::Execute() finishes.
+class TokenCollector {
+public:
+  /// Adds the hooks to collect the tokens. Should be called before the
+  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
+  /// CreateASTConsumer().
+  TokenCollector(Preprocessor &P);
+
+  /// Finalizes token collection. Should be called after preprocessing is
+  /// finished, i.e. after running Execute().
+  LLVM_NODISCARD TokenBuffer consume() &&;
+
+private:
+  class Callbacks;
+  TokenBuffer Tokens;
+};
+
+} // namespace syntax
+} // namespace clang
+
+#endif

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D59887: [Syntax] Introduce TokenBuffer, start clangToolingSyntax library

Reply via email to