[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-12 Thread Haojian Wu via Phabricator via cfe-commits
hokein updated this revision to Diff 443922.
hokein added a comment.

remove all TokenBufferTokenManager cast usage, make it as a contract in the 
APIs.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

Files:
  clang/include/clang/Tooling/Syntax/BuildTree.h
  clang/include/clang/Tooling/Syntax/Mutations.h
  clang/include/clang/Tooling/Syntax/Nodes.h
  clang/include/clang/Tooling/Syntax/TokenBufferTokenManager.h
  clang/include/clang/Tooling/Syntax/TokenManager.h
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/include/clang/Tooling/Syntax/Tree.h
  clang/lib/Tooling/Syntax/BuildTree.cpp
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/ComputeReplacements.cpp
  clang/lib/Tooling/Syntax/Mutations.cpp
  clang/lib/Tooling/Syntax/Synthesis.cpp
  clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
  clang/lib/Tooling/Syntax/Tree.cpp
  clang/tools/clang-check/ClangCheck.cpp
  clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
  clang/unittests/Tooling/Syntax/MutationsTest.cpp
  clang/unittests/Tooling/Syntax/SynthesisTest.cpp
  clang/unittests/Tooling/Syntax/TreeTest.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.h

Index: clang/unittests/Tooling/Syntax/TreeTestBase.h
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.h
+++ clang/unittests/Tooling/Syntax/TreeTestBase.h
@@ -17,6 +17,7 @@
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Testing/TestClangConfig.h"
 #include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Syntax/TokenBufferTokenManager.h"
 #include "clang/Tooling/Syntax/Tokens.h"
 #include "clang/Tooling/Syntax/Tree.h"
 #include "llvm/ADT/StringRef.h"
@@ -51,6 +52,7 @@
   std::shared_ptr Invocation;
   // Set after calling buildTree().
   std::unique_ptr TB;
+  std::unique_ptr TM;
   std::unique_ptr Arena;
 };
 
Index: clang/unittests/Tooling/Syntax/TreeTestBase.cpp
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -35,13 +35,14 @@
 using namespace clang::syntax;
 
 namespace {
-ArrayRef tokens(syntax::Node *N) {
+ArrayRef tokens(syntax::Node *N,
+   const TokenBufferTokenManager ) {
   assert(N->isOriginal() && "tokens of modified nodes are not well-defined");
   if (auto *L = dyn_cast(N))
-return llvm::makeArrayRef(L->getToken(), 1);
+return llvm::makeArrayRef(STM.getToken(L->getTokenKey()), 1);
   auto *T = cast(N);
-  return llvm::makeArrayRef(T->findFirstLeaf()->getToken(),
-T->findLastLeaf()->getToken() + 1);
+  return llvm::makeArrayRef(STM.getToken(T->findFirstLeaf()->getTokenKey()),
+STM.getToken(T->findLastLeaf()->getTokenKey()) + 1);
 }
 } // namespace
 
@@ -70,23 +71,26 @@
   public:
 BuildSyntaxTree(syntax::TranslationUnit *,
 std::unique_ptr ,
+std::unique_ptr ,
 std::unique_ptr ,
 std::unique_ptr Tokens)
-: Root(Root), TB(TB), Arena(Arena), Tokens(std::move(Tokens)) {
+: Root(Root), TB(TB), TM(TM), Arena(Arena), Tokens(std::move(Tokens)) {
   assert(this->Tokens);
 }
 
 void HandleTranslationUnit(ASTContext ) override {
   TB = std::make_unique(std::move(*Tokens).consume());
   Tokens = nullptr; // make sure we fail if this gets called twice.
-  Arena = std::make_unique(Ctx.getSourceManager(),
-  Ctx.getLangOpts(), *TB);
-  Root = syntax::buildSyntaxTree(*Arena, Ctx);
+  TM = std::make_unique(
+  *TB, Ctx.getLangOpts(), Ctx.getSourceManager());
+  Arena = std::make_unique();
+  Root = syntax::buildSyntaxTree(*Arena, *TM, Ctx);
 }
 
   private:
 syntax::TranslationUnit *
 std::unique_ptr 
+std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr Tokens;
   };
@@ -94,21 +98,23 @@
   class BuildSyntaxTreeAction : public ASTFrontendAction {
   public:
 BuildSyntaxTreeAction(syntax::TranslationUnit *,
+  std::unique_ptr ,
   std::unique_ptr ,
   std::unique_ptr )
-: Root(Root), TB(TB), Arena(Arena) {}
+: Root(Root), TM(TM), TB(TB), Arena(Arena) {}
 
 std::unique_ptr CreateASTConsumer(CompilerInstance ,
StringRef InFile) override {
   // We start recording the tokens, ast consumer will take on the result.
   auto Tokens =
   std::make_unique(CI.getPreprocessor());
-  return std::make_unique(Root, TB, Arena,
+  return std::make_unique(Root, TB, TM, Arena,
std::move(Tokens));
 }
 
   private:
 

[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-11 Thread Sam McCall via Phabricator via cfe-commits
sammccall added inline comments.



Comment at: clang/lib/Tooling/Syntax/BuildTree.cpp:370
+  TreeBuilder(syntax::Arena )
+  : Arena(Arena), STM(cast(Arena.getTokenManager())),
+Pending(Arena, STM.tokenBuffer()) {

hokein wrote:
> sammccall wrote:
> > need changes to the public API to make this cast valid
> Are you suggesting to change all public APIs where there is such a cast usage?
> 
> If yes, this seems not a trivial change, I think at least we will change all 
> APIs (`buildSyntaxTree`, `createLeaf`, `createTree`, 
> `deepCopyExpandingMacros`) in `BuildTree.h` (by adding a new 
> `TokenBufferTokenManager` parameter). 
> And the `Arena` probably doesn't need to have a `TokenManager` field (it 
> could be simplified as a single `BumpPtrAllocator`), as the TokenManager is 
> passed in parallel with the Arena.
> 
> I'm happy to do the change, but IMO, the current version doesn't seem too bad 
> for me (if we pass an non-SyntaxTokenManager, it will trigger an assertion in 
> debug mode).
> Are you suggesting to change all public APIs where there is such a cast usage?

Yes, at the very least they should document the requirement ("this arena must 
use a TokenBuffer").
An unchecked downcast with no indication on the public API that a specific 
subclass is required just looks like a bug.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-11 Thread Haojian Wu via Phabricator via cfe-commits
hokein added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/SyntaxTokenManager.h:20
+/// It tracks the underlying token buffers, source manager, etc.
+class SyntaxTokenManager : public TokenManager {
+public:

sammccall wrote:
> I don't think "syntax" in "syntax token manager" is particularly 
> disambiguating here, both TokenBuffer and TokenManager are part of syntax, so 
> it's not clear what it refers to (and it doens't have any obvious 
> plain-english meaning).
> 
> Maybe some combination like `TokenBufferTokenManager` or `BufferTokenManager`.
> In fact I think best is `TokenBuffer::TokenManager`, still defined in a 
> separate header, though I'm not sure if you think that's too weird.
Renamed to TokenBufferTokenManager. 

`BufferTokenManager` name is short, but it has `BufferToken` prefix, which 
seems confusing with `TokenBuffer`. `TokenBuffer::TokenManager` is weird to me, 
and doesn't reflect the layering IMO



Comment at: clang/lib/Tooling/Syntax/BuildTree.cpp:370
+  TreeBuilder(syntax::Arena )
+  : Arena(Arena), STM(cast(Arena.getTokenManager())),
+Pending(Arena, STM.tokenBuffer()) {

sammccall wrote:
> need changes to the public API to make this cast valid
Are you suggesting to change all public APIs where there is such a cast usage?

If yes, this seems not a trivial change, I think at least we will change all 
APIs (`buildSyntaxTree`, `createLeaf`, `createTree`, `deepCopyExpandingMacros`) 
in `BuildTree.h` (by adding a new `TokenBufferTokenManager` parameter). 
And the `Arena` probably doesn't need to have a `TokenManager` field (it could 
be simplified as a single `BumpPtrAllocator`), as the TokenManager is passed in 
parallel with the Arena.

I'm happy to do the change, but IMO, the current version doesn't seem too bad 
for me (if we pass an non-SyntaxTokenManager, it will trigger an assertion in 
debug mode).



Comment at: clang/lib/Tooling/Syntax/ComputeReplacements.cpp:94
 const syntax::TranslationUnit ) {
-  const auto  = A.getTokenBuffer();
-  const auto  = A.getSourceManager();
+  const auto  = llvm::cast(A.getTokenManager());
+  const auto  = STM.tokenBuffer();

sammccall wrote:
> Need a change to the public interface to guarantee this cast will succeed.
> The cleanest would be just to take the SyntaxTokenManager as a param (moving 
> the cast to the call site - I don't think Arena is otherwise used for 
> anything).
> 
> Failing that we at least need to update the contract
Done, this is a trivial change.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-11 Thread Haojian Wu via Phabricator via cfe-commits
hokein updated this revision to Diff 443737.
hokein added a comment.

more update


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

Files:
  clang/include/clang/Tooling/Syntax/Mutations.h
  clang/include/clang/Tooling/Syntax/Nodes.h
  clang/include/clang/Tooling/Syntax/TokenBufferTokenManager.h
  clang/include/clang/Tooling/Syntax/TokenManager.h
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/include/clang/Tooling/Syntax/Tree.h
  clang/lib/Tooling/Syntax/BuildTree.cpp
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/ComputeReplacements.cpp
  clang/lib/Tooling/Syntax/Synthesis.cpp
  clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
  clang/lib/Tooling/Syntax/Tree.cpp
  clang/tools/clang-check/ClangCheck.cpp
  clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
  clang/unittests/Tooling/Syntax/MutationsTest.cpp
  clang/unittests/Tooling/Syntax/SynthesisTest.cpp
  clang/unittests/Tooling/Syntax/TreeTest.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.h

Index: clang/unittests/Tooling/Syntax/TreeTestBase.h
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.h
+++ clang/unittests/Tooling/Syntax/TreeTestBase.h
@@ -17,6 +17,7 @@
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Testing/TestClangConfig.h"
 #include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Syntax/TokenBufferTokenManager.h"
 #include "clang/Tooling/Syntax/Tokens.h"
 #include "clang/Tooling/Syntax/Tree.h"
 #include "llvm/ADT/StringRef.h"
@@ -51,6 +52,7 @@
   std::shared_ptr Invocation;
   // Set after calling buildTree().
   std::unique_ptr TB;
+  std::unique_ptr TM;
   std::unique_ptr Arena;
 };
 
Index: clang/unittests/Tooling/Syntax/TreeTestBase.cpp
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -35,13 +35,14 @@
 using namespace clang::syntax;
 
 namespace {
-ArrayRef tokens(syntax::Node *N) {
+ArrayRef tokens(syntax::Node *N,
+   const TokenBufferTokenManager ) {
   assert(N->isOriginal() && "tokens of modified nodes are not well-defined");
   if (auto *L = dyn_cast(N))
-return llvm::makeArrayRef(L->getToken(), 1);
+return llvm::makeArrayRef(STM.getToken(L->getTokenKey()), 1);
   auto *T = cast(N);
-  return llvm::makeArrayRef(T->findFirstLeaf()->getToken(),
-T->findLastLeaf()->getToken() + 1);
+  return llvm::makeArrayRef(STM.getToken(T->findFirstLeaf()->getTokenKey()),
+STM.getToken(T->findLastLeaf()->getTokenKey()) + 1);
 }
 } // namespace
 
@@ -70,23 +71,26 @@
   public:
 BuildSyntaxTree(syntax::TranslationUnit *,
 std::unique_ptr ,
+std::unique_ptr ,
 std::unique_ptr ,
 std::unique_ptr Tokens)
-: Root(Root), TB(TB), Arena(Arena), Tokens(std::move(Tokens)) {
+: Root(Root), TB(TB), TM(TM), Arena(Arena), Tokens(std::move(Tokens)) {
   assert(this->Tokens);
 }
 
 void HandleTranslationUnit(ASTContext ) override {
   TB = std::make_unique(std::move(*Tokens).consume());
   Tokens = nullptr; // make sure we fail if this gets called twice.
-  Arena = std::make_unique(Ctx.getSourceManager(),
-  Ctx.getLangOpts(), *TB);
+  TM = std::make_unique(
+  *TB, Ctx.getLangOpts(), Ctx.getSourceManager());
+  Arena = std::make_unique(*TM);
   Root = syntax::buildSyntaxTree(*Arena, Ctx);
 }
 
   private:
 syntax::TranslationUnit *
 std::unique_ptr 
+std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr Tokens;
   };
@@ -94,21 +98,23 @@
   class BuildSyntaxTreeAction : public ASTFrontendAction {
   public:
 BuildSyntaxTreeAction(syntax::TranslationUnit *,
+  std::unique_ptr ,
   std::unique_ptr ,
   std::unique_ptr )
-: Root(Root), TB(TB), Arena(Arena) {}
+: Root(Root), TM(TM), TB(TB), Arena(Arena) {}
 
 std::unique_ptr CreateASTConsumer(CompilerInstance ,
StringRef InFile) override {
   // We start recording the tokens, ast consumer will take on the result.
   auto Tokens =
   std::make_unique(CI.getPreprocessor());
-  return std::make_unique(Root, TB, Arena,
+  return std::make_unique(Root, TB, TM, Arena,
std::move(Tokens));
 }
 
   private:
 syntax::TranslationUnit *
+std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr 
   };
@@ -149,7 +155,7 @@
   Compiler.setSourceManager(SourceMgr.get());
 
   syntax::TranslationUnit *Root = nullptr;
-  

[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-11 Thread Haojian Wu via Phabricator via cfe-commits
hokein updated this revision to Diff 443736.
hokein marked 7 inline comments as done.
hokein added a comment.

address comments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

Files:
  clang/include/clang/Tooling/Syntax/Mutations.h
  clang/include/clang/Tooling/Syntax/Nodes.h
  clang/include/clang/Tooling/Syntax/TokenBufferTokenManager.h
  clang/include/clang/Tooling/Syntax/TokenManager.h
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/include/clang/Tooling/Syntax/Tree.h
  clang/lib/Tooling/Syntax/BuildTree.cpp
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/ComputeReplacements.cpp
  clang/lib/Tooling/Syntax/Synthesis.cpp
  clang/lib/Tooling/Syntax/TokenBufferTokenManager.cpp
  clang/lib/Tooling/Syntax/Tree.cpp
  clang/tools/clang-check/ClangCheck.cpp
  clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
  clang/unittests/Tooling/Syntax/MutationsTest.cpp
  clang/unittests/Tooling/Syntax/SynthesisTest.cpp
  clang/unittests/Tooling/Syntax/TreeTest.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.h

Index: clang/unittests/Tooling/Syntax/TreeTestBase.h
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.h
+++ clang/unittests/Tooling/Syntax/TreeTestBase.h
@@ -17,6 +17,7 @@
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Testing/TestClangConfig.h"
 #include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Syntax/TokenBufferTokenManager.h"
 #include "clang/Tooling/Syntax/Tokens.h"
 #include "clang/Tooling/Syntax/Tree.h"
 #include "llvm/ADT/StringRef.h"
@@ -51,6 +52,7 @@
   std::shared_ptr Invocation;
   // Set after calling buildTree().
   std::unique_ptr TB;
+  std::unique_ptr TM;
   std::unique_ptr Arena;
 };
 
Index: clang/unittests/Tooling/Syntax/TreeTestBase.cpp
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -35,13 +35,14 @@
 using namespace clang::syntax;
 
 namespace {
-ArrayRef tokens(syntax::Node *N) {
+ArrayRef tokens(syntax::Node *N,
+   const TokenBufferTokenManager ) {
   assert(N->isOriginal() && "tokens of modified nodes are not well-defined");
   if (auto *L = dyn_cast(N))
-return llvm::makeArrayRef(L->getToken(), 1);
+return llvm::makeArrayRef(STM.getToken(L->getTokenKey()), 1);
   auto *T = cast(N);
-  return llvm::makeArrayRef(T->findFirstLeaf()->getToken(),
-T->findLastLeaf()->getToken() + 1);
+  return llvm::makeArrayRef(STM.getToken(T->findFirstLeaf()->getTokenKey()),
+STM.getToken(T->findLastLeaf()->getTokenKey()) + 1);
 }
 } // namespace
 
@@ -70,23 +71,26 @@
   public:
 BuildSyntaxTree(syntax::TranslationUnit *,
 std::unique_ptr ,
+std::unique_ptr ,
 std::unique_ptr ,
 std::unique_ptr Tokens)
-: Root(Root), TB(TB), Arena(Arena), Tokens(std::move(Tokens)) {
+: Root(Root), TB(TB), TM(TM), Arena(Arena), Tokens(std::move(Tokens)) {
   assert(this->Tokens);
 }
 
 void HandleTranslationUnit(ASTContext ) override {
   TB = std::make_unique(std::move(*Tokens).consume());
   Tokens = nullptr; // make sure we fail if this gets called twice.
-  Arena = std::make_unique(Ctx.getSourceManager(),
-  Ctx.getLangOpts(), *TB);
+  TM = std::make_unique(
+  *TB, Ctx.getLangOpts(), Ctx.getSourceManager());
+  Arena = std::make_unique(*TM);
   Root = syntax::buildSyntaxTree(*Arena, Ctx);
 }
 
   private:
 syntax::TranslationUnit *
 std::unique_ptr 
+std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr Tokens;
   };
@@ -94,21 +98,23 @@
   class BuildSyntaxTreeAction : public ASTFrontendAction {
   public:
 BuildSyntaxTreeAction(syntax::TranslationUnit *,
+  std::unique_ptr ,
   std::unique_ptr ,
   std::unique_ptr )
-: Root(Root), TB(TB), Arena(Arena) {}
+: Root(Root), TM(TM), TB(TB), Arena(Arena) {}
 
 std::unique_ptr CreateASTConsumer(CompilerInstance ,
StringRef InFile) override {
   // We start recording the tokens, ast consumer will take on the result.
   auto Tokens =
   std::make_unique(CI.getPreprocessor());
-  return std::make_unique(Root, TB, Arena,
+  return std::make_unique(Root, TB, TM, Arena,
std::move(Tokens));
 }
 
   private:
 syntax::TranslationUnit *
+std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr 
   };
@@ -149,7 +155,7 @@
   Compiler.setSourceManager(SourceMgr.get());
 
   

[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-11 Thread Sam McCall via Phabricator via cfe-commits
sammccall added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/Nodes.h:24
 
 #include "clang/Basic/TokenKinds.h"
 #include "clang/Lex/Token.h"

I think Token and TokenKinds are also no longer needed?



Comment at: clang/include/clang/Tooling/Syntax/SyntaxTokenManager.h:20
+/// It tracks the underlying token buffers, source manager, etc.
+class SyntaxTokenManager : public TokenManager {
+public:

I don't think "syntax" in "syntax token manager" is particularly disambiguating 
here, both TokenBuffer and TokenManager are part of syntax, so it's not clear 
what it refers to (and it doens't have any obvious plain-english meaning).

Maybe some combination like `TokenBufferTokenManager` or `BufferTokenManager`.
In fact I think best is `TokenBuffer::TokenManager`, still defined in a 
separate header, though I'm not sure if you think that's too weird.



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:14
+// implementation. It enables producers (e.g. clang pseudoparser) to produce a
+// syntax-tree with a different underlying token implementation.
+//

unclear: different from what? it's not clear what "enables" means if there's no 
default. Maybe replace the sentence with:

"For example, a TokenBuffer captured from a clang parse may track macro 
expansions and associate tokens with clang's SourceManager, while a 
pseudo-parser would use a flat array of raw-lexed tokens in memory."



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:38
+  using Key = uintptr_t;
+  /// Gets the text of token identified by the key.
+  virtual llvm::StringRef getText(Key K) const = 0;

This is not a useful comment, either remove it or add more content to make it 
useful.

In particular the guarantees (or lack thereof) of exactly what this text is 
would be helpful. (This is some source code that would produce this token, 
though it may differ from exactly what was spelled in the file when 
preprocessing is involved)?



Comment at: clang/include/clang/Tooling/Syntax/Tree.h:9
 // Defines the basic structure of the syntax tree. There are two kinds of 
nodes:
-//   - leaf nodes correspond to a token in the expanded token stream,
+//   - leaf nodes correspond to a token key in the token manager
 //   - tree nodes correspond to language grammar constructs.

this is an implementation detail.
At a high level, leaf nodes correspond to tokens. I'd just delete "expanded" 
from the original comment



Comment at: clang/include/clang/Tooling/Syntax/Tree.h:46
 private:
-  SourceManager 
-  const LangOptions 
-  const TokenBuffer 
-  /// IDs and storage for additional tokenized files.
-  llvm::DenseMap> ExtraTokens;
+  // Manage all token-related stuff.
+  TokenManager& TokenMgr;

this is not a helpful comment, just remove it?



Comment at: clang/lib/Tooling/Syntax/BuildTree.cpp:370
+  TreeBuilder(syntax::Arena )
+  : Arena(Arena), STM(cast(Arena.getTokenManager())),
+Pending(Arena, STM.tokenBuffer()) {

need changes to the public API to make this cast valid



Comment at: clang/lib/Tooling/Syntax/ComputeReplacements.cpp:94
 const syntax::TranslationUnit ) {
-  const auto  = A.getTokenBuffer();
-  const auto  = A.getSourceManager();
+  const auto  = llvm::cast(A.getTokenManager());
+  const auto  = STM.tokenBuffer();

Need a change to the public interface to guarantee this cast will succeed.
The cleanest would be just to take the SyntaxTokenManager as a param (moving 
the cast to the call site - I don't think Arena is otherwise used for anything).

Failing that we at least need to update the contract


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-08 Thread Haojian Wu via Phabricator via cfe-commits
hokein added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:37
+
+  // FIXME: add an interface for getting token kind.
+};

sammccall wrote:
> I wouldn't want to prejudge this: this is a very basic attribute similar to 
> kind/role, and we may want to store it in Leaf and avoid the virtual stuff.
> 
> There's certainly enough space, e.g. of the current 16-bit `kind` use the top 
> bit to denote leaf-or-not and the bottom 15 bits to store kind-or-tokenkind.
This sounds good. Adjust the FIXME.



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:463
+/// It tracks the underlying token buffers, source manager, etc.
+class SyntaxTokenManager : public TokenManager {
+public:

ilya-biryukov wrote:
> sammccall wrote:
> > conceptually this is just "TokenBuffer implements TokenManager"
> > 
> > The main reason I can see not to actually write that is to avoid the 
> > dependency from TokenBuffer (tokens library) to TokenManager (syntax 
> > library). But here you've added that dependency anyway.
> > 
> > So I think we'd be better either with `TokenBuffer : TokenManager` or 
> > moving this class to its own header.
> I would argue they should be separate concepts.
> - `TokenBuffer` is about storing tokens and mapping between expanded and 
> spelled token streams.
> - `SyntaxTokenManager` is about implementing relevant certain operations on 
> `syntax::Token` and hiding the actual token type.
> Conceptually those things are different and decoupling them allows for more 
> flexibility and allows reasoning about them independently. In particular, one 
> could imagine having a `SyntaxTokenManager` without a `TokenBuffer` if we do 
> not need to deal with two streams of tokens.
> 
> I would suggest keeping `SyntaxTokenManager` as a separate class.
> I would suggest keeping SyntaxTokenManager as a separate class.

+1. Moved to a separate file.



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:465
+public:
+  SyntaxTokenManager(SourceManager , const LangOptions ,
+ const TokenBuffer )

sammccall wrote:
> no need to take SourceManager, TokenBuffer already includes it.
> LangOpts is unused here.
> no need to take SourceManager, TokenBuffer already includes it.

The SourceMgr can be mutated by the class (it stores the underlying tokens for 
`ExtraTokens`), while the SourceManager in TokenBuffer is immutable.

> LangOpts is unused here.
It is used in SyntaxTokenManager::lexBuffer.



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:475
+assert(Token);
+// Handle 'eof' separately, calling text() on it produces an empty string.
+if (Token->kind() == tok::eof)

sammccall wrote:
> Empty string seems like the correct return value here to me.
> If you want a special case for dump, I think that belongs in dump().
> 
> If this is because we currently provide no way to get the token kind, then 
> this should be a FIXME
Yeah, this special case is for the Leaf node dump. Added a FIXME. 

(I even double whether we need this special case at all, do we really want to 
build a Leaf node for eof token?)



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:504
+  /// IDs and storage for additional tokenized files.
+  llvm::DenseMap> ExtraTokens;
+};

sammccall wrote:
> aren't we trying to store these on the syntax arena?
> We never do lookups into this map, maybe lexbuffer just allocates storage on 
> the arena('s allocator) instead of using this map?
> aren't we trying to store these on the syntax arena?

We could do that, but I'd try to avoid that. Now the allocator of syntax::Arena 
is a storage for syntax-tree nodes only. Allocation for token-related stuff is 
on the `SyntaxTokenManager`.

> We never do lookups into this map, maybe lexbuffer just allocates storage on 
> the arena('s allocator) instead of using this map?

This map is moved from the syntax::Arena. 

You're right, there is no usage of the key at the moment, and the only use-case 
is to create a syntax leaf node that not backed by the source code (for 
refactoring usecase), it is unclear whether we will use it in the future. I 
will keep it as-is (this is not the scope of this patch).



Comment at: clang/unittests/Tooling/Syntax/TreeTestBase.cpp:116
 syntax::TranslationUnit *
+std::unique_ptr 
 std::unique_ptr 

ilya-biryukov wrote:
> NIT: it´s not breaking anything now, but I suggest putting SyntaxTokenManager 
> after TokenBuffer.
> The reason is that it´s the right destruction order, TokenManager has 
> references to TokenBuffer, so it could potentially access it in destructor 
> some time in the future (e.g. imagine asserting something on tokens).
> 
> Not that it actually breaks today, but seems like a potential surprising bug 
> in the future if we happen to refactor code in a 

[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-07-08 Thread Haojian Wu via Phabricator via cfe-commits
hokein updated this revision to Diff 443173.
hokein marked 9 inline comments as done.
hokein added a comment.
Herald added a subscriber: mgorny.

address review comments.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

Files:
  clang/include/clang/Tooling/Syntax/Nodes.h
  clang/include/clang/Tooling/Syntax/SyntaxTokenManager.h
  clang/include/clang/Tooling/Syntax/TokenManager.h
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/include/clang/Tooling/Syntax/Tree.h
  clang/lib/Tooling/Syntax/BuildTree.cpp
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/ComputeReplacements.cpp
  clang/lib/Tooling/Syntax/SyntaxTokenManager.cpp
  clang/lib/Tooling/Syntax/Synthesis.cpp
  clang/lib/Tooling/Syntax/Tree.cpp
  clang/tools/clang-check/ClangCheck.cpp
  clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
  clang/unittests/Tooling/Syntax/SynthesisTest.cpp
  clang/unittests/Tooling/Syntax/TreeTest.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.h

Index: clang/unittests/Tooling/Syntax/TreeTestBase.h
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.h
+++ clang/unittests/Tooling/Syntax/TreeTestBase.h
@@ -17,6 +17,7 @@
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Testing/TestClangConfig.h"
 #include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Syntax/SyntaxTokenManager.h"
 #include "clang/Tooling/Syntax/Tokens.h"
 #include "clang/Tooling/Syntax/Tree.h"
 #include "llvm/ADT/StringRef.h"
@@ -51,6 +52,7 @@
   std::shared_ptr Invocation;
   // Set after calling buildTree().
   std::unique_ptr TB;
+  std::unique_ptr TM;
   std::unique_ptr Arena;
 };
 
Index: clang/unittests/Tooling/Syntax/TreeTestBase.cpp
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -22,6 +22,7 @@
 #include "clang/Testing/TestClangConfig.h"
 #include "clang/Tooling/Syntax/BuildTree.h"
 #include "clang/Tooling/Syntax/Nodes.h"
+#include "clang/Tooling/Syntax/SyntaxTokenManager.h"
 #include "clang/Tooling/Syntax/Tokens.h"
 #include "clang/Tooling/Syntax/Tree.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -35,13 +36,13 @@
 using namespace clang::syntax;
 
 namespace {
-ArrayRef tokens(syntax::Node *N) {
+ArrayRef tokens(syntax::Node *N, const SyntaxTokenManager ) {
   assert(N->isOriginal() && "tokens of modified nodes are not well-defined");
   if (auto *L = dyn_cast(N))
-return llvm::makeArrayRef(L->getToken(), 1);
+return llvm::makeArrayRef(STM.getToken(L->getTokenKey()), 1);
   auto *T = cast(N);
-  return llvm::makeArrayRef(T->findFirstLeaf()->getToken(),
-T->findLastLeaf()->getToken() + 1);
+  return llvm::makeArrayRef(STM.getToken(T->findFirstLeaf()->getTokenKey()),
+STM.getToken(T->findLastLeaf()->getTokenKey()) + 1);
 }
 } // namespace
 
@@ -70,23 +71,26 @@
   public:
 BuildSyntaxTree(syntax::TranslationUnit *,
 std::unique_ptr ,
+std::unique_ptr ,
 std::unique_ptr ,
 std::unique_ptr Tokens)
-: Root(Root), TB(TB), Arena(Arena), Tokens(std::move(Tokens)) {
+: Root(Root), TB(TB), TM(TM), Arena(Arena), Tokens(std::move(Tokens)) {
   assert(this->Tokens);
 }
 
 void HandleTranslationUnit(ASTContext ) override {
   TB = std::make_unique(std::move(*Tokens).consume());
   Tokens = nullptr; // make sure we fail if this gets called twice.
-  Arena = std::make_unique(Ctx.getSourceManager(),
-  Ctx.getLangOpts(), *TB);
+  TM = std::make_unique(*TB, Ctx.getLangOpts(),
+Ctx.getSourceManager());
+  Arena = std::make_unique(*TM);
   Root = syntax::buildSyntaxTree(*Arena, Ctx);
 }
 
   private:
 syntax::TranslationUnit *
 std::unique_ptr 
+std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr Tokens;
   };
@@ -94,21 +98,23 @@
   class BuildSyntaxTreeAction : public ASTFrontendAction {
   public:
 BuildSyntaxTreeAction(syntax::TranslationUnit *,
+  std::unique_ptr ,
   std::unique_ptr ,
   std::unique_ptr )
-: Root(Root), TB(TB), Arena(Arena) {}
+: Root(Root), TM(TM), TB(TB), Arena(Arena) {}
 
 std::unique_ptr CreateASTConsumer(CompilerInstance ,
StringRef InFile) override {
   // We start recording the tokens, ast consumer will take on the result.
   auto Tokens =
   std::make_unique(CI.getPreprocessor());
-  return std::make_unique(Root, TB, Arena,
+  return std::make_unique(Root, TB, TM, Arena,
  

[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-27 Thread Ilya Biryukov via Phabricator via cfe-commits
ilya-biryukov added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:463
+/// It tracks the underlying token buffers, source manager, etc.
+class SyntaxTokenManager : public TokenManager {
+public:

sammccall wrote:
> conceptually this is just "TokenBuffer implements TokenManager"
> 
> The main reason I can see not to actually write that is to avoid the 
> dependency from TokenBuffer (tokens library) to TokenManager (syntax 
> library). But here you've added that dependency anyway.
> 
> So I think we'd be better either with `TokenBuffer : TokenManager` or moving 
> this class to its own header.
I would argue they should be separate concepts.
- `TokenBuffer` is about storing tokens and mapping between expanded and 
spelled token streams.
- `SyntaxTokenManager` is about implementing relevant certain operations on 
`syntax::Token` and hiding the actual token type.
Conceptually those things are different and decoupling them allows for more 
flexibility and allows reasoning about them independently. In particular, one 
could imagine having a `SyntaxTokenManager` without a `TokenBuffer` if we do 
not need to deal with two streams of tokens.

I would suggest keeping `SyntaxTokenManager` as a separate class.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-27 Thread Sam McCall via Phabricator via cfe-commits
sammccall added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:9
+//
+// Defines Token interfaces for the syntax-tree, decoupling the syntax-tree 
from
+// the TokenBuffer. It enables producers (e.g. clang pseudoparser) to produce a

It's important that we have comments explaining what the concept is *now*, 
rather than how it changes the code structure from the previous state 
(decouples tokenbuffer, enables pseudoparser).

(I think this comment is actually fine, but be careful when writing the class 
comment for TokenManager)



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:33
+  /// The syntax-tree Leaf node holds a Key.
+  using Key = const void *;
+  /// Gets the text of token identified by the key.

hokein wrote:
> ilya-biryukov wrote:
> > I have just realized that we were discussing having opaque index as a key, 
> > but there may also be tokens in the syntax tree that are not backed by the 
> > `TokenBuffer`.
> > Those can be synthesized, e.g. imagine someone wants to change `!(A && B)` 
> > to `!A || !B`. They will need to synthesize at least the `||` token as it's 
> > not in the source code. There is a way to do this now and it prohibits the 
> > use of an index to the `TokenBuffer`.
> > 
> > So having the opaque pointer is probably ok for now, it should enable the 
> > pseudo-parser to build syntax trees.
> > We might want to add an operation to synthesize tokens into the 
> > `TokenManager` at some point, but that's a discussion for another day.
> 
> > Those can be synthesized, e.g. imagine someone wants to change !(A && B) to 
> > !A || !B. They will need to synthesize at least the || token as it's not in 
> > the source code. There is a way to do this now and it prohibits the use of 
> > an index to the TokenBuffer.
> 
> Yes, this is the exact reason why the Key is an opaque pointer, my first 
> attempt was to use an index integer, but failed -- we already have some APIs 
> doing this stuff (see `createLeaf` in BuildTree.h), the token can be a 
> synthesized token backed up by the SourceManager...
> 
> Personally, I don't like the Key to be an opaque pointer as well, but 
> considering the effort, it seems to be the best approach so far -- it enables 
> the pseudoparser to build syntax trees with a different Token implementation 
> while keeping the rest syntax stuff unchanged.
> 
> > We might want to add an operation to synthesize tokens into the 
> > TokenManager at some point, but that's a discussion for another day.
> 
> Agree, we will encounter this in the future, but we're still far away from 
> there (the layering mutation/syntax-tree is not perfect at the moment, 
> mutation still depends on the TokenBuffer). And our initial application of 
> syntax-tree in pseudoparser focuses on the read use-case, we should be fine 
> now.
Consider `uintptr_t` instead, which more naturally supports both approaches. 
(Stashing an integer in a void* is incredibly weird)



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:37
+
+  // FIXME: add an interface for getting token kind.
+};

I wouldn't want to prejudge this: this is a very basic attribute similar to 
kind/role, and we may want to store it in Leaf and avoid the virtual stuff.

There's certainly enough space, e.g. of the current 16-bit `kind` use the top 
bit to denote leaf-or-not and the bottom 15 bits to store kind-or-tokenkind.



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:463
+/// It tracks the underlying token buffers, source manager, etc.
+class SyntaxTokenManager : public TokenManager {
+public:

conceptually this is just "TokenBuffer implements TokenManager"

The main reason I can see not to actually write that is to avoid the dependency 
from TokenBuffer (tokens library) to TokenManager (syntax library). But here 
you've added that dependency anyway.

So I think we'd be better either with `TokenBuffer : TokenManager` or moving 
this class to its own header.



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:465
+public:
+  SyntaxTokenManager(SourceManager , const LangOptions ,
+ const TokenBuffer )

no need to take SourceManager, TokenBuffer already includes it.
LangOpts is unused here.



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:475
+assert(Token);
+// Handle 'eof' separately, calling text() on it produces an empty string.
+if (Token->kind() == tok::eof)

Empty string seems like the correct return value here to me.
If you want a special case for dump, I think that belongs in dump().

If this is because we currently provide no way to get the token kind, then this 
should be a FIXME



Comment at: clang/include/clang/Tooling/Syntax/Tokens.h:487
+  const SourceManager () const { return SM; }
+  const 

[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-27 Thread Ilya Biryukov via Phabricator via cfe-commits
ilya-biryukov added inline comments.



Comment at: clang/lib/Tooling/Syntax/BuildTree.cpp:569
   struct Forest {
-Forest(syntax::Arena ) {
-  assert(!A.getTokenBuffer().expandedTokens().empty());
-  assert(A.getTokenBuffer().expandedTokens().back().kind() == tok::eof);
+Forest(syntax::Arena , const syntax::SyntaxTokenManager ) {
+  assert(!STM.getTokenBuffer().expandedTokens().empty());

Could we accept a TokenBuffer here directly?
If TokenManager is needed, we can use it directly in the arena.



Comment at: clang/lib/Tooling/Syntax/BuildTree.cpp:625
 /// Add \p Node to the forest and attach child nodes based on \p Tokens.
-void foldChildren(const syntax::Arena , ArrayRef Tokens,
+void foldChildren(const syntax::SyntaxTokenManager , 
ArrayRef Tokens,
   syntax::Tree *Node) {

Same suggestion here: accept TokenBuffer instead of TokenManager



Comment at: clang/lib/Tooling/Syntax/Tree.cpp:271
+  // FIXME: can be fixed by adding an tok::Kind in the Leaf node.
+  // assert(cast(C).getToken()->kind() == 
L->getDelimiterTokenKind());
 }

hokein wrote:
> ilya-biryukov wrote:
> > Maybe add `TokenManager::getKind(Key)` right away and remove this FIXME.
> > This should as simple as `cast(T)->Kind`, right? Or am I 
> > missing some complications?
> Yeah, the main problem is that we don't have the `TokenManager` object in the 
> `syntax::Node`, we somehow need to pass it (e.g. a function parameter), which 
> seems a heavy change. I'm not sure it is worth for this small assert.
That makes sense. WDYT about the alternative fix: to pass ̀TokenManager` to 
`assertInvariants`?
Not necessary to do it now, just thinking about changing the FIXME



Comment at: clang/unittests/Tooling/Syntax/TreeTestBase.cpp:116
 syntax::TranslationUnit *
+std::unique_ptr 
 std::unique_ptr 

NIT: it´s not breaking anything now, but I suggest putting SyntaxTokenManager 
after TokenBuffer.
The reason is that it´s the right destruction order, TokenManager has 
references to TokenBuffer, so it could potentially access it in destructor some 
time in the future (e.g. imagine asserting something on tokens).

Not that it actually breaks today, but seems like a potential surprising bug in 
the future if we happen to refactor code in a certain way. 


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-24 Thread Haojian Wu via Phabricator via cfe-commits
hokein added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:33
+  /// The syntax-tree Leaf node holds a Key.
+  using Key = const void *;
+  /// Gets the text of token identified by the key.

ilya-biryukov wrote:
> I have just realized that we were discussing having opaque index as a key, 
> but there may also be tokens in the syntax tree that are not backed by the 
> `TokenBuffer`.
> Those can be synthesized, e.g. imagine someone wants to change `!(A && B)` to 
> `!A || !B`. They will need to synthesize at least the `||` token as it's not 
> in the source code. There is a way to do this now and it prohibits the use of 
> an index to the `TokenBuffer`.
> 
> So having the opaque pointer is probably ok for now, it should enable the 
> pseudo-parser to build syntax trees.
> We might want to add an operation to synthesize tokens into the 
> `TokenManager` at some point, but that's a discussion for another day.

> Those can be synthesized, e.g. imagine someone wants to change !(A && B) to 
> !A || !B. They will need to synthesize at least the || token as it's not in 
> the source code. There is a way to do this now and it prohibits the use of an 
> index to the TokenBuffer.

Yes, this is the exact reason why the Key is an opaque pointer, my first 
attempt was to use an index integer, but failed -- we already have some APIs 
doing this stuff (see `createLeaf` in BuildTree.h), the token can be a 
synthesized token backed up by the SourceManager...

Personally, I don't like the Key to be an opaque pointer as well, but 
considering the effort, it seems to be the best approach so far -- it enables 
the pseudoparser to build syntax trees with a different Token implementation 
while keeping the rest syntax stuff unchanged.

> We might want to add an operation to synthesize tokens into the TokenManager 
> at some point, but that's a discussion for another day.

Agree, we will encounter this in the future, but we're still far away from 
there (the layering mutation/syntax-tree is not perfect at the moment, mutation 
still depends on the TokenBuffer). And our initial application of syntax-tree 
in pseudoparser focuses on the read use-case, we should be fine now.



Comment at: clang/lib/Tooling/Syntax/Tree.cpp:271
+  // FIXME: can be fixed by adding an tok::Kind in the Leaf node.
+  // assert(cast(C).getToken()->kind() == 
L->getDelimiterTokenKind());
 }

ilya-biryukov wrote:
> Maybe add `TokenManager::getKind(Key)` right away and remove this FIXME.
> This should as simple as `cast(T)->Kind`, right? Or am I 
> missing some complications?
Yeah, the main problem is that we don't have the `TokenManager` object in the 
`syntax::Node`, we somehow need to pass it (e.g. a function parameter), which 
seems a heavy change. I'm not sure it is worth for this small assert.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-24 Thread Ilya Biryukov via Phabricator via cfe-commits
ilya-biryukov added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:33
+  /// The syntax-tree Leaf node holds a Key.
+  using Key = const void *;
+  /// Gets the text of token identified by the key.

I have just realized that we were discussing having opaque index as a key, but 
there may also be tokens in the syntax tree that are not backed by the 
`TokenBuffer`.
Those can be synthesized, e.g. imagine someone wants to change `!(A && B)` to 
`!A || !B`. They will need to synthesize at least the `||` token as it's not in 
the source code. There is a way to do this now and it prohibits the use of an 
index to the `TokenBuffer`.

So having the opaque pointer is probably ok for now, it should enable the 
pseudo-parser to build syntax trees.
We might want to add an operation to synthesize tokens into the `TokenManager` 
at some point, but that's a discussion for another day.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-24 Thread Ilya Biryukov via Phabricator via cfe-commits
ilya-biryukov added inline comments.



Comment at: clang/include/clang/Tooling/Syntax/TokenManager.h:23
+
+/// Base token interfaces for the syntax-tree.
+class TokenManager {

NIT: maybe explain the `TokenManager` concept here, the comment seems to be a 
leftover from the previous revision.



Comment at: clang/lib/Tooling/Syntax/Tree.cpp:271
+  // FIXME: can be fixed by adding an tok::Kind in the Leaf node.
+  // assert(cast(C).getToken()->kind() == 
L->getDelimiterTokenKind());
 }

Maybe add `TokenManager::getKind(Key)` right away and remove this FIXME.
This should as simple as `cast(T)->Kind`, right? Or am I missing 
some complications?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-24 Thread Haojian Wu via Phabricator via cfe-commits
hokein added a comment.

I'm quite happy about the interfaces now, it should be in a good shape for the 
API review (would be nice to get some initial feedback before I do further 
cleanup).


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-24 Thread Haojian Wu via Phabricator via cfe-commits
hokein updated this revision to Diff 439705.
hokein added a comment.

Revised to the TokenManager approach:

- Inroduce a Base Token class (TokenManager) for syntax-tree, the motivation is 
to allow using different underlying token implementation in syntax-tree
- Decouple the syntax-tree from the TokenBuffer:
  - syntax-tree structure (Tree.h) doesn't depend on the TokenBuffer, 
SourceManager Source location etc, it communicates with TokenManager interfaces;
  - syntax-tree Arena is simpler, the token-managing responsiblity is 
transferred to TokenManager;
  - in SyntaxTree directory, we implement a TokenBuffer-based 
SyntaxTokenManager, which mangues all token-related stuff
  - For the mutation/replacement computation APIs, currently they only work on 
a TokenBuffer-based token manager. Asssertion will be raised if it is not 
satisfied. It is an NFC change


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

Files:
  clang/include/clang/Tooling/Syntax/Nodes.h
  clang/include/clang/Tooling/Syntax/TokenManager.h
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/include/clang/Tooling/Syntax/Tree.h
  clang/lib/Tooling/Syntax/BuildTree.cpp
  clang/lib/Tooling/Syntax/ComputeReplacements.cpp
  clang/lib/Tooling/Syntax/Synthesis.cpp
  clang/lib/Tooling/Syntax/Tokens.cpp
  clang/lib/Tooling/Syntax/Tree.cpp
  clang/tools/clang-check/ClangCheck.cpp
  clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
  clang/unittests/Tooling/Syntax/SynthesisTest.cpp
  clang/unittests/Tooling/Syntax/TreeTest.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.h

Index: clang/unittests/Tooling/Syntax/TreeTestBase.h
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.h
+++ clang/unittests/Tooling/Syntax/TreeTestBase.h
@@ -50,6 +50,7 @@
   new SourceManager(*Diags, *FileMgr);
   std::shared_ptr Invocation;
   // Set after calling buildTree().
+  std::unique_ptr TM;
   std::unique_ptr TB;
   std::unique_ptr Arena;
 };
Index: clang/unittests/Tooling/Syntax/TreeTestBase.cpp
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -35,13 +35,13 @@
 using namespace clang::syntax;
 
 namespace {
-ArrayRef tokens(syntax::Node *N) {
+ArrayRef tokens(syntax::Node *N, const SyntaxTokenManager ) {
   assert(N->isOriginal() && "tokens of modified nodes are not well-defined");
   if (auto *L = dyn_cast(N))
-return llvm::makeArrayRef(L->getToken(), 1);
+return llvm::makeArrayRef(STM.getToken(L->getTokenKey()), 1);
   auto *T = cast(N);
-  return llvm::makeArrayRef(T->findFirstLeaf()->getToken(),
-T->findLastLeaf()->getToken() + 1);
+  return llvm::makeArrayRef(STM.getToken(T->findFirstLeaf()->getTokenKey()),
+STM.getToken(T->findLastLeaf()->getTokenKey()) + 1);
 }
 } // namespace
 
@@ -69,23 +69,26 @@
   class BuildSyntaxTree : public ASTConsumer {
   public:
 BuildSyntaxTree(syntax::TranslationUnit *,
+std::unique_ptr ,
 std::unique_ptr ,
 std::unique_ptr ,
 std::unique_ptr Tokens)
-: Root(Root), TB(TB), Arena(Arena), Tokens(std::move(Tokens)) {
+: Root(Root), TM(TM), TB(TB), Arena(Arena), Tokens(std::move(Tokens)) {
   assert(this->Tokens);
 }
 
 void HandleTranslationUnit(ASTContext ) override {
   TB = std::make_unique(std::move(*Tokens).consume());
   Tokens = nullptr; // make sure we fail if this gets called twice.
-  Arena = std::make_unique(Ctx.getSourceManager(),
-  Ctx.getLangOpts(), *TB);
+  TM = std::make_unique(Ctx.getSourceManager(),
+Ctx.getLangOpts(), *TB);
+  Arena = std::make_unique(*TM);
   Root = syntax::buildSyntaxTree(*Arena, Ctx);
 }
 
   private:
 syntax::TranslationUnit *
+std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr 
 std::unique_ptr Tokens;
@@ -94,21 +97,23 @@
   class BuildSyntaxTreeAction : public ASTFrontendAction {
   public:
 BuildSyntaxTreeAction(syntax::TranslationUnit *,
+  std::unique_ptr ,
   std::unique_ptr ,
   std::unique_ptr )
-: Root(Root), TB(TB), Arena(Arena) {}
+: Root(Root), TM(TM), TB(TB), Arena(Arena) {}
 
 std::unique_ptr CreateASTConsumer(CompilerInstance ,
StringRef InFile) override {
   // We start recording the tokens, ast consumer will take on the result.
   auto Tokens =
   std::make_unique(CI.getPreprocessor());
-  return std::make_unique(Root, TB, Arena,
+  return 

[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-23 Thread Sam McCall via Phabricator via cfe-commits
sammccall added a comment.

As discussed offline this has some problems:

- putting virtual methods on BaseToken gives it a vtable, which makes it (and 
syntax::Token) large
- being able to use ArrayRef but not ArrayRef is a 
bit weird
- unusual uses of inheritance can be hard to reason about

We suggested rather having Leaf store an opaque ID.
Callers who know what kind of tokens are in use can use this to associate with 
the original token.
For generic use (e.g dump()), we can have a TokenManager interface which 
provides the common operations (like getText()). This generalizes where 
SourceManager is needed today.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128411/new/

https://reviews.llvm.org/D128411

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128411: [syntax] Introduce a BaseToken class.

2022-06-23 Thread Haojian Wu via Phabricator via cfe-commits
hokein created this revision.
hokein added reviewers: sammccall, ilya-biryukov.
Herald added a project: All.
hokein requested review of this revision.
Herald added a project: clang.

This enables us to use a different underlying token implementation for the
syntax Leaf node -- in clang pseudoparser, we want to produce a
syntax-tree with its own pseudo::Token rather than syntax::Token.

Now Leaf node points to a BaseToken, which can be a different subclass
depending on token implementation.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D128411

Files:
  clang/include/clang/Tooling/Syntax/BaseToken.h
  clang/include/clang/Tooling/Syntax/Tokens.h
  clang/include/clang/Tooling/Syntax/Tree.h
  clang/lib/Tooling/Syntax/BuildTree.cpp
  clang/lib/Tooling/Syntax/ComputeReplacements.cpp
  clang/lib/Tooling/Syntax/Synthesis.cpp
  clang/lib/Tooling/Syntax/Tokens.cpp
  clang/lib/Tooling/Syntax/Tree.cpp
  clang/tools/clang-check/ClangCheck.cpp
  clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
  clang/unittests/Tooling/Syntax/SynthesisTest.cpp
  clang/unittests/Tooling/Syntax/TokensTest.cpp
  clang/unittests/Tooling/Syntax/TreeTest.cpp
  clang/unittests/Tooling/Syntax/TreeTestBase.cpp

Index: clang/unittests/Tooling/Syntax/TreeTestBase.cpp
===
--- clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -38,10 +38,10 @@
 ArrayRef tokens(syntax::Node *N) {
   assert(N->isOriginal() && "tokens of modified nodes are not well-defined");
   if (auto *L = dyn_cast(N))
-return llvm::makeArrayRef(L->getToken(), 1);
+return llvm::makeArrayRef(L->getTokenAs(), 1);
   auto *T = cast(N);
-  return llvm::makeArrayRef(T->findFirstLeaf()->getToken(),
-T->findLastLeaf()->getToken() + 1);
+  return llvm::makeArrayRef(T->findFirstLeaf()->getTokenAs(),
+T->findLastLeaf()->getTokenAs() + 1);
 }
 } // namespace
 
Index: clang/unittests/Tooling/Syntax/TreeTest.cpp
===
--- clang/unittests/Tooling/Syntax/TreeTest.cpp
+++ clang/unittests/Tooling/Syntax/TreeTest.cpp
@@ -180,7 +180,7 @@
 private:
   std::string dumpQuotedTokensOrNull(const Node *N) {
 return N ? "'" +
-   StringRef(N->dumpTokens(Arena->getSourceManager()))
+   StringRef(N->dumpTokens(>getSourceManager()))
.trim()
.str() +
"'"
Index: clang/unittests/Tooling/Syntax/TokensTest.cpp
===
--- clang/unittests/Tooling/Syntax/TokensTest.cpp
+++ clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -220,7 +220,7 @@
 }
 // An equality test for search.
 auto TextMatches = [this](llvm::StringRef Q, const syntax::Token ) {
-  return Q == T.text(*SourceMgr);
+  return Q == T.text(&*SourceMgr);
 };
 // Find a match.
 auto Found =
@@ -906,7 +906,7 @@
 SourceLocation Loc = SourceMgr->getComposedLoc(SourceMgr->getMainFileID(),
Code.points()[Index]);
 const syntax::Token *Tok = spelledIdentifierTouching(Loc, Buffer);
-return Tok ? Tok->text(*SourceMgr) : "";
+return Tok ? Tok->text(&*SourceMgr) : "";
   };
 
   EXPECT_THAT(Touching(0), SameRange(findSpelled("int")));
Index: clang/unittests/Tooling/Syntax/SynthesisTest.cpp
===
--- clang/unittests/Tooling/Syntax/SynthesisTest.cpp
+++ clang/unittests/Tooling/Syntax/SynthesisTest.cpp
@@ -27,7 +27,8 @@
   return ::testing::AssertionFailure()
  << "Root was not built successfully.";
 
-auto Actual = StringRef(Root->dump(Arena->getSourceManager())).trim().str();
+auto Actual =
+StringRef(Root->dump(>getSourceManager())).trim().str();
 auto Expected = Dump.trim().str();
 // EXPECT_EQ shows the diff between the two strings if they are different.
 EXPECT_EQ(Expected, Actual);
@@ -175,7 +176,7 @@
 
   auto *Copy = deepCopyExpandingMacros(*Arena, StatementContinue);
   EXPECT_TRUE(
-  treeDumpEqual(Copy, StatementContinue->dump(Arena->getSourceManager(;
+  treeDumpEqual(Copy, StatementContinue->dump(>getSourceManager(;
   // FIXME: Test that copy is independent of original, once the Mutations API is
   // more developed.
 }
Index: clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
===
--- clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
+++ clang/unittests/Tooling/Syntax/BuildTreeTest.cpp
@@ -26,7 +26,7 @@
 auto ErrorOK = errorOK(Code);
 if (!ErrorOK)
   return ErrorOK;
-auto Actual = StringRef(Root->dump(Arena->getSourceManager())).trim().str();
+auto Actual = StringRef(Root->dump(>getSourceManager())).trim().str();
 // EXPECT_EQ shows the