[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-07-05 Thread Sam McCall via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
sammccall marked an inline comment as done.
Closed by commit rG312116748890: [pseudo] Add error-recovery framework  
brace-based recovery (authored by sammccall).

Changed prior to commit:
  https://reviews.llvm.org/D128486?vs=441154=442369#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,8 +7,9 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
-#include "clang-pseudo/Token.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Language.h"
+#include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TokenKinds.h"
@@ -33,11 +34,13 @@
 using StateID = LRTable::StateID;
 using testing::AllOf;
 using testing::ElementsAre;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -247,9 +250,9 @@
   /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
   /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-  GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-  /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+  /*State=*/3, /*ForestNode=*/ClassNameNode,
+  /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
   GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
   /*Parents=*/{GSSNode2});
@@ -366,6 +369,120 @@
   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable::Builder B(TestLang.G);
+  B.Transition[{StateID{1}, id("word")}] = StateID{3};
+  B.Recoveries.push_back({StateID{1}, {RecoveryStrategy::Braces, id("word")}});
+  TestLang.Table = std::move(B).build();
+
+  auto *LBrace = (tok::l_brace, 0);
+  auto *Question1 = (tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, {Tokens, Arena, GSStack}, TestLang,
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(3), parsedSymbolID(id("word")),
+  parents({OpenedBraces}), start(1u;
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, {NoRBrace, Arena, GSStack}, TestLang,
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //0--1({)--1({)--1({)
+  //  After 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-07-05 Thread Sam McCall via Phabricator via cfe-commits
sammccall marked 5 inline comments as done.
sammccall added a comment.

Thanks in particular for flagging the issue with duplicate forest nodes, you've 
found a hole in the model.
That said, I've left a big FIXME and I think we should patch it later.




Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h:148
+RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
+SymbolID Result;   // The symbol that is produced.
+  };

hokein wrote:
> nit: mention the `Result` must be a nonterminal.
Hmm, I don't think it has to be?

(To deduce a brace recovery rule we require it, but that doesn't mean it needs 
to be true in general)



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:178
+const ForestNode  =
+Params.Forest.createOpaque(Option->Symbol, Option->Position);
+const GSS::Node *NewHead = Params.GSStack.addNode(

hokein wrote:
> should we worry about the case where we create a duplicated forest node? e.g. 
> we have two best options and both recover to the same nonterminal.
Oh no, you're right... we also have all the options for the GSS node being 
either the same/different in that case.

And it gets worse, what if we have one recovery -> X, and another recovery -> 
Y, and a rule `X := Y`. The following `glrReduce` will produce a duplicate `X`, 
instead of an `AmbiguousX{ OpaqueX, SequenceX{ OpaqueY } }`.

As much as I hate this, I think we should slap a FIXME on it and move on. I 
don't think multiple tied recoveries is common, and the solution here is just 
as likely to break the tie as it is to fix this with fancy algorithms.
However I don't think we have enough data to make a decision on exactly what to 
do yet.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:44
+
+void glrRecover(llvm::ArrayRef OldHeads,
+unsigned , const TokenStream ,

hokein wrote:
> sammccall wrote:
> > hokein wrote:
> > > The `GLR.cpp` file is growing significantly, I think the recovery part is 
> > > large enough to be lived in a separate file `GLRRecovery.cpp`, but the 
> > > declaration can still be in the `GLR.h`.
> > This is interesting, recover/shift/reduce/parse are (vertically) 
> > self-contained enough that it didn't seem like a big problem yet...
> > 
> > If the concern is file length, maybe we'd thather start with `reduce`; if 
> > it's relatedness, `GSS`?
> > 
> > My line count is:
> >  - recover: 156
> >  - shift: 47
> >  - reduce: 319
> >  - parse: 87
> >  - GSS: 66
> Yeah, indeed these pieces can be split, my main concern is not file length -- 
> I would prefer keeping shift/reduce/parse into a single file, as they form a 
> complete GLR algorithm (splitting them would make it harder to read and 
> follow).
> 
> Recovery seems like a different thing, I would image this part of code will 
> grow more in the future
> - the GLR algorithm has the core recovery mechanism framework with some 
> fallback recovery strategies (eof, brackets)
> - we have different recovery strategies implemented (some of them might be 
> cxx-specific, probably be part of pseudoCXX library);
> 
> 
It's hard to tell yet, but I guess I don't really see why it's going to be 
easier to reason about shift/recover in isolation vs shift/reduce. Let's see 
how this goes past the initial patch (in particular once we move stuff into 
extensions)



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:130
+
+  assert(NewHeads.empty()); // We may repeatedly populate and clear it.
+  llvm::Optional RecoveryRange;

hokein wrote:
> sammccall wrote:
> > hokein wrote:
> > > nit: might be clearer to move the assertion to the beginning of the 
> > > function.
> > The purpose of the assertion is to make it obvious that NewHeads.clear() 
> > only discards items added during the loop below.
> > An assertion at the top of the function would be less useful for this, both 
> > because it's not on the screen when reading NewHeads.clear() and because 
> > it's much less locally obvious that nothing has been pushed onto NewHeads 
> > at some prior point in the function.
> Yeah, but I'd treat it as a contract of the API (the `NewHeads` argument 
> passed to `glrRecover` must be empty).
> 
> btw, the empty assertion is missing in the latest version.
Yes, I think this is obsolete - the latest version of glrRecover no longer 
requires NewHeads to be empty at all, as it doesn't use it as scratch space.



Comment at: clang-tools-extra/pseudo/unittests/GLRTest.cpp:558
 
+TEST_F(GLRTest, RecoveryEndToEnd) {
+  // Simple example of brace-based recovery showing:

hokein wrote:
> nit: not sure the intention having the `RecoveryEndToEnd` separated from the 
> above recover-related tests, why not grouping them together?
this tests glrParse, and it's grouped with the other glrParse tests consistent 
with this file (e.g. GLRReduceOrder is with 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-07-05 Thread Haojian Wu via Phabricator via cfe-commits
hokein accepted this revision.
hokein added a comment.
This revision is now accepted and ready to land.

Looks great, let's ship it! feel free to land it in any form you think it is 
suitable.




Comment at: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h:150
+// OldHeads is the parse state at TokenIndex.
+// This function consumes consumes zero or more tokens by advancing TokenIndex,
+// and places any recovery states created in NewHeads.

nit: remove the duplicated `consumes`.



Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h:148
+RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
+SymbolID Result;   // The symbol that is produced.
+  };

nit: mention the `Result` must be a nonterminal.



Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h:131
+  // Expected to be called by LR parsers.
+  llvm::ArrayRef getActions(StateID State, SymbolID Terminal) const;
   // Returns the state after we reduce a nonterminal.

nit: unrelated method?



Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h:248
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector RecoveryOffset;
+  std::vector Recoveries;

sammccall wrote:
> hokein wrote:
> > I see the motivation of the `OffsetTable` structure now, this would come as 
> > a follow-up to simplify the `ReduceOffset` and `RecoveryOffset`, right?
> Yes. Though I'm on the fence about whether it's worth it with one case (it's 
> a bit awkward to generalize the building IIRC).
A motivating bit is that it is tricky to implement a correct `get` method (we 
both made the same out-of-bound issue). I'm fine with the current form, we can 
revisit it afterwards.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:178
+const ForestNode  =
+Params.Forest.createOpaque(Option->Symbol, Option->Position);
+const GSS::Node *NewHead = Params.GSStack.addNode(

should we worry about the case where we create a duplicated forest node? e.g. 
we have two best options and both recover to the same nonterminal.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:44
+
+void glrRecover(llvm::ArrayRef OldHeads,
+unsigned , const TokenStream ,

sammccall wrote:
> hokein wrote:
> > The `GLR.cpp` file is growing significantly, I think the recovery part is 
> > large enough to be lived in a separate file `GLRRecovery.cpp`, but the 
> > declaration can still be in the `GLR.h`.
> This is interesting, recover/shift/reduce/parse are (vertically) 
> self-contained enough that it didn't seem like a big problem yet...
> 
> If the concern is file length, maybe we'd thather start with `reduce`; if 
> it's relatedness, `GSS`?
> 
> My line count is:
>  - recover: 156
>  - shift: 47
>  - reduce: 319
>  - parse: 87
>  - GSS: 66
Yeah, indeed these pieces can be split, my main concern is not file length -- I 
would prefer keeping shift/reduce/parse into a single file, as they form a 
complete GLR algorithm (splitting them would make it harder to read and follow).

Recovery seems like a different thing, I would image this part of code will 
grow more in the future
- the GLR algorithm has the core recovery mechanism framework with some 
fallback recovery strategies (eof, brackets)
- we have different recovery strategies implemented (some of them might be 
cxx-specific, probably be part of pseudoCXX library);





Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:68
+// one arbitrarily.
+std::vector Path;
+  };

sammccall wrote:
> hokein wrote:
> > nit: maybe name it `Parses` or `PartialParses`. Path make me think this is 
> > a patch of GSS nodes.
> Renamed to DiscardedParse, does that work for you?
It is better than the original name. The `DiscardedParse` is a bit weird when 
we start to put it under the opaque node, in that sense, they are not 
discarded, IMO



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:83
+  //expr := type ( . expr )   - (up) we should recover this outer expr
+  //expr := . type ( expr )   - (up+left) we should not recover this type!
+  //

sammccall wrote:
> hokein wrote:
> > I think you're right -- I thought the first GSS node with a recovery state 
> > we encounter during the Walkup  state is the node we want. 
> > 
> > This example is a little confusing (it still matches our previous mental 
> > model), I didn't get it until we discussed offline. I think the following 
> > example is clearer 
> > 
> > ```
> > parsing the text `if (true) else ?`
> > 
> > IfStmt := if (stmt) else . stmt - which we're currently parsing
> > IfStmt := if (.stmt) else stmt  - (left) the first recovery GSS node, 
> > should not recover from this
> > IfStmt := . if (stmt) else stmt 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-29 Thread Sam McCall via Phabricator via cfe-commits
sammccall added inline comments.



Comment at: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h:150
+// OldHeads is the parse state at TokenIndex.
+// This function consumes consumes zero or more tokens (advancing TokenIndex),
+// and places any recovery states created in NewHeads.

hokein wrote:
> If I read it correctly, consuming zero token is consider failure of the 
> function, right?
Consuming zero tokens + producing heads is success, consuming zero tokens + not 
producing heads is failure.
Tweaked this comment a bit.

(Also fixed a bit of logic that ignored recovery that didn't advance 
TokenIndex!)



Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h:134
+  uint8_t RecoveryIndex = -1;
+  RecoveryStrategy Recovery = RecoveryStrategy::None;
+

hokein wrote:
> So each rule will have at most 1 recovery-strategy (it is fine for the 
> initial version), but I think in the future we want more (probably we need to 
> change the Sequence to an array of `{SymbolID, RevoeryStrategy}`).
> 
> ```
> selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) 
> statement ELSE statement
> ```
> We might want different recoveries in `( . init-statement_opt condition )` 
> `(init-statement_opt condition) . statement`, `ELSE . statement`.
> 
> 
> 
Added a comment to call out this limitation.
There are several ways to deal with this, e.g. we could split up the grammar 
into multiple rules, each with one recovery. (But the approach you describe 
makes sense too)



Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h:248
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector RecoveryOffset;
+  std::vector Recoveries;

hokein wrote:
> I see the motivation of the `OffsetTable` structure now, this would come as a 
> follow-up to simplify the `ReduceOffset` and `RecoveryOffset`, right?
Yes. Though I'm on the fence about whether it's worth it with one case (it's a 
bit awkward to generalize the building IIRC).



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:44
+
+void glrRecover(llvm::ArrayRef OldHeads,
+unsigned , const TokenStream ,

hokein wrote:
> The `GLR.cpp` file is growing significantly, I think the recovery part is 
> large enough to be lived in a separate file `GLRRecovery.cpp`, but the 
> declaration can still be in the `GLR.h`.
This is interesting, recover/shift/reduce/parse are (vertically) self-contained 
enough that it didn't seem like a big problem yet...

If the concern is file length, maybe we'd thather start with `reduce`; if it's 
relatedness, `GSS`?

My line count is:
 - recover: 156
 - shift: 47
 - reduce: 319
 - parse: 87
 - GSS: 66



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:68
+// one arbitrarily.
+std::vector Path;
+  };

hokein wrote:
> nit: maybe name it `Parses` or `PartialParses`. Path make me think this is a 
> patch of GSS nodes.
Renamed to DiscardedParse, does that work for you?



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:83
+  //expr := type ( . expr )   - (up) we should recover this outer expr
+  //expr := . type ( expr )   - (up+left) we should not recover this type!
+  //

hokein wrote:
> I think you're right -- I thought the first GSS node with a recovery state we 
> encounter during the Walkup  state is the node we want. 
> 
> This example is a little confusing (it still matches our previous mental 
> model), I didn't get it until we discussed offline. I think the following 
> example is clearer 
> 
> ```
> parsing the text `if (true) else ?`
> 
> IfStmt := if (stmt) else . stmt - which we're currently parsing
> IfStmt := if (.stmt) else stmt  - (left) the first recovery GSS node, should 
> not recover from this
> IfStmt := . if (stmt) else stmt - (up), we should recover from this
> ```
> 
> I also think it is worth to add it into the test.
Fair enough, a couple of problems with that example though:
 - dropping the "then" clause from the grammar of an if statement is confusing 
(but adding it back in makes the example complicated)
 - using a non-kernel item for the last fails to show the "up" edge clearly 
(but there's not enough context to show a kernel item)

Came up with another one that hopefully works.

I didn't manage to write a reasonable test showing it - it basically can't 
happen with bracket recovery. In order to demonstrate it we need `{` in the 
"wrong" recovery construct to be paired with a `}` after the cursor... which 
makes it look *very* much like a correct recovery.

Added a FIXME to add a testcase once we can define new recovery strategies 
instead.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:88
+  // For now, we have to take this into account when defining recovery rules.
+  // FIXME: find a more satisfying way 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-29 Thread Sam McCall via Phabricator via cfe-commits
sammccall updated this revision to Diff 441154.
sammccall added a comment.

address comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -32,11 +33,13 @@
 using Action = LRTable::Action;
 using testing::AllOf;
 using testing::ElementsAre;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -238,9 +241,9 @@
   /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
   /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-  GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-  /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+  /*State=*/3, /*ForestNode=*/ClassNameNode,
+  /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
   GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
   /*Parents=*/{GSSNode2});
@@ -363,6 +366,127 @@
   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = (tok::l_brace, 0);
+  auto *Question1 = (tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+AllOf(state(3), parsedSymbolID(id("word")),
+  parents({OpenedBraces}), start(1u;
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+ {G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions LOptions;
+  // Innermost brace is unmatched, 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-29 Thread Sam McCall via Phabricator via cfe-commits
sammccall updated this revision to Diff 441125.
sammccall marked 14 inline comments as done.
sammccall added a comment.

address comments


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -32,11 +33,13 @@
 using Action = LRTable::Action;
 using testing::AllOf;
 using testing::ElementsAre;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -238,9 +241,9 @@
   /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
   /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-  GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-  /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+  /*State=*/3, /*ForestNode=*/ClassNameNode,
+  /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
   GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
   /*Parents=*/{GSSNode2});
@@ -363,6 +366,124 @@
   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = (tok::l_brace, 0);
+  auto *Question1 = (tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+AllOf(state(3), parsedSymbolID(id("word")),
+  parents({OpenedBraces}), start(1u;
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+ {G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 2u) << "should not advance on failure";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-29 Thread Haojian Wu via Phabricator via cfe-commits
hokein added a comment.

the patch looks a good start to me, some initial comments (mostly around the 
recovery part).




Comment at: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h:150
+// OldHeads is the parse state at TokenIndex.
+// This function consumes consumes zero or more tokens (advancing TokenIndex),
+// and places any recovery states created in NewHeads.

If I read it correctly, consuming zero token is consider failure of the 
function, right?



Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h:134
+  uint8_t RecoveryIndex = -1;
+  RecoveryStrategy Recovery = RecoveryStrategy::None;
+

So each rule will have at most 1 recovery-strategy (it is fine for the initial 
version), but I think in the future we want more (probably we need to change 
the Sequence to an array of `{SymbolID, RevoeryStrategy}`).

```
selection-statement := IF CONSTEXPR_opt ( init-statement_opt condition ) 
statement ELSE statement
```
We might want different recoveries in `( . init-statement_opt condition )` 
`(init-statement_opt condition) . statement`, `ELSE . statement`.






Comment at: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h:248
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector RecoveryOffset;
+  std::vector Recoveries;

I see the motivation of the `OffsetTable` structure now, this would come as a 
follow-up to simplify the `ReduceOffset` and `RecoveryOffset`, right?



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:29
+
+llvm::Optional
+findRecoveryEndpoint(RecoveryStrategy Strategy,

nit: unsigned => `Token::Index`



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:44
+
+void glrRecover(llvm::ArrayRef OldHeads,
+unsigned , const TokenStream ,

The `GLR.cpp` file is growing significantly, I think the recovery part is large 
enough to be lived in a separate file `GLRRecovery.cpp`, but the declaration 
can still be in the `GLR.h`.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:68
+// one arbitrarily.
+std::vector Path;
+  };

nit: maybe name it `Parses` or `PartialParses`. Path make me think this is a 
patch of GSS nodes.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:83
+  //expr := type ( . expr )   - (up) we should recover this outer expr
+  //expr := . type ( expr )   - (up+left) we should not recover this type!
+  //

I think you're right -- I thought the first GSS node with a recovery state we 
encounter during the Walkup  state is the node we want. 

This example is a little confusing (it still matches our previous mental 
model), I didn't get it until we discussed offline. I think the following 
example is clearer 

```
parsing the text `if (true) else ?`

IfStmt := if (stmt) else . stmt - which we're currently parsing
IfStmt := if (.stmt) else stmt  - (left) the first recovery GSS node, should 
not recover from this
IfStmt := . if (stmt) else stmt - (up), we should recover from this
```

I also think it is worth to add it into the test.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:88
+  // For now, we have to take this into account when defining recovery rules.
+  // FIXME: find a more satisfying way to avoid such false recovery.
+  std::vector Path;

I can't think of a better solution other than a search-based one (would like to 
think more about it).

Currently, we find all recovery options by traversing the whole GSS, and do a 
post-filter (based on the Start, and End). I might do both in the DFS (which 
will save some cost of traversing), the DFS will give us the best recovery 
options, and then we build the GSS node, and forest node.  But up to you.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:111
+  };
+  for (auto *N : llvm::reverse(OldHeads))
+DFS(N, TokenIndex, DFS);

any particular reason why we iterate the OldHeads in a reversed order?



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:130
+
+  assert(NewHeads.empty()); // We may repeatedly populate and clear it.
+  llvm::Optional RecoveryRange;

nit: might be clearer to move the assertion to the beginning of the function.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:174
+<< " strategies\n");
+++TokenIndex;
+  }

Advancing the TokenIndex here seems a little surprising and doesn't match what 
the comment says (` On failure, NewHeads is empty and TokenIndex is 
unchanged.`). I think this should be done in caller side.



Comment at: clang-tools-extra/pseudo/lib/GLR.cpp:64
+// These represent the partial parse that is being discarded.
+// They should become the children of the opaque recovery 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-28 Thread Sam McCall via Phabricator via cfe-commits
sammccall reopened this revision.
sammccall added a comment.

Sorry, I committed this by mistake when working on another change.
Reverted and this is ready for review.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-28 Thread Sam McCall via Phabricator via cfe-commits
This revision was not accepted when it landed; it landed in state "Needs 
Review".
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGa0f4c10ae227: [pseudo] Add error-recovery framework  
brace-based recovery (authored by sammccall).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -32,11 +33,13 @@
 using Action = LRTable::Action;
 using testing::AllOf;
 using testing::ElementsAre;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -238,9 +241,9 @@
   /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
   /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-  GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-  /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+  /*State=*/3, /*ForestNode=*/ClassNameNode,
+  /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
   GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
   /*Parents=*/{GSSNode2});
@@ -363,6 +366,124 @@
   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = (tok::l_brace, 0);
+  auto *Question1 = (tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+AllOf(state(3), parsedSymbolID(id("word")),
+  parents({OpenedBraces}), start(1u;
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+ {G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-28 Thread Sam McCall via Phabricator via cfe-commits
sammccall updated this revision to Diff 440646.
sammccall added a comment.

rebase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -32,11 +33,13 @@
 using Action = LRTable::Action;
 using testing::AllOf;
 using testing::ElementsAre;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -238,9 +241,9 @@
   /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
   /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-  GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-  /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+  /*State=*/3, /*ForestNode=*/ClassNameNode,
+  /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
   GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
   /*Parents=*/{GSSNode2});
@@ -363,6 +366,124 @@
   EXPECT_THAT(Heads, ElementsAre(GSSNode1));
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G, {{/*State=*/1, id("word"), Action::goTo(3)}}, /*Reduce=*/{},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = (tok::l_brace, 0);
+  auto *Question1 = (tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {G, Table, Arena, GSStack},
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+AllOf(state(3), parsedSymbolID(id("word")),
+  parents({OpenedBraces}), start(1u;
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+ {G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G, {{/*State=*/1, id("body"), Action::goTo(2)}}, /*Reduce=*/{},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions LOptions;
+  // Innermost brace is unmatched, to test 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-27 Thread Sam McCall via Phabricator via cfe-commits
sammccall updated this revision to Diff 440393.
sammccall edited the summary of this revision.
sammccall added a comment.

update testcase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -31,11 +32,13 @@
 
 using Action = LRTable::Action;
 using testing::AllOf;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -233,9 +236,9 @@
   /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
   /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-  GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-  /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+  /*State=*/3, /*ForestNode=*/ClassNameNode,
+  /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
   GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
   /*Parents=*/{GSSNode2});
@@ -325,6 +328,122 @@
 "[  1, end)   └─* := tok[1]\n");
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G->table(), {{/*State=*/1, id("word"), Action::goTo(3)}},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = (tok::l_brace, 0);
+  auto *Question1 = (tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {*G, Table, Arena, GSStack},
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+AllOf(state(3), parsedSymbolID(id("word")),
+  parents({OpenedBraces}), start(1u;
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+ {*G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G->table(), {{/*State=*/1, id("body"), Action::goTo(2)}},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions LOptions;
+  // Innermost brace 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-27 Thread Sam McCall via Phabricator via cfe-commits
sammccall updated this revision to Diff 440383.
sammccall added a comment.

revert format changes


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -31,11 +32,13 @@
 
 using Action = LRTable::Action;
 using testing::AllOf;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -233,9 +236,9 @@
   /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
   /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-  GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-  /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+  /*State=*/3, /*ForestNode=*/ClassNameNode,
+  /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
   GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
   /*Parents=*/{GSSNode2});
@@ -325,6 +328,122 @@
 "[  1, end)   └─* := tok[1]\n");
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G->table(), {{/*State=*/1, id("word"), Action::goTo(3)}},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = (tok::l_brace, 0);
+  auto *Question1 = (tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {*G, Table, Arena, GSStack},
+ NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+AllOf(state(3), parsedSymbolID(id("word")),
+  parents({OpenedBraces}), start(1u;
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+ {*G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+  G->table(), {{/*State=*/1, id("body"), Action::goTo(2)}},
+  /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions LOptions;
+  // Innermost brace is unmatched, to test fallback to next 

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

2022-06-27 Thread Sam McCall via Phabricator via cfe-commits
sammccall created this revision.
Herald added a subscriber: mgrang.
Herald added a project: All.
sammccall updated this revision to Diff 440376.
sammccall retitled this revision from "xxx recover" to "[pseudo] Add 
error-recovery framework & brace-based recovery".
sammccall edited the summary of this revision.
sammccall added a comment.
sammccall updated this revision to Diff 440379.
sammccall published this revision for review.
sammccall added a reviewer: hokein.
Herald added subscribers: cfe-commits, alextsao1999.
Herald added a project: clang-tools-extra.

add tests, clean up


sammccall added a comment.

reduce after final recovery
comments


The idea is:

- a parse failure is detected when all heads die when trying to shift the next 
token
- we can recover by choosing a nonterminal we're partway through parsing, and 
determining where it ends through nonlocal means (e.g. matching brackets)
- we can find candidates by walking up the stack from the (ex-)heads
- the token range is defined using heuristics attached to grammar rules
- the unparsed region is represented in the forest by an Opaque node

This patch has the core GLR functionality.
It does not allow recovery heuristics to be attached as extensions to
the grammar, but rather infers a brace-based heuristic.

Expected followups:

- make recovery heuristics grammar extensions (depends on D127448 
)
- add recover to our grammar for bracketed constructs and sequence nodes
- change the structure of our augmented `_ := start` rules to eliminate some 
special-cases in glrParse.
- (if I can work out how): avoid some spurious recovery cases described in 
comments


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===--===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -31,11 +32,13 @@
 
 using Action = LRTable::Action;
 using testing::AllOf;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher
 parents(llvm::ArrayRef Parents) {
@@ -101,8 +104,8 @@
   //   0---1---4
   //   └---2---┘
   //   └---3---5
-  auto *GSSNode0 =
-  GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
+  auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+   /*Parents=*/{});
   auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
/*Parents=*/{GSSNode0});
   auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
@@ -151,8 +154,8 @@
Action::reduce(ruleFor("enum-name"))},
   });
 
-  const auto *GSSNode0 =
-  GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
+  const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+ /*Parents=*/{});
   const auto *GSSNode1 =
   GSStack.addNode(1, (tok::identifier, 0), {GSSNode0});
 
@@ -180,8 +183,8 @@
   auto *ClassNameNode = (id("class-name"), /*TokenIndex=*/0);
   auto *EnumNameNode = (id("enum-name"), /*TokenIndex=*/0);
 
-  const auto *GSSNode2 =
-  GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
+  const auto *GSSNode2 = GSStack.addNode(
+  /*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
   const auto *GSSNode3 =
   GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
   const auto *GSSNode4 = GSStack.addNode(
@@ -227,15 +230,17 @@
   auto *ClassNameNode = (id("class-name"), /*TokenIndex=*/1);
   auto *EnumNameNode =