sammccall created this revision.
Herald added a subscriber: mgrang.
Herald added a project: All.
sammccall updated this revision to Diff 440376.
sammccall retitled this revision from "xxx recover" to "[pseudo] Add 
error-recovery framework & brace-based recovery".
sammccall edited the summary of this revision.
sammccall added a comment.
sammccall updated this revision to Diff 440379.
sammccall published this revision for review.
sammccall added a reviewer: hokein.
Herald added subscribers: cfe-commits, alextsao1999.
Herald added a project: clang-tools-extra.

add tests, clean up


sammccall added a comment.

reduce after final recovery
comments


The idea is:

- a parse failure is detected when all heads die while trying to shift the next 
token
- we can recover by choosing a nonterminal we're partway through parsing, and 
determining where it ends through nonlocal means (e.g. matching brackets)
- we can find candidates by walking up the stack from the (ex-)heads
- the token range is defined using heuristics attached to grammar rules
- the unparsed region is represented in the forest by an Opaque node

This patch has the core GLR functionality.
It does not allow recovery heuristics to be attached as extensions to
the grammar, but rather infers a brace-based heuristic.

Expected followups:

- make recovery heuristics grammar extensions (depends on D127448 
<https://reviews.llvm.org/D127448>)
- add recovery to our grammar for bracketed constructs and sequence nodes
- change the structure of our augmented `_ := start` rules to eliminate some 
special-cases in glrParse.
- (if I can work out how): avoid some spurious recovery cases described in 
comments


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -31,11 +32,13 @@
 
 using Action = LRTable::Action;
 using testing::AllOf;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher<const GSS::Node *>
 parents(llvm::ArrayRef<const GSS::Node *> Parents) {
@@ -101,8 +104,8 @@
   //   0---1---4
   //   └---2---┘
   //   └---3---5
-  auto *GSSNode0 =
-      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
+  auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+                                   /*Parents=*/{});
   auto *GSSNode1 = GSStack.addNode(/*State=*/1, /*ForestNode=*/nullptr,
                                    /*Parents=*/{GSSNode0});
   auto *GSSNode2 = GSStack.addNode(/*State=*/2, /*ForestNode=*/nullptr,
@@ -151,8 +154,8 @@
                        Action::reduce(ruleFor("enum-name"))},
                   });
 
-  const auto *GSSNode0 =
-      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
+  const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+                                         /*Parents=*/{});
   const auto *GSSNode1 =
       GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
 
@@ -180,8 +183,8 @@
   auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/0);
   auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
 
-  const auto *GSSNode2 =
-      GSStack.addNode(/*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
+  const auto *GSSNode2 = GSStack.addNode(
+      /*State=*/2, /*ForestNode=*/ClassNameNode, /*Parents=*/{});
   const auto *GSSNode3 =
       GSStack.addNode(/*State=*/3, /*ForestNode=*/EnumNameNode, /*Parents=*/{});
   const auto *GSSNode4 = GSStack.addNode(
@@ -227,15 +230,17 @@
   auto *ClassNameNode = &Arena.createOpaque(id("class-name"), /*TokenIndex=*/1);
   auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/1);
 
-  const auto *GSSNode0 =
-      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
+  const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+                                         /*Parents=*/{});
   const auto *GSSNode1 = GSStack.addNode(
-      /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
+      /*State=*/1, /*ForestNode=*/CVQualifierNode,
+      /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
-      /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-      GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-                      /*Parents=*/{GSSNode1});
+      /*State=*/2, /*ForestNode=*/CVQualifierNode,
+      /*Parents=*/{GSSNode0});
+  const auto *GSSNode3 = GSStack.addNode(
+      /*State=*/3, /*ForestNode=*/ClassNameNode,
+      /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
       GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                       /*Parents=*/{GSSNode2});
@@ -283,20 +288,20 @@
   auto *EnumNameNode = &Arena.createOpaque(id("enum-name"), /*TokenIndex=*/0);
   auto *StartTerminal = &Arena.createTerminal(tok::star, /*TokenIndex=*/1);
 
-  const auto *GSSNode0 =
-      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
-  const auto *GSSNode1 =
-      GSStack.addNode(/*State=*/1, /*ForestNode=*/ClassNameNode,
-                      /*Parents=*/{GSSNode0});
+  const auto *GSSNode0 = GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr,
+                                         /*Parents=*/{});
+  const auto *GSSNode1 = GSStack.addNode(
+      /*State=*/1, /*ForestNode=*/ClassNameNode,
+      /*Parents=*/{GSSNode0});
   const auto *GSSNode2 =
       GSStack.addNode(/*State=*/2, /*ForestNode=*/EnumNameNode,
                       /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-      GSStack.addNode(/*State=*/3, /*ForestNode=*/StartTerminal,
-                      /*Parents=*/{GSSNode1});
-  const auto *GSSNode4 =
-      GSStack.addNode(/*State=*/4, /*ForestNode=*/StartTerminal,
-                      /*Parents=*/{GSSNode2});
+  const auto *GSSNode3 = GSStack.addNode(
+      /*State=*/3, /*ForestNode=*/StartTerminal,
+      /*Parents=*/{GSSNode1});
+  const auto *GSSNode4 = GSStack.addNode(
+      /*State=*/4, /*ForestNode=*/StartTerminal,
+      /*Parents=*/{GSSNode2});
 
   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
   LRTable Table = LRTable::buildForTests(
@@ -325,6 +330,122 @@
             "[  1, end)   └─* := tok[1]\n");
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //    0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //    0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G->table(), {{/*State=*/1, id("word"), Action::goTo(3)}},
+      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Question1 = &Arena.createTerminal(tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {*G, Table, Arena, GSStack},
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+                            AllOf(state(3), parsedSymbolID(id("word")),
+                                  parents({OpenedBraces}), start(1u))));
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+             {*G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //    0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //    0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G->table(), {{/*State=*/1, id("body"), Action::goTo(2)}},
+      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions LOptions;
+  // Innermost brace is unmatched, to test fallback to next brace.
+  TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions);
+  Tokens.tokens()[0].Pair = 5;
+  Tokens.tokens()[1].Pair = 4;
+  Tokens.tokens()[4].Pair = 1;
+  Tokens.tokens()[5].Pair = 0;
+
+  auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
+  auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *In1 = GSStack.addNode(1, Brace1, {Root});
+  const auto *In2 = GSStack.addNode(1, Brace2, {In1});
+  const auto *In3 = GSStack.addNode(1, Brace3, {In2});
+
+  unsigned TokenIndex = 3;
+  std::vector<const GSS::Node *> NewHeads;
+  glrRecover({In3}, TokenIndex, Tokens, {*G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 5u);
+  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
+                                          parents({In2}), start(2u))));
+}
+
+TEST_F(GLRTest, RecoverAlternatives) {
+  // Recovery inside braces with multiple equally good options
+  //  Before:
+  //    0--1({)
+  //  After recovering either `word` or `number` inside the braces:
+  //    0--1({)--2(word)   // 2 is goto(1, word)
+  //          └--3(number) // 3 is goto(1, number)
+  buildGrammar({"number", "word"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G->table(),
+      {
+          {/*State=*/1, id("number"), Action::goTo(2)},
+          {/*State=*/1, id("word"), Action::goTo(3)},
+      },
+      /*Recovery=*/{
+          {/*State=*/1, RecoveryStrategy::Braces, id("number")},
+          {/*State=*/1, RecoveryStrategy::Braces, id("word")},
+      });
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+  unsigned TokenIndex = 1;
+
+  glrRecover({OpenedBraces}, TokenIndex, Tokens, {*G, Table, Arena, GSStack},
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 2u);
+  EXPECT_THAT(NewHeads,
+              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
+                                         parents({OpenedBraces}), start(1u)),
+                                   AllOf(state(3), parsedSymbolID(id("word")),
+                                         parents({OpenedBraces}), start(1u))));
+}
+
 TEST_F(GLRTest, PerfectForestNodeSharing) {
   // Run the GLR on a simple grammar and test that we build exactly one forest
   // node per (SymbolID, token range).
@@ -395,6 +516,40 @@
             "[  0, end)     └─IDENTIFIER := tok[0]\n");
 }
 
+TEST_F(GLRTest, RecoveryEndToEnd) {
+  // Simple example of brace-based recovery showing:
+  //  - recovered region includes tokens both ahead of and behind the cursor
+  //  - multiple possible recovery rules
+  //  - recovery from outer scopes is rejected
+  build(R"bnf(
+    _ := block
+
+    block := { block }
+    block := { numbers }
+    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
+  )bnf");
+  auto LRTable = LRTable::buildSLR(*G);
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
+  pairBrackets(Tokens);
+
+  const ForestNode &Parsed =
+      glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("block"));
+  EXPECT_EQ(Parsed.dumpRecursive(*G),
+            "[  0, end) block := { block [recover=1] }\n"
+            "[  0,   1) ├─{ := tok[0]\n"
+            "[  1,   5) ├─block := <ambiguous>\n"
+            "[  1,   5) │ ├─block := { block [recover=1] }\n"
+            "[  1,   2) │ │ ├─{ := tok[1]\n"
+            "[  2,   4) │ │ ├─block := <opaque>\n"
+            "[  4,   5) │ │ └─} := tok[4]\n"
+            "[  1,   5) │ └─block := { numbers [recover=1] }\n"
+            "[  1,   2) │   ├─{ := tok[1]\n"
+            "[  2,   4) │   ├─numbers := <opaque>\n"
+            "[  4,   5) │   └─} := tok[4]\n"
+            "[  5, end) └─} := tok[5]\n");
+}
+
 TEST_F(GLRTest, NoExplicitAccept) {
   build(R"bnf(
     _ := test
Index: clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
@@ -0,0 +1,13 @@
+// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
+auto x = { complete garbage };
+// CHECK:      translation-unit~simple-declaration
+// CHECK-NEXT: ├─decl-specifier-seq~AUTO := tok[0]
+// CHECK-NEXT: ├─init-declarator-list~init-declarator
+// CHECK-NEXT: │ ├─declarator~IDENTIFIER := tok[1]
+// CHECK-NEXT: │ └─initializer~brace-or-equal-initializer
+// CHECK-NEXT: │   ├─= := tok[2]
+// CHECK-NEXT: │   └─initializer-clause~braced-init-list
+// CHECK-NEXT: │     ├─{ := tok[3]
+// CHECK-NEXT: │     ├─initializer-list := <opaque>
+// CHECK-NEXT: │     └─} := tok[6]
+// CHECK-NEXT: └─; := tok[7]
Index: clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
===================================================================
--- clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
+++ clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
@@ -2,7 +2,7 @@
 class Foo {
 public:
 };
-// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification }
+// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] }
 // CHECK-NEXT: ├─class-head := class-key class-head-name
 // CHECK-NEXT: │ ├─class-key~CLASS := tok[0]
 // CHECK-NEXT: │ └─class-head-name~IDENTIFIER := tok[1]
Index: clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
@@ -10,6 +10,7 @@
 #include "clang-pseudo/grammar/LRGraph.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang/Basic/TokenKinds.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstdint>
 
 namespace llvm {
@@ -40,8 +41,9 @@
 
 class LRTable::Builder {
 public:
-  Builder(llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates)
-      : StartStates(StartStates) {}
+  Builder(llvm::ArrayRef<LRGraph::Recovery> Recoveries,
+          llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates)
+      : Recoveries(Recoveries), StartStates(StartStates) {}
 
   bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
   LRTable build(const GrammarTable &GT, unsigned NumStates) && {
@@ -86,16 +88,42 @@
         ++SortedIndex;
     }
     Table.StartStates = std::move(StartStates);
+
+    // Error recovery entries: sort (no dups already), and build offset lookup.
+    llvm::sort(Recoveries,
+               [&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) {
+                 return std::tie(L.Src, L.Result, L.Strategy) <
+                        std::tie(R.Src, R.Result, R.Strategy);
+               });
+    Table.Recoveries.reserve(Recoveries.size());
+    for (const auto &R : Recoveries)
+      Table.Recoveries.push_back({R.Strategy, R.Result});
+    Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
+    SortedIndex = 0;
+    for (StateID State = 0; State < NumStates; ++State) {
+      Table.RecoveryOffset[State] = SortedIndex;
+      while (SortedIndex < Recoveries.size() &&
+             Recoveries[SortedIndex].Src == State)
+        SortedIndex++;
+    }
+    Table.RecoveryOffset[NumStates] = SortedIndex;
+    assert(SortedIndex == Recoveries.size());
+
     return Table;
   }
 
 private:
   llvm::DenseSet<Entry> Entries;
+  std::vector<LRGraph::Recovery> Recoveries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
 LRTable LRTable::buildForTests(const GrammarTable &GT,
-                               llvm::ArrayRef<Entry> Entries) {
+                               llvm::ArrayRef<Entry> Entries,
+                               llvm::ArrayRef<RecoveryEntry> Recoveries) {
+  std::vector<LRGraph::Recovery> GRecoveries;
+  for (const auto &R : Recoveries)
+    GRecoveries.push_back({R.State, R.Strategy, R.Result});
   StateID MaxState = 0;
   for (const auto &Entry : Entries) {
     MaxState = std::max(MaxState, Entry.State);
@@ -104,7 +132,7 @@
     if (Entry.Act.kind() == LRTable::Action::GoTo)
       MaxState = std::max(MaxState, Entry.Act.getGoToState());
   }
-  Builder Build({});
+  Builder Build(GRecoveries, {});
   for (const Entry &E : Entries)
     Build.insert(E);
   return std::move(Build).build(GT, /*NumStates=*/MaxState + 1);
@@ -112,7 +140,7 @@
 
 LRTable LRTable::buildSLR(const Grammar &G) {
   auto Graph = LRGraph::buildLR0(G);
-  Builder Build(Graph.startStates());
+  Builder Build(Graph.recoveries(), Graph.startStates());
   for (const auto &T : Graph.edges()) {
     Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
     Build.insert({T.Src, T.Label, Act});
Index: clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
@@ -120,6 +120,20 @@
   return Results;
 }
 
+std::vector<std::pair<RecoveryStrategy, SymbolID>>
+availableRecovery(const State &S, const Grammar &G) {
+  std::vector<std::pair<RecoveryStrategy, SymbolID>> Result;
+  for (const Item &I : S.Items) {
+    const auto &Rule = G.lookupRule(I.rule());
+    if (I.dot() != Rule.RecoveryIndex)
+      continue;
+    Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
+  }
+  llvm::sort(Result);
+  Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
+  return Result;
+}
+
 } // namespace
 
 std::string Item::dump(const Grammar &G) const {
@@ -130,9 +144,10 @@
       Results.push_back(G.symbolName(SID));
     return Results;
   };
-  return llvm::formatv("{0} := {1} • {2}", G.symbolName(Rule.Target),
+  return llvm::formatv("{0} := {1} • {2}{3}", G.symbolName(Rule.Target),
                        llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
-                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
+                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
+                       Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
       .str();
 }
 
@@ -181,6 +196,11 @@
       Edges.push_back({Src, Dst, Label});
     }
 
+    void insertRecovery(StateID Src, RecoveryStrategy Strategy,
+                        SymbolID Result) {
+      Recoveries.push_back({Src, Strategy, Result});
+    }
+
     // Returns a state with the given id.
     const State &find(StateID ID) const {
       assert(ID < States.size());
@@ -194,9 +214,10 @@
     LRGraph build() && {
       States.shrink_to_fit();
       Edges.shrink_to_fit();
+      Recoveries.shrink_to_fit();
       llvm::sort(StartStates);
       StartStates.shrink_to_fit();
-      return LRGraph(std::move(States), std::move(Edges),
+      return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
                      std::move(StartStates));
     }
 
@@ -205,6 +226,7 @@
     llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
     std::vector<State> States;
     std::vector<Edge> Edges;
+    std::vector<Recovery> Recoveries;
     const Grammar &G;
     std::vector<std::pair<SymbolID, StateID>> StartStates;
   } Builder(G);
@@ -225,15 +247,16 @@
   }
 
   while (!PendingStates.empty()) {
-    auto CurrentStateID = PendingStates.back();
+    auto StateID = PendingStates.back();
     PendingStates.pop_back();
-    for (auto Next :
-         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+    for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
       auto Insert = Builder.insert(Next.second);
       if (Insert.second) // new state, insert to the pending queue.
         PendingStates.push_back(Insert.first);
-      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+      Builder.insertEdge(StateID, Insert.first, Next.first);
     }
+    for (auto Recovery : availableRecovery(Builder.find(StateID), G))
+      Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
   }
   return std::move(Builder).build();
 }
Index: clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -106,6 +106,17 @@
 
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
+    // Wherever RHS contains { foo }, mark foo for brace-recovery.
+    // FIXME: this should be grammar annotations instead.
+    for (auto &Rule : T->Rules) {
+      for (unsigned I = 2; I < Rule.Size; ++I)
+        if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) &&
+            Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) &&
+            !isToken(Rule.Sequence[I - 1])) {
+          Rule.Recovery = RecoveryStrategy::Braces;
+          Rule.RecoveryIndex = I - 1;
+        }
+    }
     const auto &SymbolOrder = getTopologicalOrder(T.get());
     llvm::stable_sort(
         T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
Index: clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -59,8 +59,11 @@
   llvm::raw_string_ostream OS(Result);
   const Rule &R = T->Rules[RID];
   OS << symbolName(R.Target) << " :=";
-  for (SymbolID SID : R.seq())
-    OS << " " << symbolName(SID);
+  for (unsigned I = 0; I < R.Size; ++I) {
+    OS << " " << symbolName(R.Sequence[I]);
+    if (R.RecoveryIndex == I)
+      OS << " [recover=" << static_cast<unsigned>(R.Recovery) << "]";
+  }
   if (R.Guard)
     OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
   return Result;
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -24,6 +24,156 @@
 
 namespace clang {
 namespace pseudo {
+namespace {
+
+llvm::Optional<unsigned>
+findRecoveryEndpoint(RecoveryStrategy Strategy,
+                     const GSS::Node *RecoveryNode,
+                     const TokenStream &Tokens) {
+  assert(Strategy == RecoveryStrategy::Braces);
+  const ForestNode *LBrace = RecoveryNode->Payload;
+  assert(LBrace->kind() == ForestNode::Terminal &&
+         LBrace->symbol() == tokenSymbol(tok::l_brace));
+  if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair())
+    return Tokens.index(*RBrace);
+  return llvm::None;
+}
+
+} // namespace
+
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads) {
+  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
+  // Describes a possibility to recover by forcibly interpreting a range of
+  // tokens around the cursor as a nonterminal that we expected to see.
+  struct PlaceholderRecovery {
+    // The token prior to the nonterminal which is being recovered.
+    // This is the start of the region we're skipping, so higher Position is better.
+    Token::Index Position;
+    // The nonterminal which will be created in order to recover.
+    SymbolID Symbol;
+    // The heuristic used to choose the bounds of the nonterminal to recover.
+    RecoveryStrategy Strategy;
+
+    // The GSS head where we are expecting the recovered nonterminal.
+    const GSS::Node *RecoveryNode;
+    // Payload of nodes on the way back from the OldHead to the recovery node.
+    // These represent the partial parse that is being discarded.
+    // They should become the children of the opaque recovery node.
+    //
+    // There may be multiple paths leading to the same recovery node, we choose
+    // one arbitrarily.
+    std::vector<const ForestNode *> Path;
+  };
+  std::vector<PlaceholderRecovery> Options;
+
+  // Find recovery options by walking up the stack.
+  //
+  // This is similar to exception handling: we walk up the "frames" of nested
+  // rules being parsed until we find one that has a "handler" which allows us
+  // to determine the node bounds without parsing it.
+  //
+  // Unfortunately there's a significant difference: the stack contains both
+  // "upward" nodes (ancestor parses) and "leftward" ones.
+  //  e.g. when parsing `int(2 + ?)`, the stack contains:
+  //    expr := expr + . expr     - which we're currently parsing
+  //    expr := type ( . expr )   - (up) we should recover this outer expr
+  //    expr := . type ( expr )   - (up+left) we should not recover this type!
+  //
+  // It's not obvious how to avoid collecting the latter as a recovery option.
+  // I think the distinction is ill-defined after merging items into states.
+  // For now, we have to take this into account when defining recovery rules.
+  // FIXME: find a more satisfying way to avoid such false recovery.
+  std::vector<const ForestNode *> Path;
+  llvm::DenseSet<const GSS::Node *> Seen;
+  auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) {
+    if (!Seen.insert(N).second)
+      return;
+    for (auto Strategy : Params.Table.getRecovery(N->State)) {
+      Options.push_back(PlaceholderRecovery{
+          NextTok,
+          Strategy.Result,
+          Strategy.Strategy,
+          N,
+          Path,
+      });
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Option: recover " << Params.G.symbolName(Strategy.Result)
+                 << " at token " << NextTok << "\n");
+    }
+    Path.push_back(N->Payload);
+    for (const GSS::Node *Parent : N->parents())
+      DFS(Parent, N->Payload->startTokenIndex(), DFS);
+    Path.pop_back();
+  };
+  for (auto *N : llvm::reverse(OldHeads))
+    DFS(N, TokenIndex, DFS);
+
+  // Now we select the option(s) we will use to recover.
+  //
+  // We prefer options starting further right, as these discard less code
+  // (e.g. we prefer to recover inner scopes rather than outer ones).
+  // The options also need to agree on an endpoint, so the parser has a
+  // consistent position afterwards.
+  //
+  // So conceptually we're sorting by the tuple (start, end), though we avoid
+  // computing `end` for options that can't be winners.
+
+  // Consider options starting further right first.
+  // Don't drop the others yet though, we may still use them if preferred fails.
+  llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
+    return L.Position > R.Position;
+  });
+
+  assert(NewHeads.empty()); // We may repeatedly populate and clear it.
+  llvm::Optional<Token::Range> RecoveryRange;
+  for (const PlaceholderRecovery &Option : Options) {
+    // If this starts further left than options we've already found, then
+    // we'll never find anything better. Skip computing End for the rest.
+    if (RecoveryRange && Option.Position < RecoveryRange->Begin)
+      break;
+
+    auto End =
+        findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens);
+    // Only consider recovery that advances the parse.
+    if (!End || *End <= TokenIndex)
+      continue;
+    if (RecoveryRange) {
+      // If this is worse than our previous options, ignore it.
+      if (RecoveryRange->End < *End)
+        continue;
+      // If this is an improvement over our previous options, then drop them.
+      if (RecoveryRange->End > *End)
+        NewHeads.clear();
+    }
+    // Create recovery nodes and heads for them in the GSS. These may be
+    // discarded if a better recovery is later found, but this path isn't hot.
+    RecoveryRange = {Option.Position, *End};
+    const ForestNode &Placeholder =
+        Params.Forest.createOpaque(Option.Symbol, Option.Position);
+    const GSS::Node *NewHead = Params.GSStack.addNode(
+        Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol),
+        &Placeholder, {Option.RecoveryNode});
+    NewHeads.push_back(NewHead);
+  }
+
+  // Advance the cursor, whether recovery succeeded or not.
+  if (RecoveryRange) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
+      for (const auto *Head : NewHeads)
+        llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol());
+      llvm::dbgs() << "\n";
+    });
+    TokenIndex = RecoveryRange->End;
+  } else {
+    LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
+                            << " strategies\n");
+    ++TokenIndex;
+  }
+}
 
 using StateID = LRTable::StateID;
 
@@ -31,8 +181,9 @@
   std::vector<std::string> ParentStates;
   for (const auto *Parent : N.parents())
     ParentStates.push_back(llvm::formatv("{0}", Parent->State));
-  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
-                      N.Payload->symbol(), llvm::join(ParentStates, ", "));
+  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
+                      N.Payload ? N.Payload->symbol() : 0,
+                      llvm::join(ParentStates, ", "));
   return OS;
 }
 
@@ -426,15 +577,27 @@
     GSS.gc(std::move(Roots));
   };
   // Each iteration fully processes a single token.
-  for (unsigned I = 0; I < Terminals.size(); ++I) {
+  for (unsigned I = 0; I < Terminals.size();) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
                    "Next token {0} (id={1})\n",
                    G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
     // Consume the token.
     glrShift(Heads, Terminals[I], Params, NextHeads);
+
+    // If we weren't able to consume the token, try to skip over some tokens
+    // so we can keep parsing.
+    if (NextHeads.empty()) {
+      glrRecover(Heads, I, Tokens, Params, NextHeads);
+      if (NextHeads.empty())
+        // FIXME: Ensure the `_ := start-symbol` rules have a fallback
+        // error-recovery strategy attached. Then this condition can't happen.
+        return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
+    } else
+      ++I;
+
     // Form nonterminals containing the token we just consumed.
-    SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
-                                                   : Terminals[I + 1].symbol();
+    SymbolID Lookahead =
+        I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);
@@ -443,22 +606,35 @@
   }
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
 
+  // The parse was successful if we're in state `_ := start-symbol .`
   StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
-  const ForestNode *Result = nullptr;
-  for (const auto *Head : Heads) {
-    if (Head->State == AcceptState) {
-      assert(Head->Payload->symbol() == StartSymbol);
-      assert(Result == nullptr && "multiple results!");
-      Result = Head->Payload;
+  auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
+    const ForestNode *Result = nullptr;
+    for (const auto *Head : Heads) {
+      if (Head->State == AcceptState) {
+        assert(Head->Payload->symbol() == StartSymbol);
+        assert(Result == nullptr && "multiple results!");
+        Result = Head->Payload;
+      }
     }
-  }
-  if (Result)
+    return Result;
+  };
+  if (auto *Result = SearchForAccept(Heads))
     return *Result;
+  // Failed to parse the input, attempt to run recovery.
+  // FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
+  // More elegant is to include EOF in the token stream, and make the
+  // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
+  // would not be a special case: it shows up as a failure to shift the EOF
+  // token.
+  unsigned I = Terminals.size();
+  glrRecover(Heads, I, Tokens, Params, NextHeads);
+  Reduce(NextHeads, tokenSymbol(tok::eof));
+  if (auto *Result = SearchForAccept(NextHeads))
+    return *Result;
+
   // We failed to parse the input, returning an opaque forest node for recovery.
-  //
-  // FIXME: We will need to invoke our generic error-recovery handlers when we
-  // reach EOF without reaching accept state, and involving the eof
-  // token in the above main for-loopmay be the best way to reuse the code).
+  // FIXME: as above, we can add fallback error handling so this is impossible.
   return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
 }
 
@@ -469,9 +645,10 @@
 }
 
const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
                              llvm::ArrayRef<const Node *> Parents) {
   Node *Result = new (allocate(Parents.size()))
-      Node({State, GCParity, static_cast<unsigned>(Parents.size())});
+      Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
   Alive.push_back(Result);
   ++NodesCreated;
   Result->Payload = Symbol;
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -123,6 +123,11 @@
     uint16_t Value : ValueBits;
   };
 
+  struct Recovery {
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+
   // Returns all available actions for the given state on a terminal.
   // Expected to be called by LR parsers.
   llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
@@ -139,6 +144,12 @@
   // Returns empty if no available actions in the table.
   llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
 
+  // Looks up available recovery actions if we stopped parsing in this state.
+  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
+    return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
+                              Recoveries.data() + RecoveryOffset[State + 1]);
+  }
+
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
   //
@@ -169,8 +180,14 @@
     SymbolID Symbol;
     Action Act;
   };
-  // Build a specifid table for testing purposes.
-  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+  struct RecoveryEntry {
+    StateID State;
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+  // Build a specified table for testing purposes.
+  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>,
+                               llvm::ArrayRef<RecoveryEntry> = {});
 
 private:
   // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
@@ -188,6 +205,11 @@
   std::vector<Action> Actions;
   // A sorted table, storing the start state for each target parsing symbol.
   std::vector<std::pair<SymbolID, StateID>> StartStates;
+
+  // Recovery stores all recovery actions from all states.
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector<uint32_t> RecoveryOffset;
+  std::vector<Recovery> Recoveries;
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
 
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
@@ -137,8 +137,20 @@
     SymbolID Label;
   };
 
+  // A possible error recovery: choose to match some tokens against a symbol.
+  //
+  // e.g. a state that contains
+  //   stmt := { . stmt-seq [recover=braces] }
+  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
+  struct Recovery {
+    StateID Src; // The state we are in when encountering the error.
+    RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
+    SymbolID Result;           // The symbol that is produced.
+  };
+
   llvm::ArrayRef<State> states() const { return States; }
   llvm::ArrayRef<Edge> edges() const { return Edges; }
+  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
   llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
     return StartStates;
   }
@@ -147,12 +159,15 @@
 
 private:
   LRGraph(std::vector<State> States, std::vector<Edge> Edges,
+          std::vector<Recovery> Recoveries,
           std::vector<std::pair<SymbolID, StateID>> StartStates)
       : States(std::move(States)), Edges(std::move(Edges)),
-        StartStates(std::move(StartStates)) {}
+        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
+  }
 
   std::vector<State> States;
   std::vector<Edge> Edges;
+  std::vector<Recovery> Recoveries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
@@ -81,9 +81,12 @@
   assert(SID < NumTerminals);
   return static_cast<tok::TokenKind>(SID);
 }
-inline SymbolID tokenSymbol(tok::TokenKind TK) {
+inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
   return TokenFlag | static_cast<SymbolID>(TK);
 }
+// Error recovery strategies.
+// FIXME: these should be provided as extensions instead.
+enum class RecoveryStrategy : uint8_t { None, Braces };
 
 // An extension is a piece of native code specific to a grammar that modifies
 // the behavior of annotated rules. One ExtensionID is assigned for each unique
@@ -107,7 +110,7 @@
   // length to 9 (this is the longest sequence in cxx grammar).
   static constexpr unsigned SizeBits = 4;
   static constexpr unsigned MaxElements = 9;
-  static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
+  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
   static_assert(SizeBits + SymbolBits <= 16,
                 "Must be able to store symbol ID + size efficiently");
 
@@ -123,6 +126,13 @@
   // being set for this rule.
   ExtensionID Guard = 0;
 
+  // Specifies the index within Sequence eligible for error recovery.
+  // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
+  // should recover by finding the matching brace, and forcing stmt-seq to match
+  // everything between braces.
+  uint8_t RecoveryIndex = -1;
+  RecoveryStrategy Recovery = RecoveryStrategy::None;
+
   llvm::ArrayRef<SymbolID> seq() const {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -144,6 +144,26 @@
 void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
                const ParseParams &Params);
 
+// Heuristically recover from a state where no further parsing is possible.
+//
+// OldHeads is the parse state at TokenIndex.
+// This function consumes zero or more tokens (advancing TokenIndex),
+// and places any recovery states created in NewHeads.
+//
+// On failure, NewHeads is empty and TokenIndex is unchanged.
+//
+// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
+// there is a risk of the parser falling into an infinite loop, creating an
+// endless sequence of recovery nodes.
+// Generally it is safe for recovery to match 0 tokens against sequence symbols
+// like `statement-seq`, as the grammar won't permit another statement-seq
+// immediately afterwards. However recovery strategies for `statement` should
+// consume at least one token, as statements may be adjacent in the input.
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads);
+
 } // namespace pseudo
 } // namespace clang
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to