[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

Sam McCall via Phabricator via cfe-commits Mon, 27 Jun 2022 14:13:31 -0700

sammccall updated this revision to Diff 440393.
sammccall edited the summary of this revision.
sammccall added a comment.


update testcase


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128486/new/

https://reviews.llvm.org/D128486

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
  clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang-pseudo/GLR.h"
+#include "clang-pseudo/Bracket.h"
 #include "clang-pseudo/Token.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/LangOptions.h"
@@ -31,11 +32,13 @@
 
 using Action = LRTable::Action;
 using testing::AllOf;
+using testing::IsEmpty;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
 MATCHER_P(parsedSymbol, FNode, "") { return arg->Payload == FNode; }
 MATCHER_P(parsedSymbolID, SID, "") { return arg->Payload->symbol() == SID; }
+MATCHER_P(start, Start, "") { return arg->Payload->startTokenIndex() == Start; }
 
 testing::Matcher<const GSS::Node *>
 parents(llvm::ArrayRef<const GSS::Node *> Parents) {
@@ -233,9 +236,9 @@
       /*State=*/1, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
   const auto *GSSNode2 = GSStack.addNode(
       /*State=*/2, /*ForestNode=*/CVQualifierNode, /*Parents=*/{GSSNode0});
-  const auto *GSSNode3 =
-      GSStack.addNode(/*State=*/3, /*ForestNode=*/ClassNameNode,
-                      /*Parents=*/{GSSNode1});
+  const auto *GSSNode3 = GSStack.addNode(
+      /*State=*/3, /*ForestNode=*/ClassNameNode,
+      /*Parents=*/{GSSNode1});
   const auto *GSSNode4 =
       GSStack.addNode(/*State=*/4, /*ForestNode=*/EnumNameNode,
                       /*Parents=*/{GSSNode2});
@@ -325,6 +328,122 @@
             "[  1, end)   ââ* := tok[1]\n");
 }
 
+TEST_F(GLRTest, Recover) {
+  // Recovery while parsing "word" inside braces.
+  //  Before:
+  //    0--1({)--2(?)
+  //  After recovering a `word` at state 1:
+  //    0--3(word)  // 3 is goto(1, word)
+  buildGrammar({"word"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G->table(), {{/*State=*/1, id("word"), Action::goTo(3)}},
+      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("word")}});
+
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Question1 = &Arena.createTerminal(tok::question, 1);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+  const auto *AfterQuestion1 = GSStack.addNode(2, Question1, {OpenedBraces});
+
+  // Need a token stream with paired braces so the strategy works.
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? ? ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+
+  unsigned TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, Tokens, {*G, Table, Arena, GSStack},
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 4u) << "should skip ahead to matching brace";
+  EXPECT_THAT(NewHeads, ElementsAre(
+                            AllOf(state(3), parsedSymbolID(id("word")),
+                                  parents({OpenedBraces}), start(1u))));
+  EXPECT_EQ(NewHeads.front()->Payload->kind(), ForestNode::Opaque);
+
+  // Test recovery failure: omit closing brace so strategy fails
+  TokenStream NoRBrace = cook(lex("{ ? ? ? ?", LOptions), LOptions);
+  pairBrackets(NoRBrace);
+  NewHeads.clear();
+  TokenIndex = 2;
+  glrRecover({AfterQuestion1}, TokenIndex, NoRBrace,
+             {*G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 3u) << "should advance by 1 by default";
+  EXPECT_THAT(NewHeads, IsEmpty());
+}
+
+TEST_F(GLRTest, RecoverRightmost) {
+  // In a nested block structure, we recover at the innermost possible block.
+  //  Before:
+  //    0--1({)--1({)--1({)
+  //  After recovering a `block` at inside the second braces:
+  //    0--1({)--2(body)  // 2 is goto(1, body)
+  buildGrammar({"body"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G->table(), {{/*State=*/1, id("body"), Action::goTo(2)}},
+      /*Recovery=*/{{/*State=*/1, RecoveryStrategy::Braces, id("body")}});
+
+  clang::LangOptions LOptions;
+  // Innermost brace is unmatched, to test fallback to next brace.
+  TokenStream Tokens = cook(lex("{ { { ? ? } }", LOptions), LOptions);
+  Tokens.tokens()[0].Pair = 5;
+  Tokens.tokens()[1].Pair = 4;
+  Tokens.tokens()[4].Pair = 1;
+  Tokens.tokens()[5].Pair = 0;
+
+  auto *Brace1 = &Arena.createTerminal(tok::l_brace, 0);
+  auto *Brace2 = &Arena.createTerminal(tok::l_brace, 1);
+  auto *Brace3 = &Arena.createTerminal(tok::l_brace, 2);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *In1 = GSStack.addNode(1, Brace1, {Root});
+  const auto *In2 = GSStack.addNode(1, Brace2, {In1});
+  const auto *In3 = GSStack.addNode(1, Brace3, {In2});
+
+  unsigned TokenIndex = 3;
+  std::vector<const GSS::Node *> NewHeads;
+  glrRecover({In3}, TokenIndex, Tokens, {*G, Table, Arena, GSStack}, NewHeads);
+  EXPECT_EQ(TokenIndex, 5u);
+  EXPECT_THAT(NewHeads, ElementsAre(AllOf(state(2), parsedSymbolID(id("body")),
+                                          parents({In2}), start(2u))));
+}
+
+TEST_F(GLRTest, RecoverAlternatives) {
+  // Recovery inside braces with multiple equally good options
+  //  Before:
+  //    0--1({)
+  //  After recovering either `word` or `number` inside the braces:
+  //    0--1({)--2(word)   // 2 is goto(1, word)
+  //          â--3(number) // 3 is goto(1, number)
+  buildGrammar({"number", "word"}, {});
+  LRTable Table = LRTable::buildForTests(
+      G->table(),
+      {
+          {/*State=*/1, id("number"), Action::goTo(2)},
+          {/*State=*/1, id("word"), Action::goTo(3)},
+      },
+      /*Recovery=*/{
+          {/*State=*/1, RecoveryStrategy::Braces, id("number")},
+          {/*State=*/1, RecoveryStrategy::Braces, id("word")},
+      });
+  auto *LBrace = &Arena.createTerminal(tok::l_brace, 0);
+  const auto *Root = GSStack.addNode(0, nullptr, {});
+  const auto *OpenedBraces = GSStack.addNode(1, LBrace, {Root});
+
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ ? }", LOptions), LOptions);
+  pairBrackets(Tokens);
+  std::vector<const GSS::Node *> NewHeads;
+  unsigned TokenIndex = 1;
+
+  glrRecover({OpenedBraces}, TokenIndex, Tokens, {*G, Table, Arena, GSStack},
+             NewHeads);
+  EXPECT_EQ(TokenIndex, 2u);
+  EXPECT_THAT(NewHeads,
+              UnorderedElementsAre(AllOf(state(2), parsedSymbolID(id("number")),
+                                         parents({OpenedBraces}), start(1u)),
+                                   AllOf(state(3), parsedSymbolID(id("word")),
+                                         parents({OpenedBraces}), start(1u))));
+}
+
 TEST_F(GLRTest, PerfectForestNodeSharing) {
   // Run the GLR on a simple grammar and test that we build exactly one forest
   // node per (SymbolID, token range).
@@ -395,6 +514,40 @@
             "[  0, end)     ââIDENTIFIER := tok[0]\n");
 }
 
+TEST_F(GLRTest, RecoveryEndToEnd) {
+  // Simple example of brace-based recovery showing:
+  //  - recovered region includes tokens both ahead of and behind the cursor
+  //  - multiple possible recovery rules
+  //  - recovery from outer scopes is rejected
+  build(R"bnf(
+    _ := block
+
+    block := { block }
+    block := { numbers }
+    numbers := NUMERIC_CONSTANT NUMERIC_CONSTANT
+  )bnf");
+  auto LRTable = LRTable::buildSLR(*G);
+  clang::LangOptions LOptions;
+  TokenStream Tokens = cook(lex("{ { 42 ? } }", LOptions), LOptions);
+  pairBrackets(Tokens);
+
+  const ForestNode &Parsed =
+      glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("block"));
+  EXPECT_EQ(Parsed.dumpRecursive(*G),
+            "[  0, end) block := { block [recover=1] }\n"
+            "[  0,   1) ââ{ := tok[0]\n"
+            "[  1,   5) ââblock := <ambiguous>\n"
+            "[  1,   5) â ââblock := { block [recover=1] }\n"
+            "[  1,   2) â â ââ{ := tok[1]\n"
+            "[  2,   4) â â ââblock := <opaque>\n"
+            "[  4,   5) â â ââ} := tok[4]\n"
+            "[  1,   5) â ââblock := { numbers [recover=1] }\n"
+            "[  1,   2) â   ââ{ := tok[1]\n"
+            "[  2,   4) â   âânumbers := <opaque>\n"
+            "[  4,   5) â   ââ} := tok[4]\n"
+            "[  5, end) ââ} := tok[5]\n");
+}
+
 TEST_F(GLRTest, NoExplicitAccept) {
   build(R"bnf(
     _ := test
Index: clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/cxx/recovery-init-list.cpp
@@ -0,0 +1,21 @@
+// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --print-forest | FileCheck %s
+auto x = { complete garbage };
+// CHECK:      translation-unit~simple-declaration
+// CHECK-NEXT: ââdecl-specifier-seq~AUTO := tok[0]
+// CHECK-NEXT: ââinit-declarator-list~init-declarator
+// CHECK-NEXT: â ââdeclarator~IDENTIFIER := tok[1]
+// CHECK-NEXT: â ââinitializer~brace-or-equal-initializer
+// CHECK-NEXT: â   ââ= := tok[2]
+// CHECK-NEXT: â   ââinitializer-clause~braced-init-list := <ambiguous>
+//      FIXME:   Eliminate ambiguity by merging designated and non-designated
+//               initializers into a single initializer-list rule.
+//               Clang supports mixed initializers in C and as an extension.
+// CHECK-NEXT: â     ââbraced-init-list := { initializer-list [recover=1] }
+// CHECK-NEXT: â     â ââ{ := tok[3]
+// CHECK-NEXT: â     â ââinitializer-list := <opaque>
+// CHECK-NEXT: â     â ââ} := tok[6]
+// CHECK-NEXT: â     ââbraced-init-list := { designated-initializer-list [recover=1] }
+// CHECK-NEXT: â       ââ{ := tok[3]
+// CHECK-NEXT: â       ââdesignated-initializer-list := <opaque>
+// CHECK-NEXT: â       ââ} := tok[6]
+// CHECK-NEXT: ââ; := tok[7]
Index: clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
===================================================================
--- clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
+++ clang-tools-extra/pseudo/test/cxx/empty-member-spec.cpp
@@ -2,7 +2,7 @@
 class Foo {
 public:
 };
-// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification }
+// CHECK:      decl-specifier-seq~class-specifier := class-head { member-specification [recover=1] }
 // CHECK-NEXT: ââclass-head := class-key class-head-name
 // CHECK-NEXT: â ââclass-key~CLASS := tok[0]
 // CHECK-NEXT: â ââclass-head-name~IDENTIFIER := tok[1]
Index: clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
@@ -10,6 +10,7 @@
 #include "clang-pseudo/grammar/LRGraph.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang/Basic/TokenKinds.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cstdint>
 
 namespace llvm {
@@ -40,8 +41,9 @@
 
 class LRTable::Builder {
 public:
-  Builder(llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates)
-      : StartStates(StartStates) {}
+  Builder(llvm::ArrayRef<LRGraph::Recovery> Recoveries,
+          llvm::ArrayRef<std::pair<SymbolID, StateID>> StartStates)
+      : Recoveries(Recoveries), StartStates(StartStates) {}
 
   bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
   LRTable build(const GrammarTable &GT, unsigned NumStates) && {
@@ -86,16 +88,42 @@
         ++SortedIndex;
     }
     Table.StartStates = std::move(StartStates);
+
+    // Error recovery entries: sort (no dups already), and build offset lookup.
+    llvm::sort(Recoveries,
+               [&](const LRGraph::Recovery &L, const LRGraph::Recovery &R) {
+                 return std::tie(L.Src, L.Result, L.Strategy) <
+                        std::tie(R.Src, R.Result, R.Strategy);
+               });
+    Table.Recoveries.reserve(Recoveries.size());
+    for (const auto &R : Recoveries)
+      Table.Recoveries.push_back({R.Strategy, R.Result});
+    Table.RecoveryOffset = std::vector<uint32_t>(NumStates + 1, 0);
+    SortedIndex = 0;
+    for (StateID State = 0; State < NumStates; ++State) {
+      Table.RecoveryOffset[State] = SortedIndex;
+      while (SortedIndex < Recoveries.size() &&
+             Recoveries[SortedIndex].Src == State)
+        SortedIndex++;
+    }
+    Table.RecoveryOffset[NumStates] = SortedIndex;
+    assert(SortedIndex == Recoveries.size());
+
     return Table;
   }
 
 private:
   llvm::DenseSet<Entry> Entries;
+  std::vector<LRGraph::Recovery> Recoveries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
 LRTable LRTable::buildForTests(const GrammarTable &GT,
-                               llvm::ArrayRef<Entry> Entries) {
+                               llvm::ArrayRef<Entry> Entries,
+                               llvm::ArrayRef<RecoveryEntry> Recoveries) {
+  std::vector<LRGraph::Recovery> GRecoveries;
+  for (const auto &R : Recoveries)
+    GRecoveries.push_back({R.State, R.Strategy, R.Result});
   StateID MaxState = 0;
   for (const auto &Entry : Entries) {
     MaxState = std::max(MaxState, Entry.State);
@@ -104,7 +132,7 @@
     if (Entry.Act.kind() == LRTable::Action::GoTo)
       MaxState = std::max(MaxState, Entry.Act.getGoToState());
   }
-  Builder Build({});
+  Builder Build(GRecoveries, {});
   for (const Entry &E : Entries)
     Build.insert(E);
   return std::move(Build).build(GT, /*NumStates=*/MaxState + 1);
@@ -112,7 +140,7 @@
 
 LRTable LRTable::buildSLR(const Grammar &G) {
   auto Graph = LRGraph::buildLR0(G);
-  Builder Build(Graph.startStates());
+  Builder Build(Graph.recoveries(), Graph.startStates());
   for (const auto &T : Graph.edges()) {
     Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
     Build.insert({T.Src, T.Label, Act});
Index: clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRGraph.cpp
@@ -120,6 +120,20 @@
   return Results;
 }
 
+std::vector<std::pair<RecoveryStrategy, SymbolID>>
+availableRecovery(const State &S, const Grammar &G) {
+  std::vector<std::pair<RecoveryStrategy, SymbolID>> Result;
+  for (const Item &I : S.Items) {
+    const auto &Rule = G.lookupRule(I.rule());
+    if (I.dot() != Rule.RecoveryIndex)
+      continue;
+    Result.push_back({Rule.Recovery, Rule.seq()[Rule.RecoveryIndex]});
+  }
+  llvm::sort(Result);
+  Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
+  return Result;
+}
+
 } // namespace
 
 std::string Item::dump(const Grammar &G) const {
@@ -130,9 +144,10 @@
       Results.push_back(G.symbolName(SID));
     return Results;
   };
-  return llvm::formatv("{0} := {1} â¢ {2}", G.symbolName(Rule.Target),
+  return llvm::formatv("{0} := {1} â¢ {2}{3}", G.symbolName(Rule.Target),
                        llvm::join(ToNames(Rule.seq().take_front(DotPos)), " "),
-                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "))
+                       llvm::join(ToNames(Rule.seq().drop_front(DotPos)), " "),
+                       Rule.RecoveryIndex == DotPos ? " [recovery]" : "")
       .str();
 }
 
@@ -181,6 +196,11 @@
       Edges.push_back({Src, Dst, Label});
     }
 
+    void insertRecovery(StateID Src, RecoveryStrategy Strategy,
+                        SymbolID Result) {
+      Recoveries.push_back({Src, Strategy, Result});
+    }
+
     // Returns a state with the given id.
     const State &find(StateID ID) const {
       assert(ID < States.size());
@@ -194,9 +214,10 @@
     LRGraph build() && {
       States.shrink_to_fit();
       Edges.shrink_to_fit();
+      Recoveries.shrink_to_fit();
       llvm::sort(StartStates);
       StartStates.shrink_to_fit();
-      return LRGraph(std::move(States), std::move(Edges),
+      return LRGraph(std::move(States), std::move(Edges), std::move(Recoveries),
                      std::move(StartStates));
     }
 
@@ -205,6 +226,7 @@
     llvm::DenseMap<ItemSet, /*index of States*/ size_t> StatesIndex;
     std::vector<State> States;
     std::vector<Edge> Edges;
+    std::vector<Recovery> Recoveries;
     const Grammar &G;
     std::vector<std::pair<SymbolID, StateID>> StartStates;
   } Builder(G);
@@ -225,15 +247,16 @@
   }
 
   while (!PendingStates.empty()) {
-    auto CurrentStateID = PendingStates.back();
+    auto StateID = PendingStates.back();
     PendingStates.pop_back();
-    for (auto Next :
-         nextAvailableKernelItems(Builder.find(CurrentStateID), G)) {
+    for (auto Next : nextAvailableKernelItems(Builder.find(StateID), G)) {
       auto Insert = Builder.insert(Next.second);
       if (Insert.second) // new state, insert to the pending queue.
         PendingStates.push_back(Insert.first);
-      Builder.insertEdge(CurrentStateID, Insert.first, Next.first);
+      Builder.insertEdge(StateID, Insert.first, Next.first);
     }
+    for (auto Recovery : availableRecovery(Builder.find(StateID), G))
+      Builder.insertRecovery(StateID, Recovery.first, Recovery.second);
   }
   return std::move(Builder).build();
 }
Index: clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -106,6 +106,17 @@
 
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
+    // Wherever RHS contains { foo }, mark foo for brace-recovery.
+    // FIXME: this should be grammar annotations instead.
+    for (auto &Rule : T->Rules) {
+      for (unsigned I = 2; I < Rule.Size; ++I)
+        if (Rule.Sequence[I] == tokenSymbol(tok::r_brace) &&
+            Rule.Sequence[I - 2] == tokenSymbol(tok::l_brace) &&
+            !isToken(Rule.Sequence[I - 1])) {
+          Rule.Recovery = RecoveryStrategy::Braces;
+          Rule.RecoveryIndex = I - 1;
+        }
+    }
     const auto &SymbolOrder = getTopologicalOrder(T.get());
     llvm::stable_sort(
         T->Rules, [&SymbolOrder](const Rule &Left, const Rule &Right) {
Index: clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -59,8 +59,11 @@
   llvm::raw_string_ostream OS(Result);
   const Rule &R = T->Rules[RID];
   OS << symbolName(R.Target) << " :=";
-  for (SymbolID SID : R.seq())
-    OS << " " << symbolName(SID);
+  for (unsigned I = 0; I < R.Size; ++I) {
+    OS << " " << symbolName(R.Sequence[I]);
+    if (R.RecoveryIndex == I)
+      OS << " [recover=" << static_cast<unsigned>(R.Recovery) << "]";
+  }
   if (R.Guard)
     OS << " [guard=" << T->AttributeValues[R.Guard] << "]";
   return Result;
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -24,6 +24,156 @@
 
 namespace clang {
 namespace pseudo {
+namespace {
+
+llvm::Optional<unsigned>
+findRecoveryEndpoint(RecoveryStrategy Strategy,
+                     const GSS::Node *RecoveryNode,
+                     const TokenStream &Tokens) {
+  assert(Strategy == RecoveryStrategy::Braces);
+  const ForestNode *LBrace = RecoveryNode->Payload;
+  assert(LBrace->kind() == ForestNode::Terminal &&
+         LBrace->symbol() == tokenSymbol(tok::l_brace));
+  if (const Token *RBrace = Tokens.tokens()[LBrace->startTokenIndex()].pair())
+    return Tokens.index(*RBrace);
+  return llvm::None;
+}
+
+} // namespace
+
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads) {
+  LLVM_DEBUG(llvm::dbgs() << "Recovery at token " << TokenIndex << "...\n");
+  // Describes a possibility to recover by forcibly interpreting a range of
+  // tokens around the cursor as a nonterminal that we expected to see.
+  struct PlaceholderRecovery {
+    // The token prior to the nonterminal which is being recovered.
+    // This starts of the region we're skipping, so higher Position is better.
+    Token::Index Position;
+    // The nonterminal which will be created in order to recover.
+    SymbolID Symbol;
+    // The heuristic used to choose the bounds of the nonterminal to recover.
+    RecoveryStrategy Strategy;
+
+    // The GSS head where we are expecting the recovered nonterminal.
+    const GSS::Node *RecoveryNode;
+    // Payload of nodes on the way back from the OldHead to the recovery node.
+    // These represent the partial parse that is being discarded.
+    // They should become the children of the opaque recovery node.
+    //
+    // There may be multiple paths leading to the same recovery node, we choose
+    // one arbitrarily.
+    std::vector<const ForestNode *> Path;
+  };
+  std::vector<PlaceholderRecovery> Options;
+
+  // Find recovery options by walking up the stack.
+  //
+  // This is similar to exception handling: we walk up the "frames" of nested
+  // rules being parsed until we find one that has a "handler" which allows us
+  // to determine the node bounds without parsing it.
+  //
+  // Unfortunately there's a significant difference: the stack contains both
+  // "upward" nodes (ancestor parses) and "leftward" ones.
+  //  e.g. when parsing `int(2 + ?)`, the stack contains:
+  //    expr := expr + . expr     - which we're currently parsing
+  //    expr := type ( . expr )   - (up) we should recover this outer expr
+  //    expr := . type ( expr )   - (up+left) we should not recover this type!
+  //
+  // It's not obvious how to avoid collecting the latter as a recovery option.
+  // I think the distinction is ill-defined after merging items into states.
+  // For now, we have to take this into account when defining recovery rules.
+  // FIXME: find a more satisfying way to avoid such false recovery.
+  std::vector<const ForestNode *> Path;
+  llvm::DenseSet<const GSS::Node *> Seen;
+  auto DFS = [&](const GSS::Node *N, Token::Index NextTok, auto &DFS) {
+    if (!Seen.insert(N).second)
+      return;
+    for (auto Strategy : Params.Table.getRecovery(N->State)) {
+      Options.push_back(PlaceholderRecovery{
+          NextTok,
+          Strategy.Result,
+          Strategy.Strategy,
+          N,
+          Path,
+      });
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Option: recover " << Params.G.symbolName(Strategy.Result)
+                 << " at token " << NextTok << "\n");
+    }
+    Path.push_back(N->Payload);
+    for (const GSS::Node *Parent : N->parents())
+      DFS(Parent, N->Payload->startTokenIndex(), DFS);
+    Path.pop_back();
+  };
+  for (auto *N : llvm::reverse(OldHeads))
+    DFS(N, TokenIndex, DFS);
+
+  // Now we select the option(s) we will use to recover.
+  //
+  // We prefer options starting further right, as these discard less code
+  // (e.g. we prefer to recover inner scopes rather than outer ones).
+  // The options also need to agree on an endpoint, so the parser has a
+  // consistent position afterwards.
+  //
+  // So conceptually we're sorting by the tuple (start, end), though we avoid
+  // computing `end` for options that can't be winners.
+
+  // Consider options starting further right first.
+  // Don't drop the others yet though, we may still use them if preferred fails.
+  llvm::stable_sort(Options, [&](const auto &L, const auto &R) {
+    return L.Position > R.Position;
+  });
+
+  assert(NewHeads.empty()); // We may repeatedly populate and clear it.
+  llvm::Optional<Token::Range> RecoveryRange;
+  for (const PlaceholderRecovery &Option : Options) {
+    // If this starts further right than options we've already found, then
+    // we'll never find anything better. Skip computing End for the rest.
+    if (RecoveryRange && Option.Position < RecoveryRange->Begin)
+      break;
+
+    auto End =
+        findRecoveryEndpoint(Option.Strategy, Option.RecoveryNode, Tokens);
+    // Only consider recovery that advances the parse.
+    if (!End || *End <= TokenIndex)
+      continue;
+    if (RecoveryRange) {
+      // If this is worse than our previous options, ignore it.
+      if (RecoveryRange->End < *End)
+        continue;
+      // If this is an improvement over our previous options, then drop them.
+      if (RecoveryRange->End > *End)
+        NewHeads.clear();
+    }
+    // Create recovery nodes and heads for them in the GSS. These may be
+    // discarded if a better recovery is later found, but this path isn't hot.
+    RecoveryRange = {Option.Position, *End};
+    const ForestNode &Placeholder =
+        Params.Forest.createOpaque(Option.Symbol, Option.Position);
+    const GSS::Node *NewHead = Params.GSStack.addNode(
+        Params.Table.getGoToState(Option.RecoveryNode->State, Option.Symbol),
+        &Placeholder, {Option.RecoveryNode});
+    NewHeads.push_back(NewHead);
+  }
+
+  // Advance the cursor, whether recovery succeeded or not.
+  if (RecoveryRange) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Recovered range=" << *RecoveryRange << ":";
+      for (const auto *Head : NewHeads)
+        llvm::dbgs() << " " << Params.G.symbolName(Head->Payload->symbol());
+      llvm::dbgs() << "\n";
+    });
+    TokenIndex = RecoveryRange->End;
+  } else {
+    LLVM_DEBUG(llvm::dbgs() << "Recovery failed after trying " << Options.size()
+                            << " strategies\n");
+    ++TokenIndex;
+  }
+}
 
 using StateID = LRTable::StateID;
 
@@ -31,8 +181,9 @@
   std::vector<std::string> ParentStates;
   for (const auto *Parent : N.parents())
     ParentStates.push_back(llvm::formatv("{0}", Parent->State));
-  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {2}", N.State,
-                      N.Payload->symbol(), llvm::join(ParentStates, ", "));
+  OS << llvm::formatv("state {0}, parsed symbol {1}, parents {3}", N.State,
+                      N.Payload ? N.Payload->symbol() : 0,
+                      llvm::join(ParentStates, ", "));
   return OS;
 }
 
@@ -426,15 +577,27 @@
     GSS.gc(std::move(Roots));
   };
   // Each iteration fully processes a single token.
-  for (unsigned I = 0; I < Terminals.size(); ++I) {
+  for (unsigned I = 0; I < Terminals.size();) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
                    "Next token {0} (id={1})\n",
                    G.symbolName(Terminals[I].symbol()), Terminals[I].symbol()));
     // Consume the token.
     glrShift(Heads, Terminals[I], Params, NextHeads);
+
+    // If we weren't able to consume the token, try to skip over some tokens
+    // so we can keep parsing.
+    if (NextHeads.empty()) {
+      glrRecover(Heads, I, Tokens, Params, NextHeads);
+      if (NextHeads.empty())
+        // FIXME: Ensure the `_ := start-symbol` rules have a fallback
+        // error-recovery strategy attached. Then this condition can't happen.
+        return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
+    } else
+      ++I;
+
     // Form nonterminals containing the token we just consumed.
-    SymbolID Lookahead = I + 1 == Terminals.size() ? tokenSymbol(tok::eof)
-                                                   : Terminals[I + 1].symbol();
+    SymbolID Lookahead =
+        I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
     Reduce(NextHeads, Lookahead);
     // Prepare for the next token.
     std::swap(Heads, NextHeads);
@@ -443,22 +606,35 @@
   }
   LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Reached eof\n"));
 
+  // The parse was successful if we're in state `_ := start-symbol .`
   StateID AcceptState = Params.Table.getGoToState(StartState, StartSymbol);
-  const ForestNode *Result = nullptr;
-  for (const auto *Head : Heads) {
-    if (Head->State == AcceptState) {
-      assert(Head->Payload->symbol() == StartSymbol);
-      assert(Result == nullptr && "multiple results!");
-      Result = Head->Payload;
+  auto SearchForAccept = [&](llvm::ArrayRef<const GSS::Node *> Heads) {
+    const ForestNode *Result = nullptr;
+    for (const auto *Head : Heads) {
+      if (Head->State == AcceptState) {
+        assert(Head->Payload->symbol() == StartSymbol);
+        assert(Result == nullptr && "multiple results!");
+        Result = Head->Payload;
+      }
     }
-  }
-  if (Result)
+    return Result;
+  };
+  if (auto *Result = SearchForAccept(Heads))
     return *Result;
+  // Failed to parse the input, attempt to run recovery.
+  // FIXME: this awkwardly repeats the recovery in the loop, when shift fails.
+  // More elegant is to include EOF in the token stream, and make the
+  // augmented rule: `_ := translation-unit EOF`. In this way recovery at EOF
+  // would not be a special case: it show up as a failure to shift the EOF
+  // token.
+  unsigned I = Terminals.size();
+  glrRecover(Heads, I, Tokens, Params, NextHeads);
+  Reduce(NextHeads, tokenSymbol(tok::eof));
+  if (auto *Result = SearchForAccept(NextHeads))
+    return *Result;
+
   // We failed to parse the input, returning an opaque forest node for recovery.
-  //
-  // FIXME: We will need to invoke our generic error-recovery handlers when we
-  // reach EOF without reaching accept state, and involving the eof
-  // token in the above main for-loopmay be the best way to reuse the code).
+  // FIXME: as above, we can add fallback error handling so this is impossible.
   return Params.Forest.createOpaque(StartSymbol, /*Token::Index=*/0);
 }
 
@@ -469,9 +645,10 @@
 }
 
 const GSS::Node *GSS::addNode(LRTable::StateID State, const ForestNode *Symbol,
+
                               llvm::ArrayRef<const Node *> Parents) {
   Node *Result = new (allocate(Parents.size()))
-      Node({State, GCParity, static_cast<unsigned>(Parents.size())});
+      Node({State, GCParity, static_cast<uint16_t>(Parents.size())});
   Alive.push_back(Result);
   ++NodesCreated;
   Result->Payload = Symbol;
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -123,6 +123,11 @@
     uint16_t Value : ValueBits;
   };
 
+  struct Recovery {
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+
   // Returns all available actions for the given state on a terminal.
   // Expected to be called by LR parsers.
   llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
@@ -139,6 +144,12 @@
   // Returns empty if no available actions in the table.
   llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
 
+  // Looks up available recovery actions if we stopped parsing in this state.
+  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
+    return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
+                              Recoveries.data() + RecoveryOffset[State + 1]);
+  }
+
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
   //
@@ -169,8 +180,14 @@
     SymbolID Symbol;
     Action Act;
   };
-  // Build a specifid table for testing purposes.
-  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+  struct RecoveryEntry {
+    StateID State;
+    RecoveryStrategy Strategy;
+    SymbolID Result;
+  };
+  // Build a specified table for testing purposes.
+  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>,
+                               llvm::ArrayRef<RecoveryEntry> = {});
 
 private:
   // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
@@ -188,6 +205,11 @@
   std::vector<Action> Actions;
   // A sorted table, storing the start state for each target parsing symbol.
   std::vector<std::pair<SymbolID, StateID>> StartStates;
+
+  // Recovery stores all recovery actions from all states.
+  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
+  std::vector<uint32_t> RecoveryOffset;
+  std::vector<Recovery> Recoveries;
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
 
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRGraph.h
@@ -137,8 +137,20 @@
     SymbolID Label;
   };
 
+  // A possible error recovery: choose to match some tokens against a symbol.
+  //
+  // e.g. a state that contains
+  //   stmt := { . stmt-seq [recover=braces] }
+  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
+  struct Recovery {
+    StateID Src; // The state we are in when encountering the error.
+    RecoveryStrategy Strategy; // Heuristic choosing the tokens to match.
+    SymbolID Result;           // The symbol that is produced.
+  };
+
   llvm::ArrayRef<State> states() const { return States; }
   llvm::ArrayRef<Edge> edges() const { return Edges; }
+  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
   llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
     return StartStates;
   }
@@ -147,12 +159,15 @@
 
 private:
   LRGraph(std::vector<State> States, std::vector<Edge> Edges,
+          std::vector<Recovery> Recoveries,
           std::vector<std::pair<SymbolID, StateID>> StartStates)
       : States(std::move(States)), Edges(std::move(Edges)),
-        StartStates(std::move(StartStates)) {}
+        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
+  }
 
   std::vector<State> States;
   std::vector<Edge> Edges;
+  std::vector<Recovery> Recoveries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/Grammar.h
@@ -81,9 +81,12 @@
   assert(SID < NumTerminals);
   return static_cast<tok::TokenKind>(SID);
 }
-inline SymbolID tokenSymbol(tok::TokenKind TK) {
+inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
   return TokenFlag | static_cast<SymbolID>(TK);
 }
+// Error recovery strategies.
+// FIXME: these should be provided as extensions instead.
+enum class RecoveryStrategy : uint8_t { None, Braces };
 
 // An extension is a piece of native code specific to a grammar that modifies
 // the behavior of annotated rules. One ExtensionID is assigned for each unique
@@ -107,7 +110,7 @@
   // length to 9 (this is the longest sequence in cxx grammar).
   static constexpr unsigned SizeBits = 4;
   static constexpr unsigned MaxElements = 9;
-  static_assert(MaxElements <= (1 << SizeBits), "Exceeds the maximum limit");
+  static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
   static_assert(SizeBits + SymbolBits <= 16,
                 "Must be able to store symbol ID + size efficiently");
 
@@ -123,6 +126,13 @@
   // being set for this rule.
   ExtensionID Guard = 0;
 
+  // Specifies the index within Sequence eligible for error recovery.
+  // Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
+  // should recover by finding the matching brace, and forcing stmt-seq to match
+  // everything between braces.
+  uint8_t RecoveryIndex = -1;
+  RecoveryStrategy Recovery = RecoveryStrategy::None;
+
   llvm::ArrayRef<SymbolID> seq() const {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -144,6 +144,26 @@
 void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
                const ParseParams &Params);
 
+// Heuristically recover from a state where no further parsing is possible.
+//
+// OldHeads is the parse state at TokenIndex.
+// This function consumes consumes zero or more tokens (advancing TokenIndex),
+// and places any recovery states created in NewHeads.
+//
+// On failure, NewHeads is empty and TokenIndex is unchanged.
+//
+// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
+// there is a risk of the parser falling into an infinite loop, creating an
+// endless sequence of recovery nodes.
+// Generally it is safe for recovery to match 0 tokens against sequence symbols
+// like `statement-seq`, as the grammar won't permit another statement-seq
+// immediately afterwards. However recovery strategies for `statement` should
+// consume at least one token, as statements may be adjacent in the input.
+void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
+                unsigned &TokenIndex, const TokenStream &Tokens,
+                const ParseParams &Params,
+                std::vector<const GSS::Node *> &NewHeads);
+
 } // namespace pseudo
 } // namespace clang

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D128486: [pseudo] Add error-recovery framework & brace-based recovery

Reply via email to