[PATCH] D128472: [pseudo] Check follow-sets instead of tying reduce actions to lookahead tokens.

Sam McCall via Phabricator via cfe-commits Thu, 23 Jun 2022 15:18:21 -0700

sammccall created this revision.
sammccall added a reviewer: hokein.
Herald added a subscriber: mgrang.
Herald added a project: All.
sammccall requested review of this revision.
Herald added subscribers: cfe-commits, alextsao1999.
Herald added a project: clang-tools-extra.


Previously, the action table stores a reduce action for each lookahead
token it should allow. These tokens are the followSet(action.rule.target).

In practice, the follow sets are large, so we spend a bunch of time binary
searching around all these essentially-duplicates to check whether our lookahead
token is there.
However the number of reduces for a given state is very small, so we're
much better off linear scanning over them and performing a fast check for each.

D128318 <https://reviews.llvm.org/D128318> was an attempt at this, storing a 
bitmap for each reduce.
However it's even more compact just to use the follow sets directly, as
there are fewer nonterminals than (state, rule) pairs. It's also faster.

This specialized approach means unbundling Reduce from other actions in
LRTable, so it's no longer useful to support it in Action. I suspect
Action will soon go away, as we store each kind of action separately.

This improves glrParse speed by 42% (3.30 -> 4.69 MB/s).
It also reduces LR table size by 59% (343 -> 142kB).


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D128472

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
  clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
  clang-tools-extra/pseudo/test/lr-build-basic.test
  clang-tools-extra/pseudo/test/lr-build-conflicts.test
  clang-tools-extra/pseudo/unittests/GLRTest.cpp
  clang-tools-extra/pseudo/unittests/LRTableTest.cpp

Index: clang-tools-extra/pseudo/unittests/LRTableTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/LRTableTest.cpp
+++ clang-tools-extra/pseudo/unittests/LRTableTest.cpp
@@ -9,6 +9,7 @@
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang/Basic/TokenKinds.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include <vector>
@@ -17,36 +18,59 @@
 namespace pseudo {
 namespace {
 
-using testing::IsEmpty;
-using testing::UnorderedElementsAre;
+using llvm::ValueIs;
+using testing::ElementsAre;
 using Action = LRTable::Action;
 
 TEST(LRTable, Builder) {
-  GrammarTable GTable;
-
-  //           eof   semi  ...
-  // +-------+----+-------+---
-  // |state0 |    | s0,r0 |...
-  // |state1 | acc|       |...
-  // |state2 |    |  r1   |...
-  // +-------+----+-------+---
+  std::vector<std::string> GrammarDiags;
+  auto G = Grammar::parseBNF(R"bnf(
+    _ := expr
+    expr := term
+    expr := expr + term
+    term := IDENTIFIER
+  )bnf",
+                             GrammarDiags);
+  EXPECT_THAT(GrammarDiags, testing::IsEmpty());
+
+  SymbolID Term = *G->findNonterminal("term");
+  SymbolID Eof = tokenSymbol(tok::eof);
+  SymbolID Semi = tokenSymbol(tok::semi);
+  SymbolID Int = tokenSymbol(tok::kw_int);
+  SymbolID Plus = tokenSymbol(tok::plus);
+
+  //           eof   semi   term   reduce
+  // +-------+----+-------+------+-------
+  // |state0 |    | s0    |      | r0
+  // |state1 |    |       | g3   | acc
+  // |state2 |    |       |      | r1
+  // +-------+----+-------+------+-------
   std::vector<LRTable::Entry> Entries = {
-      {/* State */ 0, tokenSymbol(tok::semi), Action::shift(0)},
-      {/* State */ 0, tokenSymbol(tok::semi), Action::reduce(0)},
-      {/* State */ 1, tokenSymbol(tok::eof), Action::reduce(2)},
-      {/* State */ 2, tokenSymbol(tok::semi), Action::reduce(1)}};
-  GrammarTable GT;
-  LRTable T = LRTable::buildForTests(GT, Entries);
-  EXPECT_THAT(T.find(0, tokenSymbol(tok::eof)), IsEmpty());
-  EXPECT_THAT(T.find(0, tokenSymbol(tok::semi)),
-              UnorderedElementsAre(Action::shift(0), Action::reduce(0)));
-  EXPECT_THAT(T.find(1, tokenSymbol(tok::eof)),
-              UnorderedElementsAre(Action::reduce(2)));
-  EXPECT_THAT(T.find(1, tokenSymbol(tok::semi)), IsEmpty());
-  EXPECT_THAT(T.find(2, tokenSymbol(tok::semi)),
-              UnorderedElementsAre(Action::reduce(1)));
+      {/* State */ 0, Semi, Action::shift(0)},
+      {/* State */ 1, Term, Action::goTo(3)},
+  };
+  std::vector<LRTable::ReduceEntry> ReduceEntries = {
+      {/* State=*/0, 0},
+      {/* State=*/1, 2},
+      {/* State=*/2, 1},
+  };
+  LRTable T = LRTable::buildForTests(*G, Entries, ReduceEntries);
+  EXPECT_EQ(T.getShiftState(0, Eof), llvm::None);
+  EXPECT_THAT(T.getShiftState(0, Semi), ValueIs(0));
+  EXPECT_THAT(T.getReduceRules(0), ElementsAre(0));
+
+  EXPECT_EQ(T.getShiftState(1, Eof), llvm::None);
+  EXPECT_EQ(T.getShiftState(1, Semi), llvm::None);
+  EXPECT_EQ(T.getGoToState(1, Term), 3);
+  EXPECT_THAT(T.getReduceRules(1), ElementsAre(2));
+
   // Verify the behaivor for other non-available-actions terminals.
-  EXPECT_THAT(T.find(2, tokenSymbol(tok::kw_int)), IsEmpty());
+  EXPECT_EQ(T.getShiftState(2, Int), llvm::None);
+
+  // Check follow sets.
+  EXPECT_TRUE(T.canFollow(Term, Plus));
+  EXPECT_TRUE(T.canFollow(Term, Eof));
+  EXPECT_FALSE(T.canFollow(Term, Int));
 }
 
 } // namespace
Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -31,6 +31,7 @@
 
 using Action = LRTable::Action;
 using testing::AllOf;
+using testing::ElementsAre;
 using testing::UnorderedElementsAre;
 
 MATCHER_P(state, StateID, "") { return arg->State == StateID; }
@@ -112,11 +113,13 @@
 
   buildGrammar({}, {}); // Create a fake empty grammar.
   LRTable T =
-      LRTable::buildForTests(G->table(), /*Entries=*/{
+      LRTable::buildForTests(*G, /*Entries=*/
+                             {
                                  {1, tokenSymbol(tok::semi), Action::shift(4)},
                                  {2, tokenSymbol(tok::semi), Action::shift(4)},
                                  {3, tokenSymbol(tok::semi), Action::shift(5)},
-                             });
+                             },
+                             {});
 
   ForestNode &SemiTerminal = Arena.createTerminal(tok::semi, 0);
   std::vector<const GSS::Node *> NewHeads;
@@ -142,14 +145,15 @@
                {"class-name := IDENTIFIER", "enum-name := IDENTIFIER"});
 
   LRTable Table = LRTable::buildForTests(
-      G->table(), {
-                      {/*State=*/0, id("class-name"), Action::goTo(2)},
-                      {/*State=*/0, id("enum-name"), Action::goTo(3)},
-                      {/*State=*/1, tokenSymbol(tok::l_brace),
-                       Action::reduce(ruleFor("class-name"))},
-                      {/*State=*/1, tokenSymbol(tok::l_brace),
-                       Action::reduce(ruleFor("enum-name"))},
-                  });
+      *G,
+      {
+          {/*State=*/0, id("class-name"), Action::goTo(2)},
+          {/*State=*/0, id("enum-name"), Action::goTo(3)},
+      },
+      {
+          {/*State=*/1, ruleFor("class-name")},
+          {/*State=*/1, ruleFor("enum-name")},
+      });
 
   const auto *GSSNode0 =
       GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
@@ -157,7 +161,7 @@
       GSStack.addNode(1, &Arena.createTerminal(tok::identifier, 0), {GSSNode0});
 
   std::vector<const GSS::Node *> Heads = {GSSNode1};
-  glrReduce(Heads, tokenSymbol(tok::l_brace), {*G, Table, Arena, GSStack});
+  glrReduce(Heads, tokenSymbol(tok::eof), {*G, Table, Arena, GSStack});
   EXPECT_THAT(Heads, UnorderedElementsAre(
                          GSSNode1,
                          AllOf(state(2), parsedSymbolID(id("class-name")),
@@ -189,15 +193,16 @@
       /*Parents=*/{GSSNode2, GSSNode3});
 
   LRTable Table = LRTable::buildForTests(
-      G->table(),
+      *G,
       {
           {/*State=*/2, id("ptr-operator"), Action::goTo(/*NextState=*/5)},
           {/*State=*/3, id("ptr-operator"), Action::goTo(/*NextState=*/6)},
-          {/*State=*/4, tokenSymbol(tok::identifier),
-           Action::reduce(ruleFor("ptr-operator"))},
+      },
+      {
+          {/*State=*/4, ruleFor("ptr-operator")},
       });
   std::vector<const GSS::Node *> Heads = {GSSNode4};
-  glrReduce(Heads, tokenSymbol(tok::identifier), {*G, Table, Arena, GSStack});
+  glrReduce(Heads, tokenSymbol(tok::eof), {*G, Table, Arena, GSStack});
 
   EXPECT_THAT(Heads, UnorderedElementsAre(
                          GSSNode4,
@@ -242,17 +247,17 @@
 
   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
   LRTable Table = LRTable::buildForTests(
-      G->table(),
+      *G,
       {
           {/*State=*/1, id("type-name"), Action::goTo(/*NextState=*/5)},
           {/*State=*/2, id("type-name"), Action::goTo(/*NextState=*/5)},
-          {/*State=*/3, tokenSymbol(tok::l_paren),
-           Action::reduce(/* type-name := class-name */ 0)},
-          {/*State=*/4, tokenSymbol(tok::l_paren),
-           Action::reduce(/* type-name := enum-name */ 1)},
+      },
+      {
+          {/*State=*/3, /* type-name := class-name */ 0},
+          {/*State=*/4, /* type-name := enum-name */ 1},
       });
   std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
-  glrReduce(Heads, tokenSymbol(tok::l_paren), {*G, Table, Arena, GSStack});
+  glrReduce(Heads, tokenSymbol(tok::eof), {*G, Table, Arena, GSStack});
 
   // Verify that the stack heads are joint at state 5 after reduces.
   EXPECT_THAT(Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
@@ -299,16 +304,17 @@
                       /*Parents=*/{GSSNode2});
 
   // FIXME: figure out a way to get rid of the hard-coded reduce RuleID!
-  LRTable Table = LRTable::buildForTests(
-      G->table(), {
-                      {/*State=*/0, id("pointer"), Action::goTo(5)},
-                      {3, tokenSymbol(tok::l_paren),
-                       Action::reduce(/* pointer := class-name */ 0)},
-                      {4, tokenSymbol(tok::l_paren),
-                       Action::reduce(/* pointer := enum-name */ 1)},
-                  });
+  LRTable Table =
+      LRTable::buildForTests(*G,
+                             {
+                                 {/*State=*/0, id("pointer"), Action::goTo(5)},
+                             },
+                             {
+                                 {3, /* pointer := class-name */ 0},
+                                 {4, /* pointer := enum-name */ 1},
+                             });
   std::vector<const GSS::Node *> Heads = {GSSNode3, GSSNode4};
-  glrReduce(Heads, tokenSymbol(tok::l_paren), {*G, Table, Arena, GSStack});
+  glrReduce(Heads, tokenSymbol(tok::eof), {*G, Table, Arena, GSStack});
 
   EXPECT_THAT(
       Heads, UnorderedElementsAre(GSSNode3, GSSNode4,
@@ -325,6 +331,38 @@
             "[  1, end)   ââ* := tok[1]\n");
 }
 
+TEST_F(GLRTest, ReduceLookahead) {
+  // A term can be followed by +, but not by -.
+  buildGrammar({"sum", "term"}, {"expr := term + term", "term := IDENTIFIER"});
+  LRTable Table =
+      LRTable::buildForTests(*G,
+                             {
+                                 {/*State=*/0, id("term"), Action::goTo(2)},
+                             },
+                             {
+                                 {/*State=*/1, 0},
+                             });
+
+  auto *Identifier = &Arena.createTerminal(tok::identifier, /*Start=*/0);
+
+  const auto *Root =
+      GSStack.addNode(/*State=*/0, /*ForestNode=*/nullptr, /*Parents=*/{});
+  const auto *GSSNode1 =
+      GSStack.addNode(/*State=*/1, /*ForestNode=*/Identifier, {Root});
+
+  // When the lookahead is +, reduce is performed.
+  std::vector<const GSS::Node *> Heads = {GSSNode1};
+  glrReduce(Heads, tokenSymbol(tok::plus), {*G, Table, Arena, GSStack});
+  EXPECT_THAT(Heads,
+              ElementsAre(GSSNode1, AllOf(state(2), parsedSymbolID(id("term")),
+                                          parents(Root))));
+
+  // When the lookahead is -, reduce is not performed.
+  Heads = {GSSNode1};
+  glrReduce(Heads, tokenSymbol(tok::minus), {*G, Table, Arena, GSStack});
+  EXPECT_THAT(Heads, ElementsAre(GSSNode1));
+}
+
 TEST_F(GLRTest, PerfectForestNodeSharing) {
   // Run the GLR on a simple grammar and test that we build exactly one forest
   // node per (SymbolID, token range).
Index: clang-tools-extra/pseudo/test/lr-build-conflicts.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-conflicts.test
+++ clang-tools-extra/pseudo/test/lr-build-conflicts.test
@@ -30,17 +30,15 @@
 # RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
 #      TABLE: LRTable:
 # TABLE-NEXT: State 0
-# TABLE-NEXT:     'IDENTIFIER': shift state 2
-# TABLE-NEXT:     'expr': go to state 1
+# TABLE-NEXT:     IDENTIFIER: shift state 2
+# TABLE-NEXT:     expr: go to state 1
 # TABLE-NEXT: State 1
-# TABLE-NEXT:     '-': shift state 3
+# TABLE-NEXT:     -: shift state 3
 # TABLE-NEXT: State 2
-# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := IDENTIFIER'
-# TABLE-NEXT:     '-': reduce by rule 1 'expr := IDENTIFIER'
+# TABLE-NEXT:     EOF -: reduce by rule 1 'expr := IDENTIFIER'
 # TABLE-NEXT: State 3
-# TABLE-NEXT:     'IDENTIFIER': shift state 2
-# TABLE-NEXT:     'expr': go to state 4
+# TABLE-NEXT:     IDENTIFIER: shift state 2
+# TABLE-NEXT:     expr: go to state 4
 # TABLE-NEXT: State 4
-# TABLE-NEXT:     'EOF': reduce by rule 0 'expr := expr - expr'
-# TABLE-NEXT:     '-': shift state 3
-# TABLE-NEXT:     '-': reduce by rule 0 'expr := expr - expr'
+# TABLE-NEXT:     -: shift state 3
+# TABLE-NEXT:     EOF -: reduce by rule 0 'expr := expr - expr'
Index: clang-tools-extra/pseudo/test/lr-build-basic.test
===================================================================
--- clang-tools-extra/pseudo/test/lr-build-basic.test
+++ clang-tools-extra/pseudo/test/lr-build-basic.test
@@ -18,11 +18,11 @@
 # RUN: clang-pseudo -grammar %s -print-table | FileCheck %s --check-prefix=TABLE
 #      TABLE: LRTable:
 # TABLE-NEXT: State 0
-# TABLE-NEXT:     'IDENTIFIER': shift state 3
-# TABLE-NEXT:     'expr': go to state 1
-# TABLE-NEXT:     'id': go to state 2
+# TABLE-NEXT:     IDENTIFIER: shift state 3
+# TABLE-NEXT:     expr: go to state 1
+# TABLE-NEXT:     id: go to state 2
 # TABLE-NEXT: State 1
 # TABLE-NEXT: State 2
-# TABLE-NEXT:     'EOF': reduce by rule 1 'expr := id'
+# TABLE-NEXT:     EOF: reduce by rule 1 'expr := id'
 # TABLE-NEXT: State 3
-# TABLE-NEXT:     'EOF': reduce by rule 0 'id := IDENTIFIER'
+# TABLE-NEXT:     EOF: reduce by rule 0 'id := IDENTIFIER'
Index: clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTableBuild.cpp
@@ -10,6 +10,7 @@
 #include "clang-pseudo/grammar/LRGraph.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/SmallSet.h"
 #include <cstdint>
 
 namespace llvm {
@@ -44,7 +45,7 @@
       : StartStates(StartStates) {}
 
   bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
-  LRTable build(const GrammarTable &GT, unsigned NumStates) && {
+  LRTable build(unsigned NumStates) && {
     // E.g. given the following parsing table with 3 states and 3 terminals:
     //
     //            a    b     c
@@ -86,16 +87,41 @@
         ++SortedIndex;
     }
     Table.StartStates = std::move(StartStates);
+
+    // Compile the follow sets into a bitmap.
+    Table.FollowSets.resize(tok::NUM_TOKENS * FollowSets.size());
+    for (SymbolID NT = 0; NT < FollowSets.size(); ++NT)
+      for (SymbolID Follow : FollowSets[NT])
+        Table.FollowSets.set(NT * tok::NUM_TOKENS + symbolToToken(Follow));
+
+    // Store the reduce actions in a vector partitioned by state.
+    Table.ReduceOffset.reserve(NumStates + 1);
+    std::vector<RuleID> StateRules;
+    for (StateID S = 0; S < NumStates; ++S) {
+      Table.ReduceOffset.push_back(Table.Reduces.size());
+      auto It = Reduces.find(S);
+      if (It == Reduces.end())
+        continue;
+      Table.Reduces.insert(Table.Reduces.end(), It->second.begin(),
+                           It->second.end());
+      std::sort(Table.Reduces.begin() + Table.ReduceOffset.back(),
+                Table.Reduces.end());
+    }
+    Table.ReduceOffset.push_back(Table.Reduces.size());
+
     return Table;
   }
 
+  llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduces;
+  std::vector<llvm::DenseSet<SymbolID>> FollowSets;
+
 private:
   llvm::DenseSet<Entry> Entries;
   std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
-LRTable LRTable::buildForTests(const GrammarTable &GT,
-                               llvm::ArrayRef<Entry> Entries) {
+LRTable LRTable::buildForTests(const Grammar &G, llvm::ArrayRef<Entry> Entries,
+                               llvm::ArrayRef<ReduceEntry> Reduces) {
   StateID MaxState = 0;
   for (const auto &Entry : Entries) {
     MaxState = std::max(MaxState, Entry.State);
@@ -107,7 +133,10 @@
   Builder Build({});
   for (const Entry &E : Entries)
     Build.insert(E);
-  return std::move(Build).build(GT, /*NumStates=*/MaxState + 1);
+  for (const ReduceEntry &E : Reduces)
+    Build.Reduces[E.State].insert(E.Rule);
+  Build.FollowSets = followSets(G);
+  return std::move(Build).build(/*NumStates=*/MaxState + 1);
 }
 
 LRTable LRTable::buildSLR(const Grammar &G) {
@@ -117,9 +146,10 @@
     Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
     Build.insert({T.Src, T.Label, Act});
   }
+  Build.FollowSets = followSets(G);
   assert(Graph.states().size() <= (1 << StateBits) &&
          "Graph states execceds the maximum limit!");
-  auto FollowSets = followSets(G);
+  // Add reduce actions.
   for (StateID SID = 0; SID < Graph.states().size(); ++SID) {
     for (const Item &I : Graph.states()[SID].Items) {
       // If we've just parsed the start symbol, this means we successfully parse
@@ -127,17 +157,13 @@
       // LRTable (the GLR parser handles it specifically).
       if (G.lookupRule(I.rule()).Target == G.underscore() && !I.hasNext())
         continue;
-      if (!I.hasNext()) {
+      if (!I.hasNext())
         // If we've reached the end of a rule A := ..., then we can reduce if
         // the next token is in the follow set of A.
-        for (SymbolID Follow : FollowSets[G.lookupRule(I.rule()).Target]) {
-          assert(isToken(Follow));
-          Build.insert({SID, Follow, Action::reduce(I.rule())});
-        }
-      }
+        Build.Reduces[SID].insert(I.rule());
     }
   }
-  return std::move(Build).build(G.table(), Graph.states().size());
+  return std::move(Build).build(Graph.states().size());
 }
 
 } // namespace pseudo
Index: clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
+++ clang-tools-extra/pseudo/lib/grammar/LRTable.cpp
@@ -10,6 +10,7 @@
 #include "clang-pseudo/grammar/Grammar.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
@@ -21,8 +22,6 @@
   switch (A.kind()) {
   case LRTable::Action::Shift:
     return OS << llvm::formatv("shift state {0}", A.getShiftState());
-  case LRTable::Action::Reduce:
-    return OS << llvm::formatv("reduce by rule {0}", A.getReduceRule());
   case LRTable::Action::GoTo:
     return OS << llvm::formatv("go to state {0}", A.getGoToState());
   case LRTable::Action::Sentinel:
@@ -36,9 +35,11 @@
 Statistics of the LR parsing table:
     number of states: {0}
     number of actions: {1}
-    size of the table (bytes): {2}
+    number of reduces: {2}
+    size of the table (bytes): {3}
 )",
-                       StateOffset.size() - 1, Actions.size(), bytes())
+                       StateOffset.size() - 1, Actions.size(), Reduces.size(),
+                       bytes())
       .str();
 }
 
@@ -52,19 +53,27 @@
       SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
       for (auto A : find(S, TokID)) {
         if (A.kind() == LRTable::Action::Shift)
-          OS.indent(4) << llvm::formatv("'{0}': shift state {1}\n",
+          OS.indent(4) << llvm::formatv("{0}: shift state {1}\n",
                                         G.symbolName(TokID), A.getShiftState());
-        else if (A.kind() == LRTable::Action::Reduce)
-          OS.indent(4) << llvm::formatv("'{0}': reduce by rule {1} '{2}'\n",
-                                        G.symbolName(TokID), A.getReduceRule(),
-                                        G.dumpRule(A.getReduceRule()));
       }
     }
+    for (RuleID R : getReduceRules(S)) {
+      SymbolID Target = G.lookupRule(R).Target;
+      std::vector<llvm::StringRef> Terminals;
+      for (unsigned Terminal = 0; Terminal < NumTerminals; ++Terminal) {
+        SymbolID TokID = tokenSymbol(static_cast<tok::TokenKind>(Terminal));
+        if (canFollow(Target, TokID))
+          Terminals.push_back(G.symbolName(TokID));
+      }
+      OS.indent(4) << llvm::formatv("{0}: reduce by rule {1} '{2}'\n",
+                                    llvm::join(Terminals, " "), R,
+                                    G.dumpRule(R));
+    }
     for (SymbolID NontermID = 0; NontermID < G.table().Nonterminals.size();
          ++NontermID) {
       if (find(S, NontermID).empty())
         continue;
-      OS.indent(4) << llvm::formatv("'{0}': go to state {1}\n",
+      OS.indent(4) << llvm::formatv("{0}: go to state {1}\n",
                                     G.symbolName(NontermID),
                                     getGoToState(S, NontermID));
     }
@@ -77,18 +86,12 @@
   // FIXME: we spend a significant amount of time on misses here.
   // We could consider storing a std::bitset for a cheaper test?
   assert(pseudo::isToken(Terminal) && "expected terminal symbol!");
-  for (const auto &Result : getActions(State, Terminal))
+  for (const auto &Result : find(State, Terminal))
     if (Result.kind() == Action::Shift)
       return Result.getShiftState(); // unique: no shift/shift conflicts.
   return llvm::None;
 }
 
-llvm::ArrayRef<LRTable::Action> LRTable::getActions(StateID State,
-                                                    SymbolID Terminal) const {
-  assert(pseudo::isToken(Terminal) && "expect terminal symbol!");
-  return find(State, Terminal);
-}
-
 LRTable::StateID LRTable::getGoToState(StateID State,
                                        SymbolID Nonterminal) const {
   assert(pseudo::isNonterminal(Nonterminal) && "expected nonterminal symbol!");
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -251,9 +251,8 @@
 private:
   // pop walks up the parent chain(s) for a reduction from Head by to Rule.
   // Once we reach the end, record the bases and sequences.
-  void pop(const GSS::Node *Head, RuleID RID) {
+  void pop(const GSS::Node *Head, RuleID RID, const Rule &Rule) {
     LLVM_DEBUG(llvm::dbgs() << "  Pop " << Params.G.dumpRule(RID) << "\n");
-    const auto &Rule = Params.G.lookupRule(RID);
     Family F{/*Start=*/0, /*Symbol=*/Rule.Target, /*Rule=*/RID};
     TempSequence.resize_for_overwrite(Rule.Size);
     auto DFS = [&](const GSS::Node *N, unsigned I, auto &DFS) {
@@ -286,11 +285,11 @@
       // In trivial cases, we perform the complete reduce here!
       if (popAndPushTrivial())
         continue;
-      for (const auto &A :
-           Params.Table.getActions((*Heads)[NextPopHead]->State, Lookahead)) {
-        if (A.kind() != LRTable::Action::Reduce)
-          continue;
-        pop((*Heads)[NextPopHead], A.getReduceRule());
+      for (RuleID RID :
+           Params.Table.getReduceRules((*Heads)[NextPopHead]->State)) {
+        const auto &Rule = Params.G.lookupRule(RID);
+        if (Params.Table.canFollow(Rule.Target, Lookahead))
+          pop((*Heads)[NextPopHead], RID, Rule);
       }
     }
   }
@@ -372,16 +371,16 @@
       return false;
     const GSS::Node *Head = Heads->back();
     llvm::Optional<RuleID> RID;
-    for (auto &A : Params.Table.getActions(Head->State, Lookahead)) {
-      if (A.kind() != LRTable::Action::Reduce)
-        continue;
+    for (RuleID R : Params.Table.getReduceRules(Head->State)) {
       if (RID.hasValue())
         return false;
-      RID = A.getReduceRule();
+      RID = R;
     }
     if (!RID.hasValue())
       return true; // no reductions available, but we've processed the head!
     const auto &Rule = Params.G.lookupRule(*RID);
+    if (!Params.Table.canFollow(Rule.Target, Lookahead))
+      return true; // reduction is not available
     const GSS::Node *Base = Head;
     TempSequence.resize_for_overwrite(Rule.Size);
     for (unsigned I = 0; I < Rule.Size; ++I) {
Index: clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -38,6 +38,8 @@
 
 #include "clang-pseudo/grammar/Grammar.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/Capacity.h"
 #include <cstdint>
 #include <vector>
 
@@ -62,6 +64,9 @@
 
   // Action represents the terminal and nonterminal actions, it combines the
   // entry of the ACTION and GOTO tables from the LR literature.
+  //
+  // FIXME: as we move away from a homogeneous table structure shared between
+  // action types, this class becomes less useful. Remove it.
   class Action {
   public:
     enum Kind : uint8_t {
@@ -73,8 +78,6 @@
       // A shift is a forward transition, and the value n is the next state that
       // the parser is to enter.
       Shift,
-      // Reduce by a rule: pop the state stack.
-      Reduce,
 
       // NOTE: there are no typical accept actions in the LRtable, accept
       // actions are handled specifically in the parser -- if the parser
@@ -91,7 +94,6 @@
 
     static Action goTo(StateID S) { return Action(GoTo, S); }
     static Action shift(StateID S) { return Action(Shift, S); }
-    static Action reduce(RuleID RID) { return Action(Reduce, RID); }
     static Action sentinel() { return Action(Sentinel, 0); }
 
     StateID getShiftState() const {
@@ -102,10 +104,6 @@
       assert(kind() == GoTo);
       return Value;
     }
-    RuleID getReduceRule() const {
-      assert(kind() == Reduce);
-      return Value;
-    }
     Kind kind() const { return static_cast<Kind>(K); }
 
     bool operator==(const Action &L) const { return opaque() == L.opaque(); }
@@ -123,9 +121,6 @@
     uint16_t Value : ValueBits;
   };
 
-  // Returns all available actions for the given state on a terminal.
-  // Expected to be called by LR parsers.
-  llvm::ArrayRef<Action> getActions(StateID State, SymbolID Terminal) const;
   // Returns the state after we reduce a nonterminal.
   // Expected to be called by LR parsers.
   // REQUIRES: Nonterminal is valid here.
@@ -135,9 +130,20 @@
   // If the terminal is invalid here, returns None.
   llvm::Optional<StateID> getShiftState(StateID State, SymbolID Terminal) const;
 
-  // Looks up available actions.
-  // Returns empty if no available actions in the table.
-  llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+  // Returns the possible reductions from a state.
+  // These are not keyed by a lookahead token. Instead, call canFollow() to
+  // check whether a reduction should apply in the current context.
+  llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
+    return llvm::makeArrayRef(&Reduces[ReduceOffset[State]],
+                              &Reduces[ReduceOffset[State + 1]]);
+  }
+  // Returns whether Terminal can follow Nonterminal in a valid source file.
+  bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
+    assert(isToken(Terminal));
+    assert(!isToken(Nonterminal));
+    return FollowSets.test(tok::NUM_TOKENS * Nonterminal +
+                           symbolToToken(Terminal));
+  }
 
   // Returns the state from which the LR parser should start to parse the input
   // tokens as the given StartSymbol.
@@ -151,9 +157,12 @@
   StateID getStartState(SymbolID StartSymbol) const;
 
   size_t bytes() const {
-    return sizeof(*this) + Actions.capacity() * sizeof(Action) +
-           Symbols.capacity() * sizeof(SymbolID) +
-           StateOffset.capacity() * sizeof(uint32_t);
+    return sizeof(*this) + llvm::capacity_in_bytes(Actions) +
+           llvm::capacity_in_bytes(Symbols) +
+           llvm::capacity_in_bytes(StateOffset) +
+           llvm::capacity_in_bytes(Reduces) +
+           llvm::capacity_in_bytes(ReduceOffset) +
+           llvm::capacity_in_bytes(FollowSets);
   }
 
   std::string dumpStatistics() const;
@@ -169,10 +178,18 @@
     SymbolID Symbol;
     Action Act;
   };
+  struct ReduceEntry {
+    StateID State;
+    RuleID Rule;
+  };
   // Build a specifid table for testing purposes.
-  static LRTable buildForTests(const GrammarTable &, llvm::ArrayRef<Entry>);
+  static LRTable buildForTests(const Grammar &G, llvm::ArrayRef<Entry>,
+                               llvm::ArrayRef<ReduceEntry>);
 
 private:
+  // Looks up actions stored in the generic table.
+  llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+
   // Conceptually the LR table is a multimap from (State, SymbolID) => Action.
   // Our physical representation is quite different for compactness.
 
@@ -188,6 +205,12 @@
   std::vector<Action> Actions;
   // A sorted table, storing the start state for each target parsing symbol.
   std::vector<std::pair<SymbolID, StateID>> StartStates;
+
+  // Reduces from state S: Reduces[ [ReduceOffset[S], ReduceOffset[S+1]) ]
+  std::vector<uint32_t> ReduceOffset;
+  std::vector<RuleID> Reduces;
+  // T in Follow(NT) == FollowSets[NT * NUM_TOKENS + tok::Kind(T)]
+  llvm::BitVector FollowSets;
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D128472: [pseudo] Check follow-sets instead of tying reduce actions to lookahead tokens.

Reply via email to