hokein created this revision.
hokein added a reviewer: sammccall.
Herald added a subscriber: mgrang.
Herald added a project: All.
hokein requested review of this revision.
Herald added a subscriber: alextsao1999.
Herald added a project: clang-tools-extra.

With this patch, we're able to parse smaller chunks of C++ code (statement,
declaration), rather than translation-unit.

The target symbol is listed in the grammar in a form of `_ :=
statement`, each target symbol has a dedicated state (`_ := • statement`).
We create and track all these separate states in the LRTable. When we
start parsing, we lookup the corresponding state to start the parser.

LR pasing table changes with this patch:

- number of states: 1467 -> 1471
- number of actions: 82891 -> 83578
- size of the table (bytes): 334248 -> 336996


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D125006

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
  clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
  clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
  clang-tools-extra/pseudo/lib/GLR.cpp
  clang-tools-extra/pseudo/lib/LRGraph.cpp
  clang-tools-extra/pseudo/lib/LRTable.cpp
  clang-tools-extra/pseudo/lib/LRTableBuild.cpp
  clang-tools-extra/pseudo/lib/cxx.bnf
  clang-tools-extra/pseudo/test/glr-variant-start.cpp
  clang-tools-extra/pseudo/tool/ClangPseudo.cpp
  clang-tools-extra/pseudo/unittests/GLRTest.cpp

Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -344,7 +344,8 @@
   const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
   auto LRTable = LRTable::buildSLR(*G);
 
-  const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack});
+  const ForestNode &Parsed =
+      glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
   // Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
   // in the forest, see the `#1` and `=#1` in the dump string.
   EXPECT_EQ(Parsed.dumpRecursive(*G),
@@ -381,7 +382,8 @@
   const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
   auto LRTable = LRTable::buildSLR(*G);
 
-  const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack});
+  const ForestNode &Parsed =
+      glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
   EXPECT_EQ(Parsed.dumpRecursive(*G),
             "[  0, end) test := <ambiguous>\n"
             "[  0, end) ├─test := IDENTIFIER\n"
Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -38,6 +38,9 @@
                       desc("Print directive structure of source code"));
 static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
 static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
+static opt<std::string> ParseSymbol("parse-symbol",
+                                    desc("specify the target symbol to parse"),
+                                    init("translation-unit"));
 
 static std::string readOrDie(llvm::StringRef Path) {
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
@@ -50,6 +53,14 @@
   return Text.get()->getBuffer().str();
 }
 
+static clang::pseudo::SymbolID
+findNonterminalID(llvm::StringRef Name, const clang::pseudo::Grammar &G) {
+  for (clang::pseudo::SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID)
+    if (G.table().Nonterminals[ID].Name == Name)
+      return ID;
+  return 0;
+}
+
 int main(int argc, char *argv[]) {
   llvm::cl::ParseCommandLineOptions(argc, argv, "");
 
@@ -96,9 +107,9 @@
     if (ParseableStream) {
       clang::pseudo::ForestArena Arena;
       clang::pseudo::GSS GSS;
-      auto &Root =
-          glrParse(*ParseableStream,
-                   clang::pseudo::ParseParams{*G, LRTable, Arena, GSS});
+      auto &Root = glrParse(*ParseableStream,
+                            clang::pseudo::ParseParams{*G, LRTable, Arena, GSS},
+                            findNonterminalID(ParseSymbol, *G));
       if (PrintForest)
         llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true);
 
Index: clang-tools-extra/pseudo/test/glr-variant-start.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/glr-variant-start.cpp
@@ -0,0 +1,9 @@
+// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --parse-symbol=statement-seq --print-forest | FileCheck %s
+
+a + a;
+// CHECK:      statement-seq~expression-statement := expression ;
+// CHECK-NEXT: ├─expression~additive-expression := additive-expression + multiplicative-expression
+// CHECK-NEXT: │ ├─additive-expression~IDENTIFIER :=
+// CHECK-NEXT: │ ├─+ :=
+// CHECK-NEXT: │ └─multiplicative-expression~IDENTIFIER :=
+// CHECK-NEXT: └─; :=
Index: clang-tools-extra/pseudo/lib/cxx.bnf
===================================================================
--- clang-tools-extra/pseudo/lib/cxx.bnf
+++ clang-tools-extra/pseudo/lib/cxx.bnf
@@ -25,8 +25,11 @@
 # [1] https://isocpp.org/files/papers/N4860.pdf
 #
 #
-# _ serves as a "fake" start symbol, coming with real grammar symbols.
+# _ serves as a "fake" start symbol, coming with real grammar symbols which we
+# target to parse.
 _ := translation-unit
+_ := statement-seq
+_ := declaration-seq
 
 # gram.key
 typedef-name := IDENTIFIER
Index: clang-tools-extra/pseudo/lib/LRTableBuild.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/LRTableBuild.cpp
+++ clang-tools-extra/pseudo/lib/LRTableBuild.cpp
@@ -40,6 +40,9 @@
 
 class LRTable::Builder {
 public:
+  Builder(std::vector<std::pair<SymbolID, StateID>> StartStates)
+      : StartStates(std::move(StartStates)) {}
+
   bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
   LRTable build(const GrammarTable &GT) && {
     // E.g. given the following parsing table with 3 states and 3 terminals:
@@ -92,24 +95,26 @@
                  tokenSymbol(static_cast<tok::TokenKind>(Terminal)))
         ++SortedIndex;
     }
+    Table.StartStates = std::move(StartStates);
     return Table;
   }
 
 private:
   llvm::DenseSet<Entry> Entries;
+  std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
 LRTable LRTable::buildForTests(const GrammarTable &GT,
                                llvm::ArrayRef<Entry> Entries) {
-  Builder Build;
+  Builder Build({});
   for (const Entry &E : Entries)
     Build.insert(E);
   return std::move(Build).build(GT);
 }
 
 LRTable LRTable::buildSLR(const Grammar &G) {
-  Builder Build;
   auto Graph = LRGraph::buildLR0(G);
+  Builder Build(Graph.startStates());
   for (const auto &T : Graph.edges()) {
     Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
     Build.insert({T.Src, T.Label, Act});
Index: clang-tools-extra/pseudo/lib/LRTable.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/LRTable.cpp
+++ clang-tools-extra/pseudo/lib/LRTable.cpp
@@ -120,5 +120,16 @@
                             /*length=*/End - Start);
 }
 
+LRTable::StateID LRTable::getStartState(SymbolID Target) const {
+  assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!");
+  auto It = llvm::partition_point(
+      StartStates, [Target](const std::pair<SymbolID, StateID> &X) {
+        return X.first < Target;
+      });
+  assert(It != StartStates.end() && It->first == Target &&
+         "target symbol doesn't have a start state!");
+  return It->second;
+}
+
 } // namespace pseudo
 } // namespace clang
Index: clang-tools-extra/pseudo/lib/LRGraph.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/LRGraph.cpp
+++ clang-tools-extra/pseudo/lib/LRGraph.cpp
@@ -160,9 +160,12 @@
 }
 
 LRGraph LRGraph::buildLR0(const Grammar &G) {
+  std::vector<std::pair<SymbolID, StateID>> StartStates;
   class Builder {
   public:
-    Builder(const Grammar &G) : G(G) {}
+    Builder(const Grammar &G,
+            std::vector<std::pair<SymbolID, StateID>> &StartStates)
+        : G(G), StartStates(StartStates) {}
 
     // Adds a given state if not existed.
     std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
@@ -190,7 +193,10 @@
     LRGraph build() && {
       States.shrink_to_fit();
       Edges.shrink_to_fit();
-      return LRGraph(std::move(States), std::move(Edges));
+      llvm::sort(StartStates);
+      StartStates.shrink_to_fit();
+      return LRGraph(std::move(States), std::move(Edges),
+                     std::move(StartStates));
     }
 
   private:
@@ -199,7 +205,8 @@
     std::vector<State> States;
     std::vector<Edge> Edges;
     const Grammar &G;
-  } Builder(G);
+    std::vector<std::pair<SymbolID, StateID>> &StartStates;
+  } Builder(G, StartStates);
 
   std::vector<StateID> PendingStates;
   // Initialize states with the start symbol.
@@ -209,6 +216,11 @@
     auto Result = Builder.insert(std::move(StartState));
     assert(Result.second && "State must be new");
     PendingStates.push_back(Result.first);
+
+    const Rule &StartRule = G.lookupRule(RID);
+    assert(StartRule.Size == 1 &&
+           "Start rule must have exactly one symbol in its body!");
+    StartStates.push_back({StartRule.seq().front(), Result.first});
   }
 
   while (!PendingStates.empty()) {
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -36,8 +36,8 @@
   return OS;
 }
 
-const ForestNode &glrParse(const TokenStream &Tokens,
-                           const ParseParams &Params) {
+const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
+                           SymbolID Target) {
   llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Tokens);
   auto &G = Params.G;
   auto &GSS = Params.GSStack;
@@ -61,9 +61,9 @@
       }
     }
   };
-
   std::vector<const GSS::Node *> NewHeads = {
-      GSS.addNode(/*State=*/0, /*ForestNode*/ nullptr, {})};
+      GSS.addNode(/*State=*/Params.Table.getStartState(Target),
+                  /*ForestNode=*/nullptr, {})};
   for (const ForestNode &Terminal : Terminals) {
     LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n",
                                              G.symbolName(Terminal.symbol()),
@@ -101,11 +101,7 @@
     return *PendingAccept.front().Head->Payload;
   }
   // We failed to parse the input, returning an opaque forest node for recovery.
-  auto RulesForStart = G.rulesFor(G.startSymbol());
-  // FIXME: support multiple start symbols.
-  assert(RulesForStart.size() == 1 && RulesForStart.front().Size == 1 &&
-         "start symbol _ must have exactly one rule");
-  return Params.Forest.createOpaque(RulesForStart.front().Sequence[0], 0);
+  return Params.Forest.createOpaque(Target, /*Token::Index=*/0);
 }
 
 // Apply all pending shift actions.
Index: clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
@@ -132,6 +132,16 @@
   // Returns empty if no available actions in the table.
   llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
 
+  // Returns the state from which the LR parser should start to parse the input
+  // tokens as the given Target symbol.
+  //
+  // In LR parsing, the start state of `translation-unit` corresponds to
+  // `_ := • translation-unit`.
+  //
+  // REQUIRE: The given Target symbol must exist in the grammar (in a form of
+  //          `_ := target`).
+  StateID getStartState(SymbolID Target) const;
+
   size_t bytes() const {
     return sizeof(*this) + Actions.capacity() * sizeof(Action) +
            States.capacity() * sizeof(StateID) +
@@ -171,7 +181,10 @@
   std::vector<StateID> States;
   // A flat list of available actions, sorted by (SymbolID, State).
   std::vector<Action> Actions;
+  // A sorted table, storing the start state for each target parsing symbol.
+  std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
+
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
 
 } // namespace pseudo
Index: clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
@@ -139,15 +139,21 @@
 
   llvm::ArrayRef<State> states() const { return States; }
   llvm::ArrayRef<Edge> edges() const { return Edges; }
+  llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
+    return StartStates;
+  }
 
   std::string dumpForTests(const Grammar &) const;
 
 private:
-  LRGraph(std::vector<State> States, std::vector<Edge> Edges)
-      : States(std::move(States)), Edges(std::move(Edges)) {}
+  LRGraph(std::vector<State> States, std::vector<Edge> Edges,
+          std::vector<std::pair<SymbolID, StateID>> StartStates)
+      : States(std::move(States)), Edges(std::move(Edges)),
+        StartStates(std::move(StartStates)) {}
 
   std::vector<State> States;
   std::vector<Edge> Edges;
+  std::vector<std::pair<SymbolID, StateID>> StartStates;
 };
 
 } // namespace pseudo
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -122,13 +122,12 @@
   ForestArena &Forest;  // Storage for the output forest.
   GSS &GSStack;         // Storage for parsing stacks.
 };
-// Parse the given token stream with the GLR algorithm, and return a forest node
-// of the start symbol.
+// Parses the given token stream as the Target symbol with the GLR algorithm,
+// and returns a forest node of the target symbol.
 //
 // If the parsing fails, we model it as an opaque node in the forest.
-//
-// FIXME: add support for variant start symbols.
-const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params);
+const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params,
+                           SymbolID Target);
 
 // An active stack head can have multiple available actions (reduce/reduce
 // actions, reduce/shift actions).
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to