[PATCH] D121150: [pseudo][WIP] Implement a GLR parser.

Haojian Wu via Phabricator via cfe-commits Tue, 08 Mar 2022 06:32:28 -0800

hokein updated this revision to Diff 413788.
hokein added a comment.

fix a subtle bug where we might leave an unremoved node in the reduce path.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D121150/new/

https://reviews.llvm.org/D121150

Files:
  clang/include/clang/Tooling/Syntax/Pseudo/Forest.h
  clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h
  clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/lib/Tooling/Syntax/Pseudo/Forest.cpp
  clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp
  clang/tools/clang-pseudo/ClangPseudo.cpp

Index: clang/tools/clang-pseudo/ClangPseudo.cpp
===================================================================
--- clang/tools/clang-pseudo/ClangPseudo.cpp
+++ clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -8,6 +8,7 @@
 
 #include "clang/Basic/LangOptions.h"
 #include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
+#include "clang/Tooling/Syntax/Pseudo/GLRParser.h"
 #include "clang/Tooling/Syntax/Pseudo/Grammar.h"
 #include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
 #include "clang/Tooling/Syntax/Pseudo/LRTable.h"
@@ -29,6 +30,8 @@
                             desc("Print the LR graph for the grammar"));
 static opt<bool> PrintTable("print-table",
                             desc("Print the LR table for the grammar"));
+static opt<std::string> ParseFile("parse", desc("Parse a C++ source file"),
+                                  init(""));
 static opt<std::string> Source("source", desc("Source file"));
 static opt<bool> PrintSource("print-source", desc("Print token stream"));
 static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
@@ -69,6 +72,26 @@
     if (PrintTable)
       llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests(
           *G);
+    if (ParseFile.getNumOccurrences()) {
+      std::string Code = readOrDie(ParseFile);
+      const auto &T = clang::syntax::pseudo::LRTable::buildSLR(*G);
+      clang::LangOptions Opts;
+      Opts.CPlusPlus = 1;
+
+      auto RawTokens = clang::syntax::pseudo::lex(Code, Opts);
+      auto Tokens = clang::syntax::pseudo::stripComments(cook(RawTokens, Opts));
+      clang::syntax::pseudo::ForestArena Arena;
+      clang::syntax::pseudo::GLRParser Parser(T, *G, Arena);
+      const auto *Root = Parser.parse(Tokens);
+      if (Root) {
+        llvm::outs() << "parsed successfully!\n";
+        llvm::outs() << "Forest bytes: " << Arena.bytes()
+                     << " nodes: " << Arena.nodeNum() << "\n";
+        llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes()
+                     << " nodes: " << Parser.getGSS().nodeCount() << "\n";
+        // llvm::outs() << Root->DumpRecursive(*G, true);
+      }
+    }
     return 0;
   }
 
Index: clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp
@@ -0,0 +1,334 @@
+//===--- GLRParser.cpp   -----------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/GLRParser.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <memory>
+#include <tuple>
+
+#define DEBUG_TYPE "GLRParser.cpp"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+using StateID = LRTable::StateID;
+
+const ForestNode *GLRParser::parse(const TokenStream &Code) {
+  ParsedForest.init(Code);
+  const Token *Lookahead = &Code.tokens().front();
+  addActions(GSS.addNode(/*StartState*/ 0, nullptr, {}), *Lookahead);
+
+  while (!ShiftList.empty() || !ReduceList.empty()) {
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                   "Lookahead token {0} (id: {1} text: '{2}')\n",
+                   G.symbolName(tokenSymbol(Lookahead->Kind)),
+                   tokenSymbol(Lookahead->Kind), Lookahead->text()));
+
+    performReduction(*Lookahead);
+    auto NewHeads = performShift(Code.index(*Lookahead));
+
+    if (Lookahead->Kind != tok::eof)
+      ++Lookahead;
+    for (const auto &AS : NewHeads)
+      addActions(AS, *Lookahead);
+  }
+
+  if (!AcceptLists.empty()) {
+    // FIXME: supporting multiple accepted symbols. It should be fine now, as we
+    // only have one production for the start symbol `_`. This would become a
+    // problem when we support parsing any code snippet rather than the
+    // translation unit.
+    assert(AcceptLists.size() == 1);
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted results:\n",
+                                             AcceptLists.size()));
+    for (const auto &A : AcceptLists)
+      LLVM_DEBUG(llvm::dbgs()
+                 << "  - " << G.symbolName(A.Head->Parsed->symbol()) << "\n");
+    return AcceptLists.front().Head->Parsed;
+  }
+  return nullptr;
+}
+
+std::vector<const Graph::Node *>
+GLRParser::performShift(Token::Index Lookahead) {
+  assert(ReduceList.empty() &&
+         "Reduce actions must be performed before shift actions");
+  if (ShiftList.empty())
+    return {};
+  LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                 "  Perform Shift ({0} active heads):\n", ShiftList.size()));
+
+  const pseudo::ForestNode *Leaf = &ParsedForest.terminal(Lookahead);
+  // New heads after performing all the shifts.
+  std::vector<const Graph::Node *> NewHeads;
+
+  // Merge the stack -- if multiple stack heads are going to shift a same
+  // state, we perform the shift only once by combining these heads.
+  //
+  // E.g. we have two heads (2, 3) in the GSS, and state 4 is to be shifted from
+  // state 2 and state 3:
+  //   0 -> 1 -> 2
+  //        ` -> 3
+  // After the shift action, the GSS looks like below, state 4 becomes the new
+  // head:
+  //   0 -> 1 -> 2 -> 4
+  //        ` -> 3 ---^
+  //
+  // Shifts are partitioned by the shift state, so each partition (per loop
+  // iteration) corresponds to a "perform" shift.
+  llvm::sort(ShiftList, [](const Frontier &L, const Frontier &R) {
+    assert(L.PerformAction->kind() == LRTable::Action::Shift &&
+           R.PerformAction->kind() == LRTable::Action::Shift);
+    return std::forward_as_tuple(L.PerformAction->getShiftState(), L.Head) <
+           std::forward_as_tuple(R.PerformAction->getShiftState(), R.Head);
+  });
+  auto Partition = llvm::makeArrayRef(ShiftList);
+  while (!Partition.empty()) {
+    StateID NextState = Partition.front().PerformAction->getShiftState();
+    auto Batch = Partition.take_while([&NextState](const Frontier &A) {
+      return A.PerformAction->getShiftState() == NextState;
+    });
+    assert(!Batch.empty());
+    // Predecessors of the new head in GSS.
+    std::vector<const Graph::Node *> Predecessors;
+    llvm::for_each(Batch, [&Predecessors](const Frontier &F) {
+      assert(llvm::find(Predecessors, F.Head) == Predecessors.end() &&
+             "Unexpected duplicated stack heads during shift!");
+      Predecessors.push_back(F.Head);
+    });
+    const auto *Head = GSS.addNode(NextState, Leaf, Predecessors);
+    LLVM_DEBUG(llvm::dbgs()
+               << llvm::formatv("  - state {0} -> state {1}\n",
+                                Partition.front().Head->State, NextState));
+
+    NewHeads.push_back(Head);
+    // Next iteration for next partition.
+    Partition = Partition.drop_front(Batch.size());
+  }
+  ShiftList.clear();
+  return NewHeads;
+}
+
+static std::vector<std::string>
+getStateString(llvm::ArrayRef<const Graph::Node *> A) {
+  std::vector<std::string> States;
+  for (const auto &N : A)
+    States.push_back(llvm::formatv("state {0}", N->State));
+  return States;
+}
+
+// Enumerate all reduce paths on the stack by traversing from the given Head in
+// the GSS.
+static void enumerateReducePath(const Graph::Node *Head, unsigned PathLength,
+                                std::vector<const Graph::Node *> &PathStorage,
+                                std::function<void()> CB) {
+  assert(PathStorage.empty() && "PathStorage must be empty!");
+  std::function<void(const Graph::Node *, unsigned)> enumPath =
+      [&CB, &PathStorage, &enumPath](const Graph::Node *Current,
+                                     unsigned RemainingLength) -> void {
+    assert(RemainingLength > 0);
+
+    --RemainingLength;
+    PathStorage.push_back(Current);
+    if (RemainingLength == 0) {
+      CB();
+    } else {
+      for (const auto *Next : Current->predecessors())
+        enumPath(Next, RemainingLength);
+    }
+    PathStorage.pop_back();
+  };
+  enumPath(Head, PathLength);
+}
+
+// Perform reduction recursively until we don't have reduce actions with
+// heads.
+void GLRParser::performReduction(const Token &Lookahead) {
+  if (!ReduceList.empty())
+    LLVM_DEBUG(llvm::dbgs() << "  Performing **Reduce**\n");
+
+  // Reduce can manipulate the GSS in following way:
+  //
+  //  1) Split --
+  //     1.1 when a stack head has mutiple reduce actions, the head is
+  //     made to split to accommodate the various possiblities.
+  //     E.g.
+  //       0 -> 1 (ID)
+  //     After performing reduce of production rules (class-name := ID,
+  //     enum-name := ID), the GSS now has two new heads:
+  //       0 -> 2 (class-name)
+  //        `-> 3 (enum-name)
+  //
+  //     1.2 when a stack head has a reduce action with multiple reduce
+  //     paths, the head is to split.
+  //     E.g.
+  //       ... -> 1(...) -> 3 (INT)
+  //                        ^
+  //       ... -> 2(...) ---|
+  //
+  //     After the reduce action (simple-type-specifier := INT), the GSS looks
+  //     like:
+  //       ... -> 1(...) -> 4 (simple-type-specifier)
+  //       ... -> 2(...) -> 5 (simple-type-specifier)
+  //
+  //  2) Merge -- if multiple heads turn out to be identical after
+  //     reduction (new heads have the same state, and point to the same
+  //     predecessors), these heads are merged and treated as a single head.
+  //     This is usually where ambiguity happens.
+  //
+  //     E.g.
+  //         0 -> 2 (class-name)
+  //         ` -> 3 (enum-name)
+  //     After reduction of rules (type-name := class-name | enum-name), the GSS
+  //     has the following form:
+  //         0 -> 4 (type-name)
+  //     The type-name forest node in the new head 4 is ambiguous, which has two
+  //     parses (type-name -> class-name -> id, type-name -> enum-name -> id).
+
+  // Store all newly-created stack heads for tracking ambiguities.
+  std::vector<const Graph::Node *> CreatedHeads;
+  while (!ReduceList.empty()) {
+    auto RA = std::move(ReduceList.back());
+    ReduceList.pop_back();
+
+    RuleID ReduceRuleID = RA.PerformAction->getReduceRule();
+    const Rule &ReduceRule = G.lookupRule(ReduceRuleID);
+    LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                   "    !reduce rule {0}: {1} head: {2}\n", ReduceRuleID,
+                   G.dumpRule(ReduceRuleID), RA.Head->State));
+
+    std::vector<const Graph::Node *> ReducePath;
+    enumerateReducePath(RA.Head, ReduceRule.Size, ReducePath, [&]() {
+      LLVM_DEBUG(
+          llvm::dbgs() << llvm::formatv(
+              "     stack path: {0}, bases: {1}\n",
+              llvm::join(getStateString(ReducePath), " -> "),
+              llvm::join(getStateString(ReducePath.back()->predecessors()),
+                         ", ")));
+      assert(ReducePath.size() == ReduceRule.Size &&
+             "Reduce path's length must equal to the reduce rule size");
+      // A reduce is a back-and-forth operation in the stack.
+      // For example, we reduce a rule "declaration := decl-specifier-seq ;" on
+      // the linear stack:
+      //
+      //   0 -> 1(decl-specifier-seq) -> 3(;)
+      //   ^ Base                        ^ Head
+      //        <--- ReducePath: [3,1]  ---->
+      //
+      //   1. back -- pop |ReduceRuleLength| nodes (ReducePath) in the stack;
+      //   2. forth -- push a new node in the stack and mark it as a head;
+      //     0 -> 4(declaration)
+      //          ^ Head
+      //
+      // It becomes tricky if a reduce path has multiple bases, we want to merge
+      // them if their next state is the same. Similiar to above performShift,
+      // we partition the bases by their next state, and process each partition
+      // per loop iteration.
+      struct BaseInfo {
+        // An intermediate head after the stack has poped |ReducePath| nodes.
+        const Graph::Node *Base = nullptr;
+        // The final state after reduce.
+        // It is getGoToState(Base->State, ReduceSymbol).
+        StateID NextState;
+      };
+      std::vector<BaseInfo> Bases;
+      for (const Graph::Node *Base : ReducePath.back()->predecessors())
+        Bases.push_back(
+            {Base, ParsingTable.getGoToState(Base->State, ReduceRule.Target)});
+      llvm::sort(Bases, [](const BaseInfo &L, const BaseInfo &R) {
+        return std::forward_as_tuple(L.NextState, L.Base) <
+               std::forward_as_tuple(R.NextState, R.Base);
+      });
+
+      llvm::ArrayRef<BaseInfo> Partition = llvm::makeArrayRef(Bases);
+      while (!Partition.empty()) {
+        StateID NextState = Partition.front().NextState;
+        // Predecessors of the new stack head.
+        std::vector<const Graph::Node *> Predecessors;
+        auto Batch = Partition.take_while([&](const BaseInfo &TB) {
+          if (NextState != TB.NextState)
+            return false;
+          Predecessors.push_back(TB.Base);
+          return true;
+        });
+        assert(!Batch.empty());
+        Partition = Partition.drop_front(Batch.size());
+
+        // Check ambiguities.
+        auto It = llvm::find_if(CreatedHeads, [&](const Graph::Node *Head) {
+          return Head->Parsed->symbol() == ReduceRule.Target &&
+                 Head->predecessors() == llvm::makeArrayRef(Predecessors);
+        });
+        if (It != CreatedHeads.end()) {
+          // This should be guaranteed by checking the equalivent of
+          // predecessors and reduce nonterminal symbol!
+          assert(NextState == (*It)->State);
+          LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                         "    found ambiguity, merged in state {0} (forest "
+                         "'{1}')\n",
+                         (*It)->State, G.symbolName((*It)->Parsed->symbol())));
+          // FIXME: create ambiguous foreset node!
+          continue;
+        }
+
+        // Create a corresponding sequence forest node for the reduce rule.
+        std::vector<const ForestNode *> ForestChildren;
+        for (const Graph::Node *PN : llvm::reverse(ReducePath))
+          ForestChildren.push_back(PN->Parsed);
+        const ForestNode &ForestNode = ParsedForest.createSequence(
+            ReduceRule.Target, RA.PerformAction->getReduceRule(),
+            ForestChildren.front()->startLoc(), ForestChildren);
+        LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+                       "     after reduce: {0} -> state {1} ({2})\n",
+                       llvm::join(getStateString(Predecessors), ", "),
+                       NextState, G.symbolName(ReduceRule.Target)));
+
+        // Create a new stack head.
+        const Graph::Node *Head =
+            GSS.addNode(NextState, &ForestNode, Predecessors);
+        CreatedHeads.push_back(Head);
+
+        // Actions that are enabled by this reduce.
+        addActions(Head, Lookahead);
+      }
+    });
+  }
+}
+
+void GLRParser::addActions(const Graph::Node *Head, const Token &Lookahead) {
+  for (const auto &Action :
+       ParsingTable.getActions(Head->State, tokenSymbol(Lookahead.Kind))) {
+    switch (Action.kind()) {
+    case LRTable::Action::Shift:
+      ShiftList.push_back({Head, &Action});
+      break;
+    case LRTable::Action::Reduce:
+      ReduceList.push_back({Head, &Action});
+      break;
+    case LRTable::Action::Accept:
+      AcceptLists.push_back({Head, &Action});
+      break;
+    default:
+      llvm_unreachable("unexpected action kind!");
+    }
+  }
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/Forest.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Forest.cpp
@@ -0,0 +1,125 @@
+//===--- Forest.cpp - Parse forest  ------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Forest.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+std::string ForestNode::Dump(const Grammar &G) const {
+  switch (kind()) {
+  case Ambiguous:
+    return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
+  case Terminal:
+    return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()), startLoc());
+  case Sequence:
+    return G.dumpRule(rule());
+  }
+  llvm_unreachable("unhandle node kind!");
+}
+
+std::string ForestNode::DumpRecursive(const Grammar &G,
+                                      bool Abbreviated) const {
+  // Count visits of nodes so we can mark those seen multiple times.
+  llvm::DenseMap<const ForestNode *, unsigned> Visits;
+  std::function<void(const ForestNode *)> CountVisits =
+      [&](const ForestNode *P) {
+        if (Visits[P]++ > 0)
+          return; // Don't count children as multiply visited.
+        if (P->kind() == Ambiguous)
+          llvm::for_each(P->alternatives(), CountVisits);
+        else if (P->kind() == Sequence)
+          llvm::for_each(P->elements(), CountVisits);
+      };
+  CountVisits(this);
+
+  llvm::DenseMap<const ForestNode *, unsigned> Ids;
+  std::string Result;
+  constexpr unsigned kEnd = std::numeric_limits<unsigned>::max();
+  std::function<void(const ForestNode *, unsigned, unsigned,
+                     llvm::Optional<SymbolID>)>
+      Dump = [&](const ForestNode *P, unsigned Level, unsigned End,
+                 llvm::Optional<SymbolID> ElidedParent) {
+        // absl::Span<const Node* const> children;
+        llvm::ArrayRef<const ForestNode *> children;
+        auto end_of_element = [&](unsigned child_index) {
+          return child_index + 1 == children.size()
+                     ? End
+                     : children[child_index + 1]->startLoc();
+        };
+        if (P->kind() == Ambiguous) {
+          children = P->alternatives();
+        } else if (P->kind() == Sequence) {
+          children = P->elements();
+          if (Abbreviated) {
+            if (P->startLoc() == End)
+              return;
+            for (unsigned i = 0; i < children.size(); ++i)
+              if (children[i]->startLoc() == P->startLoc() &&
+                  end_of_element(i) == End) {
+                return Dump(
+                    children[i], Level, End,
+                    /*elided_parent=*/ElidedParent.getValueOr(P->symbol()));
+              }
+          }
+        }
+
+        // FIXME: pretty ascii trees
+        if (End == kEnd)
+          Result += llvm::formatv("[{0},end) ", P->startLoc());
+        else
+          Result += llvm::formatv("[{0},{1}) ", P->startLoc(), End);
+        Result.append(2 * Level, ' ');
+        if (ElidedParent.hasValue()) {
+          Result += G.symbolName(*ElidedParent);
+          Result += "~";
+        }
+        Result.append(P->Dump(G));
+        if (Visits.find(P)->getSecond() > 1 &&
+            P->kind() != ForestNode::Terminal) {
+          // The first time, print as #1. Later, =#1.
+          auto id = Ids.try_emplace(P, Ids.size() + 1);
+          Result +=
+              llvm::formatv(" {0}#{1}", id.second ? "" : "=", id.first->second);
+        }
+        Result.push_back('\n');
+
+        ++Level;
+        for (unsigned i = 0; i < children.size(); ++i)
+          Dump(children[i], Level,
+               P->kind() == Sequence ? end_of_element(i) : End, llvm::None);
+      };
+  Dump(this, 0, kEnd, llvm::None);
+  return Result;
+}
+
+void ForestArena::init(const TokenStream &Tokens) {
+  Arena.Reset(); // clean the arena.
+  NodeNum = 0;
+  // List of leaves is prepopulated, it's convenient and we need them anyway.
+  Terminals = Arena.Allocate<ForestNode>(Tokens.tokens().size());
+  size_t Index = 0;
+  for (const auto &T : Tokens.tokens()) {
+    new (&Terminals[Index])
+        ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
+                   /*begin=*/Index, /*TerminalData*/ 0);
+    ++Index;
+  }
+  NodeNum = Tokens.tokens().size();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -8,6 +8,8 @@
   LRGraph.cpp
   LRTable.cpp
   LRTableBuild.cpp
+  Forest.cpp
+  GLRParser.cpp
   Token.cpp
 
   LINK_LIBS
Index: clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h
@@ -0,0 +1,148 @@
+//===--- GLRParser.h - Implement a standard GLR parser -----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a standard Generalized LR (GLR) parsing algorithm.
+//
+// The GLR parser behaves as a normal LR parser until it encounters a conflict.
+// To handle a conflict (where there are multiple actions could perform), the
+// parser will simulate nondeterminism by doing a breadth-first search
+// over all the possibilities.
+//
+// Basic mechanisims of the GLR parser:
+//  - A number of processes are operated in parallel.
+//  - Each process has its own parsing stack and behaves as a standard
+//    determinism LR parser.
+//  - When a process encounters a conflict, it will be fork (one for each
+//    avaiable action).
+//  - When a process encounters an error, it is abandoned.
+//  - All process are synchronized by the lookahead token: they perfrom shift
+//    action at the same time, which means some processes need wait until other
+//    processes have performed all reduce actions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Forest.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// An implementation of a directed acyclic graph (DAG), used as a
+// graph-structured stack (GSS) in the GLR parser.
+//
+// GSS is an efficient data structure to represent multiple active stacks, it
+// employs a stack-combination optimization to avoid potentially exponential
+// growth of the stack:
+//  - combing equal stack prefixes -- A new stack doesn't need to have a full
+//    copy of its parentâs stack. They share a common prefix.
+//  - combing euqal stack suffices -- as there are a finite number of DFA's
+//    state the parser can be in. A set of heads can be in the same state
+//    though they may have different parses, these heads can be merged,
+//    resulting a single head.
+//
+// E.g. we have two active stacks:
+//   0 -> 1 -> 2
+//        |    ^ head1, representing a stack [2, 1, 0]
+//        ` -> 3
+//             ^ head2, representing a stack [3, 1, 0]
+struct Graph {
+  // Represents a node in the graph.
+  struct Node {
+    // The parsing state presented by the graph node.
+    LRTable::StateID State : LRTable::StateBits;
+    static constexpr unsigned PredecessorBits = 3;
+    // Number of the predecessors of the node.
+    // u is the predecessor of v, if u -> v.
+    unsigned PredecessorCount : PredecessorBits;
+    // The forest node for a termina/nonterminal symbol.
+    // The symbol correponds to the label of edges which leads to current node
+    // from the predecessor nodes.
+    const ForestNode *Parsed = nullptr;
+
+    llvm::ArrayRef<const Node *> predecessors() const {
+      return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1),
+                                PredecessorCount);
+    };
+
+    bool operator==(const Node &L) const {
+      return State == L.State && predecessors() == L.predecessors();
+    }
+    // A trailing array of Node*.
+  };
+
+  // Creates a new node in the graph.
+  const Node *addNode(LRTable::StateID State,
+                      const ::clang::syntax::pseudo::ForestNode *Symbol,
+                      llvm::ArrayRef<const Node *> Predecessors) {
+    assert(Predecessors.size() < (1 << Node::PredecessorBits) &&
+           "Too many predecessors to fit in PredecessorBits!");
+    ++NodeCount;
+    Node *Result = new (Arena.Allocate(
+        sizeof(Node) + Predecessors.size() * sizeof(Node *), alignof(Node)))
+        Node({State, static_cast<unsigned>(Predecessors.size())});
+    Result->Parsed = Symbol;
+    if (!Predecessors.empty())
+      llvm::copy(Predecessors, reinterpret_cast<const Node **>(Result + 1));
+    return Result;
+  }
+
+  size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
+  size_t nodeCount() const { return NodeCount; }
+
+private:
+  llvm::BumpPtrAllocator Arena;
+  unsigned NodeCount = 0;
+};
+
+class GLRParser {
+public:
+  GLRParser(const LRTable &T, const Grammar &G, ForestArena &Arena)
+      : ParsingTable(T), G(G), ParsedForest(Arena) {}
+
+  const ForestNode *parse(const TokenStream &Code);
+
+  const Graph &getGSS() const { return GSS; }
+
+private:
+  // Return a list of active stack heads.
+  std::vector<const Graph::Node *> performShift(Token::Index Lookahead);
+  void performReduction(const Token &Lookahead);
+
+  void addActions(const Graph::Node *Head, const Token &Lookahead);
+
+  const LRTable &ParsingTable;
+  const Grammar &G;
+
+  // An active stack head can have multiple avaialble actions (reduce/reduce
+  // actions, reduce/shift actions)
+  // Frontier is to track all avaiable actions from all active stack heads.
+  struct Frontier {
+    // A corresponding stack head.
+    const Graph::Node *Head = nullptr;
+    // An action associated with the Head.
+    const LRTable::Action *PerformAction = nullptr;
+  };
+  // A list of active shift actions.
+  std::vector<Frontier> ShiftList;
+  // A list of active reduce actions.
+  std::vector<Frontier> ReduceList;
+  // A list of active accept action.
+  std::vector<Frontier> AcceptLists;
+
+  Graph GSS;
+  ForestArena &ParsedForest;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/include/clang/Tooling/Syntax/Pseudo/Forest.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/Forest.h
@@ -0,0 +1,135 @@
+//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Parse forest is the output of the GLR parser.
+//
+// For an ambiguous grammar, there might be multiple parse trees generated from
+// for the given input. Forest is a DAG which represent numberous possible in a
+// space-efficient manner. Common subtrees are shared -- if two or more trees
+// treat the token range [1, 3) as an Expression, then there is a single shared
+// Expression node representing the subparse in the forest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstdint>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// A node in a forest.
+class ForestNode {
+public:
+  enum Kind : uint8_t {
+    // A Terminal node is a single terminal symbol bound to a token.
+    Terminal,
+    // A Sequence node is a nonterminal symbol parsed with a grammar rule.
+    // elements() are the parses of each symbol on the RHS of the rule.
+    Sequence,
+    // An Ambiguous node exposes multiple ways to match the code to the symbol.
+    // alternatives() are the possible parses, we should choose one.
+    Ambiguous,
+  };
+  Kind kind() const { return K; }
+
+  SymbolID symbol() const { return Symbol; }
+
+  // The parses for each element in the RHS of the rule.
+  // REQUIRES: this is a Sequence node.
+  RuleID rule() const {
+    assert(kind() == Sequence);
+    return Data_ & ((1 << RuleBits) - 1);
+  }
+  // REQUIRES: this is a Sequence node;
+  llvm::ArrayRef<const ForestNode *> elements() const {
+    assert(kind() == Sequence);
+    return children(Data_ >> RuleBits);
+  };
+  uint32_t startLoc() const { return StartLoc; }
+
+  // The possible interpretations of the code.
+  // REQUIRES: this is an Ambiguous node.
+  llvm::ArrayRef<const ForestNode *> alternatives() const {
+    assert(kind() == Ambiguous);
+    return children(Data_);
+  }
+
+  std::string Dump(const Grammar &) const;
+  std::string DumpRecursive(const Grammar &, bool abbreviated = false) const;
+
+private:
+  friend class ForestArena;
+  ForestNode(Kind K, SymbolID Symbol, Token::Index StartLoc, uint16_t Data)
+      : StartLoc(StartLoc), K(K), Symbol(Symbol), Data_(Data) {}
+
+  llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
+    return llvm::makeArrayRef(
+        reinterpret_cast<const ForestNode *const *>(this + 1), Num);
+  }
+  static uint16_t SequenceData(RuleID rule,
+                               llvm::ArrayRef<const ForestNode *> elements) {
+    assert(rule < (1 << RuleBits));
+    assert(elements.size() < (1 << (16 - RuleBits)));
+    return rule | elements.size() << RuleBits;
+  }
+  Token::Index StartLoc;
+  Kind K : 4;
+  SymbolID Symbol : SymbolBits;
+  // Sequences - child count : 4 | RuleID : 12
+  // Ambiguous - child count : 16
+  // Terminal - unused
+  uint16_t Data_;
+  // A trailing array of Node* .
+};
+
+// Node may not be destroyed (for BumpPtrAllocator).
+static_assert(std::is_trivially_destructible<ForestNode>(), "");
+
+// A memory arena for the parse forest.
+class ForestArena {
+public:
+  size_t nodeNum() const { return NodeNum; }
+  size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }
+
+  ForestNode &createSequence(SymbolID SID, RuleID RID, Token::Index Start,
+                             llvm::ArrayRef<const ForestNode *> Elements) {
+    return create(ForestNode::Sequence, SID, Start,
+                  ForestNode::SequenceData(RID, Elements), Elements);
+  }
+
+  void init(const TokenStream &Code);
+  const ForestNode &terminal(Token::Index Index) const {
+    assert(Terminals && "Terminals are not intialized!");
+    return Terminals[Index];
+  }
+
+private:
+  ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
+                     uint16_t Data,
+                     llvm::ArrayRef<const ForestNode *> Elements) {
+    ++NodeNum;
+    ForestNode *New = new (Arena.Allocate(
+        sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
+        alignof(ForestNode))) ForestNode(K, SID, Start, Data);
+    if (!Elements.empty())
+      llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
+    return *New;
+  }
+
+  llvm::BumpPtrAllocator Arena;
+  ForestNode *Terminals = nullptr;
+  uint32_t NodeNum = 0;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D121150: [pseudo][WIP] Implement a GLR parser.

Reply via email to