hokein updated this revision to Diff 413788.
hokein added a comment.
fix a subtle bug where we might leave an unremoved node in the reduce path.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D121150/new/
https://reviews.llvm.org/D121150
Files:
clang/include/clang/Tooling/Syntax/Pseudo/Forest.h
clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h
clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
clang/lib/Tooling/Syntax/Pseudo/Forest.cpp
clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp
clang/tools/clang-pseudo/ClangPseudo.cpp
Index: clang/tools/clang-pseudo/ClangPseudo.cpp
===================================================================
--- clang/tools/clang-pseudo/ClangPseudo.cpp
+++ clang/tools/clang-pseudo/ClangPseudo.cpp
@@ -8,6 +8,7 @@
#include "clang/Basic/LangOptions.h"
#include "clang/Tooling/Syntax/Pseudo/DirectiveMap.h"
+#include "clang/Tooling/Syntax/Pseudo/GLRParser.h"
#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
#include "clang/Tooling/Syntax/Pseudo/LRGraph.h"
#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
@@ -29,6 +30,8 @@
desc("Print the LR graph for the grammar"));
static opt<bool> PrintTable("print-table",
desc("Print the LR table for the grammar"));
+static opt<std::string> ParseFile("parse", desc("Parse a C++ source file"),
+ init(""));
static opt<std::string> Source("source", desc("Source file"));
static opt<bool> PrintSource("print-source", desc("Print token stream"));
static opt<bool> PrintTokens("print-tokens", desc("Print detailed token info"));
@@ -69,6 +72,26 @@
if (PrintTable)
llvm::outs() << clang::syntax::pseudo::LRTable::buildSLR(*G).dumpForTests(
*G);
+ if (ParseFile.getNumOccurrences()) {
+ std::string Code = readOrDie(ParseFile);
+ const auto &T = clang::syntax::pseudo::LRTable::buildSLR(*G);
+ clang::LangOptions Opts;
+ Opts.CPlusPlus = 1;
+
+ auto RawTokens = clang::syntax::pseudo::lex(Code, Opts);
+ auto Tokens = clang::syntax::pseudo::stripComments(cook(RawTokens, Opts));
+ clang::syntax::pseudo::ForestArena Arena;
+ clang::syntax::pseudo::GLRParser Parser(T, *G, Arena);
+ const auto *Root = Parser.parse(Tokens);
+ if (Root) {
+ llvm::outs() << "parsed successfully!\n";
+ llvm::outs() << "Forest bytes: " << Arena.bytes()
+ << " nodes: " << Arena.nodeNum() << "\n";
+ llvm::outs() << "GSS bytes: " << Parser.getGSS().bytes()
+ << " nodes: " << Parser.getGSS().nodeCount() << "\n";
+ // llvm::outs() << Root->DumpRecursive(*G, true);
+ }
+ }
return 0;
}
Index: clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/GLRParser.cpp
@@ -0,0 +1,334 @@
+//===--- GLRParser.cpp -----------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/GLRParser.h"
+#include "clang/Basic/TokenKinds.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <memory>
+#include <tuple>
+
+#define DEBUG_TYPE "GLRParser.cpp"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+using StateID = LRTable::StateID;
+
+const ForestNode *GLRParser::parse(const TokenStream &Code) {
+ ParsedForest.init(Code);
+ const Token *Lookahead = &Code.tokens().front();
+ addActions(GSS.addNode(/*StartState*/ 0, nullptr, {}), *Lookahead);
+
+ while (!ShiftList.empty() || !ReduceList.empty()) {
+ LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+ "Lookahead token {0} (id: {1} text: '{2}')\n",
+ G.symbolName(tokenSymbol(Lookahead->Kind)),
+ tokenSymbol(Lookahead->Kind), Lookahead->text()));
+
+ performReduction(*Lookahead);
+ auto NewHeads = performShift(Code.index(*Lookahead));
+
+ if (Lookahead->Kind != tok::eof)
+ ++Lookahead;
+ for (const auto &AS : NewHeads)
+ addActions(AS, *Lookahead);
+ }
+
+ if (!AcceptLists.empty()) {
+ // FIXME: supporting multiple accepted symbols. It should be fine now, as we
+ // only have one production for the start symbol `_`. This would become a
+ // problem when we support parsing any code snippet rather than the
+ // translation unit.
+ assert(AcceptLists.size() == 1);
+ LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Accept: {0} accepted results:\n",
+ AcceptLists.size()));
+ for (const auto &A : AcceptLists)
+ LLVM_DEBUG(llvm::dbgs()
+ << " - " << G.symbolName(A.Head->Parsed->symbol()) << "\n");
+ return AcceptLists.front().Head->Parsed;
+ }
+ return nullptr;
+}
+
+std::vector<const Graph::Node *>
+GLRParser::performShift(Token::Index Lookahead) {
+ assert(ReduceList.empty() &&
+ "Reduce actions must be performed before shift actions");
+ if (ShiftList.empty())
+ return {};
+ LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+ " Perform Shift ({0} active heads):\n", ShiftList.size()));
+
+ const pseudo::ForestNode *Leaf = &ParsedForest.terminal(Lookahead);
+ // New heads after performing all the shifts.
+ std::vector<const Graph::Node *> NewHeads;
+
+ // Merge the stack -- if multiple stack heads are going to shift a same
+ // state, we perform the shift only once by combining these heads.
+ //
+ // E.g. we have two heads (2, 3) in the GSS, and state 4 is to be shifted from
+ // state 2 and state 3:
+ // 0 -> 1 -> 2
+ // ` -> 3
+ // After the shift action, the GSS looks like below, state 4 becomes the new
+ // head:
+ // 0 -> 1 -> 2 -> 4
+ // ` -> 3 ---^
+ //
+ // Shifts are partitioned by the shift state, so each partition (per loop
+ // iteration) corresponds to a "perform" shift.
+ llvm::sort(ShiftList, [](const Frontier &L, const Frontier &R) {
+ assert(L.PerformAction->kind() == LRTable::Action::Shift &&
+ R.PerformAction->kind() == LRTable::Action::Shift);
+ return std::forward_as_tuple(L.PerformAction->getShiftState(), L.Head) <
+ std::forward_as_tuple(R.PerformAction->getShiftState(), R.Head);
+ });
+ auto Partition = llvm::makeArrayRef(ShiftList);
+ while (!Partition.empty()) {
+ StateID NextState = Partition.front().PerformAction->getShiftState();
+ auto Batch = Partition.take_while([&NextState](const Frontier &A) {
+ return A.PerformAction->getShiftState() == NextState;
+ });
+ assert(!Batch.empty());
+ // Predecessors of the new head in GSS.
+ std::vector<const Graph::Node *> Predecessors;
+ llvm::for_each(Batch, [&Predecessors](const Frontier &F) {
+ assert(llvm::find(Predecessors, F.Head) == Predecessors.end() &&
+ "Unexpected duplicated stack heads during shift!");
+ Predecessors.push_back(F.Head);
+ });
+ const auto *Head = GSS.addNode(NextState, Leaf, Predecessors);
+ LLVM_DEBUG(llvm::dbgs()
+ << llvm::formatv(" - state {0} -> state {1}\n",
+ Partition.front().Head->State, NextState));
+
+ NewHeads.push_back(Head);
+ // Next iteration for next partition.
+ Partition = Partition.drop_front(Batch.size());
+ }
+ ShiftList.clear();
+ return NewHeads;
+}
+
+static std::vector<std::string>
+getStateString(llvm::ArrayRef<const Graph::Node *> A) {
+ std::vector<std::string> States;
+ for (const auto &N : A)
+ States.push_back(llvm::formatv("state {0}", N->State));
+ return States;
+}
+
+// Enumerate all reduce paths on the stack by traversing from the given Head in
+// the GSS.
+static void enumerateReducePath(const Graph::Node *Head, unsigned PathLength,
+ std::vector<const Graph::Node *> &PathStorage,
+ std::function<void()> CB) {
+ assert(PathStorage.empty() && "PathStorage must be empty!");
+ std::function<void(const Graph::Node *, unsigned)> enumPath =
+ [&CB, &PathStorage, &enumPath](const Graph::Node *Current,
+ unsigned RemainingLength) -> void {
+ assert(RemainingLength > 0);
+
+ --RemainingLength;
+ PathStorage.push_back(Current);
+ if (RemainingLength == 0) {
+ CB();
+ } else {
+ for (const auto *Next : Current->predecessors())
+ enumPath(Next, RemainingLength);
+ }
+ PathStorage.pop_back();
+ };
+ enumPath(Head, PathLength);
+}
+
+// Perform reduction recursively until we don't have reduce actions with
+// heads.
+void GLRParser::performReduction(const Token &Lookahead) {
+ if (!ReduceList.empty())
+ LLVM_DEBUG(llvm::dbgs() << " Performing **Reduce**\n");
+
+ // Reduce can manipulate the GSS in following way:
+ //
+ // 1) Split --
+ // 1.1 when a stack head has mutiple reduce actions, the head is
+ // made to split to accommodate the various possiblities.
+ // E.g.
+ // 0 -> 1 (ID)
+ // After performing reduce of production rules (class-name := ID,
+ // enum-name := ID), the GSS now has two new heads:
+ // 0 -> 2 (class-name)
+ // `-> 3 (enum-name)
+ //
+ // 1.2 when a stack head has a reduce action with multiple reduce
+ // paths, the head is to split.
+ // E.g.
+ // ... -> 1(...) -> 3 (INT)
+ // ^
+ // ... -> 2(...) ---|
+ //
+ // After the reduce action (simple-type-specifier := INT), the GSS looks
+ // like:
+ // ... -> 1(...) -> 4 (simple-type-specifier)
+ // ... -> 2(...) -> 5 (simple-type-specifier)
+ //
+ // 2) Merge -- if multiple heads turn out to be identical after
+ // reduction (new heads have the same state, and point to the same
+ // predecessors), these heads are merged and treated as a single head.
+ // This is usually where ambiguity happens.
+ //
+ // E.g.
+ // 0 -> 2 (class-name)
+ // ` -> 3 (enum-name)
+ // After reduction of rules (type-name := class-name | enum-name), the GSS
+ // has the following form:
+ // 0 -> 4 (type-name)
+ // The type-name forest node in the new head 4 is ambiguous, which has two
+ // parses (type-name -> class-name -> id, type-name -> enum-name -> id).
+
+ // Store all newly-created stack heads for tracking ambiguities.
+ std::vector<const Graph::Node *> CreatedHeads;
+ while (!ReduceList.empty()) {
+ auto RA = std::move(ReduceList.back());
+ ReduceList.pop_back();
+
+ RuleID ReduceRuleID = RA.PerformAction->getReduceRule();
+ const Rule &ReduceRule = G.lookupRule(ReduceRuleID);
+ LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+ " !reduce rule {0}: {1} head: {2}\n", ReduceRuleID,
+ G.dumpRule(ReduceRuleID), RA.Head->State));
+
+ std::vector<const Graph::Node *> ReducePath;
+ enumerateReducePath(RA.Head, ReduceRule.Size, ReducePath, [&]() {
+ LLVM_DEBUG(
+ llvm::dbgs() << llvm::formatv(
+ " stack path: {0}, bases: {1}\n",
+ llvm::join(getStateString(ReducePath), " -> "),
+ llvm::join(getStateString(ReducePath.back()->predecessors()),
+ ", ")));
+ assert(ReducePath.size() == ReduceRule.Size &&
+ "Reduce path's length must equal to the reduce rule size");
+ // A reduce is a back-and-forth operation in the stack.
+ // For example, we reduce a rule "declaration := decl-specifier-seq ;" on
+ // the linear stack:
+ //
+ // 0 -> 1(decl-specifier-seq) -> 3(;)
+ // ^ Base ^ Head
+ // <--- ReducePath: [3,1] ---->
+ //
+ // 1. back -- pop |ReduceRuleLength| nodes (ReducePath) in the stack;
+ // 2. forth -- push a new node in the stack and mark it as a head;
+ // 0 -> 4(declaration)
+ // ^ Head
+ //
+ // It becomes tricky if a reduce path has multiple bases, we want to merge
+ // them if their next state is the same. Similiar to above performShift,
+ // we partition the bases by their next state, and process each partition
+ // per loop iteration.
+ struct BaseInfo {
+ // An intermediate head after the stack has poped |ReducePath| nodes.
+ const Graph::Node *Base = nullptr;
+ // The final state after reduce.
+ // It is getGoToState(Base->State, ReduceSymbol).
+ StateID NextState;
+ };
+ std::vector<BaseInfo> Bases;
+ for (const Graph::Node *Base : ReducePath.back()->predecessors())
+ Bases.push_back(
+ {Base, ParsingTable.getGoToState(Base->State, ReduceRule.Target)});
+ llvm::sort(Bases, [](const BaseInfo &L, const BaseInfo &R) {
+ return std::forward_as_tuple(L.NextState, L.Base) <
+ std::forward_as_tuple(R.NextState, R.Base);
+ });
+
+ llvm::ArrayRef<BaseInfo> Partition = llvm::makeArrayRef(Bases);
+ while (!Partition.empty()) {
+ StateID NextState = Partition.front().NextState;
+ // Predecessors of the new stack head.
+ std::vector<const Graph::Node *> Predecessors;
+ auto Batch = Partition.take_while([&](const BaseInfo &TB) {
+ if (NextState != TB.NextState)
+ return false;
+ Predecessors.push_back(TB.Base);
+ return true;
+ });
+ assert(!Batch.empty());
+ Partition = Partition.drop_front(Batch.size());
+
+ // Check ambiguities.
+ auto It = llvm::find_if(CreatedHeads, [&](const Graph::Node *Head) {
+ return Head->Parsed->symbol() == ReduceRule.Target &&
+ Head->predecessors() == llvm::makeArrayRef(Predecessors);
+ });
+ if (It != CreatedHeads.end()) {
+ // This should be guaranteed by checking the equalivent of
+ // predecessors and reduce nonterminal symbol!
+ assert(NextState == (*It)->State);
+ LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+ " found ambiguity, merged in state {0} (forest "
+ "'{1}')\n",
+ (*It)->State, G.symbolName((*It)->Parsed->symbol())));
+ // FIXME: create ambiguous foreset node!
+ continue;
+ }
+
+ // Create a corresponding sequence forest node for the reduce rule.
+ std::vector<const ForestNode *> ForestChildren;
+ for (const Graph::Node *PN : llvm::reverse(ReducePath))
+ ForestChildren.push_back(PN->Parsed);
+ const ForestNode &ForestNode = ParsedForest.createSequence(
+ ReduceRule.Target, RA.PerformAction->getReduceRule(),
+ ForestChildren.front()->startLoc(), ForestChildren);
+ LLVM_DEBUG(llvm::dbgs() << llvm::formatv(
+ " after reduce: {0} -> state {1} ({2})\n",
+ llvm::join(getStateString(Predecessors), ", "),
+ NextState, G.symbolName(ReduceRule.Target)));
+
+ // Create a new stack head.
+ const Graph::Node *Head =
+ GSS.addNode(NextState, &ForestNode, Predecessors);
+ CreatedHeads.push_back(Head);
+
+ // Actions that are enabled by this reduce.
+ addActions(Head, Lookahead);
+ }
+ });
+ }
+}
+
+void GLRParser::addActions(const Graph::Node *Head, const Token &Lookahead) {
+ for (const auto &Action :
+ ParsingTable.getActions(Head->State, tokenSymbol(Lookahead.Kind))) {
+ switch (Action.kind()) {
+ case LRTable::Action::Shift:
+ ShiftList.push_back({Head, &Action});
+ break;
+ case LRTable::Action::Reduce:
+ ReduceList.push_back({Head, &Action});
+ break;
+ case LRTable::Action::Accept:
+ AcceptLists.push_back({Head, &Action});
+ break;
+ default:
+ llvm_unreachable("unexpected action kind!");
+ }
+ }
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/Forest.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Forest.cpp
@@ -0,0 +1,125 @@
+//===--- Forest.cpp - Parse forest ------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Forest.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+std::string ForestNode::Dump(const Grammar &G) const {
+ switch (kind()) {
+ case Ambiguous:
+ return llvm::formatv("{0} := <ambiguous>", G.symbolName(symbol()));
+ case Terminal:
+ return llvm::formatv("{0} := tok[{1}]", G.symbolName(symbol()), startLoc());
+ case Sequence:
+ return G.dumpRule(rule());
+ }
+ llvm_unreachable("unhandle node kind!");
+}
+
+std::string ForestNode::DumpRecursive(const Grammar &G,
+ bool Abbreviated) const {
+ // Count visits of nodes so we can mark those seen multiple times.
+ llvm::DenseMap<const ForestNode *, unsigned> Visits;
+ std::function<void(const ForestNode *)> CountVisits =
+ [&](const ForestNode *P) {
+ if (Visits[P]++ > 0)
+ return; // Don't count children as multiply visited.
+ if (P->kind() == Ambiguous)
+ llvm::for_each(P->alternatives(), CountVisits);
+ else if (P->kind() == Sequence)
+ llvm::for_each(P->elements(), CountVisits);
+ };
+ CountVisits(this);
+
+ llvm::DenseMap<const ForestNode *, unsigned> Ids;
+ std::string Result;
+ constexpr unsigned kEnd = std::numeric_limits<unsigned>::max();
+ std::function<void(const ForestNode *, unsigned, unsigned,
+ llvm::Optional<SymbolID>)>
+ Dump = [&](const ForestNode *P, unsigned Level, unsigned End,
+ llvm::Optional<SymbolID> ElidedParent) {
+ // absl::Span<const Node* const> children;
+ llvm::ArrayRef<const ForestNode *> children;
+ auto end_of_element = [&](unsigned child_index) {
+ return child_index + 1 == children.size()
+ ? End
+ : children[child_index + 1]->startLoc();
+ };
+ if (P->kind() == Ambiguous) {
+ children = P->alternatives();
+ } else if (P->kind() == Sequence) {
+ children = P->elements();
+ if (Abbreviated) {
+ if (P->startLoc() == End)
+ return;
+ for (unsigned i = 0; i < children.size(); ++i)
+ if (children[i]->startLoc() == P->startLoc() &&
+ end_of_element(i) == End) {
+ return Dump(
+ children[i], Level, End,
+ /*elided_parent=*/ElidedParent.getValueOr(P->symbol()));
+ }
+ }
+ }
+
+ // FIXME: pretty ascii trees
+ if (End == kEnd)
+ Result += llvm::formatv("[{0},end) ", P->startLoc());
+ else
+ Result += llvm::formatv("[{0},{1}) ", P->startLoc(), End);
+ Result.append(2 * Level, ' ');
+ if (ElidedParent.hasValue()) {
+ Result += G.symbolName(*ElidedParent);
+ Result += "~";
+ }
+ Result.append(P->Dump(G));
+ if (Visits.find(P)->getSecond() > 1 &&
+ P->kind() != ForestNode::Terminal) {
+ // The first time, print as #1. Later, =#1.
+ auto id = Ids.try_emplace(P, Ids.size() + 1);
+ Result +=
+ llvm::formatv(" {0}#{1}", id.second ? "" : "=", id.first->second);
+ }
+ Result.push_back('\n');
+
+ ++Level;
+ for (unsigned i = 0; i < children.size(); ++i)
+ Dump(children[i], Level,
+ P->kind() == Sequence ? end_of_element(i) : End, llvm::None);
+ };
+ Dump(this, 0, kEnd, llvm::None);
+ return Result;
+}
+
+void ForestArena::init(const TokenStream &Tokens) {
+ Arena.Reset(); // clean the arena.
+ NodeNum = 0;
+ // List of leaves is prepopulated, it's convenient and we need them anyway.
+ Terminals = Arena.Allocate<ForestNode>(Tokens.tokens().size());
+ size_t Index = 0;
+ for (const auto &T : Tokens.tokens()) {
+ new (&Terminals[Index])
+ ForestNode(ForestNode::Terminal, tokenSymbol(T.Kind),
+ /*begin=*/Index, /*TerminalData*/ 0);
+ ++Index;
+ }
+ NodeNum = Tokens.tokens().size();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -8,6 +8,8 @@
LRGraph.cpp
LRTable.cpp
LRTableBuild.cpp
+ Forest.cpp
+ GLRParser.cpp
Token.cpp
LINK_LIBS
Index: clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/GLRParser.h
@@ -0,0 +1,148 @@
+//===--- GLRParser.h - Implement a standard GLR parser -----------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a standard Generalized LR (GLR) parsing algorithm.
+//
+// The GLR parser behaves as a normal LR parser until it encounters a conflict.
+// To handle a conflict (where there are multiple actions could perform), the
+// parser will simulate nondeterminism by doing a breadth-first search
+// over all the possibilities.
+//
+// Basic mechanisims of the GLR parser:
+// - A number of processes are operated in parallel.
+// - Each process has its own parsing stack and behaves as a standard
+// determinism LR parser.
+// - When a process encounters a conflict, it will be fork (one for each
+// avaiable action).
+// - When a process encounters an error, it is abandoned.
+// - All process are synchronized by the lookahead token: they perfrom shift
+// action at the same time, which means some processes need wait until other
+// processes have performed all reduce actions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Forest.h"
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/LRTable.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// An implementation of a directed acyclic graph (DAG), used as a
+// graph-structured stack (GSS) in the GLR parser.
+//
+// GSS is an efficient data structure to represent multiple active stacks, it
+// employs a stack-combination optimization to avoid potentially exponential
+// growth of the stack:
+// - combing equal stack prefixes -- A new stack doesn't need to have a full
+// copy of its parentâs stack. They share a common prefix.
+// - combing euqal stack suffices -- as there are a finite number of DFA's
+// state the parser can be in. A set of heads can be in the same state
+// though they may have different parses, these heads can be merged,
+// resulting a single head.
+//
+// E.g. we have two active stacks:
+// 0 -> 1 -> 2
+// | ^ head1, representing a stack [2, 1, 0]
+// ` -> 3
+// ^ head2, representing a stack [3, 1, 0]
+struct Graph {
+ // Represents a node in the graph.
+ struct Node {
+ // The parsing state presented by the graph node.
+ LRTable::StateID State : LRTable::StateBits;
+ static constexpr unsigned PredecessorBits = 3;
+ // Number of the predecessors of the node.
+ // u is the predecessor of v, if u -> v.
+ unsigned PredecessorCount : PredecessorBits;
+ // The forest node for a termina/nonterminal symbol.
+ // The symbol correponds to the label of edges which leads to current node
+ // from the predecessor nodes.
+ const ForestNode *Parsed = nullptr;
+
+ llvm::ArrayRef<const Node *> predecessors() const {
+ return llvm::makeArrayRef(reinterpret_cast<const Node *const *>(this + 1),
+ PredecessorCount);
+ };
+
+ bool operator==(const Node &L) const {
+ return State == L.State && predecessors() == L.predecessors();
+ }
+ // A trailing array of Node*.
+ };
+
+ // Creates a new node in the graph.
+ const Node *addNode(LRTable::StateID State,
+ const ::clang::syntax::pseudo::ForestNode *Symbol,
+ llvm::ArrayRef<const Node *> Predecessors) {
+ assert(Predecessors.size() < (1 << Node::PredecessorBits) &&
+ "Too many predecessors to fit in PredecessorBits!");
+ ++NodeCount;
+ Node *Result = new (Arena.Allocate(
+ sizeof(Node) + Predecessors.size() * sizeof(Node *), alignof(Node)))
+ Node({State, static_cast<unsigned>(Predecessors.size())});
+ Result->Parsed = Symbol;
+ if (!Predecessors.empty())
+ llvm::copy(Predecessors, reinterpret_cast<const Node **>(Result + 1));
+ return Result;
+ }
+
+ size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
+ size_t nodeCount() const { return NodeCount; }
+
+private:
+ llvm::BumpPtrAllocator Arena;
+ unsigned NodeCount = 0;
+};
+
+class GLRParser {
+public:
+ GLRParser(const LRTable &T, const Grammar &G, ForestArena &Arena)
+ : ParsingTable(T), G(G), ParsedForest(Arena) {}
+
+ const ForestNode *parse(const TokenStream &Code);
+
+ const Graph &getGSS() const { return GSS; }
+
+private:
+ // Return a list of active stack heads.
+ std::vector<const Graph::Node *> performShift(Token::Index Lookahead);
+ void performReduction(const Token &Lookahead);
+
+ void addActions(const Graph::Node *Head, const Token &Lookahead);
+
+ const LRTable &ParsingTable;
+ const Grammar &G;
+
+ // An active stack head can have multiple avaialble actions (reduce/reduce
+ // actions, reduce/shift actions)
+ // Frontier is to track all avaiable actions from all active stack heads.
+ struct Frontier {
+ // A corresponding stack head.
+ const Graph::Node *Head = nullptr;
+ // An action associated with the Head.
+ const LRTable::Action *PerformAction = nullptr;
+ };
+ // A list of active shift actions.
+ std::vector<Frontier> ShiftList;
+ // A list of active reduce actions.
+ std::vector<Frontier> ReduceList;
+ // A list of active accept action.
+ std::vector<Frontier> AcceptLists;
+
+ Graph GSS;
+ ForestArena &ParsedForest;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/include/clang/Tooling/Syntax/Pseudo/Forest.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/Forest.h
@@ -0,0 +1,135 @@
+//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Parse forest is the output of the GLR parser.
+//
+// For an ambiguous grammar, there might be multiple parse trees generated from
+// for the given input. Forest is a DAG which represent numberous possible in a
+// space-efficient manner. Common subtrees are shared -- if two or more trees
+// treat the token range [1, 3) as an Expression, then there is a single shared
+// Expression node representing the subparse in the forest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Tooling/Syntax/Pseudo/Token.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstdint>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+// A node in a forest.
+class ForestNode {
+public:
+ enum Kind : uint8_t {
+ // A Terminal node is a single terminal symbol bound to a token.
+ Terminal,
+ // A Sequence node is a nonterminal symbol parsed with a grammar rule.
+ // elements() are the parses of each symbol on the RHS of the rule.
+ Sequence,
+ // An Ambiguous node exposes multiple ways to match the code to the symbol.
+ // alternatives() are the possible parses, we should choose one.
+ Ambiguous,
+ };
+ Kind kind() const { return K; }
+
+ SymbolID symbol() const { return Symbol; }
+
+ // The parses for each element in the RHS of the rule.
+ // REQUIRES: this is a Sequence node.
+ RuleID rule() const {
+ assert(kind() == Sequence);
+ return Data_ & ((1 << RuleBits) - 1);
+ }
+ // REQUIRES: this is a Sequence node;
+ llvm::ArrayRef<const ForestNode *> elements() const {
+ assert(kind() == Sequence);
+ return children(Data_ >> RuleBits);
+ };
+ uint32_t startLoc() const { return StartLoc; }
+
+ // The possible interpretations of the code.
+ // REQUIRES: this is an Ambiguous node.
+ llvm::ArrayRef<const ForestNode *> alternatives() const {
+ assert(kind() == Ambiguous);
+ return children(Data_);
+ }
+
+ std::string Dump(const Grammar &) const;
+ std::string DumpRecursive(const Grammar &, bool abbreviated = false) const;
+
+private:
+ friend class ForestArena;
+ ForestNode(Kind K, SymbolID Symbol, Token::Index StartLoc, uint16_t Data)
+ : StartLoc(StartLoc), K(K), Symbol(Symbol), Data_(Data) {}
+
+ llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
+ return llvm::makeArrayRef(
+ reinterpret_cast<const ForestNode *const *>(this + 1), Num);
+ }
+ static uint16_t SequenceData(RuleID rule,
+ llvm::ArrayRef<const ForestNode *> elements) {
+ assert(rule < (1 << RuleBits));
+ assert(elements.size() < (1 << (16 - RuleBits)));
+ return rule | elements.size() << RuleBits;
+ }
+ Token::Index StartLoc;
+ Kind K : 4;
+ SymbolID Symbol : SymbolBits;
+ // Sequences - child count : 4 | RuleID : 12
+ // Ambiguous - child count : 16
+ // Terminal - unused
+ uint16_t Data_;
+ // A trailing array of Node* .
+};
+
+// Node may not be destroyed (for BumpPtrAllocator).
+static_assert(std::is_trivially_destructible<ForestNode>(), "");
+
+// A memory arena for the parse forest.
+class ForestArena {
+public:
+ size_t nodeNum() const { return NodeNum; }
+ size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }
+
+ ForestNode &createSequence(SymbolID SID, RuleID RID, Token::Index Start,
+ llvm::ArrayRef<const ForestNode *> Elements) {
+ return create(ForestNode::Sequence, SID, Start,
+ ForestNode::SequenceData(RID, Elements), Elements);
+ }
+
+ void init(const TokenStream &Code);
+ const ForestNode &terminal(Token::Index Index) const {
+ assert(Terminals && "Terminals are not intialized!");
+ return Terminals[Index];
+ }
+
+private:
+ ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
+ uint16_t Data,
+ llvm::ArrayRef<const ForestNode *> Elements) {
+ ++NodeNum;
+ ForestNode *New = new (Arena.Allocate(
+ sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
+ alignof(ForestNode))) ForestNode(K, SID, Start, Data);
+ if (!Elements.empty())
+ llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
+ return *New;
+ }
+
+ llvm::BumpPtrAllocator Arena;
+ ForestNode *Terminals = nullptr;
+ uint32_t NodeNum = 0;
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits