hokein created this revision.
hokein added a reviewer: sammccall.
Herald added a subscriber: mgrang.
Herald added a project: All.
hokein requested review of this revision.
Herald added a subscriber: alextsao1999.
Herald added a project: clang-tools-extra.
With this patch, we're able to parse smaller chunks of C++ code (statement,
declaration), rather than translation-unit.
The target symbol is listed in the grammar in a form of `_ :=
statement`, each target symbol has a dedicated state (`_ := • statement`).
We create and track all these separate states in the LRTable. When we
start parsing, we lookup the corresponding state to start the parser.
LR pasing table changes with this patch:
- number of states: 1467 -> 1471
- number of actions: 82891 -> 83578
- size of the table (bytes): 334248 -> 336996
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D125006
Files:
clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
clang-tools-extra/pseudo/lib/GLR.cpp
clang-tools-extra/pseudo/lib/LRGraph.cpp
clang-tools-extra/pseudo/lib/LRTable.cpp
clang-tools-extra/pseudo/lib/LRTableBuild.cpp
clang-tools-extra/pseudo/lib/cxx.bnf
clang-tools-extra/pseudo/test/glr-variant-start.cpp
clang-tools-extra/pseudo/tool/ClangPseudo.cpp
clang-tools-extra/pseudo/unittests/GLRTest.cpp
Index: clang-tools-extra/pseudo/unittests/GLRTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -344,7 +344,8 @@
const TokenStream &Tokens = cook(lex("{ abc", LOptions), LOptions);
auto LRTable = LRTable::buildSLR(*G);
- const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack});
+ const ForestNode &Parsed =
+ glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
// Verify that there is no duplicated sequence node of `expr := IDENTIFIER`
// in the forest, see the `#1` and `=#1` in the dump string.
EXPECT_EQ(Parsed.dumpRecursive(*G),
@@ -381,7 +382,8 @@
const TokenStream &Tokens = cook(lex("IDENTIFIER", LOptions), LOptions);
auto LRTable = LRTable::buildSLR(*G);
- const ForestNode &Parsed = glrParse(Tokens, {*G, LRTable, Arena, GSStack});
+ const ForestNode &Parsed =
+ glrParse(Tokens, {*G, LRTable, Arena, GSStack}, id("test"));
EXPECT_EQ(Parsed.dumpRecursive(*G),
"[ 0, end) test := <ambiguous>\n"
"[ 0, end) ââtest := IDENTIFIER\n"
Index: clang-tools-extra/pseudo/tool/ClangPseudo.cpp
===================================================================
--- clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -38,6 +38,9 @@
desc("Print directive structure of source code"));
static opt<bool> PrintStatistics("print-statistics", desc("Print GLR parser statistics"));
static opt<bool> PrintForest("print-forest", desc("Print parse forest"));
+static opt<std::string> ParseSymbol("parse-symbol",
+ desc("specify the target symbol to parse"),
+ init("translation-unit"));
static std::string readOrDie(llvm::StringRef Path) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
@@ -50,6 +53,14 @@
return Text.get()->getBuffer().str();
}
+static clang::pseudo::SymbolID
+findNonterminalID(llvm::StringRef Name, const clang::pseudo::Grammar &G) {
+ for (clang::pseudo::SymbolID ID = 0; ID < G.table().Nonterminals.size(); ++ID)
+ if (G.table().Nonterminals[ID].Name == Name)
+ return ID;
+ return 0;
+}
+
int main(int argc, char *argv[]) {
llvm::cl::ParseCommandLineOptions(argc, argv, "");
@@ -96,9 +107,9 @@
if (ParseableStream) {
clang::pseudo::ForestArena Arena;
clang::pseudo::GSS GSS;
- auto &Root =
- glrParse(*ParseableStream,
- clang::pseudo::ParseParams{*G, LRTable, Arena, GSS});
+ auto &Root = glrParse(*ParseableStream,
+ clang::pseudo::ParseParams{*G, LRTable, Arena, GSS},
+ findNonterminalID(ParseSymbol, *G));
if (PrintForest)
llvm::outs() << Root.dumpRecursive(*G, /*Abbreviated=*/true);
Index: clang-tools-extra/pseudo/test/glr-variant-start.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/pseudo/test/glr-variant-start.cpp
@@ -0,0 +1,9 @@
+// RUN: clang-pseudo -grammar=%cxx-bnf-file -source=%s --parse-symbol=statement-seq --print-forest | FileCheck %s
+
+a + a;
+// CHECK: statement-seq~expression-statement := expression ;
+// CHECK-NEXT: ââexpression~additive-expression := additive-expression + multiplicative-expression
+// CHECK-NEXT: â ââadditive-expression~IDENTIFIER :=
+// CHECK-NEXT: â ââ+ :=
+// CHECK-NEXT: â ââmultiplicative-expression~IDENTIFIER :=
+// CHECK-NEXT: ââ; :=
Index: clang-tools-extra/pseudo/lib/cxx.bnf
===================================================================
--- clang-tools-extra/pseudo/lib/cxx.bnf
+++ clang-tools-extra/pseudo/lib/cxx.bnf
@@ -25,8 +25,11 @@
# [1] https://isocpp.org/files/papers/N4860.pdf
#
#
-# _ serves as a "fake" start symbol, coming with real grammar symbols.
+# _ serves as a "fake" start symbol, coming with real grammar symbols which we
+# target to parse.
_ := translation-unit
+_ := statement-seq
+_ := declaration-seq
# gram.key
typedef-name := IDENTIFIER
Index: clang-tools-extra/pseudo/lib/LRTableBuild.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/LRTableBuild.cpp
+++ clang-tools-extra/pseudo/lib/LRTableBuild.cpp
@@ -40,6 +40,9 @@
class LRTable::Builder {
public:
+ Builder(std::vector<std::pair<SymbolID, StateID>> StartStates)
+ : StartStates(std::move(StartStates)) {}
+
bool insert(Entry E) { return Entries.insert(std::move(E)).second; }
LRTable build(const GrammarTable >) && {
// E.g. given the following parsing table with 3 states and 3 terminals:
@@ -92,24 +95,26 @@
tokenSymbol(static_cast<tok::TokenKind>(Terminal)))
++SortedIndex;
}
+ Table.StartStates = std::move(StartStates);
return Table;
}
private:
llvm::DenseSet<Entry> Entries;
+ std::vector<std::pair<SymbolID, StateID>> StartStates;
};
LRTable LRTable::buildForTests(const GrammarTable >,
llvm::ArrayRef<Entry> Entries) {
- Builder Build;
+ Builder Build({});
for (const Entry &E : Entries)
Build.insert(E);
return std::move(Build).build(GT);
}
LRTable LRTable::buildSLR(const Grammar &G) {
- Builder Build;
auto Graph = LRGraph::buildLR0(G);
+ Builder Build(Graph.startStates());
for (const auto &T : Graph.edges()) {
Action Act = isToken(T.Label) ? Action::shift(T.Dst) : Action::goTo(T.Dst);
Build.insert({T.Src, T.Label, Act});
Index: clang-tools-extra/pseudo/lib/LRTable.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/LRTable.cpp
+++ clang-tools-extra/pseudo/lib/LRTable.cpp
@@ -120,5 +120,16 @@
/*length=*/End - Start);
}
+LRTable::StateID LRTable::getStartState(SymbolID Target) const {
+ assert(llvm::is_sorted(StartStates) && "StartStates must be sorted!");
+ auto It = llvm::partition_point(
+ StartStates, [Target](const std::pair<SymbolID, StateID> &X) {
+ return X.first < Target;
+ });
+ assert(It != StartStates.end() && It->first == Target &&
+ "target symbol doesn't have a start state!");
+ return It->second;
+}
+
} // namespace pseudo
} // namespace clang
Index: clang-tools-extra/pseudo/lib/LRGraph.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/LRGraph.cpp
+++ clang-tools-extra/pseudo/lib/LRGraph.cpp
@@ -160,9 +160,12 @@
}
LRGraph LRGraph::buildLR0(const Grammar &G) {
+ std::vector<std::pair<SymbolID, StateID>> StartStates;
class Builder {
public:
- Builder(const Grammar &G) : G(G) {}
+ Builder(const Grammar &G,
+ std::vector<std::pair<SymbolID, StateID>> &StartStates)
+ : G(G), StartStates(StartStates) {}
// Adds a given state if not existed.
std::pair<StateID, /*inserted*/ bool> insert(ItemSet KernelItems) {
@@ -190,7 +193,10 @@
LRGraph build() && {
States.shrink_to_fit();
Edges.shrink_to_fit();
- return LRGraph(std::move(States), std::move(Edges));
+ llvm::sort(StartStates);
+ StartStates.shrink_to_fit();
+ return LRGraph(std::move(States), std::move(Edges),
+ std::move(StartStates));
}
private:
@@ -199,7 +205,8 @@
std::vector<State> States;
std::vector<Edge> Edges;
const Grammar &G;
- } Builder(G);
+ std::vector<std::pair<SymbolID, StateID>> &StartStates;
+ } Builder(G, StartStates);
std::vector<StateID> PendingStates;
// Initialize states with the start symbol.
@@ -209,6 +216,11 @@
auto Result = Builder.insert(std::move(StartState));
assert(Result.second && "State must be new");
PendingStates.push_back(Result.first);
+
+ const Rule &StartRule = G.lookupRule(RID);
+ assert(StartRule.Size == 1 &&
+ "Start rule must have exactly one symbol in its body!");
+ StartStates.push_back({StartRule.seq().front(), Result.first});
}
while (!PendingStates.empty()) {
Index: clang-tools-extra/pseudo/lib/GLR.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/GLR.cpp
+++ clang-tools-extra/pseudo/lib/GLR.cpp
@@ -36,8 +36,8 @@
return OS;
}
-const ForestNode &glrParse(const TokenStream &Tokens,
- const ParseParams &Params) {
+const ForestNode &glrParse(const TokenStream &Tokens, const ParseParams &Params,
+ SymbolID Target) {
llvm::ArrayRef<ForestNode> Terminals = Params.Forest.createTerminals(Tokens);
auto &G = Params.G;
auto &GSS = Params.GSStack;
@@ -61,9 +61,9 @@
}
}
};
-
std::vector<const GSS::Node *> NewHeads = {
- GSS.addNode(/*State=*/0, /*ForestNode*/ nullptr, {})};
+ GSS.addNode(/*State=*/Params.Table.getStartState(Target),
+ /*ForestNode=*/nullptr, {})};
for (const ForestNode &Terminal : Terminals) {
LLVM_DEBUG(llvm::dbgs() << llvm::formatv("Next token {0} (id={1})\n",
G.symbolName(Terminal.symbol()),
@@ -101,11 +101,7 @@
return *PendingAccept.front().Head->Payload;
}
// We failed to parse the input, returning an opaque forest node for recovery.
- auto RulesForStart = G.rulesFor(G.startSymbol());
- // FIXME: support multiple start symbols.
- assert(RulesForStart.size() == 1 && RulesForStart.front().Size == 1 &&
- "start symbol _ must have exactly one rule");
- return Params.Forest.createOpaque(RulesForStart.front().Sequence[0], 0);
+ return Params.Forest.createOpaque(Target, /*Token::Index=*/0);
}
// Apply all pending shift actions.
Index: clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/LRTable.h
@@ -132,6 +132,16 @@
// Returns empty if no available actions in the table.
llvm::ArrayRef<Action> find(StateID State, SymbolID Symbol) const;
+ // Returns the state from which the LR parser should start to parse the input
+ // tokens as the given Target symbol.
+ //
+ // In LR parsing, the start state of `translation-unit` corresponds to
+ // `_ := ⢠translation-unit`.
+ //
+ // REQUIRE: The given Target symbol must exist in the grammar (in a form of
+ // `_ := target`).
+ StateID getStartState(SymbolID Target) const;
+
size_t bytes() const {
return sizeof(*this) + Actions.capacity() * sizeof(Action) +
States.capacity() * sizeof(StateID) +
@@ -171,7 +181,10 @@
std::vector<StateID> States;
// A flat list of available actions, sorted by (SymbolID, State).
std::vector<Action> Actions;
+ // A sorted table, storing the start state for each target parsing symbol.
+ std::vector<std::pair<SymbolID, StateID>> StartStates;
};
+
llvm::raw_ostream &operator<<(llvm::raw_ostream &, const LRTable::Action &);
} // namespace pseudo
Index: clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/LRGraph.h
@@ -139,15 +139,21 @@
llvm::ArrayRef<State> states() const { return States; }
llvm::ArrayRef<Edge> edges() const { return Edges; }
+ llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
+ return StartStates;
+ }
std::string dumpForTests(const Grammar &) const;
private:
- LRGraph(std::vector<State> States, std::vector<Edge> Edges)
- : States(std::move(States)), Edges(std::move(Edges)) {}
+ LRGraph(std::vector<State> States, std::vector<Edge> Edges,
+ std::vector<std::pair<SymbolID, StateID>> StartStates)
+ : States(std::move(States)), Edges(std::move(Edges)),
+ StartStates(std::move(StartStates)) {}
std::vector<State> States;
std::vector<Edge> Edges;
+ std::vector<std::pair<SymbolID, StateID>> StartStates;
};
} // namespace pseudo
Index: clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/GLR.h
@@ -122,13 +122,12 @@
ForestArena &Forest; // Storage for the output forest.
GSS &GSStack; // Storage for parsing stacks.
};
-// Parse the given token stream with the GLR algorithm, and return a forest node
-// of the start symbol.
+// Parses the given token stream as the Target symbol with the GLR algorithm,
+// and returns a forest node of the target symbol.
//
// If the parsing fails, we model it as an opaque node in the forest.
-//
-// FIXME: add support for variant start symbols.
-const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params);
+const ForestNode &glrParse(const TokenStream &Code, const ParseParams &Params,
+ SymbolID Target);
// An active stack head can have multiple available actions (reduce/reduce
// actions, reduce/shift actions).
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits