hokein created this revision.
hokein added a reviewer: sammccall.
Herald added subscribers: mgrang, mgorny.
hokein requested review of this revision.
Herald added a project: clang.

This patch introduces the basic structures for the grammar, which is used to
build a grammar-based parser.

As the first patch, the scope is limited to:

- base types for modeling grammar rules and symbols
- the grammar-parsing bit (annotations are excluded and will be added afterwards)
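
A rough usage sketch of the APIs introduced here (assuming the
clang::syntax::pseudo namespace is in scope; the grammar text is a toy
example, not the real C++ grammar):

  llvm::StringRef Text = R"bnf(
    _ := expression
    expression := IDENTIFIER
    expression := expression + IDENTIFIER
  )bnf";
  // Parse the BNF text into rule specs; comments and _opt symbols are handled.
  auto Specs = Grammar::RuleSpec::parseAll(Text);
  if (!Specs) {
    llvm::errs() << llvm::toString(Specs.takeError());
  } else {
    // Assemble the symbol/rule tables and compute nullability.
    Grammar G = Grammar::build(*Specs);
    for (const std::string &Warning : G.diagnose()) // grammar sanity checks
      llvm::errs() << Warning << "\n";
    llvm::errs() << G.dump(); // dump all nonterminals and rules
  }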


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D114790

Files:
  clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
  clang/lib/Tooling/Syntax/CMakeLists.txt
  clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
  clang/unittests/Tooling/Syntax/CMakeLists.txt
  clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
  clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp

Index: clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/Pseudo/GrammarTest.cpp
@@ -0,0 +1,114 @@
+//===--- GrammarTest.cpp - grammar tests  -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+namespace {
+
+using testing::AllOf;
+using testing::ElementsAre;
+using testing::Field;
+using testing::HasSubstr;
+using testing::Not;
+using testing::UnorderedElementsAre;
+
+MATCHER_P(SName, Name, "") { return arg.Symbol == Name; }
+MATCHER_P(Target, Name, "") { return arg.Target == Name; }
+template <typename... T>
+testing::Matcher<const Grammar::RuleSpec &> withSequence(T... Name) {
+  return Field(&Grammar::RuleSpec::Sequence, ElementsAre(SName(Name)...));
+}
+
+MATCHER_P(Name, N, "") { return arg.Name == N; }
+MATCHER(Null, "") { return arg.Nullable; }
+
+MATCHER_P(TargetID, SID, "") { return arg.target() == SID; }
+template <typename... T>
+testing::Matcher<const Rule &> withSequenceID(T... IDs) {
+  return testing::Property(&Rule::seq, ElementsAre(IDs...));
+}
+SymbolID symbolID(llvm::StringRef Name, const Grammar &G) {
+  return G.lookupSymbol(Name);
+}
+
+TEST(GrammarTest, Basic) {
+  auto RS = cantFail(Grammar::RuleSpec::parseRaw(
+      "expression := IDENTIFIER + expression # comment"));
+  EXPECT_THAT(RS, AllOf(Target("expression"),
+                        withSequence("IDENTIFIER", "+", "expression")));
+
+  auto Error = Grammar::RuleSpec::parseRaw("abc");
+  EXPECT_FALSE(Error);
+  EXPECT_THAT(llvm::toString(Error.takeError()), testing::HasSubstr("Failed"));
+}
+
+TEST(GrammarTest, EliminatedOptional) {
+  auto RS = cantFail(Grammar::RuleSpec::parseRaw("list := IDENTIFIER ,_opt"));
+  EXPECT_THAT(RS, AllOf(Target("list"), withSequence("IDENTIFIER", ",_opt")));
+
+  const auto &InlinedRules = Grammar::RuleSpec::eliminateOptional({RS});
+  EXPECT_THAT(InlinedRules,
+              UnorderedElementsAre(
+                  AllOf(Target("list"), withSequence("IDENTIFIER", ",")),
+                  AllOf(Target("list"), withSequence("IDENTIFIER"))));
+
+  const auto &G = Grammar::build(InlinedRules);
+  EXPECT_THAT(
+      G.lookupRules(G.lookupSymbol("list")),
+      UnorderedElementsAre(
+          AllOf(TargetID(symbolID("list", G)),
+                withSequenceID(symbolID("IDENTIFIER", G), symbolID(",", G))),
+          AllOf(TargetID(symbolID("list", G)),
+                withSequenceID(symbolID("IDENTIFIER", G)))));
+}
+
+TEST(GrammarTest, Nullable) {
+  llvm::StringRef Text = R"(
+     # This is a comment.
+     null1 := +_opt
+     null2 := null1
+     null3 := null1 +_opt
+     null4 := +
+     null4 :=
+
+     non-null := +
+  )";
+  const auto RSpecs = cantFail(Grammar::RuleSpec::parseAll(Text));
+
+  const auto &G = Grammar::build(RSpecs);
+  EXPECT_THAT(G.table().Nonterminals,
+              UnorderedElementsAre(
+                  AllOf(Name("null1"), Null()), AllOf(Name("null2"), Null()),
+                  AllOf(Name("null3"), Null()), AllOf(Name("null4"), Null()),
+                  AllOf(Name("non-null"), Not(Null()))));
+}
+
+TEST(GrammarTest, Diagnose) {
+  llvm::StringRef Text = R"(
+     _ := undefined
+     unused := IDENTIFIER
+
+     _ := IDENFIFIE # a typo of the terminal IDENTIFIER
+  )";
+  const auto RSpecs = cantFail(Grammar::RuleSpec::parseAll(Text));
+  EXPECT_THAT(
+      Grammar::build(RSpecs).diagnose(),
+      UnorderedElementsAre(HasSubstr("No rules for nonterminal: undefined"),
+                           HasSubstr("never used"), HasSubstr("Token-like"),
+                           HasSubstr("No rules for nonterminal: IDENFIFIE")));
+}
+
+} // namespace
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/unittests/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+add_clang_unittest(PseudoParserTests
+  GrammarTest.cpp
+)
+
+clang_target_link_libraries(PseudoParserTests
+  PRIVATE
+  clangBasic
+  clangLex
+  clangToolingPseudo
+  clangTesting
+  )
+
+target_link_libraries(PseudoParserTests
+  PRIVATE
+  LLVMTestingSupport
+)
Index: clang/unittests/Tooling/Syntax/CMakeLists.txt
===================================================================
--- clang/unittests/Tooling/Syntax/CMakeLists.txt
+++ clang/unittests/Tooling/Syntax/CMakeLists.txt
@@ -28,3 +28,5 @@
   PRIVATE
   LLVMTestingSupport
 )
+
+add_subdirectory(Pseudo)
Index: clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/Grammar.cpp
@@ -0,0 +1,436 @@
+//===--- Grammar.cpp - Grammar for clang pseudo parser  ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/Syntax/Pseudo/Grammar.h"
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <numeric>
+#include <queue>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+
+namespace {
+static constexpr int NumTerminals = tok::NUM_TOKENS;
+static std::string *TerminalNames = nullptr;
+static const llvm::StringRef OptSuffix = "_opt";
+static const llvm::StringRef StartSymbol = "_";
+
+int initTerminalNames() {
+  assert(!TerminalNames && "TerminalNames must not be initialized yet!");
+  TerminalNames = new std::string[NumTerminals];
+  for (unsigned I = 0; I < NumTerminals; ++I) {
+    tok::TokenKind K = static_cast<tok::TokenKind>(I);
+    if (const auto *Punc = tok::getPunctuatorSpelling(K))
+      TerminalNames[I] = Punc;
+    else
+      TerminalNames[I] = llvm::StringRef(tok::getTokenName(K)).upper();
+  }
+  return 0;
+}
+
+void computeNullability(Table &T) {
+  // A nonterminal is nullable if some rule producing it is nullable.
+  // A rule is nullable if every symbol on the RHS is a nullable nonterminal.
+  // (We only get to anything nullable at all if some rule has an empty RHS).
+
+  // Logically we have two maps:
+  //  rule_deps: map<rule, set<symbol>>
+  //             For each rule, the set of symbols such that if they *all*
+  //             become nullable, the rule is nullable.
+  //  symbol_rdeps: map<symbol, set<rule>>
+  //             For each symbol, the set of rules such that if *any* becomes
+  //             nullable, the symbol is nullable.
+  // In practice, our algorithm deals with propagating updates from the RHS
+  // to the LHS, so the maps actually go in the other direction.
+
+  llvm::DenseSet<RuleID> NullableRules;
+  // Maps rule -> rule_deps[rule].size. Rule becomes nullable if it is zero.
+  llvm::DenseMap<RuleID, unsigned> RuleDepCount;
+  // Maps rule_deps[rule] -> rule. When a symbol goes nullable, check these.
+  llvm::DenseMap<SymbolID, std::vector<RuleID>> SymbolRdeps;
+
+  // Avoid deep recursion by deferring analysis of rules with a queue.
+  std::vector<RuleID> PendingNullableRules;
+
+  // Initialize dependency maps.
+  for (RuleID RID = 0; RID < T.Rules.size(); ++RID) {
+    const Rule &R = T.Rules[RID];
+    if (R.size() == 0) {
+      PendingNullableRules.push_back(RID);
+      continue;
+    }
+
+    // Only track rules that are potentially nullable - no terminals.
+    if (llvm::any_of(R.seq(), [](SymbolID SID) { return isToken(SID); }))
+      continue;
+
+    unsigned &DepCount = RuleDepCount[RID];
+    for (SymbolID SID : R.seq()) {
+      auto &Rdep = SymbolRdeps[SID];
+      if (!Rdep.empty() && Rdep.back() == RID)
+        continue; // repeated dep only counted once!
+      Rdep.push_back(RID);
+      ++DepCount;
+    }
+  }
+
+  while (!PendingNullableRules.empty()) {
+    // Check if we already knew the rule was nullable, and mark it.
+    RuleID RID = PendingNullableRules.back();
+    PendingNullableRules.pop_back();
+    if (!NullableRules.insert(RID).second)
+      continue;
+
+    // This makes its symbol nullable, check and mark it too.
+    SymbolID SID = T.Rules[RID].target();
+    if (T.Nonterminals[SID].Nullable)
+      continue;
+    T.Nonterminals[SID].Nullable = true;
+
+    // This may in turn contribute to the nullability of rdep rules.
+    // If this was the last needed symbol, the rule is now nullable.
+    if (SymbolRdeps.count(SID))
+      for (RuleID RdepRule : SymbolRdeps[SID]) {
+        assert(RuleDepCount[RdepRule] > 0);
+        if (--RuleDepCount[RdepRule] == 0)
+          PendingNullableRules.push_back(RdepRule);
+      }
+  }
+}
+
+void eliminateOptionalTail(llvm::ArrayRef<Grammar::RuleSpec::Element> Elements,
+                           std::vector<Grammar::RuleSpec::Element> &Result,
+                           llvm::function_ref<void()> CB) {
+  if (Elements.empty()) {
+    CB();
+    return;
+  }
+  auto Front = Elements.front();
+  if (!Front.Symbol.endswith(OptSuffix)) {
+    Result.push_back(std::move(Front));
+    eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+    Result.pop_back();
+    return;
+  }
+  // Enumerate two options: skip the opt symbol, or inline the symbol.
+  eliminateOptionalTail(Elements.drop_front(1), Result, CB); // skip
+  Front.Symbol = Front.Symbol.drop_back(OptSuffix.size());   // drop "_opt"
+  Result.push_back(std::move(Front));
+  eliminateOptionalTail(Elements.drop_front(1), Result, CB);
+  Result.pop_back();
+}
+
+} // namespace
+
+Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
+    : Data({Target, static_cast<uint8_t>(Sequence.size())}) {
+  assert(Sequence.size() < Rule::MaxElements); // Size must fit in SizeBits.
+  llvm::copy(Sequence, this->Sequence);
+}
+
+std::string Grammar::RuleSpec::dump() const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  OS << this->Target << " :=";
+  for (const auto &E : this->Sequence) {
+    OS << " " << E.Symbol;
+  }
+  return OS.str();
+}
+
+llvm::Expected<std::vector<Grammar::RuleSpec>>
+Grammar::RuleSpec::parseAll(llvm::StringRef Lines) {
+  std::vector<RuleSpec> Specs;
+  for (llvm::StringRef Line : llvm::split(Lines, '\n')) {
+    Line = Line.trim();
+    // A line starting with # is a comment.
+    if (Line.empty() || Line.startswith("#"))
+      continue;
+    auto Spec = pseudo::Grammar::RuleSpec::parseRaw(Line);
+    if (!Spec)
+      return Spec.takeError();
+
+    Specs.push_back(std::move(*Spec));
+  }
+  return eliminateOptional(Specs);
+}
+
+llvm::Expected<Grammar::RuleSpec>
+Grammar::RuleSpec::parseRaw(llvm::StringRef Line) {
+  auto Parts = Line.split(":=");
+  if (Parts.first == Line) // no separator in Line
+    return llvm::make_error<llvm::StringError>(
+        llvm::errc::invalid_argument,
+        llvm::formatv("Failed to parse rule {0}\n", Line).str());
+  RuleSpec Result;
+  Result.Target = Parts.first.trim();
+  for (llvm::StringRef Chunk : llvm::split(Parts.second, ' ')) {
+    Chunk = Chunk.trim();
+    if (Chunk.empty())
+      continue; // skip empty
+    if (Chunk.startswith("#"))
+      break; // comment, skip anything coming after #
+
+    Result.Sequence.emplace_back();
+    Result.Sequence.back().Symbol = Chunk;
+  }
+  return Result;
+}
+
+std::vector<Grammar::RuleSpec>
+Grammar::RuleSpec::eliminateOptional(const std::vector<RuleSpec> &Input) {
+  std::vector<pseudo::Grammar::RuleSpec> Results;
+  std::vector<Grammar::RuleSpec::Element> Storage;
+  for (const auto &R : Input) {
+    eliminateOptionalTail(R.Sequence, Storage, [&Results, &Storage, &R]() {
+      Results.emplace_back();
+      Results.back().Target = R.Target;
+      Results.back().Sequence = Storage;
+    });
+  }
+  return Results;
+}
+
+Grammar::Grammar(std::unique_ptr<Table> T) : T(std::move(T)) {}
+
+Grammar Grammar::build(llvm::ArrayRef<RuleSpec> Specs) {
+  assert(llvm::all_of(Specs,
+                      [](const RuleSpec &R) {
+                        if (R.Target.endswith(OptSuffix))
+                          return false;
+                        return llvm::all_of(
+                            R.Sequence,
+                            [](const Grammar::RuleSpec::Element &E) {
+                              return !E.Symbol.endswith(OptSuffix);
+                            });
+                      }) &&
+         "unexpected optional symbols!");
+  static int Dummy = initTerminalNames();
+  (void)Dummy;
+
+  auto T = std::make_unique<Table>();
+
+  // Assemble the name->ID and ID->(nonterminal) name maps.
+  llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
+  llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+  for (unsigned I = 0; I < NumTerminals; ++I)
+    SymbolIds.try_emplace(TerminalNames[I], tokenSymbol(tok::TokenKind(I)));
+  auto Consider = [&](llvm::StringRef Name) {
+    if (!SymbolIds.count(Name))
+      UniqueNonterminals.insert(Name);
+  };
+  for (const auto &Spec : Specs) {
+    Consider(Spec.Target);
+    for (const RuleSpec::Element &Elt : Spec.Sequence)
+      Consider(Elt.Symbol);
+  }
+  llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
+    T->Nonterminals.emplace_back();
+    T->Nonterminals.back().Name = Name.str();
+  });
+  assert(T->Nonterminals.size() < (1 << (SymbolBits - 1)) &&
+         "Too many nonterminals to fit in SymbolID bits!");
+  llvm::sort(T->Nonterminals,
+             [](const Table::Nonterminal &L, const Table::Nonterminal &R) {
+               return L.Name < R.Name;
+             });
+  // Build name -> ID maps for nonterminals.
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+    SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
+
+  // Convert the rules.
+  T->Rules.reserve(Specs.size());
+  std::vector<SymbolID> Symbols;
+  auto Lookup = [&SymbolIds](llvm::StringRef Name) {
+    auto It = SymbolIds.find(Name);
+    assert(It != SymbolIds.end() && "Didn't find the symbol in SymbolIds!");
+    return It->second;
+  };
+  for (const auto &Spec : Specs) {
+    assert(Spec.Sequence.size() < Rule::MaxElements);
+    Symbols.clear();
+    for (const RuleSpec::Element &Elt : Spec.Sequence)
+      Symbols.push_back(Lookup(Elt.Symbol));
+    T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
+  }
+  assert(T->Rules.size() < (1 << RuleBits) &&
+         "Too many rules to fit in RuleID bits!");
+  llvm::sort(T->Rules, [](const Rule &Left, const Rule &Right) {
+    // Group rules by target; break ties by sequence for a deterministic order.
+    if (Left.target() != Right.target())
+      return Left.target() < Right.target();
+    llvm::ArrayRef<SymbolID> L = Left.seq(), R = Right.seq();
+    return std::lexicographical_compare(L.begin(), L.end(), R.begin(), R.end());
+  });
+  RuleID RulePos = 0;
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
+    RuleID Start = RulePos;
+    while (RulePos < T->Rules.size() && T->Rules[RulePos].target() == SID)
+      ++RulePos;
+    T->Nonterminals[SID].RuleRange = {Start, RulePos};
+  }
+
+  computeNullability(*T);
+
+  return Grammar(std::move(T));
+}
+
+SymbolID Grammar::lookupSymbol(llvm::StringRef Name) const {
+  auto Find = llvm::partition_point(
+      T->Nonterminals,
+      [Name](const Table::Nonterminal &T) { return T.Name < Name; });
+  if (Find != T->Nonterminals.end() && Find->Name == Name)
+    return Find - T->Nonterminals.begin();
+  for (unsigned I = 0; I < NumTerminals; ++I) {
+    if (TerminalNames[I] == Name)
+      return tokenSymbol(static_cast<tok::TokenKind>(I));
+  }
+  llvm_unreachable("Symbol not found in the grammar!");
+}
+
+llvm::ArrayRef<Rule> Grammar::lookupRules(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  auto R = T->Nonterminals[SID].RuleRange;
+  assert(R.second <= T->Rules.size());
+  return llvm::makeArrayRef(&T->Rules[R.first], R.second - R.first);
+}
+
+const Rule &Grammar::lookupRule(RuleID RID) const {
+  assert(RID < T->Rules.size());
+  return T->Rules[RID];
+}
+
+llvm::StringRef Grammar::dumpSymbol(SymbolID SID) const {
+  assert(T->isSymbol(SID));
+  if (isToken(SID))
+    return TerminalNames[symbolToToken(SID)];
+  return T->Nonterminals[SID].Name;
+}
+
+std::string Grammar::dumpRule(RuleID RID) const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  const Rule &R = T->Rules[RID];
+  OS << dumpSymbol(R.target()) << " :=";
+  for (SymbolID SID : R.seq()) {
+    OS << " " << dumpSymbol(SID);
+  }
+  return Result;
+}
+
+std::string Grammar::dumpRules(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  std::string Result;
+  const auto &Range = T->Nonterminals[SID].RuleRange;
+  for (RuleID RID = Range.first; RID < Range.second; ++RID)
+    Result.append(dumpRule(RID)).push_back('\n');
+  return Result;
+}
+
+static std::string recursiveDump(const Grammar &G, const Table &T,
+                                 llvm::ArrayRef<RuleID> Seed) {
+  std::string Result;
+  llvm::DenseSet<RuleID> SeenRules;
+  llvm::DenseSet<SymbolID> SeenSymbols;
+  std::queue<SymbolID> Queue;
+  auto Visit = [&](RuleID RID) {
+    assert(G.table().isRule(RID));
+    if (!SeenRules.insert(RID).second)
+      return;
+    Result.append(G.dumpRule(RID)).push_back('\n');
+    for (SymbolID SID : T.Rules[RID].seq())
+      if (isNonterminal(SID) && SeenSymbols.insert(SID).second)
+        Queue.push(SID);
+  };
+  llvm::for_each(Seed, Visit);
+  while (!Queue.empty()) {
+    assert(G.table().isSymbol(Queue.front()));
+    auto R = T.Nonterminals[Queue.front()].RuleRange;
+    for (RuleID I = R.first; I < R.second; ++I)
+      Visit(I);
+    Queue.pop();
+  }
+  return Result;
+}
+
+std::string Grammar::dumpRuleRecursively(RuleID RID) const {
+  return recursiveDump(*this, *T, llvm::makeArrayRef<RuleID>(&RID, 1));
+}
+
+std::string Grammar::dumpRulesRecursively(SymbolID SID) const {
+  assert(isNonterminal(SID));
+  auto Range = T->Nonterminals[SID].RuleRange;
+  std::vector<RuleID> Seed;
+  for (RuleID RID = Range.first; RID < Range.second; ++RID)
+    Seed.push_back(RID);
+  return recursiveDump(*this, *T, Seed);
+}
+
+std::vector<std::string> Grammar::diagnose() const {
+  std::vector<std::string> Result;
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID) {
+    auto Range = T->Nonterminals[SID].RuleRange;
+    if (Range.first == Range.second)
+      Result.push_back(
+          llvm::formatv("No rules for nonterminal: {0}", dumpSymbol(SID)));
+    llvm::StringRef NameRef = T->Nonterminals[SID].Name;
+    if (llvm::all_of(NameRef, llvm::isAlpha) && NameRef.upper() == NameRef) {
+      Result.push_back(llvm::formatv(
+          "Token-like name {0} is used as a nonterminal", dumpSymbol(SID)));
+    }
+  }
+  for (RuleID RID = 0; RID < T->Rules.size(); ++RID) {
+    if (RID + 1 < T->Rules.size() && T->Rules[RID] == T->Rules[RID + 1])
+      Result.push_back(llvm::formatv("Duplicate rule: {0}", dumpRule(RID)));
+    // Warn about nullable nonterminals (except for the start symbol _).
+    if (T->Rules[RID].size() == 0 &&
+        lookupSymbol(StartSymbol) != T->Rules[RID].target())
+      Result.push_back(llvm::formatv("Nullable symbol: {0}",
+                                     dumpSymbol(T->Rules[RID].target())));
+  }
+  // symbol-id -> used counts
+  std::vector<unsigned> UseCounts(T->Nonterminals.size(), 0);
+  for (const Rule &R : T->Rules)
+    for (SymbolID SID : R.seq())
+      if (isNonterminal(SID))
+        ++UseCounts[SID];
+  for (SymbolID SID = 0; SID < UseCounts.size(); ++SID)
+    if (UseCounts[SID] == 0 && T->Nonterminals[SID].Name != StartSymbol)
+      Result.push_back(
+          llvm::formatv("Nonterminal never used: {0}", dumpSymbol(SID)));
+  return Result;
+}
+
+std::string Grammar::dump() const {
+  std::string Result;
+  llvm::raw_string_ostream OS(Result);
+  OS << "Nonterminals:\n";
+  for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
+    OS << llvm::formatv("  {0} {1}{2}\n", SID, dumpSymbol(SID),
+                        T->Nonterminals[SID].Nullable ? " NULLABLE" : "");
+  OS << "Rules:\n";
+  for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
+    OS << llvm::formatv("  {0} {1}\n", RID, dumpRule(RID));
+  return OS.str();
+}
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
Index: clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang/lib/Tooling/Syntax/Pseudo/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LLVM_LINK_COMPONENTS Support)
+
+add_clang_library(clangToolingPseudo
+  Grammar.cpp
+
+  LINK_LIBS
+  clangBasic
+  clangLex
+  )
Index: clang/lib/Tooling/Syntax/CMakeLists.txt
===================================================================
--- clang/lib/Tooling/Syntax/CMakeLists.txt
+++ clang/lib/Tooling/Syntax/CMakeLists.txt
@@ -19,3 +19,5 @@
   DEPENDS
   omp_gen
   )
+
+add_subdirectory(Pseudo)
Index: clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
===================================================================
--- /dev/null
+++ clang/include/clang/Tooling/Syntax/Pseudo/Grammar.h
@@ -0,0 +1,197 @@
+//===--- Grammar.h - grammar used by clang pseudo parser  -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines base structures for parsing & modeling a grammar for a
+//  programming language:
+//
+//    # This is a fake C++ grammar
+//    _ := translation-unit
+//    translation-unit := declaration-seq_opt
+//    declaration-seq := declaration
+//    declaration-seq := declaration-seq declaration
+//
+//  A grammar formally describes a language, and it is constructed by a set of
+//  production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
+//  non-terminal or terminal, identified by a SymbolID.
+//
+//  Conventions of the grammar:
+//  - "_" is the start symbol
+//  - single-line comments are supported, starting with a #
+//  - A rule describes how a nonterminal (the left side of :=) is constructed;
+//    each rule occupies one line of the grammar file
+//  - Terminals (also called tokens) correspond to clang::TokenKind; they are
+//    written in the grammar as "IDENTIFIER", "USING", "+"
+//  - Nonterminals are spelled with lower-case names; they shouldn't be
+//    nullable (i.e. derive the empty string), except for the start symbol
+//  - Optional symbols are supported (spelled with a _opt suffix); they are
+//    eliminated during the grammar parsing stage
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+#define LLVM_CLANG_TOOLING_SYNTAX_GRAMMAR_H
+
+#include "clang/Basic/TokenKinds.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <vector>
+
+namespace clang {
+namespace syntax {
+namespace pseudo {
+// A SymbolID uniquely identifies a terminal/non-terminal symbol in a grammar.
+// Non-terminal IDs are indexes into a table of non-terminal symbols.
+// Terminal IDs correspond to the clang TokenKind enum.
+using SymbolID = uint16_t;
+// SymbolID is only 12 bits wide; there are at most 2^11 terminals and
+// 2^11 nonterminals.
+static constexpr unsigned SymbolBits = 12;
+// SymbolIDs with the top bit set are tokens/terminals.
+static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
+inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
+inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
+// The terminals are always the clang tok::TokenKind (not all are used).
+inline tok::TokenKind symbolToToken(SymbolID SID) {
+  assert(isToken(SID));
+  SID &= ~TokenFlag;
+  assert(SID < tok::NUM_TOKENS);
+  return static_cast<tok::TokenKind>(SID);
+}
+inline SymbolID tokenSymbol(tok::TokenKind TK) {
+  return TokenFlag | static_cast<SymbolID>(TK);
+}
+
+// A RuleID uniquely identifies a production rule in a grammar.
+// It is an index into a table of rules.
+using RuleID = uint16_t;
+// RuleID is only 12 bits wide.
+static constexpr unsigned RuleBits = 12;
+
+// Represents a production rule in the grammar, e.g.
+//   expression := a b c
+//   ^Target       ^Sequence
+struct Rule {
+  Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
+
+  static constexpr unsigned SizeBits = 4;
+  static_assert(SizeBits + SymbolBits <= 16,
+                "Must be able to store symbol ID + size efficiently");
+  static constexpr unsigned MaxElements = 1 << SizeBits;
+
+  // 16 bits for target symbol and size of sequence:
+  // SymbolID : 12 | Size : 4
+  struct {
+    SymbolID Target : SymbolBits;
+    uint8_t Size : SizeBits; // Size of the Sequence
+  } Data;
+  SymbolID Sequence[MaxElements];
+
+  SymbolID target() const { return Data.Target; }
+  uint8_t size() const { return Data.Size; }
+  llvm::ArrayRef<SymbolID> seq() const {
+    return llvm::ArrayRef<SymbolID>(Sequence, size());
+  }
+  friend bool operator==(const Rule &L, const Rule &R) {
+    return L.target() == R.target() && L.size() == R.size() &&
+           L.seq() == R.seq();
+  }
+};
+
+struct Table;
+
+// A grammar that describes a programming language, e.g. C++.
+// It is built from parsed BNF grammar rules, and is a building block for
+// constructing a grammar-based parser.
+//
+// FIXME: add grammar annotations.
+class Grammar {
+public:
+  ~Grammar() = default;
+  Grammar(Grammar &&) = default;
+  // Text representation of a grammar rule.
+  struct RuleSpec {
+    llvm::StringRef Target;
+    struct Element {
+      llvm::StringRef Symbol; // Name of the symbol
+    };
+    std::vector<Element> Sequence;
+
+    // Parses rule specs from the grammar text; _opt symbols are eliminated.
+    static llvm::Expected<std::vector<RuleSpec>>
+    parseAll(llvm::StringRef Lines);
+
+    std::string dump() const;
+    // Exposed for testing.
+    static llvm::Expected<RuleSpec> parseRaw(llvm::StringRef Line);
+    // Exposed for testing.
+    // Eliminates all _opt symbols, expanding each rule into variants with and
+    // without them. For example, the rule E := id +_opt id is expanded into
+    // two equivalent rules:
+    //   1) E := id + id
+    //   2) E := id id
+    static std::vector<RuleSpec>
+    eliminateOptional(const std::vector<RuleSpec> &);
+  };
+  static Grammar build(llvm::ArrayRef<RuleSpec>);
+
+  // Lookup non-terminal by name.
+  SymbolID lookupSymbol(llvm::StringRef Name) const;
+  // Return all rules of the given non-terminal symbol.
+  llvm::ArrayRef<Rule> lookupRules(SymbolID SID) const;
+  const Rule &lookupRule(RuleID RID) const;
+
+  // Dump methods for debugging.
+  // Dump the whole grammar.
+  std::string dump() const;
+  // Dump symbol (terminal or non-terminal) name.
+  llvm::StringRef dumpSymbol(SymbolID) const;
+  std::string dumpRule(RuleID) const;
+  std::string dumpRuleRecursively(RuleID) const;
+  // Dump all rules of the given nonterminal symbol.
+  std::string dumpRules(SymbolID) const;
+  std::string dumpRulesRecursively(SymbolID) const;
+
+  // Diagnose the grammar and returns warnings if any.
+  std::vector<std::string> diagnose() const;
+
+  const Table &table() const { return *T; }
+
+private:
+  explicit Grammar(std::unique_ptr<Table>);
+  std::unique_ptr<Table> T;
+};
+
+// Underlying storage of the Grammar.
+struct Table {
+  // The rules are sorted (and thus grouped) by target symbol.
+  // A RuleID is an index into this vector.
+  std::vector<Rule> Rules;
+
+  struct Nonterminal {
+    std::string Name;
+    bool Nullable = false;
+    std::pair</*Start=*/RuleID, /*End=*/RuleID> RuleRange; // Half-open range.
+  };
+  // A table of nonterminals, sorted by name.
+  // A nonterminal's SymbolID is its index in this table.
+  std::vector<Nonterminal> Nonterminals;
+
+  bool isRule(RuleID RID) const { return RID < Rules.size(); }
+  bool isSymbol(SymbolID SID) const {
+    return isToken(SID) ? ((SID & ~TokenFlag) < tok::NUM_TOKENS)
+                        : SID < Nonterminals.size();
+  }
+};
+
+} // namespace pseudo
+} // namespace syntax
+} // namespace clang
+
+#endif