[PATCH] D126536: [pseudo] Add grammar annotations support.

Haojian Wu via Phabricator via cfe-commits Wed, 01 Jun 2022 04:39:30 -0700

hokein updated this revision to Diff 433354.
hokein marked 2 inline comments as done.
hokein added a comment.


remove a left-out change.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D126536/new/

https://reviews.llvm.org/D126536

Files:
  clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
  clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
  clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
  clang-tools-extra/pseudo/unittests/GrammarTest.cpp

Index: clang-tools-extra/pseudo/unittests/GrammarTest.cpp
===================================================================
--- clang-tools-extra/pseudo/unittests/GrammarTest.cpp
+++ clang-tools-extra/pseudo/unittests/GrammarTest.cpp
@@ -99,6 +99,14 @@
   EXPECT_LT(ruleFor("x"), ruleFor("_"));
 }
 
+TEST_F(GrammarTest, Annotation) {
+  build(R"bnf(
+    _ := IDENTIFIER [guard=override]
+  )bnf");
+  ASSERT_TRUE(Diags.empty());
+  EXPECT_TRUE(G->lookupRule(ruleFor("_")).Guard);
+}
+
 TEST_F(GrammarTest, Diagnostics) {
   build(R"cpp(
     _ := ,_opt
@@ -110,6 +118,8 @@
     # cycle
     a := b
     b := a
+
+    _ := IDENTIFIER [guard=override;unknown=value]
   )cpp");
 
   EXPECT_EQ(G->underscore(), id("_"));
@@ -120,7 +130,8 @@
                          "Failed to parse 'invalid': no separator :=",
                          "Token-like name IDENFIFIE is used as a nonterminal",
                          "No rules for nonterminal: IDENFIFIE",
-                         "The grammar contains a cycle involving symbol a"));
+                         "The grammar contains a cycle involving symbol a",
+                         "Unknown extension key 'unknown'"));
 }
 
 TEST_F(GrammarTest, FirstAndFollowSets) {
Index: clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
+++ clang-tools-extra/pseudo/lib/grammar/GrammarBNF.cpp
@@ -47,6 +47,9 @@
     // Assemble the name->ID and ID->nonterminal name maps.
     llvm::DenseSet<llvm::StringRef> UniqueNonterminals;
     llvm::DenseMap<llvm::StringRef, SymbolID> SymbolIds;
+
+    llvm::DenseSet<llvm::StringRef> UniqueExtensionValues;
+
     for (uint16_t I = 0; I < NumTerminals; ++I)
       SymbolIds.try_emplace(T->Terminals[I], tokenSymbol(tok::TokenKind(I)));
     auto Consider = [&](llvm::StringRef Name) {
@@ -55,8 +58,11 @@
     };
     for (const auto &Spec : Specs) {
       Consider(Spec.Target);
-      for (const RuleSpec::Element &Elt : Spec.Sequence)
+      for (const RuleSpec::Element &Elt : Spec.Sequence) {
         Consider(Elt.Symbol);
+        for (const auto& KV : Elt.Extensions)
+           UniqueExtensionValues.insert(KV.second);
+      }
     }
     llvm::for_each(UniqueNonterminals, [&T](llvm::StringRef Name) {
       T->Nonterminals.emplace_back();
@@ -68,6 +74,15 @@
                                    const GrammarTable::Nonterminal &R) {
       return L.Name < R.Name;
     });
+    // Add an empty string for the corresponding sentinel none extension.
+    T->Extensions.push_back("");
+    llvm::for_each(UniqueExtensionValues, [&T](llvm::StringRef Name) {
+      T->Extensions.emplace_back();
+      T->Extensions.back() = Name.str();
+    });
+    llvm::sort(T->Extensions);
+    assert(T->Extensions.front() == "");
+
     // Build name -> ID maps for nonterminals.
     for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
       SymbolIds.try_emplace(T->Nonterminals[SID].Name, SID);
@@ -87,6 +102,8 @@
         Symbols.push_back(Lookup(Elt.Symbol));
       T->Rules.push_back(Rule(Lookup(Spec.Target), Symbols));
     }
+    applyExtension(Specs, *T);
+
     assert(T->Rules.size() < (1 << RuleBits) &&
            "Too many rules to fit in RuleID bits!");
     const auto &SymbolOrder = getTopologicalOrder(T.get());
@@ -164,6 +181,9 @@
     llvm::StringRef Target;
     struct Element {
       llvm::StringRef Symbol; // Name of the symbol
+      // Extensions that are associated to the sequence symbol or rule.
+      std::vector<std::pair<llvm::StringRef/*Key*/, llvm::StringRef/*Value*/>>
+          Extensions;
     };
     std::vector<Element> Sequence;
 
@@ -204,11 +224,58 @@
       Chunk = Chunk.trim();
       if (Chunk.empty())
         continue; // skip empty
+      if (Chunk.startswith("[") && Chunk.endswith("]")) {
+        if (Out.Sequence.empty())
+          continue;
+        parseExtension(Chunk, Out.Sequence.back().Extensions);
+        continue;
+      }
 
       Out.Sequence.push_back({Chunk});
     }
     return true;
-  };
+  }
+
+  bool parseExtension(
+      llvm::StringRef Content,
+      std::vector<std::pair<llvm::StringRef, llvm::StringRef>> &Out) {
+    assert(Content.startswith("[") && Content.endswith("]"));
+    for (llvm::StringRef ExtText :
+         llvm::split(Content.drop_front().drop_back(), ';')) {
+      auto KV = ExtText.split('=');
+      if (KV.first == ExtText) { // no separator in Line
+        Diagnostics.push_back(
+            llvm::formatv("Failed to parse extension '{0}': no separator =",
+                          ExtText)
+                .str());
+        return false;
+      }
+      Out.push_back({KV.first, KV.second.trim()});
+    }
+    return true;
+  }
+  // Apply the parsed extensions (stored in RuleSpec) to the grammar Rule.
+  void applyExtension(llvm::ArrayRef<RuleSpec> Specs, GrammarTable &T) {
+    assert(T.Rules.size() == Specs.size());
+    assert(llvm::is_sorted(T.Extensions));
+    auto LookupExtensionID = [&T](llvm::StringRef Name) {
+      const auto It = llvm::partition_point(
+          T.Extensions, [&](llvm::StringRef X) { return X < Name; });
+      assert(It != T.Extensions.end() && *It == Name &&
+             "Didn't find the symbol in AttrValues!");
+      return It - T.Extensions.begin();
+    };
+    for (unsigned I = 0; I < Specs.size(); ++I) {
+      for (const auto &KV : Specs[I].Sequence.back().Extensions) {
+        if (KV.first == "guard") {
+          T.Rules[I].Guard = LookupExtensionID(KV.second);
+          continue;
+        }
+        Diagnostics.push_back(
+            llvm::formatv("Unknown extension key '{0}'", KV.first).str());
+      }
+    }
+  }
 
   // Inlines all _opt symbols.
   // For example, a rule E := id +_opt id, after elimination, we have two
Index: clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
===================================================================
--- clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
+++ clang-tools-extra/pseudo/lib/grammar/Grammar.cpp
@@ -61,6 +61,8 @@
   OS << symbolName(R.Target) << " :=";
   for (SymbolID SID : R.seq())
     OS << " " << symbolName(SID);
+  if (R.Guard)
+    OS << " [guard=" << T->Extensions[R.Guard] << "]";
   return Result;
 }
 
Index: clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
===================================================================
--- clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
+++ clang-tools-extra/pseudo/include/clang-pseudo/Grammar.h
@@ -19,6 +19,20 @@
 //  production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
 //  nonterminal or terminal, identified by a SymbolID.
 //
+//  The grammar supports extensions, which have the syntax form of
+//  [key=value;key=value]. Extensions are associated with a grammar symbol (
+//  on the right-hand side of the symbol) or a grammar rule (at the end of the
+//  rule body).
+//
+//  Extensions provide a way to inject custom code into the general GLR
+//  parser. For example,  we have a rule in the C++ grammar:
+//
+//    contextual-override := IDENTIFIER [guard=Override]
+//
+//  This rule is guarded -- the reduction of this rule will be conducted by the
+//  GLR parser only if the IDENTIFIER content is `override` (see detail
+//  implementationin CXX.h).
+//
 //  Notions about the BNF grammar:
 //  - "_" is the start symbol of the augmented grammar;
 //  - single-line comment is supported, starting with a #
@@ -69,6 +83,11 @@
   return TokenFlag | static_cast<SymbolID>(TK);
 }
 
+// An extension uniquely identifies an extension in a grammar.
+// It is the index into a table of extension values.
+// NOTE: value among extensions must be unique even within different keys!
+using ExtensionID = uint16_t;
+
 // A RuleID uniquely identifies a production rule in a grammar.
 // It is an index into a table of rules.
 using RuleID = uint16_t;
@@ -96,11 +115,16 @@
   uint8_t Size : SizeBits; // Size of the Sequence
   SymbolID Sequence[MaxElements];
 
+  // A guard extension controls whether a reduction of a rule will be conducted
+  // by the GLR parser.
+  // 0 is sentinel extension ID, indicating no extensions.
+  ExtensionID Guard = 0;
+
   llvm::ArrayRef<SymbolID> seq() const {
     return llvm::ArrayRef<SymbolID>(Sequence, Size);
   }
   friend bool operator==(const Rule &L, const Rule &R) {
-    return L.Target == R.Target && L.seq() == R.seq();
+    return L.Target == R.Target && L.seq() == R.seq() && L.Guard == R.Guard;
   }
 };
 
@@ -186,6 +210,9 @@
   // A table of nonterminals, sorted by name.
   // SymbolID is the index of the table.
   std::vector<Nonterminal> Nonterminals;
+  // A table of extensions values, sorted by name.
+  // ExtensionID is the index of the table.
+  std::vector<std::string> Extensions;
 };
 
 } // namespace pseudo

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D126536: [pseudo] Add grammar annotations support.

Reply via email to