[clang] [clang-format] Skip protected data blocks in Verilog (PR #190695)

via cfe-commits Mon, 06 Apr 2026 15:05:01 -0700

llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-format

Author: sstwcw

<details>
<summary>Changes</summary>

A Verilog file can have encrypted stuff (sections 34 and O in the spec). This 
patch makes the formatter skip it.  Previously the formatter could mess it up 
by treating it as ordinary code.

Now the entire block following the `pragma protect` line is treated as a single 
token.

The keywords added in this patch only mean special things in the pragma lines.  
Thus they are not added to `VerilogExtraKeywords`.

While the files containing the stuff are machine generated, it is a bad idea 
for a formatter to break code.  For example, one may wish to run the formatter 
on an entire project containing both ordinary and encrypted files.  Another use 
case is formatting the prototypes in files that contain clear text prototypes 
and an encrypted implementation.

---
Full diff: https://github.com/llvm/llvm-project/pull/190695.diff


6 Files Affected:

- (modified) clang/lib/Format/FormatToken.h (+22-1) 
- (modified) clang/lib/Format/FormatTokenLexer.cpp (+58-9) 
- (modified) clang/lib/Format/FormatTokenLexer.h (+7-1) 
- (modified) clang/lib/Format/UnwrappedLineParser.cpp (+9) 
- (modified) clang/unittests/Format/FormatTestVerilog.cpp (+62) 
- (modified) clang/unittests/Format/TokenAnnotatorTest.cpp (+55) 


``````````diff
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index 240bb31148f6c..22e74fbb14728 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -222,6 +222,9 @@ namespace format {
   TYPE(VerilogMultiLineListLParen)                                             
\
   /* for the base in a number literal, not including the quote */              
\
   TYPE(VerilogNumberBase)                                                      
\
+  /* The text that is in the opaque protected block. Like the text between     
\
+   * 'pragma protect data_block' and 'pragma protect end_protected'. */        
\
+  TYPE(VerilogProtected)                                                       
\
   /* like `(strong1, pull0)` */                                                
\
   TYPE(VerilogStrength)                                                        
\
   /* Things inside the table in user-defined primitives. */                    
\
@@ -1202,6 +1205,9 @@ struct AdditionalKeywords {
     kw_cover = &IdentTable.get("cover");
     kw_covergroup = &IdentTable.get("covergroup");
     kw_coverpoint = &IdentTable.get("coverpoint");
+    kw_data_block = &IdentTable.get("data_block");
+    kw_data_decrypt_key = &IdentTable.get("data_decrypt_key");
+    kw_data_public_key = &IdentTable.get("data_public_key");
     kw_default_decay_time = &IdentTable.get("default_decay_time");
     kw_default_nettype = &IdentTable.get("default_nettype");
     kw_default_trireg_strength = &IdentTable.get("default_trireg_strength");
@@ -1209,6 +1215,9 @@ struct AdditionalKeywords {
     kw_delay_mode_path = &IdentTable.get("delay_mode_path");
     kw_delay_mode_unit = &IdentTable.get("delay_mode_unit");
     kw_delay_mode_zero = &IdentTable.get("delay_mode_zero");
+    kw_digest_block = &IdentTable.get("digest_block");
+    kw_digest_decrypt_key = &IdentTable.get("digest_decrypt_key");
+    kw_digest_public_key = &IdentTable.get("digest_public_key");
     kw_disable = &IdentTable.get("disable");
     kw_dist = &IdentTable.get("dist");
     kw_edge = &IdentTable.get("edge");
@@ -1251,6 +1260,8 @@ struct AdditionalKeywords {
     kw_join = &IdentTable.get("join");
     kw_join_any = &IdentTable.get("join_any");
     kw_join_none = &IdentTable.get("join_none");
+    kw_key_block = &IdentTable.get("key_block");
+    kw_key_public_key = &IdentTable.get("key_public_key");
     kw_large = &IdentTable.get("large");
     kw_local = &IdentTable.get("local");
     kw_localparam = &IdentTable.get("localparam");
@@ -1267,6 +1278,7 @@ struct AdditionalKeywords {
     kw_priority = &IdentTable.get("priority");
     kw_program = &IdentTable.get("program");
     kw_property = &IdentTable.get("property");
+    kw_protect = &IdentTable.get("protect");
     kw_pull0 = &IdentTable.get("pull0");
     kw_pull1 = &IdentTable.get("pull1");
     kw_pure = &IdentTable.get("pure");
@@ -1621,6 +1633,9 @@ struct AdditionalKeywords {
   IdentifierInfo *kw_cover;
   IdentifierInfo *kw_covergroup;
   IdentifierInfo *kw_coverpoint;
+  IdentifierInfo *kw_data_block;
+  IdentifierInfo *kw_data_decrypt_key;
+  IdentifierInfo *kw_data_public_key;
   IdentifierInfo *kw_default_decay_time;
   IdentifierInfo *kw_default_nettype;
   IdentifierInfo *kw_default_trireg_strength;
@@ -1628,10 +1643,13 @@ struct AdditionalKeywords {
   IdentifierInfo *kw_delay_mode_path;
   IdentifierInfo *kw_delay_mode_unit;
   IdentifierInfo *kw_delay_mode_zero;
+  IdentifierInfo *kw_digest_block;
+  IdentifierInfo *kw_digest_decrypt_key;
+  IdentifierInfo *kw_digest_public_key;
   IdentifierInfo *kw_disable;
   IdentifierInfo *kw_dist;
-  IdentifierInfo *kw_elsif;
   IdentifierInfo *kw_edge;
+  IdentifierInfo *kw_elsif;
   IdentifierInfo *kw_end;
   IdentifierInfo *kw_end_keywords;
   IdentifierInfo *kw_endcase;
@@ -1670,6 +1688,8 @@ struct AdditionalKeywords {
   IdentifierInfo *kw_join;
   IdentifierInfo *kw_join_any;
   IdentifierInfo *kw_join_none;
+  IdentifierInfo *kw_key_block;
+  IdentifierInfo *kw_key_public_key;
   IdentifierInfo *kw_large;
   IdentifierInfo *kw_local;
   IdentifierInfo *kw_localparam;
@@ -1686,6 +1706,7 @@ struct AdditionalKeywords {
   IdentifierInfo *kw_priority;
   IdentifierInfo *kw_program;
   IdentifierInfo *kw_property;
+  IdentifierInfo *kw_protect;
   IdentifierInfo *kw_pull0;
   IdentifierInfo *kw_pull1;
   IdentifierInfo *kw_pure;
diff --git a/clang/lib/Format/FormatTokenLexer.cpp 
b/clang/lib/Format/FormatTokenLexer.cpp
index 0dc6f776aeca0..bda591ab1027e 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -35,7 +35,7 @@ FormatTokenLexer::FormatTokenLexer(
       Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
       FormattingDisabled(false), FormatOffRegex(Style.OneLineFormatOffRegex),
       MacroBlockBeginRegex(Style.MacroBlockBegin),
-      MacroBlockEndRegex(Style.MacroBlockEnd) {
+      MacroBlockEndRegex(Style.MacroBlockEnd), VerilogProtectedBlock(false) {
   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
   Lex->SetKeepWhitespaceMode(true);
 
@@ -1391,8 +1391,22 @@ FormatToken *FormatTokenLexer::getNextToken() {
       FormatTok->Tok.setKind(tok::identifier);
     } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
       FormatTok->Tok.setKind(tok::identifier);
-    } else if (Style.isVerilog() && Keywords.isVerilogIdentifier(*FormatTok)) {
-      FormatTok->Tok.setKind(tok::identifier);
+    } else if (Style.isVerilog()) {
+      if (Keywords.isVerilogIdentifier(*FormatTok))
+        FormatTok->Tok.setKind(tok::identifier);
+      // Look for the protect line. The next lines needs to be lexed as a 
single
+      // token.
+      if (Tokens.size() - FirstInLineIndex >= 3u &&
+          Tokens[FirstInLineIndex]->is(tok::hash) &&
+          Tokens[FirstInLineIndex + 1u]->is(tok::pp_pragma) &&
+          Tokens[FirstInLineIndex + 2u]->is(Keywords.kw_protect) &&
+          FormatTok->isOneOf(
+              Keywords.kw_data_block, Keywords.kw_data_decrypt_key,
+              Keywords.kw_data_public_key, Keywords.kw_digest_block,
+              Keywords.kw_digest_decrypt_key, Keywords.kw_digest_public_key,
+              Keywords.kw_key_block, Keywords.kw_key_public_key)) {
+        VerilogProtectedBlock = true;
+      }
     }
   } else if (const bool Greater = FormatTok->is(tok::greatergreater);
              Greater || FormatTok->is(tok::lessless)) {
@@ -1469,7 +1483,42 @@ FormatToken *FormatTokenLexer::getNextToken() {
   return FormatTok;
 }
 
-bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
+bool FormatTokenLexer::readVerilogProtected(FormatToken &Tok) {
+  // The block follows the pragma line.
+  if (!VerilogProtectedBlock || Tok.NewlinesBefore == 0)
+    return false;
+  VerilogProtectedBlock = false;
+
+  // The block can be empty. Then no token is necessary. A backtick on its own
+  // line is likely a uuencode line. A backtick followed by something is 
assumed
+  // to be the pragma line that ends the block.
+  const char *const Start = Lex->getBufferLocation();
+  size_t Len = Lex->getBuffer().end() - Start;
+  if (Len == 0 ||
+      (Len >= 2 && Start[0] == '`' && !isVerticalWhitespace(Start[1]))) {
+    return false;
+  }
+
+  // The block ends when the next pragma line starts.
+  static const llvm::Regex NextDirective("[\n\r][ \t]*`[^\n\r]");
+  SmallVector<StringRef, 1> Matches;
+  if (NextDirective.match(StringRef(Start, Len), &Matches)) {
+    assert(Matches.size() == 1);
+    Len = Matches[0].begin() - Start;
+  }
+
+  Tok.Tok.setKind(tok::string_literal);
+  Tok.Tok.setLength(Len);
+  Tok.Tok.setLocation(Lex->getSourceLocation(Start, Len));
+  Tok.setFinalizedType(TT_VerilogProtected);
+  Lex->seek(Lex->getCurrentBufferOffset() + Len,
+            /*IsAtStartOfLine=*/false);
+  return true;
+}
+
+bool FormatTokenLexer::readRawTokenVerilogSpecific(FormatToken &Tok) {
+  if (readVerilogProtected(Tok))
+    return true;
   const char *Start = Lex->getBufferLocation();
   size_t Len;
   switch (Start[0]) {
@@ -1519,10 +1568,10 @@ bool 
FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
   // The kind has to be an identifier so we can match it against those defined
   // in Keywords. The kind has to be set before the length because the 
setLength
   // function checks that the kind is not an annotation.
-  Tok.setKind(tok::raw_identifier);
-  Tok.setLength(Len);
-  Tok.setLocation(Lex->getSourceLocation(Start, Len));
-  Tok.setRawIdentifierData(Start);
+  Tok.Tok.setKind(tok::raw_identifier);
+  Tok.Tok.setLength(Len);
+  Tok.Tok.setLocation(Lex->getSourceLocation(Start, Len));
+  Tok.Tok.setRawIdentifierData(Start);
   Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
   return true;
 }
@@ -1530,7 +1579,7 @@ bool FormatTokenLexer::readRawTokenVerilogSpecific(Token 
&Tok) {
 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
   // For Verilog, first see if there is a special token, and fall back to the
   // normal lexer if there isn't one.
-  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
+  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok))
     Lex->LexFromRawLexer(Tok.Tok);
   Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                             Tok.Tok.getLength());
diff --git a/clang/lib/Format/FormatTokenLexer.h 
b/clang/lib/Format/FormatTokenLexer.h
index 4141e1434f72f..9f5b735efe1d0 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -140,11 +140,17 @@ class FormatTokenLexer {
   llvm::Regex MacroBlockBeginRegex;
   llvm::Regex MacroBlockEndRegex;
 
+  // The next line is a Verilog protected block that should not be split into
+  // tokens. Set at the 'pragma protect' line. Cleared at the next line.
+  bool VerilogProtectedBlock;
+
   // Targets that may appear inside a C# attribute.
   static const llvm::StringSet<> CSharpAttributeTargets;
 
+  /// Handle Verilog opaque protected stuff.
+  bool readVerilogProtected(FormatToken &Tok);
   /// Handle Verilog-specific tokens.
-  bool readRawTokenVerilogSpecific(Token &Tok);
+  bool readRawTokenVerilogSpecific(FormatToken &Tok);
 
   void readRawToken(FormatToken &Tok);
 
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp 
b/clang/lib/Format/UnwrappedLineParser.cpp
index c44cbebfbc598..0ab11f69d9782 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1882,6 +1882,15 @@ void UnwrappedLineParser::parseStructuralElement(
     case tok::r_brace:
       addUnwrappedLine();
       return;
+    case tok::string_literal:
+      if (Style.isVerilog() && FormatTok->is(TT_VerilogProtected)) {
+        FormatTok->Finalized = true;
+        nextToken();
+        addUnwrappedLine();
+        return;
+      }
+      nextToken();
+      break;
     case tok::l_paren: {
       parseParens();
       // Break the unwrapped line if a K&R C function definition has a 
parameter
diff --git a/clang/unittests/Format/FormatTestVerilog.cpp 
b/clang/unittests/Format/FormatTestVerilog.cpp
index f407fc36c3a12..bd3009c3b2b25 100644
--- a/clang/unittests/Format/FormatTestVerilog.cpp
+++ b/clang/unittests/Format/FormatTestVerilog.cpp
@@ -1253,6 +1253,53 @@ TEST_F(FormatTestVerilog, Primitive) {
                "endprimitive");
 }
 
+TEST_F(FormatTestVerilog, Protected) {
+  // The mess-up function does not know that the pragma needs to be on its own
+  // line. So the 1-argument `verifyFormat` function is avoided here.
+
+  // Stuff inside the block should not change.
+  verifyNoChange("`pragma protect data_block\n"
+                 " 0+\n"
+                 "0=\n"
+                 "`pragma protect end_protected");
+  verifyNoChange("`pragma protect data_block\n"
+                 "`pragma protect end_protected");
+  verifyNoChange("`pragma protect data_block\n"
+                 " 0+0=\n"
+                 "`pragma protect end_protected");
+  verifyNoChange("`pragma protect data_block\n"
+                 "0+0=\n"
+                 "`pragma protect end_protected");
+
+  // Stuff around the block should be formatted.
+  verifyFormat("x = 0;\n"
+               "`pragma protect data_block\n"
+               "0+0=\n"
+               "`pragma protect end_protected\n"
+               "x = 0;",
+               "x=0;\n"
+               "`pragma protect data_block\n"
+               "0+0=\n"
+               "`pragma protect end_protected\n"
+               "x=0;");
+  verifyFormat("x = 0;\n"
+               "`pragma protect data_block\n"
+               "`pragma protect end_protected\n"
+               "x = 0;",
+               "x=0;\n"
+               "`pragma protect data_block\n"
+               "`pragma protect end_protected\n"
+               "x=0;");
+
+  // Stuff between `begin` and `end` is ordinary code. It should be formatted.
+  verifyFormat("`pragma protect begin\n"
+               "x = 0;\n"
+               "`pragma protect end",
+               "`pragma protect begin\n"
+               "x=0;\n"
+               "`pragma protect end");
+}
+
 TEST_F(FormatTestVerilog, Streaming) {
   verifyFormat("x = {>>{j}};");
   verifyFormat("x = {>>byte{j}};");
@@ -1340,6 +1387,21 @@ TEST_F(FormatTestVerilog, StringLiteral) {
                 getStyleWithColumns(getDefaultStyle(), 23));
   verifyNoCrash(R"(x(_T("xxxxxxxxxxxxxxxx xxxx"));)",
                 getStyleWithColumns(getDefaultStyle(), 23));
+
+  // The protected block is internally a string literal. But the program should
+  // not try to break it into multiple lines.
+  verifyNoChange("`pragma protect data_block\n"
+                 "'00000000000000000000000000000000000000000+0='\n"
+                 "`pragma protect end_protected",
+                 getStyleWithColumns(getDefaultStyle(), 29));
+  verifyNoChange("`pragma protect data_block\n"
+                 "\"00000000000000000000000000000000000000000+0=\"\n"
+                 "`pragma protect end_protected",
+                 getStyleWithColumns(getDefaultStyle(), 29));
+  verifyNoChange("`pragma protect data_block\n"
+                 "00000000000000000000000000000000000000000+0=\n"
+                 "`pragma protect end_protected",
+                 getStyleWithColumns(getDefaultStyle(), 29));
 }
 
 TEST_F(FormatTestVerilog, StructLiteral) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp 
b/clang/unittests/Format/TokenAnnotatorTest.cpp
index c33a2f4a77fd8..cf2bd9f8bd76a 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3028,6 +3028,61 @@ TEST_F(TokenAnnotatorTest, UnderstandsVerilogOperators) {
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::hash, TT_Unknown);
   EXPECT_TOKEN(Tokens[4], tok::hashhash, TT_Unknown);
+
+  // A protected data block that is not empty should be treated as a single
+  // token.
+  Tokens = Annotate(
+      "`pragma protect data_block, encoding=(enctype=\"base64\", bytes=2)\n"
+      "0+0=\n"
+      "`pragma protect end_protected");
+  ASSERT_EQ(Tokens.size(), 22u) << Tokens;
+  EXPECT_TOKEN(Tokens[16], tok::string_literal, TT_VerilogProtected);
+  Tokens = Annotate(
+      "`pragma protect data_block, encoding=(enctype=\"base64\", bytes=2)\n"
+      "0+0=");
+  ASSERT_EQ(Tokens.size(), 18u) << Tokens;
+  EXPECT_TOKEN(Tokens[16], tok::string_literal, TT_VerilogProtected);
+  EXPECT_EQ(Tokens[16]->TokenText, "0+0=");
+  Tokens = Annotate("`pragma protect data_block\n"
+                    "0\n"
+                    "+0=\n"
+                    "`pragma protect end_protected");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::string_literal, TT_VerilogProtected);
+  EXPECT_EQ(Tokens[4]->TokenText, "0\n+0=");
+  Tokens = Annotate("`pragma protect data_block\n"
+                    "0+0=\n"
+                    "`pragma protect end_protected");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::string_literal, TT_VerilogProtected);
+  EXPECT_EQ(Tokens[4]->TokenText, "0+0=");
+
+  // A protected block that is empty should not have tokens.
+  Tokens = Annotate("`pragma protect data_block");
+  ASSERT_EQ(Tokens.size(), 5u) << Tokens;
+  Tokens = Annotate("`pragma protect data_block\n");
+  ASSERT_EQ(Tokens.size(), 5u) << Tokens;
+  Tokens = Annotate("`pragma protect data_block\n"
+                    "`pragma protect end_protected");
+  ASSERT_EQ(Tokens.size(), 9u) << Tokens;
+
+  // The standard allows uuencode as well. The program should not confuse the
+  // backticks added by the encoding as the end pragma.
+  Tokens = Annotate("`pragma protect data_block\n"
+                    "`\n"
+                    "end\n"
+                    "`pragma protect end_protected");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::string_literal, TT_VerilogProtected);
+  EXPECT_EQ(Tokens[4]->TokenText, "`\nend");
+  Tokens = Annotate("`pragma protect data_block\n"
+                    "!````\n"
+                    "`\n"
+                    "end\n"
+                    "`pragma protect end_protected");
+  ASSERT_EQ(Tokens.size(), 10u) << Tokens;
+  EXPECT_TOKEN(Tokens[4], tok::string_literal, TT_VerilogProtected);
+  EXPECT_EQ(Tokens[4]->TokenText, "!````\n`\nend");
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {

``````````

</details>


https://github.com/llvm/llvm-project/pull/190695
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [clang-format] Skip protected data blocks in Verilog (PR #190695)

Reply via email to