llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-tablegen

Author: David Zbarsky (dzbarsky)

<details>
<summary>Changes</summary>

Generate each distinct builtin diagnostic description once in the existing 
`DiagnosticStableIDs.inc` output, emit pointer-free character storage, and 
store its 20-bit offset and 12-bit length in `StaticDiagInfoRec`. Diagtool also 
stores each diagnostic name once, uses naturally aligned six-byte 
`DiagnosticRecord` entries, and replaces the duplicate ID-sorted record table 
with a direct `uint16_t` index, without changing the generated `DIAG` macro 
signature.

On arm64 macOS Release builds, stripped standalone clang decreases by 49,536 
bytes and the LLVM 22 stripped all-tools multicall binary decreases by 49,544 
bytes and linked fixups decrease from 14,604 to 4.

Work towards #<!-- -->202616

AI tool disclosure: Co-authored with OpenAI Codex.


---
Full diff: https://github.com/llvm/llvm-project/pull/202624.diff


6 Files Affected:

- (modified) clang/lib/Basic/DiagnosticIDs.cpp (+16-42) 
- (modified) clang/tools/diagtool/DiagnosticNames.cpp (+44-30) 
- (modified) clang/tools/diagtool/DiagnosticNames.h (+70-77) 
- (modified) clang/utils/TableGen/ClangDiagnosticsEmitter.cpp (+25-2) 
- (modified) llvm/include/llvm/TableGen/StringToOffsetTable.h (+6) 
- (modified) llvm/lib/TableGen/StringToOffsetTable.cpp (+10-5) 


``````````diff
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp 
b/clang/lib/Basic/DiagnosticIDs.cpp
index d9c5e4082c1a7..2ebe3b5f5cc4e 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -36,40 +36,8 @@ struct StaticDiagInfoRec;
 #include "clang/Basic/DiagnosticStableIDs.inc"
 #undef GET_DIAG_STABLE_ID_ARRAYS
 
-// Store the descriptions in a separate table to avoid pointers that need to
-// be relocated, and also decrease the amount of data needed on 64-bit
-// platforms. See "How To Write Shared Libraries" by Ulrich Drepper.
-struct StaticDiagInfoDescriptionStringTable {
-#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR,     
\
-             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, 
\
-             LEGACY_STABLE_IDS)                                                
\
-  char ENUM##_desc[sizeof(DESC)];
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
-const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = {
-#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR,     
\
-             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, 
\
-             LEGACY_STABLE_IDS)                                                
\
-  DESC,
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
 extern const StaticDiagInfoRec StaticDiagInfo[];
 
-// Stored separately from StaticDiagInfoRec to pack better.  Otherwise,
-// StaticDiagInfoRec would have extra padding on 64-bit platforms.
-const uint32_t StaticDiagInfoDescriptionOffsets[] = {
-#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR,     
\
-             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, 
\
-             LEGACY_STABLE_IDS)                                                
\
-  offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc),
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
 const uint32_t StaticDiagInfoStableIDOffsets[] = {
 #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR,     
\
              SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, 
\
@@ -111,25 +79,26 @@ struct StaticDiagInfoRec {
   uint16_t WarnNoWerror : 1;
   LLVM_PREFERRED_TYPE(bool)
   uint16_t WarnShowInSystemHeader : 1;
+  LLVM_PREFERRED_TYPE(diag::Group)
+  uint16_t OptionGroupIndex : 14;
   LLVM_PREFERRED_TYPE(bool)
   uint16_t WarnShowInSystemMacro : 1;
-
-  LLVM_PREFERRED_TYPE(diag::Group)
-  uint16_t OptionGroupIndex : 15;
   LLVM_PREFERRED_TYPE(bool)
   uint16_t Deferrable : 1;
 
-  uint16_t DescriptionLen;
+  uint16_t DescriptionOffsetLow;
+  uint16_t DescriptionOffsetHigh : 4;
+  uint16_t DescriptionLen : 12;
 
   unsigned getOptionGroupIndex() const {
     return OptionGroupIndex;
   }
 
   StringRef getDescription() const {
-    size_t MyIndex = this - &StaticDiagInfo[0];
-    uint32_t StringOffset = StaticDiagInfoDescriptionOffsets[MyIndex];
-    const char* Table = reinterpret_cast<const 
char*>(&StaticDiagInfoDescriptions);
-    return StringRef(&Table[StringOffset], DescriptionLen);
+    uint32_t Offset =
+        DescriptionOffsetLow | (uint32_t(DescriptionOffsetHigh) << 16);
+    return StringRef(StaticDiagInfoDescriptionsStorage + Offset,
+                     DescriptionLen);
   }
 
   StringRef getStableID() const {
@@ -159,6 +128,9 @@ struct StaticDiagInfoRec {
     return DiagID < RHS.DiagID;
   }
 };
+static_assert(sizeof(StaticDiagInfoRec) == 10);
+static_assert(static_cast<unsigned>(diag::Group::NUM_GROUPS) < (1U << 14),
+              "too many diagnostic groups for StaticDiagInfoRec");
 
 #define STRINGIFY_NAME(NAME) #NAME
 #define VALIDATE_DIAG_SIZE(NAME)                                               
\
@@ -200,9 +172,11 @@ const StaticDiagInfoRec StaticDiagInfo[] = {
       CATEGORY,                                                                
\
       NOWERROR,                                                                
\
       SHOWINSYSHEADER,                                                         
\
-      SHOWINSYSMACRO,                                                          
\
       GROUP,                                                                   
\
-           DEFERRABLE,                                                         
     \
+      SHOWINSYSMACRO,                                                          
\
+      DEFERRABLE,                                                              
\
+      uint16_t(DIAG_DESC_OFFSET_##ENUM),                                       
\
+      uint16_t(DIAG_DESC_OFFSET_##ENUM >> 16),                                 
\
       STR_SIZE(DESC, uint16_t)},
 #include "clang/Basic/DiagnosticCommonKinds.inc"
 #include "clang/Basic/DiagnosticDriverKinds.inc"
diff --git a/clang/tools/diagtool/DiagnosticNames.cpp 
b/clang/tools/diagtool/DiagnosticNames.cpp
index 1538167022aca..0c5f6bc771cff 100644
--- a/clang/tools/diagtool/DiagnosticNames.cpp
+++ b/clang/tools/diagtool/DiagnosticNames.cpp
@@ -10,53 +10,67 @@
 #include "clang/Basic/AllDiagnostics.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringTable.h"
+#include <array>
+#include <cstddef>
+#include <cstdint>
 
 using namespace clang;
 using namespace diagtool;
 
-static const DiagnosticRecord BuiltinDiagnosticsByName[] = {
-#define DIAG_NAME_INDEX(ENUM) { #ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t) },
+struct BuiltinDiagnosticNameStorage {
+#define DIAG_NAME_INDEX(ENUM) char ENUM[sizeof(#ENUM)];
 #include "clang/Basic/DiagnosticIndexName.inc"
 #undef DIAG_NAME_INDEX
 };
 
+static_assert(sizeof(BuiltinDiagnosticNameStorage) <= uint64_t(1) << 24);
+
+static constexpr BuiltinDiagnosticNameStorage BuiltinDiagnosticNames = {
+#define DIAG_NAME_INDEX(ENUM) #ENUM,
+#include "clang/Basic/DiagnosticIndexName.inc"
+#undef DIAG_NAME_INDEX
+};
+
+#define DIAGNOSTIC_RECORD(ENUM)                                                
\
+  {diag::ENUM, uint16_t(offsetof(BuiltinDiagnosticNameStorage, ENUM)),         
\
+   uint8_t(offsetof(BuiltinDiagnosticNameStorage, ENUM) >> 16),                
\
+   STR_SIZE(#ENUM, uint8_t)}
+
+static constexpr DiagnosticRecord BuiltinDiagnosticsByName[] = {
+#define DIAG_NAME_INDEX(ENUM) DIAGNOSTIC_RECORD(ENUM),
+#include "clang/Basic/DiagnosticIndexName.inc"
+#undef DIAG_NAME_INDEX
+};
+static_assert(std::size(BuiltinDiagnosticsByName) < (1U << 16));
+#undef DIAGNOSTIC_RECORD
+
 llvm::ArrayRef<DiagnosticRecord> diagtool::getBuiltinDiagnosticsByName() {
   return llvm::ArrayRef(BuiltinDiagnosticsByName);
 }
 
-// FIXME: Is it worth having two tables, especially when this one can get
-// out of sync easily?
-static const DiagnosticRecord BuiltinDiagnosticsByID[] = {
-#define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR,      
\
-             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY, STABLE_ID,      
\
-             LEGACY_STABLE_IDS)                                                
\
-  {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)},
-#include "clang/Basic/AllDiagnosticKinds.inc"
-#undef DIAG
-};
-
-static bool orderByID(const DiagnosticRecord &Left,
-                      const DiagnosticRecord &Right) {
-  return Left.DiagID < Right.DiagID;
+static constexpr auto BuiltinDiagnosticIndexByID = [] {
+  std::array<uint16_t, diag::DIAG_UPPER_LIMIT> Result = {};
+  uint16_t Index = 0;
+#define DIAG_NAME_INDEX(ENUM) Result[diag::ENUM] = ++Index;
+#include "clang/Basic/DiagnosticIndexName.inc"
+#undef DIAG_NAME_INDEX
+  return Result;
+}();
+
+llvm::StringRef DiagnosticRecord::getName() const {
+  const char *Names = reinterpret_cast<const char *>(&BuiltinDiagnosticNames);
+  uint32_t NameOffset =
+      uint32_t(NameOffsetLow) | (uint32_t(NameOffsetHigh) << 16);
+  return llvm::StringRef(Names + NameOffset, NameLen);
 }
 
 const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) {
-  DiagnosticRecord Key = {nullptr, DiagID, 0};
-
-  // The requirement for lower_bound to produce a valid result it is
-  // enough if the BuiltinDiagnosticsByID is partitioned (by DiagID),
-  // but as we want this function to work for all possible values of
-  // DiagID sent in as argument it is better to right away check if
-  // BuiltinDiagnosticsByID is sorted.
-  assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) &&
-         "IDs in BuiltinDiagnosticsByID must be sorted.");
-  const DiagnosticRecord *Result =
-      llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID);
-  assert(Result && "diagnostic not found; table may be out of date");
-  return *Result;
+  assert(DiagID >= 0 &&
+         static_cast<unsigned>(DiagID) < BuiltinDiagnosticIndexByID.size() &&
+         BuiltinDiagnosticIndexByID[DiagID] != 0 && "diagnostic not found");
+  return BuiltinDiagnosticsByName[BuiltinDiagnosticIndexByID[DiagID] - 1];
 }
 
-
 #define GET_DIAG_ARRAYS
 #include "clang/Basic/DiagnosticGroups.inc"
 #undef GET_DIAG_ARRAYS
diff --git a/clang/tools/diagtool/DiagnosticNames.h 
b/clang/tools/diagtool/DiagnosticNames.h
index f541e88577cc5..52622875175af 100644
--- a/clang/tools/diagtool/DiagnosticNames.h
+++ b/clang/tools/diagtool/DiagnosticNames.h
@@ -15,91 +15,84 @@
 
 namespace diagtool {
 
-  struct DiagnosticRecord {
-    const char *NameStr;
-    short DiagID;
-    uint8_t NameLen;
+struct DiagnosticRecord {
+  short DiagID;
+  uint16_t NameOffsetLow;
+  uint8_t NameOffsetHigh;
+  uint8_t NameLen;
 
-    llvm::StringRef getName() const {
-      return llvm::StringRef(NameStr, NameLen);
+  llvm::StringRef getName() const;
+
+  bool operator<(const DiagnosticRecord &Other) const {
+    return getName() < Other.getName();
+  }
+};
+static_assert(sizeof(DiagnosticRecord) == 6,
+              "DiagnosticRecord must remain compact");
+
+/// Get every diagnostic in the system, sorted by name.
+llvm::ArrayRef<DiagnosticRecord> getBuiltinDiagnosticsByName();
+
+/// Get a diagnostic by its ID.
+const DiagnosticRecord &getDiagnosticForID(short DiagID);
+
+struct GroupRecord {
+  uint16_t NameOffset;
+  uint16_t Members;
+  uint16_t SubGroups;
+
+  llvm::StringRef getName() const;
+
+  template <typename RecordType> class group_iterator {
+    const short *CurrentID;
+
+    friend struct GroupRecord;
+    group_iterator(const short *Start) : CurrentID(Start) {
+      if (CurrentID && *CurrentID == -1)
+        CurrentID = nullptr;
     }
 
-    bool operator<(const DiagnosticRecord &Other) const {
-      return getName() < Other.getName();
+  public:
+    typedef RecordType value_type;
+    typedef const value_type &reference;
+    typedef const value_type *pointer;
+    typedef std::forward_iterator_tag iterator_category;
+    typedef std::ptrdiff_t difference_type;
+
+    inline reference operator*() const;
+    inline pointer operator->() const { return &operator*(); }
+
+    inline short getID() const { return *CurrentID; }
+
+    group_iterator &operator++() {
+      ++CurrentID;
+      if (*CurrentID == -1)
+        CurrentID = nullptr;
+      return *this;
+    }
+
+    bool operator==(const group_iterator &Other) const {
+      return CurrentID == Other.CurrentID;
     }
-  };
 
-  /// Get every diagnostic in the system, sorted by name.
-  llvm::ArrayRef<DiagnosticRecord> getBuiltinDiagnosticsByName();
-
-  /// Get a diagnostic by its ID.
-  const DiagnosticRecord &getDiagnosticForID(short DiagID);
-
-
-  struct GroupRecord {
-    uint16_t NameOffset;
-    uint16_t Members;
-    uint16_t SubGroups;
-
-    llvm::StringRef getName() const;
-
-    template<typename RecordType>
-    class group_iterator {
-      const short *CurrentID;
-
-      friend struct GroupRecord;
-      group_iterator(const short *Start) : CurrentID(Start) {
-        if (CurrentID && *CurrentID == -1)
-          CurrentID = nullptr;
-      }
-
-    public:
-      typedef RecordType                 value_type;
-      typedef const value_type &         reference;
-      typedef const value_type *         pointer;
-      typedef std::forward_iterator_tag  iterator_category;
-      typedef std::ptrdiff_t             difference_type;
-
-      inline reference operator*() const;
-      inline pointer operator->() const {
-        return &operator*();
-      }
-
-      inline short getID() const {
-        return *CurrentID;
-      }
-
-      group_iterator &operator++() {
-        ++CurrentID;
-        if (*CurrentID == -1)
-          CurrentID = nullptr;
-        return *this;
-      }
-
-      bool operator==(const group_iterator &Other) const {
-        return CurrentID == Other.CurrentID;
-      }
-
-      bool operator!=(const group_iterator &Other) const {
-        return CurrentID != Other.CurrentID;
-      }
-    };
-
-    typedef group_iterator<GroupRecord> subgroup_iterator;
-    subgroup_iterator subgroup_begin() const;
-    subgroup_iterator subgroup_end() const;
-    llvm::iterator_range<subgroup_iterator> subgroups() const;
-
-    typedef group_iterator<DiagnosticRecord> diagnostics_iterator;
-    diagnostics_iterator diagnostics_begin() const;
-    diagnostics_iterator diagnostics_end() const;
-    llvm::iterator_range<diagnostics_iterator> diagnostics() const;
-
-    bool operator<(llvm::StringRef Other) const {
-      return getName() < Other;
+    bool operator!=(const group_iterator &Other) const {
+      return CurrentID != Other.CurrentID;
     }
   };
 
+  typedef group_iterator<GroupRecord> subgroup_iterator;
+  subgroup_iterator subgroup_begin() const;
+  subgroup_iterator subgroup_end() const;
+  llvm::iterator_range<subgroup_iterator> subgroups() const;
+
+  typedef group_iterator<DiagnosticRecord> diagnostics_iterator;
+  diagnostics_iterator diagnostics_begin() const;
+  diagnostics_iterator diagnostics_end() const;
+  llvm::iterator_range<diagnostics_iterator> diagnostics() const;
+
+  bool operator<(llvm::StringRef Other) const { return getName() < Other; }
+};
+
   /// Get every diagnostic group in the system, sorted by name.
   llvm::ArrayRef<GroupRecord> getDiagnosticGroups();
 
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp 
b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index ba10be060c20a..193cfa69e8d88 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -1736,10 +1736,11 @@ class DiagStableIDsMap {
   ///     static constexpr llvm::StringTable DiagStableIds;
   ///  #endif
   /// \endcode
-  void emit(raw_ostream &OS) const {
+  void emit(raw_ostream &OS, const RecordKeeper &Records) const {
     OS << "\n#ifdef GET_DIAG_STABLE_ID_ARRAYS\n";
     emitStableIDs(OS);
     emitLegacyStableIDs(OS);
+    emitDescriptions(OS, Records);
     OS << "#endif // GET_DIAG_STABLE_ID_ARRAYS\n\n";
   }
 
@@ -1795,6 +1796,28 @@ class DiagStableIDsMap {
     }
     OS << "};\n\n";
   }
+
+  static void emitDescriptions(raw_ostream &OS, const RecordKeeper &Records) {
+    DiagnosticTextBuilder DiagTextBuilder(Records);
+    StringToOffsetTable Descriptions;
+
+    OS << "enum {\n";
+    for (const Record &R :
+         make_pointee_range(Records.getAllDerivedDefinitions("Diagnostic"))) {
+      std::string Description = DiagTextBuilder.buildForDefinition(&R);
+      if (Description.size() >= (1U << 12))
+        PrintFatalError(R.getLoc(),
+                        "diagnostic description exceeds 12-bit length limit");
+
+      unsigned Offset = Descriptions.GetOrAddStringOffset(Description);
+      OS << "  DIAG_DESC_OFFSET_" << R.getName() << " = " << Offset << ",\n";
+    }
+    OS << "};\n\n";
+
+    if (Descriptions.size() >= (1U << 20))
+      PrintFatalError("diagnostic descriptions exceed 20-bit offset limit");
+    Descriptions.EmitStringTableStorageDef(OS, "StaticDiagInfoDescriptions");
+  }
 };
 } // namespace
 
@@ -1802,7 +1825,7 @@ class DiagStableIDsMap {
 void clang::EmitClangDiagsStableIDs(const RecordKeeper &Records,
                                     raw_ostream &OS) {
   DiagStableIDsMap StableIDs(Records);
-  StableIDs.emit(OS);
+  StableIDs.emit(OS, Records);
 }
 
 /// ClangDiagsDefsEmitter - The top-level class emits .def files containing
diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h 
b/llvm/include/llvm/TableGen/StringToOffsetTable.h
index 5ee5fb538f3eb..fcf517ad56918 100644
--- a/llvm/include/llvm/TableGen/StringToOffsetTable.h
+++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h
@@ -67,6 +67,12 @@ class StringToOffsetTable {
   // valid identifiers to declare.
   void EmitStringTableDef(raw_ostream &OS, const Twine &Name) const;
 
+  // Emit only the character storage used by a string table definition.
+  //
+  // This preserves the large-string handling of EmitStringTableDef without
+  // emitting an llvm::StringTable object.
+  void EmitStringTableStorageDef(raw_ostream &OS, const Twine &Name) const;
+
   // Emit the string as one single string.
   void EmitString(raw_ostream &O) const;
 };
diff --git a/llvm/lib/TableGen/StringToOffsetTable.cpp 
b/llvm/lib/TableGen/StringToOffsetTable.cpp
index 06d8240486245..826752c23ff64 100644
--- a/llvm/lib/TableGen/StringToOffsetTable.cpp
+++ b/llvm/lib/TableGen/StringToOffsetTable.cpp
@@ -26,10 +26,9 @@ unsigned StringToOffsetTable::GetOrAddStringOffset(StringRef 
Str) {
   return II->second;
 }
 
-void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS,
-                                             const Twine &Name) const {
-  // This generates a `llvm::StringTable` which expects that entries are null
-  // terminated. So fail with an error if `AppendZero` is false.
+void StringToOffsetTable::EmitStringTableStorageDef(raw_ostream &OS,
+                                                    const Twine &Name) const {
+  // String table entries must be null terminated.
   if (!AppendZero)
     PrintFatalError("llvm::StringTable requires null terminated strings");
 
@@ -79,11 +78,17 @@ void StringToOffsetTable::EmitStringTableDef(raw_ostream 
&OS,
   }
   OS << LineSep << (UseChars ? "};" : "  ;");
 
-  OS << formatv(R"(
+  OS << R"(
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
 #endif
+)";
+}
 
+void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS,
+                                             const Twine &Name) const {
+  EmitStringTableStorageDef(OS, Name);
+  OS << formatv(R"(
 {1} llvm::StringTable
 {2}{0} = {0}Storage;
 )",

``````````

</details>


https://github.com/llvm/llvm-project/pull/202624
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to