llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-tablegen Author: David Zbarsky (dzbarsky) <details> <summary>Changes</summary> Generate each distinct builtin diagnostic description once in the existing `DiagnosticStableIDs.inc` output, emit pointer-free character storage, and store its 20-bit offset and 12-bit length in `StaticDiagInfoRec`. Diagtool also stores each diagnostic name once, uses naturally aligned six-byte `DiagnosticRecord` entries, and replaces the duplicate ID-sorted record table with a direct `uint16_t` index, without changing the generated `DIAG` macro signature. On arm64 macOS Release builds, stripped standalone clang decreases by 49,536 bytes and the LLVM 22 stripped all-tools multicall binary decreases by 49,544 bytes and linked fixups decrease from 14,604 to 4. Work towards #<!-- -->202616 AI tool disclosure: Co-authored with OpenAI Codex. --- Full diff: https://github.com/llvm/llvm-project/pull/202624.diff 6 Files Affected: - (modified) clang/lib/Basic/DiagnosticIDs.cpp (+16-42) - (modified) clang/tools/diagtool/DiagnosticNames.cpp (+44-30) - (modified) clang/tools/diagtool/DiagnosticNames.h (+70-77) - (modified) clang/utils/TableGen/ClangDiagnosticsEmitter.cpp (+25-2) - (modified) llvm/include/llvm/TableGen/StringToOffsetTable.h (+6) - (modified) llvm/lib/TableGen/StringToOffsetTable.cpp (+10-5) ``````````diff diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d9c5e4082c1a7..2ebe3b5f5cc4e 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -36,40 +36,8 @@ struct StaticDiagInfoRec; #include "clang/Basic/DiagnosticStableIDs.inc" #undef GET_DIAG_STABLE_ID_ARRAYS -// Store the descriptions in a separate table to avoid pointers that need to -// be relocated, and also decrease the amount of data needed on 64-bit -// platforms. See "How To Write Shared Libraries" by Ulrich Drepper. -struct StaticDiagInfoDescriptionStringTable { -#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \ - LEGACY_STABLE_IDS) \ - char ENUM##_desc[sizeof(DESC)]; -#include "clang/Basic/AllDiagnosticKinds.inc" -#undef DIAG -}; - -const StaticDiagInfoDescriptionStringTable StaticDiagInfoDescriptions = { -#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \ - LEGACY_STABLE_IDS) \ - DESC, -#include "clang/Basic/AllDiagnosticKinds.inc" -#undef DIAG -}; - extern const StaticDiagInfoRec StaticDiagInfo[]; -// Stored separately from StaticDiagInfoRec to pack better. Otherwise, -// StaticDiagInfoRec would have extra padding on 64-bit platforms. -const uint32_t StaticDiagInfoDescriptionOffsets[] = { -#define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \ - LEGACY_STABLE_IDS) \ - offsetof(StaticDiagInfoDescriptionStringTable, ENUM##_desc), -#include "clang/Basic/AllDiagnosticKinds.inc" -#undef DIAG -}; - const uint32_t StaticDiagInfoStableIDOffsets[] = { #define DIAG(ENUM, CLASS, DEFAULT_SEVERITY, DESC, GROUP, SFINAE, NOWERROR, \ SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY, STABLE_ID, \ @@ -111,25 +79,26 @@ struct StaticDiagInfoRec { uint16_t WarnNoWerror : 1; LLVM_PREFERRED_TYPE(bool) uint16_t WarnShowInSystemHeader : 1; + LLVM_PREFERRED_TYPE(diag::Group) + uint16_t OptionGroupIndex : 14; LLVM_PREFERRED_TYPE(bool) uint16_t WarnShowInSystemMacro : 1; - - LLVM_PREFERRED_TYPE(diag::Group) - uint16_t OptionGroupIndex : 15; LLVM_PREFERRED_TYPE(bool) uint16_t Deferrable : 1; - uint16_t DescriptionLen; + uint16_t DescriptionOffsetLow; + uint16_t DescriptionOffsetHigh : 4; + uint16_t DescriptionLen : 12; unsigned getOptionGroupIndex() const { return OptionGroupIndex; } StringRef getDescription() const { - size_t MyIndex = this - &StaticDiagInfo[0]; - uint32_t StringOffset = StaticDiagInfoDescriptionOffsets[MyIndex]; - const char* Table = reinterpret_cast<const char*>(&StaticDiagInfoDescriptions); - return StringRef(&Table[StringOffset], DescriptionLen); + uint32_t Offset = + DescriptionOffsetLow | (uint32_t(DescriptionOffsetHigh) << 16); + return StringRef(StaticDiagInfoDescriptionsStorage + Offset, + DescriptionLen); } StringRef getStableID() const { @@ -159,6 +128,9 @@ struct StaticDiagInfoRec { return DiagID < RHS.DiagID; } }; +static_assert(sizeof(StaticDiagInfoRec) == 10); +static_assert(static_cast<unsigned>(diag::Group::NUM_GROUPS) < (1U << 14), + "too many diagnostic groups for StaticDiagInfoRec"); #define STRINGIFY_NAME(NAME) #NAME #define VALIDATE_DIAG_SIZE(NAME) \ @@ -200,9 +172,11 @@ const StaticDiagInfoRec StaticDiagInfo[] = { CATEGORY, \ NOWERROR, \ SHOWINSYSHEADER, \ - SHOWINSYSMACRO, \ GROUP, \ - DEFERRABLE, \ + SHOWINSYSMACRO, \ + DEFERRABLE, \ + uint16_t(DIAG_DESC_OFFSET_##ENUM), \ + uint16_t(DIAG_DESC_OFFSET_##ENUM >> 16), \ STR_SIZE(DESC, uint16_t)}, #include "clang/Basic/DiagnosticCommonKinds.inc" #include "clang/Basic/DiagnosticDriverKinds.inc" diff --git a/clang/tools/diagtool/DiagnosticNames.cpp b/clang/tools/diagtool/DiagnosticNames.cpp index 1538167022aca..0c5f6bc771cff 100644 --- a/clang/tools/diagtool/DiagnosticNames.cpp +++ b/clang/tools/diagtool/DiagnosticNames.cpp @@ -10,53 +10,67 @@ #include "clang/Basic/AllDiagnostics.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringTable.h" +#include <array> +#include <cstddef> +#include <cstdint> using namespace clang; using namespace diagtool; -static const DiagnosticRecord BuiltinDiagnosticsByName[] = { -#define DIAG_NAME_INDEX(ENUM) { #ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t) }, +struct BuiltinDiagnosticNameStorage { +#define DIAG_NAME_INDEX(ENUM) char ENUM[sizeof(#ENUM)]; #include "clang/Basic/DiagnosticIndexName.inc" #undef DIAG_NAME_INDEX }; +static_assert(sizeof(BuiltinDiagnosticNameStorage) <= uint64_t(1) << 24); + +static constexpr BuiltinDiagnosticNameStorage BuiltinDiagnosticNames = { +#define DIAG_NAME_INDEX(ENUM) #ENUM, +#include "clang/Basic/DiagnosticIndexName.inc" +#undef DIAG_NAME_INDEX +}; + +#define DIAGNOSTIC_RECORD(ENUM) \ + {diag::ENUM, uint16_t(offsetof(BuiltinDiagnosticNameStorage, ENUM)), \ + uint8_t(offsetof(BuiltinDiagnosticNameStorage, ENUM) >> 16), \ + STR_SIZE(#ENUM, uint8_t)} + +static constexpr DiagnosticRecord BuiltinDiagnosticsByName[] = { +#define DIAG_NAME_INDEX(ENUM) DIAGNOSTIC_RECORD(ENUM), +#include "clang/Basic/DiagnosticIndexName.inc" +#undef DIAG_NAME_INDEX +}; +static_assert(std::size(BuiltinDiagnosticsByName) < (1U << 16)); +#undef DIAGNOSTIC_RECORD + llvm::ArrayRef<DiagnosticRecord> diagtool::getBuiltinDiagnosticsByName() { return llvm::ArrayRef(BuiltinDiagnosticsByName); } -// FIXME: Is it worth having two tables, especially when this one can get -// out of sync easily? -static const DiagnosticRecord BuiltinDiagnosticsByID[] = { -#define DIAG(ENUM, CLASS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFER, CATEGORY, STABLE_ID, \ - LEGACY_STABLE_IDS) \ - {#ENUM, diag::ENUM, STR_SIZE(#ENUM, uint8_t)}, -#include "clang/Basic/AllDiagnosticKinds.inc" -#undef DIAG -}; - -static bool orderByID(const DiagnosticRecord &Left, - const DiagnosticRecord &Right) { - return Left.DiagID < Right.DiagID; +static constexpr auto BuiltinDiagnosticIndexByID = [] { + std::array<uint16_t, diag::DIAG_UPPER_LIMIT> Result = {}; + uint16_t Index = 0; +#define DIAG_NAME_INDEX(ENUM) Result[diag::ENUM] = ++Index; +#include "clang/Basic/DiagnosticIndexName.inc" +#undef DIAG_NAME_INDEX + return Result; +}(); + +llvm::StringRef DiagnosticRecord::getName() const { + const char *Names = reinterpret_cast<const char *>(&BuiltinDiagnosticNames); + uint32_t NameOffset = + uint32_t(NameOffsetLow) | (uint32_t(NameOffsetHigh) << 16); + return llvm::StringRef(Names + NameOffset, NameLen); } const DiagnosticRecord &diagtool::getDiagnosticForID(short DiagID) { - DiagnosticRecord Key = {nullptr, DiagID, 0}; - - // The requirement for lower_bound to produce a valid result it is - // enough if the BuiltinDiagnosticsByID is partitioned (by DiagID), - // but as we want this function to work for all possible values of - // DiagID sent in as argument it is better to right away check if - // BuiltinDiagnosticsByID is sorted. - assert(llvm::is_sorted(BuiltinDiagnosticsByID, orderByID) && - "IDs in BuiltinDiagnosticsByID must be sorted."); - const DiagnosticRecord *Result = - llvm::lower_bound(BuiltinDiagnosticsByID, Key, orderByID); - assert(Result && "diagnostic not found; table may be out of date"); - return *Result; + assert(DiagID >= 0 && + static_cast<unsigned>(DiagID) < BuiltinDiagnosticIndexByID.size() && + BuiltinDiagnosticIndexByID[DiagID] != 0 && "diagnostic not found"); + return BuiltinDiagnosticsByName[BuiltinDiagnosticIndexByID[DiagID] - 1]; } - #define GET_DIAG_ARRAYS #include "clang/Basic/DiagnosticGroups.inc" #undef GET_DIAG_ARRAYS diff --git a/clang/tools/diagtool/DiagnosticNames.h b/clang/tools/diagtool/DiagnosticNames.h index f541e88577cc5..52622875175af 100644 --- a/clang/tools/diagtool/DiagnosticNames.h +++ b/clang/tools/diagtool/DiagnosticNames.h @@ -15,91 +15,84 @@ namespace diagtool { - struct DiagnosticRecord { - const char *NameStr; - short DiagID; - uint8_t NameLen; +struct DiagnosticRecord { + short DiagID; + uint16_t NameOffsetLow; + uint8_t NameOffsetHigh; + uint8_t NameLen; - llvm::StringRef getName() const { - return llvm::StringRef(NameStr, NameLen); + llvm::StringRef getName() const; + + bool operator<(const DiagnosticRecord &Other) const { + return getName() < Other.getName(); + } +}; +static_assert(sizeof(DiagnosticRecord) == 6, + "DiagnosticRecord must remain compact"); + +/// Get every diagnostic in the system, sorted by name. +llvm::ArrayRef<DiagnosticRecord> getBuiltinDiagnosticsByName(); + +/// Get a diagnostic by its ID. +const DiagnosticRecord &getDiagnosticForID(short DiagID); + +struct GroupRecord { + uint16_t NameOffset; + uint16_t Members; + uint16_t SubGroups; + + llvm::StringRef getName() const; + + template <typename RecordType> class group_iterator { + const short *CurrentID; + + friend struct GroupRecord; + group_iterator(const short *Start) : CurrentID(Start) { + if (CurrentID && *CurrentID == -1) + CurrentID = nullptr; } - bool operator<(const DiagnosticRecord &Other) const { - return getName() < Other.getName(); + public: + typedef RecordType value_type; + typedef const value_type &reference; + typedef const value_type *pointer; + typedef std::forward_iterator_tag iterator_category; + typedef std::ptrdiff_t difference_type; + + inline reference operator*() const; + inline pointer operator->() const { return &operator*(); } + + inline short getID() const { return *CurrentID; } + + group_iterator &operator++() { + ++CurrentID; + if (*CurrentID == -1) + CurrentID = nullptr; + return *this; + } + + bool operator==(const group_iterator &Other) const { + return CurrentID == Other.CurrentID; } - }; - /// Get every diagnostic in the system, sorted by name. - llvm::ArrayRef<DiagnosticRecord> getBuiltinDiagnosticsByName(); - - /// Get a diagnostic by its ID. - const DiagnosticRecord &getDiagnosticForID(short DiagID); - - - struct GroupRecord { - uint16_t NameOffset; - uint16_t Members; - uint16_t SubGroups; - - llvm::StringRef getName() const; - - template<typename RecordType> - class group_iterator { - const short *CurrentID; - - friend struct GroupRecord; - group_iterator(const short *Start) : CurrentID(Start) { - if (CurrentID && *CurrentID == -1) - CurrentID = nullptr; - } - - public: - typedef RecordType value_type; - typedef const value_type & reference; - typedef const value_type * pointer; - typedef std::forward_iterator_tag iterator_category; - typedef std::ptrdiff_t difference_type; - - inline reference operator*() const; - inline pointer operator->() const { - return &operator*(); - } - - inline short getID() const { - return *CurrentID; - } - - group_iterator &operator++() { - ++CurrentID; - if (*CurrentID == -1) - CurrentID = nullptr; - return *this; - } - - bool operator==(const group_iterator &Other) const { - return CurrentID == Other.CurrentID; - } - - bool operator!=(const group_iterator &Other) const { - return CurrentID != Other.CurrentID; - } - }; - - typedef group_iterator<GroupRecord> subgroup_iterator; - subgroup_iterator subgroup_begin() const; - subgroup_iterator subgroup_end() const; - llvm::iterator_range<subgroup_iterator> subgroups() const; - - typedef group_iterator<DiagnosticRecord> diagnostics_iterator; - diagnostics_iterator diagnostics_begin() const; - diagnostics_iterator diagnostics_end() const; - llvm::iterator_range<diagnostics_iterator> diagnostics() const; - - bool operator<(llvm::StringRef Other) const { - return getName() < Other; + bool operator!=(const group_iterator &Other) const { + return CurrentID != Other.CurrentID; } }; + typedef group_iterator<GroupRecord> subgroup_iterator; + subgroup_iterator subgroup_begin() const; + subgroup_iterator subgroup_end() const; + llvm::iterator_range<subgroup_iterator> subgroups() const; + + typedef group_iterator<DiagnosticRecord> diagnostics_iterator; + diagnostics_iterator diagnostics_begin() const; + diagnostics_iterator diagnostics_end() const; + llvm::iterator_range<diagnostics_iterator> diagnostics() const; + + bool operator<(llvm::StringRef Other) const { return getName() < Other; } +}; + /// Get every diagnostic group in the system, sorted by name. llvm::ArrayRef<GroupRecord> getDiagnosticGroups(); diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index ba10be060c20a..193cfa69e8d88 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -1736,10 +1736,11 @@ class DiagStableIDsMap { /// static constexpr llvm::StringTable DiagStableIds; /// #endif /// \endcode - void emit(raw_ostream &OS) const { + void emit(raw_ostream &OS, const RecordKeeper &Records) const { OS << "\n#ifdef GET_DIAG_STABLE_ID_ARRAYS\n"; emitStableIDs(OS); emitLegacyStableIDs(OS); + emitDescriptions(OS, Records); OS << "#endif // GET_DIAG_STABLE_ID_ARRAYS\n\n"; } @@ -1795,6 +1796,28 @@ class DiagStableIDsMap { } OS << "};\n\n"; } + + static void emitDescriptions(raw_ostream &OS, const RecordKeeper &Records) { + DiagnosticTextBuilder DiagTextBuilder(Records); + StringToOffsetTable Descriptions; + + OS << "enum {\n"; + for (const Record &R : + make_pointee_range(Records.getAllDerivedDefinitions("Diagnostic"))) { + std::string Description = DiagTextBuilder.buildForDefinition(&R); + if (Description.size() >= (1U << 12)) + PrintFatalError(R.getLoc(), + "diagnostic description exceeds 12-bit length limit"); + + unsigned Offset = Descriptions.GetOrAddStringOffset(Description); + OS << " DIAG_DESC_OFFSET_" << R.getName() << " = " << Offset << ",\n"; + } + OS << "};\n\n"; + + if (Descriptions.size() >= (1U << 20)) + PrintFatalError("diagnostic descriptions exceed 20-bit offset limit"); + Descriptions.EmitStringTableStorageDef(OS, "StaticDiagInfoDescriptions"); + } }; } // namespace @@ -1802,7 +1825,7 @@ class DiagStableIDsMap { void clang::EmitClangDiagsStableIDs(const RecordKeeper &Records, raw_ostream &OS) { DiagStableIDsMap StableIDs(Records); - StableIDs.emit(OS); + StableIDs.emit(OS, Records); } /// ClangDiagsDefsEmitter - The top-level class emits .def files containing diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h index 5ee5fb538f3eb..fcf517ad56918 100644 --- a/llvm/include/llvm/TableGen/StringToOffsetTable.h +++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h @@ -67,6 +67,12 @@ class StringToOffsetTable { // valid identifiers to declare. void EmitStringTableDef(raw_ostream &OS, const Twine &Name) const; + // Emit only the character storage used by a string table definition. + // + // This preserves the large-string handling of EmitStringTableDef without + // emitting an llvm::StringTable object. + void EmitStringTableStorageDef(raw_ostream &OS, const Twine &Name) const; + // Emit the string as one single string. void EmitString(raw_ostream &O) const; }; diff --git a/llvm/lib/TableGen/StringToOffsetTable.cpp b/llvm/lib/TableGen/StringToOffsetTable.cpp index 06d8240486245..826752c23ff64 100644 --- a/llvm/lib/TableGen/StringToOffsetTable.cpp +++ b/llvm/lib/TableGen/StringToOffsetTable.cpp @@ -26,10 +26,9 @@ unsigned StringToOffsetTable::GetOrAddStringOffset(StringRef Str) { return II->second; } -void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS, - const Twine &Name) const { - // This generates a `llvm::StringTable` which expects that entries are null - // terminated. So fail with an error if `AppendZero` is false. +void StringToOffsetTable::EmitStringTableStorageDef(raw_ostream &OS, + const Twine &Name) const { + // String table entries must be null terminated. if (!AppendZero) PrintFatalError("llvm::StringTable requires null terminated strings"); @@ -79,11 +78,17 @@ void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS, } OS << LineSep << (UseChars ? "};" : " ;"); - OS << formatv(R"( + OS << R"( #ifdef __GNUC__ #pragma GCC diagnostic pop #endif +)"; +} +void StringToOffsetTable::EmitStringTableDef(raw_ostream &OS, + const Twine &Name) const { + EmitStringTableStorageDef(OS, Name); + OS << formatv(R"( {1} llvm::StringTable {2}{0} = {0}Storage; )", `````````` </details> https://github.com/llvm/llvm-project/pull/202624 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
