https://github.com/clayborg created https://github.com/llvm/llvm-project/pull/167457
We can change llvm-dwp to emit a DWARF64 version of the .debug_str_offsets tables for .dwo files in a .dwp file. This allows the string table to exceed 4GB without truncating string offsets into the .debug_str section and losing data. llvm-dwp will append all strings to the .debug_str section for a .dwo file, and if any of the new string offsets exceed UINT32_MAX, it will upgrade the .debug_str_offsets table to a DWARF64 header so that each entry in that table can hold a 64-bit string offset. Fixed LLDB to be able to successfully load the 64-bit string tables in .dwp files. Fixed llvm-dwarfdump and LLVM DWARF parsing code to do the right thing with DWARF64 string table headers. From a2431068c087edc09893009448d41b5a83ca03c9 Mon Sep 17 00:00:00 2001 From: Greg Clayton <[email protected]> Date: Mon, 10 Nov 2025 21:22:39 -0800 Subject: [PATCH] Modify llvm-dwp to be able to emit string tables over 4GB without losing data. We can change llvm-dwp to emit a DWARF64 version of the .debug_str_offsets tables for .dwo files in a .dwp file. This allows the string table to exceed 4GB without truncating string offsets into the .debug_str section and losing data. llvm-dwp will append all strings to the .debug_str section for a .dwo file, and if any of the new string offsets exceed UINT32_MAX, it will upgrade the .debug_str_offsets table to a DWARF64 header so that each entry in that table can hold a 64-bit string offset. Fixed LLDB to be able to successfully load the 64-bit string tables in .dwp files. Fixed llvm-dwarfdump and LLVM DWARF parsing code to do the right thing with DWARF64 string table headers. 
--- .../Plugins/SymbolFile/DWARF/DWARFUnit.cpp | 13 +++- .../Plugins/SymbolFile/DWARF/DWARFUnit.h | 2 +- llvm/include/llvm/DWP/DWP.h | 4 +- llvm/include/llvm/DWP/DWPStringPool.h | 6 +- llvm/lib/DWP/DWP.cpp | 78 +++++++++++++++---- llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp | 13 +++- 6 files changed, 89 insertions(+), 27 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index 94fc2e83e899d..7b7864caf8c09 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -360,8 +360,10 @@ void DWARFUnit::SetDwoStrOffsetsBase() { const DWARFDataExtractor &strOffsets = GetSymbolFileDWARF().GetDWARFContext().getOrLoadStrOffsetsData(); uint64_t length = strOffsets.GetU32(&baseOffset); - if (length == 0xffffffff) + if (length == 0xffffffff) { length = strOffsets.GetU64(&baseOffset); + m_str_offsets_size = 8; + } // Check version. if (strOffsets.GetU16(&baseOffset) < 5) @@ -369,6 +371,10 @@ void DWARFUnit::SetDwoStrOffsetsBase() { // Skip padding. baseOffset += 2; + } else { + // Size of offset for .debug_str_offsets is same as DWARF offset byte size + // of the DWARFUnit for DWARF version 4 and earlier. 
+ m_str_offsets_size = m_header.getDwarfOffsetByteSize(); } SetStrOffsetsBase(baseOffset); @@ -1079,10 +1085,9 @@ uint32_t DWARFUnit::GetHeaderByteSize() const { return m_header.getSize(); } std::optional<uint64_t> DWARFUnit::GetStringOffsetSectionItem(uint32_t index) const { - lldb::offset_t offset = - GetStrOffsetsBase() + index * m_header.getDwarfOffsetByteSize(); + lldb::offset_t offset = GetStrOffsetsBase() + index * m_str_offsets_size; return m_dwarf.GetDWARFContext().getOrLoadStrOffsetsData().GetMaxU64( - &offset, m_header.getDwarfOffsetByteSize()); + &offset, m_str_offsets_size); } llvm::Expected<llvm::DWARFAddressRangesVector> diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index 91a693860c55a..856db5e4101cd 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -364,7 +364,7 @@ class DWARFUnit : public DWARFExpression::Delegate, public UserID { dw_offset_t m_line_table_offset = DW_INVALID_OFFSET; dw_offset_t m_str_offsets_base = 0; // Value of DW_AT_str_offsets_base. - + dw_offset_t m_str_offsets_size = 4; // Size in bytes of the string offsets. 
std::optional<llvm::DWARFDebugRnglistTable> m_rnglist_table; bool m_rnglist_table_done = false; std::optional<llvm::DWARFListTableHeader> m_loclist_table_header; diff --git a/llvm/include/llvm/DWP/DWP.h b/llvm/include/llvm/DWP/DWP.h index a759bae10d160..cc38369658eaa 100644 --- a/llvm/include/llvm/DWP/DWP.h +++ b/llvm/include/llvm/DWP/DWP.h @@ -70,6 +70,8 @@ struct CompileUnitIdentifiers { LLVM_ABI Error write(MCStreamer &Out, ArrayRef<std::string> Inputs, OnCuIndexOverflow OverflowOptValue); +typedef std::vector<std::pair<DWARFSectionKind, uint32_t>> SectionLengths; + LLVM_ABI Error handleSection( const StringMap<std::pair<MCSection *, DWARFSectionKind>> &KnownSections, const MCSection *StrSection, const MCSection *StrOffsetSection, @@ -82,7 +84,7 @@ LLVM_ABI Error handleSection( std::vector<StringRef> &CurTypesSection, std::vector<StringRef> &CurInfoSection, StringRef &AbbrevSection, StringRef &CurCUIndexSection, StringRef &CurTUIndexSection, - std::vector<std::pair<DWARFSectionKind, uint32_t>> &SectionLength); + SectionLengths &SectionLength); LLVM_ABI Expected<InfoSectionUnitHeader> parseInfoSectionUnitHeader(StringRef Info); diff --git a/llvm/include/llvm/DWP/DWPStringPool.h b/llvm/include/llvm/DWP/DWPStringPool.h index 1354b46f156b6..d1486ff7872e1 100644 --- a/llvm/include/llvm/DWP/DWPStringPool.h +++ b/llvm/include/llvm/DWP/DWPStringPool.h @@ -32,13 +32,13 @@ class DWPStringPool { MCStreamer &Out; MCSection *Sec; - DenseMap<const char *, uint32_t, CStrDenseMapInfo> Pool; - uint32_t Offset = 0; + DenseMap<const char *, uint64_t, CStrDenseMapInfo> Pool; + uint64_t Offset = 0; public: DWPStringPool(MCStreamer &Out, MCSection *Sec) : Out(Out), Sec(Sec) {} - uint32_t getOffset(const char *Str, unsigned Length) { + uint64_t getOffset(const char *Str, unsigned Length) { assert(strlen(Str) + 1 == Length && "Ensure length hint is correct"); auto Pair = Pool.insert(std::make_pair(Str, Offset)); diff --git a/llvm/lib/DWP/DWP.cpp b/llvm/lib/DWP/DWP.cpp index 
b565edbfe96db..54edce81208b5 100644 --- a/llvm/lib/DWP/DWP.cpp +++ b/llvm/lib/DWP/DWP.cpp @@ -413,33 +413,43 @@ Expected<InfoSectionUnitHeader> parseInfoSectionUnitHeader(StringRef Info) { } static void writeNewOffsetsTo(MCStreamer &Out, DataExtractor &Data, - DenseMap<uint64_t, uint32_t> &OffsetRemapping, - uint64_t &Offset, uint64_t &Size) { + DenseMap<uint64_t, uint64_t> &OffsetRemapping, + uint64_t &Offset, const uint64_t Size, + uint32_t OldOffsetSize, uint32_t NewOffsetSize) { while (Offset < Size) { - auto OldOffset = Data.getU32(&Offset); - auto NewOffset = OffsetRemapping[OldOffset]; - Out.emitIntValue(NewOffset, 4); + const uint64_t OldOffset = Data.getUnsigned(&Offset, OldOffsetSize); + const uint64_t NewOffset = OffsetRemapping[OldOffset]; + assert(NewOffsetSize == 8 || NewOffset <= UINT32_MAX); + Out.emitIntValue(NewOffset, NewOffsetSize); } } void writeStringsAndOffsets(MCStreamer &Out, DWPStringPool &Strings, MCSection *StrOffsetSection, StringRef CurStrSection, - StringRef CurStrOffsetSection, uint16_t Version) { + StringRef CurStrOffsetSection, uint16_t Version, + SectionLengths &SectionLength) { // Could possibly produce an error or warning if one of these was non-null but // the other was null. if (CurStrSection.empty() || CurStrOffsetSection.empty()) return; - DenseMap<uint64_t, uint32_t> OffsetRemapping; + DenseMap<uint64_t, uint64_t> OffsetRemapping; DataExtractor Data(CurStrSection, true, 0); uint64_t LocalOffset = 0; uint64_t PrevOffset = 0; + + // Keep track if any new string offsets exceed UINT32_MAX. If any do, we can + // emit a DWARF64 .debug_str_offsets table for this compile unit. 
+ uint32_t OldOffsetSize = 4; + uint32_t NewOffsetSize = 4; while (const char *S = Data.getCStr(&LocalOffset)) { - OffsetRemapping[PrevOffset] = - Strings.getOffset(S, LocalOffset - PrevOffset); + uint64_t NewOffset = Strings.getOffset(S, LocalOffset - PrevOffset); + OffsetRemapping[PrevOffset] = NewOffset; + if (NewOffset > UINT32_MAX) + NewOffsetSize = 8; PrevOffset = LocalOffset; } @@ -451,7 +461,7 @@ void writeStringsAndOffsets(MCStreamer &Out, DWPStringPool &Strings, uint64_t Size = CurStrOffsetSection.size(); if (Version > 4) { while (Offset < Size) { - uint64_t HeaderSize = debugStrOffsetsHeaderSize(Data, Version); + const uint64_t HeaderSize = debugStrOffsetsHeaderSize(Data, Version); assert(HeaderSize <= Size - Offset && "StrOffsetSection size is less than its header"); @@ -461,16 +471,52 @@ void writeStringsAndOffsets(MCStreamer &Out, DWPStringPool &Strings, if (HeaderSize == 8) { ContributionSize = Data.getU32(&HeaderLengthOffset); } else if (HeaderSize == 16) { + OldOffsetSize = 8; HeaderLengthOffset += 4; // skip the dwarf64 marker ContributionSize = Data.getU64(&HeaderLengthOffset); } ContributionEnd = ContributionSize + HeaderLengthOffset; - Out.emitBytes(Data.getBytes(&Offset, HeaderSize)); - writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, ContributionEnd); + + StringRef HeaderBytes = Data.getBytes(&Offset, HeaderSize); + if (OldOffsetSize == 4 && NewOffsetSize == 8) { + // We had a DWARF32 .debug_str_offsets header, but we need to emit + // some string offsets that require 64 bit offsets on the .debug_str + // section. Emit the .debug_str_offsets header in DWARF64 format so we + // can emit string offsets that exceed UINT32_MAX without truncating + // the string offset. + + // 2 bytes for DWARF version, 2 bytes pad. 
+ const uint64_t VersionPadSize = 4; + const uint64_t NewLength = + (ContributionSize - VersionPadSize) * 2 + VersionPadSize; + // Emit the DWARF64 length that starts with a 4 byte DW_LENGTH_DWARF64 + // value followed by the 8 byte updated length. + Out.emitIntValue(llvm::dwarf::DW_LENGTH_DWARF64, 4); + Out.emitIntValue(NewLength, 8); + // Emit DWARF version as a 2 byte integer. + Out.emitIntValue(Version, 2); + // Emit 2 bytes of padding. + Out.emitIntValue(0, 2); + // Update the .debug_str_offsets section length contribution for the + // this .dwo file. + for (auto &Pair : SectionLength) { + if (Pair.first == DW_SECT_STR_OFFSETS) { + Pair.second = NewLength + 12; + break; + } + } + } else { + // Just emit the same .debug_str_offsets header. + Out.emitBytes(HeaderBytes); + } + writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, ContributionEnd, + OldOffsetSize, NewOffsetSize); } } else { - writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, Size); + assert(OldOffsetSize == NewOffsetSize); + writeNewOffsetsTo(Out, Data, OffsetRemapping, Offset, Size, OldOffsetSize, + NewOffsetSize); } } @@ -562,7 +608,7 @@ Error handleSection( std::vector<StringRef> &CurTypesSection, std::vector<StringRef> &CurInfoSection, StringRef &AbbrevSection, StringRef &CurCUIndexSection, StringRef &CurTUIndexSection, - std::vector<std::pair<DWARFSectionKind, uint32_t>> &SectionLength) { + SectionLengths &SectionLength) { if (Section.isBSS()) return Error::success(); @@ -684,7 +730,7 @@ Error write(MCStreamer &Out, ArrayRef<std::string> Inputs, // This maps each section contained in this file to its length. // This information is later on used to calculate the contributions, // i.e. offset and length, of each compile/type unit to a section. 
- std::vector<std::pair<DWARFSectionKind, uint32_t>> SectionLength; + SectionLengths SectionLength; for (const auto &Section : Obj.sections()) if (auto Err = handleSection( @@ -713,7 +759,7 @@ Error write(MCStreamer &Out, ArrayRef<std::string> Inputs, } writeStringsAndOffsets(Out, Strings, StrOffsetSection, CurStrSection, - CurStrOffsetSection, Header.Version); + CurStrOffsetSection, Header.Version, SectionLength); for (auto Pair : SectionLength) { auto Index = getContributionIndex(Pair.first, IndexVersion); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index da0bf03e1ac57..b4256ae13914c 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -1187,9 +1187,18 @@ DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA) { if (getVersion() >= 5) { if (DA.getData().data() == nullptr) return std::nullopt; - Offset += Header.getFormat() == dwarf::DwarfFormat::DWARF32 ? 8 : 16; + // For .dwo files, the section contribution for the .debug_str_offsets + // points to the string offsets table header. Decode the format from this + // data as llvm-dwp has been modified to be able to emit a + // .debug_str_offsets table as DWARF64 even if the compile unit is DWARF32. + // This allows .dwp files to have string tables that exceed UINT32_MAX in + // size. + uint64_t Length = 0; + DwarfFormat Format = dwarf::DwarfFormat::DWARF32; + std::tie(Length, Format) = DA.getInitialLength(&Offset); + Offset += 4; // Skip the DWARF version uint16_t and the uint16_t padding. // Look for a valid contribution at the given offset. 
- auto DescOrError = parseDWARFStringOffsetsTableHeader(DA, Header.getFormat(), Offset); + auto DescOrError = parseDWARFStringOffsetsTableHeader(DA, Format, Offset); if (!DescOrError) return DescOrError.takeError(); return *DescOrError; _______________________________________________ lldb-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits
