https://github.com/snehasish updated https://github.com/llvm/llvm-project/pull/140501
>From 178cc0cac2258349b482f9315694f84e9d3581a5 Mon Sep 17 00:00:00 2001 From: Snehasish Kumar <snehasi...@google.com> Date: Fri, 16 May 2025 18:54:05 -0700 Subject: [PATCH 1/2] [NFC][MemProf] Move Radix tree methods to their own header and cpp. --- llvm/include/llvm/ProfileData/MemProf.h | 336 ---------------- .../llvm/ProfileData/MemProfRadixTree.h | 358 ++++++++++++++++++ llvm/include/llvm/ProfileData/MemProfReader.h | 2 +- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 1 + llvm/lib/ProfileData/CMakeLists.txt | 1 + llvm/lib/ProfileData/IndexedMemProfData.cpp | 1 + llvm/lib/ProfileData/InstrProfReader.cpp | 3 +- llvm/lib/ProfileData/MemProf.cpp | 235 ------------ llvm/lib/ProfileData/MemProfRadixTree.cpp | 253 +++++++++++++ llvm/unittests/ProfileData/InstrProfTest.cpp | 1 + llvm/unittests/ProfileData/MemProfTest.cpp | 3 +- 11 files changed, 620 insertions(+), 574 deletions(-) create mode 100644 llvm/include/llvm/ProfileData/MemProfRadixTree.h create mode 100644 llvm/lib/ProfileData/MemProfRadixTree.cpp diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index e713c3807611b..0bc1432f7d198 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -818,133 +818,6 @@ class CallStackLookupTrait { } }; -namespace detail { -// "Dereference" the iterator from DenseMap or OnDiskChainedHashTable. We have -// to do so in one of two different ways depending on the type of the hash -// table. -template <typename value_type, typename IterTy> -value_type DerefIterator(IterTy Iter) { - using deref_type = llvm::remove_cvref_t<decltype(*Iter)>; - if constexpr (std::is_same_v<deref_type, value_type>) - return *Iter; - else - return Iter->second; -} -} // namespace detail - -// A function object that returns a frame for a given FrameId. -template <typename MapTy> struct FrameIdConverter { - std::optional<FrameId> LastUnmappedId; - MapTy ⤅ - - FrameIdConverter() = delete; - FrameIdConverter(MapTy &Map) : Map(Map) {} - - // Delete the copy constructor and copy assignment operator to avoid a - // situation where a copy of FrameIdConverter gets an error in LastUnmappedId - // while the original instance doesn't. - FrameIdConverter(const FrameIdConverter &) = delete; - FrameIdConverter &operator=(const FrameIdConverter &) = delete; - - Frame operator()(FrameId Id) { - auto Iter = Map.find(Id); - if (Iter == Map.end()) { - LastUnmappedId = Id; - return Frame(); - } - return detail::DerefIterator<Frame>(Iter); - } -}; - -// A function object that returns a call stack for a given CallStackId. -template <typename MapTy> struct CallStackIdConverter { - std::optional<CallStackId> LastUnmappedId; - MapTy ⤅ - llvm::function_ref<Frame(FrameId)> FrameIdToFrame; - - CallStackIdConverter() = delete; - CallStackIdConverter(MapTy &Map, - llvm::function_ref<Frame(FrameId)> FrameIdToFrame) - : Map(Map), FrameIdToFrame(FrameIdToFrame) {} - - // Delete the copy constructor and copy assignment operator to avoid a - // situation where a copy of CallStackIdConverter gets an error in - // LastUnmappedId while the original instance doesn't. - CallStackIdConverter(const CallStackIdConverter &) = delete; - CallStackIdConverter &operator=(const CallStackIdConverter &) = delete; - - std::vector<Frame> operator()(CallStackId CSId) { - std::vector<Frame> Frames; - auto CSIter = Map.find(CSId); - if (CSIter == Map.end()) { - LastUnmappedId = CSId; - } else { - llvm::SmallVector<FrameId> CS = - detail::DerefIterator<llvm::SmallVector<FrameId>>(CSIter); - Frames.reserve(CS.size()); - for (FrameId Id : CS) - Frames.push_back(FrameIdToFrame(Id)); - } - return Frames; - } -}; - -// A function object that returns a Frame stored at a given index into the Frame -// array in the profile. -struct LinearFrameIdConverter { - const unsigned char *FrameBase; - - LinearFrameIdConverter() = delete; - LinearFrameIdConverter(const unsigned char *FrameBase) - : FrameBase(FrameBase) {} - - Frame operator()(LinearFrameId LinearId) { - uint64_t Offset = static_cast<uint64_t>(LinearId) * Frame::serializedSize(); - return Frame::deserialize(FrameBase + Offset); - } -}; - -// A function object that returns a call stack stored at a given index into the -// call stack array in the profile. -struct LinearCallStackIdConverter { - const unsigned char *CallStackBase; - llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame; - - LinearCallStackIdConverter() = delete; - LinearCallStackIdConverter( - const unsigned char *CallStackBase, - llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame) - : CallStackBase(CallStackBase), FrameIdToFrame(FrameIdToFrame) {} - - std::vector<Frame> operator()(LinearCallStackId LinearCSId) { - std::vector<Frame> Frames; - - const unsigned char *Ptr = - CallStackBase + - static_cast<uint64_t>(LinearCSId) * sizeof(LinearFrameId); - uint32_t NumFrames = - support::endian::readNext<uint32_t, llvm::endianness::little>(Ptr); - Frames.reserve(NumFrames); - for (; NumFrames; --NumFrames) { - LinearFrameId Elem = - support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); - // Follow a pointer to the parent, if any. See comments below on - // CallStackRadixTreeBuilder for the description of the radix tree format. - if (static_cast<std::make_signed_t<LinearFrameId>>(Elem) < 0) { - Ptr += (-Elem) * sizeof(LinearFrameId); - Elem = - support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); - } - // We shouldn't encounter another pointer. - assert(static_cast<std::make_signed_t<LinearFrameId>>(Elem) >= 0); - Frames.push_back(FrameIdToFrame(Elem)); - Ptr += sizeof(LinearFrameId); - } - - return Frames; - } -}; - struct LineLocation { LineLocation(uint32_t L, uint32_t D) : LineOffset(L), Column(D) {} @@ -970,73 +843,6 @@ struct LineLocation { // A pair of a call site location and its corresponding callee GUID. using CallEdgeTy = std::pair<LineLocation, uint64_t>; -// Used to extract caller-callee pairs from the call stack array. The leaf -// frame is assumed to call a heap allocation function with GUID 0. The -// resulting pairs are accumulated in CallerCalleePairs. Users can take it -// with: -// -// auto Pairs = std::move(Extractor.CallerCalleePairs); -struct CallerCalleePairExtractor { - // The base address of the radix tree array. - const unsigned char *CallStackBase; - // A functor to convert a linear FrameId to a Frame. - llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame; - // A map from caller GUIDs to lists of call sites in respective callers. - DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> CallerCalleePairs; - - // The set of linear call stack IDs that we've visited. - BitVector Visited; - - CallerCalleePairExtractor() = delete; - CallerCalleePairExtractor( - const unsigned char *CallStackBase, - llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame, - unsigned RadixTreeSize) - : CallStackBase(CallStackBase), FrameIdToFrame(FrameIdToFrame), - Visited(RadixTreeSize) {} - - void operator()(LinearCallStackId LinearCSId) { - const unsigned char *Ptr = - CallStackBase + - static_cast<uint64_t>(LinearCSId) * sizeof(LinearFrameId); - uint32_t NumFrames = - support::endian::readNext<uint32_t, llvm::endianness::little>(Ptr); - // The leaf frame calls a function with GUID 0. - uint64_t CalleeGUID = 0; - for (; NumFrames; --NumFrames) { - LinearFrameId Elem = - support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); - // Follow a pointer to the parent, if any. See comments below on - // CallStackRadixTreeBuilder for the description of the radix tree format. - if (static_cast<std::make_signed_t<LinearFrameId>>(Elem) < 0) { - Ptr += (-Elem) * sizeof(LinearFrameId); - Elem = - support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); - } - // We shouldn't encounter another pointer. - assert(static_cast<std::make_signed_t<LinearFrameId>>(Elem) >= 0); - - // Add a new caller-callee pair. - Frame F = FrameIdToFrame(Elem); - uint64_t CallerGUID = F.Function; - LineLocation Loc(F.LineOffset, F.Column); - CallerCalleePairs[CallerGUID].emplace_back(Loc, CalleeGUID); - - // Keep track of the indices we've visited. If we've already visited the - // current one, terminate the traversal. We will not discover any new - // caller-callee pair by continuing the traversal. - unsigned Offset = - std::distance(CallStackBase, Ptr) / sizeof(LinearFrameId); - if (Visited.test(Offset)) - break; - Visited.set(Offset); - - Ptr += sizeof(LinearFrameId); - CalleeGUID = CallerGUID; - } - } -}; - struct IndexedMemProfData { // A map to hold memprof data per function. The lower 64 bits obtained from // the md5 hash of the function name is used to index into the map. @@ -1087,148 +893,6 @@ struct IndexedMemProfData { // Compute a CallStackId for a given call stack. CallStackId hashCallStack(ArrayRef<FrameId> CS) const; }; - -// A convenience wrapper around FrameIdConverter and CallStackIdConverter for -// tests. -struct IndexedCallstackIdConverter { - IndexedCallstackIdConverter() = delete; - IndexedCallstackIdConverter(IndexedMemProfData &MemProfData) - : FrameIdConv(MemProfData.Frames), - CSIdConv(MemProfData.CallStacks, FrameIdConv) {} - - // Delete the copy constructor and copy assignment operator to avoid a - // situation where a copy of IndexedCallstackIdConverter gets an error in - // LastUnmappedId while the original instance doesn't. - IndexedCallstackIdConverter(const IndexedCallstackIdConverter &) = delete; - IndexedCallstackIdConverter & - operator=(const IndexedCallstackIdConverter &) = delete; - - std::vector<Frame> operator()(CallStackId CSId) { return CSIdConv(CSId); } - - FrameIdConverter<decltype(IndexedMemProfData::Frames)> FrameIdConv; - CallStackIdConverter<decltype(IndexedMemProfData::CallStacks)> CSIdConv; -}; - -struct FrameStat { - // The number of occurrences of a given FrameId. - uint64_t Count = 0; - // The sum of indexes where a given FrameId shows up. - uint64_t PositionSum = 0; -}; - -// Compute a histogram of Frames in call stacks. -template <typename FrameIdTy> -llvm::DenseMap<FrameIdTy, FrameStat> -computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> - &MemProfCallStackData); - -// Construct a radix tree of call stacks. -// -// A set of call stacks might look like: -// -// CallStackId 1: f1 -> f2 -> f3 -// CallStackId 2: f1 -> f2 -> f4 -> f5 -// CallStackId 3: f1 -> f2 -> f4 -> f6 -// CallStackId 4: f7 -> f8 -> f9 -// -// where each fn refers to a stack frame. -// -// Since we expect a lot of common prefixes, we can compress the call stacks -// into a radix tree like: -// -// CallStackId 1: f1 -> f2 -> f3 -// | -// CallStackId 2: +---> f4 -> f5 -// | -// CallStackId 3: +---> f6 -// -// CallStackId 4: f7 -> f8 -> f9 -// -// Now, we are interested in retrieving call stacks for a given CallStackId, so -// we just need a pointer from a given call stack to its parent. For example, -// CallStackId 2 would point to CallStackId 1 as a parent. -// -// We serialize the radix tree above into a single array along with the length -// of each call stack and pointers to the parent call stacks. -// -// Index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 -// Array: L3 f9 f8 f7 L4 f6 J3 L4 f5 f4 J3 L3 f3 f2 f1 -// ^ ^ ^ ^ -// | | | | -// CallStackId 4: 0 --+ | | | -// CallStackId 3: 4 --------------+ | | -// CallStackId 2: 7 -----------------------+ | -// CallStackId 1: 11 -----------------------------------+ -// -// - LN indicates the length of a call stack, encoded as ordinary integer N. -// -// - JN indicates a pointer to the parent, encoded as -N. -// -// The radix tree allows us to reconstruct call stacks in the leaf-to-root -// order as we scan the array from left ro right while following pointers to -// parents along the way. -// -// For example, if we are decoding CallStackId 2, we start a forward traversal -// at Index 7, noting the call stack length of 4 and obtaining f5 and f4. When -// we see J3 at Index 10, we resume a forward traversal at Index 13 = 10 + 3, -// picking up f2 and f1. We are done after collecting 4 frames as indicated at -// the beginning of the traversal. -// -// On-disk IndexedMemProfRecord will refer to call stacks by their indexes into -// the radix tree array, so we do not explicitly encode mappings like: -// "CallStackId 1 -> 11". -template <typename FrameIdTy> class CallStackRadixTreeBuilder { - // The radix tree array. - std::vector<LinearFrameId> RadixArray; - - // Mapping from CallStackIds to indexes into RadixArray. - llvm::DenseMap<CallStackId, LinearCallStackId> CallStackPos; - - // In build, we partition a given call stack into two parts -- the prefix - // that's common with the previously encoded call stack and the frames beyond - // the common prefix -- the unique portion. Then we want to find out where - // the common prefix is stored in RadixArray so that we can link the unique - // portion to the common prefix. Indexes, declared below, helps with our - // needs. Intuitively, Indexes tells us where each of the previously encoded - // call stack is stored in RadixArray. More formally, Indexes satisfies: - // - // RadixArray[Indexes[I]] == Prev[I] - // - // for every I, where Prev is the the call stack in the root-to-leaf order - // previously encoded by build. (Note that Prev, as passed to - // encodeCallStack, is in the leaf-to-root order.) - // - // For example, if the call stack being encoded shares 5 frames at the root of - // the call stack with the previously encoded call stack, - // RadixArray[Indexes[0]] is the root frame of the common prefix. - // RadixArray[Indexes[5 - 1]] is the last frame of the common prefix. - std::vector<LinearCallStackId> Indexes; - - using CSIdPair = std::pair<CallStackId, llvm::SmallVector<FrameIdTy>>; - - // Encode a call stack into RadixArray. Return the starting index within - // RadixArray. - LinearCallStackId encodeCallStack( - const llvm::SmallVector<FrameIdTy> *CallStack, - const llvm::SmallVector<FrameIdTy> *Prev, - const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes); - -public: - CallStackRadixTreeBuilder() = default; - - // Build a radix tree array. - void - build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> - &&MemProfCallStackData, - const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, - llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram); - - ArrayRef<LinearFrameId> getRadixArray() const { return RadixArray; } - - llvm::DenseMap<CallStackId, LinearCallStackId> takeCallStackPos() { - return std::move(CallStackPos); - } -}; } // namespace memprof } // namespace llvm diff --git a/llvm/include/llvm/ProfileData/MemProfRadixTree.h b/llvm/include/llvm/ProfileData/MemProfRadixTree.h new file mode 100644 index 0000000000000..9abf9f7a8774b --- /dev/null +++ b/llvm/include/llvm/ProfileData/MemProfRadixTree.h @@ -0,0 +1,358 @@ +//===- MemProfRadixTree.h - MemProf format support ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A custom Radix Tree builder for memprof data to optimize for space. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_PROFILEDATA_MEMPROFRADIXTREE_H +#define LLVM_PROFILEDATA_MEMPROFRADIXTREE_H + +#include "llvm/ProfileData/MemProf.h" + +namespace llvm { +namespace memprof { +namespace detail { +// "Dereference" the iterator from DenseMap or OnDiskChainedHashTable. We have +// to do so in one of two different ways depending on the type of the hash +// table. +template <typename value_type, typename IterTy> +value_type DerefIterator(IterTy Iter) { + using deref_type = llvm::remove_cvref_t<decltype(*Iter)>; + if constexpr (std::is_same_v<deref_type, value_type>) + return *Iter; + else + return Iter->second; +} +} // namespace detail + +// A function object that returns a frame for a given FrameId. +template <typename MapTy> struct FrameIdConverter { + std::optional<FrameId> LastUnmappedId; + MapTy ⤅ + + FrameIdConverter() = delete; + FrameIdConverter(MapTy &Map) : Map(Map) {} + + // Delete the copy constructor and copy assignment operator to avoid a + // situation where a copy of FrameIdConverter gets an error in LastUnmappedId + // while the original instance doesn't. + FrameIdConverter(const FrameIdConverter &) = delete; + FrameIdConverter &operator=(const FrameIdConverter &) = delete; + + Frame operator()(FrameId Id) { + auto Iter = Map.find(Id); + if (Iter == Map.end()) { + LastUnmappedId = Id; + return Frame(); + } + return detail::DerefIterator<Frame>(Iter); + } +}; + +// A function object that returns a call stack for a given CallStackId. +template <typename MapTy> struct CallStackIdConverter { + std::optional<CallStackId> LastUnmappedId; + MapTy ⤅ + llvm::function_ref<Frame(FrameId)> FrameIdToFrame; + + CallStackIdConverter() = delete; + CallStackIdConverter(MapTy &Map, + llvm::function_ref<Frame(FrameId)> FrameIdToFrame) + : Map(Map), FrameIdToFrame(FrameIdToFrame) {} + + // Delete the copy constructor and copy assignment operator to avoid a + // situation where a copy of CallStackIdConverter gets an error in + // LastUnmappedId while the original instance doesn't. + CallStackIdConverter(const CallStackIdConverter &) = delete; + CallStackIdConverter &operator=(const CallStackIdConverter &) = delete; + + std::vector<Frame> operator()(CallStackId CSId) { + std::vector<Frame> Frames; + auto CSIter = Map.find(CSId); + if (CSIter == Map.end()) { + LastUnmappedId = CSId; + } else { + llvm::SmallVector<FrameId> CS = + detail::DerefIterator<llvm::SmallVector<FrameId>>(CSIter); + Frames.reserve(CS.size()); + for (FrameId Id : CS) + Frames.push_back(FrameIdToFrame(Id)); + } + return Frames; + } +}; + +// A function object that returns a Frame stored at a given index into the Frame +// array in the profile. +struct LinearFrameIdConverter { + const unsigned char *FrameBase; + + LinearFrameIdConverter() = delete; + LinearFrameIdConverter(const unsigned char *FrameBase) + : FrameBase(FrameBase) {} + + Frame operator()(LinearFrameId LinearId) { + uint64_t Offset = static_cast<uint64_t>(LinearId) * Frame::serializedSize(); + return Frame::deserialize(FrameBase + Offset); + } +}; + +// A function object that returns a call stack stored at a given index into the +// call stack array in the profile. +struct LinearCallStackIdConverter { + const unsigned char *CallStackBase; + llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame; + + LinearCallStackIdConverter() = delete; + LinearCallStackIdConverter( + const unsigned char *CallStackBase, + llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame) + : CallStackBase(CallStackBase), FrameIdToFrame(FrameIdToFrame) {} + + std::vector<Frame> operator()(LinearCallStackId LinearCSId) { + std::vector<Frame> Frames; + + const unsigned char *Ptr = + CallStackBase + + static_cast<uint64_t>(LinearCSId) * sizeof(LinearFrameId); + uint32_t NumFrames = + support::endian::readNext<uint32_t, llvm::endianness::little>(Ptr); + Frames.reserve(NumFrames); + for (; NumFrames; --NumFrames) { + LinearFrameId Elem = + support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); + // Follow a pointer to the parent, if any. See comments below on + // CallStackRadixTreeBuilder for the description of the radix tree format. + if (static_cast<std::make_signed_t<LinearFrameId>>(Elem) < 0) { + Ptr += (-Elem) * sizeof(LinearFrameId); + Elem = + support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); + } + // We shouldn't encounter another pointer. + assert(static_cast<std::make_signed_t<LinearFrameId>>(Elem) >= 0); + Frames.push_back(FrameIdToFrame(Elem)); + Ptr += sizeof(LinearFrameId); + } + + return Frames; + } +}; + +// Used to extract caller-callee pairs from the call stack array. The leaf +// frame is assumed to call a heap allocation function with GUID 0. The +// resulting pairs are accumulated in CallerCalleePairs. Users can take it +// with: +// +// auto Pairs = std::move(Extractor.CallerCalleePairs); +struct CallerCalleePairExtractor { + // The base address of the radix tree array. + const unsigned char *CallStackBase; + // A functor to convert a linear FrameId to a Frame. + llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame; + // A map from caller GUIDs to lists of call sites in respective callers. + DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> CallerCalleePairs; + + // The set of linear call stack IDs that we've visited. + BitVector Visited; + + CallerCalleePairExtractor() = delete; + CallerCalleePairExtractor( + const unsigned char *CallStackBase, + llvm::function_ref<Frame(LinearFrameId)> FrameIdToFrame, + unsigned RadixTreeSize) + : CallStackBase(CallStackBase), FrameIdToFrame(FrameIdToFrame), + Visited(RadixTreeSize) {} + + void operator()(LinearCallStackId LinearCSId) { + const unsigned char *Ptr = + CallStackBase + + static_cast<uint64_t>(LinearCSId) * sizeof(LinearFrameId); + uint32_t NumFrames = + support::endian::readNext<uint32_t, llvm::endianness::little>(Ptr); + // The leaf frame calls a function with GUID 0. + uint64_t CalleeGUID = 0; + for (; NumFrames; --NumFrames) { + LinearFrameId Elem = + support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); + // Follow a pointer to the parent, if any. See comments below on + // CallStackRadixTreeBuilder for the description of the radix tree format. + if (static_cast<std::make_signed_t<LinearFrameId>>(Elem) < 0) { + Ptr += (-Elem) * sizeof(LinearFrameId); + Elem = + support::endian::read<LinearFrameId, llvm::endianness::little>(Ptr); + } + // We shouldn't encounter another pointer. + assert(static_cast<std::make_signed_t<LinearFrameId>>(Elem) >= 0); + + // Add a new caller-callee pair. + Frame F = FrameIdToFrame(Elem); + uint64_t CallerGUID = F.Function; + LineLocation Loc(F.LineOffset, F.Column); + CallerCalleePairs[CallerGUID].emplace_back(Loc, CalleeGUID); + + // Keep track of the indices we've visited. If we've already visited the + // current one, terminate the traversal. We will not discover any new + // caller-callee pair by continuing the traversal. + unsigned Offset = + std::distance(CallStackBase, Ptr) / sizeof(LinearFrameId); + if (Visited.test(Offset)) + break; + Visited.set(Offset); + + Ptr += sizeof(LinearFrameId); + CalleeGUID = CallerGUID; + } + } +}; + + +// A convenience wrapper around FrameIdConverter and CallStackIdConverter for +// tests. +struct IndexedCallstackIdConverter { + IndexedCallstackIdConverter() = delete; + IndexedCallstackIdConverter(IndexedMemProfData &MemProfData) + : FrameIdConv(MemProfData.Frames), + CSIdConv(MemProfData.CallStacks, FrameIdConv) {} + + // Delete the copy constructor and copy assignment operator to avoid a + // situation where a copy of IndexedCallstackIdConverter gets an error in + // LastUnmappedId while the original instance doesn't. + IndexedCallstackIdConverter(const IndexedCallstackIdConverter &) = delete; + IndexedCallstackIdConverter & + operator=(const IndexedCallstackIdConverter &) = delete; + + std::vector<Frame> operator()(CallStackId CSId) { return CSIdConv(CSId); } + + FrameIdConverter<decltype(IndexedMemProfData::Frames)> FrameIdConv; + CallStackIdConverter<decltype(IndexedMemProfData::CallStacks)> CSIdConv; +}; + +struct FrameStat { + // The number of occurrences of a given FrameId. + uint64_t Count = 0; + // The sum of indexes where a given FrameId shows up. + uint64_t PositionSum = 0; +}; + +// Compute a histogram of Frames in call stacks. +template <typename FrameIdTy> +llvm::DenseMap<FrameIdTy, FrameStat> +computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> + &MemProfCallStackData); + +// Construct a radix tree of call stacks. +// +// A set of call stacks might look like: +// +// CallStackId 1: f1 -> f2 -> f3 +// CallStackId 2: f1 -> f2 -> f4 -> f5 +// CallStackId 3: f1 -> f2 -> f4 -> f6 +// CallStackId 4: f7 -> f8 -> f9 +// +// where each fn refers to a stack frame. +// +// Since we expect a lot of common prefixes, we can compress the call stacks +// into a radix tree like: +// +// CallStackId 1: f1 -> f2 -> f3 +// | +// CallStackId 2: +---> f4 -> f5 +// | +// CallStackId 3: +---> f6 +// +// CallStackId 4: f7 -> f8 -> f9 +// +// Now, we are interested in retrieving call stacks for a given CallStackId, so +// we just need a pointer from a given call stack to its parent. For example, +// CallStackId 2 would point to CallStackId 1 as a parent. +// +// We serialize the radix tree above into a single array along with the length +// of each call stack and pointers to the parent call stacks. +// +// Index: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 +// Array: L3 f9 f8 f7 L4 f6 J3 L4 f5 f4 J3 L3 f3 f2 f1 +// ^ ^ ^ ^ +// | | | | +// CallStackId 4: 0 --+ | | | +// CallStackId 3: 4 --------------+ | | +// CallStackId 2: 7 -----------------------+ | +// CallStackId 1: 11 -----------------------------------+ +// +// - LN indicates the length of a call stack, encoded as ordinary integer N. +// +// - JN indicates a pointer to the parent, encoded as -N. +// +// The radix tree allows us to reconstruct call stacks in the leaf-to-root +// order as we scan the array from left ro right while following pointers to +// parents along the way. +// +// For example, if we are decoding CallStackId 2, we start a forward traversal +// at Index 7, noting the call stack length of 4 and obtaining f5 and f4. When +// we see J3 at Index 10, we resume a forward traversal at Index 13 = 10 + 3, +// picking up f2 and f1. We are done after collecting 4 frames as indicated at +// the beginning of the traversal. +// +// On-disk IndexedMemProfRecord will refer to call stacks by their indexes into +// the radix tree array, so we do not explicitly encode mappings like: +// "CallStackId 1 -> 11". +template <typename FrameIdTy> class CallStackRadixTreeBuilder { + // The radix tree array. + std::vector<LinearFrameId> RadixArray; + + // Mapping from CallStackIds to indexes into RadixArray. + llvm::DenseMap<CallStackId, LinearCallStackId> CallStackPos; + + // In build, we partition a given call stack into two parts -- the prefix + // that's common with the previously encoded call stack and the frames beyond + // the common prefix -- the unique portion. Then we want to find out where + // the common prefix is stored in RadixArray so that we can link the unique + // portion to the common prefix. Indexes, declared below, helps with our + // needs. Intuitively, Indexes tells us where each of the previously encoded + // call stack is stored in RadixArray. More formally, Indexes satisfies: + // + // RadixArray[Indexes[I]] == Prev[I] + // + // for every I, where Prev is the the call stack in the root-to-leaf order + // previously encoded by build. (Note that Prev, as passed to + // encodeCallStack, is in the leaf-to-root order.) + // + // For example, if the call stack being encoded shares 5 frames at the root of + // the call stack with the previously encoded call stack, + // RadixArray[Indexes[0]] is the root frame of the common prefix. + // RadixArray[Indexes[5 - 1]] is the last frame of the common prefix. + std::vector<LinearCallStackId> Indexes; + + using CSIdPair = std::pair<CallStackId, llvm::SmallVector<FrameIdTy>>; + + // Encode a call stack into RadixArray. Return the starting index within + // RadixArray. + LinearCallStackId encodeCallStack( + const llvm::SmallVector<FrameIdTy> *CallStack, + const llvm::SmallVector<FrameIdTy> *Prev, + const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes); + +public: + CallStackRadixTreeBuilder() = default; + + // Build a radix tree array. + void + build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> + &&MemProfCallStackData, + const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, + llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram); + + ArrayRef<LinearFrameId> getRadixArray() const { return RadixArray; } + + llvm::DenseMap<CallStackId, LinearCallStackId> takeCallStackPos() { + return std::move(CallStackPos); + } +}; +} // namespace memprof +} // namespace llvm +#endif // LLVM_PROFILEDATA_MEMPROFRADIXTREE_H diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h index 29d9e57cae3e3..9aa55554fdf72 100644 --- a/llvm/include/llvm/ProfileData/MemProfReader.h +++ b/llvm/include/llvm/ProfileData/MemProfReader.h @@ -22,8 +22,8 @@ #include "llvm/Object/Binary.h" #include "llvm/Object/ObjectFile.h" #include "llvm/ProfileData/InstrProfReader.h" -#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/MemProfData.inc" +#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 1a15c5120d3fd..f8748babb1625 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -62,6 +62,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Object/IRSymtab.h" #include "llvm/ProfileData/MemProf.h" +#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/ProfileData/CMakeLists.txt b/llvm/lib/ProfileData/CMakeLists.txt index 67a69d7761b2c..ca9ea3205ee1d 100644 --- a/llvm/lib/ProfileData/CMakeLists.txt +++ b/llvm/lib/ProfileData/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_component_library(LLVMProfileData ItaniumManglingCanonicalizer.cpp MemProf.cpp MemProfReader.cpp + MemProfRadixTree.cpp PGOCtxProfReader.cpp PGOCtxProfWriter.cpp ProfileSummaryBuilder.cpp diff --git a/llvm/lib/ProfileData/IndexedMemProfData.cpp b/llvm/lib/ProfileData/IndexedMemProfData.cpp index 3d20f7a7a5778..59e59720179af 100644 --- a/llvm/lib/ProfileData/IndexedMemProfData.cpp +++ b/llvm/lib/ProfileData/IndexedMemProfData.cpp @@ -13,6 +13,7 @@ #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/MemProf.h" +#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/OnDiskHashTable.h" diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index e5ec016c91bb0..cb50809f25e40 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -18,7 +18,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" -#include "llvm/ProfileData/MemProf.h" +//#include "llvm/ProfileData/MemProf.h" +#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/SymbolRemappingReader.h" #include "llvm/Support/Endian.h" diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index e497bbff67d2e..a9c5ee09a6daf 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -395,240 +395,5 @@ CallStackId IndexedMemProfData::hashCallStack(ArrayRef<FrameId> CS) const { std::memcpy(&CSId, Hash.data(), sizeof(Hash)); return CSId; } - -// Encode a call stack into RadixArray. Return the starting index within -// RadixArray. For each call stack we encode, we emit two or three components -// into RadixArray. If a given call stack doesn't have a common prefix relative -// to the previous one, we emit: -// -// - the frames in the given call stack in the root-to-leaf order -// -// - the length of the given call stack -// -// If a given call stack has a non-empty common prefix relative to the previous -// one, we emit: -// -// - the relative location of the common prefix, encoded as a negative number. -// -// - a portion of the given call stack that's beyond the common prefix -// -// - the length of the given call stack, including the length of the common -// prefix. -// -// The resulting RadixArray requires a somewhat unintuitive backward traversal -// to reconstruct a call stack -- read the call stack length and scan backward -// while collecting frames in the leaf to root order. build, the caller of this -// function, reverses RadixArray in place so that we can reconstruct a call -// stack as if we were deserializing an array in a typical way -- the call stack -// length followed by the frames in the leaf-to-root order except that we need -// to handle pointers to parents along the way. -// -// To quickly determine the location of the common prefix within RadixArray, -// Indexes caches the indexes of the previous call stack's frames within -// RadixArray. -template <typename FrameIdTy> -LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack( - const llvm::SmallVector<FrameIdTy> *CallStack, - const llvm::SmallVector<FrameIdTy> *Prev, - const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) { - // Compute the length of the common root prefix between Prev and CallStack. - uint32_t CommonLen = 0; - if (Prev) { - auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(), - CallStack->rend()); - CommonLen = std::distance(CallStack->rbegin(), Pos.second); - } - - // Drop the portion beyond CommonLen. - assert(CommonLen <= Indexes.size()); - Indexes.resize(CommonLen); - - // Append a pointer to the parent. - if (CommonLen) { - uint32_t CurrentIndex = RadixArray.size(); - uint32_t ParentIndex = Indexes.back(); - // The offset to the parent must be negative because we are pointing to an - // element we've already added to RadixArray. - assert(ParentIndex < CurrentIndex); - RadixArray.push_back(ParentIndex - CurrentIndex); - } - - // Copy the part of the call stack beyond the common prefix to RadixArray. - assert(CommonLen <= CallStack->size()); - for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) { - // Remember the index of F in RadixArray. - Indexes.push_back(RadixArray.size()); - RadixArray.push_back( - MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F); - } - assert(CallStack->size() == Indexes.size()); - - // End with the call stack length. - RadixArray.push_back(CallStack->size()); - - // Return the index within RadixArray where we can start reconstructing a - // given call stack from. - return RadixArray.size() - 1; -} - -template <typename FrameIdTy> -void CallStackRadixTreeBuilder<FrameIdTy>::build( - llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> - &&MemProfCallStackData, - const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, - llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) { - // Take the vector portion of MemProfCallStackData. The vector is exactly - // what we need to sort. Also, we no longer need its lookup capability. - llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector(); - - // Return early if we have no work to do. - if (CallStacks.empty()) { - RadixArray.clear(); - CallStackPos.clear(); - return; - } - - // Sorting the list of call stacks in the dictionary order is sufficient to - // maximize the length of the common prefix between two adjacent call stacks - // and thus minimize the length of RadixArray. However, we go one step - // further and try to reduce the number of times we follow pointers to parents - // during deserilization. Consider a poorly encoded radix tree: - // - // CallStackId 1: f1 -> f2 -> f3 - // | - // CallStackId 2: +--- f4 -> f5 - // | - // CallStackId 3: +--> f6 - // - // Here, f2 and f4 appear once and twice, respectively, in the call stacks. - // Once we encode CallStackId 1 into RadixArray, every other call stack with - // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3 - // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to - // parents twice. - // - // We try to alleviate the situation by sorting the list of call stacks by - // comparing the popularity of frames rather than the integer values of - // FrameIds. In the example above, f4 is more popular than f2, so we sort the - // call stacks and encode them as: - // - // CallStackId 2: f1 -- f4 -> f5 - // | | - // CallStackId 3: | +--> f6 - // | - // CallStackId 1: +--> f2 -> f3 - // - // Notice that CallStackId 3 follows a pointer to a parent only once. - // - // All this is a quick-n-dirty trick to reduce the number of jumps. The - // proper way would be to compute the weight of each radix tree node -- how - // many call stacks use a given radix tree node, and encode a radix tree from - // the heaviest node first. We do not do so because that's a lot of work. - llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) { - // Call stacks are stored from leaf to root. Perform comparisons from the - // root. - return std::lexicographical_compare( - L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(), - [&](FrameIdTy F1, FrameIdTy F2) { - uint64_t H1 = FrameHistogram[F1].Count; - uint64_t H2 = FrameHistogram[F2].Count; - // Popular frames should come later because we encode call stacks from - // the last one in the list. - if (H1 != H2) - return H1 < H2; - // For sort stability. - return F1 < F2; - }); - }); - - // Reserve some reasonable amount of storage. - RadixArray.clear(); - RadixArray.reserve(CallStacks.size() * 8); - - // Indexes will grow as long as the longest call stack. - Indexes.clear(); - Indexes.reserve(512); - - // CallStackPos will grow to exactly CallStacks.size() entries. - CallStackPos.clear(); - CallStackPos.reserve(CallStacks.size()); - - // Compute the radix array. We encode one call stack at a time, computing the - // longest prefix that's shared with the previous call stack we encode. For - // each call stack we encode, we remember a mapping from CallStackId to its - // position within RadixArray. - // - // As an optimization, we encode from the last call stack in CallStacks to - // reduce the number of times we follow pointers to the parents. Consider the - // list of call stacks that has been sorted in the dictionary order: - // - // Call Stack 1: F1 - // Call Stack 2: F1 -> F2 - // Call Stack 3: F1 -> F2 -> F3 - // - // If we traversed CallStacks in the forward order, we would end up with a - // radix tree like: - // - // Call Stack 1: F1 - // | - // Call Stack 2: +---> F2 - // | - // Call Stack 3: +---> F3 - // - // Notice that each call stack jumps to the previous one. However, if we - // traverse CallStacks in the reverse order, then Call Stack 3 has the - // complete call stack encoded without any pointers. Call Stack 1 and 2 point - // to appropriate prefixes of Call Stack 3. - const llvm::SmallVector<FrameIdTy> *Prev = nullptr; - for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) { - LinearCallStackId Pos = - encodeCallStack(&CallStack, Prev, MemProfFrameIndexes); - CallStackPos.insert({CSId, Pos}); - Prev = &CallStack; - } - - // "RadixArray.size() - 1" below is problematic if RadixArray is empty. - assert(!RadixArray.empty()); - - // Reverse the radix array in place. We do so mostly for intuitive - // deserialization where we would read the length field and then the call - // stack frames proper just like any other array deserialization, except - // that we have occasional jumps to take advantage of prefixes. - for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J) - std::swap(RadixArray[I], RadixArray[J]); - - // "Reverse" the indexes stored in CallStackPos. - for (auto &[K, V] : CallStackPos) - V = RadixArray.size() - 1 - V; -} - -// Explicitly instantiate class with the utilized FrameIdTy. -template class CallStackRadixTreeBuilder<FrameId>; -template class CallStackRadixTreeBuilder<LinearFrameId>; - -template <typename FrameIdTy> -llvm::DenseMap<FrameIdTy, FrameStat> -computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> - &MemProfCallStackData) { - llvm::DenseMap<FrameIdTy, FrameStat> Histogram; - - for (const auto &KV : MemProfCallStackData) { - const auto &CS = KV.second; - for (unsigned I = 0, E = CS.size(); I != E; ++I) { - auto &S = Histogram[CS[I]]; - ++S.Count; - S.PositionSum += I; - } - } - return Histogram; -} - -// Explicitly instantiate function with the utilized FrameIdTy. -template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>( - llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> - &MemProfCallStackData); -template llvm::DenseMap<LinearFrameId, FrameStat> -computeFrameHistogram<LinearFrameId>( - llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> - &MemProfCallStackData); } // namespace memprof } // namespace llvm diff --git a/llvm/lib/ProfileData/MemProfRadixTree.cpp b/llvm/lib/ProfileData/MemProfRadixTree.cpp new file mode 100644 index 0000000000000..5ef357efdeffd --- /dev/null +++ b/llvm/lib/ProfileData/MemProfRadixTree.cpp @@ -0,0 +1,253 @@ +//===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file contains logic that implements a space efficient radix tree +// encoding for callstacks used by MemProf. +// +//===----------------------------------------------------------------------===// + + +#include "llvm/ProfileData/MemProfRadixTree.h" + +namespace llvm { +namespace memprof { +// Encode a call stack into RadixArray. Return the starting index within +// RadixArray. For each call stack we encode, we emit two or three components +// into RadixArray. If a given call stack doesn't have a common prefix relative +// to the previous one, we emit: +// +// - the frames in the given call stack in the root-to-leaf order +// +// - the length of the given call stack +// +// If a given call stack has a non-empty common prefix relative to the previous +// one, we emit: +// +// - the relative location of the common prefix, encoded as a negative number. +// +// - a portion of the given call stack that's beyond the common prefix +// +// - the length of the given call stack, including the length of the common +// prefix. +// +// The resulting RadixArray requires a somewhat unintuitive backward traversal +// to reconstruct a call stack -- read the call stack length and scan backward +// while collecting frames in the leaf to root order. build, the caller of this +// function, reverses RadixArray in place so that we can reconstruct a call +// stack as if we were deserializing an array in a typical way -- the call stack +// length followed by the frames in the leaf-to-root order except that we need +// to handle pointers to parents along the way. +// +// To quickly determine the location of the common prefix within RadixArray, +// Indexes caches the indexes of the previous call stack's frames within +// RadixArray. +template <typename FrameIdTy> +LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack( + const llvm::SmallVector<FrameIdTy> *CallStack, + const llvm::SmallVector<FrameIdTy> *Prev, + const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) { + // Compute the length of the common root prefix between Prev and CallStack. + uint32_t CommonLen = 0; + if (Prev) { + auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(), + CallStack->rend()); + CommonLen = std::distance(CallStack->rbegin(), Pos.second); + } + + // Drop the portion beyond CommonLen. + assert(CommonLen <= Indexes.size()); + Indexes.resize(CommonLen); + + // Append a pointer to the parent. + if (CommonLen) { + uint32_t CurrentIndex = RadixArray.size(); + uint32_t ParentIndex = Indexes.back(); + // The offset to the parent must be negative because we are pointing to an + // element we've already added to RadixArray. + assert(ParentIndex < CurrentIndex); + RadixArray.push_back(ParentIndex - CurrentIndex); + } + + // Copy the part of the call stack beyond the common prefix to RadixArray. + assert(CommonLen <= CallStack->size()); + for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) { + // Remember the index of F in RadixArray. + Indexes.push_back(RadixArray.size()); + RadixArray.push_back( + MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F); + } + assert(CallStack->size() == Indexes.size()); + + // End with the call stack length. + RadixArray.push_back(CallStack->size()); + + // Return the index within RadixArray where we can start reconstructing a + // given call stack from. + return RadixArray.size() - 1; +} + +template <typename FrameIdTy> +void CallStackRadixTreeBuilder<FrameIdTy>::build( + llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> + &&MemProfCallStackData, + const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, + llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) { + // Take the vector portion of MemProfCallStackData. The vector is exactly + // what we need to sort. Also, we no longer need its lookup capability. + llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector(); + + // Return early if we have no work to do. + if (CallStacks.empty()) { + RadixArray.clear(); + CallStackPos.clear(); + return; + } + + // Sorting the list of call stacks in the dictionary order is sufficient to + // maximize the length of the common prefix between two adjacent call stacks + // and thus minimize the length of RadixArray. However, we go one step + // further and try to reduce the number of times we follow pointers to parents + // during deserilization. Consider a poorly encoded radix tree: + // + // CallStackId 1: f1 -> f2 -> f3 + // | + // CallStackId 2: +--- f4 -> f5 + // | + // CallStackId 3: +--> f6 + // + // Here, f2 and f4 appear once and twice, respectively, in the call stacks. + // Once we encode CallStackId 1 into RadixArray, every other call stack with + // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3 + // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to + // parents twice. + // + // We try to alleviate the situation by sorting the list of call stacks by + // comparing the popularity of frames rather than the integer values of + // FrameIds. In the example above, f4 is more popular than f2, so we sort the + // call stacks and encode them as: + // + // CallStackId 2: f1 -- f4 -> f5 + // | | + // CallStackId 3: | +--> f6 + // | + // CallStackId 1: +--> f2 -> f3 + // + // Notice that CallStackId 3 follows a pointer to a parent only once. + // + // All this is a quick-n-dirty trick to reduce the number of jumps. The + // proper way would be to compute the weight of each radix tree node -- how + // many call stacks use a given radix tree node, and encode a radix tree from + // the heaviest node first. We do not do so because that's a lot of work. + llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) { + // Call stacks are stored from leaf to root. Perform comparisons from the + // root. + return std::lexicographical_compare( + L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(), + [&](FrameIdTy F1, FrameIdTy F2) { + uint64_t H1 = FrameHistogram[F1].Count; + uint64_t H2 = FrameHistogram[F2].Count; + // Popular frames should come later because we encode call stacks from + // the last one in the list. + if (H1 != H2) + return H1 < H2; + // For sort stability. + return F1 < F2; + }); + }); + + // Reserve some reasonable amount of storage. + RadixArray.clear(); + RadixArray.reserve(CallStacks.size() * 8); + + // Indexes will grow as long as the longest call stack. + Indexes.clear(); + Indexes.reserve(512); + + // CallStackPos will grow to exactly CallStacks.size() entries. + CallStackPos.clear(); + CallStackPos.reserve(CallStacks.size()); + + // Compute the radix array. We encode one call stack at a time, computing the + // longest prefix that's shared with the previous call stack we encode. For + // each call stack we encode, we remember a mapping from CallStackId to its + // position within RadixArray. + // + // As an optimization, we encode from the last call stack in CallStacks to + // reduce the number of times we follow pointers to the parents. Consider the + // list of call stacks that has been sorted in the dictionary order: + // + // Call Stack 1: F1 + // Call Stack 2: F1 -> F2 + // Call Stack 3: F1 -> F2 -> F3 + // + // If we traversed CallStacks in the forward order, we would end up with a + // radix tree like: + // + // Call Stack 1: F1 + // | + // Call Stack 2: +---> F2 + // | + // Call Stack 3: +---> F3 + // + // Notice that each call stack jumps to the previous one. However, if we + // traverse CallStacks in the reverse order, then Call Stack 3 has the + // complete call stack encoded without any pointers. Call Stack 1 and 2 point + // to appropriate prefixes of Call Stack 3. + const llvm::SmallVector<FrameIdTy> *Prev = nullptr; + for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) { + LinearCallStackId Pos = + encodeCallStack(&CallStack, Prev, MemProfFrameIndexes); + CallStackPos.insert({CSId, Pos}); + Prev = &CallStack; + } + + // "RadixArray.size() - 1" below is problematic if RadixArray is empty. + assert(!RadixArray.empty()); + + // Reverse the radix array in place. We do so mostly for intuitive + // deserialization where we would read the length field and then the call + // stack frames proper just like any other array deserialization, except + // that we have occasional jumps to take advantage of prefixes. + for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J) + std::swap(RadixArray[I], RadixArray[J]); + + // "Reverse" the indexes stored in CallStackPos. + for (auto &[K, V] : CallStackPos) + V = RadixArray.size() - 1 - V; +} + +// Explicitly instantiate class with the utilized FrameIdTy. +template class CallStackRadixTreeBuilder<FrameId>; +template class CallStackRadixTreeBuilder<LinearFrameId>; + +template <typename FrameIdTy> +llvm::DenseMap<FrameIdTy, FrameStat> +computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> + &MemProfCallStackData) { + llvm::DenseMap<FrameIdTy, FrameStat> Histogram; + + for (const auto &KV : MemProfCallStackData) { + const auto &CS = KV.second; + for (unsigned I = 0, E = CS.size(); I != E; ++I) { + auto &S = Histogram[CS[I]]; + ++S.Count; + S.PositionSum += I; + } + } + return Histogram; +} + +// Explicitly instantiate function with the utilized FrameIdTy. +template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>( + llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> + &MemProfCallStackData); +template llvm::DenseMap<LinearFrameId, FrameStat> +computeFrameHistogram<LinearFrameId>( + llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> + &MemProfCallStackData); +} // namespace memprof +} // namespace llvm diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index a0bd41bccf928..9f1caae296cca 100644 --- a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -15,6 +15,7 @@ #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" #include "llvm/ProfileData/MemProf.h" +#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/ProfileData/MemProfData.inc" #include "llvm/Support/Compression.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index b1937992ea7f4..a072dee26d9a0 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ProfileData/MemProf.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLForwardCompat.h" @@ -14,8 +13,10 @@ #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/IR/Value.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/MemProfData.inc" #include "llvm/ProfileData/MemProfReader.h" +#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/ProfileData/MemProfYAML.h" #include "llvm/Support/raw_ostream.h" #include "gmock/gmock.h" >From 1efadfb086c9886a4a469e41753224fc76aa3162 Mon Sep 17 00:00:00 2001 From: Snehasish Kumar <snehasi...@google.com> Date: Mon, 19 May 2025 15:08:03 -0700 Subject: [PATCH 2/2] Fix formatting. --- llvm/include/llvm/ProfileData/MemProfRadixTree.h | 3 +-- llvm/lib/ProfileData/InstrProfReader.cpp | 2 +- llvm/lib/ProfileData/MemProfRadixTree.cpp | 3 +-- llvm/unittests/ProfileData/InstrProfTest.cpp | 2 +- llvm/unittests/ProfileData/MemProfTest.cpp | 4 ++-- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/ProfileData/MemProfRadixTree.h b/llvm/include/llvm/ProfileData/MemProfRadixTree.h index 9abf9f7a8774b..7afa660885400 100644 --- a/llvm/include/llvm/ProfileData/MemProfRadixTree.h +++ b/llvm/include/llvm/ProfileData/MemProfRadixTree.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// A custom Radix Tree builder for memprof data to optimize for space. +// A custom Radix Tree builder for memprof data to optimize for space. // //===----------------------------------------------------------------------===// @@ -211,7 +211,6 @@ struct CallerCalleePairExtractor { } }; - // A convenience wrapper around FrameIdConverter and CallStackIdConverter for // tests. struct IndexedCallstackIdConverter { diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index cb50809f25e40..7518f46bd2347 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -18,7 +18,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" -//#include "llvm/ProfileData/MemProf.h" +// #include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/SymbolRemappingReader.h" diff --git a/llvm/lib/ProfileData/MemProfRadixTree.cpp b/llvm/lib/ProfileData/MemProfRadixTree.cpp index 5ef357efdeffd..ec3ff2e721e2e 100644 --- a/llvm/lib/ProfileData/MemProfRadixTree.cpp +++ b/llvm/lib/ProfileData/MemProfRadixTree.cpp @@ -5,12 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// This file contains logic that implements a space efficient radix tree +// This file contains logic that implements a space efficient radix tree // encoding for callstacks used by MemProf. // //===----------------------------------------------------------------------===// - #include "llvm/ProfileData/MemProfRadixTree.h" namespace llvm { diff --git a/llvm/unittests/ProfileData/InstrProfTest.cpp b/llvm/unittests/ProfileData/InstrProfTest.cpp index 9f1caae296cca..439c602672718 100644 --- a/llvm/unittests/ProfileData/InstrProfTest.cpp +++ b/llvm/unittests/ProfileData/InstrProfTest.cpp @@ -15,8 +15,8 @@ #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" #include "llvm/ProfileData/MemProf.h" -#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/ProfileData/MemProfData.inc" +#include "llvm/ProfileData/MemProfRadixTree.h" #include "llvm/Support/Compression.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Testing/Support/Error.h" diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index a072dee26d9a0..26b09698c7ea3 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ProfileData/MemProf.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLForwardCompat.h" @@ -13,10 +14,9 @@ #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/IR/Value.h" #include "llvm/Object/ObjectFile.h" -#include "llvm/ProfileData/MemProf.h" #include "llvm/ProfileData/MemProfData.inc" -#include "llvm/ProfileData/MemProfReader.h" #include "llvm/ProfileData/MemProfRadixTree.h" +#include "llvm/ProfileData/MemProfReader.h" #include "llvm/ProfileData/MemProfYAML.h" #include "llvm/Support/raw_ostream.h" #include "gmock/gmock.h" _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits