llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-bolt

Author: Amir Ayupov (aaupov)

<details>
<summary>Changes</summary>

Pseudo probe matching (#<!-- -->100446) needs callee information for call 
probes.
Embed call probe information (probe id, inline tree node, indirect flag)
into CallSiteInfo. This allows the following:
- Remove call probes from PseudoProbeInfo to avoid duplication, making 
  PseudoProbeInfo only contain block probes.
- This makes probe grouping across inline tree nodes more potent
  and allows us to unambiguously elide block id 1 (the common case).

Probe information can be further compressed (to be added in a follow-up 
diff). To unblock it and simplify reading and writing the profile, remove
block mask encoding optimization which becomes a low ROI optimization.

The size increase is modest, at ~3% for an XL profile (461-&gt;475MB),
to be shrunk by ~6% by compact block probe encoding.

Test Plan: updated pseudoprobe-decoding-{inline,noinline}.test

---
Full diff: https://github.com/llvm/llvm-project/pull/165490.diff


7 Files Affected:

- (modified) bolt/include/bolt/Profile/ProfileYAMLMapping.h (+12-14) 
- (modified) bolt/include/bolt/Profile/YAMLProfileWriter.h (+17-18) 
- (modified) bolt/lib/Profile/DataAggregator.cpp (+6-10) 
- (modified) bolt/lib/Profile/StaleProfileMatching.cpp (+4-20) 
- (modified) bolt/lib/Profile/YAMLProfileWriter.cpp (+58-45) 
- (modified) bolt/test/X86/pseudoprobe-decoding-inline.test (+3-3) 
- (modified) bolt/test/X86/pseudoprobe-decoding-noinline.test (+4-3) 


``````````diff
diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h 
b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
index 41e2bd1651efd..b393c85321b7d 100644
--- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h
+++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
@@ -29,6 +29,10 @@ struct CallSiteInfo {
   uint32_t EntryDiscriminator{0}; /// multiple entry discriminator
   uint64_t Count{0};
   uint64_t Mispreds{0};
+  // Pseudo probe information, optional
+  uint32_t Probe{0};
+  bool Indirect = false;
+  uint32_t InlineTreeNode{0};
 
   bool operator==(const CallSiteInfo &Other) const {
     return Offset == Other.Offset && DestId == Other.DestId &&
@@ -63,6 +67,9 @@ template <> struct MappingTraits<bolt::CallSiteInfo> {
     YamlIO.mapOptional("disc", CSI.EntryDiscriminator, (uint32_t)0);
     YamlIO.mapRequired("cnt", CSI.Count);
     YamlIO.mapOptional("mis", CSI.Mispreds, (uint64_t)0);
+    YamlIO.mapOptional("pp", CSI.Probe, 0);
+    YamlIO.mapOptional("ppn", CSI.InlineTreeNode, 0);
+    YamlIO.mapOptional("ind", CSI.Indirect, false);
   }
 
   static const bool flow = true;
@@ -95,29 +102,20 @@ template <> struct MappingTraits<bolt::SuccessorInfo> {
 
 namespace bolt {
 struct PseudoProbeInfo {
-  uint32_t InlineTreeIndex = 0;
-  uint64_t BlockMask = 0;            // bitset with probe indices from 1 to 64
-  std::vector<uint64_t> BlockProbes; // block probes with indices above 64
-  std::vector<uint64_t> CallProbes;
-  std::vector<uint64_t> IndCallProbes;
+  std::vector<uint64_t> BlockProbes;
   std::vector<uint32_t> InlineTreeNodes;
 
   bool operator==(const PseudoProbeInfo &Other) const {
-    return InlineTreeIndex == Other.InlineTreeIndex &&
-           BlockProbes == Other.BlockProbes && CallProbes == Other.CallProbes 
&&
-           IndCallProbes == Other.IndCallProbes;
+    return InlineTreeNodes == Other.InlineTreeNodes &&
+           BlockProbes == Other.BlockProbes;
   }
 };
 } // end namespace bolt
 
 template <> struct MappingTraits<bolt::PseudoProbeInfo> {
   static void mapping(IO &YamlIO, bolt::PseudoProbeInfo &PI) {
-    YamlIO.mapOptional("blx", PI.BlockMask, 0);
-    YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("call", PI.CallProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("icall", PI.IndCallProbes, std::vector<uint64_t>());
-    YamlIO.mapOptional("id", PI.InlineTreeIndex, 0);
-    YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>());
+    YamlIO.mapOptional("blk", PI.BlockProbes, std::vector<uint64_t>(1, 1));
+    YamlIO.mapOptional("ids", PI.InlineTreeNodes, std::vector<uint32_t>(1, 0));
   }
 
   static const bool flow = true;
diff --git a/bolt/include/bolt/Profile/YAMLProfileWriter.h 
b/bolt/include/bolt/Profile/YAMLProfileWriter.h
index d4d7217464cc8..50ee78d342df8 100644
--- a/bolt/include/bolt/Profile/YAMLProfileWriter.h
+++ b/bolt/include/bolt/Profile/YAMLProfileWriter.h
@@ -74,25 +74,24 @@ class YAMLProfileWriter {
   collectInlineTree(const MCPseudoProbeDecoder &Decoder,
                     const MCDecodedPseudoProbeInlineTree &Root);
 
-  // 0 - block probe, 1 - indirect call, 2 - direct call
-  using ProbeList = std::array<SmallVector<uint64_t, 0>, 3>;
-  using NodeIdToProbes = DenseMap<uint32_t, ProbeList>;
-  static std::vector<yaml::bolt::PseudoProbeInfo>
-  convertNodeProbes(NodeIdToProbes &NodeProbes);
-
 public:
-  template <typename T>
-  static std::vector<yaml::bolt::PseudoProbeInfo>
-  writeBlockProbes(T Probes, const InlineTreeMapTy &InlineTreeNodeId) {
-    NodeIdToProbes NodeProbes;
-    for (const MCDecodedPseudoProbe &Probe : Probes) {
-      auto It = InlineTreeNodeId.find(Probe.getInlineTreeNode());
-      if (It == InlineTreeNodeId.end())
-        continue;
-      NodeProbes[It->second][Probe.getType()].emplace_back(Probe.getIndex());
-    }
-    return convertNodeProbes(NodeProbes);
-  }
+  class BlockProbeCtx {
+    struct Call {
+      uint64_t Id;
+      uint32_t Node;
+      bool Indirect;
+      bool Used;
+    };
+    // Group block probes by node id.
+    DenseMap<uint32_t, std::vector<uint64_t>> NodeToProbes;
+    // Offset -> call probe
+    DenseMap<uint32_t, Call> CallProbes;
+
+  public:
+    void addBlockProbe(const InlineTreeMapTy &Map,
+                       const MCDecodedPseudoProbe &Probe, uint32_t 
ProbeOffset);
+    void finalize(yaml::bolt::BinaryBasicBlockProfile &YamlBB);
+  };
 };
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Profile/DataAggregator.cpp 
b/bolt/lib/Profile/DataAggregator.cpp
index 3604fdd3a94b4..6a12efdcb96c0 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2383,10 +2383,7 @@ std::error_code 
DataAggregator::writeBATYAML(BinaryContext &BC,
             PseudoProbeDecoder->getAddress2ProbesMap();
         BinaryFunction::FragmentsSetTy Fragments(BF->Fragments);
         Fragments.insert(BF);
-        DenseMap<
-            uint32_t,
-            std::vector<std::reference_wrapper<const MCDecodedPseudoProbe>>>
-            BlockProbes;
+        DenseMap<uint32_t, YAMLProfileWriter::BlockProbeCtx> BlockCtx;
         for (const BinaryFunction *F : Fragments) {
           const uint64_t FuncAddr = F->getAddress();
           for (const MCDecodedPseudoProbe &Probe :
@@ -2394,15 +2391,14 @@ std::error_code 
DataAggregator::writeBATYAML(BinaryContext &BC,
             const uint32_t OutputAddress = Probe.getAddress();
             const uint32_t InputOffset = BAT->translate(
                 FuncAddr, OutputAddress - FuncAddr, /*IsBranchSrc=*/true);
-            const unsigned BlockIndex = getBlock(InputOffset).second;
-            BlockProbes[BlockIndex].emplace_back(Probe);
+            const auto &[BlockOffset, BlockIndex] = getBlock(InputOffset);
+            BlockCtx[BlockIndex].addBlockProbe(InlineTreeNodeId, Probe,
+                                               InputOffset - BlockOffset);
           }
         }
 
-        for (auto &[Block, Probes] : BlockProbes) {
-          YamlBF.Blocks[Block].PseudoProbes =
-              YAMLProfileWriter::writeBlockProbes(Probes, InlineTreeNodeId);
-        }
+        for (auto &[Block, Ctx] : BlockCtx)
+          Ctx.finalize(YamlBF.Blocks[Block]);
       }
       // Skip printing if there's no profile data
       llvm::erase_if(
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp 
b/bolt/lib/Profile/StaleProfileMatching.cpp
index 1a61949d77472..5fb65153cf313 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -348,26 +348,10 @@ class StaleMatcher {
       return It->second;
     };
 
-    auto matchPseudoProbeInfo = [&](const yaml::bolt::PseudoProbeInfo
-                                        &ProfileProbe,
-                                    uint32_t NodeId) {
-      for (uint64_t Index = 0; Index < 64; ++Index)
-        if (ProfileProbe.BlockMask & 1ull << Index)
-          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, Index + 1)];
-      for (const auto &ProfileProbes :
-           {ProfileProbe.BlockProbes, ProfileProbe.IndCallProbes,
-            ProfileProbe.CallProbes})
-        for (uint64_t ProfileProbe : ProfileProbes)
-          ++FlowBlockMatchCount[matchProfileProbeToBlock(NodeId, 
ProfileProbe)];
-    };
-
-    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes) {
-      if (!ProfileProbe.InlineTreeNodes.empty())
-        for (uint32_t ProfileInlineTreeNode : ProfileProbe.InlineTreeNodes)
-          matchPseudoProbeInfo(ProfileProbe, ProfileInlineTreeNode);
-      else
-        matchPseudoProbeInfo(ProfileProbe, ProfileProbe.InlineTreeIndex);
-    }
+    for (const yaml::bolt::PseudoProbeInfo &ProfileProbe : BlockPseudoProbes)
+      for (uint32_t Node : ProfileProbe.InlineTreeNodes)
+        for (uint64_t Probe : ProfileProbe.BlockProbes)
+          ++FlowBlockMatchCount[matchProfileProbeToBlock(Node, Probe)];
     uint32_t BestMatchCount = 0;
     uint32_t TotalMatchCount = 0;
     const FlowBlock *BestMatchBlock = nullptr;
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp 
b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 1632aa1c6bfe2..82f2114a7a3d2 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -129,50 +129,62 @@ YAMLProfileWriter::convertPseudoProbeDesc(const 
MCPseudoProbeDecoder &Decoder) {
   return {Desc, InlineTree};
 }
 
-std::vector<yaml::bolt::PseudoProbeInfo>
-YAMLProfileWriter::convertNodeProbes(NodeIdToProbes &NodeProbes) {
-  struct BlockProbeInfoHasher {
-    size_t operator()(const yaml::bolt::PseudoProbeInfo &BPI) const {
-      return llvm::hash_combine(llvm::hash_combine_range(BPI.BlockProbes),
-                                llvm::hash_combine_range(BPI.CallProbes),
-                                llvm::hash_combine_range(BPI.IndCallProbes));
+void YAMLProfileWriter::BlockProbeCtx::addBlockProbe(
+    const InlineTreeMapTy &Map, const MCDecodedPseudoProbe &Probe,
+    uint32_t ProbeOffset) {
+  auto It = Map.find(Probe.getInlineTreeNode());
+  if (It == Map.end())
+    return;
+  auto NodeId = It->second;
+  uint32_t Index = Probe.getIndex();
+  if (Probe.isCall())
+    CallProbes[ProbeOffset] =
+        Call{Index, NodeId, Probe.isIndirectCall(), false};
+  else
+    NodeToProbes[NodeId].emplace_back(Index);
+}
+
+void YAMLProfileWriter::BlockProbeCtx::finalize(
+    yaml::bolt::BinaryBasicBlockProfile &YamlBB) {
+  // Hash block probes by vector
+  struct ProbeHasher {
+    size_t operator()(const ArrayRef<uint64_t> Probes) const {
+      return llvm::hash_combine_range(Probes);
     }
   };
 
-  // Check identical BlockProbeInfo structs and merge them
-  std::unordered_map<yaml::bolt::PseudoProbeInfo, std::vector<uint32_t>,
-                     BlockProbeInfoHasher>
-      BPIToNodes;
-  for (auto &[NodeId, Probes] : NodeProbes) {
-    yaml::bolt::PseudoProbeInfo BPI;
-    BPI.BlockProbes = std::vector(Probes[0].begin(), Probes[0].end());
-    BPI.IndCallProbes = std::vector(Probes[1].begin(), Probes[1].end());
-    BPI.CallProbes = std::vector(Probes[2].begin(), Probes[2].end());
-    BPIToNodes[BPI].push_back(NodeId);
+  // Check identical block probes and merge them
+  std::unordered_map<std::vector<uint64_t>, std::vector<uint32_t>, ProbeHasher>
+      ProbesToNodes;
+  for (auto &[NodeId, Probes] : NodeToProbes) {
+    llvm::sort(Probes);
+    ProbesToNodes[Probes].emplace_back(NodeId);
   }
-
-  auto handleMask = [](const auto &Ids, auto &Vec, auto &Mask) {
-    for (auto Id : Ids)
-      if (Id > 64)
-        Vec.emplace_back(Id);
-      else
-        Mask |= 1ull << (Id - 1);
-  };
-
-  // Add to YAML with merged nodes/block mask optimizations
-  std::vector<yaml::bolt::PseudoProbeInfo> YamlProbes;
-  YamlProbes.reserve(BPIToNodes.size());
-  for (const auto &[BPI, Nodes] : BPIToNodes) {
-    auto &YamlBPI = YamlProbes.emplace_back(yaml::bolt::PseudoProbeInfo());
-    YamlBPI.CallProbes = BPI.CallProbes;
-    YamlBPI.IndCallProbes = BPI.IndCallProbes;
-    if (Nodes.size() == 1)
-      YamlBPI.InlineTreeIndex = Nodes.front();
-    else
-      YamlBPI.InlineTreeNodes = Nodes;
-    handleMask(BPI.BlockProbes, YamlBPI.BlockProbes, YamlBPI.BlockMask);
+  for (auto &[Probes, Nodes] : ProbesToNodes) {
+    llvm::sort(Nodes);
+    YamlBB.PseudoProbes.emplace_back(
+        yaml::bolt::PseudoProbeInfo{Probes, Nodes});
+  }
+  for (yaml::bolt::CallSiteInfo &CSI : YamlBB.CallSites) {
+    auto It = CallProbes.find(CSI.Offset);
+    if (It == CallProbes.end())
+      continue;
+    Call &Probe = It->second;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    Probe.Used = true;
+  }
+  for (const auto &[Offset, Probe] : CallProbes) {
+    if (Probe.Used)
+      continue;
+    yaml::bolt::CallSiteInfo CSI;
+    CSI.Offset = Offset;
+    CSI.Probe = Probe.Id;
+    CSI.InlineTreeNode = Probe.Node;
+    CSI.Indirect = Probe.Indirect;
+    YamlBB.CallSites.emplace_back(CSI);
   }
-  return YamlProbes;
 }
 
 std::tuple<std::vector<yaml::bolt::InlineTreeNode>,
@@ -343,12 +355,13 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool 
UseDFS,
       const AddressProbesMap &ProbeMap =
           PseudoProbeDecoder->getAddress2ProbesMap();
       const uint64_t FuncAddr = BF.getAddress();
-      const std::pair<uint64_t, uint64_t> &BlockRange =
-          BB->getInputAddressRange();
-      const std::pair<uint64_t, uint64_t> BlockAddrRange = {
-          FuncAddr + BlockRange.first, FuncAddr + BlockRange.second};
-      auto Probes = ProbeMap.find(BlockAddrRange.first, BlockAddrRange.second);
-      YamlBB.PseudoProbes = writeBlockProbes(Probes, InlineTreeNodeId);
+      auto [Start, End] = BB->getInputAddressRange();
+      Start += FuncAddr;
+      End += FuncAddr;
+      BlockProbeCtx Ctx;
+      for (const MCDecodedPseudoProbe &Probe : ProbeMap.find(Start, End))
+        Ctx.addBlockProbe(InlineTreeNodeId, Probe, Probe.getAddress() - Start);
+      Ctx.finalize(YamlBB);
     }
 
     YamlBF.Blocks.emplace_back(YamlBB);
diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test 
b/bolt/test/X86/pseudoprobe-decoding-inline.test
index e5e8aadc18f9e..9748fc1b6a4d4 100644
--- a/bolt/test/X86/pseudoprobe-decoding-inline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-inline.test
@@ -14,17 +14,17 @@
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   probes: [ { blx: 9 } ]
+# CHECK-YAML:   probes: [ { blk: [ 1, 4 ] } ]
 # CHECK-YAML: inline_tree: [ { } ]
 #
 # CHECK-YAML: name: foo
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   probes: [ { blx: 3 } ]
+# CHECK-YAML:   probes: [ { blk: [ 1, 2 ] } ]
 # CHECK-YAML: inline_tree: [ { g: 1 }, { g: 0, cs: 8 } ]
 #
 # CHECK-YAML: name: main
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   probes: [ { blx: 3, id: 1 }, { blx: 1 } ]
+# CHECK-YAML:   probes: [ { blk: [ 1, 2 ], ids: [ 1 ] }, { } ]
 # CHECK-YAML: inline_tree: [ { g: 2 }, { g: 1, cs: 2 }, { g: 0, p: 1, cs: 8 } ]
 #
 # CHECK-YAML: pseudo_probe_desc:
diff --git a/bolt/test/X86/pseudoprobe-decoding-noinline.test 
b/bolt/test/X86/pseudoprobe-decoding-noinline.test
index 36a2fab74e857..4ba51cdc96f9e 100644
--- a/bolt/test/X86/pseudoprobe-decoding-noinline.test
+++ b/bolt/test/X86/pseudoprobe-decoding-noinline.test
@@ -15,17 +15,18 @@
 # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML
 # CHECK-YAML: name: bar
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   probes: [ { blx: 9 } ]
+# CHECK-YAML:   probes: [ { blk: [ 1, 4 ] } ]
 # CHECK-YAML: inline_tree: [ {  } ]
 #
 # CHECK-YAML: name: foo
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   probes: [ { blx: 3 } ]
+# CHECK-YAML:   probes: [ { blk: [ 1, 2 ] } ]
 # CHECK-YAML: inline_tree: [ { g: 2 } ]
 #
 # CHECK-YAML: name: main
 # CHECK-YAML: - bid: 0
-# CHECK-YAML:   probes: [ { blx: 1, call: [ 2 ] } ]
+# CHECK-YAML:   calls: [ { off: 0x4, fid: 0, cnt: 0, pp: 2 } ]
+# CHECK-YAML:   probes: [ { } ]
 # CHECK-YAML: inline_tree: [ { g: 1 } ]
 #
 # CHECK-YAML: pseudo_probe_desc:

``````````

</details>


https://github.com/llvm/llvm-project/pull/165490
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to