================
@@ -1208,6 +1321,353 @@ void 
InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
   Inc->eraseFromParent();
 }
 
+// Lowers an InstrProfIncrementInst for AMDGPU to per-wave aggregated counter
+// updates. It computes a "slot" index based on block and warp-local indices,
+// elects a leader lane, and increments a counter by (Inc->getStep() *
+// number_of_active_lanes) to reflect that only one lane performs the update on
+// behalf of the whole wave.
+//
+// Assumptions:
+// - AMDGPU Wave32 (32 lanes): uses ballot.i32, mbcnt.lo with a full mask, and
+// lane = mbcnt & 31.
+// - OffloadProfilingThreadBitWidth (KSlotBits) >= 5; kWarpBits is 5 for Wave32.
+// - Two modes:
+//   - PatternOverflow: performs a non-atomic RMW, routes to an overflow slot
+//   based on sampling.
+//   - AtomicWarpLeader: only the elected leader performs an atomic add.
+// - Inc->getStep() is an LLVM integer-typed Value (often a constant 1), and
+// may not equal 1.
+// - The increment amount in both modes is Inc->getStep() *
+// popcount(activeMask).
+void InstrLowerer::lowerIncrementAMDGPU(InstrProfIncrementInst *Inc) {
+  IRBuilder<> Builder(Inc);
+  LLVMContext &Context = M.getContext();
+  auto *Int1Ty = Type::getInt1Ty(Context);
+  auto *Int8Ty = Type::getInt8Ty(Context);
+  auto *Int16Ty = Type::getInt16Ty(Context);
+  auto *Int32Ty = Type::getInt32Ty(Context);
+  auto *Int64Ty = Type::getInt64Ty(Context);
+
+  // Constants/configuration
+  const unsigned KSlotBits =
+      OffloadProfilingThreadBitWidth; // must be >= 5 (Wave32)
+  const unsigned KSlots = 1u << KSlotBits;
+  const unsigned KOverflow = KSlots - 1u; // only used in PatternOverflow mode
+  const unsigned KPattern14 = 0x2A3Fu;    // only used in PatternOverflow mode
+  const unsigned kWarpBits = 5u;          // Wave32 lane width
+
+  if (KSlotBits < kWarpBits)
+    report_fatal_error(
+        "OffloadProfilingThreadBitWidth must be >= 5 for wave32");
+
+  // --- Get thread and block identifiers ---
+  FunctionCallee BlockIdxFn =
+      M.getOrInsertFunction("llvm.amdgcn.workgroup.id.x", Int32Ty);
+  Value *BlockIdx = Builder.CreateCall(BlockIdxFn, {}, "BlockIdxX");
+
+  FunctionCallee ThreadIdxFn =
+      M.getOrInsertFunction("llvm.amdgcn.workitem.id.x", Int32Ty);
+  Value *ThreadIdx = Builder.CreateCall(ThreadIdxFn, {}, "ThreadIdxX");
+
+  // --- Get launch-time data from implicit arguments ---
+  FunctionCallee ImplicitArgFn = M.getOrInsertFunction(
+      "llvm.amdgcn.implicitarg.ptr", PointerType::get(Context, 4));
+  Value *ImplicitArgPtr = Builder.CreateCall(ImplicitArgFn, {});
+
+  // gridDim.x (i32) at base
+  Value *GridDimX = Builder.CreateLoad(Int32Ty, ImplicitArgPtr, "GridDimX");
+
+  // blockDim.x (i16) at offset 12
+  Value *BlockDimXAddr = Builder.CreateInBoundsGEP(
+      Int8Ty, ImplicitArgPtr, ConstantInt::get(Int64Ty, 12), "BlockDimXAddr");
+  Value *BlockDimX = Builder.CreateLoad(Int16Ty, BlockDimXAddr, "BlockDimX");
+
+  // --- Optional: 64-bit gid (not used by slot calc, but useful to keep) ---
+  Value *BlockIdx64 = Builder.CreateZExt(BlockIdx, Int64Ty, "BlockIdxX.zext");
+  Value *ThreadIdx64 =
+      Builder.CreateZExt(ThreadIdx, Int64Ty, "ThreadIdxX.zext");
+  Value *BlockDimX64 =
+      Builder.CreateZExt(BlockDimX, Int64Ty, "BlockDimX.zext");
+  Value *Gid = Builder.CreateAdd(Builder.CreateMul(BlockIdx64, BlockDimX64),
+                                 ThreadIdx64, "Gid");
+  (void)Gid;
+
+  // ----------------------------
+  // Common slot computation (Wave32)
+  // ----------------------------
+
+  // lane id via amdgcn.mbcnt.lo (count active lanes below me); lane = & 31
+  auto *MbcntLoTy = FunctionType::get(Int32Ty, {Int32Ty, Int32Ty}, false);
+  FunctionCallee MbcntLoFnByName =
+      M.getOrInsertFunction("llvm.amdgcn.mbcnt.lo", MbcntLoTy);
+  Value *MbcntLo = Builder.CreateCall(
+      MbcntLoFnByName,
+      {ConstantInt::getSigned(Int32Ty, -1), ConstantInt::get(Int32Ty, 0)},
+      "mbcnt.lo");
+  Value *Lane =
+      Builder.CreateAnd(MbcntLo, ConstantInt::get(Int32Ty, 31), "lane");
+
+  // warpLocal = threadIdx.x >> 5
+  Value *WarpLocal = Builder.CreateLShr(
+      ThreadIdx, ConstantInt::get(Int32Ty, kWarpBits), "warpLocal");
+
+  // blockBits = (gridDim.x > 1) ? (32 - ctlz(gridDim.x - 1)) : 1
+  Value *GridGt1 = Builder.CreateICmpUGT(GridDimX, ConstantInt::get(Int32Ty, 1),
+                                         "grid_gt_1");
+  Value *GridDimXMinus1 = Builder.CreateSub(
+      GridDimX, ConstantInt::get(Int32Ty, 1), "gridDimX_minus_1");
+  FunctionCallee CtlzI32Fn =
+      Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ctlz, {Int32Ty});
+  Value *CtlzVal = Builder.CreateCall(
+      CtlzI32Fn, {GridDimXMinus1, Builder.getFalse()}, "ctlz_gridDimX_minus_1");
+  Value *BlockBitsCandidate = Builder.CreateSub(ConstantInt::get(Int32Ty, 32),
+                                                CtlzVal, "blockBits_cand");
+  Value *BlockBits = Builder.CreateSelect(
+      GridGt1, BlockBitsCandidate, ConstantInt::get(Int32Ty, 1), "blockBits");
+
+  // usedForHi = min(blockBits, KSlotBits - kWarpBits)
+  Value *SlotHiBits = ConstantInt::get(Int32Ty, (int)(KSlotBits - kWarpBits));
+  Value *BlockLtSlotHi = Builder.CreateICmpULT(BlockBits, SlotHiBits);
+  Value *UsedForHi =
+      Builder.CreateSelect(BlockLtSlotHi, BlockBits, SlotHiBits, "usedForHi");
+
+  // sampBits = blockBits - usedForHi
+  Value *SampBits = Builder.CreateSub(BlockBits, UsedForHi, "sampBits");
+  Value *SampBitsIsZero = Builder.CreateICmpEQ(
+      SampBits, ConstantInt::get(Int32Ty, 0), "sampBits_is_zero");
----------------
jmmartinez wrote:

```suggestion
  Value *SampBitsIsZero = Builder.CreateIsNull(
      SampBits, "sampBits_is_zero");
```

https://github.com/llvm/llvm-project/pull/177665
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to