================
@@ -1208,6 +1321,353 @@ void
InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
Inc->eraseFromParent();
}
+// Lowers an InstrProfIncrementInst for AMDGPU to per-wave aggregated counter
+// updates. It computes a "slot" index based on block and warp-local indices,
+// elects a leader lane, and increments a counter by (Inc->getStep() *
+// number_of_active_lanes) to reflect that only one lane performs the update on
+// behalf of the whole wave.
+//
+// Assumptions:
+// - AMDGPU Wave32 (32 lanes): uses ballot.i32, mbcnt.lo with a full mask, and
+// lane = mbcnt & 31.
+// - OffloadProfilingThreadBitWidth (KSlotBits) >= 5; kWarpBits is 5 for
Wave32.
+// - Two modes:
+// - PatternOverflow: performs a non-atomic RMW, routes to an overflow slot
+// based on sampling.
+// - AtomicWarpLeader: only the elected leader performs an atomic add.
+// - Inc->getStep() is an LLVM integer-typed Value (often a constant 1), and
may
+// not equal 1.
+// - The increment amount in both modes is Inc->getStep() *
+// popcount(activeMask).
+void InstrLowerer::lowerIncrementAMDGPU(InstrProfIncrementInst *Inc) {
+ IRBuilder<> Builder(Inc);
+ LLVMContext &Context = M.getContext();
+ auto *Int1Ty = Type::getInt1Ty(Context);
+ auto *Int8Ty = Type::getInt8Ty(Context);
+ auto *Int16Ty = Type::getInt16Ty(Context);
+ auto *Int32Ty = Type::getInt32Ty(Context);
+ auto *Int64Ty = Type::getInt64Ty(Context);
+
+ // Constants/configuration
+ const unsigned KSlotBits =
+ OffloadProfilingThreadBitWidth; // must be >= 5 (Wave32)
+ const unsigned KSlots = 1u << KSlotBits;
+ const unsigned KOverflow = KSlots - 1u; // only used in PatternOverflow mode
+ const unsigned KPattern14 = 0x2A3Fu; // only used in PatternOverflow mode
+ const unsigned kWarpBits = 5u; // Wave32 lane width
+
+ if (KSlotBits < kWarpBits)
+ report_fatal_error(
+ "OffloadProfilingThreadBitWidth must be >= 5 for wave32");
+
+ // --- Get thread and block identifiers ---
+ FunctionCallee BlockIdxFn =
+ M.getOrInsertFunction("llvm.amdgcn.workgroup.id.x", Int32Ty);
+ Value *BlockIdx = Builder.CreateCall(BlockIdxFn, {}, "BlockIdxX");
+
+ FunctionCallee ThreadIdxFn =
+ M.getOrInsertFunction("llvm.amdgcn.workitem.id.x", Int32Ty);
+ Value *ThreadIdx = Builder.CreateCall(ThreadIdxFn, {}, "ThreadIdxX");
+
+ // --- Get launch-time data from implicit arguments ---
+ FunctionCallee ImplicitArgFn = M.getOrInsertFunction(
+ "llvm.amdgcn.implicitarg.ptr", PointerType::get(Context, 4));
+ Value *ImplicitArgPtr = Builder.CreateCall(ImplicitArgFn, {});
+
+ // gridDim.x (i32) at base
+ Value *GridDimX = Builder.CreateLoad(Int32Ty, ImplicitArgPtr, "GridDimX");
+
+ // blockDim.x (i16) at offset 12
+ Value *BlockDimXAddr = Builder.CreateInBoundsGEP(
+ Int8Ty, ImplicitArgPtr, ConstantInt::get(Int64Ty, 12), "BlockDimXAddr");
+ Value *BlockDimX = Builder.CreateLoad(Int16Ty, BlockDimXAddr, "BlockDimX");
+
+ // --- Optional: 64-bit gid (not used by slot calc, but useful to keep) ---
+ Value *BlockIdx64 = Builder.CreateZExt(BlockIdx, Int64Ty, "BlockIdxX.zext");
+ Value *ThreadIdx64 =
+ Builder.CreateZExt(ThreadIdx, Int64Ty, "ThreadIdxX.zext");
+ Value *BlockDimX64 = Builder.CreateZExt(BlockDimX, Int64Ty,
"BlockDimX.zext");
+ Value *Gid = Builder.CreateAdd(Builder.CreateMul(BlockIdx64, BlockDimX64),
+ ThreadIdx64, "Gid");
+ (void)Gid;
+
+ // ----------------------------
+ // Common slot computation (Wave32)
+ // ----------------------------
+
+ // lane id via amdgcn.mbcnt.lo (count active lanes below me); lane = & 31
+ auto *MbcntLoTy = FunctionType::get(Int32Ty, {Int32Ty, Int32Ty}, false);
+ FunctionCallee MbcntLoFnByName =
+ M.getOrInsertFunction("llvm.amdgcn.mbcnt.lo", MbcntLoTy);
+ Value *MbcntLo = Builder.CreateCall(
+ MbcntLoFnByName,
+ {ConstantInt::getSigned(Int32Ty, -1), ConstantInt::get(Int32Ty, 0)},
+ "mbcnt.lo");
+ Value *Lane =
+ Builder.CreateAnd(MbcntLo, ConstantInt::get(Int32Ty, 31), "lane");
+
+ // warpLocal = threadIdx.x >> 5
+ Value *WarpLocal = Builder.CreateLShr(
+ ThreadIdx, ConstantInt::get(Int32Ty, kWarpBits), "warpLocal");
+
+ // blockBits = (gridDim.x > 1) ? (32 - ctlz(gridDim.x - 1)) : 1
+ Value *GridGt1 = Builder.CreateICmpUGT(GridDimX, ConstantInt::get(Int32Ty,
1),
+ "grid_gt_1");
+ Value *GridDimXMinus1 = Builder.CreateSub(
+ GridDimX, ConstantInt::get(Int32Ty, 1), "gridDimX_minus_1");
+ FunctionCallee CtlzI32Fn =
+ Intrinsic::getOrInsertDeclaration(&M, Intrinsic::ctlz, {Int32Ty});
+ Value *CtlzVal = Builder.CreateCall(
+ CtlzI32Fn, {GridDimXMinus1, Builder.getFalse()},
"ctlz_gridDimX_minus_1");
+ Value *BlockBitsCandidate = Builder.CreateSub(ConstantInt::get(Int32Ty, 32),
+ CtlzVal, "blockBits_cand");
+ Value *BlockBits = Builder.CreateSelect(
+ GridGt1, BlockBitsCandidate, ConstantInt::get(Int32Ty, 1), "blockBits");
+
+ // usedForHi = min(blockBits, KSlotBits - kWarpBits)
+ Value *SlotHiBits = ConstantInt::get(Int32Ty, (int)(KSlotBits - kWarpBits));
+ Value *BlockLtSlotHi = Builder.CreateICmpULT(BlockBits, SlotHiBits);
+ Value *UsedForHi =
+ Builder.CreateSelect(BlockLtSlotHi, BlockBits, SlotHiBits, "usedForHi");
+
+ // sampBits = blockBits - usedForHi
+ Value *SampBits = Builder.CreateSub(BlockBits, UsedForHi, "sampBits");
+ Value *SampBitsIsZero = Builder.CreateICmpEQ(
+ SampBits, ConstantInt::get(Int32Ty, 0), "sampBits_is_zero");
----------------
jmmartinez wrote:
```suggestion
Value *SampBitsIsZero = Builder.CreateIsNull(
SampBits, "sampBits_is_zero");
```
https://github.com/llvm/llvm-project/pull/177665
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits