================
@@ -61,6 +65,78 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
return (logical_lane_id == 0);
}
+// Reduction within a block on the GPU.
+//
+// Template parameters:
+// - checkLiveness: Whether to check the liveness of the lanes. This is only
+// useful if gpu_block_reduce is called in a context where
+// L2 parallel regions are possible.
+// Parameters:
+// - reduce_data: Pointer to the reduction data
+// - shflFct: Shuffle reduction function
+// - cpyFct: Inter-warp copy function (copies data from each warp's thread
+// 0 to the lanes of the zeroth warp)
+// - NumValues: Number of values to reduce / threads to consider
+// - BlockThreadId: Thread ID in block (getThreadIdInBlock() in SPMD and 0
+// in Generic mode)
+//
+// Returns:
+// - 1 if the thread is the zeroth thread of the block
+// - 0 otherwise
+template <bool checkLiveness = true>
+[[clang::always_inline]]
+static uint32_t gpu_block_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
+ InterWarpCopyFnTy cpyFct, uint32_t NumValues,
+ uint32_t BlockThreadId) {
+ if (NumValues <= 1)
+ return BlockThreadId == 0;
+
+ uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
+ uint32_t WarpOffset = WarpId * mapping::getWarpSize();
+ // Calculate how many values this warp has to deal with. Cap WarpId *
+ // mapping::getWarpSize() at NumValues to avoid underflow.
+ uint32_t ActiveLanes =
+ WarpOffset < NumValues
+ ? kmpc_min(NumValues - WarpOffset, mapping::getWarpSize())
+ : 0;
----------------
ro-i wrote:
This is not an issue anymore, since I now properly differentiate the two
situations of parallel_reduce and team_reduce at the constexpr level.
https://github.com/llvm/llvm-project/pull/195102
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits