Author: Joseph Huber
Date: 2026-03-09T15:50:51-05:00
New Revision: a89bb6291dd9cbc401f291b4b4c15a71f9053eb0
URL: https://github.com/llvm/llvm-project/commit/a89bb6291dd9cbc401f291b4b4c15a71f9053eb0
DIFF: https://github.com/llvm/llvm-project/commit/a89bb6291dd9cbc401f291b4b4c15a71f9053eb0.diff

LOG: [Clang] Update the 'gpuintrin.h' lane scan handling (#185451)

Summary:
This patch uses a more efficient algorithm for the reduction rather than a
divergent branch. We also provide prefix and suffix scan versions; the sum is
now just the first element of the suffix scan. This changes the names to the
following, which is technically breaking, but I don't think these were really
used in practice, and it's a trivial change based on the clang version if it's
really needed.

```
__gpu_prefix_scan_sum_u32(...)
__gpu_suffix_scan_sum_u32(...)
```

Added:


Modified:
    clang/lib/Headers/gpuintrin.h
    clang/test/Headers/gpuintrin.c
    libc/src/__support/GPU/utils.h

Removed:


################################################################################

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4335ad8c83ddd..4f7eea0cf6188 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -201,67 +201,55 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                                    __builtin_bit_cast(uint64_t, __x), __width));
 }
 
-// Gets the accumulator scan of the threads in the warp or wavefront.
-#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
-  _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
-      uint64_t __lane_mask, uint32_t __x) {                                    \
-    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
-    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
-        __lane_mask, __first & (__first + 1));                                 \
-    if (__divergent) {                                                         \
-      __type __accum = 0;                                                      \
-      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) {      \
-        __type __index = __builtin_ctzll(__mask);                              \
-        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
-                                                    __gpu_num_lanes());        \
-        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x;              \
-        __accum += __tmp;                                                      \
-      }                                                                        \
-    } else {                                                                   \
-      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
-        uint32_t __index = __gpu_lane_id() - __step;                           \
-        __bitmask_type bitmask = __gpu_lane_id() >= __step;                    \
-        __x += __builtin_bit_cast(                                             \
-            __type,                                                            \
-            -bitmask & __builtin_bit_cast(__bitmask_type,                      \
-                                          __gpu_shuffle_idx_##__suffix(        \
-                                              __lane_mask, __index, __x,       \
-                                              __gpu_num_lanes())));            \
-      }                                                                        \
+// Implements scan and reduction operations across a GPU warp or wavefront.
+//
+// Both scans work by iterating log2(N) steps. The bitmask tracks the currently
+// unprocessed lanes, above or below the current lane in the case of a suffix or
+// prefix scan. Each iteration we shuffle in the unprocessed neighbors and then
+// clear the bits that this operation handled.
+#define __DO_LANE_OP(__type, __op, __identity, __prefix, __suffix)             \
+  _DEFAULT_FN_ATTRS static __inline__ __type                                   \
+  __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,              \
+                                            __type __x) {                      \
+    uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id());               \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id();    \
+      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
+                                                     __gpu_num_lanes());       \
+      __x = __x __op(__above ? __result : (__type)__identity);                 \
+      for (uint32_t __i = 0; __i < __step; ++__i)                              \
+        __above &= __above - 1;                                                \
     }                                                                          \
     return __x;                                                                \
-  }
-__DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
-__DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
-__DO_LANE_SCAN(float, uint32_t, f32);    // float __gpu_lane_scan_f32(m, x)
-__DO_LANE_SCAN(double, uint64_t, f64);   // double __gpu_lane_scan_f64(m, x)
-#undef __DO_LANE_SCAN
-
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix)                                        \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
-      uint64_t __lane_mask, __type __x) {                                      \
-    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask);            \
-    bool __divergent = __gpu_read_first_lane_##__suffix(                       \
-        __lane_mask, __first & (__first + 1));                                 \
-    if (__divergent) {                                                         \
-      return __gpu_shuffle_idx_##__suffix(                                     \
-          __lane_mask, 63 - __builtin_clzll(__lane_mask),                      \
-          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes());    \
-    } else {                                                                   \
-      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {     \
-        uint32_t __index = __step + __gpu_lane_id();                           \
-        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,         \
-                                            __gpu_num_lanes());                \
-      }                                                                        \
-      return __gpu_read_first_lane_##__suffix(__lane_mask, __x);               \
+  }                                                                            \
+                                                                               \
+  _DEFAULT_FN_ATTRS static __inline__ __type                                   \
+  __gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,              \
+                                            __type __x) {                      \
+    uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1);          \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __src =                                                         \
+          __below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id();          \
+      __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
+                                                     __gpu_num_lanes());       \
+      __x = __x __op(__below ? __result : (__type)__identity);                 \
+      for (uint32_t __i = 0; __i < __step; ++__i)                              \
+        __below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below;      \
     }                                                                          \
+    return __x;                                                                \
+  }                                                                            \
+                                                                               \
+  _DEFAULT_FN_ATTRS static __inline__ __type                                   \
+  __gpu_lane_##__prefix##_##__suffix(uint64_t __lane_mask, __type __x) {       \
+    return __gpu_read_first_lane_##__suffix(                                   \
+        __lane_mask,                                                           \
+        __gpu_suffix_scan_##__prefix##_##__suffix(__lane_mask, __x));          \
   }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
+__DO_LANE_OP(uint32_t, +, 0, sum, u32);
+__DO_LANE_OP(uint64_t, +, 0, sum, u64);
+__DO_LANE_OP(float, +, 0, sum, f32);
+__DO_LANE_OP(double, +, 0, sum, f64);
+#undef __DO_LANE_OP
 
 // Returns a bitmask marking all lanes that have the same value of __x.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t

diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 17c1699ee5c36..04b50acc4a049 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -43,6 +43,7 @@ __gpu_kernel void foo() {
   __gpu_shuffle_idx_u32(-1, -1, -1, 0);
   __gpu_first_lane_id(-1);
   __gpu_is_first_in_lane(-1);
+  __gpu_prefix_scan_sum_u32(~0, 1);
   __gpu_exit();
 }
 // AMDGPU-LABEL: define protected amdgpu_kernel void @foo(
@@ -75,6 +76,7 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:    [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR8]]
 // AMDGPU-NEXT:    [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR8]]
 // AMDGPU-NEXT:    [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR8]]
+// AMDGPU-NEXT:    [[CALL25:%.*]] = call i32 @__gpu_prefix_scan_sum_u32(i64 noundef -1, i32 noundef 1) #[[ATTR8]]
 // AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR9:[0-9]+]]
 // AMDGPU-NEXT:    unreachable
 //
@@ -525,6 +527,113 @@ __gpu_kernel void foo() {
 // AMDGPU-NEXT:    ret i1 [[CMP]]
 //
 //
+// AMDGPU-LABEL: define internal i32 @__gpu_prefix_scan_sum_u32(
+// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__BELOW:%.*]] = alloca i64, align 8, addrspace(5)
+// AMDGPU-NEXT:    [[__STEP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__SRC:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__RESULT:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__I:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
+// AMDGPU-NEXT:    [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
+// AMDGPU-NEXT:    [[__BELOW_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__BELOW]] to ptr
+// AMDGPU-NEXT:    [[__STEP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__STEP]] to ptr
+// AMDGPU-NEXT:    [[__SRC_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__SRC]] to ptr
+// AMDGPU-NEXT:    [[__RESULT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__RESULT]] to ptr
+// AMDGPU-NEXT:    [[__I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__I]] to ptr
+// AMDGPU-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    store i32 [[__X]], ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR8]]
+// AMDGPU-NEXT:    [[SH_PROM:%.*]] = zext i32 [[CALL]] to i64
+// AMDGPU-NEXT:    [[SHL:%.*]] = shl i64 1, [[SH_PROM]]
+// AMDGPU-NEXT:    [[SUB:%.*]] = sub i64 [[SHL]], 1
+// AMDGPU-NEXT:    [[AND:%.*]] = and i64 [[TMP0]], [[SUB]]
+// AMDGPU-NEXT:    store i64 [[AND]], ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    store i32 1, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[FOR_COND:.*]]
+// AMDGPU:       [[FOR_COND]]:
+// AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT:    [[CALL1:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR8]]
+// AMDGPU-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[CALL1]]
+// AMDGPU-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END20:.*]]
+// AMDGPU:       [[FOR_BODY]]:
+// AMDGPU-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TOBOOL:%.*]] = icmp ne i64 [[TMP2]], 0
+// AMDGPU-NEXT:    br i1 [[TOBOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// AMDGPU:       [[COND_TRUE]]:
+// AMDGPU-NEXT:    [[TMP3:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 true)
+// AMDGPU-NEXT:    [[CAST:%.*]] = trunc i64 [[TMP4]] to i32
+// AMDGPU-NEXT:    [[SUB2:%.*]] = sub nsw i32 63, [[CAST]]
+// AMDGPU-NEXT:    br label %[[COND_END:.*]]
+// AMDGPU:       [[COND_FALSE]]:
+// AMDGPU-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_lane_id() #[[ATTR8]]
+// AMDGPU-NEXT:    br label %[[COND_END]]
+// AMDGPU:       [[COND_END]]:
+// AMDGPU-NEXT:    [[COND:%.*]] = phi i32 [ [[SUB2]], %[[COND_TRUE]] ], [ [[CALL3]], %[[COND_FALSE]] ]
+// AMDGPU-NEXT:    store i32 [[COND]], ptr [[__SRC_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP5:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__SRC_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR8]]
+// AMDGPU-NEXT:    [[CALL5:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef [[TMP5]], i32 noundef [[TMP6]], i32 noundef [[TMP7]], i32 noundef [[CALL4]]) #[[ATTR8]]
+// AMDGPU-NEXT:    store i32 [[CALL5]], ptr [[__RESULT_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP9:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TOBOOL6:%.*]] = icmp ne i64 [[TMP9]], 0
+// AMDGPU-NEXT:    br i1 [[TOBOOL6]], label %[[COND_TRUE7:.*]], label %[[COND_FALSE8:.*]]
+// AMDGPU:       [[COND_TRUE7]]:
+// AMDGPU-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__RESULT_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[COND_END9:.*]]
+// AMDGPU:       [[COND_FALSE8]]:
+// AMDGPU-NEXT:    br label %[[COND_END9]]
+// AMDGPU:       [[COND_END9]]:
+// AMDGPU-NEXT:    [[COND10:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE7]] ], [ 0, %[[COND_FALSE8]] ]
+// AMDGPU-NEXT:    [[ADD:%.*]] = add i32 [[TMP8]], [[COND10]]
+// AMDGPU-NEXT:    store i32 [[ADD]], ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    store i32 0, ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[FOR_COND11:.*]]
+// AMDGPU:       [[FOR_COND11]]:
+// AMDGPU-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP12:%.*]] = load i32, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[TMP12]]
+// AMDGPU-NEXT:    br i1 [[CMP12]], label %[[FOR_BODY13:.*]], label %[[FOR_END:.*]]
+// AMDGPU:       [[FOR_BODY13]]:
+// AMDGPU-NEXT:    [[TMP13:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    [[TMP14:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP13]], i1 true)
+// AMDGPU-NEXT:    [[CAST14:%.*]] = trunc i64 [[TMP14]] to i32
+// AMDGPU-NEXT:    [[ISZERO:%.*]] = icmp eq i64 [[TMP13]], 0
+// AMDGPU-NEXT:    [[CLZG:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[CAST14]]
+// AMDGPU-NEXT:    [[SUB15:%.*]] = sub nsw i32 63, [[CLZG]]
+// AMDGPU-NEXT:    [[SH_PROM16:%.*]] = zext i32 [[SUB15]] to i64
+// AMDGPU-NEXT:    [[SHL17:%.*]] = shl i64 1, [[SH_PROM16]]
+// AMDGPU-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    [[AND18:%.*]] = and i64 [[SHL17]], [[TMP15]]
+// AMDGPU-NEXT:    [[TMP16:%.*]] = load i64, ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    [[XOR:%.*]] = xor i64 [[TMP16]], [[AND18]]
+// AMDGPU-NEXT:    store i64 [[XOR]], ptr [[__BELOW_ASCAST]], align 8
+// AMDGPU-NEXT:    br label %[[FOR_INC:.*]]
+// AMDGPU:       [[FOR_INC]]:
+// AMDGPU-NEXT:    [[TMP17:%.*]] = load i32, ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT:    [[INC:%.*]] = add i32 [[TMP17]], 1
+// AMDGPU-NEXT:    store i32 [[INC]], ptr [[__I_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[FOR_COND11]], !llvm.loop [[LOOP5:![0-9]+]]
+// AMDGPU:       [[FOR_END]]:
+// AMDGPU-NEXT:    br label %[[FOR_INC19:.*]]
+// AMDGPU:       [[FOR_INC19]]:
+// AMDGPU-NEXT:    [[TMP18:%.*]] = load i32, ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT:    [[MUL:%.*]] = mul i32 [[TMP18]], 2
+// AMDGPU-NEXT:    store i32 [[MUL]], ptr [[__STEP_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// AMDGPU:       [[FOR_END20]]:
+// AMDGPU-NEXT:    [[TMP19:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    ret i32 [[TMP19]]
+//
+//
 // AMDGPU-LABEL: define internal void @__gpu_exit(
 // AMDGPU-SAME: ) #[[ATTR1:[0-9]+]] {
 // AMDGPU-NEXT:  [[ENTRY:.*:]]
@@ -562,6 +671,7 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT:    [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL25:%.*]] = call i32 @__gpu_prefix_scan_sum_u32(i64 noundef -1, i32 noundef 1) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_exit() #[[ATTR7:[0-9]+]]
 // NVPTX-NEXT:    unreachable
 //
@@ -967,6 +1077,106 @@ __gpu_kernel void foo() {
 // NVPTX-NEXT:    ret i1 [[CMP]]
 //
 //
+// NVPTX-LABEL: define internal i32 @__gpu_prefix_scan_sum_u32(
+// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__X_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__BELOW:%.*]] = alloca i64, align 8
+// NVPTX-NEXT:    [[__STEP:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__SRC:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__RESULT:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__I:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT:    store i32 [[__X]], ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
+// NVPTX-NEXT:    [[SH_PROM:%.*]] = zext i32 [[CALL]] to i64
+// NVPTX-NEXT:    [[SHL:%.*]] = shl i64 1, [[SH_PROM]]
+// NVPTX-NEXT:    [[SUB:%.*]] = sub i64 [[SHL]], 1
+// NVPTX-NEXT:    [[AND:%.*]] = and i64 [[TMP0]], [[SUB]]
+// NVPTX-NEXT:    store i64 [[AND]], ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    store i32 1, ptr [[__STEP]], align 4
+// NVPTX-NEXT:    br label %[[FOR_COND:.*]]
+// NVPTX:       [[FOR_COND]]:
+// NVPTX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__STEP]], align 4
+// NVPTX-NEXT:    [[CALL1:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
+// NVPTX-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[CALL1]]
+// NVPTX-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END20:.*]]
+// NVPTX:       [[FOR_BODY]]:
+// NVPTX-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    [[TOBOOL:%.*]] = icmp ne i64 [[TMP2]], 0
+// NVPTX-NEXT:    br i1 [[TOBOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// NVPTX:       [[COND_TRUE]]:
+// NVPTX-NEXT:    [[TMP3:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 true)
+// NVPTX-NEXT:    [[CAST:%.*]] = trunc i64 [[TMP4]] to i32
+// NVPTX-NEXT:    [[SUB2:%.*]] = sub nsw i32 63, [[CAST]]
+// NVPTX-NEXT:    br label %[[COND_END:.*]]
+// NVPTX:       [[COND_FALSE]]:
+// NVPTX-NEXT:    [[CALL3:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
+// NVPTX-NEXT:    br label %[[COND_END]]
+// NVPTX:       [[COND_END]]:
+// NVPTX-NEXT:    [[COND:%.*]] = phi i32 [ [[SUB2]], %[[COND_TRUE]] ], [ [[CALL3]], %[[COND_FALSE]] ]
+// NVPTX-NEXT:    store i32 [[COND]], ptr [[__SRC]], align 4
+// NVPTX-NEXT:    [[TMP5:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// NVPTX-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__SRC]], align 4
+// NVPTX-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL5:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef [[TMP5]], i32 noundef [[TMP6]], i32 noundef [[TMP7]], i32 noundef [[CALL4]]) #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL5]], ptr [[__RESULT]], align 4
+// NVPTX-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT:    [[TMP9:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    [[TOBOOL6:%.*]] = icmp ne i64 [[TMP9]], 0
+// NVPTX-NEXT:    br i1 [[TOBOOL6]], label %[[COND_TRUE7:.*]], label %[[COND_FALSE8:.*]]
+// NVPTX:       [[COND_TRUE7]]:
+// NVPTX-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__RESULT]], align 4
+// NVPTX-NEXT:    br label %[[COND_END9:.*]]
+// NVPTX:       [[COND_FALSE8]]:
+// NVPTX-NEXT:    br label %[[COND_END9]]
+// NVPTX:       [[COND_END9]]:
+// NVPTX-NEXT:    [[COND10:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE7]] ], [ 0, %[[COND_FALSE8]] ]
+// NVPTX-NEXT:    [[ADD:%.*]] = add i32 [[TMP8]], [[COND10]]
+// NVPTX-NEXT:    store i32 [[ADD]], ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT:    store i32 0, ptr [[__I]], align 4
+// NVPTX-NEXT:    br label %[[FOR_COND11:.*]]
+// NVPTX:       [[FOR_COND11]]:
+// NVPTX-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__I]], align 4
+// NVPTX-NEXT:    [[TMP12:%.*]] = load i32, ptr [[__STEP]], align 4
+// NVPTX-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[TMP12]]
+// NVPTX-NEXT:    br i1 [[CMP12]], label %[[FOR_BODY13:.*]], label %[[FOR_END:.*]]
+// NVPTX:       [[FOR_BODY13]]:
+// NVPTX-NEXT:    [[TMP13:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    [[TMP14:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP13]], i1 true)
+// NVPTX-NEXT:    [[CAST14:%.*]] = trunc i64 [[TMP14]] to i32
+// NVPTX-NEXT:    [[ISZERO:%.*]] = icmp eq i64 [[TMP13]], 0
+// NVPTX-NEXT:    [[CLZG:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[CAST14]]
+// NVPTX-NEXT:    [[SUB15:%.*]] = sub nsw i32 63, [[CLZG]]
+// NVPTX-NEXT:    [[SH_PROM16:%.*]] = zext i32 [[SUB15]] to i64
+// NVPTX-NEXT:    [[SHL17:%.*]] = shl i64 1, [[SH_PROM16]]
+// NVPTX-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    [[AND18:%.*]] = and i64 [[SHL17]], [[TMP15]]
+// NVPTX-NEXT:    [[TMP16:%.*]] = load i64, ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    [[XOR:%.*]] = xor i64 [[TMP16]], [[AND18]]
+// NVPTX-NEXT:    store i64 [[XOR]], ptr [[__BELOW]], align 8
+// NVPTX-NEXT:    br label %[[FOR_INC:.*]]
+// NVPTX:       [[FOR_INC]]:
+// NVPTX-NEXT:    [[TMP17:%.*]] = load i32, ptr [[__I]], align 4
+// NVPTX-NEXT:    [[INC:%.*]] = add i32 [[TMP17]], 1
+// NVPTX-NEXT:    store i32 [[INC]], ptr [[__I]], align 4
+// NVPTX-NEXT:    br label %[[FOR_COND11]], !llvm.loop [[LOOP1:![0-9]+]]
+// NVPTX:       [[FOR_END]]:
+// NVPTX-NEXT:    br label %[[FOR_INC19:.*]]
+// NVPTX:       [[FOR_INC19]]:
+// NVPTX-NEXT:    [[TMP18:%.*]] = load i32, ptr [[__STEP]], align 4
+// NVPTX-NEXT:    [[MUL:%.*]] = mul i32 [[TMP18]], 2
+// NVPTX-NEXT:    store i32 [[MUL]], ptr [[__STEP]], align 4
+// NVPTX-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// NVPTX:       [[FOR_END20]]:
+// NVPTX-NEXT:    [[TMP19:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// NVPTX-NEXT:    ret i32 [[TMP19]]
+//
+//
 // NVPTX-LABEL: define internal void @__gpu_exit(
 // NVPTX-SAME: ) #[[ATTR1:[0-9]+]] {
 // NVPTX-NEXT:  [[ENTRY:.*:]]
@@ -1004,6 +1214,7 @@ __gpu_kernel void foo() {
 // SPIRV-NEXT:    [[CALL22:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0)
 // SPIRV-NEXT:    [[CALL23:%.*]] = call spir_func i64 @__gpu_first_lane_id(i64 noundef -1)
 // SPIRV-NEXT:    [[CALL24:%.*]] = call spir_func zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1)
+// SPIRV-NEXT:    [[CALL25:%.*]] = call spir_func i32 @__gpu_prefix_scan_sum_u32(i64 noundef -1, i32 noundef 1)
 // SPIRV-NEXT:    call spir_func void @__gpu_exit() #[[ATTR7:[0-9]+]]
 // SPIRV-NEXT:    unreachable
 //
@@ -1401,6 +1612,106 @@ __gpu_kernel void foo() {
 // SPIRV-NEXT:    ret i1 [[CMP]]
 //
 //
+// SPIRV-LABEL: define internal spir_func i32 @__gpu_prefix_scan_sum_u32(
+// SPIRV-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT:  [[ENTRY:.*:]]
+// SPIRV-NEXT:    [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    [[__X_ADDR:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__BELOW:%.*]] = alloca i64, align 8
+// SPIRV-NEXT:    [[__STEP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__SRC:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__RESULT:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    [[__I:%.*]] = alloca i32, align 4
+// SPIRV-NEXT:    store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    store i32 [[__X]], ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[CALL:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV-NEXT:    [[SH_PROM:%.*]] = zext i32 [[CALL]] to i64
+// SPIRV-NEXT:    [[SHL:%.*]] = shl i64 1, [[SH_PROM]]
+// SPIRV-NEXT:    [[SUB:%.*]] = sub i64 [[SHL]], 1
+// SPIRV-NEXT:    [[AND:%.*]] = and i64 [[TMP0]], [[SUB]]
+// SPIRV-NEXT:    store i64 [[AND]], ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    store i32 1, ptr [[__STEP]], align 4
+// SPIRV-NEXT:    br label %[[FOR_COND:.*]]
+// SPIRV:       [[FOR_COND]]:
+// SPIRV-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__STEP]], align 4
+// SPIRV-NEXT:    [[CALL1:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRV-NEXT:    [[CMP:%.*]] = icmp ult i32 [[TMP1]], [[CALL1]]
+// SPIRV-NEXT:    br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END20:.*]]
+// SPIRV:       [[FOR_BODY]]:
+// SPIRV-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    [[TOBOOL:%.*]] = icmp ne i64 [[TMP2]], 0
+// SPIRV-NEXT:    br i1 [[TOBOOL]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// SPIRV:       [[COND_TRUE]]:
+// SPIRV-NEXT:    [[TMP3:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    [[TMP4:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP3]], i1 true)
+// SPIRV-NEXT:    [[CAST:%.*]] = trunc i64 [[TMP4]] to i32
+// SPIRV-NEXT:    [[SUB2:%.*]] = sub nsw i32 63, [[CAST]]
+// SPIRV-NEXT:    br label %[[COND_END:.*]]
+// SPIRV:       [[COND_FALSE]]:
+// SPIRV-NEXT:    [[CALL3:%.*]] = call spir_func i32 @__gpu_lane_id()
+// SPIRV-NEXT:    br label %[[COND_END]]
+// SPIRV:       [[COND_END]]:
+// SPIRV-NEXT:    [[COND:%.*]] = phi i32 [ [[SUB2]], %[[COND_TRUE]] ], [ [[CALL3]], %[[COND_FALSE]] ]
+// SPIRV-NEXT:    store i32 [[COND]], ptr [[__SRC]], align 4
+// SPIRV-NEXT:    [[TMP5:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
+// SPIRV-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__SRC]], align 4
+// SPIRV-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    [[CALL4:%.*]] = call spir_func i32 @__gpu_num_lanes()
+// SPIRV-NEXT:    [[CALL5:%.*]] = call spir_func i32 @__gpu_shuffle_idx_u32(i64 noundef [[TMP5]], i32 noundef [[TMP6]], i32 noundef [[TMP7]], i32 noundef [[CALL4]])
+// SPIRV-NEXT:    store i32 [[CALL5]], ptr [[__RESULT]], align 4
+// SPIRV-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    [[TMP9:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    [[TOBOOL6:%.*]] = icmp ne i64 [[TMP9]], 0
+// SPIRV-NEXT:    br i1 [[TOBOOL6]], label %[[COND_TRUE7:.*]], label %[[COND_FALSE8:.*]]
+// SPIRV:       [[COND_TRUE7]]:
+// SPIRV-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__RESULT]], align 4
+// SPIRV-NEXT:    br label %[[COND_END9:.*]]
+// SPIRV:       [[COND_FALSE8]]:
+// SPIRV-NEXT:    br label %[[COND_END9]]
+// SPIRV:       [[COND_END9]]:
+// SPIRV-NEXT:    [[COND10:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE7]] ], [ 0, %[[COND_FALSE8]] ]
+// SPIRV-NEXT:    [[ADD:%.*]] = add i32 [[TMP8]], [[COND10]]
+// SPIRV-NEXT:    store i32 [[ADD]], ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    store i32 0, ptr [[__I]], align 4
+// SPIRV-NEXT:    br label %[[FOR_COND11:.*]]
+// SPIRV:       [[FOR_COND11]]:
+// SPIRV-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__I]], align 4
+// SPIRV-NEXT:    [[TMP12:%.*]] = load i32, ptr [[__STEP]], align 4
+// SPIRV-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[TMP12]]
+// SPIRV-NEXT:    br i1 [[CMP12]], label %[[FOR_BODY13:.*]], label %[[FOR_END:.*]]
+// SPIRV:       [[FOR_BODY13]]:
+// SPIRV-NEXT:    [[TMP13:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    [[TMP14:%.*]] = call i64 @llvm.ctlz.i64(i64 [[TMP13]], i1 true)
+// SPIRV-NEXT:    [[CAST14:%.*]] = trunc i64 [[TMP14]] to i32
+// SPIRV-NEXT:    [[ISZERO:%.*]] = icmp eq i64 [[TMP13]], 0
+// SPIRV-NEXT:    [[CLZG:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[CAST14]]
+// SPIRV-NEXT:    [[SUB15:%.*]] = sub nsw i32 63, [[CLZG]]
+// SPIRV-NEXT:    [[SH_PROM16:%.*]] = zext i32 [[SUB15]] to i64
+// SPIRV-NEXT:    [[SHL17:%.*]] = shl i64 1, [[SH_PROM16]]
+// SPIRV-NEXT:    [[TMP15:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    [[AND18:%.*]] = and i64 [[SHL17]], [[TMP15]]
+// SPIRV-NEXT:    [[TMP16:%.*]] = load i64, ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    [[XOR:%.*]] = xor i64 [[TMP16]], [[AND18]]
+// SPIRV-NEXT:    store i64 [[XOR]], ptr [[__BELOW]], align 8
+// SPIRV-NEXT:    br label %[[FOR_INC:.*]]
+// SPIRV:       [[FOR_INC]]:
+// SPIRV-NEXT:    [[TMP17:%.*]] = load i32, ptr [[__I]], align 4
+// SPIRV-NEXT:    [[INC:%.*]] = add i32 [[TMP17]], 1
+// SPIRV-NEXT:    store i32 [[INC]], ptr [[__I]], align 4
+// SPIRV-NEXT:    br label %[[FOR_COND11]], !llvm.loop [[LOOP1:![0-9]+]]
+// SPIRV:       [[FOR_END]]:
+// SPIRV-NEXT:    br label %[[FOR_INC19:.*]]
+// SPIRV:       [[FOR_INC19]]:
+// SPIRV-NEXT:    [[TMP18:%.*]] = load i32, ptr [[__STEP]], align 4
+// SPIRV-NEXT:    [[MUL:%.*]] = mul i32 [[TMP18]], 2
+// SPIRV-NEXT:    store i32 [[MUL]], ptr [[__STEP]], align 4
+// SPIRV-NEXT:    br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// SPIRV:       [[FOR_END20]]:
+// SPIRV-NEXT:    [[TMP19:%.*]] = load i32, ptr [[__X_ADDR]], align 4
+// SPIRV-NEXT:    ret i32 [[TMP19]]
+//
+//
 // SPIRV-LABEL: define internal spir_func void @__gpu_exit(
 // SPIRV-SAME: ) #[[ATTR1:[0-9]+]] {
 // SPIRV-NEXT:  [[ENTRY:.*:]]
@@ -1411,4 +1722,15 @@ __gpu_kernel void foo() {
 // AMDGPU: [[RNG2]] = !{i32 1, i32 0}
 // AMDGPU: [[META3]] = !{}
 // AMDGPU: [[RNG4]] = !{i16 1, i16 1025}
+// AMDGPU: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]]}
+// AMDGPU: [[META6]] = !{!"llvm.loop.mustprogress"}
+// AMDGPU: [[LOOP7]] = distinct !{[[LOOP7]], [[META6]]}
+//.
+// NVPTX: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
+// NVPTX: [[META2]] = !{!"llvm.loop.mustprogress"}
+// NVPTX: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]]}
+//.
+// SPIRV: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]]}
+// SPIRV: [[META2]] = !{!"llvm.loop.mustprogress"}
+// SPIRV: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]]}
 //.

diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 1b3e6edfc4e0d..1916f57959037 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -123,7 +123,7 @@ LIBC_INLINE uint32_t reduce(uint64_t lane_mask, uint32_t x) {
 }
 
 LIBC_INLINE uint32_t scan(uint64_t lane_mask, uint32_t x) {
-  return __gpu_lane_scan_u32(lane_mask, x);
+  return __gpu_prefix_scan_sum_u32(lane_mask, x);
 }
 
 LIBC_INLINE uint64_t fixed_frequency_clock() {
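For reference, a minimal sketch of how the renamed entry points compose; it is
not part of the commit. The kernel name and output indexing are hypothetical,
and it assumes every lane of the warp or wavefront is active, so the full
`~0ull` mask is valid, mirroring the `~0` used in the updated test above.

```
#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical example: with N active lanes each contributing 1, the
// inclusive prefix scan leaves lane i holding i + 1, the inclusive suffix
// scan leaves lane i holding N - i, and the reduction is N on every lane.
__gpu_kernel void scan_example(uint32_t *out) {
  uint64_t mask = ~0ull; // Assumes all lanes in the warp/wavefront are active.
  uint32_t x = 1;        // Each lane's contribution.

  uint32_t prefix = __gpu_prefix_scan_sum_u32(mask, x); // sum of lanes 0..i
  uint32_t suffix = __gpu_suffix_scan_sum_u32(mask, x); // sum of lanes i..N-1
  uint32_t total = __gpu_lane_sum_u32(mask, x);         // sum over all lanes

  out[__gpu_lane_id()] = prefix + suffix + total;
}
```

The reduction falls out of the suffix scan because the lowest active lane ends
up holding the sum over every lane at or above it, i.e. the whole mask, so
`__gpu_lane_sum_u32` only has to broadcast that lane's value with
`__gpu_read_first_lane_u32`, as the new `__DO_LANE_OP` macro does.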
