llvmbot wrote:
@llvm/pr-subscribers-libc

Author: None (llvmbot)

Changes:

Backport 2d8106cb5a505326d1da0f4461708ed44a0ac761 718cdeb9c701725412a040b2b7148523a286a256

Requested by: @jhuber6

---

Full diff: https://github.com/llvm/llvm-project/pull/125912.diff

7 Files Affected:

- (modified) clang/lib/Headers/amdgpuintrin.h (+9-5)
- (modified) clang/lib/Headers/gpuintrin.h (+14-10)
- (modified) clang/lib/Headers/nvptxintrin.h (+8-7)
- (modified) clang/test/Headers/gpuintrin.c (+65-3)
- (modified) libc/src/__support/GPU/utils.h (+3-2)
- (modified) libc/test/integration/src/__support/GPU/CMakeLists.txt (+9)
- (added) libc/test/integration/src/__support/GPU/shuffle.cpp (+33)

``````````diff
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 038605605462f8..9dad99ffe9439a 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -145,17 +145,21 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
-  return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
+  uint32_t __lane = __idx + (__gpu_lane_id() & ~(__width - 1));
+  return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
 }
 
 // Shuffles the the lanes inside the wavefront according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
-  return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
-         ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
+  return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
+          << 32ull) |
+         ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
 }
 
 // Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 4c463c333308fc..11c87e85cd4975 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -133,18 +133,21 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
 
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ float
-__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
       float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
-                                   __builtin_bit_cast(uint32_t, __x)));
+                                   __builtin_bit_cast(uint32_t, __x), __width));
 }
 
 // Shuffles the the lanes according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ double
-__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
+                      uint32_t __width) {
   return __builtin_bit_cast(
-      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
-                                    __builtin_bit_cast(uint64_t, __x)));
+      double,
+      __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                            __builtin_bit_cast(uint64_t, __x), __width));
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
@@ -153,7 +156,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
       uint64_t __lane_mask, __type __x) {                                     \
     for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {  \
       uint32_t __index = __step + __gpu_lane_id();                            \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);         \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x,          \
+                                          __gpu_num_lanes());                 \
     }                                                                         \
     return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                \
   }
@@ -171,10 +175,10 @@ __DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
       uint32_t __index = __gpu_lane_id() - __step;                            \
       __bitmask_type bitmask = __gpu_lane_id() >= __step;                     \
       __x += __builtin_bit_cast(                                              \
-          __type,                                                             \
-          -bitmask & __builtin_bit_cast(__bitmask_type,                       \
-                                        __gpu_shuffle_idx_##__suffix(         \
-                                            __lane_mask, __index, __x)));     \
+          __type, -bitmask & __builtin_bit_cast(__bitmask_type,               \
+                                                __gpu_shuffle_idx_##__suffix( \
+                                                    __lane_mask, __index, __x, \
+                                                    __gpu_num_lanes())));     \
     }                                                                         \
     return __x;                                                               \
   }
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index fb2864eab6a09d..40fa2edebe975c 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -149,22 +149,23 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
+__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
+                      uint32_t __width) {
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx, __gpu_num_lanes() - 1u);
+  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+                                  ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }
 
 // Shuffles the the lanes inside the warp according to the given index.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
+__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
+                      uint32_t __width) {
   uint32_t __hi = (uint32_t)(__x >> 32ull);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
   uint32_t __mask = (uint32_t)__lane_mask;
-  return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __idx,
-                                             __gpu_num_lanes() - 1u)
+  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
           << 32ull) |
-         ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __idx,
-                                             __gpu_num_lanes() - 1u));
+         ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
 }
 
 // Returns true if the flat pointer points to CUDA 'shared' memory.
diff --git a/clang/test/Headers/gpuintrin.c b/clang/test/Headers/gpuintrin.c
index 281339716c3edf..89efe12ee8def8 100644
--- a/clang/test/Headers/gpuintrin.c
+++ b/clang/test/Headers/gpuintrin.c
@@ -38,7 +38,7 @@
 // AMDGPU-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_threads() #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
-// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR7]]
+// AMDGPU-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
 // AMDGPU-NEXT:    call void @__gpu_exit() #[[ATTR8:[0-9]+]]
@@ -70,7 +70,7 @@
 // NVPTX-NEXT:    [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_threads() #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
-// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1) #[[ATTR6]]
+// NVPTX-NEXT:    [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
 // NVPTX-NEXT:    call void @__gpu_exit() #[[ATTR7:[0-9]+]]
@@ -90,6 +90,68 @@ __gpu_kernel void foo() {
   __gpu_num_threads_z();
   __gpu_num_threads(0);
   __gpu_thread_id_x();
+// AMDGPU-LABEL: define internal i32 @__gpu_thread_id(
+// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// AMDGPU-NEXT:  [[ENTRY:.*:]]
+// AMDGPU-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGPU-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGPU-NEXT:    [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
+// AMDGPU-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
+// AMDGPU-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// AMDGPU-NEXT:      i32 0, label %[[SW_BB:.*]]
+// AMDGPU-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// AMDGPU-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// AMDGPU-NEXT:    ]
+// AMDGPU:       [[SW_BB]]:
+// AMDGPU-NEXT:    [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN:.*]]
+// AMDGPU:       [[SW_BB1]]:
+// AMDGPU-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN]]
+// AMDGPU:       [[SW_BB3]]:
+// AMDGPU-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
+// AMDGPU-NEXT:    store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    br label %[[RETURN]]
+// AMDGPU:       [[SW_DEFAULT]]:
+// AMDGPU-NEXT:    unreachable
+// AMDGPU:       [[RETURN]]:
+// AMDGPU-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
+// AMDGPU-NEXT:    ret i32 [[TMP1]]
+//
+// NVPTX-LABEL: define internal i32 @__gpu_thread_id(
+// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
+// NVPTX-NEXT:  [[ENTRY:.*:]]
+// NVPTX-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    [[__DIM_ADDR:%.*]] = alloca i32, align 4
+// NVPTX-NEXT:    store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
+// NVPTX-NEXT:    switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+// NVPTX-NEXT:      i32 0, label %[[SW_BB:.*]]
+// NVPTX-NEXT:      i32 1, label %[[SW_BB1:.*]]
+// NVPTX-NEXT:      i32 2, label %[[SW_BB3:.*]]
+// NVPTX-NEXT:    ]
+// NVPTX:       [[SW_BB]]:
+// NVPTX-NEXT:    [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN:.*]]
+// NVPTX:       [[SW_BB1]]:
+// NVPTX-NEXT:    [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL2]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN]]
+// NVPTX:       [[SW_BB3]]:
+// NVPTX-NEXT:    [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
+// NVPTX-NEXT:    store i32 [[CALL4]], ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    br label %[[RETURN]]
+// NVPTX:       [[SW_DEFAULT]]:
+// NVPTX-NEXT:    unreachable
+// NVPTX:       [[RETURN]]:
+// NVPTX-NEXT:    [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
+// NVPTX-NEXT:    ret i32 [[TMP1]]
+//
   __gpu_thread_id_y();
   __gpu_thread_id_z();
   __gpu_thread_id(0);
@@ -100,7 +162,7 @@ __gpu_kernel void foo() {
   __gpu_ballot(-1, 1);
   __gpu_sync_threads();
   __gpu_sync_lane(-1);
-  __gpu_shuffle_idx_u32(-1, -1, -1);
+  __gpu_shuffle_idx_u32(-1, -1, -1, 0);
   __gpu_first_lane_id(-1);
   __gpu_is_first_in_lane(-1);
   __gpu_exit();
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index e138c84c0cb22d..323c003f1ff074 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -87,8 +87,9 @@ LIBC_INLINE void sync_threads() { __gpu_sync_threads(); }
 
 LIBC_INLINE void sync_lane(uint64_t lane_mask) { __gpu_sync_lane(lane_mask); }
 
-LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x) {
-  return __gpu_shuffle_idx_u32(lane_mask, idx, x);
+LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
+                             uint32_t width = __gpu_num_lanes()) {
+  return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
 }
 
 [[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 7811e0da45ddcf..68bbc3849bc7ec 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -9,3 +9,12 @@ add_integration_test(
   LOADER_ARGS
     --threads 64
 )
+
+add_integration_test(
+  shuffle_test
+  SUITE libc-support-gpu-tests
+  SRCS
+    shuffle.cpp
+  LOADER_ARGS
+    --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/shuffle.cpp b/libc/test/integration/src/__support/GPU/shuffle.cpp
new file mode 100644
index 00000000000000..c346a2eb3f0c29
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/shuffle.cpp
@@ -0,0 +1,33 @@
+//===-- Test for the shuffle operations on the GPU ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to make sure the shuffle instruction works by doing a simple broadcast.
+// Each iteration reduces the width, so it will broadcast to a subset we check.
+static void test_shuffle() {
+  uint64_t mask = gpu::get_lane_mask();
+  EXPECT_EQ(cpp::popcount(mask), gpu::get_lane_size());
+
+  uint32_t x = gpu::get_lane_id();
+  for (uint32_t width = gpu::get_lane_size(); width > 0; width /= 2)
+    EXPECT_EQ(gpu::shuffle(mask, 0, x, width), (x / width) * width);
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
+  test_shuffle();
+
+  return 0;
+}
``````````

https://github.com/llvm/llvm-project/pull/125912
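For context on what the patch changes: the new trailing `__width` argument partitions the warp or wavefront into `__width`-sized segments and performs the shuffle within each segment, with `__idx` taken relative to the lane's own segment (on AMDGPU this is the `__idx + (__gpu_lane_id() & ~(__width - 1))` computation). A minimal sketch of the resulting behavior, not part of the patch — the kernel name, the `out` buffer, and the fixed segment size of 8 are illustrative assumptions, and a single warp/wavefront is assumed for brevity:

```c
#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical kernel: broadcast the first lane of every 8-lane segment.
// With shuffle index 0 and width 8, each lane reads the value held by lane
// (lane_id / 8) * 8, which is exactly what the new shuffle.cpp test asserts
// via gpu::shuffle(mask, 0, x, width) == (x / width) * width.
__gpu_kernel void broadcast_segments(uint32_t *out) {
  uint64_t mask = __gpu_lane_mask(); // lanes currently executing
  uint32_t x = __gpu_lane_id();      // per-lane value to shuffle
  uint32_t width = 8;                // illustrative; must divide the lane count
  out[__gpu_lane_id()] = __gpu_shuffle_idx_u32(mask, 0, x, width);
}
```

Passing `width == __gpu_num_lanes()` reproduces the old whole-warp behavior, which is why the libc `gpu::shuffle` wrapper defaults its new parameter to `__gpu_num_lanes()` and existing callers compile unchanged.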
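On the NVPTX side, replacing `__gpu_num_lanes() - 1u` with `((__gpu_num_lanes() - __width) << 8u) | 0x1f` packs the control operand of `shfl.sync.idx`: the low five bits are the source-lane clamp, and the upper byte is the segment mask that keeps the shuffle inside a `__width`-sized segment. This appears to be the same encoding CUDA's `__shfl_sync` uses for its `width` parameter. Restated as a standalone helper for clarity (the function name is an illustrative assumption, not part of the patch):

```c
#include <stdint.h>

// Builds the control word consumed by __nvvm_shfl_sync_idx_i32 (shfl.sync.idx):
// bits [4:0] clamp the source lane, the upper byte holds the segment mask.
// For width == num_lanes this degenerates to plain 0x1f, i.e. the whole warp.
static inline uint32_t shfl_idx_control(uint32_t num_lanes, uint32_t width) {
  return ((num_lanes - width) << 8u) | 0x1f;
}
```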