https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/185302

>From 070cc45568625508d85b61c0e0277aefa3121796 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Sun, 8 Mar 2026 11:14:18 -0500
Subject: [PATCH 1/2] [AMDGPU] Add clang builtin for generic AMDGPU shuffle

Summary:
AMDGPU introduced a high level intrinsic for shuffles. The main
advantage of this over the ds_bpermute path is that it is correctly
lowered for w32 / w64 and doesn't require the four byte offset. This PR
adds '__builtin_amdgcn_wave_shuffle' to access it.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.td   | 1 +
 clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp | 3 ++-
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   | 3 +++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   | 7 +++++++
 4 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index acd0a34a79253..285533a6b8fb8 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -213,6 +213,7 @@ def __builtin_amdgcn_ds_permute : AMDGPUBuiltin<"int(int, int)", [Const]>;
 def __builtin_amdgcn_ds_bpermute : AMDGPUBuiltin<"int(int, int)", [Const]>;
 def __builtin_amdgcn_readfirstlane : AMDGPUBuiltin<"int(int)", [Const]>;
 def __builtin_amdgcn_readlane : AMDGPUBuiltin<"int(int, int)", [Const]>;
+def __builtin_amdgcn_wave_shuffle : AMDGPUBuiltin<"int(int, int)", [Const]>;
 def __builtin_amdgcn_fmed3f : AMDGPUBuiltin<"float(float, float, float)", [Const]>;
 def __builtin_amdgcn_ds_faddf : AMDGPUBuiltin<"float(float address_space<3> *, float, _Constant int, _Constant int, _Constant bool)">;
 def __builtin_amdgcn_ds_fminf : AMDGPUBuiltin<"float(float address_space<3> *, float, _Constant int, _Constant int, _Constant bool)">;
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp
index b4b0c455904fc..ffbfe669510a8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp
@@ -78,7 +78,8 @@ CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId,
     return mlir::Value{};
   }
   case AMDGPU::BI__builtin_amdgcn_readlane:
-  case AMDGPU::BI__builtin_amdgcn_readfirstlane: {
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+  case AMDGPU::BI__builtin_amdgcn_wave_shuffle: {
     cgm.errorNYI(expr->getSourceRange(),
                  std::string("unimplemented AMDGPU builtin call: ") +
                      getContext().BuiltinInfo.getName(builtinId));
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 72d5cb8040119..f4eaece58faa7 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -554,6 +554,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_readlane:
     return emitBuiltinWithOneOverloadedType<2>(*this, E,
                                                Intrinsic::amdgcn_readlane);
+  case AMDGPU::BI__builtin_amdgcn_wave_shuffle:
+    return emitBuiltinWithOneOverloadedType<2>(*this, E,
+                                               Intrinsic::amdgcn_wave_shuffle);
   case AMDGPU::BI__builtin_amdgcn_readfirstlane:
     return emitBuiltinWithOneOverloadedType<1>(*this, E,
                                                Intrinsic::amdgcn_readfirstlane);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 376105cb6594c..ea29657aaf623 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -314,6 +314,13 @@ void test_readlane(global int* out, int a, int b)
   *out = __builtin_amdgcn_readlane(a, b);
 }

+// CHECK-LABEL: @test_wave_shuffle
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.shuffle.i32(i32 %a, i32 %b)
+void test_wave_shuffle(global int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_wave_shuffle(a, b);
+}
+
 // CHECK-LABEL: @test_fcmp_f32
 // CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.fcmp.i64.f32(float %a, float %b, i32 5)
 void test_fcmp_f32(global ulong* out, float a, float b)

>From bbae291c3ae69b615df13c79a3dc64089acf59b2 Mon Sep 17 00:00:00 2001
From: Joseph Huber <[email protected]>
Date: Sun, 8 Mar 2026 13:04:55 -0500
Subject: [PATCH 2/2] docs

---
 clang/include/clang/Basic/BuiltinsAMDGPU.td   |  2 +-
 .../include/clang/Basic/BuiltinsAMDGPUDocs.td | 31 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 285533a6b8fb8..6a40e2eac7617 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -213,7 +213,7 @@ def __builtin_amdgcn_ds_permute : AMDGPUBuiltin<"int(int, int)", [Const]>;
 def __builtin_amdgcn_ds_bpermute : AMDGPUBuiltin<"int(int, int)", [Const]>;
 def __builtin_amdgcn_readfirstlane : AMDGPUBuiltin<"int(int)", [Const]>;
 def __builtin_amdgcn_readlane : AMDGPUBuiltin<"int(int, int)", [Const]>;
-def __builtin_amdgcn_wave_shuffle : AMDGPUBuiltin<"int(int, int)", [Const]>;
+def __builtin_amdgcn_wave_shuffle : AMDGPUBuiltin<"int(int, int)", [Const]> { let Documentation = [DocWaveShuffle]; }
 def __builtin_amdgcn_fmed3f : AMDGPUBuiltin<"float(float, float, float)", [Const]>;
 def __builtin_amdgcn_ds_faddf : AMDGPUBuiltin<"float(float address_space<3> *, float, _Constant int, _Constant int, _Constant bool)">;
 def __builtin_amdgcn_ds_fminf : AMDGPUBuiltin<"float(float address_space<3> *, float, _Constant int, _Constant int, _Constant bool)">;
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
index 683c41a414c4e..42dd043747e2f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
@@ -573,6 +573,37 @@ WMMA with per-operand scale factors applied during the computation.
 }];
 }

+//===----------------------------------------------------------------------===//
+// Wave Data Exchange Builtins
+//===----------------------------------------------------------------------===//
+
+def DocCatWaveDataExchange : DocumentationCategory<"Wave Data Exchange Builtins"> {
+  let Content = [{
+These builtins provide cross-lane data exchange within a wavefront.
+}];
+}
+
+def DocWaveShuffle : Documentation {
+  let Category = DocCatWaveDataExchange;
+  let Content = [{
+Returns the value of ``src`` held by the lane identified by ``idx``. This is
+a high-level wave shuffle operation that is correctly lowered for both wave32
+and wave64 without requiring the caller to compute byte offsets.
+
+.. code-block:: c
+
+  int __builtin_amdgcn_wave_shuffle(int src, int idx);
+
+``src``
+  The 32-bit value from the current lane to make available for shuffling.
+
+``idx``
+  The lane index to read from. If the index is outside the valid range
+  ``[0, wavefront_size)``, it wraps around modulo the wavefront size.
+  Reading from an inactive lane returns a nondeterministic value.
+}];
+}
+
 def DocWMMA_scale16_GFX1250 : Documentation {
   let Category = DocCatWMMA;
   let Content = [{

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
