https://github.com/skc7 created https://github.com/llvm/llvm-project/pull/190137
Upstreaming clangIR PR: https://github.com/llvm/clangir/pull/2047 This PR adds support for lowering of _builtin_amdgcn_wave_reduce* amdgpu builtins to clangIR. Followed similar lowering from reference clang->llvmir in clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp. builtins-amdgc.hip is added to test cir and llvm lowering for hip. >From 2a30e72eebac8413e85f7e992a4ccb2e6eb613a2 Mon Sep 17 00:00:00 2001 From: skc7 <[email protected]> Date: Wed, 1 Apr 2026 16:18:39 +0530 Subject: [PATCH] [CIR][AMDGPU] Add amdgpu wave reduce builtins codegen --- clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp | 46 ++++- clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip | 180 ++++++++++++++++++ 2 files changed, 222 insertions(+), 4 deletions(-) create mode 100644 clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp index b4b0c455904fc..0f6fc8949b0e8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp @@ -19,6 +19,40 @@ using namespace clang; using namespace clang::CIRGen; +static llvm::StringRef getIntrinsicNameforWaveReduction(unsigned BuiltinID) { + switch (BuiltinID) { + default: + llvm_unreachable("Unknown BuiltinID for wave reduction"); + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64: + return "amdgcn.wave.reduce.add"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64: + return "amdgcn.wave.reduce.sub"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: + return "amdgcn.wave.reduce.min"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: + return "amdgcn.wave.reduce.umin"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: + return "amdgcn.wave.reduce.max"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: + return "amdgcn.wave.reduce.umax"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64: + return "amdgcn.wave.reduce.and"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64: + return "amdgcn.wave.reduce.or"; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: + return "amdgcn.wave.reduce.xor"; + } +} + std::optional<mlir::Value> CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId, const CallExpr *expr) { @@ -41,10 +75,14 @@ CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId, case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64: case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64: case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: { - cgm.errorNYI(expr->getSourceRange(), - std::string("unimplemented AMDGPU builtin call: ") + - getContext().BuiltinInfo.getName(builtinId)); - return mlir::Value{}; + llvm::StringRef intrinsicName = getIntrinsicNameforWaveReduction(builtinId); + mlir::Value Value = emitScalarExpr(expr->getArg(0)); + mlir::Value Strategy = emitScalarExpr(expr->getArg(1)); + return cir::LLVMIntrinsicCallOp::create( + builder, getLoc(expr->getExprLoc()), + builder.getStringAttr(intrinsicName), Value.getType(), + {Value, Strategy}) + .getResult(); } case AMDGPU::BI__builtin_amdgcn_div_scale: case AMDGPU::BI__builtin_amdgcn_div_scalef: { diff --git a/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip b/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip new file mode 100644 index 0000000000000..5b178274d5fbd --- /dev/null +++ b/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip @@ -0,0 +1,180 @@ +#include "../CodeGenCUDA/Inputs/cuda.h" + +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -std=c++11 -fclangir \ +// RUN: -target-cpu tahiti -fcuda-is-device -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -std=c++11 -fclangir \ +// RUN: -target-cpu tahiti -fcuda-is-device -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -std=c++11 \ +// RUN: -target-cpu tahiti -fcuda-is-device -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s + +//===----------------------------------------------------------------------===// +// Test AMDGPU builtins +//===----------------------------------------------------------------------===// + +// CIR-LABEL: @_Z28test_wave_reduce_add_u32_i32Pi +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, !s32i) -> !u32i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_add_u32_i32Pii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_add_u32_i32(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 0); +} + +// CIR-LABEL: @_Z28test_wave_reduce_add_u64_i64Pl +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u64i, !s32i) -> !u64i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_add_u64_i64Pll( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_add_u64_i64(long* out, long in) { + *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0); +} + +// CIR-LABEL: @_Z28test_wave_reduce_sub_u32_i32Pi +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.sub" {{.*}} : (!u32i, !s32i) -> !u32i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_sub_u32_i32Pii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_sub_u32_i32(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 0); +} + +// CIR-LABEL: @_Z28test_wave_reduce_sub_u64_i64Pl +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.sub" {{.*}} : (!u64i, !s32i) -> !u64i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_sub_u64_i64Pll( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_sub_u64_i64(long* out, long in) { + *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0); +} + +// CIR-LABEL: @_Z29test_wave_reduce_min_i32_signPii +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.min" {{.*}} : (!s32i, !s32i) -> !s32i +// LLVM: define{{.*}} void @_Z29test_wave_reduce_min_i32_signPii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_min_i32_sign(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0); +} + +// CIR-LABEL: @_Z31test_wave_reduce_min_u32_unsignPjj +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umin" {{.*}} : (!u32i, !s32i) -> !u32i +// LLVM: define{{.*}} void @_Z31test_wave_reduce_min_u32_unsignPjj( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_min_u32_unsign(unsigned int* out, unsigned int in) { + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0); +} + +// CIR-LABEL: @_Z29test_wave_reduce_min_i64_signPll +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.min" {{.*}} : (!s64i, !s32i) -> !s64i +// LLVM: define{{.*}} void @_Z29test_wave_reduce_min_i64_signPll( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_min_i64_sign(long* out, long in) { + *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0); +} + +// CIR-LABEL: @_Z31test_wave_reduce_min_u64_unsignPmm +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umin" {{.*}} : (!u64i, !s32i) -> !u64i +// LLVM: define{{.*}} void @_Z31test_wave_reduce_min_u64_unsignPmm( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_min_u64_unsign(unsigned long* out, unsigned long in) { + *out = __builtin_amdgcn_wave_reduce_min_u64(in, 0); +} + +// CIR-LABEL: @_Z29test_wave_reduce_max_i32_signPii +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.max" {{.*}} : (!s32i, !s32i) -> !s32i +// LLVM: define{{.*}} void @_Z29test_wave_reduce_max_i32_signPii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_max_i32_sign(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0); +} + +// CIR-LABEL: @_Z31test_wave_reduce_max_u32_unsignPjj +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umax" {{.*}} : (!u32i, !s32i) -> !u32i +// LLVM: define{{.*}} void @_Z31test_wave_reduce_max_u32_unsignPjj( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_max_u32_unsign(unsigned int* out, unsigned int in) { + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0); +} + +// CIR-LABEL: @_Z29test_wave_reduce_max_i64_signPll +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.max" {{.*}} : (!s64i, !s32i) -> !s64i +// LLVM: define{{.*}} void @_Z29test_wave_reduce_max_i64_signPll( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_max_i64_sign(long* out, long in) { + *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0); +} + +// CIR-LABEL: @_Z31test_wave_reduce_max_u64_unsignPmm +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umax" {{.*}} : (!u64i, !s32i) -> !u64i +// LLVM: define{{.*}} void @_Z31test_wave_reduce_max_u64_unsignPmm( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_max_u64_unsign(unsigned long* out, unsigned long in) { + *out = __builtin_amdgcn_wave_reduce_max_u64(in, 0); +} + +// CIR-LABEL: @_Z28test_wave_reduce_and_b32_i32Pii +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.and" {{.*}} : (!s32i, !s32i) -> !s32i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_and_b32_i32Pii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_and_b32_i32(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0); +} + +// CIR-LABEL: @_Z28test_wave_reduce_and_b64_i64Pll +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.and" {{.*}} : (!s64i, !s32i) -> !s64i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_and_b64_i64Pll( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_and_b64_i64(long* out, long in) { + *out = __builtin_amdgcn_wave_reduce_and_b64(in, 0); +} + +// CIR-LABEL: @_Z27test_wave_reduce_or_b32_i32Pii +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.or" {{.*}} : (!s32i, !s32i) -> !s32i +// LLVM: define{{.*}} void @_Z27test_wave_reduce_or_b32_i32Pii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_or_b32_i32(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0); +} + +// CIR-LABEL: @_Z27test_wave_reduce_or_b64_i64Pll +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.or" {{.*}} : (!s64i, !s32i) -> !s64i +// LLVM: define{{.*}} void @_Z27test_wave_reduce_or_b64_i64Pll( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_or_b64_i64(long* out, long in) { + *out = __builtin_amdgcn_wave_reduce_or_b64(in, 0); +} + +// CIR-LABEL: @_Z28test_wave_reduce_xor_b32_i32Pii +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.xor" {{.*}} : (!s32i, !s32i) -> !s32i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_xor_b32_i32Pii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %{{.*}}, i32 0) +__device__ void test_wave_reduce_xor_b32_i32(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0); +} + +// CIR-LABEL: @_Z28test_wave_reduce_xor_b64_i64Pll +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.xor" {{.*}} : (!s64i, !s32i) -> !s64i +// LLVM: define{{.*}} void @_Z28test_wave_reduce_xor_b64_i64Pll( +// LLVM: call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %{{.*}}, i32 0) +__device__ void test_wave_reduce_xor_b64_i64(long* out, long in) { + *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 0); +} + +// CIR-LABEL: @_Z38test_wave_reduce_add_u32_iterative_i32Pii +// CIR: cir.const #cir.int<1> : !s32i +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, !s32i) -> !u32i +// LLVM: define{{.*}} void @_Z38test_wave_reduce_add_u32_iterative_i32Pii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 1) +__device__ void test_wave_reduce_add_u32_iterative_i32(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 1); +} + +// CIR-LABEL: @_Z32test_wave_reduce_add_u32_dpp_i32Pii +// CIR: cir.const #cir.int<2> : !s32i +// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, !s32i) -> !u32i +// LLVM: define{{.*}} void @_Z32test_wave_reduce_add_u32_dpp_i32Pii( +// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 2) +__device__ void test_wave_reduce_add_u32_dpp_i32(int* out, int in) { + *out = __builtin_amdgcn_wave_reduce_add_u32(in, 2); +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
