https://github.com/skc7 created https://github.com/llvm/llvm-project/pull/190137

Upstreaming clangIR PR: https://github.com/llvm/clangir/pull/2047

This PR adds support for lowering of _builtin_amdgcn_wave_reduce* amdgpu 
builtins to clangIR.
Followed similar lowering from reference clang->llvmir in 
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp.

builtins-amdgc.hip is added to test cir and llvm lowering for hip.

>From 2a30e72eebac8413e85f7e992a4ccb2e6eb613a2 Mon Sep 17 00:00:00 2001
From: skc7 <[email protected]>
Date: Wed, 1 Apr 2026 16:18:39 +0530
Subject: [PATCH] [CIR][AMDGPU] Add amdgpu wave reduce builtins codegen

---
 clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp |  46 ++++-
 clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip | 180 ++++++++++++++++++
 2 files changed, 222 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp 
b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp
index b4b0c455904fc..0f6fc8949b0e8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAMDGPU.cpp
@@ -19,6 +19,40 @@
 using namespace clang;
 using namespace clang::CIRGen;
 
+static llvm::StringRef getIntrinsicNameforWaveReduction(unsigned BuiltinID) {
+  switch (BuiltinID) {
+  default:
+    llvm_unreachable("Unknown BuiltinID for wave reduction");
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
+    return "amdgcn.wave.reduce.add";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
+    return "amdgcn.wave.reduce.sub";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+    return "amdgcn.wave.reduce.min";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
+    return "amdgcn.wave.reduce.umin";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+    return "amdgcn.wave.reduce.max";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
+    return "amdgcn.wave.reduce.umax";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
+    return "amdgcn.wave.reduce.and";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
+    return "amdgcn.wave.reduce.or";
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
+    return "amdgcn.wave.reduce.xor";
+  }
+}
+
 std::optional<mlir::Value>
 CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId,
                                       const CallExpr *expr) {
@@ -41,10 +75,14 @@ CIRGenFunction::emitAMDGPUBuiltinExpr(unsigned builtinId,
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
   case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented AMDGPU builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinId));
-    return mlir::Value{};
+    llvm::StringRef intrinsicName = 
getIntrinsicNameforWaveReduction(builtinId);
+    mlir::Value Value = emitScalarExpr(expr->getArg(0));
+    mlir::Value Strategy = emitScalarExpr(expr->getArg(1));
+    return cir::LLVMIntrinsicCallOp::create(
+               builder, getLoc(expr->getExprLoc()),
+               builder.getStringAttr(intrinsicName), Value.getType(),
+               {Value, Strategy})
+        .getResult();
   }
   case AMDGPU::BI__builtin_amdgcn_div_scale:
   case AMDGPU::BI__builtin_amdgcn_div_scalef: {
diff --git a/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip 
b/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip
new file mode 100644
index 0000000000000..5b178274d5fbd
--- /dev/null
+++ b/clang/test/CIR/CodeGenHIP/builtins-amdgcn.hip
@@ -0,0 +1,180 @@
+#include "../CodeGenCUDA/Inputs/cuda.h"
+
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -std=c++11 -fclangir \
+// RUN:            -target-cpu tahiti -fcuda-is-device -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -std=c++11 -fclangir \
+// RUN:            -target-cpu tahiti -fcuda-is-device -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -std=c++11 \
+// RUN:            -target-cpu tahiti -fcuda-is-device -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+//===----------------------------------------------------------------------===//
+// Test AMDGPU builtins
+//===----------------------------------------------------------------------===//
+
+// CIR-LABEL: @_Z28test_wave_reduce_add_u32_i32Pi
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, 
!s32i) -> !u32i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_add_u32_i32Pii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_add_u32_i32(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_add_u32(in, 0);
+}
+
+// CIR-LABEL: @_Z28test_wave_reduce_add_u64_i64Pl
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u64i, 
!s32i) -> !u64i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_add_u64_i64Pll(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.add.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_add_u64_i64(long* out, long in) {
+  *out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
+}
+
+// CIR-LABEL: @_Z28test_wave_reduce_sub_u32_i32Pi
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.sub" {{.*}} : (!u32i, 
!s32i) -> !u32i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_sub_u32_i32Pii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_sub_u32_i32(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_sub_u32(in, 0);
+}
+
+// CIR-LABEL: @_Z28test_wave_reduce_sub_u64_i64Pl
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.sub" {{.*}} : (!u64i, 
!s32i) -> !u64i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_sub_u64_i64Pll(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.sub.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_sub_u64_i64(long* out, long in) {
+  *out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0);
+}
+
+// CIR-LABEL: @_Z29test_wave_reduce_min_i32_signPii
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.min" {{.*}} : (!s32i, 
!s32i) -> !s32i
+// LLVM: define{{.*}} void @_Z29test_wave_reduce_min_i32_signPii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_min_i32_sign(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0);
+}
+
+// CIR-LABEL: @_Z31test_wave_reduce_min_u32_unsignPjj
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umin" {{.*}} : (!u32i, 
!s32i) -> !u32i
+// LLVM: define{{.*}} void @_Z31test_wave_reduce_min_u32_unsignPjj(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_min_u32_unsign(unsigned int* out, unsigned 
int in) {
+  *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0);
+}
+
+// CIR-LABEL: @_Z29test_wave_reduce_min_i64_signPll
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.min" {{.*}} : (!s64i, 
!s32i) -> !s64i
+// LLVM: define{{.*}} void @_Z29test_wave_reduce_min_i64_signPll(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.min.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_min_i64_sign(long* out, long in) {
+  *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
+}
+
+// CIR-LABEL: @_Z31test_wave_reduce_min_u64_unsignPmm
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umin" {{.*}} : (!u64i, 
!s32i) -> !u64i
+// LLVM: define{{.*}} void @_Z31test_wave_reduce_min_u64_unsignPmm(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.umin.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_min_u64_unsign(unsigned long* out, unsigned 
long in) {
+  *out = __builtin_amdgcn_wave_reduce_min_u64(in, 0);
+}
+
+// CIR-LABEL: @_Z29test_wave_reduce_max_i32_signPii
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.max" {{.*}} : (!s32i, 
!s32i) -> !s32i
+// LLVM: define{{.*}} void @_Z29test_wave_reduce_max_i32_signPii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_max_i32_sign(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0);
+}
+
+// CIR-LABEL: @_Z31test_wave_reduce_max_u32_unsignPjj
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umax" {{.*}} : (!u32i, 
!s32i) -> !u32i
+// LLVM: define{{.*}} void @_Z31test_wave_reduce_max_u32_unsignPjj(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_max_u32_unsign(unsigned int* out, unsigned 
int in) {
+  *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0);
+}
+
+// CIR-LABEL: @_Z29test_wave_reduce_max_i64_signPll
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.max" {{.*}} : (!s64i, 
!s32i) -> !s64i
+// LLVM: define{{.*}} void @_Z29test_wave_reduce_max_i64_signPll(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.max.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_max_i64_sign(long* out, long in) {
+  *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
+}
+
+// CIR-LABEL: @_Z31test_wave_reduce_max_u64_unsignPmm
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.umax" {{.*}} : (!u64i, 
!s32i) -> !u64i
+// LLVM: define{{.*}} void @_Z31test_wave_reduce_max_u64_unsignPmm(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.umax.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_max_u64_unsign(unsigned long* out, unsigned 
long in) {
+  *out = __builtin_amdgcn_wave_reduce_max_u64(in, 0);
+}
+
+// CIR-LABEL: @_Z28test_wave_reduce_and_b32_i32Pii
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.and" {{.*}} : (!s32i, 
!s32i) -> !s32i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_and_b32_i32Pii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_and_b32_i32(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0);
+}
+
+// CIR-LABEL: @_Z28test_wave_reduce_and_b64_i64Pll
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.and" {{.*}} : (!s64i, 
!s32i) -> !s64i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_and_b64_i64Pll(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_and_b64_i64(long* out, long in) {
+  *out = __builtin_amdgcn_wave_reduce_and_b64(in, 0);
+}
+
+// CIR-LABEL: @_Z27test_wave_reduce_or_b32_i32Pii
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.or" {{.*}} : (!s32i, 
!s32i) -> !s32i
+// LLVM: define{{.*}} void @_Z27test_wave_reduce_or_b32_i32Pii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_or_b32_i32(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0);
+}
+
+// CIR-LABEL: @_Z27test_wave_reduce_or_b64_i64Pll
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.or" {{.*}} : (!s64i, 
!s32i) -> !s64i
+// LLVM: define{{.*}} void @_Z27test_wave_reduce_or_b64_i64Pll(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_or_b64_i64(long* out, long in) {
+  *out = __builtin_amdgcn_wave_reduce_or_b64(in, 0);
+}
+
+// CIR-LABEL: @_Z28test_wave_reduce_xor_b32_i32Pii
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.xor" {{.*}} : (!s32i, 
!s32i) -> !s32i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_xor_b32_i32Pii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_xor_b32_i32(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0);
+}
+
+// CIR-LABEL: @_Z28test_wave_reduce_xor_b64_i64Pll
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.xor" {{.*}} : (!s64i, 
!s32i) -> !s64i
+// LLVM: define{{.*}} void @_Z28test_wave_reduce_xor_b64_i64Pll(
+// LLVM: call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %{{.*}}, i32 0)
+__device__ void test_wave_reduce_xor_b64_i64(long* out, long in) {
+  *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 0);
+}
+
+// CIR-LABEL: @_Z38test_wave_reduce_add_u32_iterative_i32Pii
+// CIR: cir.const #cir.int<1> : !s32i
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, 
!s32i) -> !u32i
+// LLVM: define{{.*}} void @_Z38test_wave_reduce_add_u32_iterative_i32Pii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 1)
+__device__ void test_wave_reduce_add_u32_iterative_i32(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_add_u32(in, 1);
+}
+
+// CIR-LABEL: @_Z32test_wave_reduce_add_u32_dpp_i32Pii
+// CIR: cir.const #cir.int<2> : !s32i
+// CIR: cir.call_llvm_intrinsic "amdgcn.wave.reduce.add" {{.*}} : (!u32i, 
!s32i) -> !u32i
+// LLVM: define{{.*}} void @_Z32test_wave_reduce_add_u32_dpp_i32Pii(
+// LLVM: call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %{{.*}}, i32 2)
+__device__ void test_wave_reduce_add_u32_dpp_i32(int* out, int in) {
+  *out = __builtin_amdgcn_wave_reduce_add_u32(in, 2);
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to