https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/150170
>From 01432a07787d4067b4625c2df8882b04faa073c7 Mon Sep 17 00:00:00 2001 From: Aaditya <aaditya.alokdeshpa...@amd.com> Date: Sat, 19 Jul 2025 12:57:27 +0530 Subject: [PATCH 1/2] Add builtins for wave reduction intrinsics --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 25 ++ clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 58 +++ clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 378 +++++++++++++++++++ 3 files changed, 461 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 878543566f0e3..c8b324193e9fb 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -351,6 +351,31 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr") BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n") BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") +//===----------------------------------------------------------------------===// + +// Wave Reduction builtins. + +//===----------------------------------------------------------------------===// + +BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "UiUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "UiUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "iii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_add_i64, "WiWii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_i64, "WiWii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_i64, "WiWii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_u64, "WUiWUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_i64, "WiWii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWii", "nc") + //===----------------------------------------------------------------------===// // R600-NI only builtins. //===----------------------------------------------------------------------===// diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 7dccf82b1a7a3..28ea918b97cc5 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -295,11 +295,69 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst, Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs)); } +static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) { + switch (BuiltinID) { + default: + llvm_unreachable("Unknown BuiltinID for wave reduction"); + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i64: + return Intrinsic::amdgcn_wave_reduce_add; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i64: + return Intrinsic::amdgcn_wave_reduce_sub; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: + return Intrinsic::amdgcn_wave_reduce_min; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: + return Intrinsic::amdgcn_wave_reduce_umin; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: + return Intrinsic::amdgcn_wave_reduce_max; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: + return Intrinsic::amdgcn_wave_reduce_umax; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64: + return Intrinsic::amdgcn_wave_reduce_and; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64: + return Intrinsic::amdgcn_wave_reduce_or; + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: + case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: + return Intrinsic::amdgcn_wave_reduce_xor; + } +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; llvm::SyncScope::ID SSID; switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64: + case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: { + Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID); + llvm::Value *Value = EmitScalarExpr(E->getArg(0)); + llvm::Value *Strategy = EmitScalarExpr(E->getArg(1)); + llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()}); + return Builder.CreateCall(F, {Value, Strategy}); + } case AMDGPU::BI__builtin_amdgcn_div_scale: case AMDGPU::BI__builtin_amdgcn_div_scalef: { // Translate from the intrinsics's struct return to the builtin's out diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index bf022bc6eb446..16f5a524f3094 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -398,6 +398,384 @@ void test_s_sendmsghalt_var(int in) __builtin_amdgcn_s_sendmsghalt(1, in); } +// CHECK-LABEL: @test_wave_reduce_add_i32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_i32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_add_i64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64( +void test_wave_reduce_add_i64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_add_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_add_i64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64( +void test_wave_reduce_add_i64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_add_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32( +void test_wave_reduce_add_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_add_i64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64( +void test_wave_reduce_add_i64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_add_i64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_i32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64( +void test_wave_reduce_sub_i64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64( +void test_wave_reduce_sub_i64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32( +void test_wave_reduce_sub_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_sub_i64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64( +void test_wave_reduce_sub_i64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_sub_i64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_and_b32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_b32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_and_b64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64( +void test_wave_reduce_and_b64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_and_b32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_b32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_and_b64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64( +void test_wave_reduce_and_b64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_and_b32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32( +void test_wave_reduce_and_b32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_and_b64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64( +void test_wave_reduce_and_b64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_and_b64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_or_b32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_b32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_or_b64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64( +void test_wave_reduce_or_b64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_or_b32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_b32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_or_b64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64( +void test_wave_reduce_or_b64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_or_b32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32( +void test_wave_reduce_or_b32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_or_b64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64( +void test_wave_reduce_or_b64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_or_b64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_b32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64( +void test_wave_reduce_xor_b64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_b32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64( +void test_wave_reduce_xor_b64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32( +void test_wave_reduce_xor_b32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_xor_b64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64( +void test_wave_reduce_xor_b64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_i64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64( +void test_wave_reduce_min_i64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_i64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64( +void test_wave_reduce_min_i64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32( +void test_wave_reduce_min_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_i64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64( +void test_wave_reduce_min_i64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_u32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_min_u32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_u64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64( +void test_wave_reduce_min_u64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_min_u32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_min_u32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_u64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64( +void test_wave_reduce_min_u64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_min_u32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32( +void test_wave_reduce_min_u32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_min_u64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64( +void test_wave_reduce_min_u64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_min_u64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_i64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64( +void test_wave_reduce_max_i64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_i64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64( +void test_wave_reduce_max_i64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_i32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32( +void test_wave_reduce_max_i32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_i64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64( +void test_wave_reduce_max_i64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_u32_default +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_max_u32_default(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_u64_default +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64( +void test_wave_reduce_max_u64_default(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u64(in, 0); +} + +// CHECK-LABEL: @test_wave_reduce_max_u32_iterative +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_max_u32_iterative(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_u64_iterative +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64( +void test_wave_reduce_max_u64_iterative(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u64(in, 1); +} + +// CHECK-LABEL: @test_wave_reduce_max_u32_dpp +// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32( +void test_wave_reduce_max_u32_dpp(global int* out, int in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u32(in, 2); +} + +// CHECK-LABEL: @test_wave_reduce_max_u64_dpp +// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64( +void test_wave_reduce_max_u64_dpp(global int* out, long in) +{ + *out = __builtin_amdgcn_wave_reduce_max_u64(in, 2); +} + // CHECK-LABEL: @test_s_barrier // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier( void test_s_barrier() >From dcaea7230ffd22bb56eeb2eb8956c28164b909fa Mon Sep 17 00:00:00 2001 From: Aaditya <aaditya.alokdeshpa...@amd.com> Date: Mon, 28 Jul 2025 14:35:08 +0530 Subject: [PATCH 2/2] Using `int32_t` inplace of `int` --- a.out | Bin 0 -> 22264 bytes clang/include/clang/Basic/BuiltinsAMDGPU.def | 36 ++++++------ hip.sh | 20 +++++++ test.cpp | 58 +++++++++++++++++++ 4 files changed, 96 insertions(+), 18 deletions(-) create mode 100755 a.out create mode 100644 hip.sh create mode 100644 test.cpp diff --git a/a.out b/a.out new file mode 100755 index 0000000000000000000000000000000000000000..2dbcd9ad6edc6908ee25aacddc07417f96ca46f2 GIT binary patch literal 22264 zcmeHPdvIJ=c|Te|A`)AY^8k^A<Pzi{6SS6ONnQs+R+eP164_QQCzt@ay4r`dWVO5O z-gWGRViU&!Rk&k9NnOgcX$OKKJRBaW5@uvg?8Zr(b__H00RdcwbXE-%)6$U1P}JY= z-1Duj*1M4z@(0t=k@kM)`<?Im&bi+`_uTWi_w4T**b=CzQ4#{`U5eb1t(Jzu`lwWB zhlzlMRY+Ze<F)EdY9;vP8uQY_%F+>`V*w8rTI;cD87TFRne<};OQ#6ET60PXl@gLt zZ$4Ni>d_+`Jyxh{@ObL2z)x^FKcNj5d2&MidOXL6-{qx0e5a)&LK{u_hu3)N<$6<L z51-0fIwJIC4^~rj1cNCj%`v|jli!TVPv{xr2%+N7oiDgC`sG5)DgGQZdS^PJjBuvY zT0mo(Q%b1#{tuwXa{jh~`uv?TemiBZyHLv*6e{_xPbEjXI@YIREvaO>u(xGzsH>%` zBbd(w+qFHqkLR>?`)%3+-D<MXe6<60F0GJw88E8?ejPCW%0mc1C46fY+y-7L-+fi^ zi7NP!D)^_X;GeC6KU)QV0~}PToNumzZ?1xGuY$j?3ZAZlKT-vss)F-AsZ{<;P&q2$ z4OQ@P6?|J2JXQt2rwV=o{$Hc6RN<V37@7l=n;`~Dd^m4uUhrmPXK=fR>qLDB;67M} zdtw>dyA_~3u22`DRjI{qCF0(1CDQow(wV~hxdwlqgzYfm-aWNUeQ5z-qkaN@xRf4H zuNWSX%Dr`EJzLt(0%LI3J%}c1rTP|l+MC$F4zY{WX~R1&|0!a%>bp2H3|_zQ5{0t= zm*ERnCSS)EV>xbU>VM-C522U=)xaAZfo|e6g7ixKlzOAGz!1Cq=&GKk@l`$R`o24$ zY#S{|)Xuw3&b7y#WE#lmcqVNS8*DB<n#{ZLoSn!y<8eEgPGo>a_d0eWnRZgid*WbY z@ycRj$?RYz+vB8CJ(+YOIa<g$ZZeZrzz3W{Iy!c1JeQ8A)HozEQNr8iWAX85b{~w> zNFhIVz7_k9A-5x%DY%NUwl3Sw<lS7{8AmDe@toTq1=<!Jb8@zubCPbpKib!~6WE3k zC!dUpbpI}USF7DO)MXFeI<&z~!P@qrOe)`(WbD=<w=?SQ%f_MHw!z+J-<^!b)5sy| z_U-XRH)IQLU(}0tCQ>LXQc&tGxj3Ao3mMPlbS#zczpyezccDa`D3SKc*0|f3%Vlyy zZZ4S~)s}d=?pQ8!w>{#-?5LA>SsFMbo{pt{>oZsJy-B)p`;Z$-X7aF^k9YLr9DCZ# zS(f3w8rSu}aXn>TsGX*^Y;oMps2(|U{Yo&dxvl{Es@|5CS~D+>`sj?uV99%qZKNOV z4m{70kauIz8*bp^txIhk=-=G4d$+wI*x`={H!a}W+Nd`gMHQ!7o7&pb1C>^yQJ*2Z zJ=h-X^i&bw(4p36vhMm^COY1-G1wMt<&$=ODz#^P{YWysJ{UCrKxt~+Go_9UTfEs! zo1vrnaMXvJTUg@{`*3sXYW$cFmuEc_j{9)-+ayT)NKE>MOq2GJ_?rlraMtv@*uC@T zTlC@ne9!rCf4(pK@CE&z4_Z1W+BaQD&DZ*H>7y~B!H1(^D}_cMesKv^%JSi87)znq zhp#H3O11j%)jmAr!|Qx_*oULxE`^8>Zz!Q?pMCfmAHLg%`=6`BKAe3j31dF|asffJ zKKu$FzSoDp-iIIX;cxKaAM@eY`0yh>ywQgr_2F;y;Sc+8@A={-9`oT>`Q&R%uX8wJ zY4q)uiu4a!XcRNK8!cryu7wV4+)~MOEV(BcD>x}@kCQ6I^GJ2$dDp&6KPT+Fojq|p zkz$2t+;(CyyU^9)%VsIM<bl`31Ge2W(7k=Dy<^LkfgRnw_U7BR_YU;gcB$=dSeT96 zOJnGNa1ds9b+i=Hccn9Tr_IQ0G+k(s(d?ADg|9V6@Pymb(SoPmXf#d2-&o#hX&FuI z-PGzV%xZ~S^1zY@mOQZJfh7+td0@!{OCDJAz>)`+Jh0?}B@Zll;5G8VesBFhEOVgC z)LFfjBId&;ve!)Z-Hb_5uK&Ld$2F>6&;L?a<jU#GHJ$;XG8al%^7JGY9>ZpmRjAA; zP*2ax%4Dld`WY%GBGj)(lVLN78ot5HPiWR;KO==Q^B&BPddH34@ppKFLXFzI&x{`D zPc9@s$yfX$>-ZO1PUBwnyROHFSYU*5dcfd9#XWR?3TYN-_qN`xgIIPy_!VzDC+JOK z=0EX6b?Ku)XEd9$v2>Mo)#rjvZZ!YN`@a&zjAkyL&)fN|6OBJIn#zngsm}*9i9|k5 zu{_q=Y99^iiBbD5OqG7w6R}4Mi9|eiP(*7z(dB;8V<Y=)tyq#;^=eX21V?k3Le|d5 zN5|u77fXG6<1w2BP><^qL6J*|SOgqR*M9RnoZ<~%2&SC$Xu%ncAKQ_Qr+Wshp2td% zy(gZ_V?FS}fSL-9JA3U!%5m+xGr7A=_OS2|%W>*yEIpoBWFq)m!E7$M2a8dqg854Q zXfR*$_$7~dOg@`TrOZY6d@#Rnd?b_l*XtI^C3shCDp*J-6Pet&E~!0V)S9mc3;B57 zj_pHvvA&39)#zPM;WCu64m{>D{;a<m<g9igmw}(@vhbHpg~6iiXxx1QT;+*DsaWE~ z8dED#`N{(Chf9~CcAEA;Ce7are9bc4k-RAp!uqIGFX$R|v6A~)?k7_Dao)e77OND? zxwfv<t{d))9Dn<YFWh?QmCJZP@aJD!0$U1{*M(ig*^ABn<6?O0!*;ZgjM*DEY)W** z+B@31I@;U2I^--%+5XlM(`?ZhuNuPYtLo#B@ULCHs>1Ji;(axN8MeIvwQ>eTeYobn z>A>pyrted#Y5&Z}Ppk|C9t;EmA65sl&wGvhNARk~fu-M)50>h{QXM#99w%zbbc|$; zUe=c79fC0vk+uC{HZ}<V^P9O0X`@LuW>i=dndlGbY*|ruG%WKlwA%5~MzD1>=~~VC z{Me1_*CRSs7-@-*7uG|%Wgs~^=BDv7O-p|oZ#UshCjELxOE#DJKs@SNR(ofAsNL#{ zb%r`)p^fc{k=TZ4=cbOBv(ecQZQbNVH;uHn$Kwe+Z8j#l)~T}W+e_Ia<FJ=`Fq#>^ zQNKjOgJczCAzRxzHncmf8#jhF#yVQNHjZ?3wRc9_V~J2G)IQSMk#LZ>A-Zv$8c3$m zlf2Oy80fXybuL!(Yn+AOvkT&UZY%YRx$=(2+YGLW?=WO=8J0-WM+`2!(63d&zh4Dk ztqb&vInfu|T?Nln!T+iX{>>`*vsLhS(lLQrCF3V4&krD9ssHuAz$^8?EZC{k|5Cwp z#*M2WI+*CLsvuUCgJVv9OqnBbH>ZNROw4f|6~v2Ku9jwZ6mUG1%h*{AGx!o&a3r5s zL2sB6jOINJ!?MGQ{&;e{Fb)}1TaE%0`^c^{0y>6lyqHH9?Wm;-<2Jj&7bM8R(S-<( z;H^W9N-jj&H#G2uF^lv(y^j2zXmrtX=pMfwx8$REyD>Vps0OYWhX}eHC_V=X7o?Mm zM+<tmbV1U5l?ECvUJXp86bBL6f*xa)E=3upy<Uc0h;J7y%_5z2yZ1fYySMfC$j$OQ zP~MhoYovXuHg1<J<hdd|b@{9keR+Nf9X5h|3^6a6pOEJo-xYglZZ0GG^86CoZ1g2v z^bL%+biH{TNtfrH&~kk#ztC%;&oP(i%kx#}Fw*>i=im=-1w%~qWn3sU>=#D7oL>)& z`ttmj@us{4Sek!=j1FZ5=mbbz88=EhD)T-vFC^Un@t=&-_<0C1(LZkHvt*uXP#gsr zhO+F{Hed_%WnRm4V9Mpo`JEMPG9SsGaepY+caT_)QT{^p_Za<FD*1xw?<g}atkC_? zDP4P?zA`UO9yNmWH{X{O|4aS<C@}H8U;U_gx$@8aAmWm*z@I>*l75?c3A3-PaOL`2 z2Kyr#Yp}$mrZrbBdq)g0Ud9=W<qv?K13e1L4<5b*5(a$^^a$urK^yR5<|-Jo@M0y6 zwt~8#M?gOgIs?k@G|qy47L)<AAv)CDvs2aVZK%0w)kU>OkVc&E<%W>w^_1wnNLN`N zI7=}IT<SFCBKQpe3xTTcx`qb=J*!vdVGJzO$M8!b{T9$tdZ14WF#TireFo_+(|hZN z10!`~y>;3CIvAkMWBC0V>6g&io9i07>l%CO4%8f6_Q!$1f7E@frnl}$O+SFZx9S>^ z{pPybZgAeW20wwy8Jv*x*?Dd+<M%$i)n<XRJa-@?;`}b-8;HD@cyC>6U}qg?j8rr5 zUn6oI@ojYtyNrAocn9p@5-bS+U4P9hZ9nUu`1YNVi62J}{^Z=?uD-UJwr3-UZ)$_e z+!fz%RO*+BVEwiCYdy-Jd~QD67MXbAr4=u%LydC-FTVBZ72jhtGP&aG(2E>=S-F?N z)Vuz3ZZ2}T^EA?muPj&U{U^nqr~fmfzv<}}KQ{XJivFs|<V_DWL-E^Cgy9d&Cmg=% zg`dsM6`#=-y)s1({&aU_;zLcL3Abta#N|zq!@W(R$V6{bn6c%IMIsYtNe3f`Kh)G5 zIqWtyMh@?4YKR;jXsRtv>RjBW=E+@6t&z!rrpDgM-X?1RiQyBwS6gb%ZK}nuvG_qs z9y~R6a^5?MBi*+Tp$KCqkfw?@B_>XuR#9+~)4ffoB#0o}+G#DY9mhj6^@k3^)sacS zPZEwy{O}0?s(7<W{t}Z<5aMr|hseY$S_T)ZukG35S}j@FA3<4b^+_T()&~Wuzv1EO z<IMGGBZ;xy#2OeIW|=~C1^ltA$)YP7iw97LBa?1Z2(Adz70q--D_t=tu4sfSnu`fa z9z4}RX*q`<J;5fWcrTK8iZ`LuC-^q~^gcN8Nxqw1+>f-$4H_*TM0|RfNmPqWoGv;v zI=SLvT>HpmxEPr{+M2gi<k0u)@8?%v2ks3icV+$KJv}eJJ~Hv6;!Z8T4AE)cKy!nw z_0#7(+gq6gi#xOk++4g3F_!8#kZhabReW#vohMK5t^VmK3Q_FUDkt@|DsDt}P!DVD zN0EuAi)#^`&Onf68#R7<8Z}qV{u~FNonPZ@`X`>QaB)YF0_7J$1}1(*qh}B;89fOG z#;5t5o#oecKBJ#Unl^f#w^t(tMjrw(M(;(mWHb$iM)?$bF;d(Q)@OE5r=HYhEp7k< zO9L9y=iG*9$x@TXPfu@xrQ&J$hmN=zpsuDxK2GYd?S%p>^lSnG3C-i?XvPh_8f|st z&|Km26H}e=_U)dv$i#|Yu7%>eXX@9WX4bD+-Zafy?#aVAJi$Zv)Eok9Jj-`tc)()4 zYo5hwke459KL!)9?OrihyoPp9ulREy3=pmLH(qEm22V7gP8L6h0-oTvZPR3*oXJ{h z_v~?iZ0{nISFC74J$Y&NFMvGvoW>6Rv=)`~Z&8mUhi2Ts+!<}$lV=*ULY8x#-+K#= zYoEul{@O5N<i}W=*|R*CwcjBRyK44~=Kqd7%B^N+H2)O&W#o@*{z>xMs}F1bYvlQe zn?0iWFOp|hZ}x!Z|C&6z-?LfGf0{fSqS;~1KS=&{<OenXN91`2&4x99A9+6BXInLY z5BXK(EzRf1vllblp!pAwXRl;dX?}z}dl|Fmegk0b9pw3Vo;|Dio#gSv#QAIf7V>;d z&CY0kGkG=_v&S`mBY8G$vkz;&ojjgwIDgH*jXZh|IDgGwN1m%QvsukwNgmG-oWJHT zCC}l(>>&6vTP;M<7eUR^!O(_?$IG`qS1t3BdEprN4I_i_{D#rb_m#=>n?`>+ziDLP ziTDFA97E`0OH=&>LF}03=GR4dLJYDzeb&euvnp#MrWooO)Nls<i~yB<SaBGN0ZUU} zcm}c0ipEe}X=y5b6*jF5GQT)u(&<<R`F{N>k1g4gZ(OGCv~&vlSPXKWtY{3U%ynQz zV>oO0a~KnnKeNIjQMiPfZ!r0z#%Z26%xe+YGBC*Z@tW}+R&KL2_j*IGH?+&pEr#B1 zXu{AvhCX2ELxw(L=;MYyZRkH4`tOFmWa!H6CVxY(H?+&pEr#B1Xu{AvhCX2ELxys( zeQExe^V_e<6tl0UER3?<;I~|-%zfuwMRe_v5D3((8PhJw{m1)qIp&KOT2YD(UjXmP zNIMoX0v2cuzF405AY=3G6w_-k;tX|JgiGNT#3=uakq?`c_k+^Dw1<>q7{!nBwRn6o zNIUq4I(}K{{+IjuevMzQ<h~dDS5T}8G%C5z1fPU_rStt0;4II%TP?)q$4-wUPF&h? zi-`#Th#$+z$2^0S^9P7iUYT}9kE?+FT+3Xk8jZY^=W>=)ty1j1Fi88j4mj<wXToru zJ}UJN;FZe%6dI<Qz`XyZoyxOdfj5@gX=6tH3k3pKoY($5&3pq5rFK%}pVRo7QhO=* z1F*w-b=;Iw#)aPg2=JS!Z`T`zDC}H{^EzvugO`~0o;?=}XD|DVi2VuRoM#_gZD~l( zi}M_km&lTS5qLudJAVV*@(Y^ycYup;{mI2DI9{cwD1S9@o|k+-=^A6VYTR$xi^qZU zyx1*ekos__iu|Xl;Ge02KT-uhUIqVq;Fb1q;Qj`c{QMSNw@P?R6?}UY{EjL(d#aV} zXMtDRSK^*3@?WchpQ?gCUj=^yDqf}XTwMk4FnIYs7Xww~-%|z8R>2Qf!5;xm|5#=1 zk_sldh?@1*r-I#^`&(QGD@^`%Dzh-_Em|4I##+_<8dhobDxPwvsu!%BU^a!7Ct1<r zZU70##Ygy9^TG}bdYgeE?=3YIjE|XJD#l{4ES#)^amxl3#I6-d<ryVlJGq>*&x0k7 zyuAgp(G0U*xN7FDlOe^kp%>9EV1dlGospzXuS9e12zE(G2eD&>9n0vQB<5p5TLVeG zvj*R8>WkQ0c6M*;vj_TzcG=vK!{|UVGs3lM57OtQ6B$DyNXdvkbqyu;a!oLr&0@EQ zflctCx8p{CvahWboZe=_b_WM{+2hLII<RAN_keBZ3nO;F%{4QlIphw6P|}Ps%*!&K zi4{_DJL+bz)dly0u(9HV3uTWv*pVWB{@YoekasohyK_!98y8JuD@I%J7q+yZw_+)L z`;C1<^gDhob`pu&*(7$@(A$?_4-r%fULC_r;qejgP#?DV=-!M^c$MA11L;@*wXtwV z|8ge?PeV39@!Kfd4aGZM<7(Nw;?;LX$!ClMc$unflM;*XNk;MBq6;@`FTdLN?%swd zm3CEGWKDBv*Of(gg86^G>xzG;mP(ticz0ywcPYzTqQ33DY>i&K#ar}K&@IwGbPIGi z7O)KUco}zloF^-BQce(EgrDfkIOUW9ETs7Z9%l;z=Os915q+6oC>lU0^I%z}UnTu& zuD^0DFZwdxKVt?&T%Tk>H$VrG6Z&?boEs2c<_8+ggvW8C$NX9L^73=J(!c&J^BXfp zzuD+Z{!)Get3E`SCi%<!Nzv%b{E5iZ2mY`P49g_?G9MH&6EMe&aC!dXzjqt`%|=e< zht3*(|NIc=Y^406?*L<&+Ks;S>jzhR0LO*?Ao_wO5uv`6Uo}`>5@!Mtrq+72bTqi| z1w{P%GVYq{^dzL5B43{WN6Yl*_Z{>!Vxnkh`St%)nZAsdLtUQ2ZWhiL%JctAq*26N z?$Y>5_J{0{fxWR&cKYgbW%@E7cIus;g3PNK_4DfQqh<Ose|FaBo69Kb=jlIIrhm?i z8_yYi&XxN^dHFdW=cKUIPni#t{UaB<eosLcqw|u!%o|LZ8^7G&<vhpz^u6RS^LMiE z<r!0c-v11uFX!>kz$ha6vLDrvFhGQ6kCcw0FTnGN5EK2WVM`}Xg*8zTo}4d~xl_G} zG=Kgw-WGi+r<f?u{}*NYGXMKb*wZMAqM_ybuK@R#U*?gHcY6vPpZY`C&mhi;Dwe;s z3>W>y-k+$2a#<Fhr5=y8nj7mHuvg<J<(GQJHBj-rr(yK^J$fAr=L==5y0VJ?&RaaC Le=BP+%Ju&bz?d`u literal 0 HcmV?d00001 diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index c8b324193e9fb..a9bf747d0aaa1 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -357,24 +357,24 @@ BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n") //===----------------------------------------------------------------------===// -BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "iii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "iii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "iii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "UiUii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "iii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "UiUii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "iii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "iii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "iii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_add_i64, "WiWii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_sub_i64, "WiWii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_min_i64, "WiWii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_min_u64, "WUiWUii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_max_i64, "WiWii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWii", "nc") -BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWii", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "ZiZiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "ZiZiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ZiZiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "ZUiZUiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ZiZiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "ZUiZUiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "ZiZiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "ZiZiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "ZiZiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_add_i64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_sub_i64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_i64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_min_u64, "WUiWUiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_i64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc") +BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc") //===----------------------------------------------------------------------===// // R600-NI only builtins. diff --git a/hip.sh b/hip.sh new file mode 100644 index 0000000000000..b7bf7f67908ba --- /dev/null +++ b/hip.sh @@ -0,0 +1,20 @@ + + "build/bin/clang-22" -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -E -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_isa_version_90a.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_abi_version_500.bc -target-cpu gfx90a -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include/cuda_wrappers -idirafter /opt/rocm-7.1.0/lib/llvm/bin/../../../include -include __clang_hip_runtime_wrapper.h -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-hip-amdgcn-amd-amdhsa-gfx90a.hipi -x hip test.cpp + + "build/bin/clang-22" -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -emit-llvm-bc -emit-llvm-uselists -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_isa_version_90a.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_abi_version_500.bc -target-cpu gfx90a -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -disable-llvm-passes -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-hip-amdgcn-amd-amdhsa-gfx90a.bc -x hip-cpp-output test-hip-amdgcn-amd-amdhsa-gfx90a.hipi + + "build/bin/clang-22" -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -S -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_isa_version_90a.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_abi_version_500.bc -target-cpu gfx90a -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -o test-hip-amdgcn-amd-amdhsa-gfx90a.s -x ir test-hip-amdgcn-amd-amdhsa-gfx90a.bc + + "build/bin/clang-22" -cc1as -triple amdgcn-amd-amdhsa -filetype obj -main-file-name test.cpp -target-cpu gfx90a -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -dwarf-version=5 -mrelocation-model pic -o test-hip-amdgcn-amd-amdhsa-gfx90a.o test-hip-amdgcn-amd-amdhsa-gfx90a.s + "build/bin/lld" -flavor gnu -m elf64_amdgpu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -plugin-opt=O3 --lto-CGO3 -save-temps --whole-archive -o test-hip-amdgcn-amd-amdhsa-gfx90a.out test-hip-amdgcn-amd-amdhsa-gfx90a.o --no-whole-archive + "build/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input=test-hip-amdgcn-amd-amdhsa-gfx90a.out -output=test.cpp-hip-amdgcn-amd-amdhsa.hipfb -verbose + "build/bin/clang-22" -cc1 -triple x86_64-unknown-linux-gnu -aux-triple amdgcn-amd-amdhsa -E -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -fcoverage-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include/cuda_wrappers -idirafter /opt/rocm-7.1.0/lib/llvm/bin/../../../include -include __clang_hip_runtime_wrapper.h -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdeprecated-macro -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-host-x86_64-unknown-linux-gnu.hipi -x hip test.cpp + + "build/bin/clang-22" -cc1 -triple x86_64-unknown-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm-bc -emit-llvm-uselists -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -fcoverage-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -fdeprecated-macro -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -disable-llvm-passes -fcuda-include-gpubinary test.cpp-hip-amdgcn-amd-amdhsa.hipfb -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-host-x86_64-unknown-linux-gnu.bc -x hip-cpp-output test-host-x86_64-unknown-linux-gnu.hipi + + "build/bin/clang-22" -cc1 -triple x86_64-unknown-linux-gnu -aux-triple amdgcn-amd-amdhsa -S -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -fcoverage-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-host-x86_64-unknown-linux-gnu.s -x ir test-host-x86_64-unknown-linux-gnu.bc + + "build/bin/clang-22" -cc1as -triple x86_64-unknown-linux-gnu -filetype obj -main-file-name test.cpp -target-cpu x86-64 -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -dwarf-version=5 -mrelocation-model static -o test-host-x86_64-unknown-linux-gnu.o test-host-x86_64-unknown-linux-gnu.s + "build/bin/ld.lld" -z relro --hash-style=gnu --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o a.out /lib/x86_64-linux-gnu/crt1.o /lib/x86_64-linux-gnu/crti.o /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/clang_rt.crtbegin-x86_64.o -L/usr/lib/gcc/x86_64-linux-gnu/12 -L/usr/lib/gcc/x86_64-linux-gnu/12/../../../../lib64 -L/lib/x86_64-linux-gnu -L/lib/../lib64 -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib64 -L/lib -L/usr/lib --enable-new-dtags test-host-x86_64-unknown-linux-gnu.o -L/opt/rocm-7.1.0/lib/llvm/bin/../../../lib -rpath /opt/rocm-7.1.0/lib/llvm/bin/../../../lib -lamdhip64 -lstdc++ -lm /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/libclang_rt.builtins-x86_64.a -lgcc_s -lc /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/libclang_rt.builtins-x86_64.a -lgcc_s /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/clang_rt.crtend-x86_64.o /lib/x86_64-linux-gnu/crtn.o + + ./a.out diff --git a/test.cpp b/test.cpp new file mode 100644 index 0000000000000..5400171e32d0f --- /dev/null +++ b/test.cpp @@ -0,0 +1,58 @@ +#include <hip/hip_runtime.h> +#include <iostream> + +using namespace std; + +#define HIP_CHECK(status) \ + if (status != hipSuccess) { \ + std::cerr << "HIP error: " << hipGetErrorString(status) \ + << " at line " << __LINE__ << std::endl; \ + std::exit(EXIT_FAILURE); \ + } + + +extern "C" __global__ void test_kernel_wave_reduce_add_u64(int32_t* a, int32_t N, int num_active_lanes) { + int32_t threadID = threadIdx.x ; + int32_t reduced_val = __builtin_amdgcn_wave_reduce_add_i32(N, 0); // uniform value + between waves + // test with : std::numeric_limits<unsigned int>::max() + + // for(int i = 0; i < num_active_lanes; i++) a[i] = reduced_val; + a[threadID] = reduced_val; + // a[thre] = thre + 10; +} + + +int main() { + int num_active_lanes = 1; + // std::unique_ptr<int32_t[]> h1 = std::make_unique<int32_t[]>(num_active_lanes); + int32_t *h1 = (int32_t *)malloc(sizeof(int32_t) * num_active_lanes); + + // std::unique_ptr<int32_t> val_to_reduce = std::make_unique<int32_t>(10); + int32_t *val_to_reduce = (int32_t *)malloc(sizeof(int32_t)); + *val_to_reduce = std::numeric_limits<uint32_t>::min(); + + // initialize the memory + for (int i = 0; i < num_active_lanes; i++) { + h1[i] = 99; + } + + size_t size = num_active_lanes * sizeof(int32_t); + int32_t* d1 = nullptr; + HIP_CHECK(hipMalloc(&d1, size)); + // HIP_CHECK(hipMemcpy(d1, h1.get(), size, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d1, h1, size, hipMemcpyHostToDevice)); + // std::cout << "before kernel: " << std::endl; + hipLaunchKernelGGL(test_kernel_wave_reduce_add_u64, dim3(1), dim3(num_active_lanes), 0, 0, d1, *val_to_reduce, num_active_lanes); + // std::cout << "after kernel: "<< std::endl; + // HIP_CHECK(hipMemcpy(h1.get(), d1, size, hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(h1, d1, size, hipMemcpyDeviceToHost)); + + + std::cout << "individual values: "; + for(int i = 0; i < 1; i++){ + std::cout << h1[i] << ", "; + // std::cout << std::hex << h1[i] << ", "; + } + std::cout << std::endl; + HIP_CHECK(hipFree(d1)); +} \ No newline at end of file _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits