https://github.com/KungFuDonkey created https://github.com/llvm/llvm-project/pull/204923
This PR adds the InterlockedOr function to HLSL. A similar PR (#180804) was opened for this last year but was never merged, so I reimplemented it now that InterlockedAdd has landed, which made this change easy enough for me to do. It also factors the InterlockedAdd code paths into shared helpers so that future Interlocked functions can reuse them. >From 9a4648ee49b5b0a7b255b354adf360d9f81bf530 Mon Sep 17 00:00:00 2001 From: KungFuDonkey <[email protected]> Date: Sat, 20 Jun 2026 13:53:41 +0200 Subject: [PATCH] [HLSL][SPRIV][DXIL] Implement InterlockedOr builtin --- clang/include/clang/Basic/Builtins.td | 6 ++ clang/lib/CodeGen/CGHLSLBuiltins.cpp | 64 ++++++----- clang/lib/CodeGen/CGHLSLRuntime.h | 1 + clang/lib/Sema/HLSLExternalSemaSource.cpp | 18 ++++ clang/lib/Sema/SemaHLSL.cpp | 7 +- .../CodeGenHLSL/builtins/InterlockedOr.hlsl | 59 +++++++++++ .../BuiltIns/InterlockedOr-errors.hlsl | 100 ++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 4 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 4 + .../Target/DirectX/DXILIntrinsicExpansion.cpp | 13 ++- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 19 ++-- llvm/test/CodeGen/DirectX/InterlockedOr.ll | 52 +++++++++ .../SPIRV/hlsl-intrinsics/InterlockedOr.ll | 36 +++++++ .../hlsl-intrinsics/InterlockedOr_spv_i64.ll | 37 +++++++ 14 files changed, 378 insertions(+), 42 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/InterlockedOr.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 053a257ba6d4a..61e63c4d9b073 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5465,6 +5465,12 @@ def HLSLInterlockedAdd : LangBuiltin<"HLSL_LANG"> { let Prototype = "void (...)"; } +def 
HLSLInterlockedOr : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_interlocked_or"]; + let Attributes = [NoThrow]; + let Prototype = "void (...)"; +} + def HLSLWaveActiveBallot : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_wave_active_ballot"]; let Attributes = [NoThrow, Const]; diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 20a2119e28ce1..5f184dbb91068 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -172,6 +172,35 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { return LastInst; } +// Emit an HLSL Interlocked* atomic operation. All Interlocked* builtins share +// the same shape, differing only in the target intrinsic: +// void Interlocked<Op>(groupshared|device T &dest, T value); +// void Interlocked<Op>(groupshared|device T &dest, T value, +// T &original_value); +// Both `dest` and `original_value` are plain references, so we can use the +// underlying lvalue directly without HLSLOutArgExpr unwrapping. +static Value *handleHlslInterlocked(CodeGenFunction &CGF, const CallExpr *E, + Intrinsic::ID ID, const Twine &Name) { + LValue DestLV = CGF.EmitLValue(E->getArg(0)); + Value *Ptr = DestLV.getAddress().emitRawPointer(CGF); + Value *Val = CGF.EmitScalarExpr(E->getArg(1)); + assert(E->getArg(1)->getType()->isIntegerType() && + "Intrinsic Interlocked value operand must be an integer"); + + Value *Call = CGF.EmitRuntimeCall( + Intrinsic::getOrInsertDeclaration(&CGF.CGM.getModule(), ID, + {Val->getType(), Ptr->getType()}), + ArrayRef<Value *>{Ptr, Val}, Name); + + // The 3-arg overload writes the old value (the intrinsic's return value) + // into the `original_value` reference parameter. 
+ if (E->getNumArgs() == 3) { + LValue OrigLV = CGF.EmitLValue(E->getArg(2)); + CGF.EmitStoreThroughLValue(RValue::get(Call), OrigLV); + } + return Call; +} + static Value *handleHlslWaveActiveBallot(CodeGenFunction &CGF, const CallExpr *E) { Value *Cond = CGF.EmitScalarExpr(E->getArg(0)); @@ -1427,33 +1456,14 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op}, "hlsl.wave.active.bit.and"); } - case Builtin::BI__builtin_hlsl_interlocked_add: { - // HLSL signatures (synthesized as overloads in HLSLExternalSemaSource): - // void InterlockedAdd(groupshared|device T &dest, T value); - // void InterlockedAdd(groupshared|device T &dest, T value, - // T &original_value); - // Both `dest` and `original_value` are plain references, so we can use - // the underlying lvalue directly without HLSLOutArgExpr unwrapping. - LValue DestLV = EmitLValue(E->getArg(0)); - Value *Ptr = DestLV.getAddress().emitRawPointer(*this); - Value *Val = EmitScalarExpr(E->getArg(1)); - assert(E->getArg(1)->getType()->isIntegerType() && - "Intrinsic InterlockedAdd value operand must be an integer"); - - Intrinsic::ID ID = CGM.getHLSLRuntime().getInterlockedAddIntrinsic(); - Value *Call = EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID, - {Val->getType(), Ptr->getType()}), - ArrayRef<Value *>{Ptr, Val}, "hlsl.interlocked.add"); - - // The 3-arg overload writes the old value (the intrinsic's return value) - // into the `original_value` reference parameter. 
- if (E->getNumArgs() == 3) { - LValue OrigLV = EmitLValue(E->getArg(2)); - EmitStoreThroughLValue(RValue::get(Call), OrigLV); - } - return Call; - } + case Builtin::BI__builtin_hlsl_interlocked_add: + return handleHlslInterlocked( + *this, E, CGM.getHLSLRuntime().getInterlockedAddIntrinsic(), + "hlsl.interlocked.add"); + case Builtin::BI__builtin_hlsl_interlocked_or: + return handleHlslInterlocked( + *this, E, CGM.getHLSLRuntime().getInterlockedOrIntrinsic(), + "hlsl.interlocked.or"); case Builtin::BI__builtin_hlsl_wave_active_ballot: { [[maybe_unused]] Value *Op = EmitScalarExpr(E->getArg(0)); assert(Op->getType()->isIntegerTy(1) && diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index a126d4612a5f4..154d19ff7bd25 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -152,6 +152,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveBitXor, wave_reduce_xor) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveBitAnd, wave_reduce_and) GENERATE_HLSL_INTRINSIC_FUNCTION(InterlockedAdd, interlocked_add) + GENERATE_HLSL_INTRINSIC_FUNCTION(InterlockedOr, interlocked_or) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveMax, wave_reduce_max) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveUMax, wave_reduce_umax) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveActiveMin, wave_reduce_min) diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 3f7255cb3f8a7..537357be5e2bd 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -760,8 +760,26 @@ static void defineHLSLInterlockedAdd(Sema &S, NamespaceDecl *NS) { ThreeArg); } +// Synthesize the InterlockedOr overload set: {int, uint, int64_t, uint64_t} +// x {groupshared, device} x {2-arg, 3-arg}. 
+static void defineHLSLInterlockedOr(Sema &S, NamespaceDecl *NS) { + ASTContext &AST = S.getASTContext(); + // HLSL: int64_t == long, uint64_t == unsigned long (see hlsl_basic_types.h). + QualType Elems[] = {AST.IntTy, AST.UnsignedIntTy, AST.LongTy, + AST.UnsignedLongTy}; + LangAS AddrSpaces[] = {LangAS::hlsl_groupshared, LangAS::hlsl_device}; + + for (QualType ElemTy : Elems) + for (LangAS AS : AddrSpaces) + for (bool ThreeArg : {false, true}) + buildAtomicOverload(S, NS, "InterlockedOr", + "__builtin_hlsl_interlocked_or", ElemTy, AS, + ThreeArg); +} + void HLSLExternalSemaSource::defineHLSLAtomicIntrinsics() { defineHLSLInterlockedAdd(*SemaPtr, HLSLNamespace); + defineHLSLInterlockedOr(*SemaPtr, HLSLNamespace); } void HLSLExternalSemaSource::onCompletion(CXXRecordDecl *Record, diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 075dc97b0aef2..e3d8e4ff22bcb 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -4534,10 +4534,11 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { TheCall->setType(ArgTyExpr); break; } - case Builtin::BI__builtin_hlsl_interlocked_add: { + case Builtin::BI__builtin_hlsl_interlocked_add: + case Builtin::BI__builtin_hlsl_interlocked_or: { // The builtin's prototype in Builtins.td is `void (...)`, so direct calls - // to `__builtin_hlsl_interlocked_add` bypass argument checking entirely. - // When reached via the synthesized `InterlockedAdd` overload set in + // to `__builtin_hlsl_interlocked_*` bypass argument checking entirely. + // When reached via the synthesized `Interlocked*` overload set in // HLSLExternalSemaSource, overload resolution has already enforced the // argument count, integer-type matching, and the address-space requirement // on `dest`. 
The checks below are a safety net for callers that invoke the diff --git a/clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl b/clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl new file mode 100644 index 0000000000000..a4c4f4cc7dd6c --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/InterlockedOr.hlsl @@ -0,0 +1,59 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.6-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK + +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK + +// Test basic lowering of HLSL InterlockedOr to the target intrinsic. + +groupshared int gs_i32; +groupshared uint gs_u32; +groupshared int64_t gs_i64; +groupshared uint64_t gs_u64; + +// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_int_2arg +// DXCHECK: call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}}) +// SPVCHECK: call spir_func i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}}) +export void test_int_2arg(int v) { + InterlockedOr(gs_i32, v); +} + +// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_uint_2arg +// DXCHECK: call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}}) +// SPVCHECK: call spir_func i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}}) +export void test_uint_2arg(uint v) { + InterlockedOr(gs_u32, v); +} + +// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_int_3arg +// DXCHECK: %[[R:.*]] = call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}}) +// SPVCHECK: %[[R:.*]] = call spir_func i32 
@llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_i32{{.*}}, i32 %{{.*}}) +// CHECK: store i32 %[[R]], ptr {{.*}} +export void test_int_3arg(int v, out int orig) { + InterlockedOr(gs_i32, v, orig); +} + +// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_uint_3arg +// DXCHECK: %[[R:.*]] = call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}}) +// SPVCHECK: %[[R:.*]] = call spir_func i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) {{.*}}@gs_u32{{.*}}, i32 %{{.*}}) +// CHECK: store i32 %[[R]], ptr {{.*}} +export void test_uint_3arg(uint v, out uint orig) { + InterlockedOr(gs_u32, v, orig); +} + +// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_int64_2arg +// DXCHECK: call i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_i64{{.*}}, i64 %{{.*}}) +// SPVCHECK: call spir_func i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_i64{{.*}}, i64 %{{.*}}) +export void test_int64_2arg(int64_t v) { + InterlockedOr(gs_i64, v); +} + +// CHECK-LABEL: define {{(dso_local |hidden |internal |protected |spir_func )*}}void @{{.*}}test_uint64_3arg +// DXCHECK: %[[R:.*]] = call i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_u64{{.*}}, i64 %{{.*}}) +// SPVCHECK: %[[R:.*]] = call spir_func i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3) {{.*}}@gs_u64{{.*}}, i64 %{{.*}}) +// CHECK: store i64 %[[R]], ptr {{.*}} +export void test_uint64_3arg(uint64_t v, out uint64_t orig) { + InterlockedOr(gs_u64, v, orig); +} diff --git a/clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl new file mode 100644 index 0000000000000..faa2825139ad4 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/InterlockedOr-errors.hlsl @@ -0,0 +1,100 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header \ +// RUN: -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only \ 
+// RUN: -disable-llvm-passes -verify + +// InterlockedOr is provided as a set of address-space-qualified overloads +// (groupshared/device, {int,uint,int64_t,uint64_t}, 2-arg/3-arg). All arg +// mismatches surface as "no matching function" with 16 candidates. The +// candidate notes come from synthesized FunctionDecls with no source +// location, so they are matched with `@*:*`. + +groupshared int gs_i32; +groupshared float gs_f32; +struct S { int x; }; +groupshared S gs_s; + +void too_few(int v) { + InterlockedOr(gs_i32); // expected-error{{no matching function for call to 'InterlockedOr'}} + // expected-note@*:* 16 {{candidate function}} +} + +void too_many(int v, int extra) { + int o; + InterlockedOr(gs_i32, v, o, extra); // expected-error{{no matching function for call to 'InterlockedOr'}} + // expected-note@*:* 16 {{candidate function}} +} + +// Atomics must operate on actual addresses in groupshared or device memory; +// passing a plain local (no address space) must not bind to any overload. +void local_dest(int v) { + int dest; + InterlockedOr(dest, v); // expected-error{{no matching function for call to 'InterlockedOr'}} + // expected-note@*:* 16 {{candidate function}} +} + +void float_dest(float v) { + InterlockedOr(gs_f32, v); // expected-error{{no matching function for call to 'InterlockedOr'}} + // expected-note@*:* 16 {{candidate function}} +} + +void struct_dest(int v) { + InterlockedOr(gs_s, v); // expected-error{{no matching function for call to 'InterlockedOr'}} + // expected-note@*:* 16 {{candidate function}} +} + +void mismatched_orig_type(int v) { + uint orig; + InterlockedOr(gs_i32, v, orig); // expected-error{{no matching function for call to 'InterlockedOr'}} + // expected-note@*:* 16 {{candidate function}} +} + +// The tests below exercise direct invocations of the underlying clang builtin +// `__builtin_hlsl_interlocked_or`. 
These bypass overload resolution against +// the synthesized `InterlockedOr` overload set (the builtin's prototype in +// Builtins.td is `void (...)`), so each error is produced by the explicit +// checks in SemaHLSL.cpp rather than by candidate-set rejection. + +void direct_too_few() { + __builtin_hlsl_interlocked_or(gs_i32); + // expected-error@-1 {{too few arguments to function call, expected at least 2, have 1}} +} + +void direct_too_many(int v, int extra) { + int o; + __builtin_hlsl_interlocked_or(gs_i32, v, o, extra); + // expected-error@-1 {{too many arguments to function call, expected at most 3, have 4}} +} + +void direct_non_integer_dest() { + S local_s; + __builtin_hlsl_interlocked_or(local_s, 1); + // expected-error@-1 {{1st argument must be a scalar integer type (was 'S')}} +} + +void direct_nonlvalue_dest(int v) { + __builtin_hlsl_interlocked_or(1, v); + // expected-error@-1 {{cannot bind non-lvalue argument '1' to out parameter}} +} + +void direct_mismatched_value() { + uint uv = 1u; + __builtin_hlsl_interlocked_or(gs_i32, uv); + // expected-error@-1 {{passing 'uint' (aka 'unsigned int') to parameter of incompatible type 'int'}} +} + +void direct_mismatched_orig(int v) { + uint orig; + __builtin_hlsl_interlocked_or(gs_i32, v, orig); + // expected-error@-1 {{passing 'uint' (aka 'unsigned int') to parameter of incompatible type 'int'}} +} + +void direct_nonlvalue_orig(int v) { + __builtin_hlsl_interlocked_or(gs_i32, v, 1); + // expected-error@-1 {{cannot bind non-lvalue argument '1' to out parameter}} +} + +void direct_default_as_dest(int v) { + int local; + __builtin_hlsl_interlocked_or(local, v); + // expected-error@-1 {{1st argument to atomic builtin must reference groupshared or device memory (was 'int')}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index af360dfc78965..d2db4905aeabe 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ 
-261,6 +261,10 @@ def int_dx_interlocked_add : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyptr_ty, LLVMMatchType<0>], [IntrArgMemOnly]>; +def int_dx_interlocked_or : + DefaultAttrsIntrinsic<[llvm_anyint_ty], + [llvm_anyptr_ty, LLVMMatchType<0>], + [IntrArgMemOnly]>; def int_dx_wave_reduce_max : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>; def int_dx_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>; def int_dx_wave_reduce_min : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem, IntrTriviallyScalarizable]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 6e4cf8f7e72dc..5c59a32ddce99 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -152,6 +152,10 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyptr_ty, LLVMMatchType<0>], [IntrArgMemOnly]>; + def int_spv_interlocked_or : + DefaultAttrsIntrinsic<[llvm_anyint_ty], + [llvm_anyptr_ty, LLVMMatchType<0>], + [IntrArgMemOnly]>; def int_spv_subgroup_ballot : ClangBuiltin<"__builtin_spirv_subgroup_ballot">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i1_ty], [IntrConvergent, IntrNoMem]>; def int_spv_wave_reduce_umax : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrConvergent, IntrNoMem]>; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 88eda6656d89b..62fb8d1b12891 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -228,6 +228,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_step: case Intrinsic::dx_radians: case Intrinsic::dx_interlocked_add: + case 
Intrinsic::dx_interlocked_or: case Intrinsic::usub_sat: case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_fadd: @@ -771,15 +772,16 @@ static Value *expandRadiansIntrinsic(CallInst *Orig) { return Builder.CreateFMul(X, PiOver180); } -static Value *expandInterlockedAddIntrinsic(CallInst *Orig) { - // Lower @llvm.dx.interlocked.add(ptr, val) to `atomicrmw add ptr, val +static Value *expandInterlockedIntrinsic(CallInst *Orig, + AtomicRMWInst::BinOp Op) { + // Lower @llvm.dx.interlocked.<op>(ptr, val) to `atomicrmw <op> ptr, val // monotonic`. HLSL Interlocked operations imply no fence/barrier, which maps // to monotonic ordering. The instruction's result is the old value, matching // the intrinsic's return value. Value *Ptr = Orig->getArgOperand(0); Value *Val = Orig->getArgOperand(1); IRBuilder<> Builder(Orig); - return Builder.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Val, MaybeAlign(), + return Builder.CreateAtomicRMW(Op, Ptr, Val, MaybeAlign(), AtomicOrdering::Monotonic); } @@ -1245,7 +1247,10 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { Result = expandRadiansIntrinsic(Orig); break; case Intrinsic::dx_interlocked_add: - Result = expandInterlockedAddIntrinsic(Orig); + Result = expandInterlockedIntrinsic(Orig, AtomicRMWInst::Add); + break; + case Intrinsic::dx_interlocked_or: + Result = expandInterlockedIntrinsic(Orig, AtomicRMWInst::Or); break; case Intrinsic::dx_resource_load_rawbuffer: if (expandBufferLoadIntrinsic(Orig, /*IsRaw*/ true)) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index cd99015a61ba9..2220fc72e3837 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -177,8 +177,8 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectAtomicRMW(Register ResVReg, SPIRVTypeInst ResType, MachineInstr &I, unsigned NewOpcode, unsigned NegateOpcode = 0) const; - bool 
selectInterlockedAdd(Register ResVReg, SPIRVTypeInst ResType, - MachineInstr &I) const; + bool selectInterlocked(Register ResVReg, SPIRVTypeInst ResType, + MachineInstr &I, unsigned Opcode) const; bool selectAtomicCmpXchg(Register ResVReg, SPIRVTypeInst ResType, MachineInstr &I) const; @@ -2445,16 +2445,17 @@ bool SPIRVInstructionSelector::selectAtomicRMW(Register ResVReg, return true; } -bool SPIRVInstructionSelector::selectInterlockedAdd(Register ResVReg, - SPIRVTypeInst ResType, - MachineInstr &I) const { +bool SPIRVInstructionSelector::selectInterlocked(Register ResVReg, + SPIRVTypeInst ResType, + MachineInstr &I, + unsigned Opcode) const { Register Ptr = I.getOperand(2).getReg(); Register Value = I.getOperand(3).getReg(); SPIRV::StorageClass::StorageClass SC = GR.getPointerStorageClass(Ptr); assert((SC == SPIRV::StorageClass::Workgroup || SC == SPIRV::StorageClass::StorageBuffer) && - "InterlockedAdd requires Workgroup or StorageBuffer storage class"); + "Interlocked op requires Workgroup or StorageBuffer storage class"); uint32_t Scope = static_cast<uint32_t>(SC == SPIRV::StorageClass::Workgroup ? 
SPIRV::Scope::Workgroup : SPIRV::Scope::Device); @@ -2463,7 +2464,7 @@ bool SPIRVInstructionSelector::selectInterlockedAdd(Register ResVReg, uint32_t MemSem = static_cast<uint32_t>(getMemSemanticsForStorageClass(SC)); Register MemSemReg = buildI32Constant(MemSem, I); - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpAtomicIAdd)) + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addUse(Ptr) @@ -5362,7 +5363,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectWaveReduceOp(ResVReg, ResType, I, SPIRV::OpGroupNonUniformBitwiseAnd); case Intrinsic::spv_interlocked_add: - return selectInterlockedAdd(ResVReg, ResType, I); + return selectInterlocked(ResVReg, ResType, I, SPIRV::OpAtomicIAdd); + case Intrinsic::spv_interlocked_or: + return selectInterlocked(ResVReg, ResType, I, SPIRV::OpAtomicOr); case Intrinsic::spv_wave_reduce_umax: return selectWaveReduceMax(ResVReg, ResType, I, /*IsUnsigned*/ true); case Intrinsic::spv_wave_reduce_max: diff --git a/llvm/test/CodeGen/DirectX/InterlockedOr.ll b/llvm/test/CodeGen/DirectX/InterlockedOr.ll new file mode 100644 index 0000000000000..34387adefeed3 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/InterlockedOr.ll @@ -0,0 +1,52 @@ +; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.6-compute %s | FileCheck %s + +; Verify llvm.dx.interlocked.or expands to atomicrmw or monotonic. + +; Groupshared (addrspace 3) memory tests. 
+@gs_i32 = internal addrspace(3) global i32 zeroinitializer +@gs_i64 = internal addrspace(3) global i64 zeroinitializer + +define i32 @test_i32(i32 %v) { +entry: +; CHECK-LABEL: @test_i32 +; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(3) @gs_i32, i32 %v monotonic +; CHECK: ret i32 %[[R]] + %r = call i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3) @gs_i32, i32 %v) + ret i32 %r +} + +define i64 @test_i64(i64 %v) { +entry: +; CHECK-LABEL: @test_i64 +; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(3) @gs_i64, i64 %v monotonic +; CHECK: ret i64 %[[R]] + %r = call i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3) @gs_i64, i64 %v) + ret i64 %r +} + +; Device (addrspace 1) memory tests. +@dev_i32 = external addrspace(1) global i32 +@dev_i64 = external addrspace(1) global i64 + +define i32 @test_device_i32(i32 %v) { +entry: +; CHECK-LABEL: @test_device_i32 +; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(1) @dev_i32, i32 %v monotonic +; CHECK: ret i32 %[[R]] + %r = call i32 @llvm.dx.interlocked.or.i32.p1(ptr addrspace(1) @dev_i32, i32 %v) + ret i32 %r +} + +define i64 @test_device_i64(i64 %v) { +entry: +; CHECK-LABEL: @test_device_i64 +; CHECK: %[[R:.*]] = atomicrmw or ptr addrspace(1) @dev_i64, i64 %v monotonic +; CHECK: ret i64 %[[R]] + %r = call i64 @llvm.dx.interlocked.or.i64.p1(ptr addrspace(1) @dev_i64, i64 %v) + ret i64 %r +} + +declare i32 @llvm.dx.interlocked.or.i32.p3(ptr addrspace(3), i32) +declare i64 @llvm.dx.interlocked.or.i64.p3(ptr addrspace(3), i64) +declare i32 @llvm.dx.interlocked.or.i32.p1(ptr addrspace(1), i32) +declare i64 @llvm.dx.interlocked.or.i64.p1(ptr addrspace(1), i64) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll new file mode 100644 index 0000000000000..a8b3c692f9bc0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - | FileCheck %s 
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %} + +; Test lowering of llvm.spv.interlocked.or to OpAtomicOr. + +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#scope_wg:]] = OpConstant %[[#uint]] 2 +; CHECK-DAG: %[[#scope_dev:]] = OpConstant %[[#uint]] 1 +; CHECK-DAG: %[[#mem_wg:]] = OpConstant %[[#uint]] 256 +; CHECK-DAG: %[[#mem_uniform:]] = OpConstant %[[#uint]] 64 + +@gs_i32 = internal addrspace(3) global i32 zeroinitializer +@dev_i32 = external addrspace(11) global i32 + +; Workgroup (addrspace 3) memory tests. + +; CHECK-LABEL: Begin function test_i32 +define i32 @test_i32(i32 %v) { +entry: +; CHECK: %[[#R:]] = OpAtomicOr %[[#uint]] %[[#]] %[[#scope_wg]] %[[#mem_wg]] %[[#]] + %r = call i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3) @gs_i32, i32 %v) + ret i32 %r +} + +; Device / StorageBuffer (addrspace 11) memory tests. + +; CHECK-LABEL: Begin function test_device_i32 +define i32 @test_device_i32(i32 %v) { +entry: +; CHECK: %[[#R:]] = OpAtomicOr %[[#uint]] %[[#]] %[[#scope_dev]] %[[#mem_uniform]] %[[#]] + %r = call i32 @llvm.spv.interlocked.or.i32.p11(ptr addrspace(11) @dev_i32, i32 %v) + ret i32 %r +} + +declare i32 @llvm.spv.interlocked.or.i32.p3(ptr addrspace(3), i32) +declare i32 @llvm.spv.interlocked.or.i32.p11(ptr addrspace(11), i32) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll new file mode 100644 index 0000000000000..18a16229ee718 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/InterlockedOr_spv_i64.ll @@ -0,0 +1,37 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %} + +; Test lowering of llvm.spv.interlocked.or with i64 to OpAtomicOr. 
+ +; CHECK-DAG: %[[#ulong:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#scope_wg:]] = OpConstant %[[#uint]] 2 +; CHECK-DAG: %[[#scope_dev:]] = OpConstant %[[#uint]] 1 +; CHECK-DAG: %[[#mem_wg:]] = OpConstant %[[#uint]] 256 +; CHECK-DAG: %[[#mem_uniform:]] = OpConstant %[[#uint]] 64 + +@gs_i64 = internal addrspace(3) global i64 zeroinitializer +@dev_i64 = external addrspace(11) global i64 + +; Workgroup (addrspace 3) memory test. + +; CHECK-LABEL: Begin function test_i64 +define i64 @test_i64(i64 %v) { +entry: +; CHECK: %[[#R:]] = OpAtomicOr %[[#ulong]] %[[#]] %[[#scope_wg]] %[[#mem_wg]] %[[#]] + %r = call i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3) @gs_i64, i64 %v) + ret i64 %r +} + +; Device / StorageBuffer (addrspace 11) memory test. + +; CHECK-LABEL: Begin function test_device_i64 +define i64 @test_device_i64(i64 %v) { +entry: +; CHECK: %[[#R:]] = OpAtomicOr %[[#ulong]] %[[#]] %[[#scope_dev]] %[[#mem_uniform]] %[[#]] + %r = call i64 @llvm.spv.interlocked.or.i64.p11(ptr addrspace(11) @dev_i64, i64 %v) + ret i64 %r +} + +declare i64 @llvm.spv.interlocked.or.i64.p3(ptr addrspace(3), i64) +declare i64 @llvm.spv.interlocked.or.i64.p11(ptr addrspace(11), i64) _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
