Author: Artem Belevich Date: 2026-03-12T09:48:09-07:00 New Revision: 595b96140013cff96b0b92b148857e3e72fa2786
URL: https://github.com/llvm/llvm-project/commit/595b96140013cff96b0b92b148857e3e72fa2786 DIFF: https://github.com/llvm/llvm-project/commit/595b96140013cff96b0b92b148857e3e72fa2786.diff LOG: [CUDA] Use monotonic ordering for __nvvm_atom* builtins (#185822) CUDA's __nvvm_atom* builtins are expected to produce atomic operations with relaxed ordering. However, Clang lowered them as atomicrmw and cmpxchg with the default seq_cst ordering. That mismatch went unnoticed because until recently NVPTX back end was unable to lower all atomic instructions correctly, and despite using `seq_cst` ordering in IR we ended up generating the intended PTX instructions with relaxed ordering. It worked well enough until https://github.com/llvm/llvm-project/pull/179553 implemented correct NVPTX atomic lowering. That, in turn, caused a severe performance regression for the code that relied on these builtins. Thanks to @akshayrdeodhar for figuring out what happened. Switching __nvvm_atom* builtins to generate atomic instructions with monotonic ordering matches the expected semantics of the builtins, and restores performance of the generated code. See: https://github.com/llvm/llvm-project/pull/179553#issuecomment-4035193968 Added: Modified: clang/lib/CodeGen/CGBuiltin.cpp clang/lib/CodeGen/CGBuiltin.h clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp clang/test/CodeGen/builtins-nvptx-ptx50.cu clang/test/CodeGen/builtins-nvptx.c Removed: ################################################################################ diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 6fb43d5cb0fbf..df03e84ce9f81 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -390,7 +390,9 @@ static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF, /// Note: In order to lower Microsoft's _InterlockedCompareExchange* intrinsics /// invoke the function EmitAtomicCmpXchgForMSIntrin. 
Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E, - bool ReturnBool) { + bool ReturnBool, + llvm::AtomicOrdering SuccessOrdering, + llvm::AtomicOrdering FailureOrdering) { QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType(); Address DestAddr = CheckAtomicAlignment(CGF, E); @@ -403,8 +405,7 @@ Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E, Value *New = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType); Value *Pair = CGF.Builder.CreateAtomicCmpXchg( - DestAddr, Cmp, New, llvm::AtomicOrdering::SequentiallyConsistent, - llvm::AtomicOrdering::SequentiallyConsistent); + DestAddr, Cmp, New, SuccessOrdering, FailureOrdering); if (ReturnBool) // Extract boolean success flag and zext it to int. return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1), @@ -5079,14 +5080,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__sync_val_compare_and_swap_4: case Builtin::BI__sync_val_compare_and_swap_8: case Builtin::BI__sync_val_compare_and_swap_16: - return RValue::get(MakeAtomicCmpXchgValue(*this, E, false)); + return RValue::get(MakeAtomicCmpXchgValue( + *this, E, false, AtomicOrdering::SequentiallyConsistent, + AtomicOrdering::SequentiallyConsistent)); case Builtin::BI__sync_bool_compare_and_swap_1: case Builtin::BI__sync_bool_compare_and_swap_2: case Builtin::BI__sync_bool_compare_and_swap_4: case Builtin::BI__sync_bool_compare_and_swap_8: case Builtin::BI__sync_bool_compare_and_swap_16: - return RValue::get(MakeAtomicCmpXchgValue(*this, E, true)); + return RValue::get(MakeAtomicCmpXchgValue( + *this, E, true, AtomicOrdering::SequentiallyConsistent, + AtomicOrdering::SequentiallyConsistent)); case Builtin::BI__sync_swap_1: case Builtin::BI__sync_swap_2: diff --git a/clang/lib/CodeGen/CGBuiltin.h b/clang/lib/CodeGen/CGBuiltin.h index 667bce845f5c0..df71e46629884 100644 --- a/clang/lib/CodeGen/CGBuiltin.h +++ b/clang/lib/CodeGen/CGBuiltin.h @@ 
-99,7 +99,8 @@ llvm::Value *EmitOverflowIntrinsic(clang::CodeGen::CodeGenFunction &CGF, llvm::Value *&Carry); llvm::Value *MakeAtomicCmpXchgValue(clang::CodeGen::CodeGenFunction &CGF, - const clang::CallExpr *E, - bool ReturnBool); + const clang::CallExpr *E, bool ReturnBool, + llvm::AtomicOrdering SuccessOrdering, + llvm::AtomicOrdering FailureOrdering); #endif diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp index b4f7342e23473..423a7a3097119 100644 --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp @@ -431,52 +431,62 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_atom_add_gen_i: case NVPTX::BI__nvvm_atom_add_gen_l: case NVPTX::BI__nvvm_atom_add_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_sub_gen_i: case NVPTX::BI__nvvm_atom_sub_gen_l: case NVPTX::BI__nvvm_atom_sub_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_and_gen_i: case NVPTX::BI__nvvm_atom_and_gen_l: case NVPTX::BI__nvvm_atom_and_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_or_gen_i: case NVPTX::BI__nvvm_atom_or_gen_l: case NVPTX::BI__nvvm_atom_or_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_xor_gen_i: case NVPTX::BI__nvvm_atom_xor_gen_l: case NVPTX::BI__nvvm_atom_xor_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E); + return 
MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_xchg_gen_i: case NVPTX::BI__nvvm_atom_xchg_gen_l: case NVPTX::BI__nvvm_atom_xchg_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_max_gen_i: case NVPTX::BI__nvvm_atom_max_gen_l: case NVPTX::BI__nvvm_atom_max_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_max_gen_ui: case NVPTX::BI__nvvm_atom_max_gen_ul: case NVPTX::BI__nvvm_atom_max_gen_ull: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_min_gen_i: case NVPTX::BI__nvvm_atom_min_gen_l: case NVPTX::BI__nvvm_atom_min_gen_ll: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_min_gen_ui: case NVPTX::BI__nvvm_atom_min_gen_ul: case NVPTX::BI__nvvm_atom_min_gen_ull: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_cas_gen_us: case NVPTX::BI__nvvm_atom_cas_gen_i: @@ -484,7 +494,9 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_atom_cas_gen_ll: // __nvvm_atom_cas_gen_* should return the old value rather than the // success flag. 
- return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false); + return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false, + AtomicOrdering::Monotonic, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_add_gen_f: case NVPTX::BI__nvvm_atom_add_gen_d: { @@ -492,14 +504,16 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, Value *Val = EmitScalarExpr(E->getArg(1)); return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val, - AtomicOrdering::SequentiallyConsistent); + AtomicOrdering::Monotonic); } case NVPTX::BI__nvvm_atom_inc_gen_ui: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_atom_dec_gen_ui: - return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E); + return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E, + AtomicOrdering::Monotonic); case NVPTX::BI__nvvm_ldg_c: case NVPTX::BI__nvvm_ldg_sc: diff --git a/clang/test/CodeGen/builtins-nvptx-ptx50.cu b/clang/test/CodeGen/builtins-nvptx-ptx50.cu index a2d527537aed0..2a141baf3a6d0 100644 --- a/clang/test/CodeGen/builtins-nvptx-ptx50.cu +++ b/clang/test/CodeGen/builtins-nvptx-ptx50.cu @@ -17,7 +17,7 @@ // CHECK-LABEL: test_fn __device__ void test_fn(double d, double* double_ptr) { - // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw fadd ptr {{.*}} monotonic, align 8 // expected-error@+1 {{'__nvvm_atom_add_gen_d' needs target feature sm_60}} __nvvm_atom_add_gen_d(double_ptr, d); } diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 2e1acc0aac259..f1b41ba557426 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -309,91 +309,91 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, unsigned short *usp, unsigned short us, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long 
l, long long *llp, long long ll) { - // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw add ptr {{.*}} monotonic, align 4 __nvvm_atom_add_gen_i(ip, i); - // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw add ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_add_gen_l(&dl, l); - // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw add ptr {{.*}} monotonic, align 8 __nvvm_atom_add_gen_ll(&sll, ll); - // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw sub ptr {{.*}} monotonic, align 4 __nvvm_atom_sub_gen_i(ip, i); - // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw sub ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_sub_gen_l(&dl, l); - // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw sub ptr {{.*}} monotonic, align 8 __nvvm_atom_sub_gen_ll(&sll, ll); - // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw and ptr {{.*}} monotonic, align 4 __nvvm_atom_and_gen_i(ip, i); - // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw and ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_and_gen_l(&dl, l); - // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw and ptr {{.*}} monotonic, align 8 __nvvm_atom_and_gen_ll(&sll, ll); - // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw or ptr {{.*}} monotonic, align 4 __nvvm_atom_or_gen_i(ip, i); - // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw or ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_or_gen_l(&dl, l); - // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw or ptr {{.*}} monotonic, align 8 __nvvm_atom_or_gen_ll(&sll, ll); - // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw xor ptr {{.*}} monotonic, align 4 __nvvm_atom_xor_gen_i(ip, i); - // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw xor 
ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_xor_gen_l(&dl, l); - // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw xor ptr {{.*}} monotonic, align 8 __nvvm_atom_xor_gen_ll(&sll, ll); - // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw xchg ptr {{.*}} monotonic, align 4 __nvvm_atom_xchg_gen_i(ip, i); - // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw xchg ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_xchg_gen_l(&dl, l); - // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw xchg ptr {{.*}} monotonic, align 8 __nvvm_atom_xchg_gen_ll(&sll, ll); - // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw max ptr {{.*}} monotonic, align 4 __nvvm_atom_max_gen_i(ip, i); - // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw umax ptr {{.*}} monotonic, align 4 __nvvm_atom_max_gen_ui((unsigned int *)ip, i); - // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw max ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_max_gen_l(&dl, l); - // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw umax ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_max_gen_ul((unsigned long *)&dl, l); - // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw max ptr {{.*}} monotonic, align 8 __nvvm_atom_max_gen_ll(&sll, ll); - // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw umax ptr {{.*}} monotonic, align 8 __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll); - // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw min ptr {{.*}} monotonic, align 4 __nvvm_atom_min_gen_i(ip, i); - // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw umin ptr {{.*}} monotonic, align 4 __nvvm_atom_min_gen_ui((unsigned int *)ip, i); - // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw min ptr {{.*}} 
monotonic, align {{4|8}} __nvvm_atom_min_gen_l(&dl, l); - // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}} + // CHECK: atomicrmw umin ptr {{.*}} monotonic, align {{4|8}} __nvvm_atom_min_gen_ul((unsigned long *)&dl, l); - // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw min ptr {{.*}} monotonic, align 8 __nvvm_atom_min_gen_ll(&sll, ll); - // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8 + // CHECK: atomicrmw umin ptr {{.*}} monotonic, align 8 __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll); - // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4 + // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align 4 // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0 __nvvm_atom_cas_gen_i(ip, 0, i); - // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}} + // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align {{4|8}} // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0 __nvvm_atom_cas_gen_l(&dl, 0, l); - // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8 + // CHECK: cmpxchg ptr {{.*}} monotonic monotonic, align 8 // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0 __nvvm_atom_cas_gen_ll(&sll, 0, ll); - // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw fadd ptr {{.*}} monotonic, align 4 __nvvm_atom_add_gen_f(fp, f); - // CHECK: atomicrmw uinc_wrap ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw uinc_wrap ptr {{.*}} monotonic, align 4 __nvvm_atom_inc_gen_ui(uip, ui); - // CHECK: atomicrmw udec_wrap ptr {{.*}} seq_cst, align 4 + // CHECK: atomicrmw udec_wrap ptr {{.*}} monotonic, align 4 __nvvm_atom_dec_gen_ui(uip, ui); @@ -646,7 +646,7 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df, #endif #if __CUDA_ARCH__ >= 700 - // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2 + // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} monotonic monotonic, align 2 // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0 __nvvm_atom_cas_gen_us(usp, 0, us); // 
CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
