https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/179415
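In short: when neither vector nor LDS promotion can eliminate a private
alloca, the first patch below tags every load/store of that alloca with
!amdgpu.non.volatile. A minimal before/after sketch of that effect,
modeled on the memset_volatile_nopromote test in the patch (the kernel
name here is illustrative, not literal pass output); the volatile memset
blocks promotion, so the remaining plain store gets tagged:

  declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)

  define amdgpu_kernel void @sketch(i64 %val) {
  entry:
    %stack = alloca [4 x i64], align 4, addrspace(5)
    ; volatile memset: promotion of %stack fails
    call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 32, i1 true)
    ; before the pass: store i64 %val, ptr addrspace(5) %stack, align 8
    store i64 %val, ptr addrspace(5) %stack, align 8, !amdgpu.non.volatile !0
    ret void
  }

  !0 = !{}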
>From e96559501de30cbb359a92fe8c23804f80a9811a Mon Sep 17 00:00:00 2001
From: pvanhout <[email protected]>
Date: Mon, 2 Feb 2026 14:10:51 +0100
Subject: [PATCH 1/3] [AMDGPU][PromoteAlloca] Set !amdgpu.non.volatile if
 promotion fails

I thought about doing this in a separate pass, but this pass already has
all the necessary analysis for this to be a trivial addition. We can
simply set `!amdgpu.non.volatile` if all other attempts to promote the
alloca failed.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 31 ++++++++++++-
 .../CodeGen/AMDGPU/promote-alloca-memset.ll   |  4 +-
 .../promote-alloca-non-volatile-accesses.ll   | 45 +++++++++++++++++++
 .../AMDGPU/promote-alloca-vgpr-ratio.ll       | 41 +++++++++--------
 4 files changed, 99 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index d18d3a13b29ea..a04944cc5bd2e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -120,6 +120,11 @@ struct AllocaAnalysis {
   } LDS;
 
   explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+
+  void eraseAlloca() {
+    Alloca->eraseFromParent();
+    Alloca = nullptr;
+  }
 };
 
 // Shared implementation which can do both promotion to vector and to LDS.
@@ -152,6 +157,10 @@ class AMDGPUPromoteAllocaImpl {
   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                        Instruction *UseInst, int OpIdx0,
                                        int OpIdx1) const;
+  /// Set the amdgpu.non.volatile metadata on all load/store users of \p AA.
+  /// This assumes the pointer of the alloca never escapes, and thus the memory
+  /// is thread-local.
+  void setNonVolatileMetadata(AllocaAnalysis &AA);
 
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
@@ -443,6 +452,15 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     if (AA.LDS.Enable &&
         tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
       Changed = true;
+
+    // If we were unable to remove this alloca, mark all accesses to it as
+    // non-volatile instead. This pass rejects all allocas whose pointer
+    // escapes, so the memory of the alloca is known to never be written to
+    // outside this thread.
+    if (AA.Alloca) {
+      setNonVolatileMetadata(AA);
+      Changed = true;
+    }
   }
 
   finishDeferredAllocaToLDSPromotion(DeferredIntrs);
@@ -1196,7 +1214,7 @@ void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
 
   // Alloca should now be dead too.
   assert(AA.Alloca->use_empty());
-  AA.Alloca->eraseFromParent();
+  AA.eraseAlloca();
 }
 
 std::pair<Value *, Value *>
@@ -1468,6 +1486,15 @@ void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
   AA.LDS.Enable = true;
 }
 
+void AMDGPUPromoteAllocaImpl::setNonVolatileMetadata(AllocaAnalysis &AA) {
+  for (Use *U : AA.Uses) {
+    Instruction *I = cast<Instruction>(U->getUser());
+    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+      I->setMetadata("amdgpu.non.volatile", MDNode::get(I->getContext(), {}));
+    }
+  }
+}
+
 bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
 
   FunctionType *FTy = F.getFunctionType();
@@ -1665,7 +1692,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
   AA.Alloca->mutateType(Offset->getType());
   AA.Alloca->replaceAllUsesWith(Offset);
-  AA.Alloca->eraseFromParent();
+  AA.eraseAlloca();
 
   PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index 81d6dba494cfc..60526551b22fc 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -44,7 +44,7 @@ define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -59,7 +59,7 @@ define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll
new file mode 100644
index 0000000000000..b912e2199bac1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Verify that the !amdgpu.non.volatile metadata is set if promoting an alloca fails.
+
+define amdgpu_kernel void @test(i64 %val, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @test(
+; CHECK-SAME: i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[STACK_1:%.*]] = getelementptr inbounds i64, ptr addrspace(5) [[STACK]], i32 1
+; CHECK-NEXT:    store i64 43, ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP:.*]], label %[[END:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PSTACK:%.*]] = phi ptr addrspace(5) [ [[STACK]], %[[ENTRY]] ], [ [[STACK_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[PSTACK]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    store i64 32, ptr addrspace(5) [[STACK_1]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[LOAD]], 32
+; CHECK-NEXT:    br i1 [[LOOP_CC]], label %[[LOOP]], label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[RELOAD:%.*]] = load i64, ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    [[RELOAD_1:%.*]] = load i64, ptr addrspace(5) [[STACK_1]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.1 = getelementptr inbounds i64, ptr addrspace(5) %stack, i32 1
+  store i64 43, ptr addrspace(5) %stack
+  br i1 %cond, label %loop, label %end
+
+loop:
+  %pstack = phi ptr addrspace(5) [%stack, %entry], [%stack.1, %loop]
+  %load = load i64, ptr addrspace(5) %pstack
+  store i64 32, ptr addrspace(5) %stack.1
+  %loop.cc = icmp ne i64 %load, 32
+  br i1 %loop.cc, label %loop, label %end
+
+end:
+  %reload = load i64, ptr addrspace(5) %stack
+  %reload.1 = load i64, ptr addrspace(5) %stack.1
+  ret void
+}
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
index 4bcc46861d66b..74c4704d716b1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
@@ -16,10 +16,10 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
 ; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0:![0-9]+]]
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; DEFAULT-NEXT:    ret void
 ;
@@ -48,10 +48,10 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0:![0-9]+]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -112,10 +112,10 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -176,10 +176,10 @@ define amdgpu_kernel void @i32_16_elements(ptr %out) #0 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -214,10 +214,10 @@ define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
 ; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
 ; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; DEFAULT-NEXT:    ret void
 ;
@@ -246,10 +246,10 @@ define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -278,5 +278,10 @@ declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32,
 attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" }
 attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
 attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
+;.
+; DEFAULT: [[META0]] = !{}
+;.
+; RATIO8: [[META0]] = !{}
+;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; BASE: {{.*}}

>From 60b3edc77856784ceb921da919ade633e32d328a Mon Sep 17 00:00:00 2001
From: pvanhout <[email protected]>
Date: Tue, 3 Feb 2026 14:17:52 +0100
Subject: [PATCH 2/3] Pull metadata impl to the top of the patch stack

---
 llvm/docs/AMDGPUUsage.rst                     |  23 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +
 .../AMDGPU/memory-legalizer-non-volatile.ll   | 218 ++++++++++++++++++
 3 files changed, 243 insertions(+)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index cd5410a31b98f..d3717dcc00908 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1878,6 +1878,29 @@ and
 
    !0 = !{}
 
+.. _amdgpu_non_volatile:
+
+'``amdgpu.non.volatile``' Metadata
+----------------------------------
+
+Explicitly marks memory accesses (load, store, rmw) to locations that are never written to by other threads during the execution of the shader.
+
+This metadata is a performance optimization and can be dropped if necessary.
+Using this metadata on a memory access to a location that is written to by other threads is undefined behavior.
+
+Sets ``NV=1`` on the instruction if the target supports it.
+
+.. code-block:: llvm
+
+   %val = load i32, ptr %in, align 4, !amdgpu.non.volatile !{}
+
+
+.. note::
+
+  This metadata is used to request ``NV=1`` on an operation, but the compiler may also set ``NV=1``
+  on memory accesses that do not have the metadata when it is safe to do so. For example, it may
+  set it on accesses to constant memory, when loading from or storing to scratch memory used for
+  spills, etc.
 
 LLVM IR Attributes
 ==================
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b043d5354042d..3b17198593db9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19357,6 +19357,8 @@ SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
     Flags |= MONoClobber;
   if (I.getMetadata("amdgpu.last.use"))
     Flags |= MOLastUse;
+  if (I.getMetadata("amdgpu.non.volatile"))
+    Flags |= MONonVolatile;
   return Flags;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll
index 5017e572b70e5..c516292984001 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll
@@ -56,6 +56,35 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__flat_i32_nonatomic(ptr addrspace(0) %in, ptr addrspace(0) %out) {
+; GFX12-CU-LABEL: md_nv__flat_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[2:3], v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__flat_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] nv
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[2:3], v0 nv
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(0) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(0) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 define void @global_i32_nonatomic(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX12-CU-LABEL: global_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -107,6 +136,63 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__global_i32_nonatomic(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GFX12-CU-LABEL: md_nv__global_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__global_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v0, v[0:1], off nv
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off nv
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(1) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(1) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
+; DS does not have nv.
+define void @lds_i32_nonatomic(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+; GFX12-CU-LABEL: lds_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lds_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    ds_load_b32 v0, v0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v1, v0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(3) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(3) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 define i32 @scalar_i32_nonatomic(ptr addrspace(4) inreg %in) {
 ; GFX12-CU-LABEL: scalar_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -159,6 +245,32 @@ entry:
   ret i32 %val
 }
 
+define i32 @md_nv__scalar_i32_nonatomic(ptr addrspace(4) inreg %in) {
+; GFX12-CU-LABEL: md_nv__scalar_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__scalar_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[0:1], 0x0 nv
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(4) %in, !amdgpu.non.volatile !0
+  ret i32 %val
+}
+
 define void @scratch_i32_nonatomic(ptr addrspace(5) %in, ptr addrspace(5) %out) {
 ; GFX12-CU-LABEL: scratch_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -210,6 +322,33 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__scratch_i32_nonatomic(ptr addrspace(5) %in, ptr addrspace(5) %out) {
+; GFX12-CU-LABEL: md_nv__scratch_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__scratch_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    scratch_load_b32 v0, v0, off nv
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 v1, v0, off nv
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(5) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(5) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 define i32 @scalar32_i32_nonatomic(ptr addrspace(6) inreg %in) {
 ; GFX12-CU-LABEL: scalar32_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -266,6 +405,34 @@ entry:
   ret i32 %val
 }
 
+define i32 @md_nv__scalar32_i32_nonatomic(ptr addrspace(6) inreg %in) {
+; GFX12-CU-LABEL: md_nv__scalar32_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_mov_b32 s1, 0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__scalar32_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    s_load_b32 s0, s[0:1], 0x0 nv
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(6) %in, !amdgpu.non.volatile !0
+  ret i32 %val
+}
+
 define void @buffer_i32_nonatomic(ptr addrspace(7) inreg %in, ptr addrspace(7) inreg %out) {
 ; GFX12-CU-LABEL: buffer_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -343,4 +510,55 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__buffer_i32_nonatomic(ptr addrspace(7) inreg %in, ptr addrspace(7) inreg %out) {
+; GFX12-CU-LABEL: md_nv__buffer_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
+; GFX12-CU-NEXT:    s_mov_b32 s7, s20
+; GFX12-CU-NEXT:    s_mov_b32 s6, s19
+; GFX12-CU-NEXT:    s_mov_b32 s5, s18
+; GFX12-CU-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen
+; GFX12-CU-NEXT:    s_mov_b32 s4, s17
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-DAGISEL-LABEL: md_nv__buffer_i32_nonatomic:
+; GFX1250-DAGISEL:       ; %bb.0: ; %entry
+; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-DAGISEL-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s7, s20
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s6, s19
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s5, s18
+; GFX1250-DAGISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen nv
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s4, s17
+; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-DAGISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen nv
+; GFX1250-DAGISEL-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: md_nv__buffer_i32_nonatomic:
+; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
+; GFX1250-GISEL-NEXT:    s_mov_b32 s4, s17
+; GFX1250-GISEL-NEXT:    s_mov_b32 s5, s18
+; GFX1250-GISEL-NEXT:    s_mov_b32 s6, s19
+; GFX1250-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen nv
+; GFX1250-GISEL-NEXT:    s_mov_b32 s7, s20
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen nv
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(7) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(7) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 !0 = !{}

>From b92b4c2025c6c0900a6fc2a841a9e5e9adc6476e Mon Sep 17 00:00:00 2001
From: pvanhout <[email protected]>
Date: Tue, 3 Feb 2026 14:35:43 +0100
Subject: [PATCH 3/3] Rename to MOThreadPrivate

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3b17198593db9..45fbe17eb01dd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19358,7 +19358,7 @@ SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
   if (I.getMetadata("amdgpu.last.use"))
     Flags |= MOLastUse;
   if (I.getMetadata("amdgpu.non.volatile"))
-    Flags |= MONonVolatile;
+    Flags |= MOThreadPrivate;
   return Flags;
 }
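For IR producers that want to request this behavior directly, attaching
the metadata is a single setMetadata call. A minimal C++ sketch (the
helper name is ours; the setMetadata/MDNode::get usage mirrors
setNonVolatileMetadata in the first patch):

  #include <cassert>
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Metadata.h"

  using namespace llvm;

  // Tag a load/store as accessing memory that no other thread writes during
  // the dispatch. The empty MDNode is the canonical `!{}` payload, and the
  // kind string must match what SITargetLowering::getTargetMMOFlags reads.
  static void markAMDGPUNonVolatile(Instruction &I) {
    assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "expected a memory access");
    I.setMetadata("amdgpu.non.volatile", MDNode::get(I.getContext(), {}));
  }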
