Re: [Mesa-dev] [PATCH] ac, ac/nir: use a better sync scope for shared atomics
Thanks for your investigations Rhys. As discussed over IRC, this makes sense to me. Reviewed-by: Samuel Pitoiset On 4/26/19 3:48 PM, Rhys Perry wrote: https://reviews.llvm.org/rL356946 (present in LLVM 9 and later) changed the meaning of the "system" sync scope, making it no longer restricted to the memory operation's address space. So a single address space sync scope is needed for shared atomic operations (such as "system-one-as" or "workgroup-one-as") otherwise buffer_wbinvl1 and s_waitcnt instructions can be created at each shared atomic operation. This mostly reimplements LLVMBuildAtomicRMW and LLVMBuildAtomicCmpXchg to allow for more sync scopes and uses the new functions in ac->nir with the "workgroup-one-as" or "workgroup" sync scopes. F1 2017 (4K, Ultra High settings, TAA), avg FPS : 59 -> 59.67 (+1.14%) Strange Brigade (4K, ~highest settings), avg FPS : 51.5 -> 51.6 (+0.19%) RotTR/mountain (4K, VeryHigh settings, FXAA), avg FPS : 57.2 -> 57.2 (+0.0%) RotTR/tomb (4K, VeryHigh settings, FXAA), avg FPS : 42.5 -> 43.0 (+1.17%) RotTR/valley (4K, VeryHigh settings, FXAA), avg FPS : 40.7 -> 41.6 (+2.21%) Warhammer II/fallen, avg FPS : 31.63 -> 31.83 (+0.63%) Warhammer II/skaven, avg FPS : 37.77 -> 38.07 (+0.79%) Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.h| 10 +- src/amd/common/ac_llvm_helper.cpp | 59 +++ src/amd/common/ac_nir_to_llvm.c | 12 +++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index f4cee667153..98f856106d6 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -26,7 +26,7 @@ #define AC_LLVM_BUILD_H #include -#include +#include #include "compiler/nir/nir.h" #include "amd_family.h" @@ -694,6 +694,14 @@ ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx); +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, +LLVMValueRef ptr, LLVMValueRef val, +const char *sync_scope); + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope); + #ifdef __cplusplus } #endif diff --git a/src/amd/common/ac_llvm_helper.cpp b/src/amd/common/ac_llvm_helper.cpp index dcfb8008546..e5030c6f472 100644 --- a/src/amd/common/ac_llvm_helper.cpp +++ b/src/amd/common/ac_llvm_helper.cpp @@ -31,6 +31,7 @@ #include "ac_binary.h" #include "ac_llvm_util.h" +#include "ac_llvm_build.h" #include #include @@ -167,3 +168,61 @@ void ac_enable_global_isel(LLVMTargetMachineRef tm) { reinterpret_cast(tm)->setGlobalISel(true); } + +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, +LLVMValueRef ptr, LLVMValueRef val, +const char *sync_scope) { + llvm::AtomicRMWInst::BinOp binop; + switch (op) { + case LLVMAtomicRMWBinOpXchg: + binop = llvm::AtomicRMWInst::Xchg; + break; + case LLVMAtomicRMWBinOpAdd: + binop = llvm::AtomicRMWInst::Add; + break; + case LLVMAtomicRMWBinOpSub: + binop = llvm::AtomicRMWInst::Sub; + break; + case LLVMAtomicRMWBinOpAnd: + binop = llvm::AtomicRMWInst::And; + break; + case LLVMAtomicRMWBinOpNand: + binop = llvm::AtomicRMWInst::Nand; + break; + case LLVMAtomicRMWBinOpOr: + binop = llvm::AtomicRMWInst::Or; + break; + case LLVMAtomicRMWBinOpXor: + binop = llvm::AtomicRMWInst::Xor; + break; + case LLVMAtomicRMWBinOpMax: + binop = llvm::AtomicRMWInst::Max; + break; + case LLVMAtomicRMWBinOpMin: + binop = llvm::AtomicRMWInst::Min; + break; + case LLVMAtomicRMWBinOpUMax: + binop = llvm::AtomicRMWInst::UMax; + break; + case LLVMAtomicRMWBinOpUMin: + binop = llvm::AtomicRMWInst::UMin; + break; + default: + unreachable(!"invalid LLVMAtomicRMWBinOp"); + break; + } + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW( + binop, llvm::unwrap(ptr), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); +} + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, +
[Mesa-dev] [PATCH] ac, ac/nir: use a better sync scope for shared atomics
https://reviews.llvm.org/rL356946 (present in LLVM 9 and later) changed the meaning of the "system" sync scope, making it no longer restricted to the memory operation's address space. So a single address space sync scope is needed for shared atomic operations (such as "system-one-as" or "workgroup-one-as") otherwise buffer_wbinvl1 and s_waitcnt instructions can be created at each shared atomic operation. This mostly reimplements LLVMBuildAtomicRMW and LLVMBuildAtomicCmpXchg to allow for more sync scopes and uses the new functions in ac->nir with the "workgroup-one-as" or "workgroup" sync scopes. F1 2017 (4K, Ultra High settings, TAA), avg FPS : 59 -> 59.67 (+1.14%) Strange Brigade (4K, ~highest settings), avg FPS : 51.5 -> 51.6 (+0.19%) RotTR/mountain (4K, VeryHigh settings, FXAA), avg FPS : 57.2 -> 57.2 (+0.0%) RotTR/tomb (4K, VeryHigh settings, FXAA), avg FPS : 42.5 -> 43.0 (+1.17%) RotTR/valley (4K, VeryHigh settings, FXAA), avg FPS : 40.7 -> 41.6 (+2.21%) Warhammer II/fallen, avg FPS : 31.63 -> 31.83 (+0.63%) Warhammer II/skaven, avg FPS : 37.77 -> 38.07 (+0.79%) Signed-off-by: Rhys Perry --- src/amd/common/ac_llvm_build.h| 10 +- src/amd/common/ac_llvm_helper.cpp | 59 +++ src/amd/common/ac_nir_to_llvm.c | 12 +++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index f4cee667153..98f856106d6 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -26,7 +26,7 @@ #define AC_LLVM_BUILD_H #include -#include +#include #include "compiler/nir/nir.h" #include "amd_family.h" @@ -694,6 +694,14 @@ ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij); LLVMValueRef ac_build_load_helper_invocation(struct ac_llvm_context *ctx); +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, +LLVMValueRef ptr, LLVMValueRef val, +const char *sync_scope); + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope); + #ifdef __cplusplus } #endif diff --git a/src/amd/common/ac_llvm_helper.cpp b/src/amd/common/ac_llvm_helper.cpp index dcfb8008546..e5030c6f472 100644 --- a/src/amd/common/ac_llvm_helper.cpp +++ b/src/amd/common/ac_llvm_helper.cpp @@ -31,6 +31,7 @@ #include "ac_binary.h" #include "ac_llvm_util.h" +#include "ac_llvm_build.h" #include #include @@ -167,3 +168,61 @@ void ac_enable_global_isel(LLVMTargetMachineRef tm) { reinterpret_cast(tm)->setGlobalISel(true); } + +LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op, +LLVMValueRef ptr, LLVMValueRef val, +const char *sync_scope) { + llvm::AtomicRMWInst::BinOp binop; + switch (op) { + case LLVMAtomicRMWBinOpXchg: + binop = llvm::AtomicRMWInst::Xchg; + break; + case LLVMAtomicRMWBinOpAdd: + binop = llvm::AtomicRMWInst::Add; + break; + case LLVMAtomicRMWBinOpSub: + binop = llvm::AtomicRMWInst::Sub; + break; + case LLVMAtomicRMWBinOpAnd: + binop = llvm::AtomicRMWInst::And; + break; + case LLVMAtomicRMWBinOpNand: + binop = llvm::AtomicRMWInst::Nand; + break; + case LLVMAtomicRMWBinOpOr: + binop = llvm::AtomicRMWInst::Or; + break; + case LLVMAtomicRMWBinOpXor: + binop = llvm::AtomicRMWInst::Xor; + break; + case LLVMAtomicRMWBinOpMax: + binop = llvm::AtomicRMWInst::Max; + break; + case LLVMAtomicRMWBinOpMin: + binop = llvm::AtomicRMWInst::Min; + break; + case LLVMAtomicRMWBinOpUMax: + binop = llvm::AtomicRMWInst::UMax; + break; + case LLVMAtomicRMWBinOpUMin: + binop = llvm::AtomicRMWInst::UMin; + break; + default: + unreachable(!"invalid LLVMAtomicRMWBinOp"); + break; + } + unsigned SSID = llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope); + return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW( + binop, llvm::unwrap(ptr), llvm::unwrap(val), + llvm::AtomicOrdering::SequentiallyConsistent, SSID)); +} + +LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr, + LLVMValueRef cmp, LLVMValueRef val, + const char *sync_scope) { + unsigned SSID =