Re: [Mesa-dev] [PATCH] ac, ac/nir: use a better sync scope for shared atomics

2019-04-26 Thread Samuel Pitoiset
Thanks for your investigations Rhys. As discussed over IRC, this makes 
sense to me.


Reviewed-by: Samuel Pitoiset 

On 4/26/19 3:48 PM, Rhys Perry wrote:

https://reviews.llvm.org/rL356946 (present in LLVM 9 and later) changed
the meaning of the "system" sync scope, making it no longer restricted to
the memory operation's address space. So a single address space sync scope
is needed for shared atomic operations (such as "system-one-as" or
"workgroup-one-as") otherwise buffer_wbinvl1 and s_waitcnt instructions
can be created at each shared atomic operation.

This mostly reimplements LLVMBuildAtomicRMW and LLVMBuildAtomicCmpXchg
to allow for more sync scopes and uses the new functions in ac->nir with
the "workgroup-one-as" or "workgroup" sync scopes.

   F1 2017 (4K, Ultra High settings, TAA), avg FPS : 59 -> 59.67 (+1.14%)
  Strange Brigade (4K, ~highest settings), avg FPS : 51.5 -> 51.6 (+0.19%)
RotTR/mountain (4K, VeryHigh settings, FXAA), avg FPS : 57.2 -> 57.2 (+0.0%)
 RotTR/tomb (4K, VeryHigh settings, FXAA), avg FPS : 42.5 -> 43.0 (+1.17%)
   RotTR/valley (4K, VeryHigh settings, FXAA), avg FPS : 40.7 -> 41.6 (+2.21%)
  Warhammer II/fallen, avg FPS : 31.63 -> 31.83 (+0.63%)
  Warhammer II/skaven, avg FPS : 37.77 -> 38.07 (+0.79%)

Signed-off-by: Rhys Perry 
---
  src/amd/common/ac_llvm_build.h| 10 +-
  src/amd/common/ac_llvm_helper.cpp | 59 +++
  src/amd/common/ac_nir_to_llvm.c   | 12 +++
  3 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index f4cee667153..98f856106d6 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -26,7 +26,7 @@
  #define AC_LLVM_BUILD_H
  
  #include 

-#include 
+#include 
  #include "compiler/nir/nir.h"
  #include "amd_family.h"
  
@@ -694,6 +694,14 @@ ac_build_ddxy_interp(struct ac_llvm_context *ctx, LLVMValueRef interp_ij);

  LLVMValueRef
  ac_build_load_helper_invocation(struct ac_llvm_context *ctx);
  
+LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op,

+LLVMValueRef ptr, LLVMValueRef val,
+const char *sync_scope);
+
+LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, 
LLVMValueRef ptr,
+ LLVMValueRef cmp, LLVMValueRef val,
+ const char *sync_scope);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/src/amd/common/ac_llvm_helper.cpp 
b/src/amd/common/ac_llvm_helper.cpp
index dcfb8008546..e5030c6f472 100644
--- a/src/amd/common/ac_llvm_helper.cpp
+++ b/src/amd/common/ac_llvm_helper.cpp
@@ -31,6 +31,7 @@
  
  #include "ac_binary.h"

  #include "ac_llvm_util.h"
+#include "ac_llvm_build.h"
  
  #include 

  #include 
@@ -167,3 +168,61 @@ void ac_enable_global_isel(LLVMTargetMachineRef tm)
  {
reinterpret_cast(tm)->setGlobalISel(true);
  }
+
+LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, 
LLVMAtomicRMWBinOp op,
+LLVMValueRef ptr, LLVMValueRef val,
+const char *sync_scope) {
+   llvm::AtomicRMWInst::BinOp binop;
+   switch (op) {
+   case LLVMAtomicRMWBinOpXchg:
+   binop = llvm::AtomicRMWInst::Xchg;
+   break;
+   case LLVMAtomicRMWBinOpAdd:
+   binop = llvm::AtomicRMWInst::Add;
+   break;
+   case LLVMAtomicRMWBinOpSub:
+   binop = llvm::AtomicRMWInst::Sub;
+   break;
+   case LLVMAtomicRMWBinOpAnd:
+   binop = llvm::AtomicRMWInst::And;
+   break;
+   case LLVMAtomicRMWBinOpNand:
+   binop = llvm::AtomicRMWInst::Nand;
+   break;
+   case LLVMAtomicRMWBinOpOr:
+   binop = llvm::AtomicRMWInst::Or;
+   break;
+   case LLVMAtomicRMWBinOpXor:
+   binop = llvm::AtomicRMWInst::Xor;
+   break;
+   case LLVMAtomicRMWBinOpMax:
+   binop = llvm::AtomicRMWInst::Max;
+   break;
+   case LLVMAtomicRMWBinOpMin:
+   binop = llvm::AtomicRMWInst::Min;
+   break;
+   case LLVMAtomicRMWBinOpUMax:
+   binop = llvm::AtomicRMWInst::UMax;
+   break;
+   case LLVMAtomicRMWBinOpUMin:
+   binop = llvm::AtomicRMWInst::UMin;
+   break;
+   default:
+   unreachable(!"invalid LLVMAtomicRMWBinOp");
+  break;
+   }
+   unsigned SSID = 
llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
+   return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW(
+   binop, llvm::unwrap(ptr), llvm::unwrap(val),
+   llvm::AtomicOrdering::SequentiallyConsistent, SSID));
+}
+
+LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, 
LLVMValueRef ptr,
+ 

[Mesa-dev] [PATCH] ac, ac/nir: use a better sync scope for shared atomics

2019-04-26 Thread Rhys Perry
https://reviews.llvm.org/rL356946 (present in LLVM 9 and later) changed
the meaning of the "system" sync scope, making it no longer restricted to
the memory operation's address space. So a single address space sync scope
is needed for shared atomic operations (such as "system-one-as" or
"workgroup-one-as") otherwise buffer_wbinvl1 and s_waitcnt instructions
can be created at each shared atomic operation.

This mostly reimplements LLVMBuildAtomicRMW and LLVMBuildAtomicCmpXchg
to allow for more sync scopes and uses the new functions in ac->nir with
the "workgroup-one-as" or "workgroup" sync scopes.

  F1 2017 (4K, Ultra High settings, TAA), avg FPS : 59 -> 59.67 (+1.14%)
 Strange Brigade (4K, ~highest settings), avg FPS : 51.5 -> 51.6 (+0.19%)
RotTR/mountain (4K, VeryHigh settings, FXAA), avg FPS : 57.2 -> 57.2 (+0.0%)
RotTR/tomb (4K, VeryHigh settings, FXAA), avg FPS : 42.5 -> 43.0 (+1.17%)
  RotTR/valley (4K, VeryHigh settings, FXAA), avg FPS : 40.7 -> 41.6 (+2.21%)
 Warhammer II/fallen, avg FPS : 31.63 -> 31.83 (+0.63%)
 Warhammer II/skaven, avg FPS : 37.77 -> 38.07 (+0.79%)

Signed-off-by: Rhys Perry 
---
 src/amd/common/ac_llvm_build.h| 10 +-
 src/amd/common/ac_llvm_helper.cpp | 59 +++
 src/amd/common/ac_nir_to_llvm.c   | 12 +++
 3 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index f4cee667153..98f856106d6 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -26,7 +26,7 @@
 #define AC_LLVM_BUILD_H
 
 #include 
-#include 
+#include 
 #include "compiler/nir/nir.h"
 #include "amd_family.h"
 
@@ -694,6 +694,14 @@ ac_build_ddxy_interp(struct ac_llvm_context *ctx, 
LLVMValueRef interp_ij);
 LLVMValueRef
 ac_build_load_helper_invocation(struct ac_llvm_context *ctx);
 
+LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, 
LLVMAtomicRMWBinOp op,
+LLVMValueRef ptr, LLVMValueRef val,
+const char *sync_scope);
+
+LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, 
LLVMValueRef ptr,
+ LLVMValueRef cmp, LLVMValueRef val,
+ const char *sync_scope);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/ac_llvm_helper.cpp 
b/src/amd/common/ac_llvm_helper.cpp
index dcfb8008546..e5030c6f472 100644
--- a/src/amd/common/ac_llvm_helper.cpp
+++ b/src/amd/common/ac_llvm_helper.cpp
@@ -31,6 +31,7 @@
 
 #include "ac_binary.h"
 #include "ac_llvm_util.h"
+#include "ac_llvm_build.h"
 
 #include 
 #include 
@@ -167,3 +168,61 @@ void ac_enable_global_isel(LLVMTargetMachineRef tm)
 {
   reinterpret_cast(tm)->setGlobalISel(true);
 }
+
+LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, 
LLVMAtomicRMWBinOp op,
+LLVMValueRef ptr, LLVMValueRef val,
+const char *sync_scope) {
+   llvm::AtomicRMWInst::BinOp binop;
+   switch (op) {
+   case LLVMAtomicRMWBinOpXchg:
+   binop = llvm::AtomicRMWInst::Xchg;
+   break;
+   case LLVMAtomicRMWBinOpAdd:
+   binop = llvm::AtomicRMWInst::Add;
+   break;
+   case LLVMAtomicRMWBinOpSub:
+   binop = llvm::AtomicRMWInst::Sub;
+   break;
+   case LLVMAtomicRMWBinOpAnd:
+   binop = llvm::AtomicRMWInst::And;
+   break;
+   case LLVMAtomicRMWBinOpNand:
+   binop = llvm::AtomicRMWInst::Nand;
+   break;
+   case LLVMAtomicRMWBinOpOr:
+   binop = llvm::AtomicRMWInst::Or;
+   break;
+   case LLVMAtomicRMWBinOpXor:
+   binop = llvm::AtomicRMWInst::Xor;
+   break;
+   case LLVMAtomicRMWBinOpMax:
+   binop = llvm::AtomicRMWInst::Max;
+   break;
+   case LLVMAtomicRMWBinOpMin:
+   binop = llvm::AtomicRMWInst::Min;
+   break;
+   case LLVMAtomicRMWBinOpUMax:
+   binop = llvm::AtomicRMWInst::UMax;
+   break;
+   case LLVMAtomicRMWBinOpUMin:
+   binop = llvm::AtomicRMWInst::UMin;
+   break;
+   default:
+   unreachable(!"invalid LLVMAtomicRMWBinOp");
+  break;
+   }
+   unsigned SSID = 
llvm::unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
+   return llvm::wrap(llvm::unwrap(ctx->builder)->CreateAtomicRMW(
+   binop, llvm::unwrap(ptr), llvm::unwrap(val),
+   llvm::AtomicOrdering::SequentiallyConsistent, SSID));
+}
+
+LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, 
LLVMValueRef ptr,
+ LLVMValueRef cmp, LLVMValueRef val,
+ const char *sync_scope) {
+   unsigned SSID =