Re: [Mesa-dev] [PATCH] ac: split 16-bit ssbo loads that may not be dword aligned

2018-12-13 Thread Samuel Pitoiset
This refactoring is really not easy to read as is. Can you please split 
this in different parts? Maybe refactor first, then fix 16-bit ssbo loads?


If we want this to be backported, we will just need to squash the 
different parts and send a separate patch to mesa-stable.


On 12/6/18 12:13 AM, Rhys Perry wrote:

This ends up refactoring visit_load_buffer() a little.

Fixes: 7e7ee826982 ('ac: add support for 16bit buffer loads')
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108114
Signed-off-by: Rhys Perry 
---
Unfortunately I was not able to test this patch on a Polaris due to hardware
issues. It fixed the deqp-vk tests without regressions on Vega though.

  src/amd/common/ac_llvm_build.c  |  8 ++--
  src/amd/common/ac_nir_to_llvm.c | 80 -
  src/compiler/nir/nir.h  |  2 +-
  3 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index abc18da13d..154cc696a2 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2943,9 +2943,11 @@ LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, 
LLVMValueRef value,
if (count == num_components)
return value;
  
-	LLVMValueRef masks[] = {

-   ctx->i32_0, ctx->i32_1,
-   LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+   LLVMValueRef masks[MAX2(count, 2)];
+   masks[0] = ctx->i32_0;
+   masks[1] = ctx->i32_1;
+   for (unsigned i = 2; i < count; i++)
+   masks[i] = LLVMConstInt(ctx->i32, i, false);
  
  	if (count == 1)

return LLVMBuildExtractElement(ctx->builder, value, masks[0],
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index a109f5a815..4a4c09cf5f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1623,37 +1623,45 @@ static LLVMValueRef visit_atomic_ssbo(struct 
ac_nir_context *ctx,
  static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
const nir_intrinsic_instr *instr)
  {
-   LLVMValueRef results[2];
-   int load_bytes;
int elem_size_bytes = instr->dest.ssa.bit_size / 8;
int num_components = instr->num_components;
-   int num_bytes = num_components * elem_size_bytes;
enum gl_access_qualifier access = nir_intrinsic_access(instr);
LLVMValueRef glc = ctx->ac.i1false;
  
  	if (access & (ACCESS_VOLATILE | ACCESS_COHERENT))

glc = ctx->ac.i1true;
  
-	for (int i = 0; i < num_bytes; i += load_bytes) {

-   load_bytes = MIN2(num_bytes - i, 16);
-   const char *load_name;
-   LLVMTypeRef data_type;
-   LLVMValueRef offset = get_src(ctx, instr->src[1]);
-   LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i, false);
-   LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
-   get_src(ctx, 
instr->src[0]), false);
-   LLVMValueRef vindex = ctx->ac.i32_0;
+   LLVMValueRef offset = get_src(ctx, instr->src[1]);
+   LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
+   get_src(ctx, instr->src[0]), 
false);
+   LLVMValueRef vindex = ctx->ac.i32_0;
+
+   LLVMTypeRef def_type = get_def_type(ctx, >dest.ssa);
+   LLVMTypeRef def_elem_type = num_components > 1 ? 
LLVMGetElementType(def_type) : def_type;
  
-		int idx = i ? 1 : 0;

+   LLVMValueRef results[4];
+   for (int i = 0; i < num_components;) {
+   int num_elems = num_components - i;
+   if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0)
+   num_elems = 1;
+   if (num_elems * elem_size_bytes > 16)
+   num_elems = 16 / elem_size_bytes;
+   int load_bytes = num_elems * elem_size_bytes;
+
+   LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * 
elem_size_bytes, false);
+
+   LLVMValueRef ret;
if (load_bytes == 2) {
-   results[idx] = ac_build_tbuffer_load_short(>ac,
-  rsrc,
-  vindex,
-  offset,
-  
ctx->ac.i32_0,
-  immoffset,
-  glc);
+   ret = ac_build_tbuffer_load_short(>ac,
+ rsrc,
+ vindex,
+ offset,
+

[Mesa-dev] [PATCH] ac: split 16-bit ssbo loads that may not be dword aligned

2018-12-05 Thread Rhys Perry
This ends up refactoring visit_load_buffer() a little.

Fixes: 7e7ee826982 ('ac: add support for 16bit buffer loads')
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108114
Signed-off-by: Rhys Perry 
---
Unfortunately I was not able to test this patch on a Polaris due to hardware
issues. It fixed the deqp-vk tests without regressions on Vega though.

 src/amd/common/ac_llvm_build.c  |  8 ++--
 src/amd/common/ac_nir_to_llvm.c | 80 -
 src/compiler/nir/nir.h  |  2 +-
 3 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index abc18da13d..154cc696a2 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2943,9 +2943,11 @@ LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, 
LLVMValueRef value,
if (count == num_components)
return value;
 
-   LLVMValueRef masks[] = {
-   ctx->i32_0, ctx->i32_1,
-   LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
+   LLVMValueRef masks[MAX2(count, 2)];
+   masks[0] = ctx->i32_0;
+   masks[1] = ctx->i32_1;
+   for (unsigned i = 2; i < count; i++)
+   masks[i] = LLVMConstInt(ctx->i32, i, false);
 
if (count == 1)
return LLVMBuildExtractElement(ctx->builder, value, masks[0],
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index a109f5a815..4a4c09cf5f 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1623,37 +1623,45 @@ static LLVMValueRef visit_atomic_ssbo(struct 
ac_nir_context *ctx,
 static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
   const nir_intrinsic_instr *instr)
 {
-   LLVMValueRef results[2];
-   int load_bytes;
int elem_size_bytes = instr->dest.ssa.bit_size / 8;
int num_components = instr->num_components;
-   int num_bytes = num_components * elem_size_bytes;
enum gl_access_qualifier access = nir_intrinsic_access(instr);
LLVMValueRef glc = ctx->ac.i1false;
 
if (access & (ACCESS_VOLATILE | ACCESS_COHERENT))
glc = ctx->ac.i1true;
 
-   for (int i = 0; i < num_bytes; i += load_bytes) {
-   load_bytes = MIN2(num_bytes - i, 16);
-   const char *load_name;
-   LLVMTypeRef data_type;
-   LLVMValueRef offset = get_src(ctx, instr->src[1]);
-   LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i, false);
-   LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
-   get_src(ctx, 
instr->src[0]), false);
-   LLVMValueRef vindex = ctx->ac.i32_0;
+   LLVMValueRef offset = get_src(ctx, instr->src[1]);
+   LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
+   get_src(ctx, instr->src[0]), 
false);
+   LLVMValueRef vindex = ctx->ac.i32_0;
+
+   LLVMTypeRef def_type = get_def_type(ctx, >dest.ssa);
+   LLVMTypeRef def_elem_type = num_components > 1 ? 
LLVMGetElementType(def_type) : def_type;
 
-   int idx = i ? 1 : 0;
+   LLVMValueRef results[4];
+   for (int i = 0; i < num_components;) {
+   int num_elems = num_components - i;
+   if (elem_size_bytes < 4 && nir_intrinsic_align(instr) % 4 != 0)
+   num_elems = 1;
+   if (num_elems * elem_size_bytes > 16)
+   num_elems = 16 / elem_size_bytes;
+   int load_bytes = num_elems * elem_size_bytes;
+
+   LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i * 
elem_size_bytes, false);
+
+   LLVMValueRef ret;
if (load_bytes == 2) {
-   results[idx] = ac_build_tbuffer_load_short(>ac,
-  rsrc,
-  vindex,
-  offset,
-  
ctx->ac.i32_0,
-  immoffset,
-  glc);
+   ret = ac_build_tbuffer_load_short(>ac,
+ rsrc,
+ vindex,
+ offset,
+ ctx->ac.i32_0,
+ immoffset,
+ glc);
} else {
+   const char *load_name;
+   LLVMTypeRef data_type;
switch