On 04.05.2018 20:32, Daniel Schürmann wrote:
This functionality is currently disabled due to a bug in LLVM.
The idea for this implementation is taken from the ROCm Device Libs:
https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/master/ockl/src/wfredscan.cl
---
  src/amd/common/ac_llvm_build.c | 98 ++++++++++++++++++++++++++++------
  1 file changed, 82 insertions(+), 16 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index c9b2e36b63..8300c9d6bd 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2967,26 +2967,66 @@ ac_build_alu_op(struct ac_llvm_context *ctx, 
LLVMValueRef lhs, LLVMValueRef rhs,
        }
  }
-/* TODO: add inclusive and excluse scan functions for SI chip class. */
  static LLVMValueRef
  ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, 
LLVMValueRef identity)
  {
        LLVMValueRef result, tmp;
        result = src;
-       tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
-       result = ac_build_alu_op(ctx, result, tmp, op);
-       tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
-       result = ac_build_alu_op(ctx, result, tmp, op);
-       tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
-       result = ac_build_alu_op(ctx, result, tmp, op);
-       tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, 
false);
-       result = ac_build_alu_op(ctx, result, tmp, op);
-       tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, 
false);
-       result = ac_build_alu_op(ctx, result, tmp, op);
-       tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, 
false);
-       result = ac_build_alu_op(ctx, result, tmp, op);
-       tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, 
false);
-       result = ac_build_alu_op(ctx, result, tmp, op);
+       if (ctx->chip_class >= VI) {
+               tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, 
false);
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, 
false);
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, 
false);
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 
0xe, false);
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 
0xc, false);
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 
0xf, false);
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 
0xf, false);
+               result = ac_build_alu_op(ctx, result, tmp, op);
+       } else {
+               LLVMValueRef tid = ac_get_thread_id(ctx);
+               LLVMValueRef active;
+               tmp = ac_build_ds_swizzle(ctx, src, ds_pattern_bitmode(0x1e, 
0x00, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                      LLVMBuildAnd(ctx->builder, tid, ctx->i32_1, 
""),
+                                      ctx->i32_0, "");
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1c, 
0x01, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 2, 0), ""),
+                                      ctx->i32_0, "");
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x18, 
0x03, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 4, 0), ""),
+                                      ctx->i32_0, "");
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x10, 
0x07, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 8, 0), ""),
+                                      ctx->i32_0, "");
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x00, 
0x0f, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 16, 0), ""),
+                                      ctx->i32_0, "");
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+               result = ac_build_alu_op(ctx, result, tmp, op);
+               tmp = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 31, 
0));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntNE,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 32, 0), ""),
+                                      ctx->i32_0, "");
+               tmp = LLVMBuildSelect(ctx->builder, active, tmp, identity, "");
+               result = ac_build_alu_op(ctx, result, tmp, op);
+       }
        return result;
  }
@@ -3015,7 +3055,33 @@ ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op
        result = LLVMBuildBitCast(ctx->builder,
                                                                
ac_build_set_inactive(ctx, src, identity),
                                                                LLVMTypeOf(identity), 
"");
-       result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, 
false);
+       if (ctx->chip_class >= VI) {
+               result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 
0xf, false);
+       } else {
+               LLVMValueRef active, tmp1, tmp2;
+               LLVMValueRef tid = ac_get_thread_id(ctx);
+               tmp1 = ac_build_ds_swizzle(ctx, result, (1 << 15) | 
dpp_quad_perm(0, 0, 1, 2));
+               tmp2 = ac_build_ds_swizzle(ctx, result, 
ds_pattern_bitmode(0x18, 0x03, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 0x7, 0), ""),
+                                      LLVMConstInt(ctx->i32, 0x4, 0), "");
+               tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+               tmp2 = ac_build_ds_swizzle(ctx, result, 
ds_pattern_bitmode(0x10, 0x07, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 0xf, 0), ""),
+                                      LLVMConstInt(ctx->i32, 0x8, 0), "");
+               tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+               tmp2 = ac_build_ds_swizzle(ctx, result, 
ds_pattern_bitmode(0x00, 0x0f, 0x00));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
+                                      LLVMBuildAnd(ctx->builder, tid, 
LLVMConstInt(ctx->i32, 0x1f, 0), ""),
+                                      LLVMConstInt(ctx->i32, 0x10, 0), "");
+               tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+               tmp2 = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 
31, 0));
+               active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, 
LLVMConstInt(ctx->i32, 32, 0), "");
+               tmp1 = LLVMBuildSelect(ctx->builder, active, tmp2, tmp1, "");
+               active = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tid, 
LLVMConstInt(ctx->i32, 0, 0), "");
+               result = LLVMBuildSelect(ctx->builder, active, identity, tmp1, 
"");
+       }

This is one of those times when being inflexible in how we use LDS hurts us. It would be much faster to just write everything to LDS and read back from shifted offsets.

Anyway, it does look correct, so:

Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>


        result = ac_build_scan(ctx, op, result, identity);
return ac_build_wwm(ctx, result);



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to