Re: [Mesa-dev] [PATCH v3 5/7] ac: add LLVM build functions for subgroup instrinsics

2018-04-15 Thread Nicolai Hähnle

I'm a bit late, but just some stylistic nitpicks for the future:

As a general stylistic note, prefer `false` as the last argument to 
LLVMConstInt (it is a boolean that selects whether to sign-extend).


[snip]

+{
+   ac_build_optimization_barrier(ctx, &src);
+   LLVMValueRef result;
+   LLVMValueRef identity = get_reduction_identity(ctx, op,
+   
ac_get_type_size(LLVMTypeOf(src)));
+   result = LLVMBuildBitCast(ctx->builder,
+   
ac_build_set_inactive(ctx, src, identity),
+   LLVMTypeOf(identity), 
"");


Weird whitespace, are you using a non-default tab width?



+   result = ac_build_scan(ctx, op, result, identity);
+
+   return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef
+ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op 
op)
+{
+   ac_build_optimization_barrier(ctx, &src);
+   LLVMValueRef result;
+   LLVMValueRef identity = get_reduction_identity(ctx, op,
+   
ac_get_type_size(LLVMTypeOf(src)));
+   result = LLVMBuildBitCast(ctx->builder,
+   
ac_build_set_inactive(ctx, src, identity),
+   LLVMTypeOf(identity), 
"");
+   result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, 
false);
+   result = ac_build_scan(ctx, op, result, identity);
+
+   return ac_build_wwm(ctx, result);
+}
+
+LLVMValueRef
+ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, 
unsigned cluster_size)
+{
+   if (cluster_size == 1) return src;


Prefer to put the return on a new line, please.

Thanks,
Nicolai



+   ac_build_optimization_barrier(ctx, &src);
+   LLVMValueRef result, swap;
+   LLVMValueRef identity = get_reduction_identity(ctx, op,
+   
ac_get_type_size(LLVMTypeOf(src)));
+   result = LLVMBuildBitCast(ctx->builder,
+   
ac_build_set_inactive(ctx, src, identity),
+   LLVMTypeOf(identity), 
"");


Weird whitespace again.

With those style nitpicks fixed,

Reviewed-by: Nicolai Hähnle 



+   swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
+   result = ac_build_alu_op(ctx, result, swap, op);
+   if (cluster_size == 2) return ac_build_wwm(ctx, result);
+
+   swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
+   result = ac_build_alu_op(ctx, result, swap, op);
+   if (cluster_size == 4) return ac_build_wwm(ctx, result);
+
+   if (ctx->chip_class >= VI)
+   swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 
0xf, 0xf, false);
+   else
+   swap = ac_build_ds_swizzle(ctx, result, 
ds_pattern_bitmode(0x1f, 0, 0x04));
+   result = ac_build_alu_op(ctx, result, swap, op);
+   if (cluster_size == 8) return ac_build_wwm(ctx, result);
+
+   if (ctx->chip_class >= VI)
+   swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 
0xf, false);
+   else
+   swap = ac_build_ds_swizzle(ctx, result, 
ds_pattern_bitmode(0x1f, 0, 0x08));
+   result = ac_build_alu_op(ctx, result, swap, op);
+   if (cluster_size == 16) return ac_build_wwm(ctx, result);
+
+   if (ctx->chip_class >= VI && cluster_size != 32)
+   swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 
0xa, 0xf, false);
+   else
+   swap = ac_build_ds_swizzle(ctx, result, 
ds_pattern_bitmode(0x1f, 0, 0x10));
+   result = ac_build_alu_op(ctx, result, swap, op);
+   if (cluster_size == 32) return ac_build_wwm(ctx, result);
+
+   if (ctx->chip_class >= VI) {
+   swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 
0xc, 0xf, false);
+   result = ac_build_alu_op(ctx, result, swap, op);
+   result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 
63, 0));
+   return ac_build_wwm(ctx, result);
+   } else {
+   swap = ac_build_readlane(ctx, result, ctx->i32_0);
+   result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 
32, 0));
+   result = ac_build_alu_op(ctx, result, swap, op);
+   return ac_build_wwm(ctx, result);
+   }
+}
+
+LLVMValueRef
+ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
+   unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+   unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
+   if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0600) {
+   return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
+   } else {
+  

[Mesa-dev] [PATCH v3 5/7] ac: add LLVM build functions for subgroup instrinsics

2018-04-10 Thread Daniel Schürmann
Co-authored-by: Connor Abbott 
---
 src/amd/common/ac_llvm_build.c | 456 +
 src/amd/common/ac_llvm_build.h |  30 ++-
 2 files changed, 485 insertions(+), 1 deletion(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 2fb8aeaac6..9a00bb1114 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -2507,3 +2507,459 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context 
*ac, LLVMValueRef fmask,
addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
addr[sample_chan], "");
 }
+
+static LLVMValueRef
+_ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef 
lane)
+{
+   ac_build_optimization_barrier(ctx, &src);
+   return ac_build_intrinsic(ctx,
+   lane == NULL ? "llvm.amdgcn.readfirstlane" : 
"llvm.amdgcn.readlane",
+   LLVMTypeOf(src), (LLVMValueRef []) {
+   src, lane },
+   lane == NULL ? 1 : 2,
+   AC_FUNC_ATTR_READNONE |
+   AC_FUNC_ATTR_CONVERGENT);
+}
+
+/**
+ * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
+ * @param ctx
+ * @param src
+ * @param lane - id of the lane or NULL for the first active lane
+ * @return value of the lane
+ */
+LLVMValueRef
+ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef 
lane)
+{
+   LLVMTypeRef src_type = LLVMTypeOf(src);
+   src = ac_to_integer(ctx, src);
+   unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
+   LLVMValueRef ret;
+
+   if (bits == 32) {
+   ret = _ac_build_readlane(ctx, src, lane);
+   } else {
+   assert(bits % 32 == 0);
+   LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
+   LLVMValueRef src_vector =
+   LLVMBuildBitCast(ctx->builder, src, vec_type, "");
+   ret = LLVMGetUndef(vec_type);
+   for (unsigned i = 0; i < bits / 32; i++) {
+   src = LLVMBuildExtractElement(ctx->builder, src_vector,
+   LLVMConstInt(ctx->i32, i, 0), 
"");
+   LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, 
lane);
+   ret = LLVMBuildInsertElement(ctx->builder, ret, 
ret_comp,
+   LLVMConstInt(ctx->i32, i, 0), 
"");
+   }
+   }
+   return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
+}
+
+LLVMValueRef
+ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef 
value, LLVMValueRef lane)
+{
+   /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
+*/
+   LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
+ ac_get_thread_id(ctx), "");
+   return LLVMBuildSelect(ctx->builder, pred, value, src, "");
+}
+
+LLVMValueRef
+ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
+{
+   LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
+LLVMVectorType(ctx->i32, 2),
+"");
+   LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
+  ctx->i32_0, "");
+   LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
+  ctx->i32_1, "");
+   LLVMValueRef val =
+   ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
+  (LLVMValueRef []) { mask_lo, ctx->i32_0 },
+  2, AC_FUNC_ATTR_READNONE);
+   val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
+(LLVMValueRef []) { mask_hi, val },
+2, AC_FUNC_ATTR_READNONE);
+   return val;
+}
+
+enum dpp_ctrl {
+   _dpp_quad_perm = 0x000,
+   _dpp_row_sl = 0x100,
+   _dpp_row_sr = 0x110,
+   _dpp_row_rr = 0x120,
+   dpp_wf_sl1 = 0x130,
+   dpp_wf_rl1 = 0x134,
+   dpp_wf_sr1 = 0x138,
+   dpp_wf_rr1 = 0x13C,
+   dpp_row_mirror = 0x140,
+   dpp_row_half_mirror = 0x141,
+   dpp_row_bcast15 = 0x142,
+   dpp_row_bcast31 = 0x143
+};
+
+static inline enum dpp_ctrl
+dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
+{
+   assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
+   return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 
6);
+}
+
+static inline enum dpp_ctrl
+dpp_row_sl(unsigned amount)
+{
+   assert(amount > 0 && amount < 16);
+   return _dpp_row_sl | amount;
+}
+
+static inline enum dpp_ctrl
+dpp_row_sr(unsigned amount)