On 23.05.2018 11:48, Bas Nieuwenhuizen wrote:
WQM is pretty reliable now on LLVM 7, so let us just use
DPP + WQM.

This gives approximately a 1.5% performance increase on the
vrcompositor built-in benchmark.
---
  src/amd/common/ac_llvm_build.c | 243 ++++++++++++++++++---------------
  1 file changed, 130 insertions(+), 113 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 36c1d62637b..f849f6461ce 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1137,119 +1137,6 @@ ac_get_thread_id(struct ac_llvm_context *ctx)
        return tid;
  }
-/*
- * SI implements derivatives using the local data store (LDS)
- * All writes to the LDS happen in all executing threads at
- * the same time. TID is the Thread ID for the current
- * thread and is a value between 0 and 63, representing
- * the thread's position in the wavefront.
- *
- * For the pixel shader threads are grouped into quads of four pixels.
- * The TIDs of the pixels of a quad are:
- *
- *  +------+------+
- *  |4n + 0|4n + 1|
- *  +------+------+
- *  |4n + 2|4n + 3|
- *  +------+------+
- *
- * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
- * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
- * the current pixel's column, and masking with 0xfffffffe yields the TID
- * of the left pixel of the current pixel's row.
- *
- * Adding 1 yields the TID of the pixel to the right of the left pixel, and
- * adding 2 yields the TID of the pixel below the top pixel.
- */
-LLVMValueRef
-ac_build_ddxy(struct ac_llvm_context *ctx,
-             uint32_t mask,
-             int idx,
-             LLVMValueRef val)
-{
-       LLVMValueRef tl, trbl, args[2];
-       LLVMValueRef result;
-
-       if (ctx->chip_class >= VI) {
-               LLVMValueRef thread_id, tl_tid, trbl_tid;
-               thread_id = ac_get_thread_id(ctx);
-
-               tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
-                                     LLVMConstInt(ctx->i32, mask, false), "");
-
-               trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
-                                       LLVMConstInt(ctx->i32, idx, false), "");
-
-               args[0] = LLVMBuildMul(ctx->builder, tl_tid,
-                                      LLVMConstInt(ctx->i32, 4, false), "");
-               args[1] = val;
-               tl = ac_build_intrinsic(ctx,
-                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                       args, 2,
-                                       AC_FUNC_ATTR_READNONE |
-                                       AC_FUNC_ATTR_CONVERGENT);
-
-               args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
-                                      LLVMConstInt(ctx->i32, 4, false), "");
-               trbl = ac_build_intrinsic(ctx,
-                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
-                                         args, 2,
-                                         AC_FUNC_ATTR_READNONE |
-                                         AC_FUNC_ATTR_CONVERGENT);
-       } else {
-               uint32_t masks[2] = {};
-
-               switch (mask) {
-               case AC_TID_MASK_TOP_LEFT:
-                       masks[0] = 0x8000;
-                       if (idx == 1)
-                               masks[1] = 0x8055;
-                       else
-                               masks[1] = 0x80aa;
-
-                       break;
-               case AC_TID_MASK_TOP:
-                       masks[0] = 0x8044;
-                       masks[1] = 0x80ee;
-                       break;
-               case AC_TID_MASK_LEFT:
-                       masks[0] = 0x80a0;
-                       masks[1] = 0x80f5;
-                       break;
-               default:
-                       assert(0);
-               }
-
-               args[0] = val;
-               args[1] = LLVMConstInt(ctx->i32, masks[0], false);
-
-               tl = ac_build_intrinsic(ctx,
-                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
-                                       args, 2,
-                                       AC_FUNC_ATTR_READNONE |
-                                       AC_FUNC_ATTR_CONVERGENT);
-
-               args[1] = LLVMConstInt(ctx->i32, masks[1], false);
-               trbl = ac_build_intrinsic(ctx,
-                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
-                                       args, 2,
-                                       AC_FUNC_ATTR_READNONE |
-                                       AC_FUNC_ATTR_CONVERGENT);
-       }
-
-       tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
-       trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
-       result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
-
-       if (HAVE_LLVM >= 0x0700) {
-               result = ac_build_intrinsic(ctx,
-                       "llvm.amdgcn.wqm.f32", ctx->f32,
-                       &result, 1, 0);
-       }
-
-       return result;
-}
-
  void
  ac_build_sendmsg(struct ac_llvm_context *ctx,
                 uint32_t msg,
@@ -2764,6 +2651,136 @@ ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
        return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
  }
+/*
+ * SI implements derivatives using the local data store (LDS)
+ * All writes to the LDS happen in all executing threads at
+ * the same time. TID is the Thread ID for the current
+ * thread and is a value between 0 and 63, representing
+ * the thread's position in the wavefront.
+ *
+ * For the pixel shader threads are grouped into quads of four pixels.
+ * The TIDs of the pixels of a quad are:
+ *
+ *  +------+------+
+ *  |4n + 0|4n + 1|
+ *  +------+------+
+ *  |4n + 2|4n + 3|
+ *  +------+------+
+ *
+ * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
+ * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
+ * the current pixel's column, and masking with 0xfffffffe yields the TID
+ * of the left pixel of the current pixel's row.
+ *
+ * Adding 1 yields the TID of the pixel to the right of the left pixel, and
+ * adding 2 yields the TID of the pixel below the top pixel.
+ */
+LLVMValueRef
+ac_build_ddxy(struct ac_llvm_context *ctx,
+             uint32_t mask,
+             int idx,
+             LLVMValueRef val)
+{
+       LLVMValueRef tl, trbl, args[2];
+       LLVMValueRef result;
+
+       if (ctx->chip_class >= VI && HAVE_LLVM >= 0x0700) {
+               LLVMValueRef zero = ctx->i32_0;
+               unsigned tl_lanes[4], trbl_lanes[4];
+
+               for (unsigned i = 0; i < 4; ++i) {
+                       tl_lanes[i] = i & mask;
+                       trbl_lanes[i] = (i & mask) + idx;
+               }
+
+                tl = ac_build_dpp(ctx, zero, val,

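Inconsistent indentation: the `tl = ac_build_dpp(` line just above is indented differently from the matching `trbl = ac_build_dpp(` call that follows.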

Anyway, maybe this should just use ac_build_quad_swizzle?

Cheers,
Nicolai


+                                 dpp_quad_perm(tl_lanes[0], tl_lanes[1],
+                                               tl_lanes[2], tl_lanes[3]),
+                                 0xf, 0xf, false);
+               trbl = ac_build_dpp(ctx, zero, val,
+                                   dpp_quad_perm(trbl_lanes[0], trbl_lanes[1],
+                                                 trbl_lanes[2], trbl_lanes[3]),
+                                   0xf, 0xf, false);
+       } else if (ctx->chip_class >= VI) {
+               LLVMValueRef thread_id, tl_tid, trbl_tid;
+               thread_id = ac_get_thread_id(ctx);
+
+               tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
+                                     LLVMConstInt(ctx->i32, mask, false), "");
+
+               trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
+                                       LLVMConstInt(ctx->i32, idx, false), "");
+
+               args[0] = LLVMBuildMul(ctx->builder, tl_tid,
+                                      LLVMConstInt(ctx->i32, 4, false), "");
+               args[1] = val;
+               tl = ac_build_intrinsic(ctx,
+                                       "llvm.amdgcn.ds.bpermute", ctx->i32,
+                                       args, 2,
+                                       AC_FUNC_ATTR_READNONE |
+                                       AC_FUNC_ATTR_CONVERGENT);
+
+               args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
+                                      LLVMConstInt(ctx->i32, 4, false), "");
+               trbl = ac_build_intrinsic(ctx,
+                                         "llvm.amdgcn.ds.bpermute", ctx->i32,
+                                         args, 2,
+                                         AC_FUNC_ATTR_READNONE |
+                                         AC_FUNC_ATTR_CONVERGENT);
+       } else {
+               uint32_t masks[2] = {};
+
+               switch (mask) {
+               case AC_TID_MASK_TOP_LEFT:
+                       masks[0] = 0x8000;
+                       if (idx == 1)
+                               masks[1] = 0x8055;
+                       else
+                               masks[1] = 0x80aa;
+
+                       break;
+               case AC_TID_MASK_TOP:
+                       masks[0] = 0x8044;
+                       masks[1] = 0x80ee;
+                       break;
+               case AC_TID_MASK_LEFT:
+                       masks[0] = 0x80a0;
+                       masks[1] = 0x80f5;
+                       break;
+               default:
+                       assert(0);
+               }
+
+               args[0] = val;
+               args[1] = LLVMConstInt(ctx->i32, masks[0], false);
+
+               tl = ac_build_intrinsic(ctx,
+                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
+                                       args, 2,
+                                       AC_FUNC_ATTR_READNONE |
+                                       AC_FUNC_ATTR_CONVERGENT);
+
+               args[1] = LLVMConstInt(ctx->i32, masks[1], false);
+               trbl = ac_build_intrinsic(ctx,
+                                       "llvm.amdgcn.ds.swizzle", ctx->i32,
+                                       args, 2,
+                                       AC_FUNC_ATTR_READNONE |
+                                       AC_FUNC_ATTR_CONVERGENT);
+       }
+
+       tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
+       trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
+       result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+       if (HAVE_LLVM >= 0x0700) {
+               result = ac_build_intrinsic(ctx,
+                       "llvm.amdgcn.wqm.f32", ctx->f32,
+                       &result, 1, 0);
+       }
+
+       return result;
+}
+
  static inline unsigned
  ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
  {



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to