The eliminate_find_live_channel optimization eliminates
FIND_LIVE_CHANNEL instructions in cases where control flow is known to
be uniform, and replaces them with 'MOV 0', which in turn unblocks
subsequent elimination of the BROADCAST instruction frequently used on
the result of FIND_LIVE_CHANNEL.  This is however not correct in
per-sample fragment shader dispatch because the PSD can dispatch a
fully unlit sample under certain conditions.  Disable the optimization
in that case.
---
 src/mesa/drivers/dri/i965/brw_compiler.h | 41 ++++++++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs.cpp     |  8 +++++++
 src/mesa/drivers/dri/i965/brw_vec4.cpp   |  8 +++++++
 3 files changed, 57 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h 
b/src/mesa/drivers/dri/i965/brw_compiler.h
index 84d3dde..1429875 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -868,6 +868,47 @@ encode_slm_size(unsigned gen, uint32_t bytes)
    return slm_size;
 }
 
+/**
+ * Return true if the given shader stage is dispatched contiguously by the
+ * relevant fixed function starting from channel 0 of the SIMD thread, which
+ * implies that the dispatch mask of a thread can be assumed to have the form
+ * '2^n - 1' for some n.
+ */
+static inline bool
+brw_stage_has_packed_dispatch(gl_shader_stage stage,
+                              const struct brw_stage_prog_data *prog_data)
+{
+   switch (stage) {
+   case MESA_SHADER_FRAGMENT: {
+      /* The PSD discards subspans coming in with no lit samples, which in the
+       * per-pixel shading case implies that each subspan will either be fully
+       * lit (due to the VMask being used to allow derivative computations),
+       * or not dispatched at all.  In per-sample dispatch mode individual
+       * samples from the same subspan have a fixed relative location within
+       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
+       * general and we should return false.
+       */
+      const struct brw_wm_prog_data *wm_prog_data =
+         (const struct brw_wm_prog_data *)prog_data;
+      return !wm_prog_data->persample_dispatch;
+   }
+   case MESA_SHADER_COMPUTE:
+      /* Compute shaders will be spawned with either a fully enabled dispatch
+       * mask or with whatever bottom/right execution mask was given to the
+       * GPGPU walker command to be used along the workgroup edges -- In both
+       * cases the dispatch mask is required to be tightly packed for our
+       * invocation index calculations to work.
+       */
+      return true;
+   default:
+      /* Most remaining fixed functions are limited to use a packed dispatch
+       * mask due to the hardware representation of the dispatch mask as a
+       * single counter representing the number of enabled channels.
+       */
+      return true;
+   }
+}
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index bb65077..32f7ae2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2835,6 +2835,14 @@ fs_visitor::eliminate_find_live_channel()
    bool progress = false;
    unsigned depth = 0;
 
+   if (!brw_stage_has_packed_dispatch(stage, stage_prog_data)) {
+      /* The optimization below assumes that channel zero is live on thread
+       * dispatch, which may not be the case if the fixed function dispatches
+       * threads sparsely.
+       */
+      return progress;
+   }
+
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
       switch (inst->opcode) {
       case BRW_OPCODE_IF:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 58c8a8a..d5bb82b 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1291,6 +1291,14 @@ vec4_visitor::eliminate_find_live_channel()
    bool progress = false;
    unsigned depth = 0;
 
+   if (!brw_stage_has_packed_dispatch(stage, stage_prog_data)) {
+      /* The optimization below assumes that channel zero is live on thread
+       * dispatch, which may not be the case if the fixed function dispatches
+       * threads sparsely.
+       */
+      return progress;
+   }
+
    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
       switch (inst->opcode) {
       case BRW_OPCODE_IF:
-- 
2.9.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to