Module: Mesa
Branch: main
Commit: 023e78b4d7e9f8666cf09dbd218bd71266853da3
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=023e78b4d7e9f8666cf09dbd218bd71266853da3

Author: Daniel Schürmann <dan...@schuermann.dev>
Date:   Thu Oct 12 10:52:45 2023 +0200

aco: add new post-RA scheduler for ILP

Totals from 77247 (97.37% of 79330) affected shaders: (GFX11)

Instrs: 44371374 -> 43215723 (-2.60%); split: -2.64%, +0.03%
CodeSize: 227819532 -> 223188224 (-2.03%); split: -2.06%, +0.03%
Latency: 301016823 -> 290147626 (-3.61%); split: -3.70%, +0.09%
InvThroughput: 48551749 -> 47646212 (-1.87%); split: -1.88%, +0.01%
VClause: 870581 -> 834655 (-4.13%); split: -4.13%, +0.00%
SClause: 1487061 -> 1340851 (-9.83%); split: -9.83%, +0.00%
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25676>

---

 src/amd/compiler/README.md                   |   7 +-
 src/amd/compiler/aco_interface.cpp           |   4 +
 src/amd/compiler/aco_ir.cpp                  |  26 +-
 src/amd/compiler/aco_ir.h                    |   2 +
 src/amd/compiler/aco_scheduler_ilp.cpp       | 415 +++++++++++++++++++++++++++
 src/amd/compiler/meson.build                 |   1 +
 src/amd/compiler/tests/test_d3d11_derivs.cpp |   4 +-
 7 files changed, 444 insertions(+), 15 deletions(-)

diff --git a/src/amd/compiler/README.md b/src/amd/compiler/README.md
index 774a52ce7ea..6dac15be3e6 100644
--- a/src/amd/compiler/README.md
+++ b/src/amd/compiler/README.md
@@ -97,6 +97,10 @@ The next step is a pass out of SSA by inserting 
parallelcopies at the end of blo
 Most pseudo instructions are lowered to actual machine instructions.
 These are mostly parallel copy instructions created by instruction selection 
or register allocation and spill/reload code.
 
+#### ILP Scheduling
+
+This second scheduler works on registers rather than SSA-values to determine 
dependencies. It implements a forward list scheduling algorithm using a partial 
dependency graph of few instructions at a time and aims to create larger memory 
clauses and improve ILP.
+
 #### Insert wait states
 
 GCN requires some wait states to be manually inserted in order to ensure 
correct behavior on memory instructions and some register dependencies.
@@ -249,7 +253,8 @@ We also have `ACO_DEBUG` options:
 * `force-waitcnt` - Forces ACO to emit a wait state after each instruction 
when there is something to wait for. Harms performance.
 * `novn` - Disables the ACO value numbering stage.
 * `noopt` - Disables the ACO optimizer.
-* `nosched` - Disables the ACO scheduler.
+* `nosched` - Disables the ACO pre-RA and post-RA scheduler.
+* `nosched-ilp` - Disables the ACO post-RA ILP scheduler.
 
 Note that you need to **combine these options into a comma-separated list**, 
for example: `RADV_DEBUG=nocache,shaders` otherwise only the last one will take 
effect. (This is how all environment variables work, yet this is an often made 
mistake.) Example:
 
diff --git a/src/amd/compiler/aco_interface.cpp 
b/src/amd/compiler/aco_interface.cpp
index 802e03b0926..b405fbd82cb 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -199,6 +199,10 @@ aco_postprocess_shader(const struct aco_compiler_options* 
options,
    aco::lower_to_hw_instr(program.get());
    validate(program.get());
 
+   /* Schedule hardware instructions for ILP */
+   if (!options->optimisations_disabled && !(aco::debug_flags & 
aco::DEBUG_NO_SCHED_ILP))
+      aco::schedule_ilp(program.get());
+
    /* Insert Waitcnt */
    aco::insert_wait_states(program.get());
    aco::insert_NOPs(program.get());
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 22f50db4149..ea49a10da5c 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -36,18 +36,20 @@ thread_local aco::monotonic_buffer_resource* 
instruction_buffer = nullptr;
 
 uint64_t debug_flags = 0;
 
-static const struct debug_control aco_debug_options[] = {{"validateir", 
DEBUG_VALIDATE_IR},
-                                                         {"validatera", 
DEBUG_VALIDATE_RA},
-                                                         {"novalidateir", 
DEBUG_NO_VALIDATE_IR},
-                                                         {"perfwarn", 
DEBUG_PERFWARN},
-                                                         {"force-waitcnt", 
DEBUG_FORCE_WAITCNT},
-                                                         {"force-waitdeps", 
DEBUG_FORCE_WAITDEPS},
-                                                         {"novn", DEBUG_NO_VN},
-                                                         {"noopt", 
DEBUG_NO_OPT},
-                                                         {"nosched", 
DEBUG_NO_SCHED},
-                                                         {"perfinfo", 
DEBUG_PERF_INFO},
-                                                         {"liveinfo", 
DEBUG_LIVE_INFO},
-                                                         {NULL, 0}};
+static const struct debug_control aco_debug_options[] = {
+   {"validateir", DEBUG_VALIDATE_IR},
+   {"validatera", DEBUG_VALIDATE_RA},
+   {"novalidateir", DEBUG_NO_VALIDATE_IR},
+   {"perfwarn", DEBUG_PERFWARN},
+   {"force-waitcnt", DEBUG_FORCE_WAITCNT},
+   {"force-waitdeps", DEBUG_FORCE_WAITDEPS},
+   {"novn", DEBUG_NO_VN},
+   {"noopt", DEBUG_NO_OPT},
+   {"nosched", DEBUG_NO_SCHED | DEBUG_NO_SCHED_ILP},
+   {"nosched-ilp", DEBUG_NO_SCHED_ILP},
+   {"perfinfo", DEBUG_PERF_INFO},
+   {"liveinfo", DEBUG_LIVE_INFO},
+   {NULL, 0}};
 
 static once_flag init_once_flag = ONCE_FLAG_INIT;
 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index cddc5bd5cfb..cd4ceb221ce 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -57,6 +57,7 @@ enum {
    DEBUG_LIVE_INFO = 0x100,
    DEBUG_FORCE_WAITDEPS = 0x200,
    DEBUG_NO_VALIDATE_IR = 0x400,
+   DEBUG_NO_SCHED_ILP = 0x800,
 };
 
 enum storage_class : uint8_t {
@@ -2207,6 +2208,7 @@ void register_allocation(Program* program, 
std::vector<IDSet>& live_out_per_bloc
 void ssa_elimination(Program* program);
 void lower_to_hw_instr(Program* program);
 void schedule_program(Program* program, live& live_vars);
+void schedule_ilp(Program* program);
 void spill(Program* program, live& live_vars);
 void insert_wait_states(Program* program);
 bool dealloc_vgprs(Program* program);
diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp 
b/src/amd/compiler/aco_scheduler_ilp.cpp
new file mode 100644
index 00000000000..007386cef7b
--- /dev/null
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -0,0 +1,415 @@
+/*
+ * Copyright 2023 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "aco_ir.h"
+
+#include "util/bitscan.h"
+#include "util/macros.h"
+
+#include <limits>
+
+/*
+ * This pass implements a simple forward list-scheduler which works on a small
+ * partial DAG of 16 nodes at any time. Only ALU instructions are scheduled
+ * entirely freely. Memory load instructions must be kept in-order and any 
other
+ * instruction must not be re-scheduled at all.
+ *
+ * The main goal of this scheduler is to create more memory clauses, schedule
+ * memory loads early, and to improve ALU instruction level parallelism.
+ */
+
+namespace aco {
+namespace {
+
+constexpr unsigned num_nodes = 16;
+using mask_t = uint16_t;
+static_assert(std::numeric_limits<mask_t>::digits >= num_nodes);
+
+struct InstrInfo {
+   Instruction* instr;
+   int32_t priority;
+   mask_t dependency_mask;       /* bitmask of nodes which have to be 
scheduled before this node. */
+   uint8_t next_non_reorderable; /* index of next non-reorderable instruction 
node after this one. */
+   bool potential_clause; /* indicates that this instruction is not (yet) 
immediately followed by a
+                             reorderable instruction. */
+};
+
+struct RegisterInfo {
+   mask_t read_mask; /* bitmask of nodes which have to be scheduled before the 
next write. */
+   int8_t latency;   /* estimated latency of last register write. */
+   uint8_t direct_dependency : 4;     /* node that has to be scheduled before 
any other access. */
+   uint8_t has_direct_dependency : 1; /* whether there is an unscheduled 
direct dependency. */
+   uint8_t padding : 3;
+};
+
+struct SchedILPContext {
+   Program* program;
+   InstrInfo nodes[num_nodes];
+   RegisterInfo regs[512];
+   mask_t non_reorder_mask = 0; /* bitmask of instruction nodes which should 
not be reordered. */
+   mask_t active_mask = 0;      /* bitmask of valid instruction nodes. */
+   uint8_t next_non_reorderable = UINT8_MAX; /* index of next node which 
should not be reordered. */
+   uint8_t last_non_reorderable = UINT8_MAX; /* index of last node which 
should not be reordered. */
+};
+
+/**
+ * Returns true for side-effect free SALU and VALU instructions.
+ */
+bool
+can_reorder(const Instruction* const instr)
+{
+   if (instr->isVALU())
+      return true;
+   if (!instr->isSALU() || instr->isSOPP())
+      return false;
+
+   switch (instr->opcode) {
+   /* SOP2 */
+   case aco_opcode::s_cbranch_g_fork:
+   case aco_opcode::s_rfe_restore_b64:
+   /* SOP1 */
+   case aco_opcode::s_setpc_b64:
+   case aco_opcode::s_swappc_b64:
+   case aco_opcode::s_rfe_b64:
+   case aco_opcode::s_cbranch_join:
+   case aco_opcode::s_set_gpr_idx_idx:
+   case aco_opcode::s_sendmsg_rtn_b32:
+   case aco_opcode::s_sendmsg_rtn_b64:
+   /* SOPK */
+   case aco_opcode::s_cbranch_i_fork:
+   case aco_opcode::s_getreg_b32:
+   case aco_opcode::s_setreg_b32:
+   case aco_opcode::s_setreg_imm32_b32:
+   case aco_opcode::s_call_b64:
+   case aco_opcode::s_waitcnt_vscnt:
+   case aco_opcode::s_waitcnt_vmcnt:
+   case aco_opcode::s_waitcnt_expcnt:
+   case aco_opcode::s_waitcnt_lgkmcnt:
+   case aco_opcode::s_subvector_loop_begin:
+   case aco_opcode::s_subvector_loop_end:
+   /* SOPC */
+   case aco_opcode::s_setvskip:
+   case aco_opcode::s_set_gpr_idx_on: return false;
+   default: break;
+   }
+
+   return true;
+}
+
+unsigned
+get_latency(const Instruction* const instr)
+{
+   /* Note, that these are not accurate latency estimations. */
+   if (instr->isVALU())
+      return 5;
+   if (instr->isSALU())
+      return 2;
+   if (instr->isVMEM() || instr->isFlatLike())
+      return 32;
+   if (instr->isSMEM())
+      return 5;
+   if (instr->accessesLDS())
+      return 2;
+
+   return 0;
+}
+
+bool
+is_memory_instr(const Instruction* const instr)
+{
+   /* For memory instructions, we allow reordering them with ALU if it helps
+    * to form larger clauses or to increase def-use distances.
+    */
+   return instr->isVMEM() || instr->isFlatLike() || instr->isSMEM() || 
instr->accessesLDS();
+}
+
+constexpr unsigned max_sgpr = 128;
+constexpr unsigned min_vgpr = 256;
+
+void
+add_entry(SchedILPContext& ctx, Instruction* const instr, const uint32_t idx)
+{
+   InstrInfo& entry = ctx.nodes[idx];
+   entry.instr = instr;
+   entry.priority = 0;
+   const mask_t mask = BITFIELD_BIT(idx);
+   bool reorder = can_reorder(instr);
+   ctx.active_mask |= mask;
+
+   for (const Operand& op : instr->operands) {
+      assert(op.isFixed());
+      unsigned reg = op.physReg();
+      if (reg >= max_sgpr && reg != scc && reg < min_vgpr) {
+         reorder &= reg != pops_exiting_wave_id;
+         continue;
+      }
+
+      for (unsigned i = 0; i < op.size(); i++) {
+         RegisterInfo& reg_info = ctx.regs[reg + i];
+
+         /* Add register reads. */
+         reg_info.read_mask |= mask;
+
+         int cycles_since_reg_write = num_nodes;
+         if (reg_info.has_direct_dependency) {
+            /* A previous dependency is still part of the DAG. */
+            entry.dependency_mask |= BITFIELD_BIT(reg_info.direct_dependency);
+            cycles_since_reg_write = 
ctx.nodes[reg_info.direct_dependency].priority;
+         }
+
+         if (reg_info.latency) {
+            /* Ignore and reset register latencies for memory loads and other 
non-reorderable
+             * instructions. We schedule these as early as possible anyways.
+             */
+            if (reorder && reg_info.latency > cycles_since_reg_write) {
+               entry.priority = MIN2(entry.priority, cycles_since_reg_write - 
reg_info.latency);
+
+               /* If a previous register write created some latency, ensure 
that this
+                * is the first read of the register by making this instruction 
a direct
+                * dependency of all following register reads.
+                */
+               reg_info.has_direct_dependency = 1;
+               reg_info.direct_dependency = idx;
+            }
+            reg_info.latency = 0;
+         }
+      }
+   }
+
+   /* Check if this instruction reads implicit registers. */
+   if (needs_exec_mask(instr)) {
+      for (unsigned reg = exec_lo; reg <= exec_hi; reg++) {
+         if (ctx.regs[reg].has_direct_dependency)
+            entry.dependency_mask |= 
BITFIELD_BIT(ctx.regs[reg].direct_dependency);
+         ctx.regs[reg].read_mask |= mask;
+      }
+   }
+   if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
+      for (unsigned reg = flat_scr_lo; reg <= flat_scr_hi; reg++) {
+         if (ctx.regs[reg].has_direct_dependency)
+            entry.dependency_mask |= 
BITFIELD_BIT(ctx.regs[reg].direct_dependency);
+         ctx.regs[reg].read_mask |= mask;
+      }
+   }
+
+   for (const Definition& def : instr->definitions) {
+      for (unsigned i = 0; i < def.size(); i++) {
+         RegisterInfo& reg_info = ctx.regs[def.physReg().reg() + i];
+
+         /* Add all previous register reads and writes to the dependencies. */
+         entry.dependency_mask |= reg_info.read_mask;
+         reg_info.read_mask = mask;
+
+         /* This register write is a direct dependency for all following 
reads. */
+         reg_info.has_direct_dependency = 1;
+         reg_info.direct_dependency = idx;
+
+         /* Add latency information for the next register read. */
+         reg_info.latency = get_latency(instr);
+      }
+   }
+
+   if (!reorder) {
+      ctx.non_reorder_mask |= mask;
+
+      /* Set this node as last non-reorderable instruction */
+      if (ctx.next_non_reorderable == UINT8_MAX) {
+         ctx.next_non_reorderable = idx;
+      } else {
+         ctx.nodes[ctx.last_non_reorderable].next_non_reorderable = idx;
+      }
+      ctx.last_non_reorderable = idx;
+      entry.next_non_reorderable = UINT8_MAX;
+
+      /* Just don't reorder these at all. */
+      if (!is_memory_instr(instr) || instr->definitions.empty() ||
+          get_sync_info(instr).semantics & semantic_volatile) {
+         /* Add all previous instructions as dependencies. */
+         entry.dependency_mask = ctx.active_mask;
+      }
+
+      /* Remove non-reorderable instructions from dependencies, since WaR 
dependencies can interfere
+       * with clause formation. This should be fine, since these are always 
scheduled in-order and
+       * any cases that are actually a concern for clause formation are added 
as transitive
+       * dependencies. */
+      entry.dependency_mask &= ~ctx.non_reorder_mask;
+      entry.potential_clause = true;
+   } else if (ctx.last_non_reorderable != UINT8_MAX) {
+      ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
+   }
+
+   entry.dependency_mask &= ~mask;
+
+   for (unsigned i = 0; i < num_nodes; i++) {
+      if (!ctx.nodes[i].instr || i == idx)
+         continue;
+
+      /* Add transitive dependencies. */
+      if (entry.dependency_mask & BITFIELD_BIT(i))
+         entry.dependency_mask |= ctx.nodes[i].dependency_mask;
+
+      /* increment base priority */
+      ctx.nodes[i].priority++;
+   }
+}
+
+void
+remove_entry(SchedILPContext& ctx, const Instruction* const instr, const 
uint32_t idx)
+{
+   const mask_t mask = ~BITFIELD_BIT(idx);
+   ctx.active_mask &= mask;
+
+   for (const Operand& op : instr->operands) {
+      const unsigned reg = op.physReg();
+      if (reg >= max_sgpr && reg != scc && reg < min_vgpr)
+         continue;
+
+      for (unsigned i = 0; i < op.size(); i++) {
+         RegisterInfo& reg_info = ctx.regs[reg + i];
+         reg_info.read_mask &= mask;
+         reg_info.has_direct_dependency &= reg_info.direct_dependency != idx;
+      }
+   }
+   if (needs_exec_mask(instr)) {
+      ctx.regs[exec_lo].read_mask &= mask;
+      ctx.regs[exec_hi].read_mask &= mask;
+   }
+   if (ctx.program->gfx_level < GFX10 && instr->isScratch()) {
+      ctx.regs[flat_scr_lo].read_mask &= mask;
+      ctx.regs[flat_scr_hi].read_mask &= mask;
+   }
+   for (const Definition& def : instr->definitions) {
+      for (unsigned i = 0; i < def.size(); i++) {
+         unsigned reg = def.physReg().reg() + i;
+         ctx.regs[reg].read_mask &= mask;
+         ctx.regs[reg].has_direct_dependency &= 
ctx.regs[reg].direct_dependency != idx;
+      }
+   }
+
+   for (unsigned i = 0; i < num_nodes; i++)
+      ctx.nodes[i].dependency_mask &= mask;
+
+   if (ctx.next_non_reorderable == idx) {
+      ctx.non_reorder_mask &= mask;
+      ctx.next_non_reorderable = ctx.nodes[idx].next_non_reorderable;
+      if (ctx.last_non_reorderable == idx)
+         ctx.last_non_reorderable = UINT8_MAX;
+   }
+}
+
+/**
+ * Returns a bitfield of nodes which have to be scheduled before the
+ * next non-reorderable instruction.
+ * If the next non-reorderable instruction can form a clause, returns the
+ * dependencies of the entire clause.
+ */
+mask_t
+collect_clause_dependencies(const SchedILPContext& ctx, const uint8_t next, 
mask_t clause_mask)
+{
+   const InstrInfo& entry = ctx.nodes[next];
+   mask_t dependencies = entry.dependency_mask;
+   clause_mask |= (entry.potential_clause << next);
+
+   if (!is_memory_instr(entry.instr))
+      return dependencies;
+
+   /* If this is potentially an "open" clause, meaning that the clause might
+    * consist of instructions not yet added to the DAG, consider all previous
+    * instructions as dependencies. This prevents splitting of larger, already
+    * formed clauses.
+    */
+   if (next == ctx.last_non_reorderable && entry.potential_clause)
+      return (~clause_mask & ctx.active_mask) | dependencies;
+
+   if (entry.next_non_reorderable == UINT8_MAX)
+      return dependencies;
+
+   /* Check if this can form a clause with the following non-reorderable 
instruction */
+   if (should_form_clause(entry.instr, 
ctx.nodes[entry.next_non_reorderable].instr)) {
+      mask_t clause_deps =
+         collect_clause_dependencies(ctx, entry.next_non_reorderable, 
clause_mask);
+
+      /* if the following clause is independent from us, add their 
dependencies */
+      if (!(clause_deps & BITFIELD_BIT(next)))
+         dependencies |= clause_deps;
+   }
+
+   return dependencies;
+}
+
+/**
+ * Returns the index of the next instruction to be selected.
+ */
+unsigned
+select_instruction(const SchedILPContext& ctx)
+{
+   mask_t mask = ctx.active_mask;
+
+   /* First, collect all dependencies of the next non-reorderable 
instruction(s).
+    * These make up the list of possible candidates.
+    */
+   if (ctx.next_non_reorderable != UINT8_MAX)
+      mask = collect_clause_dependencies(ctx, ctx.next_non_reorderable, 0);
+
+   /* If the next non-reorderable instruction has no dependencies, select it */
+   if (mask == 0)
+      return ctx.next_non_reorderable;
+
+   /* Otherwise, select the instruction with highest priority of all 
candidates. */
+   unsigned idx = -1u;
+   int32_t priority = INT32_MIN;
+   u_foreach_bit (i, mask) {
+      const InstrInfo& candidate = ctx.nodes[i];
+
+      /* Check if the candidate has pending dependencies. */
+      if (candidate.dependency_mask)
+         continue;
+
+      if (idx == -1u || candidate.priority > priority) {
+         idx = i;
+         priority = candidate.priority;
+      }
+   }
+
+   assert(idx != -1u);
+   return idx;
+}
+
+} // namespace
+
+void
+schedule_ilp(Program* program)
+{
+   SchedILPContext ctx = {program};
+
+   for (Block& block : program->blocks) {
+      auto it = block.instructions.begin();
+      for (unsigned i = 0; i < num_nodes; i++) {
+         if (it == block.instructions.end())
+            break;
+
+         add_entry(ctx, (it++)->get(), i);
+      }
+
+      auto insert_it = block.instructions.begin();
+      while (insert_it != block.instructions.end()) {
+         unsigned next_idx = select_instruction(ctx);
+         Instruction* next_instr = ctx.nodes[next_idx].instr;
+         remove_entry(ctx, next_instr, next_idx);
+         (insert_it++)->reset(next_instr);
+         ctx.nodes[next_idx].instr = NULL;
+
+         if (it != block.instructions.end()) {
+            add_entry(ctx, (it++)->get(), next_idx);
+         } else if (ctx.last_non_reorderable != UINT8_MAX) {
+            ctx.nodes[ctx.last_non_reorderable].potential_clause = false;
+            ctx.last_non_reorderable = UINT8_MAX;
+         }
+      }
+      assert(it == block.instructions.end());
+   }
+}
+
+} // namespace aco
diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build
index dc6f7a9071b..f624d84626f 100644
--- a/src/amd/compiler/meson.build
+++ b/src/amd/compiler/meson.build
@@ -75,6 +75,7 @@ libaco_files = files(
   'aco_print_ir.cpp',
   'aco_reindex_ssa.cpp',
   'aco_scheduler.cpp',
+  'aco_scheduler_ilp.cpp',
   'aco_spill.cpp',
   'aco_ssa_elimination.cpp',
   'aco_statistics.cpp',
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp 
b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index 3654e3daed7..4bff19aa7d2 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -398,10 +398,10 @@ BEGIN_TEST(d3d11_derivs._1d_array_gfx9)
    pbld.print_ir(VK_SHADER_STAGE_FRAGMENT_BIT, "ACO IR");
 
    //>> v_interp_p2_f32_e32 v#rl_tmp, v#_, attr0.y                   ; $_
-   //>> v_rndne_f32_e32 v#rl, v#rl_tmp                               ; $_
    //>> v_interp_p2_f32_e32 v#rx_tmp, v#_, attr0.x                   ; $_
-   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                 ; $_
+   //>> v_rndne_f32_e32 v#rl, v#rl_tmp                               ; $_
    //>> v_mov_b32_e32 v#ry, 0.5                                      ; $_
+   //>> v_mov_b32_e32 v#rx, v#rx_tmp                                 ; $_
    //>> BB1:
    //; success = rx+1 == ry and rx+2 == rl
    //>> image_sample v[#_:#_], v#rx, s[#_:#_], s[#_:#_] dmask:0xf da ; $_ $_

Reply via email to