Generally, instructions in Align16 mode only ever write to a single register and don't need any form of SIMD splitting, which is why we have never had a SIMD splitting pass in the vec4 backend. However, double-precision instructions typically write 2 registers and in some cases they run into certain hardware bugs and limitations that we need to work around by splitting the instructions so we only write to 1 register at a time.
This patch implements a basic SIMD splitting pass for this purpose. Notice that it does not attempt to be as flexible and generic as the FS version, because as I explain above, the cases where this needs to act are more limited, so we take advantage of that to simplify the implementation. Because we only use double-precision instructions in Align16 mode in gen7 (gen8+ is fully scalar and gens < 7 do not implement fp64) the pass is restricted to act on gen7 hardware only. For now the pass only handles the gen7 restriction where any instruction that writes 2 registers also needs to read 2 registers. This affects double-precision instructions reading uniforms, for example. Later patches will extend the lowering pass adding a few more cases. --- src/mesa/drivers/dri/i965/brw_ir_vec4.h | 1 + src/mesa/drivers/dri/i965/brw_vec4.cpp | 100 +++++++++++++++++++++++++++++++- src/mesa/drivers/dri/i965/brw_vec4.h | 2 + 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 721772e..f66c093 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -167,6 +167,7 @@ public: unsigned sol_vertex; /**< gen6: used for setting dst index in SVB header */ uint8_t exec_size; + uint8_t group; bool is_send_from_grf(); unsigned regs_read(unsigned arg) const; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 8316691..829b7d3 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -1947,6 +1947,101 @@ vec4_visitor::convert_to_hw_regs() } } +/** + * Get the closest native SIMD width supported by the hardware for instruction + * \p inst. The instruction will be left untouched by + * vec4_visitor::lower_simd_width() if the returned value matches the + * instruction's original execution size. 
+ */ +static unsigned +get_lowered_simd_width(const struct brw_device_info *devinfo, + const vec4_instruction *inst) +{ + /* For now we only need to split some cases of double-precision instructions + * that write 2 registers. We only need to care about this in gen7 because + * that is the only hardware that implements fp64 in Align16. + */ + if (devinfo->gen != 7 || inst->regs_written < 2) + return inst->exec_size; + + unsigned lowered_width = MIN2(8, inst->exec_size); + + /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct + * Register Addressing: + * + * "When destination spans two registers, the source MUST span two + * registers." + */ + for (unsigned i = 0; i < 3; i++) { + if (inst->src[i].file == BAD_FILE) + continue; + if (inst->regs_read(i) < 2) + lowered_width = MIN2(lowered_width, 4); + } + + return lowered_width; +} + +bool +vec4_visitor::lower_simd_width() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { + const unsigned lowered_width = get_lowered_simd_width(devinfo, inst); + assert(lowered_width <= inst->exec_size); + if (lowered_width == inst->exec_size) + continue; + + /* For now we only support splitting 8-wide instructions into 4-wide */ + assert(inst->exec_size == 8 && lowered_width == 4); + + /* We always split so that each lowered instruction writes exactly to + * one register. 
+ */ + assert(inst->regs_written == inst->exec_size / lowered_width); + + for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) { + dst_reg dst = offset(inst->dst, n); + + src_reg srcs[3]; + for (int i = 0; i < 3; i++) { + srcs[i] = inst->src[i]; + + if (srcs[i].file == BAD_FILE) + continue; + + if (!is_uniform(srcs[i])) { + if (type_sz(srcs[i].type) == 8) { + srcs[i] = offset(srcs[i], n); + } else { + assert(lowered_width * n < 8); + srcs[i].subnr += lowered_width * n; + } + } + } + + vec4_instruction *linst = new(mem_ctx) + vec4_instruction(inst->opcode, dst, srcs[0], srcs[1], srcs[2]); + linst->exec_size = lowered_width; + linst->group = lowered_width * n; + linst->regs_written = 1; + linst->conditional_mod = inst->conditional_mod; + linst->predicate = inst->predicate; + linst->saturate = inst->saturate; + inst->insert_before(block, linst); + } + + inst->remove(block); + progress = true; + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + bool vec4_visitor::run() { @@ -2002,9 +2097,12 @@ vec4_visitor::run() backend_shader::dump_instructions(filename); } - bool progress; + bool progress = false; int iteration = 0; int pass_num = 0; + + OPT(lower_simd_width); + do { progress = false; pass_num = 0; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index cf7cdab..e4c4e91 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -160,6 +160,8 @@ public: void opt_schedule_instructions(); void convert_to_hw_regs(); + bool lower_simd_width(); + vec4_instruction *emit(vec4_instruction *inst); vec4_instruction *emit(enum opcode opcode); -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev