Matt Turner <matts...@gmail.com> writes: > On Wed, Aug 5, 2015 at 10:52 AM, Francisco Jerez <curroje...@riseup.net> > wrote: >> In order to make room for the code that will lower the MULH virtual >> instruction. Also move the hardware generation and execution type >> checks into the same branch, they are going to have to be different >> for MULH. >> --- >> src/mesa/drivers/dri/i965/brw_fs.cpp | 265 >> ++++++++++++++++++----------------- >> 1 file changed, 136 insertions(+), 129 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> b/src/mesa/drivers/dri/i965/brw_fs.cpp >> index fc9f007..911e32b 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -3125,155 +3125,162 @@ fs_visitor::lower_integer_multiplication() >> { >> bool progress = false; >> >> - /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation >> - * directly, but CHV/BXT cannot. >> - */ >> - if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton) >> - return false; >> - >> foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { >> - if (inst->opcode != BRW_OPCODE_MUL || >> - inst->dst.is_accumulator() || >> - (inst->dst.type != BRW_REGISTER_TYPE_D && >> - inst->dst.type != BRW_REGISTER_TYPE_UD)) >> - continue; >> - >> const fs_builder ibld(this, block, inst); >> >> - /* The MUL instruction isn't commutative. On Gen <= 6, only the low >> - * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of >> - * src1 are used. >> - * >> - * If multiplying by an immediate value that fits in 16-bits, do a >> - * single MUL instruction with that value in the proper location. >> - */ >> - if (inst->src[1].file == IMM && >> - inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { >> - if (devinfo->gen < 7) { >> - fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), >> - inst->dst.type); >> - ibld.MOV(imm, inst->src[1]); >> - ibld.MUL(inst->dst, imm, inst->src[0]); >> - } else { >> - ibld.MUL(inst->dst, inst->src[0], inst->src[1]); >> - } >> - } else { >> - /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot >> - * do 32-bit integer multiplication in one instruction, but instead >> - * must do a sequence (which actually calculates a 64-bit result): >> - * >> - * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D >> - * mach(8) null g3<8,8,1>D g4<8,8,1>D >> - * mov(8) g2<1>D acc0<8,8,1>D >> - * >> - * But on Gen > 6, the ability to use second accumulator register >> - * (acc1) for non-float data types was removed, preventing a simple >> - * implementation in SIMD16. A 16-channel result can be calculated >> by >> - * executing the three instructions twice in SIMD8, once with >> quarter >> - * control of 1Q for the first eight channels and again with 2Q for >> - * the second eight channels. >> - * >> - * Which accumulator register is implicitly accessed (by >> AccWrEnable >> - * for instance) is determined by the quarter control. >> Unfortunately >> - * Ivybridge (and presumably Baytrail) has a hardware bug in which >> an >> - * implicit accumulator access by an instruction with 2Q will >> access >> - * acc1 regardless of whether the data type is usable in acc1. >> - * >> - * Specifically, the 2Q mach(8) writes acc1 which does not exist >> for >> - * integer data types. >> - * >> - * Since we only want the low 32-bits of the result, we can do two >> - * 32-bit x 16-bit multiplies (like the mul and mach are doing), >> and >> - * adjust the high result and add them (like the mach is doing): >> - * >> - * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW >> - * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW >> - * shl(8) g9<1>D g8<8,8,1>D 16D >> - * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D >> - * >> - * We avoid the shl instruction by realizing that we only want to >> add >> - * the low 16-bits of the "high" result to the high 16-bits of the >> - * "low" result and using proper regioning on the add: >> - * >> - * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW >> - * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW >> - * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW >> - * >> - * Since it does not use the (single) accumulator register, we can >> - * schedule multi-component multiplications much better. >> + if (inst->opcode == BRW_OPCODE_MUL) { >> + if (inst->dst.is_accumulator() || >> + (inst->dst.type != BRW_REGISTER_TYPE_D && >> + inst->dst.type != BRW_REGISTER_TYPE_UD)) >> + continue; >> + >> + /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit >> + * operation directly, but CHV/BXT cannot. >> */ >> + if (devinfo->gen >= 8 && >> + !devinfo->is_cherryview && !devinfo->is_broxton) >> + continue; >> >> - if (inst->conditional_mod && inst->dst.is_null()) { >> - inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), >> - inst->dst.type); >> - } >> - fs_reg low = inst->dst; >> - fs_reg high(GRF, alloc.allocate(dispatch_width / 8), >> - inst->dst.type); >> + if (inst->src[1].file == IMM && >> + inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { >> + /* The MUL instruction isn't commutative. On Gen <= 6, only the >> low >> + * 16-bits of src0 are read, and on Gen >= 7 only the low >> 16-bits of >> + * src1 are used. >> + * >> + * If multiplying by an immediate value that fits in 16-bits, >> do a >> + * single MUL instruction with that value in the proper >> location. >> + */ >> + if (devinfo->gen < 7) { >> + fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), >> + inst->dst.type); >> + ibld.MOV(imm, inst->src[1]); >> + ibld.MUL(inst->dst, imm, inst->src[0]); >> + } else { >> + ibld.MUL(inst->dst, inst->src[0], inst->src[1]); >> + } >> + } else { >> + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) >> cannot >> + * do 32-bit integer multiplication in one instruction, but >> instead >> + * must do a sequence (which actually calculates a 64-bit >> result): >> + * >> + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D >> + * mach(8) null g3<8,8,1>D g4<8,8,1>D >> + * mov(8) g2<1>D acc0<8,8,1>D >> + * >> + * But on Gen > 6, the ability to use second accumulator >> register >> + * (acc1) for non-float data types was removed, preventing a >> simple >> + * implementation in SIMD16. A 16-channel result can be >> calculated by >> + * executing the three instructions twice in SIMD8, once with >> quarter >> + * control of 1Q for the first eight channels and again with 2Q >> for >> + * the second eight channels. >> + * >> + * Which accumulator register is implicitly accessed (by >> AccWrEnable >> + * for instance) is determined by the quarter control. >> Unfortunately >> + * Ivybridge (and presumably Baytrail) has a hardware bug in >> which an >> + * implicit accumulator access by an instruction with 2Q will >> access >> + * acc1 regardless of whether the data type is usable in acc1. >> + * >> + * Specifically, the 2Q mach(8) writes acc1 which does not >> exist for >> + * integer data types. >> + * >> + * Since we only want the low 32-bits of the result, we can do >> two >> + * 32-bit x 16-bit multiplies (like the mul and mach are >> doing), and >> + * adjust the high result and add them (like the mach is doing): >> + * >> + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW >> + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW >> + * shl(8) g9<1>D g8<8,8,1>D 16D >> + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D >> + * >> + * We avoid the shl instruction by realizing that we only want >> to add >> + * the low 16-bits of the "high" result to the high 16-bits of >> the >> + * "low" result and using proper regioning on the add: >> + * >> + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW >> + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW >> + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW >> + * >> + * Since it does not use the (single) accumulator register, we >> can >> + * schedule multi-component multiplications much better. >> + */ >> + >> + if (inst->conditional_mod && inst->dst.is_null()) { >> + inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), >> + inst->dst.type); >> + } >> + fs_reg low = inst->dst; >> + fs_reg high(GRF, alloc.allocate(dispatch_width / 8), >> + inst->dst.type); >> >> - if (devinfo->gen >= 7) { >> - fs_reg src1_0_w = inst->src[1]; >> - fs_reg src1_1_w = inst->src[1]; >> + if (devinfo->gen >= 7) { >> + fs_reg src1_0_w = inst->src[1]; >> + fs_reg src1_1_w = inst->src[1]; >> >> - if (inst->src[1].file == IMM) { >> - src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; >> - src1_1_w.fixed_hw_reg.dw1.ud >>= 16; >> - } else { >> - src1_0_w.type = BRW_REGISTER_TYPE_UW; >> - if (src1_0_w.stride != 0) { >> - assert(src1_0_w.stride == 1); >> - src1_0_w.stride = 2; >> + if (inst->src[1].file == IMM) { >> + src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; >> + src1_1_w.fixed_hw_reg.dw1.ud >>= 16; >> + } else { >> + src1_0_w.type = BRW_REGISTER_TYPE_UW; >> + if (src1_0_w.stride != 0) { >> + assert(src1_0_w.stride == 1); >> + src1_0_w.stride = 2; >> + } >> + >> + src1_1_w.type = BRW_REGISTER_TYPE_UW; >> + if (src1_1_w.stride != 0) { >> + assert(src1_1_w.stride == 1); >> + src1_1_w.stride = 2; >> + } >> + src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); >> } >> + ibld.MUL(low, inst->src[0], src1_0_w); >> + ibld.MUL(high, inst->src[0], src1_1_w); >> + } else { >> + fs_reg src0_0_w = inst->src[0]; >> + fs_reg src0_1_w = inst->src[0]; >> >> - src1_1_w.type = BRW_REGISTER_TYPE_UW; >> - if (src1_1_w.stride != 0) { >> - assert(src1_1_w.stride == 1); >> - src1_1_w.stride = 2; >> + src0_0_w.type = BRW_REGISTER_TYPE_UW; >> + if (src0_0_w.stride != 0) { >> + assert(src0_0_w.stride == 1); >> + src0_0_w.stride = 2; >> } >> - src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); >> - } >> - ibld.MUL(low, inst->src[0], src1_0_w); >> - ibld.MUL(high, inst->src[0], src1_1_w); >> - } else { >> - fs_reg src0_0_w = inst->src[0]; >> - fs_reg src0_1_w = inst->src[0]; >> >> - src0_0_w.type = BRW_REGISTER_TYPE_UW; >> - if (src0_0_w.stride != 0) { >> - assert(src0_0_w.stride == 1); >> - src0_0_w.stride = 2; >> - } >> + src0_1_w.type = BRW_REGISTER_TYPE_UW; >> + if (src0_1_w.stride != 0) { >> + assert(src0_1_w.stride == 1); >> + src0_1_w.stride = 2; >> + } >> + src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); >> >> - src0_1_w.type = BRW_REGISTER_TYPE_UW; >> - if (src0_1_w.stride != 0) { >> - assert(src0_1_w.stride == 1); >> - src0_1_w.stride = 2; >> + ibld.MUL(low, src0_0_w, inst->src[1]); >> + ibld.MUL(high, src0_1_w, inst->src[1]); >> } >> - src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); >> >> - ibld.MUL(low, src0_0_w, inst->src[1]); >> - ibld.MUL(high, src0_1_w, inst->src[1]); >> - } >> + fs_reg dst = inst->dst; >> + dst.type = BRW_REGISTER_TYPE_UW; >> + dst.subreg_offset = 2; >> + dst.stride = 2; >> >> - fs_reg dst = inst->dst; >> - dst.type = BRW_REGISTER_TYPE_UW; >> - dst.subreg_offset = 2; >> - dst.stride = 2; >> + high.type = BRW_REGISTER_TYPE_UW; >> + high.stride = 2; >> >> - high.type = BRW_REGISTER_TYPE_UW; >> - high.stride = 2; >> + low.type = BRW_REGISTER_TYPE_UW; >> + low.subreg_offset = 2; >> + low.stride = 2; >> >> - low.type = BRW_REGISTER_TYPE_UW; >> - low.subreg_offset = 2; >> - low.stride = 2; >> + ibld.ADD(dst, low, high); >> >> - ibld.ADD(dst, low, high); >> + if (inst->conditional_mod) { >> + fs_reg null(retype(ibld.null_reg_f(), inst->dst.type)); >> + set_condmod(inst->conditional_mod, >> + ibld.MOV(null, inst->dst)); >> + } >> + } >> >> - if (inst->conditional_mod) { >> - fs_reg null(retype(ibld.null_reg_f(), inst->dst.type)); >> - set_condmod(inst->conditional_mod, >> - ibld.MOV(null, inst->dst)); >> } >> + > > Extra newline before the else. > >> + } else { >> + continue; >> } > > There's a build-break here, probably a rebase problem in splitting > this from the next patch. There seems to be an extra } that should be > in the next patch. >
Oops, good catch, I'll fix that. > With that fixed, these five patches are > > Reviewed-by: Matt Turner <matts...@gmail.com> Thanks.
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev