Francisco Jerez <curroje...@riseup.net> writes: > Jason Ekstrand <ja...@jlekstrand.net> writes: > >> On Sat, May 26, 2018 at 12:15 PM, Francisco Jerez <curroje...@riseup.net> >> wrote: >> >>> Jason Ekstrand <ja...@jlekstrand.net> writes: >>> >>> > On Sat, May 26, 2018 at 11:27 AM, Francisco Jerez <curroje...@riseup.net >>> > >>> > wrote: >>> > >>> >> Jason Ekstrand <ja...@jlekstrand.net> writes: >>> >> >>> >> > Doing instruction header setup in the generator is aweful for a number >>> >> > of reasons. For one, we can't schedule the header setup at all. For >>> >> > another, it means lots of implied writes which the instruction >>> scheduler >>> >> > and other passes can't properly read about. The second isn't a huge >>> >> > problem for FB writes since they always happen at the end. We made a >>> >> > similar change to sampler handling in ff4726077d86. >>> >> > --- >>> >> > src/intel/compiler/brw_fs.cpp | 103 >>> >> ++++++++++++++++++++++++++------ >>> >> > src/intel/compiler/brw_fs_generator.cpp | 66 -------------------- >>> >> > 2 files changed, 86 insertions(+), 83 deletions(-) >>> >> > >>> >> > diff --git a/src/intel/compiler/brw_fs.cpp >>> b/src/intel/compiler/brw_fs. >>> >> cpp >>> >> > index 3e9ccc4..5276a66 100644 >>> >> > --- a/src/intel/compiler/brw_fs.cpp >>> >> > +++ b/src/intel/compiler/brw_fs.cpp >>> >> > @@ -3247,7 +3247,18 @@ fs_visitor::emit_repclear_shader() >>> >> > write->mlen = 1; >>> >> > } else { >>> >> > assume(key->nr_color_regions > 0); >>> >> > + >>> >> > + struct brw_reg header = >>> >> > + retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD); >>> >> > + bld.exec_all().group(16, 0) >>> >> > + .MOV(header, retype(brw_vec8_grf(0, 0), >>> BRW_REGISTER_TYPE_UD)); >>> >> > + >>> >> > for (int i = 0; i < key->nr_color_regions; ++i) { >>> >> > + if (i > 0) { >>> >> > + bld.exec_all().group(1, 0) >>> >> > + .MOV(component(header, 2), brw_imm_ud(i)); >>> >> > + } >>> >> > + >>> >> > write = bld.emit(FS_OPCODE_REP_FB_WRITE); >>> >> > write->saturate = key->clamp_fragment_color; >>> >> > write->base_mrf = base_mrf; >>> >> > @@ -3972,25 +3983,83 @@ lower_fb_write_logical_send(const fs_builder >>> >> &bld, fs_inst *inst, >>> >> > int header_size = 2, payload_header_size; >>> >> > unsigned length = 0; >>> >> > >>> >> > - /* From the Sandy Bridge PRM, volume 4, page 198: >>> >> > - * >>> >> > - * "Dispatched Pixel Enables. One bit per pixel indicating >>> >> > - * which pixels were originally enabled when the thread was >>> >> > - * dispatched. This field is only required for the end-of- >>> >> > - * thread message and on all dual-source messages." >>> >> > - */ >>> >> > - if (devinfo->gen >= 6 && >>> >> > - (devinfo->is_haswell || devinfo->gen >= 8 || >>> >> !prog_data->uses_kill) && >>> >> > - color1.file == BAD_FILE && >>> >> > - key->nr_color_regions == 1) { >>> >> > - header_size = 0; >>> >> > - } >>> >> > + if (devinfo->gen < 6) { >>> >> > + /* For gen4-5, we always have a header consisting of g0 and g1. >>> >> We have >>> >> > + * an implied MOV from g0,g1 to the start of the message. The >>> >> MOV from >>> >> > + * g0 is handled by the hardware and the MOV from g1 is >>> provided >>> >> by the >>> >> > + * generator. This is required because, on gen4-5, the >>> generator >>> >> may >>> >> > + * generate two write messages with different message lengths >>> in >>> >> order >>> >> > + * to handle AA data properly. >>> >> > + * >>> >> > + * Also, since the pixel mask goes in the g0 portion of the >>> >> message and >>> >> > + * since render target writes are the last thing in the shader, >>> >> we write >>> >> > + * the pixel mask directly into g0 and it will get copied as >>> part >>> >> of the >>> >> > + * implied write. >>> >> > + */ >>> >> > + if (prog_data->uses_kill) { >>> >> > + bld.exec_all().group(1, 0) >>> >> > + .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), >>> >> > + brw_flag_reg(0, 1)); >>> >> > + } >>> >> > + >>> >> > + assert(length == 0); >>> >> > + length = 2; >>> >> > + } else if ((devinfo->gen <= 7 && !devinfo->is_haswell && >>> >> > + prog_data->uses_kill) || >>> >> > + color1.file != BAD_FILE || >>> >> > + key->nr_color_regions > 1) { >>> >> > + /* From the Sandy Bridge PRM, volume 4, page 198: >>> >> > + * >>> >> > + * "Dispatched Pixel Enables. One bit per pixel indicating >>> >> > + * which pixels were originally enabled when the thread >>> was >>> >> > + * dispatched. This field is only required for the end-of- >>> >> > + * thread message and on all dual-source messages." >>> >> > + */ >>> >> > + const fs_builder ubld = bld.exec_all().group(8, 0); >>> >> > + >>> >> > + /* The header starts off as g0 and g1 */ >>> >> > + fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); >>> >> > + ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0), >>> >> > + BRW_REGISTER_TYPE_UD)); >>> >> > + >>> >> > + uint32_t g00_bits = 0; >>> >> > + >>> >> > + /* Set "Source0 Alpha Present to RenderTarget" bit in message >>> >> > + * header. >>> >> > + */ >>> >> > + if (inst->target > 0 && key->replicate_alpha) >>> >> > + g00_bits |= 1 << 11; >>> >> > + >>> >> > + /* Set computes stencil to render target */ >>> >> > + if (prog_data->computed_stencil) >>> >> > + g00_bits |= 1 << 14; >>> >> > + >>> >> > + if (g00_bits) { >>> >> > + /* OR extra bits into g0.0 */ >>> >> > + ubld.group(1, 0).OR(component(header, 0), >>> >> > + retype(brw_vec1_grf(0, 0), >>> >> > + BRW_REGISTER_TYPE_UD), >>> >> > + brw_imm_ud(g00_bits)); >>> >> > + } >>> >> > + >>> >> > + /* Set the render target index for choosing BLEND_STATE. */ >>> >> > + if (inst->target > 0) { >>> >> > + ubld.group(1, 0).MOV(component(header, 2), >>> >> brw_imm_ud(inst->target)); >>> >> > + } >>> >> > + >>> >> > + if (prog_data->uses_kill) { >>> >> > + ubld.group(1, 0).MOV(retype(component(header, 15), >>> >> > + BRW_REGISTER_TYPE_UW), >>> >> > + brw_flag_reg(0, 1)); >>> >> > + } >>> >> >>> >> The original patch from my branch implemented the header setup in a >>> >> helper function called setup_fb_write_header() which allowed it to share >>> >> the code among emit_repclear_shader() and lower_fb_write_logical_send(). >>> >> I don't think there is any benefit from open-coding the header setup in >>> >> the already complex enough callers, seems like an obfuscation to me >>> >> (particularly of the emit_repclear_shader() codepath with PATCH 52 in >>> >> mind). >>> >> >>> > >>> > Yeah, I think this one could go either way, TBH. Having it in a helper >>> > avoids some setup in emit_repclear_shader (though not that much) and is >>> > nice because it's sort-of functional in nature. The thing I like about >>> > this approach is that it combines the decision to use the header with the >>> > code to create said header. We could do that in a helper I suppose and >>> > have it return BAD_FILE if no header is needed. >>> > >>> >>> Yes, a helper could combine the decision of whether to send a header or >>> not pretty easily. >>> >>> > Also, the "allocate and return a register" model ended up resulting in >>> > worse code on gen4-5 because it didn't play nicely with the implied MRF >>> > write. On those platforms, it's easier to set the header up by doing the >>> > needed MOVs to g0 and letting the implied MRF write copy it into the >>> > message. >>> > >>> >>> I don't think there's much of a performance benefit from an implied MRF >>> write relative to a favorably scheduled single-GRF MOV on Gen4-5. >>> Actually under some conditions it can take more EU cycles because you >>> eat the latency of the implied MOV without being able to schedule >>> anything in between. >>> >> >> The bigger problem is that we either have to keep the impled MRF write or >> we have to handle the AA data workaround in the visitor as well. >> Ultimately, that's probably what we want to do. I'm trying to avoid >> debugging gen4... >> > > That's no big deal, the implied MRF write can probably stay for the > moment,
In the "runtime_check_aads_emit == true" case that is. > whether the header is set up in the visitor or in the generator. > The question is just whether the source of the implied move will be r0 > or the first payload register of the FB write. > >> >>> > As far as complexity goes, I'm not sure. Setting up FB writes is an >>> > inherently complex operation. Probably the biggest thing I didn't like >>> > about having it be in a helper was that the header code was around 7-800 >>> > lines away from the code to set up the rest of the FB write. That could >>> > have been solved by a pre-declaration, however. >>> > >>> >>> It's a couple of keystrokes away if your editor supports some sort of >>> search function. And I appreciate the possibility not to see the header >>> setup code while dealing with the FB write lowering code if it's not >>> what I'm looking for. And the possibility to share code which my >>> original patch took advantage of. >>> >>> > You are free to continue to disagree with my approach. Hopefully, I've >>> at >>> > least convinced you that I didn't do it just because I like straight >>> inline >>> > code (which I do like). >>> > >>> >>> Ugh... I was just thinking you had inlined it in order to >>> micro-optimize it, but knowing you actually prefer a bunch of hairy code >>> over a single call of a well-behaved abstraction (it was indeed >>> referentially transparent and close to a pure function) to achieve the >>> same effect makes me even more shocked ;P >>> >> >> We have always had different definitions of code cleanliness. :-) >> > > Didn't ever think we disagreed on structured programming being useful to > write code which is easily maintainable and reusable... :P > >> We can switch it back to being a function that returns a register though >> it'll take a little reworking of gen4 as mentioned above. I don't think it >> really saves us anything but whatever... >> >> --Jason >> >> >>> > --Jason >>> > >>> > >>> >> > >>> >> > - if (header_size != 0) { >>> >> > - assert(header_size == 2); >>> >> > - /* Allocate 2 registers for a header */ >>> >> > - length += 2; >>> >> > + assert(length == 0); >>> >> > + sources[0] = header; >>> >> > + sources[1] = horiz_offset(header, 8); >>> >> > + length = 2; >>> >> > } >>> >> > + assert(length == 0 || length == 2); >>> >> > + header_size = length; >>> >> > >>> >> > if (payload.aa_dest_stencil_reg) { >>> >> > sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1)); >>> >> > diff --git a/src/intel/compiler/brw_fs_generator.cpp >>> >> b/src/intel/compiler/brw_fs_generator.cpp >>> >> > index 8cc9d31..1a2b961 100644 >>> >> > --- a/src/intel/compiler/brw_fs_generator.cpp >>> >> > +++ b/src/intel/compiler/brw_fs_generator.cpp >>> >> > @@ -307,9 +307,6 @@ fs_generator::fire_fb_write(fs_inst *inst, >>> >> > void >>> >> > fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg >>> payload) >>> >> > { >>> >> > - struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_ >>> >> data); >>> >> > - const brw_wm_prog_key * const key = (brw_wm_prog_key * const) >>> >> this->key; >>> >> > - >>> >> > if (devinfo->gen < 8 && !devinfo->is_haswell) { >>> >> > brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); >>> >> > } >>> >> > @@ -320,69 +317,6 @@ fs_generator::generate_fb_write(fs_inst *inst, >>> >> struct brw_reg payload) >>> >> > if (inst->base_mrf >= 0) >>> >> > payload = brw_message_reg(inst->base_mrf); >>> >> > >>> >> > - /* Header is 2 regs, g0 and g1 are the contents. g0 will be >>> implied >>> >> > - * move, here's g1. >>> >> > - */ >>> >> > - if (inst->header_size != 0) { >>> >> > - brw_push_insn_state(p); >>> >> > - brw_set_default_mask_control(p, BRW_MASK_DISABLE); >>> >> > - brw_set_default_exec_size(p, BRW_EXECUTE_1); >>> >> > - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); >>> >> > - brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); >>> >> > - brw_set_default_flag_reg(p, 0, 0); >>> >> > - >>> >> > - /* On HSW, the GPU will use the predicate on SENDC, unless the >>> >> header is >>> >> > - * present. >>> >> > - */ >>> >> > - if (prog_data->uses_kill) { >>> >> > - struct brw_reg pixel_mask; >>> >> > - >>> >> > - if (devinfo->gen >= 6) >>> >> > - pixel_mask = retype(brw_vec1_grf(1, 7), >>> >> BRW_REGISTER_TYPE_UW); >>> >> > - else >>> >> > - pixel_mask = retype(brw_vec1_grf(0, 0), >>> >> BRW_REGISTER_TYPE_UW); >>> >> > - >>> >> > - brw_MOV(p, pixel_mask, brw_flag_reg(0, 1)); >>> >> > - } >>> >> > - >>> >> > - if (devinfo->gen >= 6) { >>> >> > - brw_push_insn_state(p); >>> >> > - brw_set_default_exec_size(p, BRW_EXECUTE_16); >>> >> > - brw_set_default_compression_control(p, >>> >> BRW_COMPRESSION_COMPRESSED); >>> >> > - brw_MOV(p, >>> >> > - retype(payload, BRW_REGISTER_TYPE_UD), >>> >> > - retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); >>> >> > - brw_pop_insn_state(p); >>> >> > - >>> >> > - if (inst->target > 0 && key->replicate_alpha) { >>> >> > - /* Set "Source0 Alpha Present to RenderTarget" bit in >>> >> message >>> >> > - * header. >>> >> > - */ >>> >> > - brw_OR(p, >>> >> > - vec1(retype(payload, BRW_REGISTER_TYPE_UD)), >>> >> > - vec1(retype(brw_vec8_grf(0, 0), >>> BRW_REGISTER_TYPE_UD)), >>> >> > - brw_imm_ud(0x1 << 11)); >>> >> > - } >>> >> > - >>> >> > - if (inst->target > 0) { >>> >> > - /* Set the render target index for choosing BLEND_STATE. */ >>> >> > - brw_MOV(p, retype(vec1(suboffset(payload, 2)), >>> >> > - BRW_REGISTER_TYPE_UD), >>> >> > - brw_imm_ud(inst->target)); >>> >> > - } >>> >> > - >>> >> > - /* Set computes stencil to render target */ >>> >> > - if (prog_data->computed_stencil) { >>> >> > - brw_OR(p, >>> >> > - vec1(retype(payload, BRW_REGISTER_TYPE_UD)), >>> >> > - vec1(retype(brw_vec8_grf(0, 0), >>> >> BRW_REGISTER_TYPE_UD)), >>> >> > - brw_imm_ud(0x1 << 14)); >>> >> > - } >>> >> > - } >>> >> > - >>> >> > - brw_pop_insn_state(p); >>> >> > - } >>> >> > - >>> >> > if (!runtime_check_aads_emit) { >>> >> > fire_fb_write(inst, payload, implied_header, inst->mlen); >>> >> > } else { >>> >> > -- >>> >> > 2.5.0.400.gff86faf >>> >> > >>> >> > _______________________________________________ >>> >> > mesa-dev mailing list >>> >> > mesa-dev@lists.freedesktop.org >>> >> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev >>> >> >>>
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev