Re: [Mesa-dev] [PATCH 02/10] intel/fs: Implement quad swizzles on ICL+.
On Sat, 2018-12-29 at 12:38 -0800, Francisco Jerez wrote: > Align16 is no longer a thing, so a new implementation is provided > using Align1 instead. Not all possible swizzles can be represented > as > a single Align1 region, but some fast paths are provided for > frequently used swizzles that can be represented efficiently in > Align1 > mode. > > Fixes ~90 subgroup quad swap Vulkan CTS tests. > > Cc: mesa-sta...@lists.freedesktop.org > --- > src/intel/compiler/brw_fs.cpp | 25 +++- > src/intel/compiler/brw_fs.h | 4 ++ > src/intel/compiler/brw_fs_generator.cpp | 82 --- > -- > 3 files changed, 93 insertions(+), 18 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp > b/src/intel/compiler/brw_fs.cpp > index 2f0f0151219..97544fdf465 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard() > const > * may stomp all over it. > */ >return true; > + case SHADER_OPCODE_QUAD_SWIZZLE: > + switch (src[1].ud) { Maybe it is worth adding a small comment here indicating that these are the cases where we implement the opcode as a single instruction and refer to the generator for details? > + case BRW_SWIZZLE_: > + case BRW_SWIZZLE_: > + case BRW_SWIZZLE_: > + case BRW_SWIZZLE_: > + case BRW_SWIZZLE_XXZZ: > + case BRW_SWIZZLE_YYWW: > + case BRW_SWIZZLE_XYXY: > + case BRW_SWIZZLE_ZWZW: > + return false; > + default: > + return !is_uniform(src[0]); Shouldn't this be: return !is_uniform(src[0]) || (devinfo->gen < 11 && type_sz(src.type) == 4); Since in that case we also implement the opcode with a single ALIGN16 instruction. > + } > default: >/* The SIMD16 compressed instruction > * > @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct > gen_device_info *devinfo, > case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: >return MIN2(8, inst->exec_size); > > - case SHADER_OPCODE_QUAD_SWIZZLE: > - return 8; > - > + case SHADER_OPCODE_QUAD_SWIZZLE: { > + const unsigned swiz = inst->src[1].ud; > + return (is_uniform(inst->src[0]) ? > + get_fpu_lowered_simd_width(devinfo, inst) : > + devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? > 8 : > + swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? > 4 : > + get_fpu_lowered_simd_width(devinfo, inst)); > + } > case SHADER_OPCODE_MOV_INDIRECT: { >/* From IVB and HSW PRMs: > * > diff --git a/src/intel/compiler/brw_fs.h > b/src/intel/compiler/brw_fs.h > index 53d9b6ce7bf..dc36ecc21ac 100644 > --- a/src/intel/compiler/brw_fs.h > +++ b/src/intel/compiler/brw_fs.h > @@ -480,6 +480,10 @@ private: > struct brw_reg src, > struct brw_reg idx); > > + void generate_quad_swizzle(const fs_inst *inst, > + struct brw_reg dst, struct brw_reg > src, > + unsigned swiz); > + > bool patch_discard_jumps_to_fb_writes(); > > const struct brw_compiler *compiler; > diff --git a/src/intel/compiler/brw_fs_generator.cpp > b/src/intel/compiler/brw_fs_generator.cpp > index 08dd83dded7..84627e83132 100644 > --- a/src/intel/compiler/brw_fs_generator.cpp > +++ b/src/intel/compiler/brw_fs_generator.cpp > @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst, > } > } > > +void > +fs_generator::generate_quad_swizzle(const fs_inst *inst, > +struct brw_reg dst, struct > brw_reg src, > +unsigned swiz) > +{ > + /* Requires a quad. */ > + assert(inst->exec_size >= 4); > + > + if (src.file == BRW_IMMEDIATE_VALUE || > + has_scalar_region(src)) { > + /* The value is uniform across all channels */ > + brw_MOV(p, dst, src); > + > + } else if (devinfo->gen < 11 && type_sz(src.type) == 4) { > + /* This only works on 8-wide 32-bit values */ > + assert(inst->exec_size == 8); > + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); > + assert(src.vstride == src.width + 1); > + brw_set_default_access_mode(p, BRW_ALIGN_16); > + struct brw_reg swiz_src = stride(src, 4, 4, 1); > + swiz_src.swizzle = swiz; > + brw_MOV(p, dst, swiz_src); > + Extra blank line. > + } else { > + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); > + assert(src.vstride == src.width + 1); > + const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, > 0)); > + > + switch (swiz) { > + case BRW_SWIZZLE_: > + case BRW_SWIZZLE_: > + case BRW_SWIZZLE_: > + case BRW_SWIZZLE_: > + brw_MOV(p, dst, stride(src_0, 4, 4, 0)); > + break; > + > + case BRW_SWIZZLE_XXZZ: > + case BRW_SWIZZLE_YYWW: > +
[Mesa-dev] [PATCH 02/10] intel/fs: Implement quad swizzles on ICL+.
Align16 is no longer a thing, so a new implementation is provided using Align1 instead. Not all possible swizzles can be represented as a single Align1 region, but some fast paths are provided for frequently used swizzles that can be represented efficiently in Align1 mode. Fixes ~90 subgroup quad swap Vulkan CTS tests. Cc: mesa-sta...@lists.freedesktop.org --- src/intel/compiler/brw_fs.cpp | 25 +++- src/intel/compiler/brw_fs.h | 4 ++ src/intel/compiler/brw_fs_generator.cpp | 82 - 3 files changed, 93 insertions(+), 18 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 2f0f0151219..97544fdf465 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard() const * may stomp all over it. */ return true; + case SHADER_OPCODE_QUAD_SWIZZLE: + switch (src[1].ud) { + case BRW_SWIZZLE_: + case BRW_SWIZZLE_: + case BRW_SWIZZLE_: + case BRW_SWIZZLE_: + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + return false; + default: + return !is_uniform(src[0]); + } default: /* The SIMD16 compressed instruction * @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: return MIN2(8, inst->exec_size); - case SHADER_OPCODE_QUAD_SWIZZLE: - return 8; - + case SHADER_OPCODE_QUAD_SWIZZLE: { + const unsigned swiz = inst->src[1].ud; + return (is_uniform(inst->src[0]) ? + get_fpu_lowered_simd_width(devinfo, inst) : + devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 : + swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 : + get_fpu_lowered_simd_width(devinfo, inst)); + } case SHADER_OPCODE_MOV_INDIRECT: { /* From IVB and HSW PRMs: * diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 53d9b6ce7bf..dc36ecc21ac 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -480,6 +480,10 @@ private: struct brw_reg src, struct brw_reg idx); + void generate_quad_swizzle(const fs_inst *inst, + struct brw_reg dst, struct brw_reg src, + unsigned swiz); + bool patch_discard_jumps_to_fb_writes(); const struct brw_compiler *compiler; diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 08dd83dded7..84627e83132 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst, } } +void +fs_generator::generate_quad_swizzle(const fs_inst *inst, +struct brw_reg dst, struct brw_reg src, +unsigned swiz) +{ + /* Requires a quad. */ + assert(inst->exec_size >= 4); + + if (src.file == BRW_IMMEDIATE_VALUE || + has_scalar_region(src)) { + /* The value is uniform across all channels */ + brw_MOV(p, dst, src); + + } else if (devinfo->gen < 11 && type_sz(src.type) == 4) { + /* This only works on 8-wide 32-bit values */ + assert(inst->exec_size == 8); + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src.vstride == src.width + 1); + brw_set_default_access_mode(p, BRW_ALIGN_16); + struct brw_reg swiz_src = stride(src, 4, 4, 1); + swiz_src.swizzle = swiz; + brw_MOV(p, dst, swiz_src); + + } else { + assert(src.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(src.vstride == src.width + 1); + const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0)); + + switch (swiz) { + case BRW_SWIZZLE_: + case BRW_SWIZZLE_: + case BRW_SWIZZLE_: + case BRW_SWIZZLE_: + brw_MOV(p, dst, stride(src_0, 4, 4, 0)); + break; + + case BRW_SWIZZLE_XXZZ: + case BRW_SWIZZLE_YYWW: + brw_MOV(p, dst, stride(src_0, 2, 2, 0)); + break; + + case BRW_SWIZZLE_XYXY: + case BRW_SWIZZLE_ZWZW: + assert(inst->exec_size == 4); + brw_MOV(p, dst, stride(src_0, 0, 2, 1)); + break; + + default: + assert(inst->force_writemask_all); + brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1); + + for (unsigned c = 0; c < 4; c++) { +brw_inst *insn = brw_MOV( + p, stride(suboffset(dst, c), + 4 * inst->dst.stride, 1, 4 * inst->dst.stride), + stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0)); + +brw_inst_set_no_dd_clear(devinfo, insn, c < 3); +