Module: Mesa Branch: main Commit: 1a3197094649487d8ca6f3233e206d4d869746f4 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=1a3197094649487d8ca6f3233e206d4d869746f4
Author: Caio Oliveira <caio.olive...@intel.com> Date: Wed Jan 10 13:32:22 2024 -0800 intel/compiler/xe2: Implement instruction compaction for DPAS. These use different tables but map to the same bits, so it is just a matter of picking the right tables for the instruction. Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26860> --- src/intel/compiler/brw_eu_compact.c | 127 ++++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 33 deletions(-) diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c index 077c68cb21c..ac6916402ca 100644 --- a/src/intel/compiler/brw_eu_compact.c +++ b/src/intel/compiler/brw_eu_compact.c @@ -1094,6 +1094,25 @@ static const uint64_t xe2_3src_control_index_table[16] = { 0b0000011011000011101100000000000011, /* (8|M0) arf<1>:df :df :df :df */ }; +static const uint64_t xe2_3src_dpas_control_index_table[16] = { + 0b0000000000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub Atomic */ + 0b0000000100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :b Atomic */ + 0b0000100000111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :ub Atomic */ + 0b0000100100111110011001000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b Atomic */ + 0b0000000000111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :ub :ub */ + 0b0000100100111110011000000000000100, /* dpas.8x* (16|M0) grf:d :d :b :b */ + 0b0000101101111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf Atomic */ + 0b0000101101111101101001000000000100, /* dpas.8x* (16|M0) grf:f :bf :bf :bf Atomic */ + 0b0000101101111010110101000000000100, /* dpas.8x* (16|M0) grf:bf :f :bf :bf Atomic */ + 0b0000101101111101110101000000000100, /* dpas.8x* (16|M0) grf:bf :bf :bf :bf Atomic */ + 0b0000101101111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :bf :bf */ + 0b0000001001111010101001000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf Atomic */ + 0b0000001001111001101001000000000100, /* dpas.8x* (16|M0) grf:f :hf :hf :hf Atomic */ + 0b0000001001111010100101000000000100, /* dpas.8x* (16|M0) grf:hf :f :hf :hf Atomic */ + 0b0000001001111001100101000000000100, /* dpas.8x* (16|M0) grf:hf :hf :hf :hf Atomic */ + 0b0000001001111010101000000000000100, /* dpas.8x* (16|M0) grf:f :f :hf :hf */ +}; + static const uint32_t gfx12_3src_source_index_table[32] = { 0b100101100001100000000, /* grf<0;0> grf<8;1> grf<0> */ 0b100101100001001000010, /* arf<4;1> grf<8;1> grf<0> */ @@ -1186,6 +1205,28 @@ static const uint32_t xe2_3src_source_index_table[16] = { 0b100100010001000000001, /* arf<1;0> -grf<1;0> grf<0> */ }; +static const uint32_t xe2_3src_dpas_source_index_table[16] = { + 0b100100000000100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[ub,b] + * dpas.*x1 grf:[f,bf] grf:bf grf:bf + * dpas.*x1 grf:[f,hf] grf:hf grf:hf + */ + 0b100100000010100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100000100100000000, /* dpas.*x1 grf:d grf:[ub,b] grf:[u2,s2] */ + 0b100100001000100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001010100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100001100100000000, /* dpas.*x1 grf:d grf:[u4,s4] grf:[u2,s2] */ + 0b100100010000100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[ub,b] */ + 0b100100010010100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u4,s4] */ + 0b100100010100100000000, /* dpas.*x1 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100000000100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[ub,b] */ + 0b100100000010100000010, /* dpas.*x2 grf:d grf:[ub,b] grf:[u4,s4] */ + 0b100100001000100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[ub,b] */ + 0b100100001010100000010, /* dpas.*x2 grf:d grf:[u4,s4] grf:[u4,s4] */ + 0b100100010100100000010, /* dpas.*x2 grf:d grf:[u2,s2] grf:[u2,s2] */ + 0b100100000000100001110, /* dpas.*x8 grf:d grf:[ub,b] grf:[ub,b] */ + 0b100100001010100001110, /* dpas.*x8 grf:d grf:[u4,s4] grf:[u4,s4] */ +}; + static const uint32_t gfx12_3src_subreg_table[32] = { 0b00000000000000000000, /* .0 .0 .0 .0 */ 0b00100000000000000000, /* .0 .0 .0 .4 */ @@ -1488,18 +1529,20 @@ set_src1_index(const struct compaction_state *c, brw_compact_inst *dst, static bool set_3src_control_index(const struct intel_device_info *devinfo, - brw_compact_inst *dst, const brw_inst *src) + brw_compact_inst *dst, const brw_inst *src, + bool is_dpas) { assert(devinfo->ver >= 8); if (devinfo->ver >= 20) { + assert(is_dpas || !brw_inst_bits(src, 49, 49)); + const uint64_t uncompacted = /* 34b/Xe2+ */ (brw_inst_bits(src, 95, 92) << 30) | /* 4b */ (brw_inst_bits(src, 90, 88) << 27) | /* 3b */ (brw_inst_bits(src, 82, 80) << 24) | /* 3b */ (brw_inst_bits(src, 50, 50) << 23) | /* 1b */ - 0 | /* 1b */ - (brw_inst_bits(src, 48, 48) << 21) | /* 1b */ + (brw_inst_bits(src, 49, 48) << 21) | /* 2b */ (brw_inst_bits(src, 42, 40) << 18) | /* 3b */ (brw_inst_bits(src, 39, 39) << 17) | /* 1b */ (brw_inst_bits(src, 38, 36) << 14) | /* 3b */ @@ -1512,8 +1555,15 @@ set_3src_control_index(const struct intel_device_info *devinfo, (brw_inst_bits(src, 23, 21) << 3) | /* 3b */ (brw_inst_bits(src, 20, 18)); /* 3b */ - for (unsigned i = 0; i < ARRAY_SIZE(xe2_3src_control_index_table); i++) { - if (xe2_3src_control_index_table[i] == uncompacted) { + /* The bits used to index the tables for 3src and 3src-dpas + * are the same, so just need to pick the right one. + */ + const uint64_t *table = is_dpas ? xe2_3src_dpas_control_index_table : + xe2_3src_control_index_table; + const unsigned size = is_dpas ? ARRAY_SIZE(xe2_3src_dpas_control_index_table) : + ARRAY_SIZE(xe2_3src_control_index_table); + for (unsigned i = 0; i < size; i++) { + if (table[i] == uncompacted) { brw_compact_inst_set_3src_control_index(devinfo, dst, i); return true; } @@ -1595,7 +1645,8 @@ set_3src_control_index(const struct intel_device_info *devinfo, static bool set_3src_source_index(const struct intel_device_info *devinfo, - brw_compact_inst *dst, const brw_inst *src) + brw_compact_inst *dst, const brw_inst *src, + bool is_dpas) { assert(devinfo->ver >= 8); @@ -1617,12 +1668,17 @@ set_3src_source_index(const struct intel_device_info *devinfo, (brw_inst_bits(src, 43, 43) << 1) | /* 1b */ (brw_inst_bits(src, 35, 35)); /* 1b */ + /* In Xe2, the bits used to index the tables for 3src and 3src-dpas + * are the same, so just need to pick the right one. + */ const uint32_t *three_src_source_index_table = - devinfo->ver >= 20 ? xe2_3src_source_index_table : + devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table : + xe2_3src_source_index_table) : devinfo->verx10 >= 125 ? xehp_3src_source_index_table : gfx12_3src_source_index_table; const uint32_t three_src_source_index_table_len = - devinfo->ver >= 20 ? ARRAY_SIZE(xe2_3src_source_index_table) : + devinfo->ver >= 20 ? (is_dpas ? ARRAY_SIZE(xe2_3src_dpas_source_index_table) : + ARRAY_SIZE(xe2_3src_source_index_table)) : devinfo->verx10 >= 125 ? ARRAY_SIZE(xehp_3src_source_index_table) : ARRAY_SIZE(gfx12_3src_source_index_table); @@ -1727,22 +1783,19 @@ has_unmapped_bits(const struct brw_isa_info *isa, const brw_inst *src) } static bool -has_3src_unmapped_bits(const struct brw_isa_info *isa, - const brw_inst *src) +has_3src_unmapped_bits(const struct intel_device_info *devinfo, + const brw_inst *src, bool is_dpas) { - const struct intel_device_info *devinfo = isa->devinfo; - /* Check for three-source instruction bits that don't map to any of the * fields of the compacted instruction. All of them seem to be reserved * bits currently. */ - ASSERTED enum opcode opcode = brw_inst_opcode(isa, src); if (devinfo->ver >= 20) { - assert(opcode == BRW_OPCODE_DPAS || !brw_inst_bits(src, 49, 49)); + assert(is_dpas || !brw_inst_bits(src, 49, 49)); assert(!brw_inst_bits(src, 33, 33)); assert(!brw_inst_bits(src, 7, 7)); } else if (devinfo->ver >= 12) { - assert(opcode == BRW_OPCODE_DPAS || !brw_inst_bits(src, 49, 49)); + assert(is_dpas || !brw_inst_bits(src, 49, 49)); assert(!brw_inst_bits(src, 7, 7)); } else if (devinfo->ver >= 9 || devinfo->platform == INTEL_PLATFORM_CHV) { assert(!brw_inst_bits(src, 127, 127) && @@ -1769,7 +1822,8 @@ brw_try_compact_3src_instruction(const struct brw_isa_info *isa, const struct intel_device_info *devinfo = isa->devinfo; assert(devinfo->ver >= 8); - if (has_3src_unmapped_bits(isa, src)) + bool is_dpas = brw_inst_opcode(isa, src) == BRW_OPCODE_DPAS; + if (has_3src_unmapped_bits(devinfo, src, is_dpas)) return false; #define compact(field) \ @@ -1779,10 +1833,10 @@ brw_try_compact_3src_instruction(const struct brw_isa_info *isa, compact(hw_opcode); - if (!set_3src_control_index(devinfo, dst, src)) + if (!set_3src_control_index(devinfo, dst, src, is_dpas)) return false; - if (!set_3src_source_index(devinfo, dst, src)) + if (!set_3src_source_index(devinfo, dst, src, is_dpas)) return false; if (devinfo->ver >= 12) { @@ -2340,20 +2394,22 @@ set_uncompacted_src1(const struct compaction_state *c, brw_inst *dst, static void set_uncompacted_3src_control_index(const struct compaction_state *c, - brw_inst *dst, brw_compact_inst *src) + brw_inst *dst, brw_compact_inst *src, + bool is_dpas) { const struct intel_device_info *devinfo = c->isa->devinfo; assert(devinfo->ver >= 8); if (devinfo->ver >= 20) { uint64_t compacted = brw_compact_inst_3src_control_index(devinfo, src); - uint64_t uncompacted = xe2_3src_control_index_table[compacted]; + uint64_t uncompacted = is_dpas ? xe2_3src_dpas_control_index_table[compacted] : + xe2_3src_control_index_table[compacted]; brw_inst_set_bits(dst, 95, 92, (uncompacted >> 30) & 0xf); brw_inst_set_bits(dst, 90, 88, (uncompacted >> 27) & 0x7); brw_inst_set_bits(dst, 82, 80, (uncompacted >> 24) & 0x7); brw_inst_set_bits(dst, 50, 50, (uncompacted >> 23) & 0x1); - brw_inst_set_bits(dst, 48, 48, (uncompacted >> 21) & 0x1); + brw_inst_set_bits(dst, 49, 48, (uncompacted >> 21) & 0x3); brw_inst_set_bits(dst, 42, 40, (uncompacted >> 18) & 0x7); brw_inst_set_bits(dst, 39, 39, (uncompacted >> 17) & 0x1); brw_inst_set_bits(dst, 38, 36, (uncompacted >> 14) & 0x7); @@ -2425,7 +2481,8 @@ set_uncompacted_3src_control_index(const struct compaction_state *c, static void set_uncompacted_3src_source_index(const struct intel_device_info *devinfo, - brw_inst *dst, brw_compact_inst *src) + brw_inst *dst, brw_compact_inst *src, + bool is_dpas) { assert(devinfo->ver >= 8); @@ -2433,7 +2490,8 @@ set_uncompacted_3src_source_index(const struct intel_device_info *devinfo, if (devinfo->ver >= 12) { const uint32_t *three_src_source_index_table = - devinfo->ver >= 20 ? xe2_3src_source_index_table : + devinfo->ver >= 20 ? (is_dpas ? xe2_3src_dpas_source_index_table : + xe2_3src_source_index_table) : devinfo->verx10 >= 125 ? xehp_3src_source_index_table : gfx12_3src_source_index_table; uint32_t uncompacted = three_src_source_index_table[compacted]; @@ -2491,7 +2549,7 @@ set_uncompacted_3src_subreg_index(const struct intel_device_info *devinfo, static void brw_uncompact_3src_instruction(const struct compaction_state *c, - brw_inst *dst, brw_compact_inst *src) + brw_inst *dst, brw_compact_inst *src, bool is_dpas) { const struct intel_device_info *devinfo = c->isa->devinfo; assert(devinfo->ver >= 8); @@ -2504,8 +2562,8 @@ brw_uncompact_3src_instruction(const struct compaction_state *c, uncompact(hw_opcode); if (devinfo->ver >= 12) { - set_uncompacted_3src_control_index(c, dst, src); - set_uncompacted_3src_source_index(devinfo, dst, src); + set_uncompacted_3src_control_index(c, dst, src, is_dpas); + set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas); set_uncompacted_3src_subreg_index(devinfo, dst, src); uncompact(debug_control); @@ -2515,8 +2573,8 @@ brw_uncompact_3src_instruction(const struct compaction_state *c, uncompact(src1_reg_nr); uncompact(src2_reg_nr); } else { - set_uncompacted_3src_control_index(c, dst, src); - set_uncompacted_3src_source_index(devinfo, dst, src); + set_uncompacted_3src_control_index(c, dst, src, is_dpas); + set_uncompacted_3src_source_index(devinfo, dst, src, is_dpas); uncompact(dst_reg_nr); uncompact_a16(src0_rep_ctrl); @@ -2544,11 +2602,14 @@ uncompact_instruction(const struct compaction_state *c, brw_inst *dst, const struct intel_device_info *devinfo = c->isa->devinfo; memset(dst, 0, sizeof(*dst)); - if (devinfo->ver >= 8 && - is_3src(c->isa, brw_opcode_decode(c->isa, - brw_compact_inst_3src_hw_opcode(devinfo, src)))) { - brw_uncompact_3src_instruction(c, dst, src); - return; + if (devinfo->ver >= 8) { + const enum opcode opcode = + brw_opcode_decode(c->isa, brw_compact_inst_3src_hw_opcode(devinfo, src)); + if (is_3src(c->isa, opcode)) { + const bool is_dpas = opcode == BRW_OPCODE_DPAS; + brw_uncompact_3src_instruction(c, dst, src, is_dpas); + return; + } } #define uncompact(field) \