Module: Mesa Branch: main Commit: ae54cbeb3f40abebb8f534c69de620d6a0ca4b2b URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=ae54cbeb3f40abebb8f534c69de620d6a0ca4b2b
Author: Rhys Perry <pendingchao...@gmail.com> Date: Mon Nov 20 15:53:39 2023 +0000 nir: remove sad_u8x4 All uses of this can be replaced with msad_4x8. Signed-off-by: Rhys Perry <pendingchao...@gmail.com> Reviewed-by: Georg Lehmann <dadschoo...@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26907> --- src/amd/common/ac_nir_lower_ngg.c | 8 ++++---- src/amd/compiler/aco_instruction_selection.cpp | 5 ----- .../compiler/aco_instruction_selection_setup.cpp | 1 - src/amd/llvm/ac_nir_to_llvm.c | 5 ----- src/compiler/nir/nir_opcodes.py | 24 ---------------------- src/compiler/nir/nir_range_analysis.c | 1 - 6 files changed, 4 insertions(+), 40 deletions(-) diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index 245b7d27453..987f5b8fed8 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -284,7 +284,7 @@ summarize_repack(nir_builder *b, nir_def *packed_counts, unsigned num_lds_dwords * * If the v_dot instruction can't be used, we left-shift the packed bytes. * This will shift out the unneeded bytes and shift in zeroes instead, - * then we sum them using v_sad_u8. + * then we sum them using v_msad_u8. */ nir_def *lane_id = nir_load_subgroup_invocation(b); @@ -302,7 +302,7 @@ summarize_repack(nir_builder *b, nir_def *packed_counts, unsigned num_lds_dwords return nir_udot_4x8_uadd(b, packed, dot_op, nir_imm_int(b, 0)); } else { nir_def *sad_op = nir_ishl(b, nir_ishl(b, packed, shift), shift); - return nir_sad_u8x4(b, sad_op, nir_imm_int(b, 0), nir_imm_int(b, 0)); + return nir_msad_4x8(b, sad_op, nir_imm_int(b, 0), nir_imm_int(b, 0)); } } else if (num_lds_dwords == 2) { nir_def *dot_op = !use_dot ? NULL : nir_ushr(b, nir_ushr(b, nir_imm_int64(b, 0x0101010101010101), shift), shift); @@ -317,8 +317,8 @@ summarize_repack(nir_builder *b, nir_def *packed_counts, unsigned num_lds_dwords return nir_udot_4x8_uadd(b, packed_dw1, nir_unpack_64_2x32_split_y(b, dot_op), sum); } else { nir_def *sad_op = nir_ishl(b, nir_ishl(b, nir_pack_64_2x32_split(b, packed_dw0, packed_dw1), shift), shift); - nir_def *sum = nir_sad_u8x4(b, nir_unpack_64_2x32_split_x(b, sad_op), nir_imm_int(b, 0), nir_imm_int(b, 0)); - return nir_sad_u8x4(b, nir_unpack_64_2x32_split_y(b, sad_op), nir_imm_int(b, 0), sum); + nir_def *sum = nir_msad_4x8(b, nir_unpack_64_2x32_split_x(b, sad_op), nir_imm_int(b, 0), nir_imm_int(b, 0)); + return nir_msad_4x8(b, nir_unpack_64_2x32_split_y(b, sad_op), nir_imm_int(b, 0), sum); } } else { unreachable("Unimplemented NGG wave count"); diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index be1468f1fd5..1fe6918aa51 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3420,11 +3420,6 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } break; } - case nir_op_sad_u8x4: { - assert(dst.regClass() == v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false); - break; - } case nir_op_msad_4x8: { assert(dst.regClass() == v1); emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 9004ef2e632..32e9a08a808 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -392,7 +392,6 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_op_frexp_sig: case nir_op_frexp_exp: case nir_op_cube_amd: - case nir_op_sad_u8x4: case nir_op_msad_4x8: case nir_op_udot_4x8_uadd: case nir_op_sdot_4x8_iadd: diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index e712e2d6820..06dfaed1a39 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -1253,11 +1253,6 @@ static bool visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) break; } - case nir_op_sad_u8x4: - result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.sad.u8", ctx->ac.i32, - (LLVMValueRef[]){src[0], src[1], src[2]}, 3, 0); - break; - case nir_op_msad_4x8: result = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.msad.u8", ctx->ac.i32, (LLVMValueRef[]){src[1], src[0], src[2]}, 3, 0); diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 66ce98e46cf..0770351c988 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -1126,30 +1126,6 @@ if (bits == 0) { } """) -triop_horiz("sad_u8x4", 1, 1, 1, 1, """ -uint8_t s0_b0 = (src0.x & 0x000000ff) >> 0; -uint8_t s0_b1 = (src0.x & 0x0000ff00) >> 8; -uint8_t s0_b2 = (src0.x & 0x00ff0000) >> 16; -uint8_t s0_b3 = (src0.x & 0xff000000) >> 24; - -uint8_t s1_b0 = (src1.x & 0x000000ff) >> 0; -uint8_t s1_b1 = (src1.x & 0x0000ff00) >> 8; -uint8_t s1_b2 = (src1.x & 0x00ff0000) >> 16; -uint8_t s1_b3 = (src1.x & 0xff000000) >> 24; - -dst.x = src2.x + - (s0_b0 > s1_b0 ? (s0_b0 - s1_b0) : (s1_b0 - s0_b0)) + - (s0_b1 > s1_b1 ? (s0_b1 - s1_b1) : (s1_b1 - s0_b1)) + - (s0_b2 > s1_b2 ? (s0_b2 - s1_b2) : (s1_b2 - s0_b2)) + - (s0_b3 > s1_b3 ? (s0_b3 - s1_b3) : (s1_b3 - s0_b3)); -""", description = """ -Sum of absolute differences with accumulation. Equivalent to AMD's v_sad_u8 instruction. - -The first two sources contain packed 8-bit unsigned integers, the instruction will -calculate the absolute difference of these, and then add them together. There is also a -third source which is a 32-bit unsigned integer and added to the result. -""") - triop("msad_4x8", tuint32, "", """ dst = msad(src0, src1, src2); """, description = """ diff --git a/src/compiler/nir/nir_range_analysis.c b/src/compiler/nir/nir_range_analysis.c index 0ac5638f695..79c7ed92cfd 100644 --- a/src/compiler/nir/nir_range_analysis.c +++ b/src/compiler/nir/nir_range_analysis.c @@ -1864,7 +1864,6 @@ get_alu_uub(struct analysis_state *state, struct uub_query q, uint32_t *result, case nir_op_b2i32: *result = 1; break; - case nir_op_sad_u8x4: case nir_op_msad_4x8: *result = MIN2((uint64_t)src[2] + 4 * 255, UINT32_MAX); break;