Module: Mesa
Branch: main
Commit: df87c593f8a55f0a95359dc10bb4652b9ba19cde
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=df87c593f8a55f0a95359dc10bb4652b9ba19cde
Author: Marek Olšák <marek.ol...@amd.com> Date: Mon Oct 23 22:22:49 2023 -0400 radeonsi: rewrite PM4 packet building helpers with less duplication First, the following universal helpers are defined: - radeon_set_reg_seq - radeon_set_reg - radeon_opt_set_reg - radeon_opt_set_reg2 - radeon_opt_set_reg3 - radeon_opt_set_reg4 - radeon_opt_set_reg5 - radeon_opt_set_regn - gfx11_push_sh_reg - gfx11_opt_push_sh_reg Then the config, context, sh, uconfig, push_gfx and push_compute helpers are implemented calling the above. A lot of macros were receiving sctx via a parameter, which is changed to use sctx directly in the macro (and the parameter is renamed to "_unused"). The only functional change is that the perfctr registers that incorrectly set the predicate bit now correctly set the RESET_FILTER_CAM bit. The helpers no longer check info.uses_kernel_cu_mask. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26095> --- src/gallium/drivers/radeonsi/si_build_pm4.h | 481 +++++++++------------- src/gallium/drivers/radeonsi/si_perfcounter.c | 12 +- src/gallium/drivers/radeonsi/si_sqtt.c | 6 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 45 +- 4 files changed, 225 insertions(+), 319 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h index eba45c473f9..3f45226dcb4 100644 --- a/src/gallium/drivers/radeonsi/si_build_pm4.h +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h @@ -36,10 +36,10 @@ #define radeon_emit(value) __cs_buf[__cs_num++] = (value) #define radeon_packets_added() (__cs_num != __cs_num_initial) -#define radeon_end_update_context_roll(sctx) do { \ +#define radeon_end_update_context_roll(_unused) do { \ radeon_end(); \ if (radeon_packets_added()) \ - (sctx)->context_roll = true; \ + sctx->context_roll = true; \ } while (0) #define radeon_emit_array(values, num) do { \ @@ -56,337 +56,197 @@ __cs_num += (num); 
\ } while (0) -#define radeon_set_config_reg_seq(reg, num) do { \ - assert((reg) < SI_CONTEXT_REG_OFFSET); \ - radeon_emit(PKT3(PKT3_SET_CONFIG_REG, num, 0)); \ - radeon_emit(((reg) - SI_CONFIG_REG_OFFSET) >> 2); \ +/* Packet building helpers. Don't use directly. */ +#define radeon_set_reg_seq(reg, num, idx, prefix_name, packet, reset_filter_cam) do { \ + assert((reg) >= prefix_name##_REG_OFFSET && (reg) < prefix_name##_REG_END); \ + radeon_emit(PKT3(packet, num, 0) | PKT3_RESET_FILTER_CAM_S(reset_filter_cam)); \ + radeon_emit((((reg) - prefix_name##_REG_OFFSET) >> 2) | ((idx) << 28)); \ } while (0) -#define radeon_set_config_reg(reg, value) do { \ - radeon_set_config_reg_seq(reg, 1); \ +#define radeon_set_reg(reg, idx, value, prefix_name, packet) do { \ + radeon_set_reg_seq(reg, 1, idx, prefix_name, packet, 0); \ radeon_emit(value); \ } while (0) -#define radeon_set_context_reg_seq(reg, num) do { \ - assert((reg) >= SI_CONTEXT_REG_OFFSET); \ - radeon_emit(PKT3(PKT3_SET_CONTEXT_REG, num, 0)); \ - radeon_emit(((reg) - SI_CONTEXT_REG_OFFSET) >> 2); \ +#define radeon_opt_set_reg(reg, reg_enum, idx, value, prefix_name, packet, category) do { \ + unsigned __value = (value); \ + if (!((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1) || \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] != __value) { \ + radeon_set_reg(reg, idx, __value, prefix_name, packet); \ + sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_BIT(reg_enum); \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] = __value; \ + } \ } while (0) -#define radeon_set_context_reg(reg, value) do { \ - radeon_set_context_reg_seq(reg, 1); \ - radeon_emit(value); \ +/* Set consecutive registers if any value is different. 
*/ +#define radeon_opt_set_reg2(reg, reg_enum, v1, v2, prefix_name, packet, category) do { \ + unsigned __v1 = (v1), __v2 = (v2); \ + if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x3) != 0x3 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2) { \ + radeon_set_reg_seq(reg, 2, 0, prefix_name, packet, 0); \ + radeon_emit(__v1); \ + radeon_emit(__v2); \ + sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 2); \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ + } \ } while (0) -#define radeon_set_context_reg_seq_array(reg, num, values) do { \ - radeon_set_context_reg_seq(reg, num); \ - radeon_emit_array(values, num); \ +#define radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, prefix_name, packet, category) do { \ + unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3); \ + if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x7) != 0x7 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3) { \ + radeon_set_reg_seq(reg, 3, 0, prefix_name, packet, 0); \ + radeon_emit(__v1); \ + radeon_emit(__v2); \ + radeon_emit(__v3); \ + sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 3); \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ + } \ } while (0) -#define radeon_set_context_reg_idx(reg, idx, value) do { \ - assert((reg) >= SI_CONTEXT_REG_OFFSET); \ - radeon_emit(PKT3(PKT3_SET_CONTEXT_REG, 1, 0)); \ - radeon_emit(((reg) - SI_CONTEXT_REG_OFFSET) >> 2 | ((idx) << 28)); \ - radeon_emit(value); \ +#define radeon_opt_set_reg4(reg, 
reg_enum, v1, v2, v3, v4, prefix_name, packet, category) do { \ + unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4); \ + if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0xf) != 0xf || \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4) { \ + radeon_set_reg_seq(reg, 4, 0, prefix_name, packet, 0); \ + radeon_emit(__v1); \ + radeon_emit(__v2); \ + radeon_emit(__v3); \ + radeon_emit(__v4); \ + sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 4); \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \ + } \ } while (0) -#define radeon_set_sh_reg_seq(reg, num) do { \ - assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \ - radeon_emit(PKT3(PKT3_SET_SH_REG, num, 0)); \ - radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \ +#define radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, prefix_name, packet, category) do { \ + unsigned __v1 = (v1), __v2 = (v2), __v3 = (v3), __v4 = (v4), __v5 = (v5); \ + if (((sctx->tracked_regs.category##_reg_saved_mask >> (reg_enum)) & 0x1f) != 0x1f || \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] != __v1 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] != __v2 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] != __v3 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] != __v4 || \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 4] != __v5) { \ + radeon_set_reg_seq(reg, 5, 0, prefix_name, packet, 0); \ + radeon_emit(__v1); \ + radeon_emit(__v2); \ + radeon_emit(__v3); \ + radeon_emit(__v4); \ + 
radeon_emit(__v5); \ + sctx->tracked_regs.category##_reg_saved_mask |= BITFIELD64_RANGE(reg_enum, 5); \ + sctx->tracked_regs.category##_reg_value[(reg_enum)] = __v1; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 1] = __v2; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 2] = __v3; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 3] = __v4; \ + sctx->tracked_regs.category##_reg_value[(reg_enum) + 4] = __v5; \ + } \ } while (0) -#define radeon_set_sh_reg_idx_seq(sctx, reg, idx, num) do { \ - assert((reg) >= SI_SH_REG_OFFSET && (reg) < SI_SH_REG_END); \ - if ((sctx)->screen->info.uses_kernel_cu_mask) { \ - assert((sctx)->gfx_level >= GFX10); \ - radeon_emit(PKT3(PKT3_SET_SH_REG_INDEX, num, 0)); \ - radeon_emit((((reg) - SI_SH_REG_OFFSET) >> 2) | ((idx) << 28)); \ - } else { \ - radeon_emit(PKT3(PKT3_SET_SH_REG, num, 0)); \ - radeon_emit(((reg) - SI_SH_REG_OFFSET) >> 2); \ +#define radeon_opt_set_regn(reg, values, saved_values, num, prefix_name, packet) do { \ + if (memcmp(values, saved_values, sizeof(uint32_t) * (num))) { \ + radeon_set_reg_seq(reg, num, 0, prefix_name, packet, 0); \ + radeon_emit_array(values, num); \ + memcpy(saved_values, values, sizeof(uint32_t) * (num)); \ } \ } while (0) -#define radeon_set_sh_reg(reg, value) do { \ - radeon_set_sh_reg_seq(reg, 1); \ - radeon_emit(value); \ -} while (0) +/* Packet building helpers for CONFIG registers. */ +#define radeon_set_config_reg(reg, value) \ + radeon_set_reg(reg, 0, value, SI_CONFIG, PKT3_SET_CONFIG_REG) -#define radeon_set_sh_reg_idx(sctx, reg, idx, value) do { \ - radeon_set_sh_reg_idx_seq(sctx, reg, idx, 1); \ - radeon_emit(value); \ -} while (0) +/* Packet building helpers for CONTEXT registers. */ +/* TODO: Remove the _unused parameters everywhere. 
*/ +#define radeon_set_context_reg_seq(reg, num) \ + radeon_set_reg_seq(reg, num, 0, SI_CONTEXT, PKT3_SET_CONTEXT_REG, 0) -#define radeon_push_gfx_sh_reg(reg, value) do { \ - unsigned __i = sctx->num_buffered_gfx_sh_regs++; \ - assert(__i / 2 < ARRAY_SIZE(sctx->buffered_gfx_sh_regs)); \ - sctx->buffered_gfx_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \ - sctx->buffered_gfx_sh_regs[__i / 2].reg_value[__i % 2] = value; \ -} while (0) +#define radeon_set_context_reg(reg, value) \ + radeon_set_reg(reg, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG) -#define radeon_push_compute_sh_reg(reg, value) do { \ - unsigned __i = sctx->num_buffered_compute_sh_regs++; \ - assert(__i / 2 < ARRAY_SIZE(sctx->buffered_compute_sh_regs)); \ - sctx->buffered_compute_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \ - sctx->buffered_compute_sh_regs[__i / 2].reg_value[__i % 2] = value; \ -} while (0) +#define radeon_opt_set_context_reg(_unused, reg, reg_enum, value) \ + radeon_opt_set_reg(reg, reg_enum, 0, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) -#define radeon_set_or_push_gfx_sh_reg(reg, value) do { \ - if (HAS_PAIRS) { \ - radeon_push_gfx_sh_reg(reg, value); \ - } else { \ - radeon_set_sh_reg_seq(reg, 1); \ - radeon_emit(value); \ - } \ -} while (0) +#define radeon_opt_set_context_reg_idx(_unused, reg, reg_enum, idx, value) \ + radeon_opt_set_reg(reg, reg_enum, idx, value, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) -#define radeon_opt_push_gfx_sh_reg(offset, reg, val) do { \ - unsigned __value = val; \ - unsigned __reg = reg; \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (__reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.other_reg_value[__reg] != __value) { \ - radeon_push_gfx_sh_reg(offset, __value); \ - sctx->tracked_regs.other_reg_saved_mask |= BITFIELD_BIT(__reg); \ - sctx->tracked_regs.other_reg_value[__reg] = __value; \ - } \ -} while (0) +#define radeon_opt_set_context_reg2(_unused, reg, reg_enum, v1, v2) \ + 
radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) -#define radeon_opt_push_compute_sh_reg(offset, reg, val) do { \ - unsigned __value = val; \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.other_reg_value[reg] != __value) { \ - radeon_push_compute_sh_reg(offset, __value); \ - sctx->tracked_regs.other_reg_saved_mask |= BITFIELD_BIT(reg); \ - sctx->tracked_regs.other_reg_value[reg] = __value; \ - } \ -} while (0) +#define radeon_opt_set_context_reg3(_unused, reg, reg_enum, v1, v2, v3) \ + radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) -#define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \ - assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \ - radeon_emit(PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \ - radeon_emit(((reg) - CIK_UCONFIG_REG_OFFSET) >> 2); \ -} while (0) +#define radeon_opt_set_context_reg4(_unused, reg, reg_enum, v1, v2, v3, v4) \ + radeon_opt_set_reg4(reg, reg_enum, v1, v2, v3, v4, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) -#define radeon_set_uconfig_reg(reg, value) do { \ - radeon_set_uconfig_reg_seq(reg, 1, false); \ - radeon_emit(value); \ -} while (0) +#define radeon_opt_set_context_reg5(_unused, reg, reg_enum, v1, v2, v3, v4, v5) \ + radeon_opt_set_reg5(reg, reg_enum, v1, v2, v3, v4, v5, SI_CONTEXT, PKT3_SET_CONTEXT_REG, context) -#define radeon_set_uconfig_reg_perfctr(reg, value) do { \ - radeon_set_uconfig_reg_seq(reg, 1, true); \ - radeon_emit(value); \ -} while (0) +#define radeon_opt_set_context_regn(_unused, reg, values, saved_values, num) \ + radeon_opt_set_regn(reg, values, saved_values, num, SI_CONTEXT, PKT3_SET_CONTEXT_REG) -#define radeon_set_uconfig_reg_idx(screen, gfx_level, reg, idx, value) do { \ - assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \ - assert((idx) != 0); \ - unsigned __opcode = PKT3_SET_UCONFIG_REG_INDEX; \ - if ((gfx_level) < GFX9 || \ 
- ((gfx_level) == GFX9 && (screen)->info.me_fw_version < 26)) \ - __opcode = PKT3_SET_UCONFIG_REG; \ - radeon_emit(PKT3(__opcode, 1, 0)); \ - radeon_emit(((reg) - CIK_UCONFIG_REG_OFFSET) >> 2 | ((idx) << 28)); \ - radeon_emit(value); \ -} while (0) +/* Packet building helpers for SH registers. */ +#define radeon_set_sh_reg_seq(reg, num) \ + radeon_set_reg_seq(reg, num, 0, SI_SH, PKT3_SET_SH_REG, 0) -/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */ -#define radeon_opt_set_context_reg(sctx, offset, reg, val) do { \ - unsigned __value = val; \ - if (((sctx->tracked_regs.context_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.context_reg_value[reg] != __value) { \ - radeon_set_context_reg(offset, __value); \ - sctx->tracked_regs.context_reg_saved_mask |= 0x1ull << (reg); \ - sctx->tracked_regs.context_reg_value[reg] = __value; \ - } \ -} while (0) +#define radeon_set_sh_reg(reg, value) \ + radeon_set_reg(reg, 0, value, SI_SH, PKT3_SET_SH_REG) -#define radeon_opt_set_context_reg_idx(sctx, offset, reg, idx, val) do { \ - unsigned __value = val; \ - if (((sctx->tracked_regs.context_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.context_reg_value[reg] != __value) { \ - radeon_set_context_reg_idx(offset, idx, __value); \ - sctx->tracked_regs.context_reg_saved_mask |= 0x1ull << (reg); \ - sctx->tracked_regs.context_reg_value[reg] = __value; \ - } \ -} while (0) +#define radeon_opt_set_sh_reg(_unused, reg, reg_enum, value) \ + radeon_opt_set_reg(reg, reg_enum, 0, value, SI_SH, PKT3_SET_SH_REG, other) -/** - * Set 2 consecutive registers if any registers value is different. 
- * @param offset starting register offset - * @param val1 is written to first register - * @param val2 is written to second register - */ -#define radeon_opt_set_context_reg2(sctx, offset, reg, val1, val2) do { \ - unsigned __value1 = (val1), __value2 = (val2); \ - if (((sctx->tracked_regs.context_reg_saved_mask >> (reg)) & 0x3) != 0x3 || \ - sctx->tracked_regs.context_reg_value[reg] != __value1 || \ - sctx->tracked_regs.context_reg_value[(reg) + 1] != __value2) { \ - radeon_set_context_reg_seq(offset, 2); \ - radeon_emit(__value1); \ - radeon_emit(__value2); \ - sctx->tracked_regs.context_reg_value[reg] = __value1; \ - sctx->tracked_regs.context_reg_value[(reg) + 1] = __value2; \ - sctx->tracked_regs.context_reg_saved_mask |= 0x3ull << (reg); \ - } \ -} while (0) +#define radeon_opt_set_sh_reg2(_unused, reg, reg_enum, v1, v2) \ + radeon_opt_set_reg2(reg, reg_enum, v1, v2, SI_SH, PKT3_SET_SH_REG, other) -/** - * Set 3 consecutive registers if any registers value is different. - */ -#define radeon_opt_set_context_reg3(sctx, offset, reg, val1, val2, val3) do { \ - unsigned __value1 = (val1), __value2 = (val2), __value3 = (val3); \ - if (((sctx->tracked_regs.context_reg_saved_mask >> (reg)) & 0x7) != 0x7 || \ - sctx->tracked_regs.context_reg_value[reg] != __value1 || \ - sctx->tracked_regs.context_reg_value[(reg) + 1] != __value2 || \ - sctx->tracked_regs.context_reg_value[(reg) + 2] != __value3) { \ - radeon_set_context_reg_seq(offset, 3); \ - radeon_emit(__value1); \ - radeon_emit(__value2); \ - radeon_emit(__value3); \ - sctx->tracked_regs.context_reg_value[reg] = __value1; \ - sctx->tracked_regs.context_reg_value[(reg) + 1] = __value2; \ - sctx->tracked_regs.context_reg_value[(reg) + 2] = __value3; \ - sctx->tracked_regs.context_reg_saved_mask |= 0x7ull << (reg); \ - } \ -} while (0) +#define radeon_opt_set_sh_reg3(_unused, reg, reg_enum, v1, v2, v3) \ + radeon_opt_set_reg3(reg, reg_enum, v1, v2, v3, SI_SH, PKT3_SET_SH_REG, other) -/** - * Set 4 consecutive 
registers if any registers value is different. - */ -#define radeon_opt_set_context_reg4(sctx, offset, reg, val1, val2, val3, val4) do { \ - unsigned __value1 = (val1), __value2 = (val2), __value3 = (val3), __value4 = (val4); \ - if (((sctx->tracked_regs.context_reg_saved_mask >> (reg)) & 0xf) != 0xf || \ - sctx->tracked_regs.context_reg_value[reg] != __value1 || \ - sctx->tracked_regs.context_reg_value[(reg) + 1] != __value2 || \ - sctx->tracked_regs.context_reg_value[(reg) + 2] != __value3 || \ - sctx->tracked_regs.context_reg_value[(reg) + 3] != __value4) { \ - radeon_set_context_reg_seq(offset, 4); \ - radeon_emit(__value1); \ - radeon_emit(__value2); \ - radeon_emit(__value3); \ - radeon_emit(__value4); \ - sctx->tracked_regs.context_reg_value[reg] = __value1; \ - sctx->tracked_regs.context_reg_value[(reg) + 1] = __value2; \ - sctx->tracked_regs.context_reg_value[(reg) + 2] = __value3; \ - sctx->tracked_regs.context_reg_value[(reg) + 3] = __value4; \ - sctx->tracked_regs.context_reg_saved_mask |= 0xfull << (reg); \ - } \ +#define radeon_opt_set_sh_reg_idx(_unused, reg, reg_enum, idx, value) do { \ + assert(sctx->gfx_level >= GFX10); \ + radeon_opt_set_reg(reg, reg_enum, idx, value, SI_SH, PKT3_SET_SH_REG_INDEX, other); \ } while (0) -/** - * Set 5 consecutive registers if any register value is different. 
- */ -#define radeon_opt_set_context_reg5(sctx, offset, reg, val0, val1, val2, val3, val4) do { \ - unsigned __value0 = (val0), __value1 = (val1), __value2 = (val2), __value3 = (val3), __value4 = (val4); \ - if (((sctx->tracked_regs.context_reg_saved_mask >> (reg)) & 0x1f) != 0x1f || \ - sctx->tracked_regs.context_reg_value[(reg) + 0] != __value0 || \ - sctx->tracked_regs.context_reg_value[(reg) + 1] != __value1 || \ - sctx->tracked_regs.context_reg_value[(reg) + 2] != __value2 || \ - sctx->tracked_regs.context_reg_value[(reg) + 3] != __value3 || \ - sctx->tracked_regs.context_reg_value[(reg) + 4] != __value4) { \ - radeon_set_context_reg_seq(offset, 5); \ - radeon_emit(__value0); \ - radeon_emit(__value1); \ - radeon_emit(__value2); \ - radeon_emit(__value3); \ - radeon_emit(__value4); \ - sctx->tracked_regs.context_reg_value[(reg) + 0] = __value0; \ - sctx->tracked_regs.context_reg_value[(reg) + 1] = __value1; \ - sctx->tracked_regs.context_reg_value[(reg) + 2] = __value2; \ - sctx->tracked_regs.context_reg_value[(reg) + 3] = __value3; \ - sctx->tracked_regs.context_reg_value[(reg) + 4] = __value4; \ - sctx->tracked_regs.context_reg_saved_mask |= 0x1full << (reg); \ - } \ +#define radeon_emit_32bit_pointer(_unused, va) do { \ + assert((va) == 0 || ((va) >> 32) == sctx->screen->info.address32_hi); \ + radeon_emit(va); \ } while (0) -/** - * Set consecutive registers if any registers value is different. 
- */ -#define radeon_opt_set_context_regn(sctx, offset, value, saved_val, num) do { \ - if (memcmp(value, saved_val, sizeof(uint32_t) * (num))) { \ - radeon_set_context_reg_seq(offset, num); \ - radeon_emit_array(value, num); \ - memcpy(saved_val, value, sizeof(uint32_t) * (num)); \ - } \ +#define radeon_emit_one_32bit_pointer(_unused, desc, sh_base) do { \ + radeon_set_sh_reg_seq((sh_base) + (desc)->shader_userdata_offset, 1); \ + radeon_emit_32bit_pointer(_unused, (desc)->gpu_address); \ } while (0) -#define radeon_opt_set_sh_reg(sctx, offset, reg, val) do { \ - unsigned __value = val; \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.other_reg_value[reg] != __value) { \ - radeon_set_sh_reg(offset, __value); \ - sctx->tracked_regs.other_reg_saved_mask |= BITFIELD_BIT(reg); \ - sctx->tracked_regs.other_reg_value[reg] = __value; \ - } \ -} while (0) +/* Packet building helpers for UCONFIG registers. */ +#define radeon_set_uconfig_reg_seq(reg, num) \ + radeon_set_reg_seq(reg, num, 0, CIK_UCONFIG, PKT3_SET_UCONFIG_REG, 0) -/** - * Set 2 consecutive registers if any register value is different. 
- */ -#define radeon_opt_set_sh_reg2(sctx, offset, reg, val1, val2) do { \ - unsigned __value1 = (val1), __value2 = (val2); \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x3) != 0x3 || \ - sctx->tracked_regs.other_reg_value[reg] != __value1 || \ - sctx->tracked_regs.other_reg_value[(reg) + 1] != __value2) { \ - radeon_set_sh_reg_seq(offset, 2); \ - radeon_emit(__value1); \ - radeon_emit(__value2); \ - sctx->tracked_regs.other_reg_value[reg] = __value1; \ - sctx->tracked_regs.other_reg_value[(reg) + 1] = __value2; \ - sctx->tracked_regs.other_reg_saved_mask |= 0x3ull << (reg); \ - } \ -} while (0) +#define radeon_set_uconfig_perfctr_reg_seq(reg, num) \ + radeon_set_reg_seq(reg, num, 0, CIK_UCONFIG, PKT3_SET_UCONFIG_REG, sctx->gfx_level >= GFX10) -/** - * Set 3 consecutive registers if any register value is different. - */ -#define radeon_opt_set_sh_reg3(sctx, offset, reg, val1, val2, val3) do { \ - unsigned __value1 = (val1), __value2 = (val2), __value3 = (val3); \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x7) != 0x7 || \ - sctx->tracked_regs.other_reg_value[reg] != __value1 || \ - sctx->tracked_regs.other_reg_value[(reg) + 1] != __value2 || \ - sctx->tracked_regs.other_reg_value[(reg) + 2] != __value3) { \ - radeon_set_sh_reg_seq(offset, 3); \ - radeon_emit(__value1); \ - radeon_emit(__value2); \ - radeon_emit(__value3); \ - sctx->tracked_regs.other_reg_value[reg] = __value1; \ - sctx->tracked_regs.other_reg_value[(reg) + 1] = __value2; \ - sctx->tracked_regs.other_reg_value[(reg) + 2] = __value3; \ - sctx->tracked_regs.other_reg_saved_mask |= 0x7ull << (reg); \ - } \ -} while (0) +#define radeon_set_uconfig_reg(reg, value) \ + radeon_set_reg(reg, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG) -#define radeon_opt_set_sh_reg_idx(sctx, offset, reg, idx, val) do { \ - unsigned __value = val; \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.other_reg_value[reg] != __value) { \ - 
radeon_set_sh_reg_idx(sctx, offset, idx, __value); \ - sctx->tracked_regs.other_reg_saved_mask |= BITFIELD_BIT(reg); \ - sctx->tracked_regs.other_reg_value[reg] = __value; \ - } \ -} while (0) +#define radeon_opt_set_uconfig_reg(_unused, reg, reg_enum, value) \ + radeon_opt_set_reg(reg, reg_enum, 0, value, CIK_UCONFIG, PKT3_SET_UCONFIG_REG, other) -#define radeon_opt_set_uconfig_reg(sctx, offset, reg, val) do { \ - unsigned __value = val; \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.other_reg_value[reg] != __value) { \ - radeon_set_uconfig_reg(offset, __value); \ - sctx->tracked_regs.other_reg_saved_mask |= 0x1ull << (reg); \ - sctx->tracked_regs.other_reg_value[reg] = __value; \ - } \ -} while (0) +#define RESOLVE_PKT3_SET_UCONFIG_REG_INDEX \ + (GFX_VERSION >= GFX10 || (GFX_VERSION == GFX9 && sctx->screen->info.me_fw_version >= 26) ? \ + PKT3_SET_UCONFIG_REG_INDEX : PKT3_SET_UCONFIG_REG) -#define radeon_opt_set_uconfig_reg_idx(sctx, gfx_level, offset, reg, idx, val) do { \ - unsigned __value = val; \ - if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \ - sctx->tracked_regs.other_reg_value[reg] != __value) { \ - radeon_set_uconfig_reg_idx((sctx)->screen, gfx_level, offset, idx, __value); \ - sctx->tracked_regs.other_reg_saved_mask |= 0x1ull << (reg); \ - sctx->tracked_regs.other_reg_value[reg] = __value; \ - } \ -} while (0) +#define radeon_set_uconfig_reg_idx(_unused, _unused2, reg, idx, value) \ + radeon_set_reg(reg, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX) + +#define radeon_opt_set_uconfig_reg_idx(_unused, _unused2, reg, reg_enum, idx, value) \ + radeon_opt_set_reg(reg, reg_enum, idx, value, CIK_UCONFIG, RESOLVE_PKT3_SET_UCONFIG_REG_INDEX, other) #define radeon_set_privileged_config_reg(reg, value) do { \ assert((reg) < CIK_UCONFIG_REG_OFFSET); \ @@ -399,15 +259,44 @@ radeon_emit(0); /* unused */ \ } while (0) -#define radeon_emit_32bit_pointer(sscreen, va) do { \ 
- radeon_emit(va); \ - assert((va) == 0 || ((va) >> 32) == sscreen->info.address32_hi); \ +/* GFX11 generic packet building helpers for buffered SH registers. Don't use these directly. */ +#define gfx11_push_sh_reg(reg, value, type) do { \ + unsigned __i = sctx->num_buffered_##type##_sh_regs++; \ + assert(__i / 2 < ARRAY_SIZE(sctx->buffered_##type##_sh_regs)); \ + sctx->buffered_##type##_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \ + sctx->buffered_##type##_sh_regs[__i / 2].reg_value[__i % 2] = value; \ +} while (0) + +#define gfx11_opt_push_sh_reg(reg, reg_enum, value, type) do { \ + unsigned __value = value; \ + if (((sctx->tracked_regs.other_reg_saved_mask >> (reg_enum)) & 0x1) != 0x1 || \ + sctx->tracked_regs.other_reg_value[reg_enum] != __value) { \ + gfx11_push_sh_reg(reg, __value, type); \ + sctx->tracked_regs.other_reg_saved_mask |= BITFIELD_BIT(reg_enum); \ + sctx->tracked_regs.other_reg_value[reg_enum] = __value; \ + } \ } while (0) -#define radeon_emit_one_32bit_pointer(sctx, desc, sh_base) do { \ - unsigned sh_offset = (sh_base) + (desc)->shader_userdata_offset; \ - radeon_set_sh_reg_seq(sh_offset, 1); \ - radeon_emit_32bit_pointer(sctx->screen, (desc)->gpu_address); \ +/* GFX11 packet building helpers for buffered SH registers. 
*/ +#define radeon_push_gfx_sh_reg(reg, value) \ + gfx11_push_sh_reg(reg, value, gfx) + +#define radeon_push_compute_sh_reg(reg, value) \ + gfx11_push_sh_reg(reg, value, compute) + +#define radeon_opt_push_gfx_sh_reg(reg, reg_enum, value) \ + gfx11_opt_push_sh_reg(reg, reg_enum, value, gfx) + +#define radeon_opt_push_compute_sh_reg(reg, reg_enum, value) \ + gfx11_opt_push_sh_reg(reg, reg_enum, value, compute) + +#define radeon_set_or_push_gfx_sh_reg(reg, value) do { \ + if (GFX_VERSION >= GFX11 && HAS_PAIRS) { \ + radeon_push_gfx_sh_reg(reg, value); \ + } else { \ + radeon_set_sh_reg_seq(reg, 1); \ + radeon_emit(value); \ + } \ } while (0) /* This should be evaluated at compile time if all parameters are constants. */ diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 0a68d7efc8a..821d3a436bd 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -70,7 +70,7 @@ static void si_pc_emit_instance(struct si_context *sctx, int se, int instance) void si_pc_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders) { radeon_begin(cs); - radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2, false); + radeon_set_uconfig_reg_seq(R_036780_SQ_PERFCOUNTER_CTRL, 2); radeon_emit(shaders & 0x7f); radeon_emit(0xffffffff); radeon_end(); @@ -92,12 +92,12 @@ static void si_pc_emit_select(struct si_context *sctx, struct ac_pc_block *block radeon_begin(cs); for (idx = 0; idx < count; ++idx) { - radeon_set_uconfig_reg_seq(regs->select0[idx], 1, false); + radeon_set_uconfig_reg_seq(regs->select0[idx], 1); radeon_emit(selectors[idx] | regs->select_or); } for (idx = 0; idx < regs->num_spm_counters; idx++) { - radeon_set_uconfig_reg_seq(regs->select1[idx], 1, false); + radeon_set_uconfig_reg_seq(regs->select1[idx], 1); radeon_emit(0); } @@ -748,7 +748,7 @@ si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs) const struct ac_spm_counter_select 
*cntr_sel = &spm->sqg[instance].counters[b]; uint32_t reg_base = R_036700_SQ_PERFCOUNTER0_SELECT; - radeon_set_uconfig_reg_seq(reg_base + b * 4, 1, false); + radeon_set_uconfig_reg_seq(reg_base + b * 4, 1); radeon_emit(cntr_sel->sel0 | S_036700_SQC_BANK_MASK(0xf)); /* SQC_BANK_MASK only gfx10 */ } } @@ -768,10 +768,10 @@ si_emit_spm_counters(struct si_context *sctx, struct radeon_cmdbuf *cs) if (!cntr_sel->active) continue; - radeon_set_uconfig_reg_seq(regs->select0[c], 1, false); + radeon_set_uconfig_reg_seq(regs->select0[c], 1); radeon_emit(cntr_sel->sel0); - radeon_set_uconfig_reg_seq(regs->select1[c], 1, false); + radeon_set_uconfig_reg_seq(regs->select1[c], 1); radeon_emit(cntr_sel->sel1); } } diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index 63618e666be..05891d98115 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -798,11 +798,7 @@ static void si_emit_sqtt_userdata(struct si_context *sctx, while (num_dwords > 0) { uint32_t count = MIN2(num_dwords, 2); - /* Without the perfctr bit the CP might not always pass the - * write on correctly. */ - radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, - sctx->gfx_level >= GFX10); - + radeon_set_uconfig_perfctr_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count); radeon_emit_array(dwords, count); dwords += count; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 8b3686a5694..d0f76a1c5a5 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -975,14 +975,26 @@ static void si_emit_shader_gs(struct si_context *sctx, unsigned index) /* These don't cause any context rolls. 
*/ radeon_begin_again(&sctx->gfx_cs); if (sctx->gfx_level >= GFX7) { - radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, - 3, shader->gs.spi_shader_pgm_rsrc3_gs); + if (sctx->screen->info.uses_kernel_cu_mask) { + radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + 3, shader->gs.spi_shader_pgm_rsrc3_gs); + } else { + radeon_opt_set_sh_reg(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->gs.spi_shader_pgm_rsrc3_gs); + } } if (sctx->gfx_level >= GFX10) { - radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, - 3, shader->gs.spi_shader_pgm_rsrc4_gs); + if (sctx->screen->info.uses_kernel_cu_mask) { + radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + 3, shader->gs.spi_shader_pgm_rsrc4_gs); + } else { + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->gs.spi_shader_pgm_rsrc4_gs); + } } radeon_end(); } @@ -1204,12 +1216,21 @@ static void gfx10_emit_shader_ngg(struct si_context *sctx, unsigned index) SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, shader->gs.spi_shader_pgm_rsrc4_gs); } else { - radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, - 3, shader->ngg.spi_shader_pgm_rsrc3_gs); - radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, - SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, - 3, shader->ngg.spi_shader_pgm_rsrc4_gs); + if (sctx->screen->info.uses_kernel_cu_mask) { + radeon_opt_set_sh_reg_idx(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + 3, shader->ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg_idx(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + 3, shader->ngg.spi_shader_pgm_rsrc4_gs); + } else { + radeon_opt_set_sh_reg(sctx, 
R_00B21C_SPI_SHADER_PGM_RSRC3_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS, + shader->ngg.spi_shader_pgm_rsrc3_gs); + radeon_opt_set_sh_reg(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS, + SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS, + shader->ngg.spi_shader_pgm_rsrc4_gs); + } } radeon_end(); }