Module: Mesa
Branch: main
Commit: 7ffb65f935655f16bc19d1918cac5e40104a06d9
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7ffb65f935655f16bc19d1918cac5e40104a06d9

Author: Marek Olšák <marek.ol...@amd.com>
Date:   Fri Nov 24 04:52:18 2023 -0500

ac: add an IB parser that gathers context rolls

This is an important performance bottleneck analysis tool.
Try it with radeonsi: AMD_ROLLS=filename app

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26349>

---

 src/amd/common/ac_debug.h                |   4 +
 src/amd/common/ac_gather_context_rolls.c | 345 +++++++++++++++++++++++++++++++
 src/amd/common/meson.build               |   1 +
 src/gallium/drivers/radeonsi/si_debug.c  |  21 ++
 src/gallium/drivers/radeonsi/si_gfx_cs.c |   3 +
 src/gallium/drivers/radeonsi/si_pipe.c   |   1 +
 src/gallium/drivers/radeonsi/si_pipe.h   |   2 +
 7 files changed, 377 insertions(+)

diff --git a/src/amd/common/ac_debug.h b/src/amd/common/ac_debug.h
index 60b83e57e74..d8017e2f1a3 100644
--- a/src/amd/common/ac_debug.h
+++ b/src/amd/common/ac_debug.h
@@ -56,6 +56,10 @@ unsigned ac_get_wave_info(enum amd_gfx_level gfx_level, const struct radeon_info
 void ac_print_gpuvm_fault_status(FILE *output, enum amd_gfx_level gfx_level,
                                  uint32_t status);
 
+/* ac_gather_context_rolls.c */
+void ac_gather_context_rolls(FILE *f, uint32_t **ibs, uint32_t *ib_dw_sizes, unsigned num_ibs,
+                             const struct radeon_info *info);
+
 /* ac_parse_ib.c */
 void ac_dump_reg(FILE *file, enum amd_gfx_level gfx_level, enum radeon_family family,
                  unsigned offset, uint32_t value, uint32_t field_mask);
diff --git a/src/amd/common/ac_gather_context_rolls.c b/src/amd/common/ac_gather_context_rolls.c
new file mode 100644
index 00000000000..7bb8575b3f0
--- /dev/null
+++ b/src/amd/common/ac_gather_context_rolls.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Utility for gathering context rolls for performance bottleneck analysis.
+ *
+ * Usage for radeonsi:
+ *    AMD_ROLLS=filename app1
+ *    AMD_ROLLS=filename app2
+ *    ...
+ * AMD_ROLLS=filename appN + * + * sort filename | uniq -c | sort -n > rolls_sorted.txt + * + * Then try to reduce the most frequent context rolls. + */ + +#include "ac_debug.h" +#include "sid.h" +#include "sid_tables.h" + +#include "util/bitset.h" +#include "util/u_dynarray.h" +#include "util/u_memory.h" + +struct ac_context_reg_deltas { + uint32_t changed_masks[1024]; /* changes masks of context registers */ + BITSET_DECLARE(changed, 1024); /* which context register was set */ + bool acquire_mem; /* whether ACQUIRE_MEM rolled the context */ +}; + +struct ac_context_reg_state { + uint32_t regs[1024]; + struct ac_context_reg_deltas deltas; +}; + +struct ac_context_roll_ctx { + struct ac_context_reg_state *cur; + bool context_busy; + + unsigned num_busy_contexts; + struct util_dynarray rolls; + + const struct radeon_info *info; +}; + +static void ac_roll_context(struct ac_context_roll_ctx *ctx) +{ + if (!ctx->context_busy) + return; + + struct ac_context_reg_state *last = ctx->cur; + ctx->cur = CALLOC_STRUCT(ac_context_reg_state); + memcpy(ctx->cur->regs, last->regs, sizeof(last->regs)); + ctx->context_busy = false; + ctx->num_busy_contexts++; + + /* Ignore the first context at the beginning or after waiting for idle. 
*/ + if (ctx->num_busy_contexts > 1) { + util_dynarray_append(&ctx->rolls, struct ac_context_reg_state *, last); + } else { + FREE(last); + } +} + +static void ac_record_wait_idle(struct ac_context_roll_ctx *ctx) +{ + ctx->num_busy_contexts = 0; + ctx->context_busy = false; + memset(&ctx->cur->deltas, 0, sizeof(ctx->cur->deltas)); +} + +static void ac_record_set_context_reg(struct ac_context_roll_ctx *ctx, + unsigned reg_rel_dw_offset, unsigned value) +{ + if (!ac_register_exists(ctx->info->gfx_level, ctx->info->family, + SI_CONTEXT_REG_OFFSET + reg_rel_dw_offset * 4)) { + fprintf(stderr, "This register is not supported by this chip: 0x%X\n", + SI_CONTEXT_REG_OFFSET + reg_rel_dw_offset * 4); + abort(); + } + + assert(reg_rel_dw_offset < 1024); + BITSET_SET(ctx->cur->deltas.changed, reg_rel_dw_offset); + ctx->cur->deltas.changed_masks[reg_rel_dw_offset] |= ctx->cur->regs[reg_rel_dw_offset] ^ value; + ctx->cur->regs[reg_rel_dw_offset] = value; +} + +static unsigned get_reg_index(unsigned reg) +{ + return (reg - SI_CONTEXT_REG_OFFSET) / 4; +} + +static void ac_ib_gather_context_rolls(struct ac_context_roll_ctx *ctx, uint32_t *ib, int num_dw) +{ + for (unsigned cur_dw = 0; cur_dw < num_dw;) { + uint32_t header = ib[cur_dw++]; + unsigned type = PKT_TYPE_G(header); + + if (type != 3) { + fprintf(stderr, "Unexpected type %u packet\n", type); + abort(); + } + + int count = PKT_COUNT_G(header); + unsigned op = PKT3_IT_OPCODE_G(header); + + switch (op) { + /* Record context register changes. 
*/ + case PKT3_SET_CONTEXT_REG: { + ac_roll_context(ctx); + + unsigned reg_dw = ib[cur_dw++]; + unsigned reg_rel_dw_offset = reg_dw & 0xFFFF; + + for (int i = 0; i < count; i++) + ac_record_set_context_reg(ctx, reg_rel_dw_offset + i, ib[cur_dw++]); + continue; + } + + case PKT3_SET_CONTEXT_REG_PAIRS: + ac_roll_context(ctx); + + for (int i = 0; i < (count + 1) / 2; i++) { + unsigned reg_rel_dw_offset = ib[cur_dw++]; + ac_record_set_context_reg(ctx, reg_rel_dw_offset, ib[cur_dw++]); + } + continue; + + case PKT3_SET_CONTEXT_REG_PAIRS_PACKED: { + ac_roll_context(ctx); + + unsigned reg_rel_dw_offset0 = 0, reg_rel_dw_offset1 = 0; + cur_dw++; + + for (int i = 0; i < count; i++) { + if (i % 3 == 0) { + unsigned tmp = ib[cur_dw++]; + reg_rel_dw_offset0 = tmp & 0xffff; + reg_rel_dw_offset1 = tmp >> 16; + } else if (i % 3 == 1) { + ac_record_set_context_reg(ctx, reg_rel_dw_offset0, ib[cur_dw++]); + } else { + ac_record_set_context_reg(ctx, reg_rel_dw_offset1, ib[cur_dw++]); + } + } + continue; + } + + case PKT3_CLEAR_STATE: + ac_roll_context(ctx); + + ac_record_set_context_reg(ctx, get_reg_index(R_028000_DB_RENDER_CONTROL), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028004_DB_COUNT_CONTROL), 0); + + ac_record_set_context_reg(ctx, get_reg_index(R_028BDC_PA_SC_LINE_CNTL), 0x1000); + ac_record_set_context_reg(ctx, get_reg_index(R_028BE0_PA_SC_AA_CONFIG), 0); + + ac_record_set_context_reg(ctx, get_reg_index(R_028BE4_PA_SU_VTX_CNTL), 0x5); + ac_record_set_context_reg(ctx, get_reg_index(R_028BE8_PA_CL_GB_VERT_CLIP_ADJ), 0x3f800000); + ac_record_set_context_reg(ctx, get_reg_index(R_028BEC_PA_CL_GB_VERT_DISC_ADJ), 0x3f800000); + ac_record_set_context_reg(ctx, get_reg_index(R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ), 0x3f800000); + ac_record_set_context_reg(ctx, get_reg_index(R_028BF4_PA_CL_GB_HORZ_DISC_ADJ), 0x3f800000); + + ac_record_set_context_reg(ctx, get_reg_index(R_02870C_SPI_SHADER_POS_FORMAT), 0); + + ac_record_set_context_reg(ctx, 
get_reg_index(R_028710_SPI_SHADER_Z_FORMAT), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028714_SPI_SHADER_COL_FORMAT), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_0286E0_SPI_BARYC_CNTL), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_0286CC_SPI_PS_INPUT_ENA), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_0286D0_SPI_PS_INPUT_ADDR), 0); + + ac_record_set_context_reg(ctx, get_reg_index(R_028804_DB_EQAA), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_02880C_DB_SHADER_CONTROL), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_02823C_CB_SHADER_MASK), 0xffffffff); + ac_record_set_context_reg(ctx, get_reg_index(R_028238_CB_TARGET_MASK), 0xffffffff); + ac_record_set_context_reg(ctx, get_reg_index(R_028810_PA_CL_CLIP_CNTL), 0x90000); + ac_record_set_context_reg(ctx, get_reg_index(R_02881C_PA_CL_VS_OUT_CNTL), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028818_PA_CL_VTE_CNTL), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_02820C_PA_SC_CLIPRECT_RULE), 0xffff); + ac_record_set_context_reg(ctx, get_reg_index(R_028A0C_PA_SC_LINE_STIPPLE), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028A4C_PA_SC_MODE_CNTL_1), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028234_PA_SU_HARDWARE_SCREEN_OFFSET), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_0286D8_SPI_PS_IN_CONTROL), 0x2); + ac_record_set_context_reg(ctx, get_reg_index(R_028B90_VGT_GS_INSTANCE_CNT), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028B38_VGT_GS_MAX_VERT_OUT), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028B54_VGT_SHADER_STAGES_EN), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028B58_VGT_LS_HS_CONFIG), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028B6C_VGT_TF_PARAM), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028C44_PA_SC_BINNER_CNTL_0), 0x3); + if (ctx->info->gfx_level >= GFX10) { + 
ac_record_set_context_reg(ctx, get_reg_index(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028B4C_GE_NGG_SUBGRP_CNTL), 0); + } + if (ctx->info->gfx_level >= GFX11) + ac_record_set_context_reg(ctx, get_reg_index(R_0283D0_PA_SC_VRS_OVERRIDE_CNTL), 0); + else if (ctx->info->gfx_level == GFX10_3) + ac_record_set_context_reg(ctx, get_reg_index(R_028064_DB_VRS_OVERRIDE_CNTL), 0); + + ac_record_set_context_reg(ctx, get_reg_index(R_028754_SX_PS_DOWNCONVERT), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028758_SX_BLEND_OPT_EPSILON), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_02875C_SX_BLEND_OPT_CONTROL), 0); + + ac_record_set_context_reg(ctx, get_reg_index(R_028AAC_VGT_ESGS_RING_ITEMSIZE), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028AB4_VGT_REUSE_OFF), 0); + if (ctx->info->gfx_level <= GFX9) + ac_record_set_context_reg(ctx, get_reg_index(R_028AA8_IA_MULTI_VGT_PARAM), 0xff); + + if (ctx->info->gfx_level == GFX9) + ac_record_set_context_reg(ctx, get_reg_index(R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP), 0); + if (ctx->info->gfx_level <= GFX10_3) { + ac_record_set_context_reg(ctx, get_reg_index(R_028A44_VGT_GS_ONCHIP_CNTL), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028AB0_VGT_GSVS_RING_ITEMSIZE), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028A40_VGT_GS_MODE), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL), 0x1e); + ac_record_set_context_reg(ctx, get_reg_index(R_028A6C_VGT_GS_OUT_PRIM_TYPE), 0); + + ac_record_set_context_reg(ctx, get_reg_index(R_028A60_VGT_GSVS_RING_OFFSET_1), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028A64_VGT_GSVS_RING_OFFSET_2), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028A68_VGT_GSVS_RING_OFFSET_3), 0); + + ac_record_set_context_reg(ctx, get_reg_index(R_028B5C_VGT_GS_VERT_ITEMSIZE), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028B60_VGT_GS_VERT_ITEMSIZE_1), 0); + ac_record_set_context_reg(ctx, 
get_reg_index(R_028B64_VGT_GS_VERT_ITEMSIZE_2), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028B68_VGT_GS_VERT_ITEMSIZE_3), 0); + } + + ac_record_set_context_reg(ctx, get_reg_index(R_028010_DB_RENDER_OVERRIDE2), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_0286C4_SPI_VS_OUT_CONFIG), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028A84_VGT_PRIMITIVEID_EN), 0); + ac_record_set_context_reg(ctx, get_reg_index(R_028424_CB_DCC_CONTROL), 0); + break; + + case PKT3_LOAD_CONTEXT_REG_INDEX: + case PKT3_COPY_DATA: + /* TODO */ + break; + + case PKT3_ACQUIRE_MEM: + if (G_580_PWS_ENA2(ib[cur_dw])) { + ac_record_wait_idle(ctx); + } else { + ac_roll_context(ctx); + ctx->cur->deltas.acquire_mem = true; + } + break; + + case PKT3_WAIT_REG_MEM: + ac_record_wait_idle(ctx); + break; + + case PKT3_EVENT_WRITE: + if (G_490_EVENT_TYPE(ib[cur_dw]) == V_028A90_PS_PARTIAL_FLUSH) + ac_record_wait_idle(ctx); + break; + + /* Record draws. */ + case PKT3_DRAW_INDEX_AUTO: + case PKT3_DRAW_INDEX_IMMD: + case PKT3_DRAW_INDEX_MULTI_AUTO: + case PKT3_DRAW_INDEX_2: + case PKT3_DRAW_INDEX_OFFSET_2: + case PKT3_DRAW_INDIRECT: + case PKT3_DRAW_INDEX_INDIRECT: + case PKT3_DRAW_INDIRECT_MULTI: + case PKT3_DRAW_INDEX_INDIRECT_MULTI: + case PKT3_DISPATCH_MESH_DIRECT: + case PKT3_DISPATCH_MESH_INDIRECT_MULTI: + case PKT3_DISPATCH_TASKMESH_GFX: + ctx->context_busy = true; + break; + + case PKT3_INDIRECT_BUFFER: + /* Chaining. Note that the CHAIN bit is not set at this point, so we can't distinguish + * between chaining and IB2. + */ + return; + + case PKT3_CONTEXT_REG_RMW: + case PKT3_INDIRECT_BUFFER_SI: + case PKT3_SURFACE_SYNC: + fprintf(stderr, "Unhandled packet: 0x%x\n", op); + abort(); + break; + } + + cur_dw += count + 1; + } +} + +void ac_gather_context_rolls(FILE *f, uint32_t **ibs, uint32_t *ib_dw_sizes, unsigned num_ibs, + const struct radeon_info *info) +{ + struct ac_context_roll_ctx ctx; + + /* Initialize. 
*/ + memset(&ctx, 0, sizeof(ctx)); + ctx.info = info; + ctx.cur = CALLOC_STRUCT(ac_context_reg_state); + util_dynarray_init(&ctx.rolls, NULL); + + /* Parse the IBs. */ + for (unsigned i = 0; i < num_ibs; i++) + ac_ib_gather_context_rolls(&ctx, ibs[i], ib_dw_sizes[i]); + + /* Roll the last context to add it to the list. */ + ac_roll_context(&ctx); + + /* Print context rolls. */ + if (util_dynarray_num_elements(&ctx.rolls, struct ac_context_reg_state *)) { + /* Print the context rolls starting with the most frequent one. */ + util_dynarray_foreach(&ctx.rolls, struct ac_context_reg_state *, iter) { + struct ac_context_reg_state *state = *iter; + + unsigned i; + BITSET_FOREACH_SET(i, state->deltas.changed, 1024) { + unsigned reg_offset = SI_CONTEXT_REG_OFFSET + i * 4; + const struct si_reg *reg = ac_find_register(info->gfx_level, info->family, + reg_offset); + + if (!reg) { + fprintf(f, "0x%X(0x%x) ", reg_offset, state->deltas.changed_masks[i]); + } else { + fprintf(f, "%s(0x%x) ", sid_strings + reg->name_offset, + state->deltas.changed_masks[i]); + } + } + + if (state->deltas.acquire_mem) + fprintf(f, "ACQUIRE_MEM"); + + fprintf(f, "\n"); + } + } + + /* Free. 
*/ + FREE(ctx.cur); + util_dynarray_foreach(&ctx.rolls, struct ac_context_reg_state *, iter) { + FREE(*iter); + } + util_dynarray_fini(&ctx.rolls); +} diff --git a/src/amd/common/meson.build b/src/amd/common/meson.build index 527ed905904..04154180264 100644 --- a/src/amd/common/meson.build +++ b/src/amd/common/meson.build @@ -73,6 +73,7 @@ amd_common_files = files( 'ac_shader_args.h', 'ac_shader_util.c', 'ac_shader_util.h', + 'ac_gather_context_rolls.c', 'ac_gpu_info.c', 'ac_gpu_info.h', 'ac_surface.c', diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index 3199d70b5d3..5e62ec1b0f0 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -1084,6 +1084,27 @@ void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, exit(0); } +void si_gather_context_rolls(struct si_context *sctx) +{ + struct radeon_cmdbuf *cs = &sctx->gfx_cs; + uint32_t **ibs = alloca(sizeof(ibs[0]) * (cs->num_prev + 1)); + uint32_t *ib_dw_sizes = alloca(sizeof(ib_dw_sizes[0]) * (cs->num_prev + 1)); + + for (unsigned i = 0; i < cs->num_prev; i++) { + struct radeon_cmdbuf_chunk *chunk = &cs->prev[i]; + + ibs[i] = chunk->buf; + ib_dw_sizes[i] = chunk->cdw; + } + + ibs[cs->num_prev] = cs->current.buf; + ib_dw_sizes[cs->num_prev] = cs->current.cdw; + + FILE *f = fopen(sctx->screen->context_roll_log_filename, "a"); + ac_gather_context_rolls(f, ibs, ib_dw_sizes, cs->num_prev + 1, &sctx->screen->info); + fclose(f); +} + void si_init_debug_functions(struct si_context *sctx) { sctx->b.dump_debug_state = si_dump_debug_state; diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 65e661ef44d..44e95f6eb41 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -120,6 +120,9 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h if (sscreen->debug_flags & DBG(IB)) 
si_print_current_ib(ctx, stderr); + if (sscreen->context_roll_log_filename) + si_gather_context_rolls(ctx); + if (ctx->is_noop) flags |= RADEON_FLUSH_NOOP; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a831e4440c1..264474f4e0f 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -1192,6 +1192,7 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, &sscreen->pa_sc_raster_config_1, &sscreen->se_tile_repeat); } + sscreen->context_roll_log_filename = debug_get_option("AMD_ROLLS", NULL); sscreen->debug_flags = debug_get_flags_option("R600_DEBUG", radeonsi_debug_options, 0); sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG", radeonsi_debug_options, 0); test_flags = debug_get_flags_option("AMD_TEST", test_options, 0); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 83e18a1f1c2..5c3b31842f2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -591,6 +591,7 @@ struct si_screen { /* Whether shaders are monolithic (1-part) or separate (3-part). */ bool use_monolithic_shaders; bool record_llvm_ir; + const char *context_roll_log_filename; struct slab_parent_pool pool_transfers; @@ -1526,6 +1527,7 @@ void si_cp_copy_data(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned void si_init_cp_reg_shadowing(struct si_context *sctx); /* si_debug.c */ +void si_gather_context_rolls(struct si_context *sctx); void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, bool get_buffer_list); void si_clear_saved_cs(struct radeon_saved_cs *saved);