Module: Mesa
Branch: main
Commit: 0cc4aca34558f434d09b15dceb19b53d18c7e395
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=0cc4aca34558f434d09b15dceb19b53d18c7e395
Author: Connor Abbott <[email protected]>
Date: Fri Dec 17 17:51:36 2021 +0100

ir3: Use new (sy)/(ss) stall helpers in the compiler

This fixes a few bad assumptions in the pre-RA and post-RA scheduler,
for example that (sy) is only for texture instructions and (ss) is only
for SFU instructions and (sy) and (ss) producers will always take the
same number of cycles. This means we now start doing latency hiding for
cat6 instructions like ldib and ldc. It also should make us hide latency
more aggressively, since the number used for (sy) stall cycles was way
lower than the real numbers for everything except ldc. Finally it
unifies the various places (ss) soft nops were calculated.

selected shader-db results:

total nops in shared programs: 345278 -> 358959 (3.96%)
nops in affected programs: 215622 -> 229303 (6.34%)
helped: 690
HURT: 2430
helped stats (abs) min: 1 max: 125 x̄: 11.40 x̃: 5
helped stats (rel) min: 0.53% max: 100.00% x̄: 24.19% x̃: 18.52%
HURT stats (abs) min: 1 max: 501 x̄: 8.87 x̃: 5
HURT stats (rel) min: 0.00% max: 9900.00% x̄: 52.36% x̃: 14.29%
95% mean confidence interval for nops value: 3.78 4.99
95% mean confidence interval for nops %-change: 28.21% 42.66%
Nops are HURT.

total mov in shared programs: 75049 -> 74110 (-1.25%)
mov in affected programs: 15754 -> 14815 (-5.96%)
helped: 566
HURT: 455
helped stats (abs) min: 1 max: 36 x̄: 4.52 x̃: 3
helped stats (rel) min: 0.83% max: 100.00% x̄: 35.85% x̃: 30.00%
HURT stats (abs) min: 1 max: 35 x̄: 3.55 x̃: 3
HURT stats (rel) min: 0.00% max: 1100.00% x̄: 63.60% x̃: 25.00%
95% mean confidence interval for mov value: -1.25 -0.58
95% mean confidence interval for mov %-change: 2.92% 14.02%
Inconclusive result (value mean confidence interval and %-change mean confidence interval disagree).
total last-baryf in shared programs: 80468 -> 67670 (-15.90%)
last-baryf in affected programs: 63676 -> 50878 (-20.10%)
helped: 309
HURT: 147
helped stats (abs) min: 1 max: 260 x̄: 49.20 x̃: 24
helped stats (rel) min: 0.60% max: 98.81% x̄: 37.92% x̃: 40.91%
HURT stats (abs) min: 1 max: 115 x̄: 16.35 x̃: 12
HURT stats (rel) min: 0.96% max: 1933.33% x̄: 45.55% x̃: 7.89%
95% mean confidence interval for last-baryf value: -33.03 -23.10
95% mean confidence interval for last-baryf %-change: -21.52% -0.50%
Last-baryf are helped.

total sstall in shared programs: 133997 -> 126398 (-5.67%)
sstall in affected programs: 86866 -> 79267 (-8.75%)
helped: 1893
HURT: 598
helped stats (abs) min: 1 max: 77 x̄: 6.06 x̃: 4
helped stats (rel) min: 0.71% max: 100.00% x̄: 32.82% x̃: 16.67%
HURT stats (abs) min: 1 max: 65 x̄: 6.47 x̃: 6
HURT stats (rel) min: 0.00% max: 900.00% x̄: 65.51% x̃: 25.00%
95% mean confidence interval for sstall value: -3.39 -2.71
95% mean confidence interval for sstall %-change: -12.19% -6.24%
Sstall are helped.

total systall in shared programs: 350304 -> 288234 (-17.72%)
systall in affected programs: 234855 -> 172785 (-26.43%)
helped: 1456
HURT: 260
helped stats (abs) min: 1 max: 574 x̄: 46.42 x̃: 27
helped stats (rel) min: 0.19% max: 100.00% x̄: 39.43% x̃: 36.06%
HURT stats (abs) min: 1 max: 757 x̄: 21.20 x̃: 8
HURT stats (rel) min: 0.00% max: 180.95% x̄: 24.82% x̃: 12.50%
95% mean confidence interval for systall value: -39.31 -33.03
95% mean confidence interval for systall %-change: -31.49% -27.90%
Systall are helped.
total waves in shared programs: 236732 -> 235142 (-0.67%) waves in affected programs: 6142 -> 4552 (-25.89%) helped: 535 HURT: 17 helped stats (abs) min: 2 max: 8 x̄: 3.08 x̃: 2 helped stats (rel) min: 12.50% max: 75.00% x̄: 28.78% x̃: 25.00% HURT stats (abs) min: 2 max: 6 x̄: 3.53 x̃: 4 HURT stats (rel) min: 16.67% max: 75.00% x̄: 37.35% x̃: 33.33% 95% mean confidence interval for waves value: -3.04 -2.72 95% mean confidence interval for waves %-change: -28.10% -25.39% Waves are helped. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14246> --- src/freedreno/ir3/ir3_delay.c | 19 +++---------------- src/freedreno/ir3/ir3_postsched.c | 14 +++++++------- src/freedreno/ir3/ir3_sched.c | 18 +++++++++--------- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c index 71617aee90f..83730b5132e 100644 --- a/src/freedreno/ir3/ir3_delay.c +++ b/src/freedreno/ir3/ir3_delay.c @@ -30,19 +30,6 @@ */ #define MAX_NOPS 6 -/* The soft delay for approximating the cost of (ss). On a6xx, it takes the - * number of delay slots to get a SFU result back (ie. using nop's instead of - * (ss) is: - * - * 8 - single warp - * 9 - two warps - * 10 - four warps - * - * and so on. Not quite sure where it tapers out (ie. how many warps share an - * SFU unit). But 10 seems like a reasonable # to choose: - */ -#define SOFT_SS_NOPS 10 - /* * Helpers to figure out the necessary delay slots between instructions. 
Used * both in scheduling pass(es) and the final pass to insert any required nop's @@ -76,11 +63,11 @@ ir3_delayslots(struct ir3_instruction *assigner, if (writes_addr0(assigner) || writes_addr1(assigner)) return 6; - if (soft && is_sfu(assigner)) - return SOFT_SS_NOPS; + if (soft && is_ss_producer(assigner)) + return soft_ss_delay(assigner); /* handled via sync flags: */ - if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner)) + if (is_ss_producer(assigner) || is_sy_producer(assigner)) return 0; /* As far as we know, shader outputs don't need any delay. */ diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c index 43a6223ee0a..402262053c9 100644 --- a/src/freedreno/ir3/ir3_postsched.c +++ b/src/freedreno/ir3/ir3_postsched.c @@ -140,16 +140,16 @@ schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH)) return; - if (is_sfu(instr)) { - ctx->sfu_delay = 8; + if (is_ss_producer(instr)) { + ctx->sfu_delay = soft_ss_delay(instr); } else if (has_sfu_src(instr)) { ctx->sfu_delay = 0; } else if (ctx->sfu_delay > 0) { ctx->sfu_delay--; } - if (is_tex_or_prefetch(instr)) { - ctx->tex_delay = 10; + if (is_sy_producer(instr)) { + ctx->tex_delay = soft_sy_delay(instr, ctx->block->shader); } else if (has_tex_src(instr)) { ctx->tex_delay = 0; } else if (ctx->tex_delay > 0) { @@ -261,7 +261,7 @@ choose_instr(struct ir3_postsched_ctx *ctx) if (d > 0) continue; - if (!(is_sfu(n->instr) || is_tex(n->instr))) + if (!(is_ss_producer(n->instr) || is_sy_producer(n->instr))) continue; if (!chosen || (chosen->max_delay < n->max_delay)) @@ -403,9 +403,9 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state, unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true); d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n); node->delay = MAX2(node->delay, d_soft); - if (is_tex_or_prefetch(dep->instr)) + if (is_sy_producer(dep->instr)) 
node->has_tex_src = true; - if (is_sfu(dep->instr)) + if (is_ss_producer(dep->instr)) node->has_sfu_src = true; } diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c index f26de995044..98c9c69c105 100644 --- a/src/freedreno/ir3/ir3_sched.c +++ b/src/freedreno/ir3/ir3_sched.c @@ -212,7 +212,7 @@ static bool is_outstanding_tex_or_prefetch(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx) { - if (!is_tex_or_prefetch(instr)) + if (!is_sy_producer(instr)) return false; /* The sched node is only valid within the same block, we cannot @@ -228,7 +228,7 @@ is_outstanding_tex_or_prefetch(struct ir3_instruction *instr, static bool is_outstanding_sfu(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx) { - if (!is_sfu(instr)) + if (!is_ss_producer(instr)) return false; /* The sched node is only valid within the same block, we cannot @@ -330,8 +330,8 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) unsigned cycles = cycle_count(instr); - if (is_sfu(instr)) { - ctx->sfu_delay = 8; + if (is_ss_producer(instr)) { + ctx->sfu_delay = soft_ss_delay(instr); n->sfu_index = ctx->sfu_index++; } else if (!is_meta(instr) && sched_check_src_cond(instr, is_outstanding_sfu, ctx)) { @@ -341,13 +341,13 @@ schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) ctx->sfu_delay -= MIN2(cycles, ctx->sfu_delay); } - if (is_tex_or_prefetch(instr)) { + if (is_sy_producer(instr)) { /* NOTE that this isn't an attempt to hide texture fetch latency, * but an attempt to hide the cost of switching to another warp. * If we can, we'd like to try to schedule another texture fetch * before scheduling something that would sync. 
*/ - ctx->tex_delay = 10; + ctx->tex_delay = soft_sy_delay(instr, ctx->block->shader); assert(ctx->remaining_tex > 0); ctx->remaining_tex--; n->tex_index = ctx->tex_index++; @@ -607,10 +607,10 @@ should_defer(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr) * and prevents unacceptably large increases in register pressure from too * many outstanding texture instructions. */ - if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_tex(instr)) + if (ctx->tex_index - ctx->first_outstanding_tex_index >= 8 && is_sy_producer(instr)) return true; - if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_sfu(instr)) + if (ctx->sfu_index - ctx->first_outstanding_sfu_index >= 8 && is_ss_producer(instr)) return true; return false; @@ -1179,7 +1179,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block) foreach_instr_safe (instr, &ctx->unscheduled_list) { if (is_kill_or_demote(instr)) ctx->remaining_kills++; - if (is_tex_or_prefetch(instr)) + if (is_sy_producer(instr)) ctx->remaining_tex++; }
