Module: Mesa Branch: main Commit: e528823400eea0f752227407740033462d5cfcf3 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=e528823400eea0f752227407740033462d5cfcf3
Author: Pierre-Eric Pelloux-Prayer <[email protected]> Date: Fri Apr 7 09:48:02 2023 +0200 radeonsi: stop reporting reset to app once gpu recovery is done This way apps know they can recreate their contexts when the status goes back to NO_ERROR. This depends on new UAPI in the kernel; for older kernels, radeonsi will stop reporting a reset after 3 seconds. Apps will be able to create new contexts but they'll have to handle not being able to submit tasks. Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/7460 Reviewed-by: André Almeida <[email protected]> Reviewed-by: Marek Olšák <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22290> --- src/gallium/drivers/r600/r600_pipe_common.c | 2 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.c | 21 ++++++++++++++------- src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/include/winsys/radeon_winsys.h | 2 +- src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 16 +++++++++++++++- src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 6 +++++- 7 files changed, 38 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c index 17a5f15d725..ffd2efdd66a 100644 --- a/src/gallium/drivers/r600/r600_pipe_common.c +++ b/src/gallium/drivers/r600/r600_pipe_common.c @@ -489,7 +489,7 @@ static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; - return rctx->ws->ctx_query_reset_status(rctx->ctx, false, NULL); + return rctx->ws->ctx_query_reset_status(rctx->ctx, false, NULL, NULL); } static void r600_set_debug_callback(struct pipe_context *ctx, diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index be2d353169f..698d9841f31 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -80,7 +80,7 @@ void 
si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h * while si_get_reset_status can't. */ if (!(ctx->context_flags & SI_CONTEXT_FLAG_AUX) && ctx->device_reset_callback.reset) { - enum pipe_reset_status status = ctx->ws->ctx_query_reset_status(ctx->ctx, true, NULL); + enum pipe_reset_status status = ctx->ws->ctx_query_reset_status(ctx->ctx, true, NULL, NULL); if (status != PIPE_NO_RESET) ctx->device_reset_callback.reset(ctx->device_reset_callback.data, status); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index ab42506cda4..afd13ffcc1e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -385,13 +385,20 @@ static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx) if (sctx->context_flags & SI_CONTEXT_FLAG_AUX) return PIPE_NO_RESET; - bool needs_reset; - enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx, false, &needs_reset); + bool needs_reset, reset_completed; + enum pipe_reset_status status = sctx->ws->ctx_query_reset_status(sctx->ctx, false, + &needs_reset, &reset_completed); - if (status != PIPE_NO_RESET && needs_reset && !(sctx->context_flags & SI_CONTEXT_FLAG_AUX)) { - /* Call the gallium frontend to set a no-op API dispatch. */ - if (sctx->device_reset_callback.reset) { - sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status); + if (status != PIPE_NO_RESET) { + if (sctx->has_reset_been_notified && reset_completed) + return PIPE_NO_RESET; + + sctx->has_reset_been_notified = true; + + if (!(sctx->context_flags & SI_CONTEXT_FLAG_AUX)) { + /* Call the gallium frontend to set a no-op API dispatch. 
*/ + if (needs_reset && sctx->device_reset_callback.reset) + sctx->device_reset_callback.reset(sctx->device_reset_callback.data, status); } } return status; @@ -818,7 +825,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign struct si_context *saux = si_get_aux_context(sscreen); enum pipe_reset_status status = sctx->ws->ctx_query_reset_status( - saux->ctx, true, NULL); + saux->ctx, true, NULL, NULL); if (status != PIPE_NO_RESET) { /* We lost the aux_context, create a new one */ struct u_log_context *aux_log = (saux)->log; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index c9ff07247b8..29bc59f20db 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1218,6 +1218,7 @@ struct si_context { bool dpbb_force_off_profile_ps; bool vs_writes_viewport_index; bool vs_disables_clipping_viewport; + bool has_reset_been_notified; /* Precomputed IA_MULTI_VGT_PARAM */ union si_vgt_param_key ia_multi_vgt_param_key; diff --git a/src/gallium/include/winsys/radeon_winsys.h b/src/gallium/include/winsys/radeon_winsys.h index 46b9c96d844..6e9b01f8dc6 100644 --- a/src/gallium/include/winsys/radeon_winsys.h +++ b/src/gallium/include/winsys/radeon_winsys.h @@ -527,7 +527,7 @@ struct radeon_winsys { */ enum pipe_reset_status (*ctx_query_reset_status)(struct radeon_winsys_ctx *ctx, bool full_reset_only, - bool *needs_reset); + bool *needs_reset, bool *reset_completed); /** * Create a command stream. 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index b7e1b9c02c5..24a2bb021f3 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -349,13 +349,15 @@ static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx) static enum pipe_reset_status amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only, - bool *needs_reset) + bool *needs_reset, bool *reset_completed) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; int r; if (needs_reset) *needs_reset = false; + if (reset_completed) + *reset_completed = false; /* Return a failure due to a GPU hang. */ if (ctx->ws->info.drm_minor >= 24) { @@ -376,6 +378,18 @@ amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_o } if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) { + if (reset_completed) { + /* The ARB_robustness spec says: + * + * If a reset status other than NO_ERROR is returned and subsequent + * calls return NO_ERROR, the context reset was encountered and + * completed. If a reset status is repeatedly returned, the context may + * be in the process of resetting. 
+ * + */ + if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS)) + *reset_completed = true; + } if (needs_reset) *needs_reset = flags & AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST; if (flags & AMDGPU_CTX_QUERY2_FLAGS_GUILTY) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c index 45342a68095..69ce9777327 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c @@ -89,7 +89,7 @@ static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx) static enum pipe_reset_status radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only, - bool *needs_reset) + bool *needs_reset, bool *reset_completed) { struct radeon_ctx *ctx = (struct radeon_ctx*)rctx; @@ -98,11 +98,15 @@ radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_rese if (ctx->gpu_reset_counter == latest) { if (needs_reset) *needs_reset = false; + if (reset_completed) + *reset_completed = false; return PIPE_NO_RESET; } if (needs_reset) *needs_reset = true; + if (reset_completed) + *reset_completed = true; ctx->gpu_reset_counter = latest; return PIPE_UNKNOWN_CONTEXT_RESET;
