From: Marek Olšák <marek.ol...@amd.com>

Now draw calls from multiple IBs can be executed in parallel.
v2: do emit partial flushes on SI
v3: invalidate all shader caches at the beginning of IBs
v4: squash with the AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE change,
    don't call si_emit_cache_flush in si_flush_gfx_cs if not needed,
    only do this for flushes invoked internally

If we artificially limit the number of draw calls per IB to 5, we'll get
a lot more IBs, leading to a lot more partial flushes. Let's see how the
removal of partial flushes changes GPU utilization in that scenario:

With partial flushes (time busy):
    CP: 99%
    SPI: 86%
    CB: 73%

Without partial flushes (time busy):
    CP: 99%
    SPI: 93%
    CB: 81%
---
 src/gallium/drivers/radeon/radeon_winsys.h      |  7 ++++
 src/gallium/drivers/radeonsi/si_buffer.c        |  6 ++--
 src/gallium/drivers/radeonsi/si_dma_cs.c        |  2 +-
 src/gallium/drivers/radeonsi/si_fence.c         |  5 ++-
 src/gallium/drivers/radeonsi/si_gfx_cs.c        | 48 +++++++++++++++++--------
 src/gallium/drivers/radeonsi/si_pipe.h          |  2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  4 +--
 src/gallium/drivers/radeonsi/si_texture.c       |  2 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 12 ++++---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c       | 36 +++++++++++++------
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 12 ++++---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c   |  3 +-
 12 files changed, 96 insertions(+), 43 deletions(-)
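As a reading aid before the diff: the patch adds two flush flags to
radeon_winsys.h and threads them through the driver-internal flush calls.
The following is a minimal standalone sketch of how the bits compose and
what they mean; it is not part of the patch, the PIPE_FLUSH_ASYNC value is
a placeholder, and check_flags()/main() exist only for this illustration.

   /* Illustrative sketch only -- not part of the patch. */
   #include <assert.h>

   #define PIPE_FLUSH_ASYNC                         (1u << 3)  /* placeholder bit */
   #define RADEON_FLUSH_START_NEXT_GFX_IB_NOW       (1u << 31)
   #define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \
      (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW)

   /* Driver-internal flushes pass the combined macro, so the IB is submitted
    * asynchronously and the next IB may start drawing immediately. */
   static void check_flags(unsigned flags)
   {
      if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW) {
         /* No end-of-IB partial flush is required (where the chip allows it);
          * caches are invalidated at the beginning of the next IB instead. */
      } else {
         /* A flush requested without this bit still gets the PS/CS partial
          * flushes before the IB ends. */
      }
   }

   int main(void)
   {
      check_flags(RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW);
      assert(RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW & PIPE_FLUSH_ASYNC);
      return 0;
   }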
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 157b2e40550..fae4fb7a95d 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -21,20 +21,27 @@
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #ifndef RADEON_WINSYS_H
 #define RADEON_WINSYS_H
 
 /* The public winsys interface header for the radeon driver. */
 
+/* Whether the next IB can start immediately and not wait for draws and
+ * dispatches from the current IB to finish. */
+#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW      (1u << 31)
+
+#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \
+   (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW)
+
 #include "pipebuffer/pb_buffer.h"
 
 #include "amd/common/ac_gpu_info.h"
 #include "amd/common/ac_surface.h"
 
 /* Tiling flags. */
 enum radeon_bo_layout {
     RADEON_LAYOUT_LINEAR = 0,
     RADEON_LAYOUT_TILED,
     RADEON_LAYOUT_SQUARETILED,
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 1420702d8d4..d17b2c6a831 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -57,24 +57,24 @@ void *si_buffer_map_sync_with_rings(struct si_context *sctx,
     if (!(usage & PIPE_TRANSFER_WRITE)) {
         /* have to wait for the last write */
         rusage = RADEON_USAGE_WRITE;
     }
 
     if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
         sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
                                           resource->buf, rusage)) {
         if (usage & PIPE_TRANSFER_DONTBLOCK) {
-            si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+            si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
             return NULL;
         } else {
-            si_flush_gfx_cs(sctx, 0, NULL);
+            si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
             busy = true;
         }
     }
     if (radeon_emitted(sctx->dma_cs, 0) &&
         sctx->ws->cs_is_buffer_referenced(sctx->dma_cs,
                                           resource->buf, rusage)) {
         if (usage & PIPE_TRANSFER_DONTBLOCK) {
             si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
             return NULL;
         } else {
@@ -718,21 +718,21 @@ static bool si_resource_commit(struct pipe_context *pctx,
     /*
      * Since buffer commitment changes cannot be pipelined, we need to
      * (a) flush any pending commands that refer to the buffer we're about
      *     to change, and
      * (b) wait for threaded submit to finish, including those that were
      *     triggered by some other, earlier operation.
      */
     if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
         ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
                                          res->buf, RADEON_USAGE_READWRITE)) {
-        si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
     }
     if (radeon_emitted(ctx->dma_cs, 0) &&
         ctx->ws->cs_is_buffer_referenced(ctx->dma_cs,
                                          res->buf, RADEON_USAGE_READWRITE)) {
         si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
     }
 
     ctx->ws->cs_sync_flush(ctx->dma_cs);
     ctx->ws->cs_sync_flush(ctx->gfx_cs);
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 7af7c5623b7..1eefaeb6ad5 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -51,21 +51,21 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
     }
 
     /* Flush the GFX IB if DMA depends on it. */
     if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
         ((dst &&
           ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
                                            RADEON_USAGE_READWRITE)) ||
          (src &&
           ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
                                            RADEON_USAGE_WRITE))))
-        si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
     /* Flush if there's not enough space, or if the memory usage per IB
      * is too large.
      *
      * IBs using too little memory are limited by the IB submission overhead.
      * IBs using too much memory are limited by the kernel/TTM overhead.
      * Too long IBs create CPU-GPU pipeline bubbles and add latency.
      *
      * This heuristic makes sure that DMA requests are executed
      * very soon after the call is made and lowers memory usage.
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index 26d6c43b34d..19fcb96041f 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -367,21 +367,24 @@ static boolean si_fence_finish(struct pipe_screen *screen,
          *    * and the calls to ClientWaitSync and FenceSync were
          *      issued from the same context,
          *
          *    then the GL will behave as if the equivalent of Flush
          *    were inserted immediately after the creation of sync."
          *
          * This means we need to flush for such fences even when we're
          * not going to wait.
          */
         threaded_context_unwrap_sync(ctx);
-        si_flush_gfx_cs(sctx, timeout ? 0 : PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(sctx,
+                        (timeout ? 0 : PIPE_FLUSH_ASYNC) |
+                        RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+                        NULL);
         rfence->gfx_unflushed.ctx = NULL;
 
         if (!timeout)
             return false;
 
         /* Recompute the timeout after all that. */
         if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
             int64_t time = os_time_get_nano();
             timeout = abs_timeout > time ? abs_timeout - time : 0;
         }
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 2d5e510b19e..9fb5595f454 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -40,35 +40,35 @@ void si_need_gfx_cs_space(struct si_context *ctx)
      */
 
     /* There are two memory usage counters in the winsys for all buffers
      * that have been added (cs_add_buffer) and two counters in the pipe
      * driver for those that haven't been added yet.
      */
     if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
                                                ctx->vram, ctx->gtt))) {
         ctx->gtt = 0;
         ctx->vram = 0;
-        si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
         return;
     }
     ctx->gtt = 0;
     ctx->vram = 0;
 
     /* If the IB is sufficiently large, don't count the space needed
      * and just flush if there is not enough space left.
      *
      * Also reserve space for stopping queries at the end of IB, because
      * the number of active queries is mostly unlimited. */
     unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
     if (!ctx->ws->cs_check_space(cs, need_dwords))
-        si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
                      struct pipe_fence_handle **fence)
 {
     struct radeon_winsys_cs *cs = ctx->gfx_cs;
     struct radeon_winsys *ws = ctx->ws;
 
     if (ctx->gfx_flush_in_progress)
         return;
@@ -96,27 +96,36 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
     if (!LIST_IS_EMPTY(&ctx->active_queries))
         si_suspend_queries(ctx);
 
     ctx->streamout.suspended = false;
     if (ctx->streamout.begin_emitted) {
         si_emit_streamout_end(ctx);
         ctx->streamout.suspended = true;
     }
 
-    ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                  SI_CONTEXT_PS_PARTIAL_FLUSH;
-
-    /* DRM 3.1.0 doesn't flush TC for VI correctly. */
-    if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1)
-        ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 |
-                      SI_CONTEXT_INV_VMEM_L1;
+    if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) {
+        /* DRM 3.1.0 doesn't flush TC for VI correctly. */
+        ctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                      SI_CONTEXT_CS_PARTIAL_FLUSH |
+                      SI_CONTEXT_INV_GLOBAL_L2;
+        si_emit_cache_flush(ctx);
+    } else if (ctx->chip_class == SI) {
+        /* The kernel flushes L2 before shaders are finished. */
+        ctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                      SI_CONTEXT_CS_PARTIAL_FLUSH;
+        si_emit_cache_flush(ctx);
+    } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
+        ctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                      SI_CONTEXT_CS_PARTIAL_FLUSH;
+        si_emit_cache_flush(ctx);
+    }
 
     /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
      * because the kernel doesn't wait for it. */
     if (ctx->chip_class >= CIK)
         si_cp_dma_wait_for_idle(ctx);
 
     if (ctx->current_saved_cs) {
         si_trace_emit(ctx);
         si_log_hw_flush(ctx);
@@ -180,26 +189,35 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)
     radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
                               RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 }
 
 void si_begin_new_gfx_cs(struct si_context *ctx)
 {
     if (ctx->is_debug)
         si_begin_gfx_cs_debug(ctx);
 
-    /* Flush read caches at the beginning of CS not flushed by the kernel. */
-    if (ctx->chip_class >= CIK)
-        ctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
-                      SI_CONTEXT_INV_ICACHE;
-
-    ctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
+    /* Always invalidate caches at the beginning of IBs, because external
+     * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
+     * buffers.
+     *
+     * Note that the cache flush done by the kernel at the end of GFX IBs
+     * isn't useful here, because that flush can finish after the following
+     * IB starts drawing.
+     *
+     * TODO: Do we also need to invalidate CB & DB caches?
+     */
+    ctx->flags |= SI_CONTEXT_INV_ICACHE |
+                  SI_CONTEXT_INV_SMEM_L1 |
+                  SI_CONTEXT_INV_VMEM_L1 |
+                  SI_CONTEXT_INV_GLOBAL_L2 |
+                  SI_CONTEXT_START_PIPELINE_STATS;
 
     /* set all valid group as dirty so they get reemited on
      * next draw command */
     si_pm4_reset_emitted(ctx);
 
     /* The CS initialization should be emitted before everything else. */
     si_pm4_emit(ctx, ctx->init_config);
     if (ctx->init_config_gs_rings)
         si_pm4_emit(ctx, ctx->init_config_gs_rings);
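The si_gfx_cs.c hunks above are the core of the change. A condensed view of
the end-of-IB policy they implement may help while reading the rest of the
diff. This is only a reading aid: the enum, struct, and the
emit_partial_flushes()/invalidate_l2() helpers below are hypothetical
stand-ins, not functions or types from the driver.

   /* Reading aid only -- a condensed restatement of si_flush_gfx_cs() above. */
   enum chip { CHIP_SI, CHIP_VI, CHIP_OTHER };

   struct flush_ctx {
      enum chip chip;
      int drm_minor;
   };

   static void emit_partial_flushes(void) { /* PS_PARTIAL_FLUSH + CS_PARTIAL_FLUSH */ }
   static void invalidate_l2(void)        { /* INV_GLOBAL_L2 */ }

   static void end_of_ib_flush(const struct flush_ctx *ctx, unsigned flags,
                               unsigned start_next_ib_now_bit)
   {
      if (ctx->chip == CHIP_VI && ctx->drm_minor <= 1) {
         /* DRM 3.1.0 doesn't flush TC for VI correctly: wait and invalidate L2. */
         emit_partial_flushes();
         invalidate_l2();
      } else if (ctx->chip == CHIP_SI) {
         /* The kernel flushes L2 before shaders are finished, so wait for them. */
         emit_partial_flushes();
      } else if (!(flags & start_next_ib_now_bit)) {
         /* Only flushes requested without the new flag still wait for
          * draws/dispatches at the end of the IB. */
         emit_partial_flushes();
      }
      /* Otherwise: no partial flush; the next IB invalidates caches at its
       * beginning instead (see si_begin_new_gfx_cs above). */
   }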
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 0c90a6c6e46..0da947504b7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1320,19 +1320,19 @@ static inline void
 radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
                                         struct r600_resource *rbo,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_priority priority,
                                         bool check_mem)
 {
     if (check_mem &&
         !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
                                       sctx->vram + rbo->vram_usage,
                                       sctx->gtt + rbo->gart_usage))
-        si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
     radeon_add_to_buffer_list(sctx, sctx->gfx_cs, rbo, usage, priority);
 }
 
 #define PRINT_ERR(fmt, args...) \
     fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 7e1660415f5..67ab75bbd2d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2767,21 +2767,21 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
     si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
     sctx->init_config_gs_rings = pm4;
 
     if (!sctx->init_config_has_vgt_flush) {
         si_init_config_add_vgt_flush(sctx);
         si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
     }
 
     /* Flush the context to re-emit both init_config states. */
     sctx->initial_gfx_cs_size = 0; /* force flush */
-    si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+    si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
     /* Set ring bindings. */
     if (sctx->esgs_ring) {
         assert(sctx->chip_class <= VI);
         si_set_ring_buffer(sctx, SI_ES_RING_ESGS, sctx->esgs_ring, 0,
                            sctx->esgs_ring->width0, true, true, 4, 64, 0);
         si_set_ring_buffer(sctx, SI_GS_RING_ESGS, sctx->esgs_ring, 0,
                            sctx->esgs_ring->width0, false, false, 0, 0, 0);
@@ -3044,21 +3044,21 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
                        factor_va >> 8);
         si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
                        sctx->screen->vgt_hs_offchip_param);
     }
 
     /* Flush the context to re-emit the init_config state.
      * This is done only once in a lifetime of a context.
      */
     si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
     sctx->initial_gfx_cs_size = 0; /* force flush */
-    si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+    si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
 /**
  * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
  * VS passes its outputs to TES directly, so the fixed-function shader only
  * has to write TESSOUTER and TESSINNER.
  */
 static void si_generate_fixed_func_tcs(struct si_context *sctx)
 {
     struct ureg_src outer, inner;
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index 1f0de5e71ec..8964c6b730c 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -1862,21 +1862,21 @@ static void si_texture_transfer_unmap(struct pipe_context *ctx,
      * The idea is that we don't want to build IBs that use too much
      * memory and put pressure on the kernel memory manager and we also
      * want to make temporary and invalidated buffers go idle ASAP to
      * decrease the total memory usage or make them reusable. The memory
      * usage will be slightly higher than given here because of the buffer
      * cache in the winsys.
      *
      * The result is that the kernel memory manager is never a bottleneck.
      */
     if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
-        si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
         sctx->num_alloc_tex_transfer_bytes = 0;
     }
 
     pipe_resource_reference(&transfer->resource, NULL);
     FREE(transfer);
 }
 
 static const struct u_resource_vtbl si_texture_vtbl =
 {
     NULL,                           /* get_handle */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 22b5a73143d..9b6d6e83032 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -232,31 +232,33 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
       if (!(usage & PIPE_TRANSFER_WRITE)) {
          /* Mapping for read.
          *
          * Since we are mapping for read, we don't need to wait
          * if the GPU is using the buffer for read too
          * (neither one is changing it).
          *
         * Only check whether the buffer is being used for write. */
         if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                            RADEON_USAGE_WRITE)) {
-           cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+           cs->flush_cs(cs->flush_data,
+                        RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
            return NULL;
         }
 
         if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
                             RADEON_USAGE_WRITE)) {
            return NULL;
         }
      } else {
         if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
-           cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+           cs->flush_cs(cs->flush_data,
+                        RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
            return NULL;
         }
 
         if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
                             RADEON_USAGE_READWRITE)) {
            return NULL;
         }
      }
   } else {
      uint64_t time = os_time_get_nano();
@@ -265,35 +267,37 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
         /* Mapping for read.
          *
          * Since we are mapping for read, we don't need to wait
          * if the GPU is using the buffer for read too
          * (neither one is changing it).
          *
          * Only check whether the buffer is being used for write. */
         if (cs) {
            if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                         RADEON_USAGE_WRITE)) {
-              cs->flush_cs(cs->flush_data, 0, NULL);
+              cs->flush_cs(cs->flush_data,
+                           RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
            } else {
               /* Try to avoid busy-waiting in amdgpu_bo_wait. */
               if (p_atomic_read(&bo->num_active_ioctls))
                  amdgpu_cs_sync_flush(rcs);
            }
         }
 
         amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                        RADEON_USAGE_WRITE);
      } else {
         /* Mapping for write. */
         if (cs) {
            if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
-              cs->flush_cs(cs->flush_data, 0, NULL);
+              cs->flush_cs(cs->flush_data,
+                           RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
            } else {
               /* Try to avoid busy-waiting in amdgpu_bo_wait. */
               if (p_atomic_read(&bo->num_active_ioctls))
                  amdgpu_cs_sync_flush(rcs);
            }
         }
 
         amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                        RADEON_USAGE_READWRITE);
      }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index a3feeb93026..eb050b8fdb2 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -26,20 +26,24 @@
  * of the Software.
  */
 
 #include "amdgpu_cs.h"
 #include "util/os_time.h"
 #include <inttypes.h>
 #include <stdio.h>
 
 #include "amd/common/sid.h"
 
+#ifndef AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE
+#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
+#endif
+
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 
 /* FENCES */
 
 static struct pipe_fence_handle *
 amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
                     unsigned ip_instance, unsigned ring)
 {
    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
@@ -801,56 +805,68 @@ static void amdgpu_set_ib_size(struct amdgpu_ib *ib)
 }
 
 static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
 {
    amdgpu_set_ib_size(ib);
    ib->used_ib_space += ib->base.current.cdw * 4;
    ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_start_alignment);
    ib->max_ib_size = MAX2(ib->max_ib_size, ib->base.prev_dw + ib->base.current.cdw);
 }
 
-static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
+static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws,
+                                   struct amdgpu_cs_context *cs,
                                    enum ring_type ring_type)
 {
    switch (ring_type) {
    case RING_DMA:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_DMA;
       break;
 
    case RING_UVD:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD;
       break;
 
    case RING_UVD_ENC:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_UVD_ENC;
       break;
 
    case RING_VCE:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCE;
       break;
 
-   case RING_COMPUTE:
-      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_COMPUTE;
-      break;
-
    case RING_VCN_DEC:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_DEC;
       break;
 
-  case RING_VCN_ENC:
+   case RING_VCN_ENC:
       cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_VCN_ENC;
       break;
 
-   default:
+   case RING_COMPUTE:
    case RING_GFX:
-      cs->ib[IB_MAIN].ip_type = AMDGPU_HW_IP_GFX;
+      cs->ib[IB_MAIN].ip_type = ring_type == RING_GFX ? AMDGPU_HW_IP_GFX :
+                                                        AMDGPU_HW_IP_COMPUTE;
+
+      /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache
+       * invalidation is the beginning of IBs (the previous commit does that),
+       * because completion of an IB doesn't care about the state of GPU caches,
+       * but the beginning of an IB does. Draw calls from multiple IBs can be
+       * executed in parallel, so draw calls from the current IB can finish after
+       * the next IB starts drawing, and so the cache flush at the end of IB
+       * is always late.
+       */
+      if (ws->info.drm_minor >= 26)
+         cs->ib[IB_MAIN].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
       break;
+
+   default:
+      assert(0);
    }
 
    memset(cs->buffer_indices_hashlist, -1,
           sizeof(cs->buffer_indices_hashlist));
    cs->last_added_bo = NULL;
 
    return true;
 }
 
 static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
 {
    unsigned i;
@@ -918,26 +934,26 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
    cs->flush_data = flush_ctx;
    cs->ring_type = ring_type;
 
    struct amdgpu_cs_fence_info fence_info;
    fence_info.handle = cs->ctx->user_fence_bo;
    fence_info.offset = cs->ring_type;
    amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk);
 
    cs->main.ib_type = IB_MAIN;
 
-   if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) {
+   if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) {
       FREE(cs);
       return NULL;
    }
 
-   if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) {
+   if (!amdgpu_init_cs_context(ctx->ws, &cs->csc2, ring_type)) {
       amdgpu_destroy_cs_context(&cs->csc1);
       FREE(cs);
       return NULL;
    }
 
    /* Set the first submission context as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
 
    if (!amdgpu_get_new_ib(&ctx->ws->base, cs, IB_MAIN)) {
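The amdgpu_cs.c hunk above guards the new kernel IB flag with a local #define
so the winsys still builds against older amdgpu_drm.h headers, and only sets
the flag when the running DRM is new enough (drm_minor >= 26). A small
self-contained illustration of that pattern follows; struct ib_info and
query_drm_minor() are hypothetical stand-ins, not libdrm or winsys API.

   /* Illustration of the fallback-#define pattern used above. */
   #include <stdint.h>

   /* If the installed header is too old to define the flag, provide it
    * locally; the kernel only honors it when the running DRM is new enough. */
   #ifndef AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE
   #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
   #endif

   struct ib_info {
      uint32_t flags;
   };

   static int query_drm_minor(void) { return 26; } /* stand-in for ws->info.drm_minor */

   static void init_gfx_ib(struct ib_info *ib)
   {
      /* Ask the kernel to write back TC but skip the L2/vL1 invalidation at
       * the end of the IB; the next IB invalidates caches at its beginning.
       * Only newer DRM understands the flag, so gate on the minor version. */
      if (query_drm_minor() >= 26)
         ib->flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
   }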
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 1617a2fe32e..6652977e586 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -509,60 +509,64 @@ static void *radeon_bo_map(struct pb_buffer *buf,
     if (usage & PIPE_TRANSFER_DONTBLOCK) {
         if (!(usage & PIPE_TRANSFER_WRITE)) {
             /* Mapping for read.
              *
              * Since we are mapping for read, we don't need to wait
              * if the GPU is using the buffer for read too
              * (neither one is changing it).
              *
              * Only check whether the buffer is being used for write. */
             if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
-                cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+                cs->flush_cs(cs->flush_data,
+                             RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                 return NULL;
             }
 
             if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
                                 RADEON_USAGE_WRITE)) {
                 return NULL;
             }
         } else {
             if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) {
-                cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+                cs->flush_cs(cs->flush_data,
+                             RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                 return NULL;
             }
 
             if (!radeon_bo_wait((struct pb_buffer*)bo, 0,
                                 RADEON_USAGE_READWRITE)) {
                 return NULL;
             }
         }
     } else {
         uint64_t time = os_time_get_nano();
 
         if (!(usage & PIPE_TRANSFER_WRITE)) {
             /* Mapping for read.
              *
              * Since we are mapping for read, we don't need to wait
              * if the GPU is using the buffer for read too
              * (neither one is changing it).
              *
              * Only check whether the buffer is being used for write. */
             if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
-                cs->flush_cs(cs->flush_data, 0, NULL);
+                cs->flush_cs(cs->flush_data,
+                             RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
             }
             radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_WRITE);
         } else {
             /* Mapping for write. */
             if (cs) {
                 if (radeon_bo_is_referenced_by_cs(cs, bo)) {
-                    cs->flush_cs(cs->flush_data, 0, NULL);
+                    cs->flush_cs(cs->flush_data,
+                                 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
                 } else {
                     /* Try to avoid busy-waiting in radeon_bo_wait. */
                     if (p_atomic_read(&bo->num_active_ioctls))
                         radeon_drm_cs_sync_flush(rcs);
                 }
             }
 
             radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_READWRITE);
         }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index a1975dff8df..9070464bec8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -400,21 +400,22 @@ static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
         unsigned i;
 
         for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
             p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
             radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
         }
         cs->csc->num_relocs = cs->csc->num_validated_relocs;
 
         /* Flush if there are any relocs. Clean up otherwise. */
         if (cs->csc->num_relocs) {
-            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+            cs->flush_cs(cs->flush_data,
+                         RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
         } else {
             radeon_cs_context_cleanup(cs->csc);
 
             cs->base.used_vram = 0;
             cs->base.used_gart = 0;
 
             assert(cs->base.current.cdw == 0);
             if (cs->base.current.cdw != 0) {
                 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
             }
         }
-- 
2.15.1
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev