From: Marek Olšák <marek.ol...@amd.com> Also use assertions for the requirement that offset and size are multiples of 4. --- src/gallium/drivers/radeon/radeon_video.c | 3 +- src/gallium/drivers/radeonsi/cik_sdma.c | 41 ------------- src/gallium/drivers/radeonsi/si_cp_dma.c | 2 +- src/gallium/drivers/radeonsi/si_dma.c | 40 ------------ src/gallium/drivers/radeonsi/si_dma_cs.c | 61 ++++++++++++++++++- src/gallium/drivers/radeonsi/si_pipe.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 5 +- .../drivers/radeonsi/si_test_dma_perf.c | 2 +- 8 files changed, 66 insertions(+), 90 deletions(-)
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index 749f30c2306..a39ce4cc73e 100644 --- a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -113,22 +113,21 @@ error: si_vid_destroy_buffer(new_buf); *new_buf = old_buf; return false; } /* clear the buffer with zeros */ void si_vid_clear_buffer(struct pipe_context *context, struct rvid_buffer* buffer) { struct si_context *sctx = (struct si_context*)context; - sctx->dma_clear_buffer(sctx, &buffer->res->b.b, 0, - buffer->res->buf->size, 0); + si_sdma_clear_buffer(sctx, &buffer->res->b.b, 0, buffer->res->buf->size, 0); context->flush(context, NULL, 0); } /** * join surfaces into the same buffer with identical tiling params * sumup their sizes and replace the backend buffers with a single bo */ void si_vid_join_surfaces(struct si_context *sctx, struct pb_buffer** buffers[VL_NUM_COMPONENTS], struct radeon_surf *surfaces[VL_NUM_COMPONENTS]) diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 595f8d49a80..1c2fd0f7b1c 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -60,60 +60,20 @@ static void cik_sdma_copy_buffer(struct si_context *ctx, radeon_emit(cs, src_offset); radeon_emit(cs, src_offset >> 32); radeon_emit(cs, dst_offset); radeon_emit(cs, dst_offset >> 32); dst_offset += csize; src_offset += csize; size -= csize; } } -static void cik_sdma_clear_buffer(struct si_context *sctx, - struct pipe_resource *dst, - uint64_t offset, - uint64_t size, - unsigned clear_value) -{ - struct radeon_cmdbuf *cs = sctx->dma_cs; - unsigned i, ncopy, csize; - struct r600_resource *rdst = r600_resource(dst); - - if (!cs || offset % 4 != 0 || size % 4 != 0 || - dst->flags & PIPE_RESOURCE_FLAG_SPARSE) { - sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); - return; - } - - /* Mark the buffer range of destination as valid 
(initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&rdst->valid_buffer_range, offset, offset + size); - - offset += rdst->gpu_address; - - /* the same maximum size as for copying */ - ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); - si_need_dma_space(sctx, ncopy * 5, rdst, NULL); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); - radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, - 0x8000 /* dword copy */)); - radeon_emit(cs, offset); - radeon_emit(cs, offset >> 32); - radeon_emit(cs, clear_value); - radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize); - offset += csize; - size -= csize; - } -} - static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w) { width = u_minify(width, level); return DIV_ROUND_UP(width, blk_w); } static unsigned encode_tile_info(struct si_context *sctx, struct si_texture *tex, unsigned level, bool set_bpp) { @@ -547,12 +507,11 @@ static void cik_sdma_copy(struct pipe_context *ctx, return; fallback: si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); } void cik_init_sdma_functions(struct si_context *sctx) { sctx->dma_copy = cik_sdma_copy; - sctx->dma_clear_buffer = cik_sdma_clear_buffer; } diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 486ae75c77f..598d5ecf0dc 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -294,21 +294,21 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, * alone improves DeusEx:MD performance by 70%. */ (size > CP_DMA_CLEAR_PERF_THRESHOLD || /* Buffers not used by the GFX IB yet will be cleared by SDMA. * This happens to move most buffer clears to SDMA, including * DCC and CMASK clears, because pipe->clear clears them before * si_emit_framebuffer_state (in a draw call) adds them. 
* For example, DeusEx:MD has 21 buffer clears per frame and all * of them are moved to SDMA thanks to this. */ !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf, RADEON_USAGE_READWRITE))) { - sctx->dma_clear_buffer(sctx, dst, offset, dma_clear_size, value); + si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value); offset += dma_clear_size; size -= dma_clear_size; } else if (dma_clear_size >= 4) { si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value, coher, cache_policy); offset += dma_clear_size; size -= dma_clear_size; } diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index da5bd47b5dd..046d8445ce3 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -70,59 +70,20 @@ static void si_dma_copy_buffer(struct si_context *ctx, radeon_emit(cs, dst_offset); radeon_emit(cs, src_offset); radeon_emit(cs, (dst_offset >> 32UL) & 0xff); radeon_emit(cs, (src_offset >> 32UL) & 0xff); dst_offset += count; src_offset += count; size -= count; } } -static void si_dma_clear_buffer(struct si_context *sctx, - struct pipe_resource *dst, - uint64_t offset, - uint64_t size, - unsigned clear_value) -{ - struct radeon_cmdbuf *cs = sctx->dma_cs; - unsigned i, ncopy, csize; - struct r600_resource *rdst = r600_resource(dst); - - if (!cs || offset % 4 != 0 || size % 4 != 0 || - dst->flags & PIPE_RESOURCE_FLAG_SPARSE) { - sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); - return; - } - - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. 
*/ - util_range_add(&rdst->valid_buffer_range, offset, offset + size); - - offset += rdst->gpu_address; - - /* the same maximum size as for copying */ - ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); - si_need_dma_space(sctx, ncopy * 4, rdst, NULL); - - for (i = 0; i < ncopy; i++) { - csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); - radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, - csize / 4)); - radeon_emit(cs, offset); - radeon_emit(cs, clear_value); - radeon_emit(cs, (offset >> 32) << 16); - offset += csize; - size -= csize; - } -} - static void si_dma_copy_tile(struct si_context *ctx, struct pipe_resource *dst, unsigned dst_level, unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src, unsigned src_level, unsigned src_x, unsigned src_y, @@ -318,12 +279,11 @@ static void si_dma_copy(struct pipe_context *ctx, return; fallback: si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); } void si_init_dma_functions(struct si_context *sctx) { sctx->dma_copy = si_dma_copy; - sctx->dma_clear_buffer = si_dma_clear_buffer; } diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c index 7db9570af3c..ffa2f5ae69b 100644 --- a/src/gallium/drivers/radeonsi/si_dma_cs.c +++ b/src/gallium/drivers/radeonsi/si_dma_cs.c @@ -57,20 +57,79 @@ void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst, si_need_dma_space(sctx, 4, dst, NULL); si_dma_emit_wait_idle(sctx); radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); } +void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned clear_value) +{ + struct radeon_cmdbuf *cs = sctx->dma_cs; + unsigned i, ncopy, csize; + struct r600_resource *rdst = r600_resource(dst); + + assert(offset % 4 == 0); + assert(size); + assert(size % 4 
== 0); + + if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE) { + sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&rdst->valid_buffer_range, offset, offset + size); + + offset += rdst->gpu_address; + + if (sctx->chip_class == SI) { + /* the same maximum size as for copying */ + ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + si_need_dma_space(sctx, ncopy * 4, rdst, NULL); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, + csize / 4)); + radeon_emit(cs, offset); + radeon_emit(cs, clear_value); + radeon_emit(cs, (offset >> 32) << 16); + offset += csize; + size -= csize; + } + return; + } + + /* The following code is for CI, VI, Vega/Raven, etc. */ + /* the same maximum size as for copying */ + ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE); + si_need_dma_space(sctx, ncopy * 5, rdst, NULL); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); + radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, + 0x8000 /* dword copy */)); + radeon_emit(cs, offset); + radeon_emit(cs, offset >> 32); + radeon_emit(cs, clear_value); + radeon_emit(cs, sctx->chip_class >= GFX9 ? 
csize - 1 : csize); + offset += csize; + size -= csize; + } +} + void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src) { uint64_t vram = ctx->dma_cs->used_vram; uint64_t gtt = ctx->dma_cs->used_gart; if (dst) { vram += dst->vram_usage; gtt += dst->gart_usage; } @@ -163,14 +222,14 @@ void si_flush_dma_cs(struct si_context *ctx, unsigned flags, si_clear_saved_cs(&saved); } } void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value) { struct si_context *ctx = (struct si_context*)sscreen->aux_context; mtx_lock(&sscreen->aux_context_lock); - ctx->dma_clear_buffer(ctx, dst, offset, size, value); + si_sdma_clear_buffer(ctx, dst, offset, size, value); sscreen->aux_context->flush(sscreen->aux_context, NULL, 0); mtx_unlock(&sscreen->aux_context_lock); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index c259c260550..bce7b3f550e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -728,21 +728,21 @@ static void si_test_vmfault(struct si_screen *sscreen) } r600_resource(buf)->gpu_address = 0; /* cause a VM fault */ if (sscreen->debug_flags & DBG(TEST_VMFAULT_CP)) { si_copy_buffer(sctx, buf, buf, 0, 4, 4, 0, -1); ctx->flush(ctx, NULL, 0); puts("VM fault test: CP - done."); } if (sscreen->debug_flags & DBG(TEST_VMFAULT_SDMA)) { - sctx->dma_clear_buffer(sctx, buf, 0, 4, 0); + si_sdma_clear_buffer(sctx, buf, 0, 4, 0); ctx->flush(ctx, NULL, 0); puts("VM fault test: SDMA - done."); } if (sscreen->debug_flags & DBG(TEST_VMFAULT_SHADER)) { util_test_constant_buffer(ctx, buf); puts("VM fault test: Shader - done."); } exit(0); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index ef4f06f41d5..a6f09b65f74 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1022,23 
+1022,20 @@ struct si_context { /* Copy one resource to another using async DMA. */ void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src, unsigned src_level, const struct pipe_box *src_box); - void (*dma_clear_buffer)(struct si_context *sctx, struct pipe_resource *dst, - uint64_t offset, uint64_t size, unsigned value); - struct si_tracked_regs tracked_regs; }; /* cik_sdma.c */ void cik_init_sdma_functions(struct si_context *sctx); /* si_blit.c */ enum si_blitter_op /* bitmask */ { SI_SAVE_TEXTURES = 1, @@ -1152,20 +1149,22 @@ void si_init_debug_functions(struct si_context *sctx); void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring); bool si_replace_shader(unsigned num, struct ac_shader_binary *binary); /* si_dma.c */ void si_init_dma_functions(struct si_context *sctx); /* si_dma_cs.c */ void si_dma_emit_timestamp(struct si_context *sctx, struct r600_resource *dst, uint64_t offset); +void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned clear_value); void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src); void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence); void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value); /* si_fence.c */ void si_gfx_write_event_eop(struct si_context *ctx, unsigned event, unsigned event_flags, diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c index be2ad079e1a..f097a642999 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c @@ -184,21 +184,21 @@ void si_test_dma_perf(struct si_screen *sscreen) si_cp_dma_clear_buffer(sctx, 
dst, 0, size, clear_value, SI_COHERENCY_NONE, cache_policy); } } else if (test_sdma) { /* SDMA */ if (is_copy) { struct pipe_box box; u_box_1d(0, size, &box); sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box); } else { - sctx->dma_clear_buffer(sctx, dst, 0, size, clear_value); + si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); } } else { /* Compute */ /* The memory accesses are coalesced, meaning that the 1st instruction writes * the 1st contiguous block of data for the whole wave, the 2nd instruction * writes the 2nd contiguous block of data, etc. */ unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4); unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread; unsigned dwords_per_wave = cs_dwords_per_thread * 64; -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev