Module: Mesa
Branch: main
Commit: a19e46f5d05293bd933914a4645602593da8fb80
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=a19e46f5d05293bd933914a4645602593da8fb80
Author: Timur Kristóf <timur.kris...@gmail.com>
Date:   Fri Oct 6 01:29:01 2023 +0200

radv: Implement workaround for unaligned buffer/image copies.

When the pitch or slice pitch isn't properly aligned, the SDMA HW is
unable to copy between tiled images and buffers. To work around this,
we process the image chunk by chunk, copying the data to a temporary
buffer which uses supported pitches, and then copy it to the intended
destination.

The implementation assumes that at least one pixel row of the image
will fit into the temporary buffer, and will try to copy as many rows
at once as possible. Sadly, this still results in a lot of packets
being generated for large images. A possible future improvement is to
copy the image slice by slice when only the slice pitch is misaligned.
However, that is out of scope for this commit.

Signed-off-by: Timur Kristóf <timur.kris...@gmail.com>
Reviewed-by: Tatsuyuki Ishi <ishitatsuy...@gmail.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25831>

---
 src/amd/vulkan/meta/radv_meta_copy.c |   9 ++
 src/amd/vulkan/radv_private.h        |   6 ++
 src/amd/vulkan/radv_sdma.c           | 162 +++++++++++++++++++++++++++++++++++
 3 files changed, 177 insertions(+)
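To make the chunk sizing described above concrete, here is a minimal standalone
sketch of the arithmetic the patch uses to pick a power-of-two number of rows
per copy. The ALIGN/MIN2/MAX2 macros and the prev_power_of_two helper are
simplified stand-ins for Mesa's util macros, and the buffer size in the example
is hypothetical; only the formula itself comes from the patch below.

#include <assert.h>

/* Simplified stand-ins for Mesa's util macros (illustration only). */
#define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1))
#define MIN2(a, b)  ((a) < (b) ? (a) : (b))
#define MAX2(a, b)  ((a) > (b) ? (a) : (b))

/* Largest power of two that is <= v (for v >= 1); equivalent to the
 * patch's util_next_power_of_two(v + 1) / 2. */
static unsigned prev_power_of_two(unsigned v)
{
   unsigned p = 1;
   while (p * 2 <= v)
      p *= 2;
   return p;
}

/* Mirrors the sizing in radv_sdma_get_chunked_copy_info: rows are padded
 * to a 4-block pitch, then as many whole rows as possible are fitted into
 * the temporary buffer, rounded down to a power of two. */
static unsigned rows_per_copy(unsigned extent_horizontal_blocks, unsigned bpp,
                              unsigned extent_height, unsigned temp_bytes)
{
   const unsigned aligned_row_pitch = ALIGN(extent_horizontal_blocks, 4);
   const unsigned aligned_row_bytes = aligned_row_pitch * bpp;

   /* The workaround assumes at least one full row always fits. */
   const unsigned max_rows = MIN2(temp_bytes / aligned_row_bytes, extent_height);
   assert(max_rows);

   return MAX2(1, prev_power_of_two(max_rows));
}

For example, with a hypothetical 64 KiB temporary buffer and a 2048-texel-wide
RGBA8 copy (bpp = 4), each aligned row takes 8192 bytes, so at most 8 rows fit
and rows_per_copy() returns 8.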
diff --git a/src/amd/vulkan/meta/radv_meta_copy.c b/src/amd/vulkan/meta/radv_meta_copy.c
index 86a91c8b302..33eb5e99d26 100644
--- a/src/amd/vulkan/meta/radv_meta_copy.c
+++ b/src/amd/vulkan/meta/radv_meta_copy.c
@@ -115,6 +115,15 @@ transfer_copy_buffer_image(struct radv_cmd_buffer *cmd_buffer, struct radv_buffe
    radv_cs_add_buffer(device->ws, cs, image->bindings[0].bo);
    radv_cs_add_buffer(device->ws, cs, buffer->bo);
 
+   if (radv_sdma_use_unaligned_buffer_image_copy(device, image, buffer, region)) {
+      if (!alloc_transfer_temp_bo(cmd_buffer))
+         return;
+
+      radv_sdma_copy_buffer_image_unaligned(device, cs, image, buffer, region, cmd_buffer->transfer.copy_temp,
+                                            to_image);
+      return;
+   }
+
    radv_sdma_copy_buffer_image(device, cs, image, buffer, region, to_image);
 }
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 56002af0bab..16ec1d65f79 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -3136,6 +3136,12 @@ void radv_rra_trace_finish(VkDevice vk_device, struct radv_rra_trace_data *data)
 void radv_sdma_copy_buffer_image(const struct radv_device *device, struct radeon_cmdbuf *cs, struct radv_image *image,
                                  struct radv_buffer *buffer, const VkBufferImageCopy2 *region, bool to_image);
+bool radv_sdma_use_unaligned_buffer_image_copy(const struct radv_device *device, const struct radv_image *image,
+                                               const struct radv_buffer *buffer, const VkBufferImageCopy2 *region);
+void radv_sdma_copy_buffer_image_unaligned(const struct radv_device *device, struct radeon_cmdbuf *cs,
+                                           struct radv_image *image, struct radv_buffer *buffer,
+                                           const VkBufferImageCopy2 *region, struct radeon_winsys_bo *temp_bo,
+                                           bool to_image);
 void radv_sdma_copy_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, uint64_t src_va,
                            uint64_t dst_va, uint64_t size);
 
diff --git a/src/amd/vulkan/radv_sdma.c b/src/amd/vulkan/radv_sdma.c
index 70cd6334ddc..e3f4dcaf1d2 100644
--- a/src/amd/vulkan/radv_sdma.c
+++ b/src/amd/vulkan/radv_sdma.c
@@ -50,6 +50,17 @@ struct radv_sdma_tiled_info {
    unsigned blk_h;
 };
 
+struct radv_sdma_chunked_copy_info {
+   unsigned bpp;
+   unsigned blk_w;
+   unsigned blk_h;
+   unsigned row_pitch_alignment;
+   unsigned extent_horizontal_blocks;
+   unsigned extent_vertical_blocks;
+   unsigned aligned_row_pitch;
+   unsigned num_rows_per_copy;
+};
+
 ALWAYS_INLINE static void
 radv_sdma_check_pitches(const unsigned pitch, const unsigned slice_pitch, const unsigned bpp, const bool uses_depth)
 {
@@ -156,6 +167,42 @@ radv_sdma_pixel_area_to_blocks(const unsigned linear_slice_pitch, const unsigned
    return DIV_ROUND_UP(DIV_ROUND_UP(linear_slice_pitch, blk_w), blk_h);
 }
 
+static struct radv_sdma_chunked_copy_info
+radv_sdma_get_chunked_copy_info(const struct radv_device *const device, const struct radv_image *const image,
+                                const VkExtent3D extent)
+{
+   const struct radeon_surf *const surf = &image->planes[0].surface;
+
+   const unsigned bpp = surf->bpe;
+   const unsigned blk_w = surf->blk_w;
+   const unsigned blk_h = surf->blk_h;
+   const unsigned row_pitch_alignment = 4;
+   const unsigned extent_horizontal_blocks = DIV_ROUND_UP(extent.width, blk_w);
+   const unsigned extent_vertical_blocks = DIV_ROUND_UP(extent.height, blk_h);
+   const unsigned aligned_row_pitch = ALIGN(extent_horizontal_blocks, row_pitch_alignment);
+   const unsigned aligned_row_bytes = aligned_row_pitch * bpp;
+
+   /* Assume that we can always copy at least one full row at a time. */
+   const unsigned max_num_rows_per_copy = MIN2(RADV_SDMA_TRANSFER_TEMP_BYTES / aligned_row_bytes, extent.height);
+   assert(max_num_rows_per_copy);
+
+   /* Ensure that the number of rows copied at a time is a power of two. */
+   const unsigned num_rows_per_copy = MAX2(1, util_next_power_of_two(max_num_rows_per_copy + 1) / 2);
+
+   const struct radv_sdma_chunked_copy_info r = {
+      .bpp = bpp,
+      .blk_w = blk_w,
+      .blk_h = blk_h,
+      .row_pitch_alignment = row_pitch_alignment,
+      .extent_horizontal_blocks = extent_horizontal_blocks,
+      .extent_vertical_blocks = extent_vertical_blocks,
+      .aligned_row_pitch = aligned_row_pitch,
+      .num_rows_per_copy = num_rows_per_copy,
+   };
+
+   return r;
+}
+
 static struct radv_sdma_linear_info
 radv_sdma_get_linear_buf_info(const struct radv_buffer *const buffer, const struct radv_image *const image,
                               const VkBufferImageCopy2 *const region)
@@ -295,6 +342,14 @@ radv_sdma_get_tiled_img_info(const struct radv_device *const device, const struc
    return info;
 }
 
+static void
+radv_sdma_emit_nop(const struct radv_device *device, struct radeon_cmdbuf *cs)
+{
+   /* SDMA NOP acts as a fence command and causes the SDMA engine to wait for pending copy operations. */
+   radeon_check_space(device->ws, cs, 1);
+   radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_NOP, 0, 0));
+}
+
 void
 radv_sdma_copy_buffer(const struct radv_device *device, struct radeon_cmdbuf *cs, uint64_t src_va, uint64_t dst_va,
                       uint64_t size)
@@ -460,3 +515,110 @@ radv_sdma_copy_buffer_image(const struct radv_device *device, struct radeon_cmdb
       radv_sdma_emit_copy_tiled_sub_window(device, cs, &tiled, &buf_info, img_offset, zero_offset, extent, !to_image);
    }
 }
+
+bool
+radv_sdma_use_unaligned_buffer_image_copy(const struct radv_device *device, const struct radv_image *image,
+                                          const struct radv_buffer *buffer, const VkBufferImageCopy2 *region)
+{
+   const struct radeon_surf *const surf = &image->planes[0].surface;
+   const enum amd_gfx_level gfx_level = device->physical_device->rad_info.gfx_level;
+   const unsigned pitch_alignment = gfx_level >= GFX10 ? MAX2(1, 4 / surf->bpe) : 4;
+   const unsigned pitch = (region->bufferRowLength ? region->bufferRowLength : region->imageExtent.width);
+   const unsigned pitch_blocks = radv_sdma_pixels_to_blocks(pitch, surf->blk_w);
+
+   if (!radv_is_aligned(pitch_blocks, pitch_alignment))
+      return true;
+
+   const VkOffset3D off = radv_sdma_get_img_offset(image, region->imageSubresource, region->imageOffset);
+   const VkExtent3D ext = radv_sdma_get_copy_extent(image, region->imageSubresource, region->imageExtent);
+   const bool uses_depth = off.z != 0 || ext.depth != 1;
+   if (!surf->is_linear && uses_depth) {
+      const unsigned slice_pitch =
+         (region->bufferImageHeight ? region->bufferImageHeight : region->imageExtent.height) * pitch;
+      const unsigned slice_pitch_blocks = radv_sdma_pixel_area_to_blocks(slice_pitch, surf->blk_w, surf->blk_h);
+
+      if (!radv_is_aligned(slice_pitch_blocks, 4))
+         return true;
+   }
+
+   return false;
+}
+
+void
+radv_sdma_copy_buffer_image_unaligned(const struct radv_device *device, struct radeon_cmdbuf *cs,
+                                      struct radv_image *image, struct radv_buffer *buffer,
+                                      const VkBufferImageCopy2 *region, struct radeon_winsys_bo *temp_bo,
+                                      bool to_image)
+{
+   const bool is_linear = image->planes[0].surface.is_linear;
+   const VkOffset3D base_offset = radv_sdma_get_img_offset(image, region->imageSubresource, region->imageOffset);
+   const VkExtent3D base_extent = radv_sdma_get_copy_extent(image, region->imageSubresource, region->imageExtent);
+   const struct radv_sdma_chunked_copy_info info = radv_sdma_get_chunked_copy_info(device, image, base_extent);
+   const struct radv_sdma_linear_info buf = radv_sdma_get_linear_buf_info(buffer, image, region);
+   const struct radv_sdma_linear_info linear = radv_sdma_get_linear_img_info(image, region->imageSubresource);
+   const struct radv_sdma_tiled_info tiled = radv_sdma_get_tiled_img_info(device, image, region->imageSubresource);
+
+   struct radv_sdma_linear_info tmp = {
+      .va = temp_bo->va,
+      .bpp = info.bpp,
+      .blk_w = info.blk_w,
+      .blk_h = info.blk_h,
+      .pitch = info.aligned_row_pitch * info.blk_w,
+      .slice_pitch = info.aligned_row_pitch * info.blk_w * info.extent_vertical_blocks * info.blk_h,
+   };
+
+   const VkOffset3D zero_offset = {0};
+   VkExtent3D extent = base_extent;
+   VkOffset3D offset = base_offset;
+   const unsigned buf_pitch_blocks = DIV_ROUND_UP(buf.pitch, info.blk_w);
+   const unsigned buf_slice_pitch_blocks = DIV_ROUND_UP(DIV_ROUND_UP(buf.slice_pitch, info.blk_w), info.blk_h);
+   assert(buf_pitch_blocks);
+   assert(buf_slice_pitch_blocks);
+   extent.depth = 1;
+
+   for (unsigned slice = 0; slice < base_extent.depth; ++slice) {
+      for (unsigned row = 0; row < info.extent_vertical_blocks; row += info.num_rows_per_copy) {
+         const unsigned rows = MIN2(info.extent_vertical_blocks - row, info.num_rows_per_copy);
+
+         offset.y = base_offset.y + row * info.blk_h;
+         offset.z = base_offset.z + slice;
+         extent.height = rows * info.blk_h;
+         tmp.slice_pitch = tmp.pitch * rows * info.blk_h;
+
+         if (!to_image) {
+            /* Copy the rows from the source image to the temporary buffer. */
+            if (is_linear)
+               radv_sdma_emit_copy_linear_sub_window(device, cs, &linear, &tmp, offset, zero_offset, extent);
+            else
+               radv_sdma_emit_copy_tiled_sub_window(device, cs, &tiled, &tmp, offset, zero_offset, extent, true);
+
+            /* Wait for the copy to finish. */
+            radv_sdma_emit_nop(device, cs);
+         }
+
+         /* buffer to image: copy each row from source buffer to temporary buffer.
+          * image to buffer: copy each row from temporary buffer to destination buffer.
+          */
+         for (unsigned r = 0; r < rows; ++r) {
+            const uint64_t buf_va =
+               buf.va + slice * buf_slice_pitch_blocks * info.bpp + (row + r) * buf_pitch_blocks * info.bpp;
+            const uint64_t tmp_va = tmp.va + r * info.aligned_row_pitch * info.bpp;
+            radv_sdma_copy_buffer(device, cs, to_image ? buf_va : tmp_va, to_image ? tmp_va : buf_va,
+                                  info.extent_horizontal_blocks * info.bpp);
+         }
+
+         /* Wait for the copy to finish. */
+         radv_sdma_emit_nop(device, cs);
+
+         if (to_image) {
+            /* Copy the rows from the temporary buffer to the destination image. */
+            if (is_linear)
+               radv_sdma_emit_copy_linear_sub_window(device, cs, &tmp, &linear, zero_offset, offset, extent);
+            else
+               radv_sdma_emit_copy_tiled_sub_window(device, cs, &tiled, &tmp, offset, zero_offset, extent, false);
+
+            /* Wait for the copy to finish. */
+            radv_sdma_emit_nop(device, cs);
+         }
+      }
+   }
+}
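For intuition about when the chunked path above is taken, here is a minimal
standalone sketch of the GFX10+ pitch rule from
radv_sdma_use_unaligned_buffer_image_copy. It is a simplified model assuming a
non-block-compressed format (blk_w == 1) and omitting the extra slice-pitch
check for tiled 3D copies; needs_unaligned_workaround is a hypothetical helper
name, not part of the patch.

#include <stdbool.h>
#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))

/* Hypothetical helper: does a buffer/image copy need the chunked
 * workaround on GFX10+? Mirrors the pitch rule from the patch for
 * formats with blk_w == 1. On older chips the alignment is 4 blocks
 * regardless of the element size. */
static bool needs_unaligned_workaround(unsigned buffer_row_length,
                                       unsigned image_width, unsigned bpe)
{
   /* Vulkan: bufferRowLength == 0 means rows are tightly packed. */
   const unsigned pitch = buffer_row_length ? buffer_row_length : image_width;
   const unsigned pitch_alignment = MAX2(1, 4 / bpe);
   return pitch % pitch_alignment != 0;
}

int main(void)
{
   /* R8 (bpe = 1) needs a 4-texel-aligned pitch: a 100-wide copy is
    * fine, but a 102-wide copy takes the chunked path. */
   printf("%d\n", needs_unaligned_workaround(0, 100, 1)); /* 0 */
   printf("%d\n", needs_unaligned_workaround(0, 102, 1)); /* 1 */
   /* RGBA8 (bpe = 4): the pitch alignment is 1 texel, never misaligned. */
   printf("%d\n", needs_unaligned_workaround(0, 103, 4)); /* 0 */
   return 0;
}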