On 07/11/2018 06:52 PM, Dylan Baker wrote:
Quoting Samuel Pitoiset (2018-07-09 09:02:58)
This might fix some synchronization issues. I don't know if
that will affect performance but it's required for correctness.

v3: - wait for CP DMA in CmdPipelineBarrier()
     - clear the busy value when CP_DMA_SYNC is requested
v2: - wait for CP DMA in CmdWaitEvents()
     - track if CP DMA is used

CC: <mesa-sta...@lists.freedesktop.org>
Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
---
  src/amd/vulkan/radv_cmd_buffer.c | 15 +++++++++++++
  src/amd/vulkan/radv_private.h    |  5 +++++
  src/amd/vulkan/si_cmd_buffer.c   | 36 ++++++++++++++++++++++++++++----
  3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 9da42fe03e..5dbdb3d996 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2596,6 +2596,11 @@ VkResult radv_EndCommandBuffer(
                 si_emit_cache_flush(cmd_buffer);
         }
+ /* Make sure CP DMA is idle at the end of IBs because the kernel
+        * doesn't wait for it.
+        */
+       si_cp_dma_wait_for_idle(cmd_buffer);
+
         vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
@@ -4242,6 +4247,11 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer,
                                              0);
         }
+ /* Make sure CP DMA is idle because the driver might have performed a
+        * DMA operation for copying or filling buffers/images.
+        */
+       si_cp_dma_wait_for_idle(cmd_buffer);
+
         cmd_buffer->state.flush_bits |= dst_flush_bits;
  }
@@ -4292,6 +4302,11 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
                 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
                 VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
+ /* Make sure CP DMA is idle because the driver might have performed a
+        * DMA operation for copying or filling buffers/images.
+        */
+       si_cp_dma_wait_for_idle(cmd_buffer);
+
         /* TODO: Emit EOS events for syncing PS/CS stages. */
if (!(stageMask & ~top_of_pipe_flags)) {
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4e4b3a6037..2400de49a2 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -979,6 +979,9 @@ struct radv_cmd_state {
         uint32_t last_num_instances;
         uint32_t last_first_instance;
         uint32_t last_vertex_offset;
+
+       /* Whether CP DMA is busy/idle. */
+       bool dma_is_busy;
  };
struct radv_cmd_pool {
@@ -1091,6 +1094,8 @@ void si_cp_dma_prefetch(struct radv_cmd_buffer 
*cmd_buffer, uint64_t va,
                          unsigned size);
  void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
                             uint64_t size, unsigned value);
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
+
  void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
  bool
  radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 454fd8c39c..6d566a918d 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -1040,7 +1040,6 @@ static void si_emit_cp_dma(struct radv_cmd_buffer 
*cmd_buffer,
         struct radeon_cmdbuf *cs = cmd_buffer->cs;
         uint32_t header = 0, command = 0;
- assert(size);
         assert(size <= cp_dma_max_byte_count(cmd_buffer));
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
@@ -1099,9 +1098,14 @@ static void si_emit_cp_dma(struct radv_cmd_buffer 
*cmd_buffer,
          * indices. If we wanted to execute CP DMA in PFP, this packet
          * should precede it.
          */
-       if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == 
RADV_QUEUE_GENERAL) {
-               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 
cmd_buffer->state.predicating));
-               radeon_emit(cs, 0);
+       if (flags & CP_DMA_SYNC) {
+               if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+                       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 
cmd_buffer->state.predicating));
+                       radeon_emit(cs, 0);
+               }
+
+               /* CP will see the sync flag and wait for all DMAs to complete. 
*/
+               cmd_buffer->state.dma_is_busy = false;
         }
if (unlikely(cmd_buffer->device->trace_bo))
@@ -1165,6 +1169,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer 
*cmd_buffer,
         uint64_t main_src_va, main_dest_va;
         uint64_t skipped_size = 0, realign_size = 0;
+ /* Assume that we are not going to sync after the last DMA operation. */
+       cmd_buffer->state.dma_is_busy = true;
if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
             cmd_buffer->device->physical_device->rad_info.family == 
CHIP_STONEY) {
@@ -1228,6 +1234,9 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer 
*cmd_buffer, uint64_t va,
        assert(va % 4 == 0 && size % 4 == 0);

+       /* Assume that we are not going to sync after the last DMA operation. */
+       cmd_buffer->state.dma_is_busy = true;
+
         while (size) {
                 unsigned byte_count = MIN2(size, 
cp_dma_max_byte_count(cmd_buffer));
                 unsigned dma_flags = CP_DMA_CLEAR;
@@ -1243,6 +1252,25 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer 
*cmd_buffer, uint64_t va,
         }
  }
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
+{
+       if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
+               return;
+
+       if (!cmd_buffer->state.dma_is_busy)
+               return;
+
+       /* Issue a dummy DMA that copies zero bytes.
+        *
+        * The DMA engine will see that there's no work to do and skip this
+        * DMA request, however, the CP will see the sync flag and still wait
+        * for all DMAs to complete.
+        */
+       si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
+
+       cmd_buffer->state.dma_is_busy = false;
+}
+
  /* For MSAA sample positions. */
  #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
         (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |                  \

Hi Samuel,

I'm trying to get this to apply to 18.1 and having a heck of a time; git can't
figure out how to apply this because of 8339ba827bf3f18463824206347665a45989b99e
(radv: optimize vkCmd{Set,Reset}Event() a little bit), which doesn't seem
suitable for stable and has other dependencies. I'm not confident in my ability
to rebase this. If you'd still like this in 18.1, can you provide a patch that
applies to the staging/18.1 branch in the main gitlab repo?

Hi Dylan,

Backport for 18.1 sent to mesa-stable. Thanks!


Thanks,
Dylan

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to