Nice, this seems to halve the CPU cost of submitting 100 command buffers in a single submit when the simultaneous use flag is set.
Reviewed-by: Bas Nieuwenhuizen <b...@basnieuwenhuizen.nl> for the series.

On Thu, Nov 15, 2018 at 11:27 AM Samuel Pitoiset <samuel.pitoi...@gmail.com> wrote:
>
> The chained submission is the fastest path and it should now
> be used more often than before. This removes some EOP events.
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoi...@gmail.com>
> ---
>  src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 103 ++++++++++--------
>  1 file changed, 55 insertions(+), 48 deletions(-)
>
> diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> index abc4f3903d..f2d07a54db 100644
> --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
> @@ -865,66 +865,73 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
>  	struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
>  	struct radv_amdgpu_fence *fence = (struct radv_amdgpu_fence *)_fence;
>  	amdgpu_bo_list_handle bo_list;
> -	struct amdgpu_cs_request request;
> -	bool emit_signal_sem = sem_info->cs_emit_signal;
> +	struct amdgpu_cs_request request = {};
> +	struct amdgpu_cs_ib_info *ibs;
> +	struct radv_amdgpu_cs *cs0;
> +	unsigned number_of_ibs;
> +
>  	assert(cs_count);
> +	cs0 = radv_amdgpu_cs(cs_array[0]);
>
> -	for (unsigned i = 0; i < cs_count;) {
> -		struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[i]);
> -		struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT];
> -		struct radeon_cmdbuf *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
> -		unsigned cnt = MIN2(AMDGPU_CS_MAX_IBS_PER_SUBMIT - !!preamble_cs,
> -				    cs_count - i);
> +	/* Compute the number of IBs for this submit. */
> +	number_of_ibs = cs_count + !!initial_preamble_cs;
>
> -		memset(&request, 0, sizeof(request));
> +	/* Create a buffer object list. */
> +	r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[0], cs_count, NULL, 0,
> +				       initial_preamble_cs, radv_bo_list,
> +				       &bo_list);
> +	if (r) {
> +		fprintf(stderr, "amdgpu: buffer list creation failed "
> +				"for the fallback submission (%d)\n", r);
> +		return r;
> +	}
>
> -		r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL, 0,
> -					       preamble_cs, radv_bo_list, &bo_list);
> -		if (r) {
> -			fprintf(stderr, "amdgpu: buffer list creation failed "
> -					"for the fallback submission (%d)\n", r);
> -			return r;
> -		}
> +	ibs = malloc(number_of_ibs * sizeof(*ibs));
> +	if (!ibs) {
> +		if (bo_list)
> +			amdgpu_bo_list_destroy(bo_list);
> +		return -ENOMEM;
> +	}
>
> -		request.ip_type = cs0->hw_ip;
> -		request.ring = queue_idx;
> -		request.resources = bo_list;
> -		request.number_of_ibs = cnt + !!preamble_cs;
> -		request.ibs = ibs;
> -		request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
> +	/* Configure the CS request. */
> +	if (initial_preamble_cs)
> +		ibs[0] = radv_amdgpu_cs(initial_preamble_cs)->ib;
>
> -		if (preamble_cs) {
> -			ibs[0] = radv_amdgpu_cs(preamble_cs)->ib;
> -		}
> +	for (unsigned i = 0; i < cs_count; i++) {
> +		struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
>
> -		for (unsigned j = 0; j < cnt; ++j) {
> -			struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i + j]);
> -			ibs[j + !!preamble_cs] = cs->ib;
> +		ibs[i + !!initial_preamble_cs] = cs->ib;
>
> -			if (cs->is_chained) {
> -				*cs->ib_size_ptr -= 4;
> -				cs->is_chained = false;
> -			}
> +		if (cs->is_chained) {
> +			*cs->ib_size_ptr -= 4;
> +			cs->is_chained = false;
>  		}
> +	}
>
> -		sem_info->cs_emit_signal = (i == cs_count - cnt) ? emit_signal_sem : false;
> -		r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
> -		if (r) {
> -			if (r == -ENOMEM)
> -				fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
> -			else
> -				fprintf(stderr, "amdgpu: The CS has been rejected, "
> -						"see dmesg for more information.\n");
> -		}
> +	request.ip_type = cs0->hw_ip;
> +	request.ring = queue_idx;
> +	request.resources = bo_list;
> +	request.number_of_ibs = number_of_ibs;
> +	request.ibs = ibs;
> +	request.fence_info = radv_set_cs_fence(ctx, cs0->hw_ip, queue_idx);
>
> -		if (bo_list)
> -			amdgpu_bo_list_destroy(bo_list);
> +	/* Submit the CS. */
> +	r = radv_amdgpu_cs_submit(ctx, &request, sem_info);
> +	if (r) {
> +		if (r == -ENOMEM)
> +			fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
> +		else
> +			fprintf(stderr, "amdgpu: The CS has been rejected, "
> +					"see dmesg for more information.\n");
> +	}
>
> -		if (r)
> -			return r;
> +	if (bo_list)
> +		amdgpu_bo_list_destroy(bo_list);
> +	free(ibs);
> +
> +	if (r)
> +		return r;
>
> -		i += cnt;
> -	}
>  	if (fence)
>  		radv_amdgpu_request_to_fence(ctx, fence, &request);
>
> @@ -1131,7 +1138,7 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
>  	if (!cs->ws->use_ib_bos) {
>  		ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, bo_list, cs_array,
>  							   cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
> -	} else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && cs->ws->batchchain) {
> +	} else if (can_patch && cs->ws->batchchain) {
>  		ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, bo_list, cs_array,
>  							    cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
>  	} else {
> --
> 2.19.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev