Passing the index of the target buffer via the reloc.target_handle is marginally more efficient for the kernel (it can avoid some allocations, and can use a direct lookup rather than a hash or search). It is also useful for ourselves as we can use the index into our exec_bos for other tasks.
v2: Only enable HANDLE_LUT if we can use BATCH_FIRST and thereby avoid a post-processing loop to fix up the relocations. v3: Move kernel probing from context creation to screen init. Use batch->use_exec_lut as it is more descriptive of what's going on (Daniel) v4: kernel_features already exists; use it for BATCH_FIRST Rename locals to preserve current flavouring v5: Squash in "always insert batch bo first" Signed-off-by: Chris Wilson <[email protected]> Cc: Kenneth Graunke <[email protected]> Cc: Matt Turner <[email protected]> Cc: Jason Ekstrand <[email protected]> Cc: Daniel Vetter <[email protected]> Reviewed-by: Kenneth Graunke <[email protected]> #v4 --- src/mesa/drivers/dri/i965/brw_context.h | 1 + src/mesa/drivers/dri/i965/intel_batchbuffer.c | 109 ++++++++++++++------------ src/mesa/drivers/dri/i965/intel_screen.c | 4 + src/mesa/drivers/dri/i965/intel_screen.h | 1 + 4 files changed, 65 insertions(+), 50 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 2acebaa820..57081fb434 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -452,6 +452,7 @@ struct intel_batchbuffer { uint32_t state_batch_offset; enum brw_gpu_ring ring; + bool use_exec_lut; bool needs_sol_reset; bool state_base_address_emitted; diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index 5e861a555a..16791de3de 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -62,8 +62,6 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch, struct brw_bufmgr *bufmgr, bool has_llc) { - intel_batchbuffer_reset(batch, bufmgr, has_llc); - if (!has_llc) { batch->cpu_map = malloc(BATCH_SZ); batch->map = batch->cpu_map; @@ -85,6 +83,17 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch, batch->state_batch_sizes = _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare); } + 
+ struct brw_context *brw = container_of(batch, brw, batch); + /* To use the LUT method for execbuf, we also require placing the batch + * first (to simplify our implementation). We require a kernel recent + * enough to always support EXEC_LUT_HANDLE, but we must check that + * the kernel supports EXEC_BATCH_FIRST. + */ + batch->use_exec_lut = + brw->screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST; + + intel_batchbuffer_reset(batch, bufmgr, has_llc); } #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x)) @@ -92,19 +101,15 @@ intel_batchbuffer_init(struct intel_batchbuffer *batch, static unsigned int add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo) { - if (bo != batch->bo) { - unsigned int index = READ_ONCE(bo->index); - - if (index < batch->exec_count && batch->exec_bos[index] == bo) - return index; + unsigned int index = READ_ONCE(bo->index); - /* May have been shared between multiple active batches */ - for (index = 0; index < batch->exec_count; index++) { - if (batch->exec_bos[index] == bo) - return index; - } + if (index < batch->exec_count && batch->exec_bos[index] == bo) + return index; - brw_bo_reference(bo); + /* May have been shared between multiple active batches */ + for (index = 0; index < batch->exec_count; index++) { + if (batch->exec_bos[index] == bo) + return index; } if (batch->exec_count == batch->exec_array_size) { @@ -117,26 +122,21 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo) batch->exec_array_size * sizeof(batch->validation_list[0])); } - struct drm_i915_gem_exec_object2 *validation_entry = - &batch->validation_list[batch->exec_count]; - validation_entry->handle = bo->gem_handle; - if (bo == batch->bo) { - validation_entry->relocation_count = batch->reloc_count; - validation_entry->relocs_ptr = (uintptr_t) batch->relocs; - } else { - validation_entry->relocation_count = 0; - validation_entry->relocs_ptr = 0; - } - validation_entry->alignment = bo->align; - validation_entry->offset = bo->offset64; 
- validation_entry->flags = bo->kflags; - validation_entry->rsvd1 = 0; - validation_entry->rsvd2 = 0; + batch->validation_list[batch->exec_count] = + (struct drm_i915_gem_exec_object2) { + .handle = bo->gem_handle, + .alignment = bo->align, + .offset = bo->offset64, + .flags = bo->kflags, + }; bo->index = batch->exec_count; batch->exec_bos[batch->exec_count] = bo; batch->aperture_space += bo->size; + if (bo != batch->bo) + brw_bo_reference(bo); + return batch->exec_count++; } @@ -157,6 +157,9 @@ intel_batchbuffer_reset(struct intel_batchbuffer *batch, } batch->map_next = batch->map; + add_exec_bo(batch, batch->bo); + assert(batch->bo->index == 0); + batch->reserved_space = BATCH_RESERVED; batch->state_batch_offset = batch->bo->size; batch->needs_sol_reset = false; @@ -581,9 +584,6 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd) } if (ret == 0) { - /* Add the batch itself to the end of the validation list */ - add_exec_bo(batch, batch->bo); - struct drm_i915_gem_execbuffer2 execbuf = { .buffers_ptr = (uintptr_t) batch->validation_list, .buffer_count = batch->exec_count, @@ -614,6 +614,22 @@ do_flush_locked(struct brw_context *brw, int in_fence_fd, int *out_fence_fd) if (batch->needs_sol_reset) execbuf.flags |= I915_EXEC_GEN7_SOL_RESET; + struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0]; + assert(entry->handle == batch->bo->gem_handle); + entry->relocation_count = batch->reloc_count; + entry->relocs_ptr = (uintptr_t) batch->relocs; + + if (batch->use_exec_lut) { + execbuf.flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT; + } else { + struct drm_i915_gem_exec_object2 tmp; + unsigned int index = batch->exec_count - 1; + + tmp = *entry; + *entry = batch->validation_list[index]; + batch->validation_list[index] = tmp; + } + unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2; if (in_fence_fd != -1) { @@ -777,40 +793,33 @@ brw_emit_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset, assert(batch_offset <= 
BATCH_SZ - sizeof(uint32_t)); assert(_mesa_bitcount(write_domain) <= 1); - uint64_t offset64; - if (target != batch->bo) { - unsigned int index = add_exec_bo(batch, target); - struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index]; + unsigned int index = add_exec_bo(batch, target); + struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index]; - if (write_domain) { - entry->flags |= EXEC_OBJECT_WRITE; + if (write_domain) { + entry->flags |= EXEC_OBJECT_WRITE; - /* PIPECONTROL needs a w/a on gen6 */ - if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) { - struct brw_context *brw = container_of(batch, brw, batch); - if (brw->gen == 6) - entry->flags |= EXEC_OBJECT_NEEDS_GTT; - } + /* PIPECONTROL needs a w/a on gen6 */ + if (write_domain == I915_GEM_DOMAIN_INSTRUCTION) { + struct brw_context *brw = container_of(batch, brw, batch); + if (brw->gen == 6) + entry->flags |= EXEC_OBJECT_NEEDS_GTT; } - - offset64 = entry->offset; - } else { - offset64 = target->offset64; } batch->relocs[batch->reloc_count++] = (struct drm_i915_gem_relocation_entry) { .offset = batch_offset, .delta = target_offset, - .target_handle = target->gem_handle, - .presumed_offset = offset64, + .target_handle = batch->use_exec_lut ? 
index : target->gem_handle, + .presumed_offset = entry->offset, }; /* Using the old buffer offset, write in what the right data would be, in * case the buffer doesn't move and we can short-circuit the relocation * processing in the kernel */ - return offset64 + target_offset; + return target_offset + entry->offset; } void diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index 44ea6a4562..109ba79b67 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -2278,6 +2278,10 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen) screen->kernel_features |= KERNEL_ALLOWS_EXEC_CAPTURE; } + if (intel_get_boolean(screen, I915_PARAM_HAS_EXEC_BATCH_FIRST)) { + screen->kernel_features |= KERNEL_ALLOWS_EXEC_BATCH_FIRST; + } + if (!intel_detect_pipelined_so(screen)) { /* We can't do anything, so the effective version is 0. */ screen->cmd_parser_version = 0; diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h index 0980c8f561..577058dc15 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.h +++ b/src/mesa/drivers/dri/i965/intel_screen.h @@ -80,6 +80,7 @@ struct intel_screen #define KERNEL_ALLOWS_HSW_SCRATCH1_AND_ROW_CHICKEN3 (1<<3) #define KERNEL_ALLOWS_COMPUTE_DISPATCH (1<<4) #define KERNEL_ALLOWS_EXEC_CAPTURE (1<<5) +#define KERNEL_ALLOWS_EXEC_BATCH_FIRST (1<<6) struct brw_bufmgr *bufmgr; -- 2.13.3 _______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
