From: Kristian Høgsberg Kristensen <kristian.h.kristen...@intel.com>
This reduces the amount of stalling that the kernel does between batches and improves the performance of Dota 2 on a Sky Lake GT2 desktop by around 30%. v2 (Jason Ekstrand): - Use canonical form addresses on gen8+ (Chris Wilson) - Provide a better correctness proof (Chris Wilson) Signed-off-by: Jason Ekstrand <ja...@jlekstrand.net> --- src/intel/vulkan/anv_device.c | 112 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index baa767e..71ba8d8 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1068,6 +1068,105 @@ void anv_GetDeviceQueue( *pQueue = anv_queue_to_handle(&device->queue); } +static void +write_reloc(const struct anv_device *device, void *p, uint64_t v) +{ + unsigned reloc_size = 0; + if (device->info.gen >= 8) { + /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress: + * + * "This field specifies the address of the memory location where the + * register value specified in the DWord above will read from. The + * address specifies the DWord location of the data. Range = + * GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress + * [63:48] are ignored by the HW and assumed to be in correct + * canonical form [63:48] == [47]." 
+ */ + reloc_size = sizeof(uint64_t); + *(uint64_t *)p = (uint64_t)((int64_t)(v << 16) >> 16); /* sign-extend bit 47 into [63:48]; the left shift is done unsigned to avoid signed-overflow UB */ + } else { + reloc_size = sizeof(uint32_t); + *(uint32_t *)p = v; + } + + if (!device->info.has_llc) + anv_clflush_range(p, reloc_size); +} + +static void +anv_reloc_list_apply(struct anv_reloc_list *list, + struct anv_device *device, struct anv_bo *bo) +{ + for (size_t i = 0; i < list->num_relocs; i++) { + void *p = bo->map + list->relocs[i].offset; + + struct anv_bo *target_bo = list->reloc_bos[i]; + write_reloc(device, p, target_bo->offset + list->relocs[i].delta); + list->relocs[i].presumed_offset = target_bo->offset; /* record the target's address that was just written, not the address of the BO holding the relocation */ + } +} + +/** + * This function applies the relocation for a command buffer and writes the + * actual addresses into the buffers as per what we were told by the kernel on + * the previous execbuf2 call. This should be safe to do because, for each + * relocated address, we have two cases: + * + * 1) The target BO is inactive (as seen by the kernel). In this case, it is + * not in use by the GPU so updating the address is 100% ok. It won't be + * in-use by the GPU (from our context) again until the next execbuf2 + * happens. If the kernel decides to move it in the next execbuf2, it + * will have to do the relocations itself, but that's ok because it should + * have all of the information needed to do so. + * + * 2) The target BO is active (as seen by the kernel). In this case, it + * hasn't moved since the last execbuffer2 call because GTT shuffling + * *only* happens inside the execbuffer2 ioctl. Since the target BO + * hasn't moved, our anv_bo::offset exactly matches the BO's GTT address + * and the relocated value we are writing into the BO will be the same as + * the value that is already there. + * + * There is also a possibility that the target BO is active but the exact + * RENDER_SURFACE_STATE object we are writing the relocation into isn't in + * use. 
In this case, the address currently in the RENDER_SURFACE_STATE + * may be stale but it's still safe to write the relocation because that + * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and + * won't be until the next execbuf2 call. + * + * By doing relocations on the CPU, we can tell the kernel that it doesn't + * need to bother. We want to do this because the surface state buffer is + * used by every command buffer so, if the kernel does the relocations, it + * will always be busy and the kernel will always stall. This is also + * probably the fastest mechanism for doing relocations since the kernel would + * have to make a full copy of all the relocation lists. + */ +static void +relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer) +{ + for (uint32_t i = 0; i < cmd_buffer->execbuf2.bo_count; i++) { + if (cmd_buffer->execbuf2.bos[i]->offset == (uint64_t)-1) + return; /* returns before I915_EXEC_NO_RELOC is set below; NOTE(review): (uint64_t)-1 presumably marks a BO with no known address yet - confirm */ + } + + anv_reloc_list_apply(&cmd_buffer->surface_relocs, + cmd_buffer->device, + &cmd_buffer->device->surface_state_block_pool.bo); + + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + anv_reloc_list_apply(&(*bbo)->relocs, + cmd_buffer->device, &(*bbo)->bo); + } + + for (uint32_t i = 0; i < cmd_buffer->execbuf2.bo_count; i++) { + struct anv_bo *bo = cmd_buffer->execbuf2.bos[i]; + + cmd_buffer->execbuf2.objects[i].offset = bo->offset; + } + + cmd_buffer->execbuf2.execbuf.flags |= I915_EXEC_NO_RELOC; +} + VkResult anv_device_execbuf(struct anv_device *device, struct drm_i915_gem_execbuffer2 *execbuf, @@ -1097,16 +1196,20 @@ VkResult anv_QueueSubmit( struct anv_device *device = queue->device; VkResult result = VK_SUCCESS; + pthread_mutex_lock(&device->mutex); + for (uint32_t i = 0; i < submitCount; i++) { for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, pSubmits[i].pCommandBuffers[j]); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); + relocate_cmd_buffer(cmd_buffer); + result = 
anv_device_execbuf(device, &cmd_buffer->execbuf2.execbuf, cmd_buffer->execbuf2.bos); if (result != VK_SUCCESS) - return result; + goto out; } } @@ -1114,10 +1217,13 @@ VkResult anv_QueueSubmit( struct anv_bo *fence_bo = &fence->bo; result = anv_device_execbuf(device, &fence->execbuf, &fence_bo); if (result != VK_SUCCESS) - return result; + goto out; } - return VK_SUCCESS; +out: + pthread_mutex_unlock(&device->mutex); + + return result; } VkResult anv_QueueWaitIdle( -- 2.5.0.400.gff86faf _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev