When a client causes a GPU hang (or experiences issues due to a hang in another client), we want to let it know as soon as possible. In particular, if it submits work with a fence and calls vkWaitForFences or vkQueueWaitIdle and that call returns VK_SUCCESS, then the client should be able to trust the results of that rendering. In order to provide this guarantee, we have to ask the kernel for context status in a few key locations. --- src/intel/vulkan/anv_device.c | 78 ++++++++++++++++++++++++++---------------- src/intel/vulkan/anv_gem.c | 18 ++++++++++ src/intel/vulkan/anv_private.h | 3 ++ src/intel/vulkan/genX_query.c | 11 ++---- 4 files changed, 72 insertions(+), 38 deletions(-)
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 5f0d00f..33d1984 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -884,8 +884,6 @@ anv_device_submit_simple_batch(struct anv_device *device, struct anv_bo bo, *exec_bos[1]; VkResult result = VK_SUCCESS; uint32_t size; - int64_t timeout; - int ret; /* Kernel driver requires 8 byte aligned batch length */ size = align_u32(batch->next - batch->start, 8); @@ -925,14 +923,7 @@ anv_device_submit_simple_batch(struct anv_device *device, if (result != VK_SUCCESS) goto fail; - timeout = INT64_MAX; - ret = anv_gem_wait(device, bo.gem_handle, &timeout); - if (ret != 0) { - /* We don't know the real error. */ - device->lost = true; - result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m"); - goto fail; - } + result = anv_device_wait(device, &bo, INT64_MAX); fail: anv_bo_pool_free(&device->batch_bo_pool, &bo); @@ -1264,6 +1255,28 @@ anv_device_execbuf(struct anv_device *device, return VK_SUCCESS; } +VkResult +anv_device_wait(struct anv_device *device, struct anv_bo *bo, + int64_t timeout) +{ + int ret = anv_gem_wait(device, bo->gem_handle, &timeout); + if (ret == -1 && errno == ETIME) { + return VK_TIMEOUT; + } else if (ret == -1) { + /* We don't know the real error. 
*/ + device->lost = true; + return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m"); + } + + if (anv_gem_gpu_has_reset(device)) { + device->lost = true; + return vk_errorf(VK_ERROR_DEVICE_LOST, + "GPU has hung with commands in-flight"); + } + + return VK_SUCCESS; +} + VkResult anv_QueueSubmit( VkQueue _queue, uint32_t submitCount, @@ -1273,8 +1286,13 @@ VkResult anv_QueueSubmit( ANV_FROM_HANDLE(anv_queue, queue, _queue); ANV_FROM_HANDLE(anv_fence, fence, _fence); struct anv_device *device = queue->device; - if (unlikely(device->lost)) + + if (unlikely(device->lost)) { return VK_ERROR_DEVICE_LOST; + } else if (anv_gem_gpu_has_reset(device)) { + device->lost = true; + return vk_error(VK_ERROR_DEVICE_LOST); + } VkResult result = VK_SUCCESS; @@ -1802,9 +1820,6 @@ VkResult anv_GetFenceStatus( if (unlikely(device->lost)) return VK_ERROR_DEVICE_LOST; - int64_t t = 0; - int ret; - switch (fence->state) { case ANV_FENCE_STATE_RESET: /* If it hasn't even been sent off to the GPU yet, it's not ready */ @@ -1814,15 +1829,18 @@ VkResult anv_GetFenceStatus( /* It's been signaled, return success */ return VK_SUCCESS; - case ANV_FENCE_STATE_SUBMITTED: - /* It's been submitted to the GPU but we don't know if it's done yet. */ - ret = anv_gem_wait(device, fence->bo.gem_handle, &t); - if (ret == 0) { + case ANV_FENCE_STATE_SUBMITTED: { + VkResult result = anv_device_wait(device, &fence->bo, 0); + switch (result) { + case VK_SUCCESS: fence->state = ANV_FENCE_STATE_SIGNALED; return VK_SUCCESS; - } else { + case VK_TIMEOUT: return VK_NOT_READY; + default: + return result; } + } default: unreachable("Invalid fence status"); } @@ -1884,20 +1902,20 @@ VkResult anv_WaitForFences( /* These are the fences we really care about. Go ahead and wait * on it until we hit a timeout. */ - ret = anv_gem_wait(device, fence->bo.gem_handle, &timeout); - if (ret == -1 && errno == ETIME) { - result = VK_TIMEOUT; - goto done; - } else if (ret == -1) { - /* We don't know the real error. 
*/ - device->lost = true; - return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m"); - } else { + result = anv_device_wait(device, &fence->bo, 0); + switch (result) { + case VK_SUCCESS: fence->state = ANV_FENCE_STATE_SIGNALED; signaled_fences = true; if (!waitAll) - return VK_SUCCESS; - continue; + goto done; + break; + + case VK_TIMEOUT: + goto done; + + default: + return result; } } } diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c index 0dde6d9..7d4b638 100644 --- a/src/intel/vulkan/anv_gem.c +++ b/src/intel/vulkan/anv_gem.c @@ -301,6 +301,24 @@ anv_gem_get_aperture(int fd, uint64_t *size) return 0; } +bool +anv_gem_gpu_has_reset(struct anv_device *device) +{ + struct drm_i915_reset_stats stats = { + .ctx_id = device->context_id, + }; + + int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats); + if (ret == -1) { + /* This really shouldn't be possible but the impossible should probably + * be treated as a GPU hang anyway. + */ + return true; + } + + return stats.batch_active > 0 || stats.batch_pending > 0; +} + int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle) { diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 27c887c..f0a2b8d 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -637,6 +637,8 @@ void anv_device_finish_blorp(struct anv_device *device); VkResult anv_device_execbuf(struct anv_device *device, struct drm_i915_gem_execbuffer2 *execbuf, struct anv_bo **execbuf_bos); +VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo, + int64_t timeout); void* anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags); @@ -654,6 +656,7 @@ int anv_gem_destroy_context(struct anv_device *device, int context); int anv_gem_get_param(int fd, uint32_t param); bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling); int anv_gem_get_aperture(int fd, uint64_t *size); +bool 
anv_gem_gpu_has_reset(struct anv_device *device); int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle); uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd); int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching); diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 3610665..7ea9404 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -143,8 +143,6 @@ VkResult genX(GetQueryPoolResults)( { ANV_FROM_HANDLE(anv_device, device, _device); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); - int64_t timeout = INT64_MAX; - int ret; assert(pool->type == VK_QUERY_TYPE_OCCLUSION || pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || @@ -157,12 +155,9 @@ VkResult genX(GetQueryPoolResults)( return VK_SUCCESS; if (flags & VK_QUERY_RESULT_WAIT_BIT) { - ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout); - if (ret == -1) { - /* We don't know the real error. */ - return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, - "gem_wait failed %m"); - } + VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX); + if (result != VK_SUCCESS) + return result; } void *data_end = pData + dataSize; -- 2.5.0.400.gff86faf _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev