Module: Mesa
Branch: main
Commit: 2f25d16653608f69b9dff39678b65dcc67ebed00
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2f25d16653608f69b9dff39678b65dcc67ebed00

Author: Emma Anholt <[email protected]>
Date:   Wed Feb  2 12:59:54 2022 -0800

turnip: Use the DRM or KGSL GPU reset status ioctls to report device loss.

ANGLE-on-venus-on-turnip and zink-on-turnip want real data here for EGL's
reset tests.

This required moving the remaining GPU-reset-causing tests from flakes or
xfails to skips.  Otherwise, the rest of the caselist associated with them
ends up being marked as fails as well.  The alternative would be to put
these tests in their own test groups with tests_per_group = 1, but that
didn't seem worth the effort.  Or, we could finally do something with
https://gitlab.freedesktop.org/anholt/deqp-runner/-/issues/14.

Fixes: #5955
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14839>

---

 src/freedreno/ci/freedreno-a618-fails.txt  |  6 ------
 src/freedreno/ci/freedreno-a618-flakes.txt |  7 -------
 src/freedreno/ci/freedreno-a618-skips.txt  | 13 ++++++++++++-
 src/freedreno/ci/freedreno-a630-fails.txt  |  3 ---
 src/freedreno/ci/freedreno-a630-flakes.txt |  7 -------
 src/freedreno/ci/freedreno-a630-skips.txt  |  9 +++++++++
 src/freedreno/vulkan/tu_device.c           |  1 +
 src/freedreno/vulkan/tu_drm.c              | 24 ++++++++++++++++++++++++
 src/freedreno/vulkan/tu_kgsl.c             | 27 +++++++++++++++++++++++++++
 src/freedreno/vulkan/tu_private.h          |  6 ++++++
 10 files changed, 79 insertions(+), 24 deletions(-)

diff --git a/src/freedreno/ci/freedreno-a618-fails.txt b/src/freedreno/ci/freedreno-a618-fails.txt
index d88caea8f84..707870d97a4 100644
--- a/src/freedreno/ci/freedreno-a618-fails.txt
+++ b/src/freedreno/ci/freedreno-a618-fails.txt
@@ -1,12 +1,6 @@
 # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505
 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail
 
-# CTS 1.3.1.0 uprev:
-dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap,Fail
-dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap,Fail
-
-spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail
-
 # Fails when TU_DEBUG=forcebin is set
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail
diff --git a/src/freedreno/ci/freedreno-a618-flakes.txt b/src/freedreno/ci/freedreno-a618-flakes.txt
index 3d0853fde5a..2c3087eb3a5 100644
--- a/src/freedreno/ci/freedreno-a618-flakes.txt
+++ b/src/freedreno/ci/freedreno-a618-flakes.txt
@@ -4,10 +4,3 @@
 
 dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_2.alpha_opaque
 dEQP-VK.pipeline.multisample.alpha_to_coverage_unused_attachment.samples_4.alpha_opaque
-
-# Could trip hangcheck timeout
-dEQP-VK.api.command_buffers.record_many_draws_primary_2
-dEQP-VK.api.command_buffers.record_many_draws_secondary_2
-
-# Sometimes hangchecks
-spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code
diff --git a/src/freedreno/ci/freedreno-a618-skips.txt b/src/freedreno/ci/freedreno-a618-skips.txt
index 2e625669aca..9e2cb4caafe 100644
--- a/src/freedreno/ci/freedreno-a618-skips.txt
+++ b/src/freedreno/ci/freedreno-a618-skips.txt
@@ -25,11 +25,22 @@ dEQP-VK.ubo.random.all_shared_buffer.48
 # Still running after 3 hours, time is spent in batch_draw_tracking().
 KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs
 
-# causes a hangcheck timeout on a630:
+# causes a hangcheck timeout on a618:
 # msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A618: hangcheck detected gpu lockup rb 0!
+#
+# even if they sometimes pass and could be categorized as flakes, we skip them
+# because device loss will end up failing the rest of the caselist.
+dEQP-VK.api.command_buffers.record_many_draws_primary_2
+dEQP-VK.api.command_buffers.record_many_draws_secondary_2
 dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
+spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies
 spill-dEQP-VK.graphicsfuzz.cov-nested-loop-undefined-smoothstep-never-executed
+spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-dead-code
 spill-dEQP-VK.graphicsfuzz.spv-stable-maze-O-memory-accesses
 
+# Hangs the GPU, fixed to be a skip in VK-GL-CTS 736eec57dc0c ("Fix checkSupport in compressed texture sampling tests")
+dEQP-VK.image.sample_texture.128_bit_compressed_format_cubemap
+dEQP-VK.image.sample_texture.64_bit_compressed_format_cubemap
+
 # Crashes in RA, but slow enough to get there that CI times out sometimes
 dEQP-VK.spirv_assembly.instruction.*.spirv_ids_abuse.lots_ids.*
diff --git a/src/freedreno/ci/freedreno-a630-fails.txt b/src/freedreno/ci/freedreno-a630-fails.txt
index bfdb31e70de..58080e09381 100644
--- a/src/freedreno/ci/freedreno-a630-fails.txt
+++ b/src/freedreno/ci/freedreno-a630-fails.txt
@@ -34,9 +34,6 @@ bypass-dEQP-GLES31.functional.blend_equation_advanced.msaa.softlight,Fail
 # https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/3505
 dEQP-VK.subgroups.multiple_dispatches.uniform_subgroup_size,Fail
 
-# Showed up with VK-GL-CTS 1.3.1.0:
-spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies,Fail
-
 # Fails when TU_DEBUG=forcebin is set
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_geom,Fail
 gmem-dEQP-VK.spirv_assembly.instruction.graphics.variable_pointers.graphics.writes_two_buffers_vert,Fail
diff --git a/src/freedreno/ci/freedreno-a630-flakes.txt b/src/freedreno/ci/freedreno-a630-flakes.txt
index 4e7f435402a..69395851673 100644
--- a/src/freedreno/ci/freedreno-a630-flakes.txt
+++ b/src/freedreno/ci/freedreno-a630-flakes.txt
@@ -91,13 +91,6 @@ dEQP-GLES31.functional.layout_binding.ssbo.fragment_binding_array
 dEQP-GLES3.functional.fbo.blit.conversion.rg8i_to_r16i
 dEQP-GLES3.functional.fbo.blit.conversion.rg8_to_r16f
 
-# Could trip hangcheck timeout
-dEQP-VK.api.command_buffers.record_many_draws_primary_2
-dEQP-VK.api.command_buffers.record_many_draws_secondary_2
-
-# Looks likely to be a hangcheck trigger.
-spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components
-
 # First noticed Jun 1 2020 on an innocent branch.
 KHR-GL33.packed_depth_stencil.verify_copy_tex_image.depth32f_stencil8
 
diff --git a/src/freedreno/ci/freedreno-a630-skips.txt b/src/freedreno/ci/freedreno-a630-skips.txt
index a05d7144341..b7d49e80510 100644
--- a/src/freedreno/ci/freedreno-a630-skips.txt
+++ b/src/freedreno/ci/freedreno-a630-skips.txt
@@ -18,6 +18,15 @@ dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_equal_spacing
 dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_even_spacing
 dEQP-VK.tessellation.invariance.outer_triangle_set.triangles_fractional_odd_spacing
 
+# Can cause a hangcheck.
+#
+# even if they sometimes pass and could be categorized as flakes, we skip them
+# because device loss will end up failing the rest of the caselist.
+dEQP-VK.api.command_buffers.record_many_draws_primary_2
+dEQP-VK.api.command_buffers.record_many_draws_secondary_2
+spill-dEQP-VK.graphicsfuzz.cov-function-two-loops-limit-using-arguments-array-element-copies
+spill-dEQP-VK.graphicsfuzz.cov-nested-loop-large-array-index-using-vector-components
+
 # timeout, spending all its time in nir_compare_deref_paths()
 # https://gitlab.freedesktop.org/mesa/mesa/-/issues/5152
 dEQP-VK.ubo.random.all_shared_buffer.48
diff --git a/src/freedreno/vulkan/tu_device.c b/src/freedreno/vulkan/tu_device.c
index a1684b848f2..b1ba78dc37c 100644
--- a/src/freedreno/vulkan/tu_device.c
+++ b/src/freedreno/vulkan/tu_device.c
@@ -1699,6 +1699,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
    device->instance = physical_device->instance;
    device->physical_device = physical_device;
    device->fd = physical_device->local_fd;
+   device->vk.check_status = tu_device_check_status;
 
    mtx_init(&device->bo_mutex, mtx_plain);
    u_rwlock_init(&device->dma_bo_lock);
diff --git a/src/freedreno/vulkan/tu_drm.c b/src/freedreno/vulkan/tu_drm.c
index 92ff8755c1e..9e46b08a1f0 100644
--- a/src/freedreno/vulkan/tu_drm.c
+++ b/src/freedreno/vulkan/tu_drm.c
@@ -137,6 +137,23 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
    return ret;
 }
 
+VkResult
+tu_device_check_status(struct vk_device *vk_device)
+{
+   struct tu_device *device = container_of(vk_device, struct tu_device, vk);
+   struct tu_physical_device *physical_device = device->physical_device;
+
+   uint64_t last_fault_count = physical_device->fault_count;
+   int ret = tu_drm_get_param(physical_device, MSM_PARAM_FAULTS, &physical_device->fault_count);
+   if (ret != 0)
+      return vk_device_set_lost(&device->vk, "error getting GPU fault count: %d", ret);
+
+   if (last_fault_count != physical_device->fault_count)
+      return vk_device_set_lost(&device->vk, "GPU faulted or hung");
+
+   return VK_SUCCESS;
+}
+
 int
 tu_drm_submitqueue_new(const struct tu_device *dev,
                        int priority,
@@ -729,6 +746,13 @@ tu_drm_device_init(struct tu_physical_device *device,
       goto fail;
    }
 
+   int ret = tu_drm_get_param(device, MSM_PARAM_FAULTS, &device->fault_count);
+   if (ret != 0) {
+      result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
+                                 "Failed to get initial fault count: %d", ret);
+      goto fail;
+   }
+
    device->syncobj_type = vk_drm_syncobj_get_type(fd);
    device->timeline_type = vk_sync_timeline_get_type(&tu_timeline_sync_type);
 
diff --git a/src/freedreno/vulkan/tu_kgsl.c b/src/freedreno/vulkan/tu_kgsl.c
index 0caece7ffea..55fae932da3 100644
--- a/src/freedreno/vulkan/tu_kgsl.c
+++ b/src/freedreno/vulkan/tu_kgsl.c
@@ -706,6 +706,33 @@ tu_device_get_suspend_count(struct tu_device *dev, uint64_t *suspend_count)
    return 0;
 }
 
+VkResult
+tu_device_check_status(struct vk_device *vk_device)
+{
+   struct tu_device *device = container_of(vk_device, struct tu_device, vk);
+
+   for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
+      for (unsigned q = 0; q < device->queue_count[i]; q++) {
+         /* KGSL's KGSL_PROP_GPU_RESET_STAT takes the u32 msm_queue_id and returns a
+         * KGSL_CTX_STAT_* for the worst reset that happened since the last time it
+         * was queried on that queue.
+         */
+         uint32_t value = device->queues[i][q].msm_queue_id;
+         VkResult status = get_kgsl_prop(device->fd, KGSL_PROP_GPU_RESET_STAT,
+                                       &value, sizeof(value));
+         if (status != VK_SUCCESS)
+            return vk_device_set_lost(&device->vk, "Failed to get GPU reset status");
+
+         if (value != KGSL_CTX_STAT_NO_ERROR &&
+            value != KGSL_CTX_STAT_INNOCENT_CONTEXT_RESET_EXT) {
+            return vk_device_set_lost(&device->vk, "GPU faulted or hung");
+         }
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
 #ifdef ANDROID
 VKAPI_ATTR VkResult VKAPI_CALL
 tu_QueueSignalReleaseImageANDROID(VkQueue _queue,
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index 50f6a648a4d..2d0e4e756c3 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -236,6 +236,9 @@ struct tu_physical_device
    int msm_major_version;
    int msm_minor_version;
 
+   /* Address space and global fault count for this local_fd with DRM backend */
+   uint64_t fault_count;
+
    /* This is the drivers on-disk cache used as a fallback as opposed to
     * the pipeline cache defined by apps.
     */
@@ -538,6 +541,9 @@ tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj
 uint64_t
 tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
 
+VkResult
+tu_device_check_status(struct vk_device *vk_device);
+
 enum tu_bo_alloc_flags
 {
    TU_BO_ALLOC_NO_FLAGS = 0,

Reply via email to