On Fri, Feb 21, 2025 at 04:01:01PM +0000, Alex Bennée wrote: > While running the new GPU tests it was noted that the proprietary > nVidia driver barfed when run under the sanitiser: > > 2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts > EOTF mode SDR and colorimetry mode default. > 2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color > profile: stock sRGB color profile > > and that's the last thing it outputs. > > The sanitizer reports that when the framework sends the SIGTERM > because of the timeout we get a write to a NULL pointer (but > interesting not this time in an atexit callback): > > UndefinedBehaviorSanitizer:DEADLYSIGNAL > ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address > 0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0 > T471863) > ==471863==The signal is caused by a WRITE memory access. > ==471863==Hint: address points to the zero page. > #0 0x7a18ceaafe80 > (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80) > (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) > #1 0x7a18ce9e72c0 > (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0) > (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) > #2 0x7a18ce9f11bb > (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb) > (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) > #3 0x7a18ce6dc9d1 > (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1) > (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) > #4 0x7a18e7d15326 in vrend_renderer_create_fence > > /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26 > #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd > > The #dri-devel channel confirmed: > > <digetx> stsquad: nv driver is known to not work with venus, don't use > it for testing > > So lets implement a blocklist to stop users starting a known bad > setup.
I don't much like the conceptual idea of blocking usage of QEMU itself based on current point-in-time bugs in the host OS driver stack, because it is making an assertion that all future versions of the driver will also be broken and that's not generally valid. If the user chose to use a dodgy graphics driver, they can deal with the consequences of their choice. Skipping only the functional test, without any qemu-system code changes though is more palettable as that's not a hard block on usage. > > Reported-by: Peter Maydell <peter.mayd...@linaro.org> > Signed-off-by: Alex Bennée <alex.ben...@linaro.org> > Cc: Dmitry Osipenko <dmitry.osipe...@collabora.com> > --- > meson.build | 4 + > include/qemu/host-gpu.h | 23 +++++ > hw/display/virtio-gpu.c | 4 + > stubs/host-gpu.c | 17 ++++ > util/host-gpu.c | 102 ++++++++++++++++++++++ > stubs/meson.build | 4 + > tests/functional/test_aarch64_virt_gpu.py | 2 + > util/meson.build | 2 + > 8 files changed, 158 insertions(+) > create mode 100644 include/qemu/host-gpu.h > create mode 100644 stubs/host-gpu.c > create mode 100644 util/host-gpu.c > > diff --git a/meson.build b/meson.build > index 4588bfd864..8f4a431445 100644 > --- a/meson.build > +++ b/meson.build > @@ -1373,12 +1373,16 @@ if not get_option('qatzip').auto() or have_system > endif > > virgl = not_found > +vulkan = not_found > > have_vhost_user_gpu = have_tools and host_os == 'linux' and pixman.found() > if not get_option('virglrenderer').auto() or have_system or > have_vhost_user_gpu > virgl = dependency('virglrenderer', > method: 'pkg-config', > required: get_option('virglrenderer')) > + vulkan = dependency('vulkan', > + method: 'pkg-config', > + required: get_option('virglrenderer')) > endif > rutabaga = not_found > if not get_option('rutabaga_gfx').auto() or have_system or > have_vhost_user_gpu > diff --git a/include/qemu/host-gpu.h b/include/qemu/host-gpu.h > new file mode 100644 > index 0000000000..45053c2f77 > --- /dev/null > +++ b/include/qemu/host-gpu.h > @@ -0,0 +1,23 @@ > +/* > + * Utility functions to probe host GPU > + * > + * Copyright (c) 2025 Linaro Ltd > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > +#ifndef HOST_GPU_H > +#define HOST_GPU_H > + > +#include "qapi/error.h" > + > +/** > + * validate_vulkan_backend() - verify working backend > + * > + * errp: error pointer > + * > + * If the system vulkan implementation is known to not work return > + * false otherwise true. > + */ > +bool validate_vulkan_backend(Error **errp); > + > +#endif /* HOST_GPU_H */ > diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c > index 11a7a85750..816eedf838 100644 > --- a/hw/display/virtio-gpu.c > +++ b/hw/display/virtio-gpu.c > @@ -32,6 +32,7 @@ > #include "qemu/module.h" > #include "qapi/error.h" > #include "qemu/error-report.h" > +#include "qemu/host-gpu.h" > > #define VIRTIO_GPU_VM_VERSION 1 > > @@ -1498,6 +1499,9 @@ void virtio_gpu_device_realize(DeviceState *qdev, Error > **errp) > error_setg(errp, "venus requires enabled blob and hostmem > options"); > return; > } > + if (!validate_vulkan_backend(errp)) { > + return; > + } > #else > error_setg(errp, "old virglrenderer, venus unsupported"); > return; > diff --git a/stubs/host-gpu.c b/stubs/host-gpu.c > new file mode 100644 > index 0000000000..7bf76ee4f6 > --- /dev/null > +++ b/stubs/host-gpu.c > @@ -0,0 +1,17 @@ > +/* > + * Stub of utility functions to probe host GPU > + * > + * Copyright (c) 2025 Linaro Ltd > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#include "qemu/osdep.h" > +#include "qapi/error.h" > +#include "qemu/host-gpu.h" > + > +bool validate_vulkan_backend(Error **errp) > +{ > + error_setg(errp, "No vulkan library present"); > + return false; > +} > diff --git a/util/host-gpu.c b/util/host-gpu.c > new file mode 100644 > index 0000000000..5e7bf2557c > --- /dev/null > +++ b/util/host-gpu.c > @@ -0,0 +1,102 @@ > +/* > + * Utility functions to probe host GPU > + * > + * Copyright (c) 2025 Linaro Ltd > + * > + * SPDX-License-Identifier: GPL-2.0-or-later > + */ > + > +#include "qemu/osdep.h" > +#include "qapi/error.h" > +#include "qemu/host-gpu.h" > + > +#include <vulkan/vulkan.h> > + > +const char *extensions[] = { > + /* needed to query the driver details */ > + "VK_KHR_get_physical_device_properties2", > +}; > + > +/* > + * Info for known broken drivers. Sadly driver version info tends to > + * be in the driverInfo text field which is free form so tricky to > + * parse. > + */ > +struct VkDriverBlockList { > + VkDriverId id; > + const char *reason; > +}; > + > +struct VkDriverBlockList vulkan_blocklist[] = { > + /* at least 535.183.01 is reported to SEGV in libnvidia-eglcore.so */ > + { VK_DRIVER_ID_NVIDIA_PROPRIETARY, "proprietary nVidia driver is broken" > }, > +}; > + > +static bool is_driver_blocked(VkPhysicalDevice dev, Error **errp) > +{ > + VkPhysicalDeviceDriverProperties driverProperties = { > + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES, > + .pNext = NULL > + }; > + VkPhysicalDeviceProperties2 deviceProperties2 = { > + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, > + .pNext = &driverProperties > + }; > + VkPhysicalDeviceProperties *deviceProperties = > &deviceProperties2.properties; > + > + vkGetPhysicalDeviceProperties2(dev, &deviceProperties2); > + > + for (int i = 0; i < ARRAY_SIZE(vulkan_blocklist); i++) { > + if (driverProperties.driverID == vulkan_blocklist[i].id) { > + error_setg(errp, "Blocked GPU %s because %s", > + deviceProperties->deviceName, > + vulkan_blocklist[i].reason); > + return true; > + } > + } > + > + return false; > +} > + > +bool validate_vulkan_backend(Error **errp) > +{ > + VkInstanceCreateInfo instance_info = { > + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, > + NULL, /* pNext extension */ > + 0, /* VkInstanceCreateFlags */ > + NULL, /* Application Info */ > + 0, NULL, /* no Enabled Layers */ > + ARRAY_SIZE(extensions), extensions, /* Extensions */ > + }; > + > + VkInstance inst; > + VkResult res; > + > + res = vkCreateInstance(&instance_info, NULL, &inst); > + > + if ( res == VK_SUCCESS ) { > + uint32_t count; > + VkPhysicalDevice *devices; > + > + /* Do the enumeration two-step */ > + vkEnumeratePhysicalDevices(inst, &count, NULL); > + devices = g_new0(VkPhysicalDevice, count); > + vkEnumeratePhysicalDevices(inst, &count, devices); > + > + for (int i = 0; i < count; i++) { > + if (is_driver_blocked(devices[i], errp)) { > + return false; > + } > + } > + } else { > + error_setg(errp, "Could not initialise a Vulkan instance"); > + return false; > + } > + > + /* > + * It would be nice to g_autofree the instance, but returning > + * false will abort start-up anyway. > + */ > + vkDestroyInstance(inst, NULL); > + return true; > +} > diff --git a/stubs/meson.build b/stubs/meson.build > index b0fee37e05..c18501aa6d 100644 > --- a/stubs/meson.build > +++ b/stubs/meson.build > @@ -89,3 +89,7 @@ if have_system or have_user > stub_ss.add(files('hotplug-stubs.c')) > stub_ss.add(files('sysbus.c')) > endif > + > +if not vulkan.found() > + stubs_ss.add(files('host-gpu.c')) > +endif > diff --git a/tests/functional/test_aarch64_virt_gpu.py > b/tests/functional/test_aarch64_virt_gpu.py > index 7a8471d1ca..9a0e694049 100755 > --- a/tests/functional/test_aarch64_virt_gpu.py > +++ b/tests/functional/test_aarch64_virt_gpu.py > @@ -79,6 +79,8 @@ def _run_virt_gpu_test(self, gpu_device, weston_cmd, > weston_pattern): > self.skipTest("Can't access host DRM render node") > elif "'type' does not accept value 'egl-headless'" in > excp.output: > self.skipTest("egl-headless support is not available") > + elif "Blocked GPU" in excp.output: > + self.skipTest("GPU is in block list") > else: > self.log.info(f"unhandled launch failure: {excp.output}") > raise excp > diff --git a/util/meson.build b/util/meson.build > index 780b5977a8..7c6cc36e07 100644 > --- a/util/meson.build > +++ b/util/meson.build > @@ -132,3 +132,5 @@ elif cpu in ['ppc', 'ppc64'] > elif cpu in ['riscv32', 'riscv64'] > util_ss.add(files('cpuinfo-riscv.c')) > endif > + > +util_ss.add(when: vulkan, if_true: files('host-gpu.c')) > -- > 2.39.5 > With regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|