Re: [PATCH v5 09/13] osdep: move O_DSYNC and O_DIRECT defines from file-posix
On Thu, May 23, 2024 at 04:55:18PM GMT, Stefano Garzarella wrote: These defines are also useful for vhost-user-blk when it is compiled in some POSIX systems that do not define them, so let's move them to “qemu/osdep.h”. Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- include/qemu/osdep.h | 14 ++ block/file-posix.c | 14 -- 2 files changed, 14 insertions(+), 14 deletions(-) This seems to break the compilation on win64: https://gitlab.com/sgarzarella/qemu/-/jobs/6923403322 In file included from ../util/osdep.c:24: ../util/osdep.c: In function 'qemu_open_internal': ../include/qemu/osdep.h:339:18: error: 'O_DSYNC' undeclared (first use in this function) 339 | #define O_DIRECT O_DSYNC | ^~~ ../util/osdep.c:334:41: note: in expansion of macro 'O_DIRECT' 334 | if (errno == EINVAL && (flags & O_DIRECT)) { | ^~~~ ../include/qemu/osdep.h:339:18: note: each undeclared identifier is reported only once for each function it appears in 339 | #define O_DIRECT O_DSYNC | ^~~ ../util/osdep.c:334:41: note: in expansion of macro 'O_DIRECT' 334 | if (errno == EINVAL && (flags & O_DIRECT)) { | ^~~~ ../util/osdep.c: In function 'qemu_open_old': ../include/qemu/osdep.h:339:18: error: 'O_DSYNC' undeclared (first use in this function) 339 | #define O_DIRECT O_DSYNC | ^~~ ../util/osdep.c:385:50: note: in expansion of macro 'O_DIRECT' 385 | if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) { | Indeed file-posix.c was not compiled on windows. Oops, I didn't think of that :-( I'm thinking on putting a guard on CONFIG_POSIX, or just checking that O_DSYNC is defined. Any suggestion? Thanks, Stefano diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index f61edcfdc2..e165b5cb1b 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -325,6 +325,20 @@ void QEMU_ERROR("code path is reachable") #define ESHUTDOWN 4099 #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + #define RETRY_ON_EINTR(expr) \ (__extension__ \ ({ typeof(expr) __result; \ diff --git a/block/file-posix.c b/block/file-posix.c index 35684f7e21..7a196a2abf 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -110,20 +110,6 @@ #include #endif -/* OS X does not have O_DSYNC */ -#ifndef O_DSYNC -#ifdef O_SYNC -#define O_DSYNC O_SYNC -#elif defined(O_FSYNC) -#define O_DSYNC O_FSYNC -#endif -#endif - -/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ -#ifndef O_DIRECT -#define O_DIRECT O_DSYNC -#endif - #define FTYPE_FILE 0 #define FTYPE_CD 1 -- 2.45.1
Re: [PATCH v5 12/13] tests/qtest/vhost-user-blk-test: use memory-backend-shm
On Thu, May 23, 2024 at 05:06:00PM GMT, David Hildenbrand wrote: On 23.05.24 16:55, Stefano Garzarella wrote: `memory-backend-memfd` is available only on Linux while the new `memory-backend-shm` can be used on any POSIX-compliant operating system. Let's use it so we can run the test in multiple environments. Acked-by: Thomas Huth Acked-by: Stefan Hajnoczi Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-blk-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qtest/vhost-user-blk-test.c b/tests/qtest/vhost-user-blk-test.c index 117b9acd10..e945f6abf2 100644 --- a/tests/qtest/vhost-user-blk-test.c +++ b/tests/qtest/vhost-user-blk-test.c @@ -906,7 +906,7 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, vhost_user_blk_bin); g_string_append_printf(cmd_line, -" -object memory-backend-memfd,id=mem,size=256M,share=on " +" -object memory-backend-shm,id=mem,size=256M,share=on " Can we simplifya nd drop the share=on? Good catch! I'll do in the next version! Reviewed-by: David Hildenbrand Thanks for the reviews, Stefano
Re: [PATCH v5 09/13] osdep: move O_DSYNC and O_DIRECT defines from file-posix
On Thu, May 23, 2024 at 04:14:48PM GMT, Daniel P. Berrangé wrote: On Thu, May 23, 2024 at 04:55:18PM +0200, Stefano Garzarella wrote: These defines are also useful for vhost-user-blk when it is compiled in some POSIX systems that do not define them, so let's move them to “qemu/osdep.h”. Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- include/qemu/osdep.h | 14 ++ block/file-posix.c | 14 -- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index f61edcfdc2..e165b5cb1b 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -325,6 +325,20 @@ void QEMU_ERROR("code path is reachable") #define ESHUTDOWN 4099 #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif Please don't do this - we can't be confident that all code in QEMU will be OK with O_DIRECT being simulated in this way. I'm not convinced that the O_DSYNC simulation is a good idea to do tree-wide either. I was a little scared, and you and the failing tests on win64 convinced me to bring this back as in v4 ;-) Thanks, Stefano
[PATCH v5 05/13] contrib/vhost-user-blk: fix bind() using the right size of the address
On macOS passing `-s /tmp/vhost.socket` parameter to the vhost-user-blk application, the bind was done on `/tmp/vhost.socke` pathname, missing the last character. This sounds like one of the portability problems described in the unix(7) manpage: Pathname sockets When binding a socket to a pathname, a few rules should be observed for maximum portability and ease of coding: • The pathname in sun_path should be null-terminated. • The length of the pathname, including the terminating null byte, should not exceed the size of sun_path. • The addrlen argument that describes the enclosing sockaddr_un structure should have a value of at least: offsetof(struct sockaddr_un, sun_path) + strlen(addr.sun_path)+1 or, more simply, addrlen can be specified as sizeof(struct sockaddr_un). So let's follow the last advice and simplify the code as well. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- contrib/vhost-user-blk/vhost-user-blk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 89e5f11a64..a8ab9269a2 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -469,7 +469,6 @@ static int unix_sock_new(char *unix_fn) { int sock; struct sockaddr_un un; -size_t len; assert(unix_fn); @@ -481,10 +480,9 @@ static int unix_sock_new(char *unix_fn) un.sun_family = AF_UNIX; (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn); -len = sizeof(un.sun_family) + strlen(un.sun_path); (void)unlink(unix_fn); -if (bind(sock, (struct sockaddr *), len) < 0) { +if (bind(sock, (struct sockaddr *), sizeof(un)) < 0) { perror("bind"); goto fail; } -- 2.45.1
[PATCH v5 11/13] hostmem: add a new memory backend based on POSIX shm_open()
shm_open() creates and opens a new POSIX shared memory object. A POSIX shared memory object allows creating memory backend with an associated file descriptor that can be shared with external processes (e.g. vhost-user). The new `memory-backend-shm` can be used as an alternative when `memory-backend-memfd` is not available (Linux only), since shm_open() should be provided by any POSIX-compliant operating system. This backend mimics memfd, allocating memory that is practically anonymous. In theory shm_open() requires a name, but this is allocated for a short time interval and shm_unlink() is called right after shm_open(). After that, only fd is shared with external processes (e.g., vhost-user) as if it were associated with anonymous memory. In the future we may also allow the user to specify the name to be passed to shm_open(), but for now we keep the backend simple, mimicking anonymous memory such as memfd. Acked-by: David Hildenbrand Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- v5 - fixed documentation in qapi/qom.json and qemu-options.hx [Markus] v4 - fail if we find "share=off" in shm_backend_memory_alloc() [David] v3 - enriched commit message and documentation to highlight that we want to mimic memfd (David) --- docs/system/devices/vhost-user.rst | 5 +- qapi/qom.json | 19 + backends/hostmem-shm.c | 123 + backends/meson.build | 1 + qemu-options.hx| 16 5 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 backends/hostmem-shm.c diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst index 9b2da106ce..35259d8ec7 100644 --- a/docs/system/devices/vhost-user.rst +++ b/docs/system/devices/vhost-user.rst @@ -98,8 +98,9 @@ Shared memory object In order for the daemon to access the VirtIO queues to process the requests it needs access to the guest's address space. This is -achieved via the ``memory-backend-file`` or ``memory-backend-memfd`` -objects. A reference to a file-descriptor which can access this object +achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or +``memory-backend-shm`` objects. +A reference to a file-descriptor which can access this object will be passed via the socket as part of the protocol negotiation. Currently the shared memory object needs to match the size of the main diff --git a/qapi/qom.json b/qapi/qom.json index 38dde6d785..d40592d863 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -721,6 +721,21 @@ '*hugetlbsize': 'size', '*seal': 'bool' } } +## +# @MemoryBackendShmProperties: +# +# Properties for memory-backend-shm objects. +# +# The @share boolean option is true by default with shm. Setting it to false +# will cause a failure during allocation because it is not supported by this +# backend. +# +# Since: 9.1 +## +{ 'struct': 'MemoryBackendShmProperties', + 'base': 'MemoryBackendProperties', + 'data': { } } + ## # @MemoryBackendEpcProperties: # @@ -985,6 +1000,8 @@ { 'name': 'memory-backend-memfd', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram', +{ 'name': 'memory-backend-shm', + 'if': 'CONFIG_POSIX' }, 'pef-guest', { 'name': 'pr-manager-helper', 'if': 'CONFIG_LINUX' }, @@ -1056,6 +1073,8 @@ 'memory-backend-memfd': { 'type': 'MemoryBackendMemfdProperties', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram': 'MemoryBackendProperties', + 'memory-backend-shm': { 'type': 'MemoryBackendShmProperties', + 'if': 'CONFIG_POSIX' }, 'pr-manager-helper': { 'type': 'PrManagerHelperProperties', 'if': 'CONFIG_LINUX' }, 'qtest': 'QtestProperties', diff --git a/backends/hostmem-shm.c b/backends/hostmem-shm.c new file mode 100644 index 00..374edc3db8 --- /dev/null +++ b/backends/hostmem-shm.c @@ -0,0 +1,123 @@ +/* + * QEMU host POSIX shared memory object backend + * + * Copyright (C) 2024 Red Hat Inc + * + * Authors: + * Stefano Garzarella + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "sysemu/hostmem.h" +#include "qapi/error.h" + +#define TYPE_MEMORY_BACKEND_SHM "memory-backend-shm" + +OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendShm, MEMORY_BACKEND_SHM) + +struct HostMemoryBackendShm { +HostMemoryBackend parent_obj; +}; + +static bool +shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ +g_autoptr(GString) shm_name = g_string_new(NULL); +g_autofree char *backend_name = NULL; +uint32_t ram_flags; +int fd, oflag; +mode_t mode; + +if (!backend->size) { +error_setg(errp, "can
[PATCH v5 10/13] contrib/vhost-user-blk: enable it on any POSIX system
Previous patches made the vhost-user-blk application and the vhost-user-server.c dependency buildable for any POSIX system. Acked-by: Stefan Hajnoczi Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- v5: - O_DSYNC and O_DIRECT definition are now in osdep [Phil] - commit updated since we moved out all code changes v4: - moved using of "qemu/bswap.h" API in a separate patch [Phil] --- meson.build | 2 -- util/meson.build | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 543105af2a..06a1835a09 100644 --- a/meson.build +++ b/meson.build @@ -1974,8 +1974,6 @@ has_statx = cc.has_header_symbol('sys/stat.h', 'STATX_BASIC_STATS', prefix: gnu_ has_statx_mnt_id = cc.has_header_symbol('sys/stat.h', 'STATX_MNT_ID', prefix: gnu_source_prefix) have_vhost_user_blk_server = get_option('vhost_user_blk_server') \ - .require(host_os == 'linux', - error_message: 'vhost_user_blk_server requires linux') \ .require(have_vhost_user, error_message: 'vhost_user_blk_server requires vhost-user support') \ .disable_auto_if(not have_tools and not have_system) \ diff --git a/util/meson.build b/util/meson.build index 72b505df11..c414178ace 100644 --- a/util/meson.build +++ b/util/meson.build @@ -112,10 +112,12 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif if host_os == 'linux' -util_ss.add(files('vhost-user-server.c'), vhost_user) util_ss.add(files('vfio-helpers.c')) util_ss.add(files('chardev_open.c')) endif + if host_os != 'windows' +util_ss.add(files('vhost-user-server.c'), vhost_user) + endif util_ss.add(files('yank.c')) endif -- 2.45.1
[PATCH v5 13/13] tests/qtest/vhost-user-test: add a test case for memory-backend-shm
`memory-backend-shm` can be used with vhost-user devices, so let's add a new test case for it. Acked-by: Thomas Huth Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-test.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index d4e437265f..8c1d903b2a 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -44,6 +44,8 @@ "mem-path=%s,share=on -numa node,memdev=mem" #define QEMU_CMD_MEMFD " -m %d -object memory-backend-memfd,id=mem,size=%dM," \ " -numa node,memdev=mem" +#define QEMU_CMD_SHM" -m %d -object memory-backend-shm,id=mem,size=%dM," \ +" -numa node,memdev=mem" #define QEMU_CMD_CHR" -chardev socket,id=%s,path=%s%s" #define QEMU_CMD_NETDEV " -netdev vhost-user,id=hs0,chardev=%s,vhostforce=on" @@ -195,6 +197,7 @@ enum test_memfd { TEST_MEMFD_AUTO, TEST_MEMFD_YES, TEST_MEMFD_NO, +TEST_MEMFD_SHM, }; static void append_vhost_net_opts(TestServer *s, GString *cmd_line, @@ -228,6 +231,8 @@ static void append_mem_opts(TestServer *server, GString *cmd_line, if (memfd == TEST_MEMFD_YES) { g_string_append_printf(cmd_line, QEMU_CMD_MEMFD, size, size); +} else if (memfd == TEST_MEMFD_SHM) { +g_string_append_printf(cmd_line, QEMU_CMD_SHM, size, size); } else { const char *root = init_hugepagefs() ? : server->tmpfs; @@ -788,6 +793,19 @@ static void *vhost_user_test_setup_memfd(GString *cmd_line, void *arg) return server; } +static void *vhost_user_test_setup_shm(GString *cmd_line, void *arg) +{ +TestServer *server = test_server_new("vhost-user-test", arg); +test_server_listen(server); + +append_mem_opts(server, cmd_line, 256, TEST_MEMFD_SHM); +server->vu_ops->append_opts(server, cmd_line, ""); + +g_test_queue_destroy(vhost_user_test_cleanup, server); + +return server; +} + static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *server = arg; @@ -1081,6 +1099,11 @@ static void register_vhost_user_test(void) "virtio-net", test_read_guest_mem, ); +opts.before = vhost_user_test_setup_shm; +qos_add_test("vhost-user/read-guest-mem/shm", + "virtio-net", + test_read_guest_mem, ); + if (qemu_memfd_check(MFD_ALLOW_SEALING)) { opts.before = vhost_user_test_setup_memfd; qos_add_test("vhost-user/read-guest-mem/memfd", -- 2.45.1
[PATCH v5 07/13] vhost-user: enable frontends on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable vhost-user frontends for any POSIX system. In vhost_net.c we use VHOST_FILE_UNBIND which is defined in a Linux specific header, let's define it for other systems as well. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- meson.build| 1 - hw/net/vhost_net.c | 5 + hw/block/Kconfig | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index a9de71d450..5a6f7a36eb 100644 --- a/meson.build +++ b/meson.build @@ -151,7 +151,6 @@ have_tpm = get_option('tpm') \ # vhost have_vhost_user = get_option('vhost_user') \ - .disable_auto_if(host_os != 'linux') \ .require(host_os != 'windows', error_message: 'vhost-user is not available on Windows').allowed() have_vhost_vdpa = get_option('vhost_vdpa') \ diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index fd1a93701a..fced429813 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -34,8 +34,13 @@ #include "standard-headers/linux/virtio_ring.h" #include "hw/virtio/vhost.h" #include "hw/virtio/virtio-bus.h" +#if defined(__linux__) #include "linux-headers/linux/vhost.h" +#endif +#ifndef VHOST_FILE_UNBIND +#define VHOST_FILE_UNBIND -1 +#endif /* Features supported by host kernel. */ static const int kernel_feature_bits[] = { diff --git a/hw/block/Kconfig b/hw/block/Kconfig index 9e8f28f982..29ee09e434 100644 --- a/hw/block/Kconfig +++ b/hw/block/Kconfig @@ -40,7 +40,7 @@ config VHOST_USER_BLK bool # Only PCI devices are provided for now default y if VIRTIO_PCI -depends on VIRTIO && VHOST_USER && LINUX +depends on VIRTIO && VHOST_USER config SWIM bool -- 2.45.1
[PATCH v5 08/13] libvhost-user: enable it on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable libvhost-user for any POSIX system. Compiling it on macOS and FreeBSD some problems came up: - avoid to include linux/vhost.h which is available only on Linux (vhost_types.h contains many of the things we need) - macOS doesn't provide sys/endian.h, so let's define them (note: libvhost-user doesn't include QEMU's headers, so we can't use use "qemu/bswap.h") - define eventfd_[write|read] as write/read wrapper when system doesn't provide those (e.g. macOS) - copy SEAL defines from include/qemu/memfd.h to make the code works on FreeBSD where MFD_ALLOW_SEALING is defined - define MAP_NORESERVE if it's not defined (e.g. on FreeBSD) Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- v5: - fixed typos in the commit description [Phil] --- meson.build | 2 +- subprojects/libvhost-user/libvhost-user.h | 2 +- subprojects/libvhost-user/libvhost-user.c | 60 +-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index 5a6f7a36eb..543105af2a 100644 --- a/meson.build +++ b/meson.build @@ -3155,7 +3155,7 @@ if have_system and vfio_user_server_allowed endif vhost_user = not_found -if host_os == 'linux' and have_vhost_user +if have_vhost_user libvhost_user = subproject('libvhost-user') vhost_user = libvhost_user.get_variable('vhost_user_dep') endif diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index deb40e77b3..e13e1d3931 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -18,9 +18,9 @@ #include #include #include -#include #include #include "standard-headers/linux/virtio_ring.h" +#include "standard-headers/linux/vhost_types.h" /* Based on qemu/hw/virtio/vhost-user.c */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 2c20cdc16e..57e58d4adb 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include /* Necessary to provide VIRTIO_F_VERSION_1 on system * with older linux headers. Must appear before @@ -39,8 +37,8 @@ #include "standard-headers/linux/virtio_config.h" #if defined(__linux__) +#include #include -#include #include #include #include @@ -52,6 +50,62 @@ #endif +#if defined(__APPLE__) && (__MACH__) +#include +#define htobe16(x) OSSwapHostToBigInt16(x) +#define htole16(x) OSSwapHostToLittleInt16(x) +#define be16toh(x) OSSwapBigToHostInt16(x) +#define le16toh(x) OSSwapLittleToHostInt16(x) + +#define htobe32(x) OSSwapHostToBigInt32(x) +#define htole32(x) OSSwapHostToLittleInt32(x) +#define be32toh(x) OSSwapBigToHostInt32(x) +#define le32toh(x) OSSwapLittleToHostInt32(x) + +#define htobe64(x) OSSwapHostToBigInt64(x) +#define htole64(x) OSSwapHostToLittleInt64(x) +#define be64toh(x) OSSwapBigToHostInt64(x) +#define le64toh(x) OSSwapLittleToHostInt64(x) +#endif + +#ifdef CONFIG_EVENTFD +#include +#else +#define eventfd_t uint64_t + +int eventfd_write(int fd, eventfd_t value) +{ +return (write(fd, , sizeof(value)) == sizeof(value)) ? 0 : -1; +} + +int eventfd_read(int fd, eventfd_t *value) +{ +return (read(fd, value, sizeof(*value)) == sizeof(*value)) ? 0 : -1; +} +#endif + +#ifdef MFD_ALLOW_SEALING +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE0x0008 /* prevent writes */ +#endif +#endif + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + #include "include/atomic.h" #include "libvhost-user.h" -- 2.45.1
[PATCH v5 12/13] tests/qtest/vhost-user-blk-test: use memory-backend-shm
`memory-backend-memfd` is available only on Linux while the new `memory-backend-shm` can be used on any POSIX-compliant operating system. Let's use it so we can run the test in multiple environments. Acked-by: Thomas Huth Acked-by: Stefan Hajnoczi Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-blk-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qtest/vhost-user-blk-test.c b/tests/qtest/vhost-user-blk-test.c index 117b9acd10..e945f6abf2 100644 --- a/tests/qtest/vhost-user-blk-test.c +++ b/tests/qtest/vhost-user-blk-test.c @@ -906,7 +906,7 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, vhost_user_blk_bin); g_string_append_printf(cmd_line, -" -object memory-backend-memfd,id=mem,size=256M,share=on " +" -object memory-backend-shm,id=mem,size=256M,share=on " " -M memory-backend=mem -m 256M "); for (i = 0; i < vus_instances; i++) { -- 2.45.1
[PATCH v5 03/13] libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
libvhost-user will panic when receiving VHOST_USER_GET_INFLIGHT_FD message if MFD_ALLOW_SEALING is not defined, since it's not able to create a memfd. VHOST_USER_GET_INFLIGHT_FD is used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is negotiated. So, let's mask that feature if the backend is not able to properly handle these messages. Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a11afd1960..2c20cdc16e 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -1674,6 +1674,17 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) features |= dev->iface->get_protocol_features(dev); } +#ifndef MFD_ALLOW_SEALING +/* + * If MFD_ALLOW_SEALING is not defined, we are not able to handle + * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd. + * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + * is negotiated. A device implementation can enable it, so let's mask + * it to avoid a runtime panic. + */ +features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); +#endif + vmsg_set_reply_u64(vmsg, features); return true; } -- 2.45.1
[PATCH v5 02/13] libvhost-user: fail vu_message_write() if sendmsg() is failing
In vu_message_write() we use sendmsg() to send the message header, then a write() to send the payload. If sendmsg() fails we should avoid sending the payload, since we were unable to send the header. Discovered before fixing the issue with the previous patch, where sendmsg() failed on macOS due to wrong parameters, but the frontend still sent the payload which the backend incorrectly interpreted as a wrong header. Reviewed-by: Daniel P. Berrangé Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 5 + 1 file changed, 5 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 22bea0c775..a11afd1960 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -639,6 +639,11 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, , 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); +if (rc <= 0) { +vu_panic(dev, "Error while writing: %s", strerror(errno)); +return false; +} + if (vmsg->size) { do { if (vmsg->data) { -- 2.45.1
[PATCH v5 09/13] osdep: move O_DSYNC and O_DIRECT defines from file-posix
These defines are also useful for vhost-user-blk when it is compiled in some POSIX systems that do not define them, so let's move them to “qemu/osdep.h”. Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- include/qemu/osdep.h | 14 ++ block/file-posix.c | 14 -- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index f61edcfdc2..e165b5cb1b 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -325,6 +325,20 @@ void QEMU_ERROR("code path is reachable") #define ESHUTDOWN 4099 #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + #define RETRY_ON_EINTR(expr) \ (__extension__ \ ({ typeof(expr) __result; \ diff --git a/block/file-posix.c b/block/file-posix.c index 35684f7e21..7a196a2abf 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -110,20 +110,6 @@ #include #endif -/* OS X does not have O_DSYNC */ -#ifndef O_DSYNC -#ifdef O_SYNC -#define O_DSYNC O_SYNC -#elif defined(O_FSYNC) -#define O_DSYNC O_FSYNC -#endif -#endif - -/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ -#ifndef O_DIRECT -#define O_DIRECT O_DSYNC -#endif - #define FTYPE_FILE 0 #define FTYPE_CD 1 -- 2.45.1
[PATCH v5 04/13] vhost-user-server: do not set memory fd non-blocking
In vhost-user-server we set all fd received from the other peer in non-blocking mode. For some of them (e.g. memfd, shm_open, etc.) it's not really needed, because we don't use these fd with blocking operations, but only to map memory. In addition, in some systems this operation can fail (e.g. in macOS setting an fd returned by shm_open() non-blocking fails with errno = ENOTTY). So, let's avoid setting fd non-blocking for those messages that we know carry memory fd (e.g. VHOST_USER_ADD_MEM_REG, VHOST_USER_SET_MEM_TABLE). Reviewed-by: Daniel P. Berrangé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- v3: - avoiding setting fd non-blocking for messages where we have memory fd (Eric) --- util/vhost-user-server.c | 12 1 file changed, 12 insertions(+) diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index 3bfb1ad3ec..b19229074a 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -65,6 +65,18 @@ static void vmsg_close_fds(VhostUserMsg *vmsg) static void vmsg_unblock_fds(VhostUserMsg *vmsg) { int i; + +/* + * These messages carry fd used to map memory, not to send/receive messages, + * so this operation is useless. In addition, in some systems this + * operation can fail (e.g. in macOS setting an fd returned by shm_open() + * non-blocking fails with errno = ENOTTY) + */ +if (vmsg->request == VHOST_USER_ADD_MEM_REG || +vmsg->request == VHOST_USER_SET_MEM_TABLE) { +return; +} + for (i = 0; i < vmsg->fd_num; i++) { qemu_socket_set_nonblock(vmsg->fds[i]); } -- 2.45.1
[PATCH v5 06/13] contrib/vhost-user-*: use QEMU bswap helper functions
Let's replace the calls to le*toh() and htole*() with qemu/bswap.h helpers to make the code more portable. Suggested-by: Philippe Mathieu-Daudé Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- contrib/vhost-user-blk/vhost-user-blk.c | 9 + contrib/vhost-user-input/main.c | 16 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index a8ab9269a2..9492146855 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/bswap.h" #include "standard-headers/linux/virtio_blk.h" #include "libvhost-user-glib.h" @@ -194,8 +195,8 @@ vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt, #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT) VubDev *vdev_blk = req->vdev_blk; desc = buf; -uint64_t range[2] = { le64toh(desc->sector) << 9, - le32toh(desc->num_sectors) << 9 }; +uint64_t range[2] = { le64_to_cpu(desc->sector) << 9, + le32_to_cpu(desc->num_sectors) << 9 }; if (type == VIRTIO_BLK_T_DISCARD) { if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) { g_free(buf); @@ -267,13 +268,13 @@ static int vub_virtio_process_req(VubDev *vdev_blk, req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base; in_num--; -type = le32toh(req->out->type); +type = le32_to_cpu(req->out->type); switch (type & ~VIRTIO_BLK_T_BARRIER) { case VIRTIO_BLK_T_IN: case VIRTIO_BLK_T_OUT: { ssize_t ret = 0; bool is_write = type & VIRTIO_BLK_T_OUT; -req->sector_num = le64toh(req->out->sector); +req->sector_num = le64_to_cpu(req->out->sector); if (is_write) { ret = vub_writev(req, >out_sg[1], out_num); } else { diff --git a/contrib/vhost-user-input/main.c b/contrib/vhost-user-input/main.c index 081230da54..f3362d41ac 100644 --- a/contrib/vhost-user-input/main.c +++ b/contrib/vhost-user-input/main.c @@ -51,8 +51,8 @@ static void vi_input_send(VuInput *vi, struct virtio_input_event *event) vi->queue[vi->qindex++].event = *event; /* ... until we see a report sync ... */ -if (event->type != htole16(EV_SYN) || -event->code != htole16(SYN_REPORT)) { +if (event->type != cpu_to_le16(EV_SYN) || +event->code != cpu_to_le16(SYN_REPORT)) { return; } @@ -103,9 +103,9 @@ vi_evdev_watch(VuDev *dev, int condition, void *data) g_debug("input %d %d %d", evdev.type, evdev.code, evdev.value); -virtio.type = htole16(evdev.type); -virtio.code = htole16(evdev.code); -virtio.value = htole32(evdev.value); +virtio.type = cpu_to_le16(evdev.type); +virtio.code = cpu_to_le16(evdev.code); +virtio.value = cpu_to_le32(evdev.value); vi_input_send(vi, ); } } @@ -124,9 +124,9 @@ static void vi_handle_status(VuInput *vi, virtio_input_event *event) evdev.input_event_sec = tval.tv_sec; evdev.input_event_usec = tval.tv_usec; -evdev.type = le16toh(event->type); -evdev.code = le16toh(event->code); -evdev.value = le32toh(event->value); +evdev.type = le16_to_cpu(event->type); +evdev.code = le16_to_cpu(event->code); +evdev.value = le32_to_cpu(event->value); rc = write(vi->evdevfd, , sizeof(evdev)); if (rc == -1) { -- 2.45.1
[PATCH v5 01/13] libvhost-user: set msg.msg_control to NULL when it is empty
On some OS (e.g. macOS) sendmsg() returns -1 (errno EINVAL) if the `struct msghdr` has the field `msg_controllen` set to 0, but `msg_control` is not NULL. Reviewed-by: Eric Blake Reviewed-by: David Hildenbrand Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Acked-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a879149fef..22bea0c775 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -632,6 +632,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); } else { msg.msg_controllen = 0; +msg.msg_control = NULL; } do { -- 2.45.1
[PATCH v5 00/13] vhost-user: support any POSIX system (tested on macOS, FreeBSD, OpenBSD)
v1: https://patchew.org/QEMU/20240228114759.44758-1-sgarz...@redhat.com/ v2: https://patchew.org/QEMU/20240326133936.125332-1-sgarz...@redhat.com/ v3: https://patchew.org/QEMU/20240404122330.92710-1-sgarz...@redhat.com/ v4: https://patchew.org/QEMU/20240508074457.12367-1-sgarz...@redhat.com/ v5: - rebased on 7e1c0047015ffbd408e1aa4a5ec1abe4751dbf7e - added some R-b/A-b/T-b tags [Daniel, Phil, Thomas, Stefan thanks!] - added new patch to move O_DSYNC and O_DIRECT defines in osdep [Phil] - fixed memory-backend-shm documentation in qapi/qom.json and qemu-options.hx [Markus] - fixed typos in some commits description [Phil] The vhost-user protocol is not really Linux-specific, so let's try support QEMU's frontends and backends (including libvhost-user) in any POSIX system with this series. The main use case is to be able to use virtio devices that we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even in non-Linux systems. The first 5 patches are more like fixes discovered at runtime on macOS or FreeBSD that could go even independently of this series. Patches 6, 7, 8, 9, and 10 enable building of frontends and backends (including libvhost-user) with associated code changes to succeed in compilation. Patch 11 adds `memory-backend-shm` that uses the POSIX shm_open() API to create shared memory which is identified by an fd that can be shared with vhost-user backends. This is useful on those systems (like macOS) where we don't have memfd_create() or special filesystems like "/dev/shm". Patches 12 and 13 use `memory-backend-shm` in some vhost-user tests. Maybe the first 5 patches can go separately, but I only discovered those problems after testing patches 6 - 10, so I have included them in this series for now. Please let me know if you prefer that I send them separately. I tested this series using vhost-user-blk and QSD on macOS Sonoma 14.4 (aarch64), FreeBSD 14 (x86_64), OpenBSD 7.4 (x86_64), and Fedora 40 (x86_64) in this way: - Start vhost-user-blk or QSD (same commands for all systems) vhost-user-blk -s /tmp/vhost.socket \ -b Fedora-Cloud-Base-39-1.5.x86_64.raw qemu-storage-daemon \ --blockdev file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \ --blockdev qcow2,file=file,node-name=qcow2 \ --export vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on - macOS (aarch64): start QEMU (using hvf accelerator) qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \ -drive file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \ -device virtio-net-device,netdev=net0 -netdev user,id=net0 \ -device ramfb -device usb-ehci -device usb-kbd \ -object memory-backend-shm,id=mem,size=512M \ -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - FreeBSD/OpenBSD (x86_64): start QEMU (no accelerators available) qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \ -object memory-backend-shm,id=mem,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - Fedora (x86_64): start QEMU (using kvm accelerator) qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \ -object memory-backend-shm,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket Branch pushed (and CI started) at https://gitlab.com/sgarzarella/qemu/-/tree/macos-vhost-user?ref_type=heads Thanks, Stefano Stefano Garzarella (13): libvhost-user: set msg.msg_control to NULL when it is empty libvhost-user: fail vu_message_write() if sendmsg() is failing libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported vhost-user-server: do not set memory fd non-blocking contrib/vhost-user-blk: fix bind() using the right size of the address contrib/vhost-user-*: use QEMU bswap helper functions vhost-user: enable frontends on any POSIX system libvhost-user: enable it on any POSIX system osdep: move O_DSYNC and O_DIRECT defines from file-posix contrib/vhost-user-blk: enable it on any POSIX system hostmem: add a new memory backend based on POSIX shm_open() tests/qtest/vhost-user-blk-test: use memory-backend-shm tests/qtest/vhost-user-test: add a test case for memory-backend-shm docs/system/devices/vhost-user.rst| 5 +- meson.build | 5 +- qapi/qom.json | 19 include/qemu/osdep.h | 14 +++ subprojects/libvhost-user/libvhost-user.h | 2 +- backends/hostmem-shm.c| 123 ++ block/file-posix.c| 14 --- contrib/vhost-user-blk/vhost-user-blk.c | 13 ++- contrib/vhost-user-input/main.c | 16 +-- hw/net/vhost_net.c
Re: [PATCH v4 10/12] hostmem: add a new memory backend based on POSIX shm_open()
On Wed, May 08, 2024 at 01:59:33PM GMT, Markus Armbruster wrote: Stefano Garzarella writes: shm_open() creates and opens a new POSIX shared memory object. A POSIX shared memory object allows creating memory backend with an associated file descriptor that can be shared with external processes (e.g. vhost-user). The new `memory-backend-shm` can be used as an alternative when `memory-backend-memfd` is not available (Linux only), since shm_open() should be provided by any POSIX-compliant operating system. This backend mimics memfd, allocating memory that is practically anonymous. In theory shm_open() requires a name, but this is allocated for a short time interval and shm_unlink() is called right after shm_open(). After that, only fd is shared with external processes (e.g., vhost-user) as if it were associated with anonymous memory. In the future we may also allow the user to specify the name to be passed to shm_open(), but for now we keep the backend simple, mimicking anonymous memory such as memfd. Acked-by: David Hildenbrand Signed-off-by: Stefano Garzarella --- v4 - fail if we find "share=off" in shm_backend_memory_alloc() [David] v3 - enriched commit message and documentation to highlight that we want to mimic memfd (David) --- docs/system/devices/vhost-user.rst | 5 +- qapi/qom.json | 17 backends/hostmem-shm.c | 123 + backends/meson.build | 1 + qemu-options.hx| 13 +++ 5 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 backends/hostmem-shm.c diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst index 9b2da106ce..35259d8ec7 100644 --- a/docs/system/devices/vhost-user.rst +++ b/docs/system/devices/vhost-user.rst @@ -98,8 +98,9 @@ Shared memory object In order for the daemon to access the VirtIO queues to process the requests it needs access to the guest's address space. This is -achieved via the ``memory-backend-file`` or ``memory-backend-memfd`` -objects. A reference to a file-descriptor which can access this object +achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or +``memory-backend-shm`` objects. +A reference to a file-descriptor which can access this object will be passed via the socket as part of the protocol negotiation. Currently the shared memory object needs to match the size of the main diff --git a/qapi/qom.json b/qapi/qom.json index 38dde6d785..52df052df8 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -721,6 +721,19 @@ '*hugetlbsize': 'size', '*seal': 'bool' } } +## +# @MemoryBackendShmProperties: +# +# Properties for memory-backend-shm objects. +# +# The @share boolean option is true by default with shm. This contradicts the doc comment for @share: # @share: if false, the memory is private to QEMU; if true, it is # shared (default: false) Your intention is to override that text. But that's less than clear. Moreover, the documentation of @share is pretty far from this override. John Snow is working on patches that'll pull it closer. Hmm, MemoryBackendMemfdProperties has the same override. Yep, I followed @MemoryBackendMemfdProperties and @MemoryBackendEpcProperties. I think we should change the doc comment for @share to something like # @share: if false, the memory is private to QEMU; if true, it is # shared (default depends on the backend type) and then document the actual default with each backend type. I agree on that, but I think we should do in a separate series/patch. If you prefer, I can send that patch. +# +# Since: 9.1 +## +{ 'struct': 'MemoryBackendShmProperties', + 'base': 'MemoryBackendProperties', + 'data': { } } Let's add 'if': 'CONFIG_POSIX' here. Do you mean something like this: { 'struct': 'MemoryBackendShmProperties', 'if': 'CONFIG_POSIX', 'base': 'MemoryBackendProperties', 'data': { } } I didn't because for MemoryBackendMemfdProperties and MemoryBackendEpcProperties we have 'if': 'CONFIG_POSIX' only later in the ObjectOptions union, so I did the same. Should we fix them as well? + ## # @MemoryBackendEpcProperties: # @@ -985,6 +998,8 @@ { 'name': 'memory-backend-memfd', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram', +{ 'name': 'memory-backend-shm', + 'if': 'CONFIG_POSIX' }, 'pef-guest', { 'name': 'pr-manager-helper', 'if': 'CONFIG_LINUX' }, @@ -1056,6 +1071,8 @@ 'memory-backend-memfd': { 'type': 'MemoryBackendMemfdProperties', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram': 'MemoryBackendProperties', + 'memory-backend-shm': { 'type': 'MemoryBackendShmProperties', + 'if': 'CONFIG_POSIX' }, 'pr-manager-helper': { 'type': 'PrManagerHelperProperties', 'if': 'CO
Re: [PATCH v4 09/12] contrib/vhost-user-blk: enable it on any POSIX system
On Wed, May 08, 2024 at 12:32:08PM GMT, Philippe Mathieu-Daudé wrote: On 8/5/24 09:44, Stefano Garzarella wrote: Let's make the code more portable by adding defines from block/file-posix.c to support O_DIRECT in other systems (e.g. macOS). vhost-user-server.c is a dependency, let's enable it for any POSIX system. Signed-off-by: Stefano Garzarella --- v4: - moved using of "qemu/bswap.h" API in a separate patch [Phil] --- meson.build | 2 -- contrib/vhost-user-blk/vhost-user-blk.c | 14 ++ util/meson.build| 4 +++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 9492146855..a450337685 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -25,6 +25,20 @@ #include #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif Could we add that in "qemu/osdep.h" instead? Since "qemu/osdep.h" includes fcntl.h, I think it could be fine. @Hanna, @Kevin WDYT? Otherwise, Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Thanks, Stefano
Re: [PATCH v4 08/12] libvhost-user: enable it on any POSIX system
On Wed, May 08, 2024 at 12:36:30PM GMT, Philippe Mathieu-Daudé wrote: On 8/5/24 09:44, Stefano Garzarella wrote: The vhost-user protocol is not really Linux-specific so let's enable libvhost-user for any POSIX system. Compiling it on macOS and FreeBSD some problems came up: - avoid to include linux/vhost.h which is avaibale only on Linux "available" (vhost_types.h contains many of the things we need) - macOS doesn't provide sys/endian.h, so let's define them (note: libvhost-user doesn't include qemu's headers, so we can't use "QEMU" Good catches, I'll fix them! use "qemu/bswap.h") - define eventfd_[write|read] as write/read wrapper when system doesn't provide those (e.g. macOS) - copy SEAL defines from include/qemu/memfd.h to make the code works on FreeBSD where MFD_ALLOW_SEALING is defined Alternatively add in subprojects/libvhost-user/include/osdep.h. I like the idea, but we also have other things already present before this patch (e.g. G_GNUC_PRINTF, MIN, etc.) so do you think it's better to add 2 patches (move everything to osdep.h, add things from this patch), or after this series is merged, send a patch to introduce osdep.h? I'm tempted for the last option just to prevent this series from becoming too big, but I don't have a strong opinion. Thanks, Stefano - define MAP_NORESERVE if it's not defined (e.g. on FreeBSD) Signed-off-by: Stefano Garzarella --- meson.build | 2 +- subprojects/libvhost-user/libvhost-user.h | 2 +- subprojects/libvhost-user/libvhost-user.c | 60 +-- 3 files changed, 59 insertions(+), 5 deletions(-) Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé
Re: [PATCH v4 03/12] libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
On Wed, May 08, 2024 at 12:39:33PM GMT, Philippe Mathieu-Daudé wrote: On 8/5/24 09:44, Stefano Garzarella wrote: libvhost-user will panic when receiving VHOST_USER_GET_INFLIGHT_FD message if MFD_ALLOW_SEALING is not defined, since it's not able to create a memfd. VHOST_USER_GET_INFLIGHT_FD is used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is negotiated. So, let's mask that feature if the backend is not able to properly handle these messages. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a11afd1960..1c361ffd51 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -1674,6 +1674,16 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) features |= dev->iface->get_protocol_features(dev); } Maybe move the #ifndef here? Yep, I'll do. +/* + * If MFD_ALLOW_SEALING is not defined, we are not able to handle + * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd. + * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + * is negotiated. A device implementation can enable it, so let's mask + * it to avoid a runtime panic. + */ +#ifndef MFD_ALLOW_SEALING +features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); +#endif vmsg_set_reply_u64(vmsg, features); return true; } Reviewed-by: Philippe Mathieu-Daudé Tested-by: Philippe Mathieu-Daudé Thanks, Stefano
Re: [PATCH v4 01/12] libvhost-user: set msg.msg_control to NULL when it is empty
On Wed, May 08, 2024 at 09:57:13AM GMT, Daniel P. Berrangé wrote: On Wed, May 08, 2024 at 09:44:45AM +0200, Stefano Garzarella wrote: On some OS (e.g. macOS) sendmsg() returns -1 (errno EINVAL) if the `struct msghdr` has the field `msg_controllen` set to 0, but `msg_control` is not NULL. Reviewed-by: Eric Blake Reviewed-by: David Hildenbrand Reviewed-by: Philippe Mathieu-Daud?? Philippe's name has got mangled here Thank you for bringing this to my attention and helping me solve it off-list. It should be fixed with the next posting! Stefano Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a879149fef..22bea0c775 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -632,6 +632,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); } else { msg.msg_controllen = 0; +msg.msg_control = NULL; } do { -- 2.45.0 With regards, Daniel -- |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o-https://fstop138.berrange.com :| |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
[PATCH v4 04/12] vhost-user-server: do not set memory fd non-blocking
In vhost-user-server we set all fd received from the other peer in non-blocking mode. For some of them (e.g. memfd, shm_open, etc.) it's not really needed, because we don't use these fd with blocking operations, but only to map memory. In addition, in some systems this operation can fail (e.g. in macOS setting an fd returned by shm_open() non-blocking fails with errno = ENOTTY). So, let's avoid setting fd non-blocking for those messages that we know carry memory fd (e.g. VHOST_USER_ADD_MEM_REG, VHOST_USER_SET_MEM_TABLE). Signed-off-by: Stefano Garzarella --- v3: - avoiding setting fd non-blocking for messages where we have memory fd (Eric) --- util/vhost-user-server.c | 12 1 file changed, 12 insertions(+) diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index 3bfb1ad3ec..b19229074a 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -65,6 +65,18 @@ static void vmsg_close_fds(VhostUserMsg *vmsg) static void vmsg_unblock_fds(VhostUserMsg *vmsg) { int i; + +/* + * These messages carry fd used to map memory, not to send/receive messages, + * so this operation is useless. In addition, in some systems this + * operation can fail (e.g. in macOS setting an fd returned by shm_open() + * non-blocking fails with errno = ENOTTY) + */ +if (vmsg->request == VHOST_USER_ADD_MEM_REG || +vmsg->request == VHOST_USER_SET_MEM_TABLE) { +return; +} + for (i = 0; i < vmsg->fd_num; i++) { qemu_socket_set_nonblock(vmsg->fds[i]); } -- 2.45.0
[PATCH v4 02/12] libvhost-user: fail vu_message_write() if sendmsg() is failing
In vu_message_write() we use sendmsg() to send the message header, then a write() to send the payload. If sendmsg() fails we should avoid sending the payload, since we were unable to send the header. Discovered before fixing the issue with the previous patch, where sendmsg() failed on macOS due to wrong parameters, but the frontend still sent the payload which the backend incorrectly interpreted as a wrong header. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 5 + 1 file changed, 5 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 22bea0c775..a11afd1960 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -639,6 +639,11 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, , 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); +if (rc <= 0) { +vu_panic(dev, "Error while writing: %s", strerror(errno)); +return false; +} + if (vmsg->size) { do { if (vmsg->data) { -- 2.45.0
[PATCH v4 10/12] hostmem: add a new memory backend based on POSIX shm_open()
shm_open() creates and opens a new POSIX shared memory object. A POSIX shared memory object allows creating memory backend with an associated file descriptor that can be shared with external processes (e.g. vhost-user). The new `memory-backend-shm` can be used as an alternative when `memory-backend-memfd` is not available (Linux only), since shm_open() should be provided by any POSIX-compliant operating system. This backend mimics memfd, allocating memory that is practically anonymous. In theory shm_open() requires a name, but this is allocated for a short time interval and shm_unlink() is called right after shm_open(). After that, only fd is shared with external processes (e.g., vhost-user) as if it were associated with anonymous memory. In the future we may also allow the user to specify the name to be passed to shm_open(), but for now we keep the backend simple, mimicking anonymous memory such as memfd. Acked-by: David Hildenbrand Signed-off-by: Stefano Garzarella --- v4 - fail if we find "share=off" in shm_backend_memory_alloc() [David] v3 - enriched commit message and documentation to highlight that we want to mimic memfd (David) --- docs/system/devices/vhost-user.rst | 5 +- qapi/qom.json | 17 backends/hostmem-shm.c | 123 + backends/meson.build | 1 + qemu-options.hx| 13 +++ 5 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 backends/hostmem-shm.c diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst index 9b2da106ce..35259d8ec7 100644 --- a/docs/system/devices/vhost-user.rst +++ b/docs/system/devices/vhost-user.rst @@ -98,8 +98,9 @@ Shared memory object In order for the daemon to access the VirtIO queues to process the requests it needs access to the guest's address space. This is -achieved via the ``memory-backend-file`` or ``memory-backend-memfd`` -objects. A reference to a file-descriptor which can access this object +achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or +``memory-backend-shm`` objects. +A reference to a file-descriptor which can access this object will be passed via the socket as part of the protocol negotiation. Currently the shared memory object needs to match the size of the main diff --git a/qapi/qom.json b/qapi/qom.json index 38dde6d785..52df052df8 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -721,6 +721,19 @@ '*hugetlbsize': 'size', '*seal': 'bool' } } +## +# @MemoryBackendShmProperties: +# +# Properties for memory-backend-shm objects. +# +# The @share boolean option is true by default with shm. +# +# Since: 9.1 +## +{ 'struct': 'MemoryBackendShmProperties', + 'base': 'MemoryBackendProperties', + 'data': { } } + ## # @MemoryBackendEpcProperties: # @@ -985,6 +998,8 @@ { 'name': 'memory-backend-memfd', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram', +{ 'name': 'memory-backend-shm', + 'if': 'CONFIG_POSIX' }, 'pef-guest', { 'name': 'pr-manager-helper', 'if': 'CONFIG_LINUX' }, @@ -1056,6 +1071,8 @@ 'memory-backend-memfd': { 'type': 'MemoryBackendMemfdProperties', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram': 'MemoryBackendProperties', + 'memory-backend-shm': { 'type': 'MemoryBackendShmProperties', + 'if': 'CONFIG_POSIX' }, 'pr-manager-helper': { 'type': 'PrManagerHelperProperties', 'if': 'CONFIG_LINUX' }, 'qtest': 'QtestProperties', diff --git a/backends/hostmem-shm.c b/backends/hostmem-shm.c new file mode 100644 index 00..374edc3db8 --- /dev/null +++ b/backends/hostmem-shm.c @@ -0,0 +1,123 @@ +/* + * QEMU host POSIX shared memory object backend + * + * Copyright (C) 2024 Red Hat Inc + * + * Authors: + * Stefano Garzarella + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "sysemu/hostmem.h" +#include "qapi/error.h" + +#define TYPE_MEMORY_BACKEND_SHM "memory-backend-shm" + +OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendShm, MEMORY_BACKEND_SHM) + +struct HostMemoryBackendShm { +HostMemoryBackend parent_obj; +}; + +static bool +shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ +g_autoptr(GString) shm_name = g_string_new(NULL); +g_autofree char *backend_name = NULL; +uint32_t ram_flags; +int fd, oflag; +mode_t mode; + +if (!backend->size) { +error_setg(errp, "can't create shm backend with size 0"); +return false; +} + +if (!backend->share) { +error_setg(errp, "can't create shm backend with `share=off`"); +return fals
[PATCH v4 11/12] tests/qtest/vhost-user-blk-test: use memory-backend-shm
`memory-backend-memfd` is available only on Linux while the new `memory-backend-shm` can be used on any POSIX-compliant operating system. Let's use it so we can run the test in multiple environments. Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-blk-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qtest/vhost-user-blk-test.c b/tests/qtest/vhost-user-blk-test.c index 117b9acd10..e945f6abf2 100644 --- a/tests/qtest/vhost-user-blk-test.c +++ b/tests/qtest/vhost-user-blk-test.c @@ -906,7 +906,7 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, vhost_user_blk_bin); g_string_append_printf(cmd_line, -" -object memory-backend-memfd,id=mem,size=256M,share=on " +" -object memory-backend-shm,id=mem,size=256M,share=on " " -M memory-backend=mem -m 256M "); for (i = 0; i < vus_instances; i++) { -- 2.45.0
[PATCH v4 05/12] contrib/vhost-user-blk: fix bind() using the right size of the address
On macOS passing `-s /tmp/vhost.socket` parameter to the vhost-user-blk application, the bind was done on `/tmp/vhost.socke` pathname, missing the last character. This sounds like one of the portability problems described in the unix(7) manpage: Pathname sockets When binding a socket to a pathname, a few rules should be observed for maximum portability and ease of coding: ⢠The pathname in sun_path should be null-terminated. ⢠The length of the pathname, including the terminating null byte, should not exceed the size of sun_path. ⢠The addrlen argument that describes the enclosing sockaddr_un structure should have a value of at least: offsetof(struct sockaddr_un, sun_path) + strlen(addr.sun_path)+1 or, more simply, addrlen can be specified as sizeof(struct sockaddr_un). So let's follow the last advice and simplify the code as well. Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- contrib/vhost-user-blk/vhost-user-blk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 89e5f11a64..a8ab9269a2 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -469,7 +469,6 @@ static int unix_sock_new(char *unix_fn) { int sock; struct sockaddr_un un; -size_t len; assert(unix_fn); @@ -481,10 +480,9 @@ static int unix_sock_new(char *unix_fn) un.sun_family = AF_UNIX; (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn); -len = sizeof(un.sun_family) + strlen(un.sun_path); (void)unlink(unix_fn); -if (bind(sock, (struct sockaddr *), len) < 0) { +if (bind(sock, (struct sockaddr *), sizeof(un)) < 0) { perror("bind"); goto fail; } -- 2.45.0
[PATCH v4 12/12] tests/qtest/vhost-user-test: add a test case for memory-backend-shm
`memory-backend-shm` can be used with vhost-user devices, so let's add a new test case for it. Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-test.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index d4e437265f..8c1d903b2a 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -44,6 +44,8 @@ "mem-path=%s,share=on -numa node,memdev=mem" #define QEMU_CMD_MEMFD " -m %d -object memory-backend-memfd,id=mem,size=%dM," \ " -numa node,memdev=mem" +#define QEMU_CMD_SHM" -m %d -object memory-backend-shm,id=mem,size=%dM," \ +" -numa node,memdev=mem" #define QEMU_CMD_CHR" -chardev socket,id=%s,path=%s%s" #define QEMU_CMD_NETDEV " -netdev vhost-user,id=hs0,chardev=%s,vhostforce=on" @@ -195,6 +197,7 @@ enum test_memfd { TEST_MEMFD_AUTO, TEST_MEMFD_YES, TEST_MEMFD_NO, +TEST_MEMFD_SHM, }; static void append_vhost_net_opts(TestServer *s, GString *cmd_line, @@ -228,6 +231,8 @@ static void append_mem_opts(TestServer *server, GString *cmd_line, if (memfd == TEST_MEMFD_YES) { g_string_append_printf(cmd_line, QEMU_CMD_MEMFD, size, size); +} else if (memfd == TEST_MEMFD_SHM) { +g_string_append_printf(cmd_line, QEMU_CMD_SHM, size, size); } else { const char *root = init_hugepagefs() ? : server->tmpfs; @@ -788,6 +793,19 @@ static void *vhost_user_test_setup_memfd(GString *cmd_line, void *arg) return server; } +static void *vhost_user_test_setup_shm(GString *cmd_line, void *arg) +{ +TestServer *server = test_server_new("vhost-user-test", arg); +test_server_listen(server); + +append_mem_opts(server, cmd_line, 256, TEST_MEMFD_SHM); +server->vu_ops->append_opts(server, cmd_line, ""); + +g_test_queue_destroy(vhost_user_test_cleanup, server); + +return server; +} + static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *server = arg; @@ -1081,6 +1099,11 @@ static void register_vhost_user_test(void) "virtio-net", test_read_guest_mem, ); +opts.before = vhost_user_test_setup_shm; +qos_add_test("vhost-user/read-guest-mem/shm", + "virtio-net", + test_read_guest_mem, ); + if (qemu_memfd_check(MFD_ALLOW_SEALING)) { opts.before = vhost_user_test_setup_memfd; qos_add_test("vhost-user/read-guest-mem/memfd", -- 2.45.0
[PATCH v4 08/12] libvhost-user: enable it on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable libvhost-user for any POSIX system. Compiling it on macOS and FreeBSD some problems came up: - avoid to include linux/vhost.h which is avaibale only on Linux (vhost_types.h contains many of the things we need) - macOS doesn't provide sys/endian.h, so let's define them (note: libvhost-user doesn't include qemu's headers, so we can't use use "qemu/bswap.h") - define eventfd_[write|read] as write/read wrapper when system doesn't provide those (e.g. macOS) - copy SEAL defines from include/qemu/memfd.h to make the code works on FreeBSD where MFD_ALLOW_SEALING is defined - define MAP_NORESERVE if it's not defined (e.g. on FreeBSD) Signed-off-by: Stefano Garzarella --- meson.build | 2 +- subprojects/libvhost-user/libvhost-user.h | 2 +- subprojects/libvhost-user/libvhost-user.c | 60 +-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index 955921dcb8..7954da5971 100644 --- a/meson.build +++ b/meson.build @@ -3168,7 +3168,7 @@ endif config_host_data.set('CONFIG_FDT', fdt.found()) vhost_user = not_found -if host_os == 'linux' and have_vhost_user +if have_vhost_user libvhost_user = subproject('libvhost-user') vhost_user = libvhost_user.get_variable('vhost_user_dep') endif diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index deb40e77b3..e13e1d3931 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -18,9 +18,9 @@ #include #include #include -#include #include #include "standard-headers/linux/virtio_ring.h" +#include "standard-headers/linux/vhost_types.h" /* Based on qemu/hw/virtio/vhost-user.c */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 1c361ffd51..03edb4bf64 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include /* Necessary to provide VIRTIO_F_VERSION_1 on system * with older linux headers. Must appear before @@ -39,8 +37,8 @@ #include "standard-headers/linux/virtio_config.h" #if defined(__linux__) +#include #include -#include #include #include #include @@ -52,6 +50,62 @@ #endif +#if defined(__APPLE__) && (__MACH__) +#include +#define htobe16(x) OSSwapHostToBigInt16(x) +#define htole16(x) OSSwapHostToLittleInt16(x) +#define be16toh(x) OSSwapBigToHostInt16(x) +#define le16toh(x) OSSwapLittleToHostInt16(x) + +#define htobe32(x) OSSwapHostToBigInt32(x) +#define htole32(x) OSSwapHostToLittleInt32(x) +#define be32toh(x) OSSwapBigToHostInt32(x) +#define le32toh(x) OSSwapLittleToHostInt32(x) + +#define htobe64(x) OSSwapHostToBigInt64(x) +#define htole64(x) OSSwapHostToLittleInt64(x) +#define be64toh(x) OSSwapBigToHostInt64(x) +#define le64toh(x) OSSwapLittleToHostInt64(x) +#endif + +#ifdef CONFIG_EVENTFD +#include +#else +#define eventfd_t uint64_t + +int eventfd_write(int fd, eventfd_t value) +{ +return (write(fd, , sizeof(value)) == sizeof(value)) ? 0 : -1; +} + +int eventfd_read(int fd, eventfd_t *value) +{ +return (read(fd, value, sizeof(*value)) == sizeof(*value)) ? 0 : -1; +} +#endif + +#ifdef MFD_ALLOW_SEALING +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE0x0008 /* prevent writes */ +#endif +#endif + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + #include "include/atomic.h" #include "libvhost-user.h" -- 2.45.0
[PATCH v4 03/12] libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
libvhost-user will panic when receiving VHOST_USER_GET_INFLIGHT_FD message if MFD_ALLOW_SEALING is not defined, since it's not able to create a memfd. VHOST_USER_GET_INFLIGHT_FD is used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is negotiated. So, let's mask that feature if the backend is not able to properly handle these messages. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a11afd1960..1c361ffd51 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -1674,6 +1674,16 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) features |= dev->iface->get_protocol_features(dev); } +/* + * If MFD_ALLOW_SEALING is not defined, we are not able to handle + * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd. + * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + * is negotiated. A device implementation can enable it, so let's mask + * it to avoid a runtime panic. + */ +#ifndef MFD_ALLOW_SEALING +features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); +#endif vmsg_set_reply_u64(vmsg, features); return true; } -- 2.45.0
[PATCH v4 07/12] vhost-user: enable frontends on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable vhost-user frontends for any POSIX system. In vhost_net.c we use VHOST_FILE_UNBIND which is defined in a Linux specific header, let's define it for other systems as well. Signed-off-by: Stefano Garzarella --- meson.build| 1 - hw/net/vhost_net.c | 5 + hw/block/Kconfig | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index 43da492372..955921dcb8 100644 --- a/meson.build +++ b/meson.build @@ -151,7 +151,6 @@ have_tpm = get_option('tpm') \ # vhost have_vhost_user = get_option('vhost_user') \ - .disable_auto_if(host_os != 'linux') \ .require(host_os != 'windows', error_message: 'vhost-user is not available on Windows').allowed() have_vhost_vdpa = get_option('vhost_vdpa') \ diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index fd1a93701a..fced429813 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -34,8 +34,13 @@ #include "standard-headers/linux/virtio_ring.h" #include "hw/virtio/vhost.h" #include "hw/virtio/virtio-bus.h" +#if defined(__linux__) #include "linux-headers/linux/vhost.h" +#endif +#ifndef VHOST_FILE_UNBIND +#define VHOST_FILE_UNBIND -1 +#endif /* Features supported by host kernel. */ static const int kernel_feature_bits[] = { diff --git a/hw/block/Kconfig b/hw/block/Kconfig index 9e8f28f982..29ee09e434 100644 --- a/hw/block/Kconfig +++ b/hw/block/Kconfig @@ -40,7 +40,7 @@ config VHOST_USER_BLK bool # Only PCI devices are provided for now default y if VIRTIO_PCI -depends on VIRTIO && VHOST_USER && LINUX +depends on VIRTIO && VHOST_USER config SWIM bool -- 2.45.0
[PATCH v4 09/12] contrib/vhost-user-blk: enable it on any POSIX system
Let's make the code more portable by adding defines from block/file-posix.c to support O_DIRECT in other systems (e.g. macOS). vhost-user-server.c is a dependency, let's enable it for any POSIX system. Signed-off-by: Stefano Garzarella --- v4: - moved using of "qemu/bswap.h" API in a separate patch [Phil] --- meson.build | 2 -- contrib/vhost-user-blk/vhost-user-blk.c | 14 ++ util/meson.build| 4 +++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 7954da5971..25047db3c1 100644 --- a/meson.build +++ b/meson.build @@ -1960,8 +1960,6 @@ has_statx = cc.has_header_symbol('sys/stat.h', 'STATX_BASIC_STATS', prefix: gnu_ has_statx_mnt_id = cc.has_header_symbol('sys/stat.h', 'STATX_MNT_ID', prefix: gnu_source_prefix) have_vhost_user_blk_server = get_option('vhost_user_blk_server') \ - .require(host_os == 'linux', - error_message: 'vhost_user_blk_server requires linux') \ .require(have_vhost_user, error_message: 'vhost_user_blk_server requires vhost-user support') \ .disable_auto_if(not have_tools and not have_system) \ diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 9492146855..a450337685 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -25,6 +25,20 @@ #include #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + enum { VHOST_USER_BLK_MAX_QUEUES = 8, }; diff --git a/util/meson.build b/util/meson.build index 2ad57b10ba..93054f2340 100644 --- a/util/meson.build +++ b/util/meson.build @@ -112,10 +112,12 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif if host_os == 'linux' -util_ss.add(files('vhost-user-server.c'), vhost_user) util_ss.add(files('vfio-helpers.c')) util_ss.add(files('chardev_open.c')) endif + if host_os != 'windows' +util_ss.add(files('vhost-user-server.c'), vhost_user) + endif util_ss.add(files('yank.c')) endif -- 2.45.0
[PATCH v4 06/12] contrib/vhost-user-*: use QEMU bswap helper functions
Let's replace the calls to le*toh() and htole*() with qemu/bswap.h helpers to make the code more portable. Suggested-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- contrib/vhost-user-blk/vhost-user-blk.c | 9 + contrib/vhost-user-input/main.c | 16 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index a8ab9269a2..9492146855 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/bswap.h" #include "standard-headers/linux/virtio_blk.h" #include "libvhost-user-glib.h" @@ -194,8 +195,8 @@ vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt, #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT) VubDev *vdev_blk = req->vdev_blk; desc = buf; -uint64_t range[2] = { le64toh(desc->sector) << 9, - le32toh(desc->num_sectors) << 9 }; +uint64_t range[2] = { le64_to_cpu(desc->sector) << 9, + le32_to_cpu(desc->num_sectors) << 9 }; if (type == VIRTIO_BLK_T_DISCARD) { if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) { g_free(buf); @@ -267,13 +268,13 @@ static int vub_virtio_process_req(VubDev *vdev_blk, req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base; in_num--; -type = le32toh(req->out->type); +type = le32_to_cpu(req->out->type); switch (type & ~VIRTIO_BLK_T_BARRIER) { case VIRTIO_BLK_T_IN: case VIRTIO_BLK_T_OUT: { ssize_t ret = 0; bool is_write = type & VIRTIO_BLK_T_OUT; -req->sector_num = le64toh(req->out->sector); +req->sector_num = le64_to_cpu(req->out->sector); if (is_write) { ret = vub_writev(req, >out_sg[1], out_num); } else { diff --git a/contrib/vhost-user-input/main.c b/contrib/vhost-user-input/main.c index 081230da54..f3362d41ac 100644 --- a/contrib/vhost-user-input/main.c +++ b/contrib/vhost-user-input/main.c @@ -51,8 +51,8 @@ static void vi_input_send(VuInput *vi, struct virtio_input_event *event) vi->queue[vi->qindex++].event = *event; /* ... until we see a report sync ... */ -if (event->type != htole16(EV_SYN) || -event->code != htole16(SYN_REPORT)) { +if (event->type != cpu_to_le16(EV_SYN) || +event->code != cpu_to_le16(SYN_REPORT)) { return; } @@ -103,9 +103,9 @@ vi_evdev_watch(VuDev *dev, int condition, void *data) g_debug("input %d %d %d", evdev.type, evdev.code, evdev.value); -virtio.type = htole16(evdev.type); -virtio.code = htole16(evdev.code); -virtio.value = htole32(evdev.value); +virtio.type = cpu_to_le16(evdev.type); +virtio.code = cpu_to_le16(evdev.code); +virtio.value = cpu_to_le32(evdev.value); vi_input_send(vi, ); } } @@ -124,9 +124,9 @@ static void vi_handle_status(VuInput *vi, virtio_input_event *event) evdev.input_event_sec = tval.tv_sec; evdev.input_event_usec = tval.tv_usec; -evdev.type = le16toh(event->type); -evdev.code = le16toh(event->code); -evdev.value = le32toh(event->value); +evdev.type = le16_to_cpu(event->type); +evdev.code = le16_to_cpu(event->code); +evdev.value = le32_to_cpu(event->value); rc = write(vi->evdevfd, , sizeof(evdev)); if (rc == -1) { -- 2.45.0
[PATCH v4 01/12] libvhost-user: set msg.msg_control to NULL when it is empty
On some OS (e.g. macOS) sendmsg() returns -1 (errno EINVAL) if the `struct msghdr` has the field `msg_controllen` set to 0, but `msg_control` is not NULL. Reviewed-by: Eric Blake Reviewed-by: David Hildenbrand Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a879149fef..22bea0c775 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -632,6 +632,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); } else { msg.msg_controllen = 0; +msg.msg_control = NULL; } do { -- 2.45.0
[PATCH v4 00/12] vhost-user: support any POSIX system (tested on macOS, FreeBSD, OpenBSD)
v1: https://patchew.org/QEMU/20240228114759.44758-1-sgarz...@redhat.com/ v2: https://patchew.org/QEMU/20240326133936.125332-1-sgarz...@redhat.com/ v3: https://patchew.org/QEMU/20240404122330.92710-1-sgarz...@redhat.com/ v4: - rebased on master (commit e116b92d01c2cd75957a9f8ad1d4932292867b81) - added patch 6 to move using QEMU bswap helper functions in a separate patch (Phil) - fail if we find "share=off" in shm_backend_memory_alloc() (David) - added Phil's R-b and David's A-b The vhost-user protocol is not really Linux-specific, so let's try support QEMU's frontends and backends (including libvhost-user) in any POSIX system with this series. The main use case is to be able to use virtio devices that we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even in non-Linux systems. The first 5 patches are more like fixes discovered at runtime on macOS or FreeBSD that could go even independently of this series. Patches 6, 7, 8, and 9 enable building of frontends and backends (including libvhost-user) with associated code changes to succeed in compilation. Patch 10 adds `memory-backend-shm` that uses the POSIX shm_open() API to create shared memory which is identified by an fd that can be shared with vhost-user backends. This is useful on those systems (like macOS) where we don't have memfd_create() or special filesystems like "/dev/shm". Patches 11 and 12 use `memory-backend-shm` in some vhost-user tests. Maybe the first 5 patches can go separately, but I only discovered those problems after testing patches 6 - 9, so I have included them in this series for now. Please let me know if you prefer that I send them separately. I tested this series using vhost-user-blk and QSD on macOS Sonoma 14.4 (aarch64), FreeBSD 14 (x86_64), OpenBSD 7.4 (x86_64), and Fedora 39 (x86_64) in this way: - Start vhost-user-blk or QSD (same commands for all systems) vhost-user-blk -s /tmp/vhost.socket \ -b Fedora-Cloud-Base-39-1.5.x86_64.raw qemu-storage-daemon \ --blockdev file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \ --blockdev qcow2,file=file,node-name=qcow2 \ --export vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on - macOS (aarch64): start QEMU (using hvf accelerator) qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \ -drive file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \ -device virtio-net-device,netdev=net0 -netdev user,id=net0 \ -device ramfb -device usb-ehci -device usb-kbd \ -object memory-backend-shm,id=mem,size=512M \ -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - FreeBSD/OpenBSD (x86_64): start QEMU (no accelerators available) qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \ -object memory-backend-shm,id=mem,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - Fedora (x86_64): start QEMU (using kvm accelerator) qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \ -object memory-backend-shm,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket Branch pushed (and CI started) at https://gitlab.com/sgarzarella/qemu/-/tree/macos-vhost-user?ref_type=heads Thanks, Stefano Stefano Garzarella (12): libvhost-user: set msg.msg_control to NULL when it is empty libvhost-user: fail vu_message_write() if sendmsg() is failing libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported vhost-user-server: do not set memory fd non-blocking contrib/vhost-user-blk: fix bind() using the right size of the address contrib/vhost-user-*: use QEMU bswap helper functions vhost-user: enable frontends on any POSIX system libvhost-user: enable it on any POSIX system contrib/vhost-user-blk: enable it on any POSIX system hostmem: add a new memory backend based on POSIX shm_open() tests/qtest/vhost-user-blk-test: use memory-backend-shm tests/qtest/vhost-user-test: add a test case for memory-backend-shm docs/system/devices/vhost-user.rst| 5 +- meson.build | 5 +- qapi/qom.json | 17 +++ subprojects/libvhost-user/libvhost-user.h | 2 +- backends/hostmem-shm.c| 123 ++ contrib/vhost-user-blk/vhost-user-blk.c | 27 +++-- contrib/vhost-user-input/main.c | 16 +-- hw/net/vhost_net.c| 5 + subprojects/libvhost-user/libvhost-user.c | 76 - tests/qtest/vhost-user-blk-test.c | 2 +- tests/qtest/vhost-user-test.c | 23 util/vhost-user-server.c | 12 +++ backends/meson.build | 1 + hw/bloc
Re: [PATCH for-9.1 v3 09/11] hostmem: add a new memory backend based on POSIX shm_open()
On Mon, Apr 08, 2024 at 10:03:15AM +0200, David Hildenbrand wrote: On 08.04.24 09:58, Stefano Garzarella wrote: On Thu, Apr 04, 2024 at 04:09:34PM +0200, David Hildenbrand wrote: On 04.04.24 14:23, Stefano Garzarella wrote: shm_open() creates and opens a new POSIX shared memory object. A POSIX shared memory object allows creating memory backend with an associated file descriptor that can be shared with external processes (e.g. vhost-user). The new `memory-backend-shm` can be used as an alternative when `memory-backend-memfd` is not available (Linux only), since shm_open() should be provided by any POSIX-compliant operating system. This backend mimics memfd, allocating memory that is practically anonymous. In theory shm_open() requires a name, but this is allocated for a short time interval and shm_unlink() is called right after shm_open(). After that, only fd is shared with external processes (e.g., vhost-user) as if it were associated with anonymous memory. In the future we may also allow the user to specify the name to be passed to shm_open(), but for now we keep the backend simple, mimicking anonymous memory such as memfd. Signed-off-by: Stefano Garzarella --- v3 - enriched commit message and documentation to highlight that we want to mimic memfd (David) --- docs/system/devices/vhost-user.rst | 5 +- qapi/qom.json | 17 + backends/hostmem-shm.c | 118 + backends/meson.build | 1 + qemu-options.hx| 11 +++ 5 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 backends/hostmem-shm.c diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst index 9b2da106ce..35259d8ec7 100644 --- a/docs/system/devices/vhost-user.rst +++ b/docs/system/devices/vhost-user.rst @@ -98,8 +98,9 @@ Shared memory object In order for the daemon to access the VirtIO queues to process the requests it needs access to the guest's address space. This is -achieved via the ``memory-backend-file`` or ``memory-backend-memfd`` -objects. A reference to a file-descriptor which can access this object +achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or +``memory-backend-shm`` objects. +A reference to a file-descriptor which can access this object will be passed via the socket as part of the protocol negotiation. Currently the shared memory object needs to match the size of the main diff --git a/qapi/qom.json b/qapi/qom.json index 85e6b4f84a..5252ec69e3 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -721,6 +721,19 @@ '*hugetlbsize': 'size', '*seal': 'bool' } } +## +# @MemoryBackendShmProperties: +# +# Properties for memory-backend-shm objects. +# +# The @share boolean option is true by default with shm. +# +# Since: 9.1 +## +{ 'struct': 'MemoryBackendShmProperties', + 'base': 'MemoryBackendProperties', + 'data': { } } + Acked-by: David Hildenbrand One comment: we should maybe just forbid setting share=off. it doesn't make any sense and it can even result in an unexpected double memory consumption. We missed doing that for memfd, unfortunately. Good point! IIUC the `share` property is defined by the parent `hostmem`, so I should find a way to override the property here and disable the setter, or add an option to `hostmem` to make the property non-writable. Right, or simply fail later when you would find "share=off" in shm_backend_memory_alloc(). This seems like the simplest and cleanest approach, I'll go in this direction! When ever supporting named shmem_open(), it could make sense for VM snapshotting. Right now it doesn't really make any sense. Yeah, I see. Thanks, Stefano
Re: [PATCH for-9.1 v3 00/11] vhost-user: support any POSIX system (tested on macOS, FreeBSD, OpenBSD)
FYI I'll be on PTO till May 2nd, I'll send the v4 when I'm back ASAP. Thanks, Stefano On Thu, Apr 04, 2024 at 02:23:19PM +0200, Stefano Garzarella wrote: v1: https://patchew.org/QEMU/20240228114759.44758-1-sgarz...@redhat.com/ v2: https://patchew.org/QEMU/20240326133936.125332-1-sgarz...@redhat.com/ v3: - rebased on v9.0.0-rc2 - patch 4: avoiding setting fd non-blocking for messages where we have memory fd (Eric) - patch 9: enriched commit message and documentation to highlight that we want to mimic memfd (David) The vhost-user protocol is not really Linux-specific, so let's try support QEMU's frontends and backends (including libvhost-user) in any POSIX system with this series. The main use case is to be able to use virtio devices that we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even in non-Linux systems. The first 5 patches are more like fixes discovered at runtime on macOS or FreeBSD that could go even independently of this series. Patches 6, 7, and 8 enable building of frontends and backends (including libvhost-user) with associated code changes to succeed in compilation. Patch 9 adds `memory-backend-shm` that uses the POSIX shm_open() API to create shared memory which is identified by an fd that can be shared with vhost-user backends. This is useful on those systems (like macOS) where we don't have memfd_create() or special filesystems like "/dev/shm". Patches 10 and 11 use `memory-backend-shm` in some vhost-user tests. Maybe the first 5 patches can go separately, but I only discovered those problems after testing patches 6 - 9, so I have included them in this series for now. Please let me know if you prefer that I send them separately. I tested this series using vhost-user-blk and QSD on macOS Sonoma 14.4 (aarch64), FreeBSD 14 (x86_64), OpenBSD 7.4 (x86_64), and Fedora 39 (x86_64) in this way: - Start vhost-user-blk or QSD (same commands for all systems) vhost-user-blk -s /tmp/vhost.socket \ -b Fedora-Cloud-Base-39-1.5.x86_64.raw qemu-storage-daemon \ --blockdev file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \ --blockdev qcow2,file=file,node-name=qcow2 \ --export vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on - macOS (aarch64): start QEMU (using hvf accelerator) qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \ -drive file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \ -device virtio-net-device,netdev=net0 -netdev user,id=net0 \ -device ramfb -device usb-ehci -device usb-kbd \ -object memory-backend-shm,id=mem,size=512M \ -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - FreeBSD/OpenBSD (x86_64): start QEMU (no accelerators available) qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \ -object memory-backend-shm,id=mem,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - Fedora (x86_64): start QEMU (using kvm accelerator) qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \ -object memory-backend-shm,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket Branch pushed (and CI started) at https://gitlab.com/sgarzarella/qemu/-/tree/macos-vhost-user?ref_type=heads Thanks, Stefano Stefano Garzarella (11): libvhost-user: set msg.msg_control to NULL when it is empty libvhost-user: fail vu_message_write() if sendmsg() is failing libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported vhost-user-server: do not set memory fd non-blocking contrib/vhost-user-blk: fix bind() using the right size of the address vhost-user: enable frontends on any POSIX system libvhost-user: enable it on any POSIX system contrib/vhost-user-blk: enable it on any POSIX system hostmem: add a new memory backend based on POSIX shm_open() tests/qtest/vhost-user-blk-test: use memory-backend-shm tests/qtest/vhost-user-test: add a test case for memory-backend-shm docs/system/devices/vhost-user.rst| 5 +- meson.build | 5 +- qapi/qom.json | 17 subprojects/libvhost-user/libvhost-user.h | 2 +- backends/hostmem-shm.c| 118 ++ contrib/vhost-user-blk/vhost-user-blk.c | 23 - hw/net/vhost_net.c| 5 + subprojects/libvhost-user/libvhost-user.c | 76 +- tests/qtest/vhost-user-blk-test.c | 2 +- tests/qtest/vhost-user-test.c | 23 + util/vhost-user-server.c | 12 +++ backends/meson.build | 1 + hw/block/Kconfig | 2 +- qemu-options.hx | 11 ++ util/meson.build
Re: [PATCH for-9.1 v3 09/11] hostmem: add a new memory backend based on POSIX shm_open()
On Thu, Apr 04, 2024 at 04:09:34PM +0200, David Hildenbrand wrote: On 04.04.24 14:23, Stefano Garzarella wrote: shm_open() creates and opens a new POSIX shared memory object. A POSIX shared memory object allows creating memory backend with an associated file descriptor that can be shared with external processes (e.g. vhost-user). The new `memory-backend-shm` can be used as an alternative when `memory-backend-memfd` is not available (Linux only), since shm_open() should be provided by any POSIX-compliant operating system. This backend mimics memfd, allocating memory that is practically anonymous. In theory shm_open() requires a name, but this is allocated for a short time interval and shm_unlink() is called right after shm_open(). After that, only fd is shared with external processes (e.g., vhost-user) as if it were associated with anonymous memory. In the future we may also allow the user to specify the name to be passed to shm_open(), but for now we keep the backend simple, mimicking anonymous memory such as memfd. Signed-off-by: Stefano Garzarella --- v3 - enriched commit message and documentation to highlight that we want to mimic memfd (David) --- docs/system/devices/vhost-user.rst | 5 +- qapi/qom.json | 17 + backends/hostmem-shm.c | 118 + backends/meson.build | 1 + qemu-options.hx| 11 +++ 5 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 backends/hostmem-shm.c diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst index 9b2da106ce..35259d8ec7 100644 --- a/docs/system/devices/vhost-user.rst +++ b/docs/system/devices/vhost-user.rst @@ -98,8 +98,9 @@ Shared memory object In order for the daemon to access the VirtIO queues to process the requests it needs access to the guest's address space. This is -achieved via the ``memory-backend-file`` or ``memory-backend-memfd`` -objects. A reference to a file-descriptor which can access this object +achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or +``memory-backend-shm`` objects. +A reference to a file-descriptor which can access this object will be passed via the socket as part of the protocol negotiation. Currently the shared memory object needs to match the size of the main diff --git a/qapi/qom.json b/qapi/qom.json index 85e6b4f84a..5252ec69e3 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -721,6 +721,19 @@ '*hugetlbsize': 'size', '*seal': 'bool' } } +## +# @MemoryBackendShmProperties: +# +# Properties for memory-backend-shm objects. +# +# The @share boolean option is true by default with shm. +# +# Since: 9.1 +## +{ 'struct': 'MemoryBackendShmProperties', + 'base': 'MemoryBackendProperties', + 'data': { } } + Acked-by: David Hildenbrand One comment: we should maybe just forbid setting share=off. it doesn't make any sense and it can even result in an unexpected double memory consumption. We missed doing that for memfd, unfortunately. Good point! IIUC the `share` property is defined by the parent `hostmem`, so I should find a way to override the property here and disable the setter, or add an option to `hostmem` to make the property non-writable. Thanks, Stefano
Re: [PATCH for-9.1 v3 08/11] contrib/vhost-user-blk: enable it on any POSIX system
On Thu, Apr 04, 2024 at 04:00:38PM +0200, Philippe Mathieu-Daudé wrote: Hi Stefano, Hi Phil! On 4/4/24 14:23, Stefano Garzarella wrote: Let's make the code more portable by using the "qemu/bswap.h" API and adding defines from block/file-posix.c to support O_DIRECT in other systems (e.g. macOS). vhost-user-server.c is a dependency, let's enable it for any POSIX system. Signed-off-by: Stefano Garzarella --- meson.build | 2 -- contrib/vhost-user-blk/vhost-user-blk.c | 19 +-- util/meson.build| 4 +++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index a8ab9269a2..462e584857 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/bswap.h" #include "standard-headers/linux/virtio_blk.h" #include "libvhost-user-glib.h" @@ -267,13 +282,13 @@ static int vub_virtio_process_req(VubDev *vdev_blk, req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base; in_num--; -type = le32toh(req->out->type); +type = le32_to_cpu(req->out->type); switch (type & ~VIRTIO_BLK_T_BARRIER) { case VIRTIO_BLK_T_IN: case VIRTIO_BLK_T_OUT: { ssize_t ret = 0; bool is_write = type & VIRTIO_BLK_T_OUT; -req->sector_num = le64toh(req->out->sector); +req->sector_num = le64_to_cpu(req->out->sector); if (is_write) { ret = vub_writev(req, >out_sg[1], out_num); } else { Can we switch to the bswap API in a preliminary patch, Sure, I tried to minimize the patches because it's already big, but I can split this. converting all the source files? What do you mean with "all the source files"? "le64toh" is used here and in some subprojects (e.g. libvduse, libvhost-user), where IIUC we can't use QEMU's bswap.h because we don't want to put a dependency with the QEMU code. BTW I'll check for other *toh() usage in QEMU code and change in the preliminary patch you suggested to add. Thanks for the review, Stefano
[PATCH for-9.1 v3 08/11] contrib/vhost-user-blk: enable it on any POSIX system
Let's make the code more portable by using the "qemu/bswap.h" API and adding defines from block/file-posix.c to support O_DIRECT in other systems (e.g. macOS). vhost-user-server.c is a dependency, let's enable it for any POSIX system. Signed-off-by: Stefano Garzarella --- meson.build | 2 -- contrib/vhost-user-blk/vhost-user-blk.c | 19 +-- util/meson.build| 4 +++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index 3197a2f62e..b541e5c875 100644 --- a/meson.build +++ b/meson.build @@ -1956,8 +1956,6 @@ has_statx = cc.has_header_symbol('sys/stat.h', 'STATX_BASIC_STATS', prefix: gnu_ has_statx_mnt_id = cc.has_header_symbol('sys/stat.h', 'STATX_MNT_ID', prefix: gnu_source_prefix) have_vhost_user_blk_server = get_option('vhost_user_blk_server') \ - .require(host_os == 'linux', - error_message: 'vhost_user_blk_server requires linux') \ .require(have_vhost_user, error_message: 'vhost_user_blk_server requires vhost-user support') \ .disable_auto_if(not have_tools and not have_system) \ diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index a8ab9269a2..462e584857 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/bswap.h" #include "standard-headers/linux/virtio_blk.h" #include "libvhost-user-glib.h" @@ -24,6 +25,20 @@ #include #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + enum { VHOST_USER_BLK_MAX_QUEUES = 8, }; @@ -267,13 +282,13 @@ static int vub_virtio_process_req(VubDev *vdev_blk, req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base; in_num--; -type = le32toh(req->out->type); +type = le32_to_cpu(req->out->type); switch (type & ~VIRTIO_BLK_T_BARRIER) { case VIRTIO_BLK_T_IN: case VIRTIO_BLK_T_OUT: { ssize_t ret = 0; bool is_write = type & VIRTIO_BLK_T_OUT; -req->sector_num = le64toh(req->out->sector); +req->sector_num = le64_to_cpu(req->out->sector); if (is_write) { ret = vub_writev(req, >out_sg[1], out_num); } else { diff --git a/util/meson.build b/util/meson.build index 0ef9886be0..f52682ce96 100644 --- a/util/meson.build +++ b/util/meson.build @@ -113,10 +113,12 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif if host_os == 'linux' -util_ss.add(files('vhost-user-server.c'), vhost_user) util_ss.add(files('vfio-helpers.c')) util_ss.add(files('chardev_open.c')) endif + if host_os != 'windows' +util_ss.add(files('vhost-user-server.c'), vhost_user) + endif endif if cpu == 'aarch64' -- 2.44.0
[PATCH for-9.1 v3 05/11] contrib/vhost-user-blk: fix bind() using the right size of the address
On macOS passing `-s /tmp/vhost.socket` parameter to the vhost-user-blk application, the bind was done on `/tmp/vhost.socke` pathname, missing the last character. This sounds like one of the portability problems described in the unix(7) manpage: Pathname sockets When binding a socket to a pathname, a few rules should be observed for maximum portability and ease of coding: ⢠The pathname in sun_path should be null-terminated. ⢠The length of the pathname, including the terminating null byte, should not exceed the size of sun_path. ⢠The addrlen argument that describes the enclosing sockaddr_un structure should have a value of at least: offsetof(struct sockaddr_un, sun_path) + strlen(addr.sun_path)+1 or, more simply, addrlen can be specified as sizeof(struct sockaddr_un). So let's follow the last advice and simplify the code as well. Signed-off-by: Stefano Garzarella --- contrib/vhost-user-blk/vhost-user-blk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 89e5f11a64..a8ab9269a2 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -469,7 +469,6 @@ static int unix_sock_new(char *unix_fn) { int sock; struct sockaddr_un un; -size_t len; assert(unix_fn); @@ -481,10 +480,9 @@ static int unix_sock_new(char *unix_fn) un.sun_family = AF_UNIX; (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn); -len = sizeof(un.sun_family) + strlen(un.sun_path); (void)unlink(unix_fn); -if (bind(sock, (struct sockaddr *), len) < 0) { +if (bind(sock, (struct sockaddr *), sizeof(un)) < 0) { perror("bind"); goto fail; } -- 2.44.0
[PATCH for-9.1 v3 09/11] hostmem: add a new memory backend based on POSIX shm_open()
shm_open() creates and opens a new POSIX shared memory object. A POSIX shared memory object allows creating memory backend with an associated file descriptor that can be shared with external processes (e.g. vhost-user). The new `memory-backend-shm` can be used as an alternative when `memory-backend-memfd` is not available (Linux only), since shm_open() should be provided by any POSIX-compliant operating system. This backend mimics memfd, allocating memory that is practically anonymous. In theory shm_open() requires a name, but this is allocated for a short time interval and shm_unlink() is called right after shm_open(). After that, only fd is shared with external processes (e.g., vhost-user) as if it were associated with anonymous memory. In the future we may also allow the user to specify the name to be passed to shm_open(), but for now we keep the backend simple, mimicking anonymous memory such as memfd. Signed-off-by: Stefano Garzarella --- v3 - enriched commit message and documentation to highlight that we want to mimic memfd (David) --- docs/system/devices/vhost-user.rst | 5 +- qapi/qom.json | 17 + backends/hostmem-shm.c | 118 + backends/meson.build | 1 + qemu-options.hx| 11 +++ 5 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 backends/hostmem-shm.c diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst index 9b2da106ce..35259d8ec7 100644 --- a/docs/system/devices/vhost-user.rst +++ b/docs/system/devices/vhost-user.rst @@ -98,8 +98,9 @@ Shared memory object In order for the daemon to access the VirtIO queues to process the requests it needs access to the guest's address space. This is -achieved via the ``memory-backend-file`` or ``memory-backend-memfd`` -objects. A reference to a file-descriptor which can access this object +achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or +``memory-backend-shm`` objects. +A reference to a file-descriptor which can access this object will be passed via the socket as part of the protocol negotiation. Currently the shared memory object needs to match the size of the main diff --git a/qapi/qom.json b/qapi/qom.json index 85e6b4f84a..5252ec69e3 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -721,6 +721,19 @@ '*hugetlbsize': 'size', '*seal': 'bool' } } +## +# @MemoryBackendShmProperties: +# +# Properties for memory-backend-shm objects. +# +# The @share boolean option is true by default with shm. +# +# Since: 9.1 +## +{ 'struct': 'MemoryBackendShmProperties', + 'base': 'MemoryBackendProperties', + 'data': { } } + ## # @MemoryBackendEpcProperties: # @@ -976,6 +989,8 @@ { 'name': 'memory-backend-memfd', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram', +{ 'name': 'memory-backend-shm', + 'if': 'CONFIG_POSIX' }, 'pef-guest', { 'name': 'pr-manager-helper', 'if': 'CONFIG_LINUX' }, @@ -1047,6 +1062,8 @@ 'memory-backend-memfd': { 'type': 'MemoryBackendMemfdProperties', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram': 'MemoryBackendProperties', + 'memory-backend-shm': { 'type': 'MemoryBackendShmProperties', + 'if': 'CONFIG_POSIX' }, 'pr-manager-helper': { 'type': 'PrManagerHelperProperties', 'if': 'CONFIG_LINUX' }, 'qtest': 'QtestProperties', diff --git a/backends/hostmem-shm.c b/backends/hostmem-shm.c new file mode 100644 index 00..7595204d29 --- /dev/null +++ b/backends/hostmem-shm.c @@ -0,0 +1,118 @@ +/* + * QEMU host POSIX shared memory object backend + * + * Copyright (C) 2024 Red Hat Inc + * + * Authors: + * Stefano Garzarella + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "sysemu/hostmem.h" +#include "qapi/error.h" + +#define TYPE_MEMORY_BACKEND_SHM "memory-backend-shm" + +OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendShm, MEMORY_BACKEND_SHM) + +struct HostMemoryBackendShm { +HostMemoryBackend parent_obj; +}; + +static bool +shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ +g_autoptr(GString) shm_name = g_string_new(NULL); +g_autofree char *backend_name = NULL; +uint32_t ram_flags; +int fd, oflag; +mode_t mode; + +if (!backend->size) { +error_setg(errp, "can't create backend with size 0"); +return false; +} + +/* + * Let's use `mode = 0` because we don't want other processes to open our + * memory unless we share the file descriptor with them. + */ +mode = 0; +oflag = O_RDWR | O_CREAT | O_EXCL; +backend_name = host_mem
[PATCH for-9.1 v3 01/11] libvhost-user: set msg.msg_control to NULL when it is empty
On some OS (e.g. macOS) sendmsg() returns -1 (errno EINVAL) if the `struct msghdr` has the field `msg_controllen` set to 0, but `msg_control` is not NULL. Reviewed-by: Eric Blake Reviewed-by: David Hildenbrand Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a879149fef..22bea0c775 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -632,6 +632,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); } else { msg.msg_controllen = 0; +msg.msg_control = NULL; } do { -- 2.44.0
[PATCH for-9.1 v3 11/11] tests/qtest/vhost-user-test: add a test case for memory-backend-shm
`memory-backend-shm` can be used with vhost-user devices, so let's add a new test case for it. Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-test.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index d4e437265f..8c1d903b2a 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -44,6 +44,8 @@ "mem-path=%s,share=on -numa node,memdev=mem" #define QEMU_CMD_MEMFD " -m %d -object memory-backend-memfd,id=mem,size=%dM," \ " -numa node,memdev=mem" +#define QEMU_CMD_SHM" -m %d -object memory-backend-shm,id=mem,size=%dM," \ +" -numa node,memdev=mem" #define QEMU_CMD_CHR" -chardev socket,id=%s,path=%s%s" #define QEMU_CMD_NETDEV " -netdev vhost-user,id=hs0,chardev=%s,vhostforce=on" @@ -195,6 +197,7 @@ enum test_memfd { TEST_MEMFD_AUTO, TEST_MEMFD_YES, TEST_MEMFD_NO, +TEST_MEMFD_SHM, }; static void append_vhost_net_opts(TestServer *s, GString *cmd_line, @@ -228,6 +231,8 @@ static void append_mem_opts(TestServer *server, GString *cmd_line, if (memfd == TEST_MEMFD_YES) { g_string_append_printf(cmd_line, QEMU_CMD_MEMFD, size, size); +} else if (memfd == TEST_MEMFD_SHM) { +g_string_append_printf(cmd_line, QEMU_CMD_SHM, size, size); } else { const char *root = init_hugepagefs() ? : server->tmpfs; @@ -788,6 +793,19 @@ static void *vhost_user_test_setup_memfd(GString *cmd_line, void *arg) return server; } +static void *vhost_user_test_setup_shm(GString *cmd_line, void *arg) +{ +TestServer *server = test_server_new("vhost-user-test", arg); +test_server_listen(server); + +append_mem_opts(server, cmd_line, 256, TEST_MEMFD_SHM); +server->vu_ops->append_opts(server, cmd_line, ""); + +g_test_queue_destroy(vhost_user_test_cleanup, server); + +return server; +} + static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *server = arg; @@ -1081,6 +1099,11 @@ static void register_vhost_user_test(void) "virtio-net", test_read_guest_mem, ); +opts.before = vhost_user_test_setup_shm; +qos_add_test("vhost-user/read-guest-mem/shm", + "virtio-net", + test_read_guest_mem, ); + if (qemu_memfd_check(MFD_ALLOW_SEALING)) { opts.before = vhost_user_test_setup_memfd; qos_add_test("vhost-user/read-guest-mem/memfd", -- 2.44.0
[PATCH for-9.1 v3 07/11] libvhost-user: enable it on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable libvhost-user for any POSIX system. Compiling it on macOS and FreeBSD some problems came up: - avoid to include linux/vhost.h which is avaibale only on Linux (vhost_types.h contains many of the things we need) - macOS doesn't provide sys/endian.h, so let's define them (note: libvhost-user doesn't include qemu's headers, so we can't use use "qemu/bswap.h") - define eventfd_[write|read] as write/read wrapper when system doesn't provide those (e.g. macOS) - copy SEAL defines from include/qemu/memfd.h to make the code works on FreeBSD where MFD_ALLOW_SEALING is defined - define MAP_NORESERVE if it's not defined (e.g. on FreeBSD) Signed-off-by: Stefano Garzarella --- meson.build | 2 +- subprojects/libvhost-user/libvhost-user.h | 2 +- subprojects/libvhost-user/libvhost-user.c | 60 +-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index c19d51501a..3197a2f62e 100644 --- a/meson.build +++ b/meson.build @@ -3194,7 +3194,7 @@ endif config_host_data.set('CONFIG_FDT', fdt.found()) vhost_user = not_found -if host_os == 'linux' and have_vhost_user +if have_vhost_user libvhost_user = subproject('libvhost-user') vhost_user = libvhost_user.get_variable('vhost_user_dep') endif diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index deb40e77b3..e13e1d3931 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -18,9 +18,9 @@ #include #include #include -#include #include #include "standard-headers/linux/virtio_ring.h" +#include "standard-headers/linux/vhost_types.h" /* Based on qemu/hw/virtio/vhost-user.c */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 1c361ffd51..03edb4bf64 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include /* Necessary to provide VIRTIO_F_VERSION_1 on system * with older linux headers. Must appear before @@ -39,8 +37,8 @@ #include "standard-headers/linux/virtio_config.h" #if defined(__linux__) +#include #include -#include #include #include #include @@ -52,6 +50,62 @@ #endif +#if defined(__APPLE__) && (__MACH__) +#include +#define htobe16(x) OSSwapHostToBigInt16(x) +#define htole16(x) OSSwapHostToLittleInt16(x) +#define be16toh(x) OSSwapBigToHostInt16(x) +#define le16toh(x) OSSwapLittleToHostInt16(x) + +#define htobe32(x) OSSwapHostToBigInt32(x) +#define htole32(x) OSSwapHostToLittleInt32(x) +#define be32toh(x) OSSwapBigToHostInt32(x) +#define le32toh(x) OSSwapLittleToHostInt32(x) + +#define htobe64(x) OSSwapHostToBigInt64(x) +#define htole64(x) OSSwapHostToLittleInt64(x) +#define be64toh(x) OSSwapBigToHostInt64(x) +#define le64toh(x) OSSwapLittleToHostInt64(x) +#endif + +#ifdef CONFIG_EVENTFD +#include +#else +#define eventfd_t uint64_t + +int eventfd_write(int fd, eventfd_t value) +{ +return (write(fd, , sizeof(value)) == sizeof(value)) ? 0 : -1; +} + +int eventfd_read(int fd, eventfd_t *value) +{ +return (read(fd, value, sizeof(*value)) == sizeof(*value)) ? 0 : -1; +} +#endif + +#ifdef MFD_ALLOW_SEALING +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE0x0008 /* prevent writes */ +#endif +#endif + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + #include "include/atomic.h" #include "libvhost-user.h" -- 2.44.0
[PATCH for-9.1 v3 04/11] vhost-user-server: do not set memory fd non-blocking
In vhost-user-server we set all fd received from the other peer in non-blocking mode. For some of them (e.g. memfd, shm_open, etc.) it's not really needed, because we don't use these fd with blocking operations, but only to map memory. In addition, in some systems this operation can fail (e.g. in macOS setting an fd returned by shm_open() non-blocking fails with errno = ENOTTY). So, let's avoid setting fd non-blocking for those messages that we know carry memory fd (e.g. VHOST_USER_ADD_MEM_REG, VHOST_USER_SET_MEM_TABLE). Signed-off-by: Stefano Garzarella --- v3: - avoiding setting fd non-blocking for messages where we have memory fd (Eric) --- util/vhost-user-server.c | 12 1 file changed, 12 insertions(+) diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index 3bfb1ad3ec..b19229074a 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -65,6 +65,18 @@ static void vmsg_close_fds(VhostUserMsg *vmsg) static void vmsg_unblock_fds(VhostUserMsg *vmsg) { int i; + +/* + * These messages carry fd used to map memory, not to send/receive messages, + * so this operation is useless. In addition, in some systems this + * operation can fail (e.g. in macOS setting an fd returned by shm_open() + * non-blocking fails with errno = ENOTTY) + */ +if (vmsg->request == VHOST_USER_ADD_MEM_REG || +vmsg->request == VHOST_USER_SET_MEM_TABLE) { +return; +} + for (i = 0; i < vmsg->fd_num; i++) { qemu_socket_set_nonblock(vmsg->fds[i]); } -- 2.44.0
[PATCH for-9.1 v3 02/11] libvhost-user: fail vu_message_write() if sendmsg() is failing
In vu_message_write() we use sendmsg() to send the message header, then a write() to send the payload. If sendmsg() fails we should avoid sending the payload, since we were unable to send the header. Discovered before fixing the issue with the previous patch, where sendmsg() failed on macOS due to wrong parameters, but the frontend still sent the payload which the backend incorrectly interpreted as a wrong header. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 5 + 1 file changed, 5 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 22bea0c775..a11afd1960 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -639,6 +639,11 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, , 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); +if (rc <= 0) { +vu_panic(dev, "Error while writing: %s", strerror(errno)); +return false; +} + if (vmsg->size) { do { if (vmsg->data) { -- 2.44.0
[PATCH for-9.1 v3 10/11] tests/qtest/vhost-user-blk-test: use memory-backend-shm
`memory-backend-memfd` is available only on Linux while the new `memory-backend-shm` can be used on any POSIX-compliant operating system. Let's use it so we can run the test in multiple environments. Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-blk-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qtest/vhost-user-blk-test.c b/tests/qtest/vhost-user-blk-test.c index 117b9acd10..e945f6abf2 100644 --- a/tests/qtest/vhost-user-blk-test.c +++ b/tests/qtest/vhost-user-blk-test.c @@ -906,7 +906,7 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, vhost_user_blk_bin); g_string_append_printf(cmd_line, -" -object memory-backend-memfd,id=mem,size=256M,share=on " +" -object memory-backend-shm,id=mem,size=256M,share=on " " -M memory-backend=mem -m 256M "); for (i = 0; i < vus_instances; i++) { -- 2.44.0
[PATCH for-9.1 v3 06/11] vhost-user: enable frontends on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable vhost-user frontends for any POSIX system. In vhost_net.c we use VHOST_FILE_UNBIND which is defined in a Linux specific header, let's define it for other systems as well. Signed-off-by: Stefano Garzarella --- meson.build| 1 - hw/net/vhost_net.c | 5 + hw/block/Kconfig | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index c9c3217ba4..c19d51501a 100644 --- a/meson.build +++ b/meson.build @@ -151,7 +151,6 @@ have_tpm = get_option('tpm') \ # vhost have_vhost_user = get_option('vhost_user') \ - .disable_auto_if(host_os != 'linux') \ .require(host_os != 'windows', error_message: 'vhost-user is not available on Windows').allowed() have_vhost_vdpa = get_option('vhost_vdpa') \ diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index fd1a93701a..fced429813 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -34,8 +34,13 @@ #include "standard-headers/linux/virtio_ring.h" #include "hw/virtio/vhost.h" #include "hw/virtio/virtio-bus.h" +#if defined(__linux__) #include "linux-headers/linux/vhost.h" +#endif +#ifndef VHOST_FILE_UNBIND +#define VHOST_FILE_UNBIND -1 +#endif /* Features supported by host kernel. */ static const int kernel_feature_bits[] = { diff --git a/hw/block/Kconfig b/hw/block/Kconfig index 9e8f28f982..29ee09e434 100644 --- a/hw/block/Kconfig +++ b/hw/block/Kconfig @@ -40,7 +40,7 @@ config VHOST_USER_BLK bool # Only PCI devices are provided for now default y if VIRTIO_PCI -depends on VIRTIO && VHOST_USER && LINUX +depends on VIRTIO && VHOST_USER config SWIM bool -- 2.44.0
[PATCH for-9.1 v3 03/11] libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
libvhost-user will panic when receiving VHOST_USER_GET_INFLIGHT_FD message if MFD_ALLOW_SEALING is not defined, since it's not able to create a memfd. VHOST_USER_GET_INFLIGHT_FD is used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is negotiated. So, let's mask that feature if the backend is not able to properly handle these messages. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a11afd1960..1c361ffd51 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -1674,6 +1674,16 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) features |= dev->iface->get_protocol_features(dev); } +/* + * If MFD_ALLOW_SEALING is not defined, we are not able to handle + * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd. + * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + * is negotiated. A device implementation can enable it, so let's mask + * it to avoid a runtime panic. + */ +#ifndef MFD_ALLOW_SEALING +features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); +#endif vmsg_set_reply_u64(vmsg, features); return true; } -- 2.44.0
[PATCH for-9.1 v3 00/11] vhost-user: support any POSIX system (tested on macOS, FreeBSD, OpenBSD)
v1: https://patchew.org/QEMU/20240228114759.44758-1-sgarz...@redhat.com/ v2: https://patchew.org/QEMU/20240326133936.125332-1-sgarz...@redhat.com/ v3: - rebased on v9.0.0-rc2 - patch 4: avoiding setting fd non-blocking for messages where we have memory fd (Eric) - patch 9: enriched commit message and documentation to highlight that we want to mimic memfd (David) The vhost-user protocol is not really Linux-specific, so let's try support QEMU's frontends and backends (including libvhost-user) in any POSIX system with this series. The main use case is to be able to use virtio devices that we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even in non-Linux systems. The first 5 patches are more like fixes discovered at runtime on macOS or FreeBSD that could go even independently of this series. Patches 6, 7, and 8 enable building of frontends and backends (including libvhost-user) with associated code changes to succeed in compilation. Patch 9 adds `memory-backend-shm` that uses the POSIX shm_open() API to create shared memory which is identified by an fd that can be shared with vhost-user backends. This is useful on those systems (like macOS) where we don't have memfd_create() or special filesystems like "/dev/shm". Patches 10 and 11 use `memory-backend-shm` in some vhost-user tests. Maybe the first 5 patches can go separately, but I only discovered those problems after testing patches 6 - 9, so I have included them in this series for now. Please let me know if you prefer that I send them separately. I tested this series using vhost-user-blk and QSD on macOS Sonoma 14.4 (aarch64), FreeBSD 14 (x86_64), OpenBSD 7.4 (x86_64), and Fedora 39 (x86_64) in this way: - Start vhost-user-blk or QSD (same commands for all systems) vhost-user-blk -s /tmp/vhost.socket \ -b Fedora-Cloud-Base-39-1.5.x86_64.raw qemu-storage-daemon \ --blockdev file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \ --blockdev qcow2,file=file,node-name=qcow2 \ --export vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on - macOS (aarch64): start QEMU (using hvf accelerator) qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \ -drive file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \ -device virtio-net-device,netdev=net0 -netdev user,id=net0 \ -device ramfb -device usb-ehci -device usb-kbd \ -object memory-backend-shm,id=mem,size=512M \ -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - FreeBSD/OpenBSD (x86_64): start QEMU (no accelerators available) qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \ -object memory-backend-shm,id=mem,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - Fedora (x86_64): start QEMU (using kvm accelerator) qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \ -object memory-backend-shm,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket Branch pushed (and CI started) at https://gitlab.com/sgarzarella/qemu/-/tree/macos-vhost-user?ref_type=heads Thanks, Stefano Stefano Garzarella (11): libvhost-user: set msg.msg_control to NULL when it is empty libvhost-user: fail vu_message_write() if sendmsg() is failing libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported vhost-user-server: do not set memory fd non-blocking contrib/vhost-user-blk: fix bind() using the right size of the address vhost-user: enable frontends on any POSIX system libvhost-user: enable it on any POSIX system contrib/vhost-user-blk: enable it on any POSIX system hostmem: add a new memory backend based on POSIX shm_open() tests/qtest/vhost-user-blk-test: use memory-backend-shm tests/qtest/vhost-user-test: add a test case for memory-backend-shm docs/system/devices/vhost-user.rst| 5 +- meson.build | 5 +- qapi/qom.json | 17 subprojects/libvhost-user/libvhost-user.h | 2 +- backends/hostmem-shm.c| 118 ++ contrib/vhost-user-blk/vhost-user-blk.c | 23 - hw/net/vhost_net.c| 5 + subprojects/libvhost-user/libvhost-user.c | 76 +- tests/qtest/vhost-user-blk-test.c | 2 +- tests/qtest/vhost-user-test.c | 23 + util/vhost-user-server.c | 12 +++ backends/meson.build | 1 + hw/block/Kconfig | 2 +- qemu-options.hx | 11 ++ util/meson.build | 4 +- 15 files changed, 288 insertions(+), 18 deletions(-) create mode 100644 backends/hostmem-shm.c -- 2.44.0
Re: [PATCH for-9.1 v2 09/11] hostmem: add a new memory backend based on POSIX shm_open()
On Wed, Mar 27, 2024 at 12:51:51PM +0100, David Hildenbrand wrote: On 27.03.24 11:23, Stefano Garzarella wrote: On Tue, Mar 26, 2024 at 03:45:52PM +0100, David Hildenbrand wrote: +mode = 0; +oflag = O_RDWR | O_CREAT | O_EXCL; +backend_name = host_memory_backend_get_name(backend); + +/* + * Some operating systems allow creating anonymous POSIX shared memory + * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not + * defined by POSIX, so let's create a unique name. + * + * From Linux's shm_open(3) man-page: + * For portable use, a shared memory object should be identified + * by a name of the form /somename;" + */ +g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%s", getpid(), +backend_name); Hm, shouldn't we just let the user specify a name, and if no name was specified, generate one ourselves? I thought about it and initially did it that way, but then some problems came up so I tried to keep it as simple as possible for the user and for our use case (having an fd associated with memory and sharing it with other processes). The problems I had were: - what mode_t to use if the object does not exist and needs to be created? > - exclude O_EXCL if the user passes the name since they may have already created it? I'd handle both like we handle files. But I understand now that you really just want to "simulate" memfd. Right, maybe I should write that in the commit message and documentation. - call shm_unlink() only at object deallocation? Right, we don't really do that for ordinary files, they keep existing. We only have the "discard-data" property to free up memory. For memfd, it just happens "automatically". They cannot be looked up by name, and once the last user closes the fd, it gets destroyed. Yep, I see. So I thought that for now we only support the "anonymous" mode, and if in the future we have a use case where the user wants to specify the name, we can add it later. Okay, so for now you really only want an anonymous fd that behaves like a memfd, got it. Likely we should somehow fail if the "POSIX shared memory object" already exists. Hmm. `O_CREAT | O_EXCL` should provide just this behavior. From shm_open(3) manpage: O_EXCL If O_CREAT was also specified, and a shared memory object with the given name already exists, return an error. The check for the existence of the object, and its creation if it does not exist, are performed atomically. That said, if you think it's already useful from the beginning, I can add the name as an optional parameter. I'm also not quite sure if "host_memory_backend_get_name()" should be used for the purpose here. What problem do you see? As an alternative I thought of a static counter. If it's really just an "internal UUID", as we'll remove the file using shm_unlink(), then the name in fact shouldn't matter and this would be fine. Correct? Right. It's a name that will only "appear" in the system for a very short time since we call shm_unlink() right after shm_open(). So I just need the unique name to prevent several QEMUs launched at the same time from colliding with the name. So I assume if we ever want to have non-anonymous fds here, we'd pass in a new property "name", and change the logic how we open/unlink. Makes sense. Exactly! I can clarify this in the commit description as well. Thanks for the helpful comments! If there is anything I need to change for v3, please tell me ;-) Stefano
Re: [PATCH for-9.1 v2 09/11] hostmem: add a new memory backend based on POSIX shm_open()
On Tue, Mar 26, 2024 at 03:45:52PM +0100, David Hildenbrand wrote: +mode = 0; +oflag = O_RDWR | O_CREAT | O_EXCL; +backend_name = host_memory_backend_get_name(backend); + +/* + * Some operating systems allow creating anonymous POSIX shared memory + * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not + * defined by POSIX, so let's create a unique name. + * + * From Linux's shm_open(3) man-page: + * For portable use, a shared memory object should be identified + * by a name of the form /somename;" + */ +g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%s", getpid(), +backend_name); Hm, shouldn't we just let the user specify a name, and if no name was specified, generate one ourselves? I thought about it and initially did it that way, but then some problems came up so I tried to keep it as simple as possible for the user and for our use case (having an fd associated with memory and sharing it with other processes). The problems I had were: - what mode_t to use if the object does not exist and needs to be created? - exclude O_EXCL if the user passes the name since they may have already created it? - call shm_unlink() only at object deallocation? So I thought that for now we only support the "anonymous" mode, and if in the future we have a use case where the user wants to specify the name, we can add it later. That said, if you think it's already useful from the beginning, I can add the name as an optional parameter. I'm also not quite sure if "host_memory_backend_get_name()" should be used for the purpose here. What problem do you see? As an alternative I thought of a static counter. Thanks, Stefano
Re: [PATCH for-9.1 v2 04/11] vhost-user-server: don't abort if we can't set fd non-blocking
On Tue, Mar 26, 2024 at 09:40:12AM -0500, Eric Blake wrote: On Tue, Mar 26, 2024 at 02:39:29PM +0100, Stefano Garzarella wrote: In vhost-user-server we set all fd received from the other peer in non-blocking mode. For some of them (e.g. memfd, shm_open, etc.) if we fail, it's not really a problem, because we don't use these fd with blocking operations, but only to map memory. In these cases a failure is not bad, so let's just report a warning instead of panicking if we fail to set some fd in non-blocking mode. This for example occurs in macOS where setting shm_open() fd non-blocking is failing (errno: 25). What is errno 25 on MacOS? It should be ENOTTY. I'll add in the commit description. Signed-off-by: Stefano Garzarella --- util/vhost-user-server.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index 3bfb1ad3ec..064999f0b7 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -66,7 +66,11 @@ static void vmsg_unblock_fds(VhostUserMsg *vmsg) { int i; for (i = 0; i < vmsg->fd_num; i++) { -qemu_socket_set_nonblock(vmsg->fds[i]); +int ret = qemu_socket_try_set_nonblock(vmsg->fds[i]); +if (ret) { Should this be 'if (ret < 0)'? I was confused by the assert() in qemu_socket_set_nonblock(): void qemu_socket_set_nonblock(int fd) { int f; f = qemu_socket_try_set_nonblock(fd); assert(f == 0); } BTW, I see most of the code checks ret < 0, so I'll fix it. +warn_report("Failed to set fd %d nonblock for request %d: %s", +vmsg->fds[i], vmsg->request, strerror(-ret)); +} This now ignores all errors even on pre-existing fds where we NEED non-blocking, rather than just the specific (expected) error we are seeing on MacOS. Should this code be a bit more precise about checking that -ret == EXXX for the expected errno value we are ignoring for the specific fds where non-blocking is not essential? Good point, maybe I'll just avoid calling vmsg_unblock_fds() when the message is VHOST_USER_ADD_MEM_REG or VHOST_USER_SET_MEM_TABLE. These should be the cases where carried fds are used for mmap() and so there is no need to mark them non-blocking. WDYT? Stefano
Re: [PATCH for-9.1 v2 03/11] libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
On Tue, Mar 26, 2024 at 09:36:54AM -0500, Eric Blake wrote: On Tue, Mar 26, 2024 at 02:39:28PM +0100, Stefano Garzarella wrote: libvhost-user will panic when receiving VHOST_USER_GET_INFLIGHT_FD message if MFD_ALLOW_SEALING is not defined, since it's not able to create a memfd. VHOST_USER_GET_INFLIGHT_FD is used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is negotiated. So, let's mask that feature if the backend is not able to properly handle these messages. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a11afd1960..1c361ffd51 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -1674,6 +1674,16 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) features |= dev->iface->get_protocol_features(dev); } +/* + * If MFD_ALLOW_SEALING is not defined, we are not able to handle + * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd. + * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + * is negotiated. A device implementation can enable it, so let's mask + * it to avoid a runtime panic. + */ +#ifndef MFD_ALLOW_SEALING +features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); +#endif Masking the feature out of advertisement is obviously correct. But should we also fix the code for handling VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD to return an error to any client that requests it in error when the feature was not advertised, instead of panicking? Totally agree! Do I send a separate patch from this series or include it in this series? I would do the former because this one is already long enough. Thanks, Stefano
Re: [PATCH for-9.1 v2 02/11] libvhost-user: fail vu_message_write() if sendmsg() is failing
On Tue, Mar 26, 2024 at 03:36:52PM +0100, David Hildenbrand wrote: On 26.03.24 15:34, Eric Blake wrote: On Tue, Mar 26, 2024 at 02:39:27PM +0100, Stefano Garzarella wrote: In vu_message_write() we use sendmsg() to send the message header, then a write() to send the payload. If sendmsg() fails we should avoid sending the payload, since we were unable to send the header. Discovered before fixing the issue with the previous patch, where sendmsg() failed on macOS due to wrong parameters, but the frontend still sent the payload which the backend incorrectly interpreted as a wrong header. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 5 + 1 file changed, 5 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 22bea0c775..a11afd1960 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -639,6 +639,11 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, , 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); +if (rc <= 0) { Is rejecting a 0 return value correct? Technically, a 0 return means a successful write of 0 bytes - but then again, it is unwise to attempt to send an fd or other auxilliary ddata without at least one regular byte, so we should not be attempting a write of 0 bytes. So I guess this one is okay, although I might have used < instead of <=. I blindly copied what they already do at the end of the same function. However, your observation is correct. That said they have a sendmsg() to send the header. After this we have a write() loop to send the payload. Now if sendmsg() returns 0, but we have not sent all the header, what should we do? Try sendmsg() again? For how many times? I was wondering if we could see some partial sendmsg()/write succeeding. Meaning, we transferred some bytes but not all, and we'd actually need to loop ... Yep, true, but I would fix it in another patch/series if you agree. Thanks, Stefano
[PATCH for-9.1 v2 04/11] vhost-user-server: don't abort if we can't set fd non-blocking
In vhost-user-server we set all fd received from the other peer in non-blocking mode. For some of them (e.g. memfd, shm_open, etc.) if we fail, it's not really a problem, because we don't use these fd with blocking operations, but only to map memory. In these cases a failure is not bad, so let's just report a warning instead of panicking if we fail to set some fd in non-blocking mode. This for example occurs in macOS where setting shm_open() fd non-blocking is failing (errno: 25). Signed-off-by: Stefano Garzarella --- util/vhost-user-server.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index 3bfb1ad3ec..064999f0b7 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -66,7 +66,11 @@ static void vmsg_unblock_fds(VhostUserMsg *vmsg) { int i; for (i = 0; i < vmsg->fd_num; i++) { -qemu_socket_set_nonblock(vmsg->fds[i]); +int ret = qemu_socket_try_set_nonblock(vmsg->fds[i]); +if (ret) { +warn_report("Failed to set fd %d nonblock for request %d: %s", +vmsg->fds[i], vmsg->request, strerror(-ret)); +} } } -- 2.44.0
[PATCH for-9.1 v2 00/11] vhost-user: support any POSIX system (tested on macOS, FreeBSD, OpenBSD)
v1: https://patchew.org/QEMU/20240228114759.44758-1-sgarz...@redhat.com/ v2: - fixed Author email and little typos in some patches - added memory-backend-shm (patches 9, 10, 11) [Daniel, David, Markus] - removed changes to memory-backend-file (ex patch 9) - used memory-backend-shm in tests/qtest/vhost-user-blk-test (patch 10) - added test case in tests/qtest/vhost-user-test (patch 11) The vhost-user protocol is not really Linux-specific, so let's try support QEMU's frontends and backends (including libvhost-user) in any POSIX system with this series. The main use case is to be able to use virtio devices that we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even in non-Linux systems. The first 5 patches are more like fixes discovered at runtime on macOS or FreeBSD that could go even independently of this series. Patches 6, 7, and 8 enable building of frontends and backends (including libvhost-user) with associated code changes to succeed in compilation. Patch 9 adds `memory-backend-shm` that uses the POSIX shm_open() API to create shared memory which is identified by an fd that can be shared with vhost-user backends. This is useful on those systems (like macOS) where we don't have memfd_create() or special filesystems like "/dev/shm". Patches 10 and 11 use `memory-backend-shm` in some vhost-user tests. Maybe the first 5 patches can go separately, but I only discovered those problems after testing patches 6 - 9, so I have included them in this series for now. Please let me know if you prefer that I send them separately. I tested this series using vhost-user-blk and QSD on macOS Sonoma 14.4 (aarch64), FreeBSD 14 (x86_64), OpenBSD 7.4 (x86_64), and Fedora 39 (x86_64) in this way: - Start vhost-user-blk or QSD (same commands for all systems) vhost-user-blk -s /tmp/vhost.socket \ -b Fedora-Cloud-Base-39-1.5.x86_64.raw qemu-storage-daemon \ --blockdev file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \ --blockdev qcow2,file=file,node-name=qcow2 \ --export vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on - macOS (aarch64): start QEMU (using hvf accelerator) qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \ -drive file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \ -device virtio-net-device,netdev=net0 -netdev user,id=net0 \ -device ramfb -device usb-ehci -device usb-kbd \ -object memory-backend-shm,id=mem,size=512M \ -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - FreeBSD/OpenBSD (x86_64): start QEMU (no accelerators available) qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \ -object memory-backend-shm,id=mem,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - Fedora (x86_64): start QEMU (using kvm accelerator) qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \ -object memory-backend-shm,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket Branch pushed (and CI started) at https://gitlab.com/sgarzarella/qemu/-/tree/macos-vhost-user?ref_type=heads Thanks, Stefano Stefano Garzarella (11): libvhost-user: set msg.msg_control to NULL when it is empty libvhost-user: fail vu_message_write() if sendmsg() is failing libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported vhost-user-server: don't abort if we can't set fd non-blocking contrib/vhost-user-blk: fix bind() using the right size of the address vhost-user: enable frontends on any POSIX system libvhost-user: enable it on any POSIX system contrib/vhost-user-blk: enable it on any POSIX system hostmem: add a new memory backend based on POSIX shm_open() tests/qtest/vhost-user-blk-test: use memory-backend-shm tests/qtest/vhost-user-test: add a test case for memory-backend-shm docs/system/devices/vhost-user.rst| 5 +- meson.build | 5 +- qapi/qom.json | 17 subprojects/libvhost-user/libvhost-user.h | 2 +- backends/hostmem-shm.c| 118 ++ contrib/vhost-user-blk/vhost-user-blk.c | 23 - hw/net/vhost_net.c| 5 + subprojects/libvhost-user/libvhost-user.c | 76 +- tests/qtest/vhost-user-blk-test.c | 2 +- tests/qtest/vhost-user-test.c | 23 + util/vhost-user-server.c | 6 +- backends/meson.build | 1 + hw/block/Kconfig | 2 +- qemu-options.hx | 10 ++ util/meson.build | 4 +- 15 files changed, 280 insertions(+), 19 deletions(-) creat
[PATCH for-9.1 v2 08/11] contrib/vhost-user-blk: enable it on any POSIX system
Let's make the code more portable by using the "qemu/bswap.h" API and adding defines from block/file-posix.c to support O_DIRECT in other systems (e.g. macOS). vhost-user-server.c is a dependency, let's enable it for any POSIX system. Signed-off-by: Stefano Garzarella --- meson.build | 2 -- contrib/vhost-user-blk/vhost-user-blk.c | 19 +-- util/meson.build| 4 +++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index 3197a2f62e..b541e5c875 100644 --- a/meson.build +++ b/meson.build @@ -1956,8 +1956,6 @@ has_statx = cc.has_header_symbol('sys/stat.h', 'STATX_BASIC_STATS', prefix: gnu_ has_statx_mnt_id = cc.has_header_symbol('sys/stat.h', 'STATX_MNT_ID', prefix: gnu_source_prefix) have_vhost_user_blk_server = get_option('vhost_user_blk_server') \ - .require(host_os == 'linux', - error_message: 'vhost_user_blk_server requires linux') \ .require(have_vhost_user, error_message: 'vhost_user_blk_server requires vhost-user support') \ .disable_auto_if(not have_tools and not have_system) \ diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index a8ab9269a2..462e584857 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/bswap.h" #include "standard-headers/linux/virtio_blk.h" #include "libvhost-user-glib.h" @@ -24,6 +25,20 @@ #include #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + enum { VHOST_USER_BLK_MAX_QUEUES = 8, }; @@ -267,13 +282,13 @@ static int vub_virtio_process_req(VubDev *vdev_blk, req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base; in_num--; -type = le32toh(req->out->type); +type = le32_to_cpu(req->out->type); switch (type & ~VIRTIO_BLK_T_BARRIER) { case VIRTIO_BLK_T_IN: case VIRTIO_BLK_T_OUT: { ssize_t ret = 0; bool is_write = type & VIRTIO_BLK_T_OUT; -req->sector_num = le64toh(req->out->sector); +req->sector_num = le64_to_cpu(req->out->sector); if (is_write) { ret = vub_writev(req, >out_sg[1], out_num); } else { diff --git a/util/meson.build b/util/meson.build index 0ef9886be0..f52682ce96 100644 --- a/util/meson.build +++ b/util/meson.build @@ -113,10 +113,12 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif if host_os == 'linux' -util_ss.add(files('vhost-user-server.c'), vhost_user) util_ss.add(files('vfio-helpers.c')) util_ss.add(files('chardev_open.c')) endif + if host_os != 'windows' +util_ss.add(files('vhost-user-server.c'), vhost_user) + endif endif if cpu == 'aarch64' -- 2.44.0
[PATCH for-9.1 v2 01/11] libvhost-user: set msg.msg_control to NULL when it is empty
On some OS (e.g. macOS) sendmsg() returns -1 (errno EINVAL) if the `struct msghdr` has the field `msg_controllen` set to 0, but `msg_control` is not NULL. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a879149fef..22bea0c775 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -632,6 +632,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); } else { msg.msg_controllen = 0; +msg.msg_control = NULL; } do { -- 2.44.0
[PATCH for-9.1 v2 11/11] tests/qtest/vhost-user-test: add a test case for memory-backend-shm
`memory-backend-shm` can be used with vhost-user devices, so let's add a new test case for it. Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-test.c | 23 +++ 1 file changed, 23 insertions(+) diff --git a/tests/qtest/vhost-user-test.c b/tests/qtest/vhost-user-test.c index d4e437265f..8c1d903b2a 100644 --- a/tests/qtest/vhost-user-test.c +++ b/tests/qtest/vhost-user-test.c @@ -44,6 +44,8 @@ "mem-path=%s,share=on -numa node,memdev=mem" #define QEMU_CMD_MEMFD " -m %d -object memory-backend-memfd,id=mem,size=%dM," \ " -numa node,memdev=mem" +#define QEMU_CMD_SHM" -m %d -object memory-backend-shm,id=mem,size=%dM," \ +" -numa node,memdev=mem" #define QEMU_CMD_CHR" -chardev socket,id=%s,path=%s%s" #define QEMU_CMD_NETDEV " -netdev vhost-user,id=hs0,chardev=%s,vhostforce=on" @@ -195,6 +197,7 @@ enum test_memfd { TEST_MEMFD_AUTO, TEST_MEMFD_YES, TEST_MEMFD_NO, +TEST_MEMFD_SHM, }; static void append_vhost_net_opts(TestServer *s, GString *cmd_line, @@ -228,6 +231,8 @@ static void append_mem_opts(TestServer *server, GString *cmd_line, if (memfd == TEST_MEMFD_YES) { g_string_append_printf(cmd_line, QEMU_CMD_MEMFD, size, size); +} else if (memfd == TEST_MEMFD_SHM) { +g_string_append_printf(cmd_line, QEMU_CMD_SHM, size, size); } else { const char *root = init_hugepagefs() ? : server->tmpfs; @@ -788,6 +793,19 @@ static void *vhost_user_test_setup_memfd(GString *cmd_line, void *arg) return server; } +static void *vhost_user_test_setup_shm(GString *cmd_line, void *arg) +{ +TestServer *server = test_server_new("vhost-user-test", arg); +test_server_listen(server); + +append_mem_opts(server, cmd_line, 256, TEST_MEMFD_SHM); +server->vu_ops->append_opts(server, cmd_line, ""); + +g_test_queue_destroy(vhost_user_test_cleanup, server); + +return server; +} + static void test_read_guest_mem(void *obj, void *arg, QGuestAllocator *alloc) { TestServer *server = arg; @@ -1081,6 +1099,11 @@ static void register_vhost_user_test(void) "virtio-net", test_read_guest_mem, ); +opts.before = vhost_user_test_setup_shm; +qos_add_test("vhost-user/read-guest-mem/shm", + "virtio-net", + test_read_guest_mem, ); + if (qemu_memfd_check(MFD_ALLOW_SEALING)) { opts.before = vhost_user_test_setup_memfd; qos_add_test("vhost-user/read-guest-mem/memfd", -- 2.44.0
[PATCH for-9.1 v2 07/11] libvhost-user: enable it on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable libvhost-user for any POSIX system. Compiling it on macOS and FreeBSD some problems came up: - avoid to include linux/vhost.h which is avaibale only on Linux (vhost_types.h contains many of the things we need) - macOS doesn't provide sys/endian.h, so let's define them (note: libvhost-user doesn't include qemu's headers, so we can't use use "qemu/bswap.h") - define eventfd_[write|read] as write/read wrapper when system doesn't provide those (e.g. macOS) - copy SEAL defines from include/qemu/memfd.h to make the code works on FreeBSD where MFD_ALLOW_SEALING is defined - define MAP_NORESERVE if it's not defined (e.g. on FreeBSD) Signed-off-by: Stefano Garzarella --- meson.build | 2 +- subprojects/libvhost-user/libvhost-user.h | 2 +- subprojects/libvhost-user/libvhost-user.c | 60 +-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index c19d51501a..3197a2f62e 100644 --- a/meson.build +++ b/meson.build @@ -3194,7 +3194,7 @@ endif config_host_data.set('CONFIG_FDT', fdt.found()) vhost_user = not_found -if host_os == 'linux' and have_vhost_user +if have_vhost_user libvhost_user = subproject('libvhost-user') vhost_user = libvhost_user.get_variable('vhost_user_dep') endif diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index deb40e77b3..e13e1d3931 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -18,9 +18,9 @@ #include #include #include -#include #include #include "standard-headers/linux/virtio_ring.h" +#include "standard-headers/linux/vhost_types.h" /* Based on qemu/hw/virtio/vhost-user.c */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 1c361ffd51..03edb4bf64 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include /* Necessary to provide VIRTIO_F_VERSION_1 on system * with older linux headers. Must appear before @@ -39,8 +37,8 @@ #include "standard-headers/linux/virtio_config.h" #if defined(__linux__) +#include #include -#include #include #include #include @@ -52,6 +50,62 @@ #endif +#if defined(__APPLE__) && (__MACH__) +#include +#define htobe16(x) OSSwapHostToBigInt16(x) +#define htole16(x) OSSwapHostToLittleInt16(x) +#define be16toh(x) OSSwapBigToHostInt16(x) +#define le16toh(x) OSSwapLittleToHostInt16(x) + +#define htobe32(x) OSSwapHostToBigInt32(x) +#define htole32(x) OSSwapHostToLittleInt32(x) +#define be32toh(x) OSSwapBigToHostInt32(x) +#define le32toh(x) OSSwapLittleToHostInt32(x) + +#define htobe64(x) OSSwapHostToBigInt64(x) +#define htole64(x) OSSwapHostToLittleInt64(x) +#define be64toh(x) OSSwapBigToHostInt64(x) +#define le64toh(x) OSSwapLittleToHostInt64(x) +#endif + +#ifdef CONFIG_EVENTFD +#include +#else +#define eventfd_t uint64_t + +int eventfd_write(int fd, eventfd_t value) +{ +return (write(fd, , sizeof(value)) == sizeof(value)) ? 0 : -1; +} + +int eventfd_read(int fd, eventfd_t *value) +{ +return (read(fd, value, sizeof(*value)) == sizeof(*value)) ? 0 : -1; +} +#endif + +#ifdef MFD_ALLOW_SEALING +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE0x0008 /* prevent writes */ +#endif +#endif + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + #include "include/atomic.h" #include "libvhost-user.h" -- 2.44.0
[PATCH for-9.1 v2 10/11] tests/qtest/vhost-user-blk-test: use memory-backend-shm
`memory-backend-memfd` is available only on Linux while the new `memory-backend-shm` can be used on any POSIX-compliant operating system. Let's use it so we can run the test in multiple environments. Signed-off-by: Stefano Garzarella --- tests/qtest/vhost-user-blk-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qtest/vhost-user-blk-test.c b/tests/qtest/vhost-user-blk-test.c index 117b9acd10..e945f6abf2 100644 --- a/tests/qtest/vhost-user-blk-test.c +++ b/tests/qtest/vhost-user-blk-test.c @@ -906,7 +906,7 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, vhost_user_blk_bin); g_string_append_printf(cmd_line, -" -object memory-backend-memfd,id=mem,size=256M,share=on " +" -object memory-backend-shm,id=mem,size=256M,share=on " " -M memory-backend=mem -m 256M "); for (i = 0; i < vus_instances; i++) { -- 2.44.0
[PATCH for-9.1 v2 03/11] libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
libvhost-user will panic when receiving VHOST_USER_GET_INFLIGHT_FD message if MFD_ALLOW_SEALING is not defined, since it's not able to create a memfd. VHOST_USER_GET_INFLIGHT_FD is used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is negotiated. So, let's mask that feature if the backend is not able to properly handle these messages. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a11afd1960..1c361ffd51 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -1674,6 +1674,16 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) features |= dev->iface->get_protocol_features(dev); } +/* + * If MFD_ALLOW_SEALING is not defined, we are not able to handle + * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd. + * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + * is negotiated. A device implementation can enable it, so let's mask + * it to avoid a runtime panic. + */ +#ifndef MFD_ALLOW_SEALING +features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); +#endif vmsg_set_reply_u64(vmsg, features); return true; } -- 2.44.0
[PATCH for-9.1 v2 02/11] libvhost-user: fail vu_message_write() if sendmsg() is failing
In vu_message_write() we use sendmsg() to send the message header, then a write() to send the payload. If sendmsg() fails we should avoid sending the payload, since we were unable to send the header. Discovered before fixing the issue with the previous patch, where sendmsg() failed on macOS due to wrong parameters, but the frontend still sent the payload which the backend incorrectly interpreted as a wrong header. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 5 + 1 file changed, 5 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 22bea0c775..a11afd1960 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -639,6 +639,11 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, , 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); +if (rc <= 0) { +vu_panic(dev, "Error while writing: %s", strerror(errno)); +return false; +} + if (vmsg->size) { do { if (vmsg->data) { -- 2.44.0
[PATCH for-9.1 v2 09/11] hostmem: add a new memory backend based on POSIX shm_open()
shm_open() creates and opens a new POSIX shared memory object. A POSIX shared memory object allows creating memory backend with an associated file descriptor that can be shared with external processes (e.g. vhost-user). The new `memory-backend-shm` can be used as an alternative when `memory-backend-memfd` is not available (Linux only), since shm_open() should be provided by any POSIX-compliant operating system. Signed-off-by: Stefano Garzarella --- docs/system/devices/vhost-user.rst | 5 +- qapi/qom.json | 17 + backends/hostmem-shm.c | 118 + backends/meson.build | 1 + qemu-options.hx| 10 +++ 5 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 backends/hostmem-shm.c diff --git a/docs/system/devices/vhost-user.rst b/docs/system/devices/vhost-user.rst index 9b2da106ce..35259d8ec7 100644 --- a/docs/system/devices/vhost-user.rst +++ b/docs/system/devices/vhost-user.rst @@ -98,8 +98,9 @@ Shared memory object In order for the daemon to access the VirtIO queues to process the requests it needs access to the guest's address space. This is -achieved via the ``memory-backend-file`` or ``memory-backend-memfd`` -objects. A reference to a file-descriptor which can access this object +achieved via the ``memory-backend-file``, ``memory-backend-memfd``, or +``memory-backend-shm`` objects. +A reference to a file-descriptor which can access this object will be passed via the socket as part of the protocol negotiation. Currently the shared memory object needs to match the size of the main diff --git a/qapi/qom.json b/qapi/qom.json index baae3a183f..88136b861d 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -720,6 +720,19 @@ '*hugetlbsize': 'size', '*seal': 'bool' } } +## +# @MemoryBackendShmProperties: +# +# Properties for memory-backend-shm objects. +# +# The @share boolean option is true by default with shm. +# +# Since: 9.1 +## +{ 'struct': 'MemoryBackendShmProperties', + 'base': 'MemoryBackendProperties', + 'data': { } } + ## # @MemoryBackendEpcProperties: # @@ -976,6 +989,8 @@ { 'name': 'memory-backend-memfd', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram', +{ 'name': 'memory-backend-shm', + 'if': 'CONFIG_POSIX' }, 'pef-guest', { 'name': 'pr-manager-helper', 'if': 'CONFIG_LINUX' }, @@ -1047,6 +1062,8 @@ 'memory-backend-memfd': { 'type': 'MemoryBackendMemfdProperties', 'if': 'CONFIG_LINUX' }, 'memory-backend-ram': 'MemoryBackendProperties', + 'memory-backend-shm': { 'type': 'MemoryBackendShmProperties', + 'if': 'CONFIG_POSIX' }, 'pr-manager-helper': { 'type': 'PrManagerHelperProperties', 'if': 'CONFIG_LINUX' }, 'qtest': 'QtestProperties', diff --git a/backends/hostmem-shm.c b/backends/hostmem-shm.c new file mode 100644 index 00..7595204d29 --- /dev/null +++ b/backends/hostmem-shm.c @@ -0,0 +1,118 @@ +/* + * QEMU host POSIX shared memory object backend + * + * Copyright (C) 2024 Red Hat Inc + * + * Authors: + * Stefano Garzarella + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "sysemu/hostmem.h" +#include "qapi/error.h" + +#define TYPE_MEMORY_BACKEND_SHM "memory-backend-shm" + +OBJECT_DECLARE_SIMPLE_TYPE(HostMemoryBackendShm, MEMORY_BACKEND_SHM) + +struct HostMemoryBackendShm { +HostMemoryBackend parent_obj; +}; + +static bool +shm_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ +g_autoptr(GString) shm_name = g_string_new(NULL); +g_autofree char *backend_name = NULL; +uint32_t ram_flags; +int fd, oflag; +mode_t mode; + +if (!backend->size) { +error_setg(errp, "can't create backend with size 0"); +return false; +} + +/* + * Let's use `mode = 0` because we don't want other processes to open our + * memory unless we share the file descriptor with them. + */ +mode = 0; +oflag = O_RDWR | O_CREAT | O_EXCL; +backend_name = host_memory_backend_get_name(backend); + +/* + * Some operating systems allow creating anonymous POSIX shared memory + * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not + * defined by POSIX, so let's create a unique name. + * + * From Linux's shm_open(3) man-page: + * For portable use, a shared memory object should be identified + * by a name of the form /somename;" + */ +g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%s", getpid(), +backend_name); + +fd = shm_open(shm_nam
[PATCH for-9.1 v2 06/11] vhost-user: enable frontends on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable vhost-user frontends for any POSIX system. In vhost_net.c we use VHOST_FILE_UNBIND which is defined in a Linux specific header, let's define it for other systems as well. Signed-off-by: Stefano Garzarella --- meson.build| 1 - hw/net/vhost_net.c | 5 + hw/block/Kconfig | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index c9c3217ba4..c19d51501a 100644 --- a/meson.build +++ b/meson.build @@ -151,7 +151,6 @@ have_tpm = get_option('tpm') \ # vhost have_vhost_user = get_option('vhost_user') \ - .disable_auto_if(host_os != 'linux') \ .require(host_os != 'windows', error_message: 'vhost-user is not available on Windows').allowed() have_vhost_vdpa = get_option('vhost_vdpa') \ diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e8e1661646..b8a59b9e90 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -34,8 +34,13 @@ #include "standard-headers/linux/virtio_ring.h" #include "hw/virtio/vhost.h" #include "hw/virtio/virtio-bus.h" +#if defined(__linux__) #include "linux-headers/linux/vhost.h" +#endif +#ifndef VHOST_FILE_UNBIND +#define VHOST_FILE_UNBIND -1 +#endif /* Features supported by host kernel. */ static const int kernel_feature_bits[] = { diff --git a/hw/block/Kconfig b/hw/block/Kconfig index 9e8f28f982..29ee09e434 100644 --- a/hw/block/Kconfig +++ b/hw/block/Kconfig @@ -40,7 +40,7 @@ config VHOST_USER_BLK bool # Only PCI devices are provided for now default y if VIRTIO_PCI -depends on VIRTIO && VHOST_USER && LINUX +depends on VIRTIO && VHOST_USER config SWIM bool -- 2.44.0
[PATCH for-9.1 v2 05/11] contrib/vhost-user-blk: fix bind() using the right size of the address
On macOS passing `-s /tmp/vhost.socket` parameter to the vhost-user-blk application, the bind was done on `/tmp/vhost.socke` pathname, missing the last character. This sounds like one of the portability problems described in the unix(7) manpage: Pathname sockets When binding a socket to a pathname, a few rules should be observed for maximum portability and ease of coding: ⢠The pathname in sun_path should be null-terminated. ⢠The length of the pathname, including the terminating null byte, should not exceed the size of sun_path. ⢠The addrlen argument that describes the enclosing sockaddr_un structure should have a value of at least: offsetof(struct sockaddr_un, sun_path) + strlen(addr.sun_path)+1 or, more simply, addrlen can be specified as sizeof(struct sockaddr_un). So let's follow the last advice and simplify the code as well. Signed-off-by: Stefano Garzarella --- contrib/vhost-user-blk/vhost-user-blk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 89e5f11a64..a8ab9269a2 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -469,7 +469,6 @@ static int unix_sock_new(char *unix_fn) { int sock; struct sockaddr_un un; -size_t len; assert(unix_fn); @@ -481,10 +480,9 @@ static int unix_sock_new(char *unix_fn) un.sun_family = AF_UNIX; (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn); -len = sizeof(un.sun_family) + strlen(un.sun_path); (void)unlink(unix_fn); -if (bind(sock, (struct sockaddr *), len) < 0) { +if (bind(sock, (struct sockaddr *), sizeof(un)) < 0) { perror("bind"); goto fail; } -- 2.44.0
Re: [PATCH for-9.0 v3] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Fri, Mar 15, 2024 at 04:59:49PM +0100, Kevin Wolf wrote: VDUSE requires that virtqueues are first enabled before the DRIVER_OK status flag is set; with the current API of the kernel module, it is impossible to enable the opposite order in our block export code because userspace is not notified when a virtqueue is enabled. This requirement also mathces the normal initialisation order as done by the generic vhost code in QEMU. However, commit 6c482547 accidentally changed the order for vdpa-dev and broke access to VDUSE devices with this. This changes vdpa-dev to use the normal order again and use the standard vhost callback .vhost_set_vring_enable for this. VDUSE devices can be used with vdpa-dev again after this fix. vhost_net intentionally avoided enabling the vrings for vdpa and does this manually later while it does enable them for other vhost backends. Reflect this in the vhost_net code and return early for vdpa, so that the behaviour doesn't change for this device. Cc: qemu-sta...@nongnu.org Fixes: 6c4825476a4351530bcac17abab72295b75ffe98 Signed-off-by: Kevin Wolf --- v2: - Actually make use of the @enable parameter - Change vhost_net to preserve the current behaviour v3: - Updated trace point [Stefano] - Fixed typo in comment [Stefano] hw/net/vhost_net.c | 10 ++ hw/virtio/vdpa-dev.c | 5 + hw/virtio/vhost-vdpa.c | 29 ++--- hw/virtio/vhost.c | 8 +++- hw/virtio/trace-events | 2 +- 5 files changed, 45 insertions(+), 9 deletions(-) LGTM! Reviewed-by: Stefano Garzarella Thanks, Stefano diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e8e1661646..fd1a93701a 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable) VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; +/* + * vhost-vdpa network devices need to enable dataplane virtqueues after + * DRIVER_OK, so they can recover device state before starting dataplane. + * Because of that, we don't enable virtqueues here and leave it to + * net/vhost-vdpa.c. + */ +if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { +return 0; +} + nc->vring_enable = enable; if (vhost_ops && vhost_ops->vhost_set_vring_enable) { diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index eb9ecea83b..13e87f06f6 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp) s->dev.acked_features = vdev->guest_features; -ret = vhost_dev_start(>dev, vdev, false); +ret = vhost_dev_start(>dev, vdev, true); if (ret < 0) { error_setg_errno(errp, -ret, "Error starting vhost"); goto err_guest_notifiers; } -for (i = 0; i < s->dev.nvqs; ++i) { -vhost_vdpa_set_vring_ready(>vdpa, i); -} s->started = true; /* diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index ddae494ca8..31453466af 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -886,19 +886,41 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) return idx; } -int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx, + int enable) { struct vhost_dev *dev = v->dev; struct vhost_vring_state state = { .index = idx, -.num = 1, +.num = enable, }; int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, ); -trace_vhost_vdpa_set_vring_ready(dev, idx, r); +trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r); return r; } +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) +{ +struct vhost_vdpa *v = dev->opaque; +unsigned int i; +int ret; + +for (i = 0; i < dev->nvqs; ++i) { +ret = vhost_vdpa_set_vring_enable_one(v, i, enable); +if (ret < 0) { +return ret; +} +} + +return 0; +} + +int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +{ +return vhost_vdpa_set_vring_enable_one(v, idx, 1); +} + static int vhost_vdpa_set_config_call(struct vhost_dev *dev, int fd) { @@ -1514,6 +1536,7 @@ const VhostOps vdpa_ops = { .vhost_set_features = vhost_vdpa_set_features, .vhost_reset_device = vhost_vdpa_reset_device, .vhost_get_vq_index = vhost_vdpa_get_vq_index, +.vhost_set_vring_enable = vhost_vdpa_set_vring_enable, .vhost_get_config = vhost_vdpa_get_config, .vhost_set_config = vhost_vdpa_set_config, .vhost_requires_shm_log = NULL, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 2c9ac79468..a66186 100644 --- a/
Re: [PATCH for-9.0 v2] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Fri, Mar 15, 2024 at 03:03:31PM +0100, Kevin Wolf wrote: VDUSE requires that virtqueues are first enabled before the DRIVER_OK status flag is set; with the current API of the kernel module, it is impossible to enable the opposite order in our block export code because userspace is not notified when a virtqueue is enabled. This requirement also mathces the normal initialisation order as done by the generic vhost code in QEMU. However, commit 6c482547 accidentally changed the order for vdpa-dev and broke access to VDUSE devices with this. This changes vdpa-dev to use the normal order again and use the standard vhost callback .vhost_set_vring_enable for this. VDUSE devices can be used with vdpa-dev again after this fix. vhost_net intentionally avoided enabling the vrings for vdpa and does this manually later while it does enable them for other vhost backends. Reflect this in the vhost_net code and return early for vdpa, so that the behaviour doesn't change for this device. Cc: qemu-sta...@nongnu.org Fixes: 6c4825476a4351530bcac17abab72295b75ffe98 Signed-off-by: Kevin Wolf --- v2: - Actually make use of the @enable parameter - Change vhost_net to preserve the current behaviour hw/net/vhost_net.c | 10 ++ hw/virtio/vdpa-dev.c | 5 + hw/virtio/vhost-vdpa.c | 27 +-- hw/virtio/vhost.c | 8 +++- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e8e1661646..fd1a93701a 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -541,6 +541,16 @@ int vhost_set_vring_enable(NetClientState *nc, int enable) VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; +/* + * vhost-vdpa network devices need to enable dataplane virtqueues after + * DRIVER_OK, so they can recover device state before starting dataplane. + * Because of that, we don't enable virtqueues here and leave it to + * net/vhost-vdpa.c. + */ +if (nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) { +return 0; +} + nc->vring_enable = enable; if (vhost_ops && vhost_ops->vhost_set_vring_enable) { diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index eb9ecea83b..13e87f06f6 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp) s->dev.acked_features = vdev->guest_features; -ret = vhost_dev_start(>dev, vdev, false); +ret = vhost_dev_start(>dev, vdev, true); if (ret < 0) { error_setg_errno(errp, -ret, "Error starting vhost"); goto err_guest_notifiers; } -for (i = 0; i < s->dev.nvqs; ++i) { -vhost_vdpa_set_vring_ready(>vdpa, i); -} s->started = true; /* diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index ddae494ca8..401afac2f5 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -886,12 +886,13 @@ static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx) return idx; } -int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx, + int enable) { struct vhost_dev *dev = v->dev; struct vhost_vring_state state = { .index = idx, -.num = 1, +.num = enable, }; int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, ); After this line we now have: trace_vhost_vdpa_set_vring_ready(dev, idx, r); Should we rename it or move it to the new function? If we rename it, we should trace also `enable`. @@ -899,6 +900,27 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) return r; } +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) +{ +struct vhost_vdpa *v = dev->opaque; +unsigned int i; +int ret; + +for (i = 0; i < dev->nvqs; ++i) { +ret = vhost_vdpa_set_vring_enable_one(v, i, enable); +if (ret < 0) { +return ret; +} +} + +return 0; +} + +int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) +{ +return vhost_vdpa_set_vring_enable_one(v, idx, 1); +} + static int vhost_vdpa_set_config_call(struct vhost_dev *dev, int fd) { @@ -1514,6 +1536,7 @@ const VhostOps vdpa_ops = { .vhost_set_features = vhost_vdpa_set_features, .vhost_reset_device = vhost_vdpa_reset_device, .vhost_get_vq_index = vhost_vdpa_get_vq_index, +.vhost_set_vring_enable = vhost_vdpa_set_vring_enable, .vhost_get_config = vhost_vdpa_get_config, .vhost_set_config = vhost_vdpa_set_config, .vhost_requires_shm_log = NULL, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 2c9ac79468..decfb85184 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1984,7 +1984,13 @@ static int
Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()
On Thu, Feb 29, 2024 at 11:28:37AM +0100, Markus Armbruster wrote: Stefano Garzarella writes: On Wed, Feb 28, 2024 at 01:32:17PM +0100, Markus Armbruster wrote: Stefano Garzarella writes: [...] +# @shm: if true, shm_open(3) is used to create/open POSIX shared memory +# object; if false, an open(2) is used. (default: false) (since 9.0) +# Please format like this for consistency: Sure. # @shm: if true, shm_open(3) is used to create/open POSIX shared memory # object; if false, an open(2) is used (default: false) (since 9.0) I just noticed that I followed the property just above (@rom). Should we fix that one? Yes, please. Done: https://patchew.org/QEMU/20240229105826.16354-1-sgarz...@redhat.com/ Thanks, Stefano See commit a937b6aa739 (qapi: Reformat doc comments to conform to current conventions).
Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()
On Wed, Feb 28, 2024 at 01:32:17PM +0100, Markus Armbruster wrote: Stefano Garzarella writes: Add a new `shm` bool option for `-object memory-backend-file`. When this option is set to true, the POSIX shm_open(3) is used instead of open(2). So a file will not be created in the filesystem, but a "POSIX shared memory object" will be instantiated. In Linux this turns into a file in /dev/shm, but in other OSes this may not happen (for example in macOS or FreeBSD nothing is shown in any filesystem). This new feature is useful when we need to share guest memory with another process (e.g. vhost-user backend), but we don't have memfd_create() or any special filesystems (e.g. /dev/shm) available as in macOS. Signed-off-by: Stefano Garzarella --- I am not sure this is the best way to support shm_open() in QEMU. Other solutions I had in mind were: - create a new memory-backend-shm How would that look like? Would it involve duplicating code? I was looking at it just now, and apart from some boilerplate code to create the object, the rest in the end is pretty specific and a lot of things in memory-backend-file wouldn't be supported by memory-backend-shm anyway, so I'll give it a try for v2 by adding it. - extend memory-backend-memfd to use shm_open() on systems where memfd is not available (problem: shm_open wants a name to assign to the object, but we can do a workaround by using a random name and do the unlink right away) Hmm. Too much magic? I don't know... Yeah, I agree. Any preference/suggestion? [...] diff --git a/qapi/qom.json b/qapi/qom.json index 2a6e49365a..bfb01b909f 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -682,6 +682,9 @@ # @mem-path: the path to either a shared memory or huge page # filesystem mount Does this need adjustment? Good point. For now I think I will drop this patch and add memory-backend-shm in v2, but if I go back I will fix it! [...] # writable RAM instead of ROM, and want to set this property to 'off'. # (default: auto, since 8.2) # +# @shm: if true, shm_open(3) is used to create/open POSIX shared memory +# object; if false, an open(2) is used. (default: false) (since 9.0) +# Please format like this for consistency: Sure. # @shm: if true, shm_open(3) is used to create/open POSIX shared memory # object; if false, an open(2) is used (default: false) (since 9.0) I just noticed that I followed the property just above (@rom). Should we fix that one? Thanks, Stefano
Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()
On Wed, Feb 28, 2024 at 12:08:37PM +, Daniel P. Berrangé wrote: On Wed, Feb 28, 2024 at 12:47:59PM +0100, Stefano Garzarella wrote: Add a new `shm` bool option for `-object memory-backend-file`. When this option is set to true, the POSIX shm_open(3) is used instead of open(2). So a file will not be created in the filesystem, but a "POSIX shared memory object" will be instantiated. In Linux this turns into a file in /dev/shm, but in other OSes this may not happen (for example in macOS or FreeBSD nothing is shown in any filesystem). This new feature is useful when we need to share guest memory with another process (e.g. vhost-user backend), but we don't have memfd_create() or any special filesystems (e.g. /dev/shm) available as in macOS. Signed-off-by: Stefano Garzarella --- I am not sure this is the best way to support shm_open() in QEMU. Other solutions I had in mind were: - create a new memory-backend-shm - extend memory-backend-memfd to use shm_open() on systems where memfd is not available (problem: shm_open wants a name to assign to the object, but we can do a workaround by using a random name and do the unlink right away) IMHO, create a new memory-backend-shm, don't overload memory-backend-memfd, as this lets users choose between shm & memfd, even on Linux. Yeah, good point! I think there's enough of a consensus on adding memory-backend-shm, so I'm going to go toward that direction in v2. Thanks, Stefano
Re: [PATCH 9/9] hostmem-file: support POSIX shm_open()
On Wed, Feb 28, 2024 at 01:01:55PM +0100, David Hildenbrand wrote: On 28.02.24 12:47, Stefano Garzarella wrote: Add a new `shm` bool option for `-object memory-backend-file`. When this option is set to true, the POSIX shm_open(3) is used instead of open(2). So a file will not be created in the filesystem, but a "POSIX shared memory object" will be instantiated. In Linux this turns into a file in /dev/shm, but in other OSes this may not happen (for example in macOS or FreeBSD nothing is shown in any filesystem). This new feature is useful when we need to share guest memory with another process (e.g. vhost-user backend), but we don't have memfd_create() or any special filesystems (e.g. /dev/shm) available as in macOS. Signed-off-by: Stefano Garzarella --- I am not sure this is the best way to support shm_open() in QEMU. Other solutions I had in mind were: - create a new memory-backend-shm - extend memory-backend-memfd to use shm_open() on systems where memfd is not available (problem: shm_open wants a name to assign to the object, but we can do a workaround by using a random name and do the unlink right away) Any preference/suggestion? Both sound like reasonable options, and IMHO better than hostmem-file with things that are not necessarily "real" files. Yeah, I see. Regarding memory-backend-memfd, we similarly have to pass a name to memfd_create(), although for different purpose: "The name supplied in name is used as a filename and will be displayed as the target of the corresponding symbolic link in the directory /proc/self/fd/". So we simply pass TYPE_MEMORY_BACKEND_MEMFD. Okay, so I guess it must be unique only in the process, while for shm_open() it is global. Likely, memory-backend-shm that directly maps to shm_open() and only provides properties reasonable for shm_open() is the cleanest approach. So that would currently be my preference :) Thank you for your thoughts, I think I will go toward this direction (memory-backend-shm). It was also my first choice, but in order to have a working RFC right away, I modified memory-backend-file. Stefano
Re: [PATCH 0/9] vhost-user: support any POSIX system (tested on macOS and FreeBSD)
I just noticed that I forgot to add RFC tag and fix Author to match SOB in some patches, sorry! Stefano On Wed, Feb 28, 2024 at 12:48 PM Stefano Garzarella wrote: > > The vhost-user protocol is not really Linux-specific, so let's try support > QEMU's frontends and backends (including libvhost-user) in any POSIX system > with this series. The main use case is to be able to use virtio devices that > we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even > in non-Linux systems. > > The first 5 patches are more like fixes discovered at runtime on macOS or > FreeBSD that could go even independently of this series. > > Patches 6, 7, and 8 enable building of frontends and backends (including > libvhost-user) with associated code changes to succeed in compilation. > > The latest patch (9) adds support for the POSIX shm_open() API to create > shared memory which is identified by an fd that can be shared with vhost-user > backends. This is useful on those systems (like macOS) where we don't have > memfd_create() or special filesystems like "/dev/shm". > > I put the whole series in RFC because I have some questions especially in > patch 6 and 9, but in general any comment/suggestion/test are really welcome. > > Maybe the first 5 patches can go separately, but I only discovered those > problems after testing patches 6 - 9, so I have included them in this series > for now. Please let me know if you prefer that I send them separately. > > For now I tested this series using vhost-user-blk and QSD on > macOS Sonoma 14.3.1 (aarch64), FreeBSD 14 (x86_64), and Fedora 39 (x86_64) > in this way: > > - Start vhost-user-blk or QSD (same commands for all systems) > > vhost-user-blk -s /tmp/vhost.socket \ > -b Fedora-Cloud-Base-39-1.5.x86_64.raw > > qemu-storage-daemon \ > --blockdev > file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \ > --blockdev qcow2,file=file,node-name=qcow2 \ > --export > vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on > > - macOS (aarch64): start QEMU (using hvf accelerator) > > qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \ > -drive > file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \ > -device virtio-net-device,netdev=net0 -netdev user,id=net0 \ > -device ramfb -device usb-ehci -device usb-kbd \ > -object > memory-backend-file,mem-path="/mem0",shm=on,share=on,id=mem,size=512M \ > -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \ > -chardev socket,id=char0,path=/tmp/vhost.socket > > - FreeBSD (x86_64): start QEMU (no accelerators available) > > qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \ > -object > memory-backend-file,mem-path="/mem0",shm=on,share=on,id=mem,size="512M" \ > -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ > -chardev socket,id=char0,path=/tmp/vhost.socket > > - Fedora (x86_64): start QEMU (using kvm accelerator) > > qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \ > -object > memory-backend-file,mem-path="/mem0",shm=on,share=on,id=mem,size="512M" \ > -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ > -chardev socket,id=char0,path=/tmp/vhost.socket > > Thanks, > Stefano > > Stefano Garzarella (9): > libvhost-user: set msg.msg_control to NULL when it is empty > libvhost-user: fail vu_message_write() if sendmsg() is failing > libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported > vhost-user-server: don't abort if we can't set fd non-blocking > contrib/vhost-user-blk: fix bind() using the right size of the address > vhost-user: enable frontends on any POSIX system > libvhost-user: enable it on any POSIX system > contrib/vhost-user-blk: enabled it on any POSIX system > hostmem-file: support POSIX shm_open() > > meson.build | 5 +- > qapi/qom.json | 4 ++ > subprojects/libvhost-user/libvhost-user.h | 2 +- > backends/hostmem-file.c | 57 - > contrib/vhost-user-blk/vhost-user-blk.c | 23 +-- > hw/net/vhost_net.c| 8 ++- > subprojects/libvhost-user/libvhost-user.c | 76 ++- > util/vhost-user-server.c | 6 +- > backends/meson.build | 2 +- > hw/block/Kconfig | 2 +- > qemu-options.hx | 10 ++- > util/meson.build | 4 +- > 12 files changed, 179 insertions(+), 20 deletions(-) > > -- > 2.43.2 >
[PATCH 6/9] vhost-user: enable frontends on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable vhost-user frontends for any POSIX system. In vhost_net.c we use VHOST_FILE_UNBIND which is defined in a Linux specific header, let's define it for other systems as well. Signed-off-by: Stefano Garzarella --- If we want to be more conservative, maybe we can leave `have_vhost_user` auto only on linux. I removed it more to test with CI if we have any problems in the other POSIX systems. In hw/net/vhost_net.c maybe we can just do: #ifndef VHOST_FILE_UNBIND #define VHOST_FILE_UNBIND -1 #endif Any suggestion? Thanks, Stefano --- meson.build| 1 - hw/net/vhost_net.c | 8 +++- hw/block/Kconfig | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 0ef1654e86..462f2d7593 100644 --- a/meson.build +++ b/meson.build @@ -151,7 +151,6 @@ have_tpm = get_option('tpm') \ # vhost have_vhost_user = get_option('vhost_user') \ - .disable_auto_if(host_os != 'linux') \ .require(host_os != 'windows', error_message: 'vhost-user is not available on Windows').allowed() have_vhost_vdpa = get_option('vhost_vdpa') \ diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index e8e1661646..346ef74eb1 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -34,8 +34,14 @@ #include "standard-headers/linux/virtio_ring.h" #include "hw/virtio/vhost.h" #include "hw/virtio/virtio-bus.h" -#include "linux-headers/linux/vhost.h" +#if defined(__linux__) +#include "linux-headers/linux/vhost.h" +#else +#ifndef VHOST_FILE_UNBIND +#define VHOST_FILE_UNBIND -1 +#endif +#endif /* Features supported by host kernel. */ static const int kernel_feature_bits[] = { diff --git a/hw/block/Kconfig b/hw/block/Kconfig index 9e8f28f982..29ee09e434 100644 --- a/hw/block/Kconfig +++ b/hw/block/Kconfig @@ -40,7 +40,7 @@ config VHOST_USER_BLK bool # Only PCI devices are provided for now default y if VIRTIO_PCI -depends on VIRTIO && VHOST_USER && LINUX +depends on VIRTIO && VHOST_USER config SWIM bool -- 2.43.2
[PATCH 4/9] vhost-user-server: don't abort if we can't set fd non-blocking
From: Stefano Garzarella In vhost-user-server we set all fd received from the other peer in non-blocking mode. For some of them (e.g. memfd, shm_open, etc.) if we fail, it's not really a problem, because we don't use these fd with blocking operations, but only to map memory. In these cases a failure is not bad, so let's just report a warning instead of panicking if we fail to set some fd in non-blocking mode. This for example occurs in macOS where setting shm_open() fd non-blocking is failing (errno -25). Signed-off-by: Stefano Garzarella --- util/vhost-user-server.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c index 3bfb1ad3ec..064999f0b7 100644 --- a/util/vhost-user-server.c +++ b/util/vhost-user-server.c @@ -66,7 +66,11 @@ static void vmsg_unblock_fds(VhostUserMsg *vmsg) { int i; for (i = 0; i < vmsg->fd_num; i++) { -qemu_socket_set_nonblock(vmsg->fds[i]); +int ret = qemu_socket_try_set_nonblock(vmsg->fds[i]); +if (ret) { +warn_report("Failed to set fd %d nonblock for request %d: %s", +vmsg->fds[i], vmsg->request, strerror(-ret)); +} } } -- 2.43.2
[PATCH 3/9] libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported
libvhost-user will panic when receiving VHOST_USER_GET_INFLIGHT_FD message if MFD_ALLOW_SEALING is not defined, since it's not able to create a memfd. VHOST_USER_GET_INFLIGHT_FD is used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD is negotiated. So, let's mask that feature if the backend is not able to properly handle these messages. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 527a550345..5da8de838b 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -1610,6 +1610,16 @@ vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) features |= dev->iface->get_protocol_features(dev); } +/* + * If MFD_ALLOW_SEALING is not defined, we are not able to handle + * VHOST_USER_GET_INFLIGHT_FD messages, since we can't create a memfd. + * Those messages are used only if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD + * is negotiated. A device implementation can enable it, so let's mask + * it to avoid a runtime panic. + */ +#ifndef MFD_ALLOW_SEALING +features &= ~(1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD); +#endif vmsg_set_reply_u64(vmsg, features); return true; } -- 2.43.2
[PATCH 5/9] contrib/vhost-user-blk: fix bind() using the right size of the address
From: Stefano Garzarella On macOS passing `-s /tmp/vhost.socket` parameter to the vhost-user-blk application, the bind was done on `/tmp/vhost.socke` pathname, missing the last character. This sounds like one of the portability problems described in the unix(7) manpage: Pathname sockets When binding a socket to a pathname, a few rules should be observed for maximum portability and ease of coding: ⢠The pathname in sun_path should be null-terminated. ⢠The length of the pathname, including the terminating null byte, should not exceed the size of sun_path. ⢠The addrlen argument that describes the enclosing sockaddr_un structure should have a value of at least: offsetof(struct sockaddr_un, sun_path) + strlen(addr.sun_path)+1 or, more simply, addrlen can be specified as sizeof(struct sockaddr_un). So let's follow the last advice and simplify the code as well. Signed-off-by: Stefano Garzarella --- contrib/vhost-user-blk/vhost-user-blk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 89e5f11a64..a8ab9269a2 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -469,7 +469,6 @@ static int unix_sock_new(char *unix_fn) { int sock; struct sockaddr_un un; -size_t len; assert(unix_fn); @@ -481,10 +480,9 @@ static int unix_sock_new(char *unix_fn) un.sun_family = AF_UNIX; (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn); -len = sizeof(un.sun_family) + strlen(un.sun_path); (void)unlink(unix_fn); -if (bind(sock, (struct sockaddr *), len) < 0) { +if (bind(sock, (struct sockaddr *), sizeof(un)) < 0) { perror("bind"); goto fail; } -- 2.43.2
[PATCH 7/9] libvhost-user: enable it on any POSIX system
The vhost-user protocol is not really Linux-specific so let's enable libvhost-user for any POSIX system. Compiling it on macOS and FreeBSD some problems came up: - avoid to include linux/vhost.h which is avaibale only on Linux (vhost_types.h contains many of the things we need) - macOS doesn't provide sys/endian.h, so let's define them (note: libvhost-user doesn't include qemu's headers, so we can't use use "qemu/bswap.h") - define eventfd_[write|read] as write/read wrapper when system doesn't provide those (e.g. macOS) - copy SEAL defines from include/qemu/memfd.h to make the code works on FreeBSD where MFD_ALLOW_SEALING is defined - define MAP_NORESERVE if it's not defined (e.g. on FreeBSD) Signed-off-by: Stefano Garzarella --- meson.build | 2 +- subprojects/libvhost-user/libvhost-user.h | 2 +- subprojects/libvhost-user/libvhost-user.c | 60 +-- 3 files changed, 59 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index 462f2d7593..af15178969 100644 --- a/meson.build +++ b/meson.build @@ -3188,7 +3188,7 @@ endif config_host_data.set('CONFIG_FDT', fdt.found()) vhost_user = not_found -if host_os == 'linux' and have_vhost_user +if have_vhost_user libvhost_user = subproject('libvhost-user') vhost_user = libvhost_user.get_variable('vhost_user_dep') endif diff --git a/subprojects/libvhost-user/libvhost-user.h b/subprojects/libvhost-user/libvhost-user.h index c2352904f0..e684b999b4 100644 --- a/subprojects/libvhost-user/libvhost-user.h +++ b/subprojects/libvhost-user/libvhost-user.h @@ -18,9 +18,9 @@ #include #include #include -#include #include #include "standard-headers/linux/virtio_ring.h" +#include "standard-headers/linux/vhost_types.h" /* Based on qemu/hw/virtio/vhost-user.c */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 5da8de838b..1075fd3147 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -28,9 +28,7 @@ #include #include #include -#include #include -#include /* Necessary to provide VIRTIO_F_VERSION_1 on system * with older linux headers. Must appear before @@ -39,8 +37,8 @@ #include "standard-headers/linux/virtio_config.h" #if defined(__linux__) +#include #include -#include #include #include @@ -50,6 +48,62 @@ #endif +#if defined(__APPLE__) && (__MACH__) +#include +#define htobe16(x) OSSwapHostToBigInt16(x) +#define htole16(x) OSSwapHostToLittleInt16(x) +#define be16toh(x) OSSwapBigToHostInt16(x) +#define le16toh(x) OSSwapLittleToHostInt16(x) + +#define htobe32(x) OSSwapHostToBigInt32(x) +#define htole32(x) OSSwapHostToLittleInt32(x) +#define be32toh(x) OSSwapBigToHostInt32(x) +#define le32toh(x) OSSwapLittleToHostInt32(x) + +#define htobe64(x) OSSwapHostToBigInt64(x) +#define htole64(x) OSSwapHostToLittleInt64(x) +#define be64toh(x) OSSwapBigToHostInt64(x) +#define le64toh(x) OSSwapLittleToHostInt64(x) +#endif + +#ifdef CONFIG_EVENTFD +#include +#else +#define eventfd_t uint64_t + +int eventfd_write(int fd, eventfd_t value) +{ +return (write(fd, , sizeof(value)) == sizeof(value)) ? 0 : -1; +} + +int eventfd_read(int fd, eventfd_t *value) +{ +return (read(fd, value, sizeof(*value)) == sizeof(*value)) ? 0 : -1; +} +#endif + +#ifdef MFD_ALLOW_SEALING +#include + +#ifndef F_LINUX_SPECIFIC_BASE +#define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS +#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE0x0008 /* prevent writes */ +#endif +#endif + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + #include "include/atomic.h" #include "libvhost-user.h" -- 2.43.2
[PATCH 9/9] hostmem-file: support POSIX shm_open()
Add a new `shm` bool option for `-object memory-backend-file`. When this option is set to true, the POSIX shm_open(3) is used instead of open(2). So a file will not be created in the filesystem, but a "POSIX shared memory object" will be instantiated. In Linux this turns into a file in /dev/shm, but in other OSes this may not happen (for example in macOS or FreeBSD nothing is shown in any filesystem). This new feature is useful when we need to share guest memory with another process (e.g. vhost-user backend), but we don't have memfd_create() or any special filesystems (e.g. /dev/shm) available as in macOS. Signed-off-by: Stefano Garzarella --- I am not sure this is the best way to support shm_open() in QEMU. Other solutions I had in mind were: - create a new memory-backend-shm - extend memory-backend-memfd to use shm_open() on systems where memfd is not available (problem: shm_open wants a name to assign to the object, but we can do a workaround by using a random name and do the unlink right away) Any preference/suggestion? Thanks, Stefano --- qapi/qom.json | 4 +++ backends/hostmem-file.c | 57 - backends/meson.build| 2 +- qemu-options.hx | 10 +++- 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/qapi/qom.json b/qapi/qom.json index 2a6e49365a..bfb01b909f 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -682,6 +682,9 @@ # writable RAM instead of ROM, and want to set this property to 'off'. # (default: auto, since 8.2) # +# @shm: if true, shm_open(3) is used to create/open POSIX shared memory +# object; if false, an open(2) is used. (default: false) (since 9.0) +# # Since: 2.1 ## { 'struct': 'MemoryBackendFileProperties', @@ -692,6 +695,7 @@ 'mem-path': 'str', '*pmem': { 'type': 'bool', 'if': 'CONFIG_LIBPMEM' }, '*readonly': 'bool', +'*shm': 'bool', '*rom': 'OnOffAuto' } } ## diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index ac3e433cbd..9d60375c1f 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -34,6 +34,7 @@ struct HostMemoryBackendFile { bool is_pmem; bool readonly; OnOffAuto rom; +bool shm; }; static bool @@ -86,7 +87,37 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) ram_flags |= fb->rom == ON_OFF_AUTO_ON ? RAM_READONLY : 0; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; ram_flags |= fb->is_pmem ? RAM_PMEM : 0; +/* TODO: check if this should be enabled if shm is enabled */ ram_flags |= RAM_NAMED_FILE; + +if (fb->shm) { +mode_t mode = S_IRUSR | S_IWUSR; +int fd, oflag = 0; + +oflag |= fb->readonly ? O_RDONLY : O_RDWR; +oflag |= O_CREAT; + +fd = shm_open(fb->mem_path, oflag, mode); +if (fd < 0) { +error_setg_errno(errp, errno, + "failed to create POSIX shared memory"); +return false; +} + +if (ftruncate(fd, backend->size) == -1) { +error_setg_errno(errp, errno, + "failed to resize POSIX shared memory to %" PRIu64, + backend->size); +shm_unlink(fb->mem_path); +return false; +} + +return memory_region_init_ram_from_fd(>mr, OBJECT(backend), + name, backend->size, ram_flags, + fd, fb->offset, errp); + +} + return memory_region_init_ram_from_file(>mr, OBJECT(backend), name, backend->size, fb->align, ram_flags, fb->mem_path, fb->offset, errp); @@ -254,17 +285,36 @@ static void file_memory_backend_set_rom(Object *obj, Visitor *v, visit_type_OnOffAuto(v, name, >rom, errp); } +static bool file_memory_backend_get_shm(Object *obj, Error **errp) +{ +return MEMORY_BACKEND_FILE(obj)->shm; +} + +static void file_memory_backend_set_shm(Object *obj, bool value, + Error **errp) +{ +MEMORY_BACKEND_FILE(obj)->shm = value; +} + static void file_backend_unparent(Object *obj) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(obj); -if (host_memory_backend_mr_inited(backend) && fb->discard_data) { +if (!host_memory_backend_mr_inited(backend)) { +return; +} + +if (fb->discard_data) { void *ptr = memory_region_get_ram_ptr(>mr); uint64_t sz = memory_region_size(>mr); qemu_madvise(ptr, sz, QEMU_MADV_REMOVE); } + +if (fb->shm) { +shm_unlink(fb->mem_path); +} } static void @@ -300,6 +350,11 @@ file_backen
[PATCH 8/9] contrib/vhost-user-blk: enabled it on any POSIX system
Let's make the code more portable by using the "qemu/bswap.h" API and adding defines from block/file-posix.c to support O_DIRECT in other systems (e.g. macOS). vhost-user-server.c is a dependency, let's enable it for any POSIX system. Signed-off-by: Stefano Garzarella --- meson.build | 2 -- contrib/vhost-user-blk/vhost-user-blk.c | 19 +-- util/meson.build| 4 +++- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index af15178969..8afda86d3d 100644 --- a/meson.build +++ b/meson.build @@ -1954,8 +1954,6 @@ has_statx = cc.has_header_symbol('sys/stat.h', 'STATX_BASIC_STATS', prefix: gnu_ has_statx_mnt_id = cc.has_header_symbol('sys/stat.h', 'STATX_MNT_ID', prefix: gnu_source_prefix) have_vhost_user_blk_server = get_option('vhost_user_blk_server') \ - .require(host_os == 'linux', - error_message: 'vhost_user_blk_server requires linux') \ .require(have_vhost_user, error_message: 'vhost_user_blk_server requires vhost-user support') \ .disable_auto_if(not have_tools and not have_system) \ diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index a8ab9269a2..462e584857 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -16,6 +16,7 @@ */ #include "qemu/osdep.h" +#include "qemu/bswap.h" #include "standard-headers/linux/virtio_blk.h" #include "libvhost-user-glib.h" @@ -24,6 +25,20 @@ #include #endif +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + enum { VHOST_USER_BLK_MAX_QUEUES = 8, }; @@ -267,13 +282,13 @@ static int vub_virtio_process_req(VubDev *vdev_blk, req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base; in_num--; -type = le32toh(req->out->type); +type = le32_to_cpu(req->out->type); switch (type & ~VIRTIO_BLK_T_BARRIER) { case VIRTIO_BLK_T_IN: case VIRTIO_BLK_T_OUT: { ssize_t ret = 0; bool is_write = type & VIRTIO_BLK_T_OUT; -req->sector_num = le64toh(req->out->sector); +req->sector_num = le64_to_cpu(req->out->sector); if (is_write) { ret = vub_writev(req, >out_sg[1], out_num); } else { diff --git a/util/meson.build b/util/meson.build index 0ef9886be0..f52682ce96 100644 --- a/util/meson.build +++ b/util/meson.build @@ -113,10 +113,12 @@ if have_block util_ss.add(files('filemonitor-stub.c')) endif if host_os == 'linux' -util_ss.add(files('vhost-user-server.c'), vhost_user) util_ss.add(files('vfio-helpers.c')) util_ss.add(files('chardev_open.c')) endif + if host_os != 'windows' +util_ss.add(files('vhost-user-server.c'), vhost_user) + endif endif if cpu == 'aarch64' -- 2.43.2
[PATCH 0/9] vhost-user: support any POSIX system (tested on macOS and FreeBSD)
The vhost-user protocol is not really Linux-specific, so let's try support QEMU's frontends and backends (including libvhost-user) in any POSIX system with this series. The main use case is to be able to use virtio devices that we don't have built-in in QEMU (e.g. virtiofsd, vhost-user-vsock, etc.) even in non-Linux systems. The first 5 patches are more like fixes discovered at runtime on macOS or FreeBSD that could go even independently of this series. Patches 6, 7, and 8 enable building of frontends and backends (including libvhost-user) with associated code changes to succeed in compilation. The latest patch (9) adds support for the POSIX shm_open() API to create shared memory which is identified by an fd that can be shared with vhost-user backends. This is useful on those systems (like macOS) where we don't have memfd_create() or special filesystems like "/dev/shm". I put the whole series in RFC because I have some questions especially in patch 6 and 9, but in general any comment/suggestion/test are really welcome. Maybe the first 5 patches can go separately, but I only discovered those problems after testing patches 6 - 9, so I have included them in this series for now. Please let me know if you prefer that I send them separately. For now I tested this series using vhost-user-blk and QSD on macOS Sonoma 14.3.1 (aarch64), FreeBSD 14 (x86_64), and Fedora 39 (x86_64) in this way: - Start vhost-user-blk or QSD (same commands for all systems) vhost-user-blk -s /tmp/vhost.socket \ -b Fedora-Cloud-Base-39-1.5.x86_64.raw qemu-storage-daemon \ --blockdev file,filename=Fedora-Cloud-Base-39-1.5.x86_64.qcow2,node-name=file \ --blockdev qcow2,file=file,node-name=qcow2 \ --export vhost-user-blk,addr.type=unix,addr.path=/tmp/vhost.socket,id=vub,num-queues=1,node-name=qcow2,writable=on - macOS (aarch64): start QEMU (using hvf accelerator) qemu-system-aarch64 -smp 2 -cpu host -M virt,accel=hvf,memory-backend=mem \ -drive file=./build/pc-bios/edk2-aarch64-code.fd,if=pflash,format=raw,readonly=on \ -device virtio-net-device,netdev=net0 -netdev user,id=net0 \ -device ramfb -device usb-ehci -device usb-kbd \ -object memory-backend-file,mem-path="/mem0",shm=on,share=on,id=mem,size=512M \ -device vhost-user-blk-pci,num-queues=1,disable-legacy=on,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - FreeBSD (x86_64): start QEMU (no accelerators available) qemu-system-x86_64 -smp 2 -M q35,memory-backend=mem \ -object memory-backend-file,mem-path="/mem0",shm=on,share=on,id=mem,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket - Fedora (x86_64): start QEMU (using kvm accelerator) qemu-system-x86_64 -smp 2 -M q35,accel=kvm,memory-backend=mem \ -object memory-backend-file,mem-path="/mem0",shm=on,share=on,id=mem,size="512M" \ -device vhost-user-blk-pci,num-queues=1,chardev=char0 \ -chardev socket,id=char0,path=/tmp/vhost.socket Thanks, Stefano Stefano Garzarella (9): libvhost-user: set msg.msg_control to NULL when it is empty libvhost-user: fail vu_message_write() if sendmsg() is failing libvhost-user: mask F_INFLIGHT_SHMFD if memfd is not supported vhost-user-server: don't abort if we can't set fd non-blocking contrib/vhost-user-blk: fix bind() using the right size of the address vhost-user: enable frontends on any POSIX system libvhost-user: enable it on any POSIX system contrib/vhost-user-blk: enabled it on any POSIX system hostmem-file: support POSIX shm_open() meson.build | 5 +- qapi/qom.json | 4 ++ subprojects/libvhost-user/libvhost-user.h | 2 +- backends/hostmem-file.c | 57 - contrib/vhost-user-blk/vhost-user-blk.c | 23 +-- hw/net/vhost_net.c| 8 ++- subprojects/libvhost-user/libvhost-user.c | 76 ++- util/vhost-user-server.c | 6 +- backends/meson.build | 2 +- hw/block/Kconfig | 2 +- qemu-options.hx | 10 ++- util/meson.build | 4 +- 12 files changed, 179 insertions(+), 20 deletions(-) -- 2.43.2
[PATCH 2/9] libvhost-user: fail vu_message_write() if sendmsg() is failing
In vu_message_write() we use sendmsg() to send the message header, then a write() to send the payload. If sendmsg() fails we should avoid sending the payload, since we were unable to send the header. Discovered before fixing the issue with the previous patch, where sendmsg() failed on macOS due to wrong parameters, but the frontend still sent the payload which the backend incorrectly interpreted as a wrong header. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 5 + 1 file changed, 5 insertions(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index b9c18eee53..527a550345 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -394,6 +394,11 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) rc = sendmsg(conn_fd, , 0); } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); +if (rc <= 0) { +vu_panic(dev, "Error while writing: %s", strerror(errno)); +return false; +} + if (vmsg->size) { do { if (vmsg->data) { -- 2.43.2
[PATCH 1/9] libvhost-user: set msg.msg_control to NULL when it is empty
From: Stefano Garzarella On some OS (e.g. macOS) sendmsg() returns -1 (errno EINVAL) if the `struct msghdr` has the field `msg_controllen` set to 0, but `msg_control` is not NULL. Signed-off-by: Stefano Garzarella --- subprojects/libvhost-user/libvhost-user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index a3b158c671..b9c18eee53 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -387,6 +387,7 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) memcpy(CMSG_DATA(cmsg), vmsg->fds, fdsize); } else { msg.msg_controllen = 0; +msg.msg_control = NULL; } do { -- 2.43.2
Re: Re: [PATCH] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Wed, Feb 07, 2024 at 11:18:54AM +0100, Kevin Wolf wrote: Am 06.02.2024 um 17:44 hat Eugenio Perez Martin geschrieben: On Mon, Feb 5, 2024 at 2:49 PM Kevin Wolf wrote: > > Am 05.02.2024 um 13:22 hat Eugenio Perez Martin geschrieben: > > On Fri, Feb 2, 2024 at 2:25 PM Kevin Wolf wrote: > > > > > > VDUSE requires that virtqueues are first enabled before the DRIVER_OK > > > status flag is set; with the current API of the kernel module, it is > > > impossible to enable the opposite order in our block export code because > > > userspace is not notified when a virtqueue is enabled. > > > > > > This requirement also mathces the normal initialisation order as done by > > > the generic vhost code in QEMU. However, commit 6c482547 accidentally > > > changed the order for vdpa-dev and broke access to VDUSE devices with > > > this. > > > > > > This changes vdpa-dev to use the normal order again and use the standard > > > vhost callback .vhost_set_vring_enable for this. VDUSE devices can be > > > used with vdpa-dev again after this fix. > > > > > > Cc: qemu-sta...@nongnu.org > > > Fixes: 6c4825476a4351530bcac17abab72295b75ffe98 > > > Signed-off-by: Kevin Wolf > > > --- > > > hw/virtio/vdpa-dev.c | 5 + > > > hw/virtio/vhost-vdpa.c | 17 + > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c > > > index eb9ecea83b..13e87f06f6 100644 > > > --- a/hw/virtio/vdpa-dev.c > > > +++ b/hw/virtio/vdpa-dev.c > > > @@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp) > > > > > > s->dev.acked_features = vdev->guest_features; > > > > > > -ret = vhost_dev_start(>dev, vdev, false); > > > +ret = vhost_dev_start(>dev, vdev, true); > > > if (ret < 0) { > > > error_setg_errno(errp, -ret, "Error starting vhost"); > > > goto err_guest_notifiers; > > > } > > > -for (i = 0; i < s->dev.nvqs; ++i) { > > > -vhost_vdpa_set_vring_ready(>vdpa, i); > > > -} > > > s->started = true; > > > > > > /* > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > > > index 3a43beb312..c4574d56c5 100644 > > > --- a/hw/virtio/vhost-vdpa.c > > > +++ b/hw/virtio/vhost-vdpa.c > > > @@ -904,6 +904,22 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) > > > return r; > > > } > > > > > > +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) > > > +{ > > > +struct vhost_vdpa *v = dev->opaque; > > > +unsigned int i; > > > +int ret; > > > + > > > +for (i = 0; i < dev->nvqs; ++i) { > > > +ret = vhost_vdpa_set_vring_ready(v, i); > > > +if (ret < 0) { > > > +return ret; > > > +} > > > +} > > > + > > > +return 0; > > > +} > > > + > > > static int vhost_vdpa_set_config_call(struct vhost_dev *dev, > > > int fd) > > > { > > > @@ -1524,6 +1540,7 @@ const VhostOps vdpa_ops = { > > > .vhost_set_features = vhost_vdpa_set_features, > > > .vhost_reset_device = vhost_vdpa_reset_device, > > > .vhost_get_vq_index = vhost_vdpa_get_vq_index, > > > +.vhost_set_vring_enable = vhost_vdpa_set_vring_enable, > > > .vhost_get_config = vhost_vdpa_get_config, > > > .vhost_set_config = vhost_vdpa_set_config, > > > .vhost_requires_shm_log = NULL, > > > > vhost-vdpa net enables CVQ before dataplane ones to configure all the > > device in the destination of a live migration. To go back again to > > this callback would cause the device to enable the dataplane before > > virtqueues are configured again. > > Not that it makes a difference, but I don't think it would actually be > going back. Even before your commit 6c482547, we were not making use of > the generic callback but just called the function in a slightly > different place (but less different than after commit 6c482547). > > But anyway... Why don't the other vhost backend need the same for > vhost-net then? Do they just not support live migration? They don't support control virtqueue. More specifically, control virtqueue is handled directly in QEMU. So the network device already has to special case vdpa instead of using the same code for all vhost backends? :-/ > I don't know the code well enough to say where the problem is, but if > vhost-vdpa networking code relies on the usual vhost operations not > being implemented and bypasses VhostOps to replace it, that sounds like > a design problem to me. I don't follow this. What vhost operation is expected not to be implemented? You were concerned about implementing .vhost_set_vring_enable in vdpa_ops like my patch does. So it seems that the networking code requires that it is not implemented? On the other hand, for vhost-vdpa, the callback seems to be called in exactly the right place where virtqueues need to be enabled, like for other vhost devices. > Maybe
Re: Re: Re: [PATCH] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Wed, Feb 07, 2024 at 04:05:29PM +0800, Cindy Lu wrote: Jason Wang 于2024年2月7日周三 11:17写道: On Tue, Feb 6, 2024 at 4:31 PM Stefano Garzarella wrote: > > On Tue, Feb 06, 2024 at 10:47:40AM +0800, Jason Wang wrote: > >On Mon, Feb 5, 2024 at 6:51 PM Stefano Garzarella wrote: > >> > >> On Fri, Feb 02, 2024 at 02:25:21PM +0100, Kevin Wolf wrote: > >> >VDUSE requires that virtqueues are first enabled before the DRIVER_OK > >> >status flag is set; with the current API of the kernel module, it is > >> >impossible to enable the opposite order in our block export code because > >> >userspace is not notified when a virtqueue is enabled. > > > >Did this mean virtio-blk will enable a virtqueue after DRIVER_OK? > > It's not specific to virtio-blk, but to the generic vdpa device we have > in QEMU (i.e. vhost-vdpa-device). Yep, after commit > 6c4825476a4351530bcac17abab72295b75ffe98, virtqueues are enabled after > DRIVER_OK. Right. > > >Sepc is not clear about this and that's why we introduce > >VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK. > > Ah, I didn't know about this new feature. So after commit > 6c4825476a4351530bcac17abab72295b75ffe98 the vhost-vdpa-device is not > complying with the specification, right? Kind of, but as stated, it's just because spec is unclear about the behaviour. There's a chance that spec will explicitly support it in the future. > > > > >> > >> Yeah, IMHO the VDUSE protocol is missing a VDUSE_SET_VQ_READY message, > > > >I think you meant when VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is > >negotiated. > > At this point yes. But if VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not > negotiated, should we return an error in vhost-vdpa kernel module if > VHOST_VDPA_SET_VRING_ENABLE is called when DRIVER_OK is already set? I'm not sure if this can break some setups or not. It might be better to leave it as is? Without VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK, we don't know if parent support vq_ready after driver_ok. With VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK, we know parent support vq_ready after driver_ok. > > >If this is truth, it seems a little more complicated, for > >example the get_backend_features needs to be forward to the userspace? > > I'm not understanding, don't we already have VHOST_GET_BACKEND_FEATURES > for this? Or do you mean userspace on the VDUSE side? Yes, since in this case the parent is in the userspace, there's no way for VDUSE to know if user space supports vq_ready after driver_ok or not. As you may have noticed, we don't have a message for vq_ready which implies that vq_ready after driver_ok can't be supported. > > >This seems suboptimal to implement this in the spec first and then we > >can leverage the features. Or we can have another parameter for the > >ioctl that creates the vduse device. > > I got a little lost, though in vhost-user, the device can always expect > a vring_enable/disable, so I thought it was not complicated in VDUSE. Yes, the problem is assuming we have a message for vq_ready, there could be a "legacy" userspace that doesn't support that. So in that case, VDUSE needs to know if the userspace parent can support that or not. > > > > >> I'll start another thread about that, but in the meantime I agree that > >> we should fix QEMU since we need to work properly with old kernels as > >> well. > >> > >> > > >> >This requirement also mathces the normal initialisation order as done by > >> >the generic vhost code in QEMU. However, commit 6c482547 accidentally > >> >changed the order for vdpa-dev and broke access to VDUSE devices with > >> >this. > >> > > >> >This changes vdpa-dev to use the normal order again and use the standard > >> >vhost callback .vhost_set_vring_enable for this. VDUSE devices can be > >> >used with vdpa-dev again after this fix. > >> > >> I like this approach and the patch LGTM, but I'm a bit worried about > >> this function in hw/net/vhost_net.c: > >> > >> int vhost_set_vring_enable(NetClientState *nc, int enable) > >> { > >> VHostNetState *net = get_vhost_net(nc); > >> const VhostOps *vhost_ops = net->dev.vhost_ops; > >> > >> nc->vring_enable = enable; > >> > >> if (vhost_ops && vhost_ops->vhost_set_vring_enable) { > >> return vhost_ops->vhost_set_vring_enable(>dev, enable); > >> } > >> > >> return 0; > >> } > >> > >> @Eugenio, @Jason, should we change
Re: Re: Re: [PATCH] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Wed, Feb 07, 2024 at 11:17:34AM +0800, Jason Wang wrote: On Tue, Feb 6, 2024 at 4:31 PM Stefano Garzarella wrote: On Tue, Feb 06, 2024 at 10:47:40AM +0800, Jason Wang wrote: >On Mon, Feb 5, 2024 at 6:51 PM Stefano Garzarella wrote: >> >> On Fri, Feb 02, 2024 at 02:25:21PM +0100, Kevin Wolf wrote: >> >VDUSE requires that virtqueues are first enabled before the DRIVER_OK >> >status flag is set; with the current API of the kernel module, it is >> >impossible to enable the opposite order in our block export code because >> >userspace is not notified when a virtqueue is enabled. > >Did this mean virtio-blk will enable a virtqueue after DRIVER_OK? It's not specific to virtio-blk, but to the generic vdpa device we have in QEMU (i.e. vhost-vdpa-device). Yep, after commit 6c4825476a4351530bcac17abab72295b75ffe98, virtqueues are enabled after DRIVER_OK. Right. >Sepc is not clear about this and that's why we introduce >VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK. Ah, I didn't know about this new feature. So after commit 6c4825476a4351530bcac17abab72295b75ffe98 the vhost-vdpa-device is not complying with the specification, right? Kind of, but as stated, it's just because spec is unclear about the behaviour. There's a chance that spec will explicitly support it in the future. > >> >> Yeah, IMHO the VDUSE protocol is missing a VDUSE_SET_VQ_READY message, > >I think you meant when VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is >negotiated. At this point yes. But if VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not negotiated, should we return an error in vhost-vdpa kernel module if VHOST_VDPA_SET_VRING_ENABLE is called when DRIVER_OK is already set? I'm not sure if this can break some setups or not. It might be better to leave it as is? As I also answered in the kernel patch, IMHO we have to return an error, because any setups are broken anyway (see the problem we're fixing with this patch), so at this point it's better to return an error, so they don't spend time figuring out why the VDUSE device doesn't work. Without VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK, we don't know if parent support vq_ready after driver_ok. So we have to assume that it doesn't support it, to be more conservative, right? With VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK, we know parent support vq_ready after driver_ok. >If this is truth, it seems a little more complicated, for >example the get_backend_features needs to be forward to the userspace? I'm not understanding, don't we already have VHOST_GET_BACKEND_FEATURES for this? Or do you mean userspace on the VDUSE side? Yes, since in this case the parent is in the userspace, there's no way for VDUSE to know if user space supports vq_ready after driver_ok or not. As you may have noticed, we don't have a message for vq_ready which implies that vq_ready after driver_ok can't be supported. Yep, I see. >This seems suboptimal to implement this in the spec first and then we >can leverage the features. Or we can have another parameter for the >ioctl that creates the vduse device. I got a little lost, though in vhost-user, the device can always expect a vring_enable/disable, so I thought it was not complicated in VDUSE. Yes, the problem is assuming we have a message for vq_ready, there could be a "legacy" userspace that doesn't support that. So in that case, VDUSE needs to know if the userspace parent can support that or not. I think VDUSE needs to negotiate features (if it doesn't already) as vhost-user does with the backend. Also for new future functionality. > >> I'll start another thread about that, but in the meantime I agree that >> we should fix QEMU since we need to work properly with old kernels as >> well. >> >> > >> >This requirement also mathces the normal initialisation order as done by >> >the generic vhost code in QEMU. However, commit 6c482547 accidentally >> >changed the order for vdpa-dev and broke access to VDUSE devices with >> >this. >> > >> >This changes vdpa-dev to use the normal order again and use the standard >> >vhost callback .vhost_set_vring_enable for this. VDUSE devices can be >> >used with vdpa-dev again after this fix. >> >> I like this approach and the patch LGTM, but I'm a bit worried about >> this function in hw/net/vhost_net.c: >> >> int vhost_set_vring_enable(NetClientState *nc, int enable) >> { >> VHostNetState *net = get_vhost_net(nc); >> const VhostOps *vhost_ops = net->dev.vhost_ops; >> >> nc->vring_enable = enable; >> >> if (vhost_ops && vhost_ops->vhost_set_vring_enable) { >> return vhost_ops->vhost_set_vring_enable(>dev, enable); >>
Re: Re: [PATCH] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Tue, Feb 6, 2024 at 9:31 AM Stefano Garzarella wrote: > > On Tue, Feb 06, 2024 at 10:47:40AM +0800, Jason Wang wrote: > >On Mon, Feb 5, 2024 at 6:51 PM Stefano Garzarella > >wrote: > >> > >> On Fri, Feb 02, 2024 at 02:25:21PM +0100, Kevin Wolf wrote: > >> >VDUSE requires that virtqueues are first enabled before the DRIVER_OK > >> >status flag is set; with the current API of the kernel module, it is > >> >impossible to enable the opposite order in our block export code because > >> >userspace is not notified when a virtqueue is enabled. > > > >Did this mean virtio-blk will enable a virtqueue after DRIVER_OK? > > It's not specific to virtio-blk, but to the generic vdpa device we have > in QEMU (i.e. vhost-vdpa-device). Yep, after commit > 6c4825476a4351530bcac17abab72295b75ffe98, virtqueues are enabled after > DRIVER_OK. > > >Sepc is not clear about this and that's why we introduce > >VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK. > > Ah, I didn't know about this new feature. So after commit > 6c4825476a4351530bcac17abab72295b75ffe98 the vhost-vdpa-device is not > complying with the specification, right? > > > > >> > >> Yeah, IMHO the VDUSE protocol is missing a VDUSE_SET_VQ_READY message, > > > >I think you meant when VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is > >negotiated. > > At this point yes. But if VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not > negotiated, should we return an error in vhost-vdpa kernel module if > VHOST_VDPA_SET_VRING_ENABLE is called when DRIVER_OK is already set? I just sent a patch: https://lore.kernel.org/virtualization/20240206145154.118044-1-sgarz...@redhat.com/T/#u Then I discovered that we never check the return value of vhost_vdpa_set_vring_ready() in QEMU. I'll send a patch for that! Stefano > > >If this is truth, it seems a little more complicated, for > >example the get_backend_features needs to be forward to the userspace? > > I'm not understanding, don't we already have VHOST_GET_BACKEND_FEATURES > for this? Or do you mean userspace on the VDUSE side? > > >This seems suboptimal to implement this in the spec first and then we > >can leverage the features. Or we can have another parameter for the > >ioctl that creates the vduse device. > > I got a little lost, though in vhost-user, the device can always expect > a vring_enable/disable, so I thought it was not complicated in VDUSE. > > > > >> I'll start another thread about that, but in the meantime I agree that > >> we should fix QEMU since we need to work properly with old kernels as > >> well. > >> > >> > > >> >This requirement also mathces the normal initialisation order as done by > >> >the generic vhost code in QEMU. However, commit 6c482547 accidentally > >> >changed the order for vdpa-dev and broke access to VDUSE devices with > >> >this. > >> > > >> >This changes vdpa-dev to use the normal order again and use the standard > >> >vhost callback .vhost_set_vring_enable for this. VDUSE devices can be > >> >used with vdpa-dev again after this fix. > >> > >> I like this approach and the patch LGTM, but I'm a bit worried about > >> this function in hw/net/vhost_net.c: > >> > >> int vhost_set_vring_enable(NetClientState *nc, int enable) > >> { > >> VHostNetState *net = get_vhost_net(nc); > >> const VhostOps *vhost_ops = net->dev.vhost_ops; > >> > >> nc->vring_enable = enable; > >> > >> if (vhost_ops && vhost_ops->vhost_set_vring_enable) { > >> return vhost_ops->vhost_set_vring_enable(>dev, enable); > >> } > >> > >> return 0; > >> } > >> > >> @Eugenio, @Jason, should we change some things there if vhost-vdpa > >> implements the vhost_set_vring_enable callback? > > > >Eugenio may know more, I remember we need to enable cvq first for > >shadow virtqueue to restore some states. > > > >> > >> Do you remember why we didn't implement it from the beginning? > > > >It seems the vrings parameter is introduced after vhost-vdpa is > >implemented. > > Sorry, I mean why we didn't implement the vhost_set_vring_enable > callback for vhost-vdpa from the beginning. > > Thanks, > Stefano
Re: Re: [PATCH] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Tue, Feb 06, 2024 at 10:47:40AM +0800, Jason Wang wrote: On Mon, Feb 5, 2024 at 6:51 PM Stefano Garzarella wrote: On Fri, Feb 02, 2024 at 02:25:21PM +0100, Kevin Wolf wrote: >VDUSE requires that virtqueues are first enabled before the DRIVER_OK >status flag is set; with the current API of the kernel module, it is >impossible to enable the opposite order in our block export code because >userspace is not notified when a virtqueue is enabled. Did this mean virtio-blk will enable a virtqueue after DRIVER_OK? It's not specific to virtio-blk, but to the generic vdpa device we have in QEMU (i.e. vhost-vdpa-device). Yep, after commit 6c4825476a4351530bcac17abab72295b75ffe98, virtqueues are enabled after DRIVER_OK. Sepc is not clear about this and that's why we introduce VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK. Ah, I didn't know about this new feature. So after commit 6c4825476a4351530bcac17abab72295b75ffe98 the vhost-vdpa-device is not complying with the specification, right? Yeah, IMHO the VDUSE protocol is missing a VDUSE_SET_VQ_READY message, I think you meant when VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is negotiated. At this point yes. But if VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK is not negotiated, should we return an error in vhost-vdpa kernel module if VHOST_VDPA_SET_VRING_ENABLE is called when DRIVER_OK is already set? If this is truth, it seems a little more complicated, for example the get_backend_features needs to be forward to the userspace? I'm not understanding, don't we already have VHOST_GET_BACKEND_FEATURES for this? Or do you mean userspace on the VDUSE side? This seems suboptimal to implement this in the spec first and then we can leverage the features. Or we can have another parameter for the ioctl that creates the vduse device. I got a little lost, though in vhost-user, the device can always expect a vring_enable/disable, so I thought it was not complicated in VDUSE. I'll start another thread about that, but in the meantime I agree that we should fix QEMU since we need to work properly with old kernels as well. > >This requirement also mathces the normal initialisation order as done by >the generic vhost code in QEMU. However, commit 6c482547 accidentally >changed the order for vdpa-dev and broke access to VDUSE devices with >this. > >This changes vdpa-dev to use the normal order again and use the standard >vhost callback .vhost_set_vring_enable for this. VDUSE devices can be >used with vdpa-dev again after this fix. I like this approach and the patch LGTM, but I'm a bit worried about this function in hw/net/vhost_net.c: int vhost_set_vring_enable(NetClientState *nc, int enable) { VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; nc->vring_enable = enable; if (vhost_ops && vhost_ops->vhost_set_vring_enable) { return vhost_ops->vhost_set_vring_enable(>dev, enable); } return 0; } @Eugenio, @Jason, should we change some things there if vhost-vdpa implements the vhost_set_vring_enable callback? Eugenio may know more, I remember we need to enable cvq first for shadow virtqueue to restore some states. Do you remember why we didn't implement it from the beginning? It seems the vrings parameter is introduced after vhost-vdpa is implemented. Sorry, I mean why we didn't implement the vhost_set_vring_enable callback for vhost-vdpa from the beginning. Thanks, Stefano
Re: [PATCH] vdpa-dev: Fix initialisation order to restore VDUSE compatibility
On Fri, Feb 02, 2024 at 02:25:21PM +0100, Kevin Wolf wrote: VDUSE requires that virtqueues are first enabled before the DRIVER_OK status flag is set; with the current API of the kernel module, it is impossible to enable the opposite order in our block export code because userspace is not notified when a virtqueue is enabled. Yeah, IMHO the VDUSE protocol is missing a VDUSE_SET_VQ_READY message, I'll start another thread about that, but in the meantime I agree that we should fix QEMU since we need to work properly with old kernels as well. This requirement also mathces the normal initialisation order as done by the generic vhost code in QEMU. However, commit 6c482547 accidentally changed the order for vdpa-dev and broke access to VDUSE devices with this. This changes vdpa-dev to use the normal order again and use the standard vhost callback .vhost_set_vring_enable for this. VDUSE devices can be used with vdpa-dev again after this fix. I like this approach and the patch LGTM, but I'm a bit worried about this function in hw/net/vhost_net.c: int vhost_set_vring_enable(NetClientState *nc, int enable) { VHostNetState *net = get_vhost_net(nc); const VhostOps *vhost_ops = net->dev.vhost_ops; nc->vring_enable = enable; if (vhost_ops && vhost_ops->vhost_set_vring_enable) { return vhost_ops->vhost_set_vring_enable(>dev, enable); } return 0; } @Eugenio, @Jason, should we change some things there if vhost-vdpa implements the vhost_set_vring_enable callback? Do you remember why we didn't implement it from the beginning? Thanks, Stefano Cc: qemu-sta...@nongnu.org Fixes: 6c4825476a4351530bcac17abab72295b75ffe98 Signed-off-by: Kevin Wolf --- hw/virtio/vdpa-dev.c | 5 + hw/virtio/vhost-vdpa.c | 17 + 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/hw/virtio/vdpa-dev.c b/hw/virtio/vdpa-dev.c index eb9ecea83b..13e87f06f6 100644 --- a/hw/virtio/vdpa-dev.c +++ b/hw/virtio/vdpa-dev.c @@ -253,14 +253,11 @@ static int vhost_vdpa_device_start(VirtIODevice *vdev, Error **errp) s->dev.acked_features = vdev->guest_features; -ret = vhost_dev_start(>dev, vdev, false); +ret = vhost_dev_start(>dev, vdev, true); if (ret < 0) { error_setg_errno(errp, -ret, "Error starting vhost"); goto err_guest_notifiers; } -for (i = 0; i < s->dev.nvqs; ++i) { -vhost_vdpa_set_vring_ready(>vdpa, i); -} s->started = true; /* diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c index 3a43beb312..c4574d56c5 100644 --- a/hw/virtio/vhost-vdpa.c +++ b/hw/virtio/vhost-vdpa.c @@ -904,6 +904,22 @@ int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx) return r; } +static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable) +{ +struct vhost_vdpa *v = dev->opaque; +unsigned int i; +int ret; + +for (i = 0; i < dev->nvqs; ++i) { +ret = vhost_vdpa_set_vring_ready(v, i); +if (ret < 0) { +return ret; +} +} + +return 0; +} + static int vhost_vdpa_set_config_call(struct vhost_dev *dev, int fd) { @@ -1524,6 +1540,7 @@ const VhostOps vdpa_ops = { .vhost_set_features = vhost_vdpa_set_features, .vhost_reset_device = vhost_vdpa_reset_device, .vhost_get_vq_index = vhost_vdpa_get_vq_index, +.vhost_set_vring_enable = vhost_vdpa_set_vring_enable, .vhost_get_config = vhost_vdpa_get_config, .vhost_set_config = vhost_vdpa_set_config, .vhost_requires_shm_log = NULL, -- 2.43.0
Re: [PATCH] blkio: Respect memory-alignment for bounce buffer allocations
On Wed, Jan 31, 2024 at 06:31:40PM +0100, Kevin Wolf wrote: blkio_alloc_mem_region() requires that the requested buffer size is a multiple of the memory-alignment property. If it isn't, the allocation fails with a return value of -EINVAL. Fix the call in blkio_resize_bounce_pool() to make sure the requested size is properly aligned. I observed this problem with vhost-vdpa, which requires page aligned memory. As the virtio-blk device behind it still had 512 byte blocks, we got bs->bl.request_alignment = 512, but actually any request that needed a bounce buffer and was not aligned to 4k would fail without this fix. Suggested-by: Stefano Garzarella Signed-off-by: Kevin Wolf --- block/blkio.c | 3 +++ 1 file changed, 3 insertions(+) Thanks for fixinig this! Reviewed-by: Stefano Garzarella diff --git a/block/blkio.c b/block/blkio.c index 0a0a6c0f5f..b989617608 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -89,6 +89,9 @@ static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) /* Pad size to reduce frequency of resize calls */ bytes += 128 * 1024; +/* Align the pool size to avoid blkio_alloc_mem_region() failure */ +bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment); + WITH_QEMU_LOCK_GUARD(>blkio_lock) { int ret; -- 2.43.0
[PATCH v2 1/2] block/blkio: close the fd when blkio_connect() fails
libblkio drivers take ownership of `fd` only after a successful blkio_connect(), so if it fails, we are still the owners. Fixes: cad2ccc395 ("block/blkio: use qemu_open() to support fd passing for virtio-blk") Suggested-by: Hanna Czenczek Signed-off-by: Stefano Garzarella --- Notes: v2: - avoid to use `fd_supported` to track a valid fd [Hanna] block/blkio.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/block/blkio.c b/block/blkio.c index 8e7ce42c79..baba2f0b67 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -678,7 +678,7 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, const char *path = qdict_get_try_str(options, "path"); BDRVBlkioState *s = bs->opaque; bool fd_supported = false; -int fd, ret; +int fd = -1, ret; if (!path) { error_setg(errp, "missing 'path' option"); @@ -719,6 +719,7 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, if (ret < 0) { fd_supported = false; qemu_close(fd); +fd = -1; } } } @@ -733,14 +734,18 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, } ret = blkio_connect(s->blkio); +if (ret < 0 && fd >= 0) { +/* Failed to give the FD to libblkio, close it */ +qemu_close(fd); +fd = -1; +} + /* * If the libblkio driver doesn't support the `fd` property, blkio_connect() * will fail with -EINVAL. So let's try calling blkio_connect() again by * directly setting `path`. */ if (fd_supported && ret == -EINVAL) { -qemu_close(fd); - /* * We need to clear the `fd` property we set previously by setting * it to -1. -- 2.41.0
[PATCH v2 2/2] block/blkio: add more comments on the fd passing handling
As Hanna pointed out, it is not clear in the code why qemu_open() can fail, and why blkio_set_int("fd") is not enough to discover the `fd` property support. Let's fix them by adding more details in the code comments. Suggested-by: Hanna Czenczek Reviewed-by: Hanna Czenczek Signed-off-by: Stefano Garzarella --- block/blkio.c | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/block/blkio.c b/block/blkio.c index baba2f0b67..1dd495617c 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -713,6 +713,12 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, */ fd = qemu_open(path, O_RDWR, NULL); if (fd < 0) { +/* + * qemu_open() can fail if the user specifies a path that is not + * a file or device, for example in the case of Unix Domain Socket + * for the virtio-blk-vhost-user driver. In such cases let's have + * libblkio open the path directly. + */ fd_supported = false; } else { ret = blkio_set_int(s->blkio, "fd", fd); @@ -741,9 +747,12 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, } /* - * If the libblkio driver doesn't support the `fd` property, blkio_connect() - * will fail with -EINVAL. So let's try calling blkio_connect() again by - * directly setting `path`. + * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208 + * (libblkio <= v1.3.0), setting the `fd` property is not enough to check + * whether the driver supports the `fd` property or not. In that case, + * blkio_connect() will fail with -EINVAL. + * So let's try calling blkio_connect() again by directly setting `path` + * to cover this scenario. */ if (fd_supported && ret == -EINVAL) { /* -- 2.41.0
[PATCH v2 0/2] block/blkio: fix fd leak and add more comments for the fd passing
Hanna discovered an fd leak in the error path, and a few comments to improve in the code. v2: - avoid to use `fd_supported` to track a valid fd [Hanna] v1: https://lore.kernel.org/qemu-devel/20230801160332.122564-1-sgarz...@redhat.com/ Stefano Garzarella (2): block/blkio: close the fd when blkio_connect() fails block/blkio: add more comments on the fd passing handling block/blkio.c | 26 -- 1 file changed, 20 insertions(+), 6 deletions(-) -- 2.41.0
Re: [PATCH 1/2] block/blkio: close the fd when blkio_connect() fails
On Wed, Aug 02, 2023 at 01:15:40PM +0200, Hanna Czenczek wrote: On 01.08.23 18:03, Stefano Garzarella wrote: libblkio drivers take ownership of `fd` only after a successful blkio_connect(), so if it fails, we are still the owners. Fixes: cad2ccc395 ("block/blkio: use qemu_open() to support fd passing for virtio-blk") Suggested-by: Hanna Czenczek Signed-off-by: Stefano Garzarella --- block/blkio.c | 9 + 1 file changed, 9 insertions(+) Works, so: Reviewed-by: Hanna Czenczek But personally, instead of having `fd_supported` track whether we have a valid FD or not, I’d find it more intuitive to track ownership through the `fd` variable itself, i.e. initialize it to -1, and set it -1 when ownership is transferred, and then close it once we don’t need it anymore, but failed to transfer ownership to blkio. The elaborate way would be something like ... -int fd, ret; +int fd = -1; +int ret; ... ret = blkio_connect(s->blkio); +if (!ret) { + /* If we had an FD, libblkio now has ownership of it */ + fd = -1; +} +if (fd >= 0) { + /* We still have FD ownership, but no longer need it, so close it */ + qemu_close(fd); + fd = -1; +} /* * [...] */ if (fd_supported && ret == -EINVAL) { - qemu_close(fd); - ... Or the shorter less-verbose version would be: ... -int fd, ret; +int fd = -1; +int ret; ... ret = blkio_connect(s->blkio); +if (fd >= 0 && ret < 0) { + /* Failed to give the FD to libblkio, close it */ + qemu_close(fd); +} I like this, I'll do it in v2! Thanks, Stefano
[PATCH 2/2] block/blkio: add more comments on the fd passing handling
As Hanna pointed out, it is not clear in the code why qemu_open() can fail, and why blkio_set_int("fd") is not enough to discover the `fd` property support. Let's fix them by adding more details in the code comments. Suggested-by: Hanna Czenczek Signed-off-by: Stefano Garzarella --- block/blkio.c | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/block/blkio.c b/block/blkio.c index 2d53a865e7..848b8189d0 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -713,6 +713,12 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, */ fd = qemu_open(path, O_RDWR, NULL); if (fd < 0) { +/* + * qemu_open() can fail if the user specifies a path that is not + * a file or device, for example in the case of Unix Domain Socket + * for the virtio-blk-vhost-user driver. In such cases let's have + * libblkio open the path directly. + */ fd_supported = false; } else { ret = blkio_set_int(s->blkio, "fd", fd); @@ -734,9 +740,12 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, ret = blkio_connect(s->blkio); /* - * If the libblkio driver doesn't support the `fd` property, blkio_connect() - * will fail with -EINVAL. So let's try calling blkio_connect() again by - * directly setting `path`. + * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208 + * (libblkio <= v1.3.0), setting the `fd` property is not enough to check + * whether the driver supports the `fd` property or not. In that case, + * blkio_connect() will fail with -EINVAL. + * So let's try calling blkio_connect() again by directly setting `path` + * to cover this scenario. */ if (fd_supported && ret == -EINVAL) { fd_supported = false; -- 2.41.0
[PATCH 1/2] block/blkio: close the fd when blkio_connect() fails
libblkio drivers take ownership of `fd` only after a successful blkio_connect(), so if it fails, we are still the owners. Fixes: cad2ccc395 ("block/blkio: use qemu_open() to support fd passing for virtio-blk") Suggested-by: Hanna Czenczek Signed-off-by: Stefano Garzarella --- block/blkio.c | 9 + 1 file changed, 9 insertions(+) diff --git a/block/blkio.c b/block/blkio.c index 8e7ce42c79..2d53a865e7 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -739,6 +739,7 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, * directly setting `path`. */ if (fd_supported && ret == -EINVAL) { +fd_supported = false; qemu_close(fd); /* @@ -763,6 +764,14 @@ static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, } if (ret < 0) { +if (fd_supported) { +/* + * libblkio drivers take ownership of `fd` only after a successful + * blkio_connect(), so if it fails, we are still the owners. + */ +qemu_close(fd); +} + error_setg_errno(errp, -ret, "blkio_connect failed: %s", blkio_get_error_msg()); return ret; -- 2.41.0
[PATCH 0/2] block/blkio: fix fd leak and add more comments for the fd passing
Hanna discovered an fd leak in the error path, and a few comments to improve in the code. Stefano Garzarella (2): block/blkio: close the fd when blkio_connect() fails block/blkio: add more comments on the fd passing handling block/blkio.c | 24 +--- 1 file changed, 21 insertions(+), 3 deletions(-) -- 2.41.0
[PATCH v2 1/4] block/blkio: move blkio_connect() in the drivers functions
This is in preparation for the next patch, where for virtio-blk drivers we need to handle the failure of blkio_connect(). Let's also rename the *_open() functions to *_connect() to make the code reflect the changes applied. Signed-off-by: Stefano Garzarella --- block/blkio.c | 67 ++- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/block/blkio.c b/block/blkio.c index 7eb1b94820..8ad7c0b575 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -603,8 +603,8 @@ static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size) } } -static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) +static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options, + int flags, Error **errp) { const char *filename = qdict_get_str(options, "filename"); BDRVBlkioState *s = bs->opaque; @@ -627,11 +627,18 @@ static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags, } } +ret = blkio_connect(s->blkio); +if (ret < 0) { +error_setg_errno(errp, -ret, "blkio_connect failed: %s", + blkio_get_error_msg()); +return ret; +} + return 0; } -static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags, - Error **errp) +static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options, + int flags, Error **errp) { const char *path = qdict_get_try_str(options, "path"); BDRVBlkioState *s = bs->opaque; @@ -655,11 +662,18 @@ static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } +ret = blkio_connect(s->blkio); +if (ret < 0) { +error_setg_errno(errp, -ret, "blkio_connect failed: %s", + blkio_get_error_msg()); +return ret; +} + return 0; } -static int blkio_virtio_blk_common_open(BlockDriverState *bs, -QDict *options, int flags, Error **errp) +static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options, +int flags, Error **errp) { const char *path = qdict_get_try_str(options, "path"); BDRVBlkioState *s = bs->opaque; @@ -718,6 +732,13 @@ static int blkio_virtio_blk_common_open(BlockDriverState *bs, } } +ret = blkio_connect(s->blkio); +if (ret < 0) { +error_setg_errno(errp, -ret, "blkio_connect failed: %s", + blkio_get_error_msg()); +return ret; +} + qdict_del(options, "path"); return 0; @@ -737,24 +758,6 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, return ret; } -if (strcmp(blkio_driver, "io_uring") == 0) { -ret = blkio_io_uring_open(bs, options, flags, errp); -} else if (strcmp(blkio_driver, "nvme-io_uring") == 0) { -ret = blkio_nvme_io_uring(bs, options, flags, errp); -} else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) { -ret = blkio_virtio_blk_common_open(bs, options, flags, errp); -} else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) { -ret = blkio_virtio_blk_common_open(bs, options, flags, errp); -} else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) { -ret = blkio_virtio_blk_common_open(bs, options, flags, errp); -} else { -g_assert_not_reached(); -} -if (ret < 0) { -blkio_destroy(>blkio); -return ret; -} - if (!(flags & BDRV_O_RDWR)) { ret = blkio_set_bool(s->blkio, "read-only", true); if (ret < 0) { @@ -765,10 +768,20 @@ static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags, } } -ret = blkio_connect(s->blkio); +if (strcmp(blkio_driver, "io_uring") == 0) { +ret = blkio_io_uring_connect(bs, options, flags, errp); +} else if (strcmp(blkio_driver, "nvme-io_uring") == 0) { +ret = blkio_nvme_io_uring_connect(bs, options, flags, errp); +} else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) { +ret = blkio_virtio_blk_connect(bs, options, flags, errp); +} else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) { +ret = blkio_virtio_blk_connect(bs, options, flags, errp); +} else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) { +ret = blkio_virtio_blk_connect(bs, options, flags, errp); +} else { +g_assert_not_reached(); +} if (ret < 0) { -error_setg_errno(errp, -ret, "blkio_connect failed: %s", - blkio_get_error_msg()); blkio_destroy(>blkio); return ret; } -- 2.41.0