The vhost-user-blk-test qtest has been hanging intermittently for a while. The root cause is not yet fully understood, but the hang is impacting enough users that it is important to merge a workaround for it.
The race which causes the hang occurs early on in vhost-user setup, where a vhost-user message is never received by the backend. Forcing QEMU to wait until the storage-daemon has had some time to initialize prevents the hang. Thus the existing storage-daemon pidfile option can be used to implement a workaround cleanly and effectively, since it creates a file only once the storage-daemon initialization is complete. This change implements a workaround for the vhost-user-blk-test hang by making QEMU wait until the storage-daemon has written out a pidfile before attempting to connect and send messages over the vhost-user socket. Some relevent mailing list discussions: [1] https://lore.kernel.org/qemu-devel/CAFEAcA8kYpz9LiPNxnWJAPSjc=nv532bedyfynabemeohqb...@mail.gmail.com/ [2] https://lore.kernel.org/qemu-devel/YWaky%2FKVbS%2FKZjlV@stefanha-x1.localdomain/ Signed-off-by: Raphael Norwitz <raphael.norw...@nutanix.com> Reviewed-by: Eric Blake <ebl...@redhat.com> --- tests/qtest/vhost-user-blk-test.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/qtest/vhost-user-blk-test.c b/tests/qtest/vhost-user-blk-test.c index 6f108a1b62..c6626a286b 100644 --- a/tests/qtest/vhost-user-blk-test.c +++ b/tests/qtest/vhost-user-blk-test.c @@ -24,6 +24,7 @@ #define TEST_IMAGE_SIZE (64 * 1024 * 1024) #define QVIRTIO_BLK_TIMEOUT_US (30 * 1000 * 1000) #define PCI_SLOT_HP 0x06 +#define PIDFILE_RETRIES 5 typedef struct { pid_t pid; @@ -885,7 +886,8 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, int num_queues) { const char *vhost_user_blk_bin = qtest_qemu_storage_daemon_binary(); - int i; + int i, retries; + char *daemon_pidfile_path; gchar *img_path; GString *storage_daemon_command = g_string_new(NULL); QemuStorageDaemonState *qsd; @@ -898,6 +900,8 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, " -object memory-backend-memfd,id=mem,size=256M,share=on " " -M memory-backend=mem -m 256M "); + daemon_pidfile_path = g_strdup_printf("/tmp/daemon-%d", getpid()); + for (i = 0; i < vus_instances; i++) { int fd; char *sock_path = create_listen_socket(&fd); @@ -914,6 +918,9 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, i + 1, sock_path); } + g_string_append_printf(storage_daemon_command, "--pidfile %s ", + daemon_pidfile_path); + g_test_message("starting vhost-user backend: %s", storage_daemon_command->str); pid_t pid = fork(); @@ -930,7 +937,27 @@ static void start_vhost_user_blk(GString *cmd_line, int vus_instances, execlp("/bin/sh", "sh", "-c", storage_daemon_command->str, NULL); exit(1); } + + /* + * FIXME: The loop here ensures the storage-daemon has come up properly + * before allowing the test to proceed. This is a workaround for + * a race which used to cause the vhost-user-blk-test to hang. It + * should be deleted once the root cause is fully understood and + * fixed. + */ + retries = 0; + while (access(daemon_pidfile_path, F_OK) != 0) { + g_assert_cmpint(retries, <, PIDFILE_RETRIES); + + retries++; + g_usleep(1000); + } + g_string_free(storage_daemon_command, true); + if (access(daemon_pidfile_path, F_OK) == 0) { + unlink(daemon_pidfile_path); + } + g_free(daemon_pidfile_path); qsd = g_new(QemuStorageDaemonState, 1); qsd->pid = pid; -- 2.20.1