[PATCH 3/6] block/blkio: convert to blk_io_plug_call() API

2023-05-17 Thread Stefan Hajnoczi
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
---
 block/blkio.c | 40 +---
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/block/blkio.c b/block/blkio.c
index 0cdc99a729..f2a1dc1fb2 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -325,16 +325,28 @@ static void blkio_detach_aio_context(BlockDriverState *bs)
false, NULL, NULL, NULL, NULL, NULL);
 }
 
-/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
-static void blkio_submit_io(BlockDriverState *bs)
+/*
+ * Called by blk_io_unplug() or immediately if not plugged. Called without
+ * blkio_lock.
+ */
+static void blkio_unplug_fn(BlockDriverState *bs)
 {
-if (qatomic_read(&bs->io_plugged) == 0) {
-BDRVBlkioState *s = bs->opaque;
+BDRVBlkioState *s = bs->opaque;
 
+WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
 }
 }
 
+/*
+ * Schedule I/O submission after enqueuing a new request. Called without
+ * blkio_lock.
+ */
+static void blkio_submit_io(BlockDriverState *bs)
+{
+blk_io_plug_call(blkio_unplug_fn, bs);
+}
+
 static int coroutine_fn
 blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
@@ -345,9 +357,9 @@ blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, 
int64_t bytes)
 
 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
-blkio_submit_io(bs);
 }
 
+blkio_submit_io(bs);
 qemu_coroutine_yield();
 return cod.ret;
 }
@@ -378,9 +390,9 @@ blkio_co_preadv(BlockDriverState *bs, int64_t offset, 
int64_t bytes,
 
 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
-blkio_submit_io(bs);
 }
 
+blkio_submit_io(bs);
 qemu_coroutine_yield();
 
 if (use_bounce_buffer) {
@@ -423,9 +435,9 @@ static int coroutine_fn blkio_co_pwritev(BlockDriverState 
*bs, int64_t offset,
 
 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
-blkio_submit_io(bs);
 }
 
+blkio_submit_io(bs);
 qemu_coroutine_yield();
 
 if (use_bounce_buffer) {
@@ -444,9 +456,9 @@ static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
 
 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 blkioq_flush(s->blkioq, &cod, 0);
-blkio_submit_io(bs);
 }
 
+blkio_submit_io(bs);
 qemu_coroutine_yield();
 return cod.ret;
 }
@@ -472,22 +484,13 @@ static int coroutine_fn 
blkio_co_pwrite_zeroes(BlockDriverState *bs,
 
 WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
-blkio_submit_io(bs);
 }
 
+blkio_submit_io(bs);
 qemu_coroutine_yield();
 return cod.ret;
 }
 
-static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
-{
-BDRVBlkioState *s = bs->opaque;
-
-WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
-blkio_submit_io(bs);
-}
-}
-
 typedef enum {
 BMRR_OK,
 BMRR_SKIP,
@@ -1009,7 +1012,6 @@ static void blkio_refresh_limits(BlockDriverState *bs, 
Error **errp)
 .bdrv_co_pwritev = blkio_co_pwritev, \
 .bdrv_co_flush_to_disk   = blkio_co_flush, \
 .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
-.bdrv_co_io_unplug   = blkio_co_io_unplug, \
 .bdrv_refresh_limits = blkio_refresh_limits, \
 .bdrv_register_buf   = blkio_register_buf, \
 .bdrv_unregister_buf = blkio_unregister_buf, \
-- 
2.40.1




[PATCH 6/6] block: remove bdrv_co_io_plug() API

2023-05-17 Thread Stefan Hajnoczi
No block driver implements .bdrv_co_io_plug() anymore. Get rid of the
function pointers.

Signed-off-by: Stefan Hajnoczi 
---
 include/block/block-io.h |  3 ---
 include/block/block_int-common.h | 11 --
 block/io.c   | 37 
 3 files changed, 51 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index a27e471a87..43af816d75 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -259,9 +259,6 @@ void coroutine_fn bdrv_co_leave(BlockDriverState *bs, 
AioContext *old_ctx);
 
 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
 
-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_plug(BlockDriverState *bs);
-void coroutine_fn GRAPH_RDLOCK bdrv_co_io_unplug(BlockDriverState *bs);
-
 bool coroutine_fn GRAPH_RDLOCK
 bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
uint32_t granularity, Error **errp);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index dbec0e3bb4..fa369d83dd 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -753,11 +753,6 @@ struct BlockDriver {
 void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)(
 BlockDriverState *bs, BlkdebugEvent event);
 
-/* io queue for linux-aio */
-void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState 
*bs);
-void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)(
-BlockDriverState *bs);
-
 /**
  * bdrv_drain_begin is called if implemented in the beginning of a
  * drain operation to drain and stop any internal sources of requests in
@@ -1227,12 +1222,6 @@ struct BlockDriverState {
 unsigned int in_flight;
 unsigned int serialising_in_flight;
 
-/*
- * counter for nested bdrv_io_plug.
- * Accessed with atomic ops.
- */
-unsigned io_plugged;
-
 /* do we need to tell the quest if we have a volatile write cache? */
 int enable_write_cache;
 
diff --git a/block/io.c b/block/io.c
index 4d54fda593..56b0c1ce6c 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3219,43 +3219,6 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t 
size)
 return mem;
 }
 
-void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs)
-{
-BdrvChild *child;
-IO_CODE();
-assert_bdrv_graph_readable();
-
-QLIST_FOREACH(child, &bs->children, next) {
-bdrv_co_io_plug(child->bs);
-}
-
-if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
-BlockDriver *drv = bs->drv;
-if (drv && drv->bdrv_co_io_plug) {
-drv->bdrv_co_io_plug(bs);
-}
-}
-}
-
-void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs)
-{
-BdrvChild *child;
-IO_CODE();
-assert_bdrv_graph_readable();
-
-assert(bs->io_plugged);
-if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
-BlockDriver *drv = bs->drv;
-if (drv && drv->bdrv_co_io_unplug) {
-drv->bdrv_co_io_unplug(bs);
-}
-}
-
-QLIST_FOREACH(child, &bs->children, next) {
-bdrv_co_io_unplug(child->bs);
-}
-}
-
 /* Helper that undoes bdrv_register_buf() when it fails partway through */
 static void GRAPH_RDLOCK
 bdrv_register_buf_rollback(BlockDriverState *bs, void *host, size_t size,
-- 
2.40.1




[PATCH 1/6] block: add blk_io_plug_call() API

2023-05-17 Thread Stefan Hajnoczi
Introduce a new API for thread-local blk_io_plug() that does not
traverse the block graph. The goal is to make blk_io_plug() multi-queue
friendly.

Instead of having block drivers track whether or not we're in a plugged
section, provide an API that allows them to defer a function call until
we're unplugged: blk_io_plug_call(fn, opaque). If blk_io_plug_call() is
called multiple times with the same fn/opaque pair, then fn() is only
called once at the end of the function - resulting in batching.

This patch introduces the API and changes blk_io_plug()/blk_io_unplug().
blk_io_plug()/blk_io_unplug() no longer require a BlockBackend argument
because the plug state is now thread-local.

Later patches convert block drivers to blk_io_plug_call() and then we
can finally remove .bdrv_co_io_plug() once all block drivers have been
converted.

Signed-off-by: Stefan Hajnoczi 
---
 MAINTAINERS   |   1 +
 include/sysemu/block-backend-io.h |  13 +--
 block/block-backend.c |  22 -
 block/plug.c  | 159 ++
 hw/block/dataplane/xen-block.c|   8 +-
 hw/block/virtio-blk.c |   4 +-
 hw/scsi/virtio-scsi.c |   6 +-
 block/meson.build |   1 +
 8 files changed, 173 insertions(+), 41 deletions(-)
 create mode 100644 block/plug.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 50585117a0..574202295c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2644,6 +2644,7 @@ F: util/aio-*.c
 F: util/aio-*.h
 F: util/fdmon-*.c
 F: block/io.c
+F: block/plug.c
 F: migration/block*
 F: include/block/aio.h
 F: include/block/aio-wait.h
diff --git a/include/sysemu/block-backend-io.h 
b/include/sysemu/block-backend-io.h
index d62a7ee773..be4dcef59d 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -100,16 +100,9 @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
 int blk_get_max_iov(BlockBackend *blk);
 int blk_get_max_hw_iov(BlockBackend *blk);
 
-/*
- * blk_io_plug/unplug are thread-local operations. This means that multiple
- * IOThreads can simultaneously call plug/unplug, but the caller must ensure
- * that each unplug() is called in the same IOThread of the matching plug().
- */
-void coroutine_fn blk_co_io_plug(BlockBackend *blk);
-void co_wrapper blk_io_plug(BlockBackend *blk);
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
-void co_wrapper blk_io_unplug(BlockBackend *blk);
+void blk_io_plug(void);
+void blk_io_unplug(void);
+void blk_io_plug_call(void (*fn)(void *), void *opaque);
 
 AioContext *blk_get_aio_context(BlockBackend *blk);
 BlockAcctStats *blk_get_stats(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index ca537cd0ad..1f1d226ba6 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2568,28 +2568,6 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, 
Notifier *notify)
 notifier_list_add(&blk->insert_bs_notifiers, notify);
 }
 
-void coroutine_fn blk_co_io_plug(BlockBackend *blk)
-{
-BlockDriverState *bs = blk_bs(blk);
-IO_CODE();
-GRAPH_RDLOCK_GUARD();
-
-if (bs) {
-bdrv_co_io_plug(bs);
-}
-}
-
-void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
-{
-BlockDriverState *bs = blk_bs(blk);
-IO_CODE();
-GRAPH_RDLOCK_GUARD();
-
-if (bs) {
-bdrv_co_io_unplug(bs);
-}
-}
-
 BlockAcctStats *blk_get_stats(BlockBackend *blk)
 {
 IO_CODE();
diff --git a/block/plug.c b/block/plug.c
new file mode 100644
index 00..6738a568ba
--- /dev/null
+++ b/block/plug.c
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Block I/O plugging
+ *
+ * Copyright Red Hat.
+ *
+ * This API defers a function call within a blk_io_plug()/blk_io_unplug()
+ * section, allowing multiple calls to batch up. This is a performance
+ * optimization that is used in the block layer to submit several I/O requests
+ * at once instead of individually:
+ *
+ *   blk_io_plug(); <-- start of plugged region
+ *   ...
+ *   blk_io_plug_call(my_func, my_obj); <-- deferred my_func(my_obj) call
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   blk_io_plug_call(my_func, my_obj); <-- another
+ *   ...
+ *   blk_io_unplug(); <-- end of plugged region, my_func(my_obj) is called once
+ *
+ * This code is actually generic and not tied to the block layer. If another
+ * subsystem needs this functionality, it could be renamed.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/notify.h"
+#include "qemu/thread.h"
+#include "sysemu/block-backend.h"
+
+/* A function call that has been deferred until unplug() */
+typedef struct {
+void (*fn)(void *);
+void *opaque;
+} UnplugFn;
+
+/* Per-thread state */
+typedef struct {
+unsigned count;   /* how many times has plug() been called? */
+GArray *unplug_fns;   /* functions to call at unplug time */
+} Plug;
+
+/* Use get_ptr_plug() to fetch this thread-local value */

[PATCH 2/6] block/nvme: convert to blk_io_plug_call() API

2023-05-17 Thread Stefan Hajnoczi
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
---
 block/nvme.c | 44 
 1 file changed, 12 insertions(+), 32 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 5b744c2bda..100b38b592 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -25,6 +25,7 @@
 #include "qemu/vfio-helpers.h"
 #include "block/block-io.h"
 #include "block/block_int.h"
+#include "sysemu/block-backend.h"
 #include "sysemu/replay.h"
 #include "trace.h"
 
@@ -119,7 +120,6 @@ struct BDRVNVMeState {
 int blkshift;
 
 uint64_t max_transfer;
-bool plugged;
 
 bool supports_write_zeroes;
 bool supports_discard;
@@ -282,7 +282,7 @@ static void nvme_kick(NVMeQueuePair *q)
 {
 BDRVNVMeState *s = q->s;
 
-if (s->plugged || !q->need_kick) {
+if (!q->need_kick) {
 return;
 }
 trace_nvme_kick(s, q->index);
@@ -387,10 +387,6 @@ static bool nvme_process_completion(NVMeQueuePair *q)
 NvmeCqe *c;
 
 trace_nvme_process_completion(s, q->index, q->inflight);
-if (s->plugged) {
-trace_nvme_process_completion_queue_plugged(s, q->index);
-return false;
-}
 
 /*
  * Support re-entrancy when a request cb() function invokes aio_poll().
@@ -480,6 +476,15 @@ static void nvme_trace_command(const NvmeCmd *cmd)
 }
 }
 
+static void nvme_unplug_fn(void *opaque)
+{
+NVMeQueuePair *q = opaque;
+
+QEMU_LOCK_GUARD(&q->lock);
+nvme_kick(q);
+nvme_process_completion(q);
+}
+
 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
 NvmeCmd *cmd, BlockCompletionFunc cb,
 void *opaque)
@@ -496,8 +501,7 @@ static void nvme_submit_command(NVMeQueuePair *q, 
NVMeRequest *req,
q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
 q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
 q->need_kick++;
-nvme_kick(q);
-nvme_process_completion(q);
+blk_io_plug_call(nvme_unplug_fn, q);
 qemu_mutex_unlock(&q->lock);
 }
 
@@ -1567,27 +1571,6 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
 }
 }
 
-static void coroutine_fn nvme_co_io_plug(BlockDriverState *bs)
-{
-BDRVNVMeState *s = bs->opaque;
-assert(!s->plugged);
-s->plugged = true;
-}
-
-static void coroutine_fn nvme_co_io_unplug(BlockDriverState *bs)
-{
-BDRVNVMeState *s = bs->opaque;
-assert(s->plugged);
-s->plugged = false;
-for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
-NVMeQueuePair *q = s->queues[i];
-qemu_mutex_lock(&q->lock);
-nvme_kick(q);
-nvme_process_completion(q);
-qemu_mutex_unlock(&q->lock);
-}
-}
-
 static bool nvme_register_buf(BlockDriverState *bs, void *host, size_t size,
   Error **errp)
 {
@@ -1664,9 +1647,6 @@ static BlockDriver bdrv_nvme = {
 .bdrv_detach_aio_context  = nvme_detach_aio_context,
 .bdrv_attach_aio_context  = nvme_attach_aio_context,
 
-.bdrv_co_io_plug  = nvme_co_io_plug,
-.bdrv_co_io_unplug= nvme_co_io_unplug,
-
 .bdrv_register_buf= nvme_register_buf,
 .bdrv_unregister_buf  = nvme_unregister_buf,
 };
-- 
2.40.1




[PATCH 5/6] block/linux-aio: convert to blk_io_plug_call() API

2023-05-17 Thread Stefan Hajnoczi
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
---
 include/block/raw-aio.h |  7 ---
 block/file-posix.c  | 28 
 block/linux-aio.c   | 41 +++--
 3 files changed, 11 insertions(+), 65 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index da60ca13ef..0f63c2800c 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -62,13 +62,6 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, 
QEMUIOVector *qiov,
 
 void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
-
-/*
- * laio_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void laio_io_plug(void);
-void laio_io_unplug(uint64_t dev_max_batch);
 #endif
 /* io_uring.c - Linux io_uring implementation */
 #ifdef CONFIG_LINUX_IO_URING
diff --git a/block/file-posix.c b/block/file-posix.c
index 7baa8491dd..ac1ed54811 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2550,26 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState 
*bs, int64_t offset,
 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }
 
-static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
-{
-BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-if (s->use_linux_aio) {
-laio_io_plug();
-}
-#endif
-}
-
-static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
-{
-BDRVRawState __attribute__((unused)) *s = bs->opaque;
-#ifdef CONFIG_LINUX_AIO
-if (s->use_linux_aio) {
-laio_io_unplug(s->aio_max_batch);
-}
-#endif
-}
-
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
 {
 BDRVRawState *s = bs->opaque;
@@ -3914,8 +3894,6 @@ BlockDriver bdrv_file = {
 .bdrv_co_copy_range_from = raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
 .bdrv_refresh_limits = raw_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
 .bdrv_co_truncate   = raw_co_truncate,
@@ -4286,8 +4264,6 @@ static BlockDriver bdrv_host_device = {
 .bdrv_co_copy_range_from = raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
 .bdrv_refresh_limits = raw_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
 .bdrv_co_truncate   = raw_co_truncate,
@@ -4424,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
 .bdrv_refresh_limits= cdrom_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
 .bdrv_co_truncate   = raw_co_truncate,
@@ -4552,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
 .bdrv_refresh_limits= cdrom_refresh_limits,
-.bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_co_io_unplug  = raw_co_io_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
 .bdrv_co_truncate   = raw_co_truncate,
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 442c86209b..5021aed68f 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -15,6 +15,7 @@
 #include "qemu/event_notifier.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"
 
 /* Only used for assertions.  */
 #include "qemu/coroutine_int.h"
@@ -46,7 +47,6 @@ struct qemu_laiocb {
 };
 
 typedef struct {
-int plugged;
 unsigned int in_queue;
 unsigned int in_flight;
 bool blocked;
@@ -236,7 +236,7 @@ static void 
qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
 qemu_laio_process_completions(s);
 
-if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
 ioq_submit(s);
 }
 }
@@ -277,7 +277,6 @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
 static void ioq_init(LaioQueue *io_q)
 {
 QSIMPLEQ_INIT(&io_q->pending);
-io_q->plugged = 0;
 io_q->in_queue = 0;
 io_q->in_flight = 0;
 io_q->blocked = false;
@@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t 
dev_max_batch)
 return max_batch;
 }
 
-void laio_io_plug(void)
+static void laio_unplug_fn(void *opaque)
 {
-AioContext *ctx = 

[PATCH 4/6] block/io_uring: convert to blk_io_plug_call() API

2023-05-17 Thread Stefan Hajnoczi
Stop using the .bdrv_co_io_plug() API because it is not multi-queue
block layer friendly. Use the new blk_io_plug_call() API to batch I/O
submission instead.

Signed-off-by: Stefan Hajnoczi 
---
 include/block/raw-aio.h |  7 ---
 block/file-posix.c  | 10 -
 block/io_uring.c| 45 -
 block/trace-events  |  5 ++---
 4 files changed, 19 insertions(+), 48 deletions(-)

diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
index 0fe85ade77..da60ca13ef 100644
--- a/include/block/raw-aio.h
+++ b/include/block/raw-aio.h
@@ -81,13 +81,6 @@ int coroutine_fn luring_co_submit(BlockDriverState *bs, int 
fd, uint64_t offset,
   QEMUIOVector *qiov, int type);
 void luring_detach_aio_context(LuringState *s, AioContext *old_context);
 void luring_attach_aio_context(LuringState *s, AioContext *new_context);
-
-/*
- * luring_io_plug/unplug work in the thread's current AioContext, therefore the
- * caller must ensure that they are paired in the same IOThread.
- */
-void luring_io_plug(void);
-void luring_io_unplug(void);
 #endif
 
 #ifdef _WIN32
diff --git a/block/file-posix.c b/block/file-posix.c
index 0ab158efba..7baa8491dd 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2558,11 +2558,6 @@ static void coroutine_fn raw_co_io_plug(BlockDriverState 
*bs)
 laio_io_plug();
 }
 #endif
-#ifdef CONFIG_LINUX_IO_URING
-if (s->use_linux_io_uring) {
-luring_io_plug();
-}
-#endif
 }
 
 static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
@@ -2573,11 +2568,6 @@ static void coroutine_fn 
raw_co_io_unplug(BlockDriverState *bs)
 laio_io_unplug(s->aio_max_batch);
 }
 #endif
-#ifdef CONFIG_LINUX_IO_URING
-if (s->use_linux_io_uring) {
-luring_io_unplug();
-}
-#endif
 }
 
 static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
diff --git a/block/io_uring.c b/block/io_uring.c
index 82cab6a5bd..9a45e5fb8b 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -16,6 +16,7 @@
 #include "block/raw-aio.h"
 #include "qemu/coroutine.h"
 #include "qapi/error.h"
+#include "sysemu/block-backend.h"
 #include "trace.h"
 
 /* Only used for assertions.  */
@@ -41,7 +42,6 @@ typedef struct LuringAIOCB {
 } LuringAIOCB;
 
 typedef struct LuringQueue {
-int plugged;
 unsigned int in_queue;
 unsigned int in_flight;
 bool blocked;
@@ -267,7 +267,7 @@ static void 
luring_process_completions_and_submit(LuringState *s)
 {
 luring_process_completions(s);
 
-if (!s->io_q.plugged && s->io_q.in_queue > 0) {
+if (s->io_q.in_queue > 0) {
 ioq_submit(s);
 }
 }
@@ -301,29 +301,17 @@ static void qemu_luring_poll_ready(void *opaque)
 static void ioq_init(LuringQueue *io_q)
 {
QSIMPLEQ_INIT(&io_q->submit_queue);
-io_q->plugged = 0;
 io_q->in_queue = 0;
 io_q->in_flight = 0;
 io_q->blocked = false;
 }
 
-void luring_io_plug(void)
+static void luring_unplug_fn(void *opaque)
 {
-AioContext *ctx = qemu_get_current_aio_context();
-LuringState *s = aio_get_linux_io_uring(ctx);
-trace_luring_io_plug(s);
-s->io_q.plugged++;
-}
-
-void luring_io_unplug(void)
-{
-AioContext *ctx = qemu_get_current_aio_context();
-LuringState *s = aio_get_linux_io_uring(ctx);
-assert(s->io_q.plugged);
-trace_luring_io_unplug(s, s->io_q.blocked, s->io_q.plugged,
-   s->io_q.in_queue, s->io_q.in_flight);
-if (--s->io_q.plugged == 0 &&
-!s->io_q.blocked && s->io_q.in_queue > 0) {
+LuringState *s = opaque;
+trace_luring_unplug_fn(s, s->io_q.blocked, s->io_q.in_queue,
+   s->io_q.in_flight);
+if (!s->io_q.blocked && s->io_q.in_queue > 0) {
 ioq_submit(s);
 }
 }
@@ -337,7 +325,6 @@ void luring_io_unplug(void)
  * @type: type of request
  *
  * Fetches sqes from ring, adds to pending queue and preps them
- *
  */
 static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
 uint64_t offset, int type)
@@ -370,14 +357,16 @@ static int luring_do_submit(int fd, LuringAIOCB 
*luringcb, LuringState *s,
 
 QSIMPLEQ_INSERT_TAIL(&s->io_q.submit_queue, luringcb, next);
 s->io_q.in_queue++;
-trace_luring_do_submit(s, s->io_q.blocked, s->io_q.plugged,
-   s->io_q.in_queue, s->io_q.in_flight);
-if (!s->io_q.blocked &&
-(!s->io_q.plugged ||
- s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES)) {
-ret = ioq_submit(s);
-trace_luring_do_submit_done(s, ret);
-return ret;
+trace_luring_do_submit(s, s->io_q.blocked, s->io_q.in_queue,
+   s->io_q.in_flight);
+if (!s->io_q.blocked) {
+if (s->io_q.in_flight + s->io_q.in_queue >= MAX_ENTRIES) {
+ret = ioq_submit(s);
+trace_luring_do_submit_done(s, ret);
+return ret;
+}
+
+blk_io_plug_call(luring_unplug_fn, 

[PATCH 0/6] block: add blk_io_plug_call() API

2023-05-17 Thread Stefan Hajnoczi
The existing blk_io_plug() API is not block layer multi-queue friendly because
the plug state is per-BlockDriverState.

Change blk_io_plug()'s implementation so it is thread-local. This is done by
introducing the blk_io_plug_call() function that block drivers use to batch
calls while plugged. It is relatively easy to convert block drivers from
.bdrv_co_io_plug() to blk_io_plug_call().

Random read 4KB performance with virtio-blk on a host NVMe block device:

iodepth   iops   change vs today
145612   -4%
287967   +2%
4   129872   +0%
8   171096   -3%
16  194508   -4%
32  208947   -1%
64  217647   +0%
128 229629   +0%

The results are within the noise for these benchmarks. This is to be expected
because the plugging behavior for a single thread hasn't changed in this patch
series, only that the state is thread-local now.

The following graph compares several approaches:
https://vmsplice.net/~stefan/blk_io_plug-thread-local.png
- v7.2.0: before most of the multi-queue block layer changes landed.
- with-blk_io_plug: today's post-8.0.0 QEMU.
- blk_io_plug-thread-local: this patch series.
- no-blk_io_plug: what happens when we simply remove plugging?
- call-after-dispatch: what if we integrate plugging into the event loop? I
  decided against this approach in the end because it's more likely to
  introduce performance regressions since I/O submission is deferred until the
  end of the event loop iteration.

Aside from the no-blk_io_plug case, which bottlenecks much earlier than the
others, we see that all plugging approaches are more or less equivalent in this
benchmark. It is also clear that QEMU 8.0.0 has lower performance than 7.2.0.

The Ansible playbook, fio results, and a Jupyter notebook are available here:
https://github.com/stefanha/qemu-perf/tree/remove-blk_io_plug

Stefan Hajnoczi (6):
  block: add blk_io_plug_call() API
  block/nvme: convert to blk_io_plug_call() API
  block/blkio: convert to blk_io_plug_call() API
  block/io_uring: convert to blk_io_plug_call() API
  block/linux-aio: convert to blk_io_plug_call() API
  block: remove bdrv_co_io_plug() API

 MAINTAINERS   |   1 +
 include/block/block-io.h  |   3 -
 include/block/block_int-common.h  |  11 ---
 include/block/raw-aio.h   |  14 ---
 include/sysemu/block-backend-io.h |  13 +--
 block/blkio.c |  40 
 block/block-backend.c |  22 -
 block/file-posix.c|  38 ---
 block/io.c|  37 ---
 block/io_uring.c  |  45 -
 block/linux-aio.c |  41 +++-
 block/nvme.c  |  44 +++--
 block/plug.c  | 159 ++
 hw/block/dataplane/xen-block.c|   8 +-
 hw/block/virtio-blk.c |   4 +-
 hw/scsi/virtio-scsi.c |   6 +-
 block/meson.build |   1 +
 block/trace-events|   5 +-
 18 files changed, 236 insertions(+), 256 deletions(-)
 create mode 100644 block/plug.c

-- 
2.40.1




Re: [PATCH] hw/ide: Remove unuseful IDE_DMA__COUNT definition

2023-05-17 Thread John Snow
On Fri, Feb 24, 2023 at 10:34 AM Philippe Mathieu-Daudé
 wrote:
>
> First, IDE_DMA__COUNT represents the number of enumerated
> values, but is incorrectly listed as part of the enum.
>
> Second, IDE_DMA_CMD_str() is internal to core.c and only
> takes sane enums from ide_dma_cmd. So no need to check the
> 'enval' argument is within the enum range. Only checks
> IDE_DMA_CMD_lookup[] entry is not NULL.
>
> Both combined, IDE_DMA__COUNT can go.
>
> Reduce IDE_DMA_CMD_lookup[] scope which is only used locally.
>

You reviewed the patch where this got written in :)

I didn't think anything actually protected us from giving
IDE_DMA_CMD_str() a bogus integer. I'm not familiar with the idea that
it takes "only sane enums". Is that true? Or, is it just because it's
internal to the file that we can statically prove that it's true?

--js

> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  hw/ide/core.c | 10 +-
>  include/hw/ide/internal.h |  3 ---
>  2 files changed, 5 insertions(+), 8 deletions(-)
>
> diff --git a/hw/ide/core.c b/hw/ide/core.c
> index 5d1039378f..8bf61e7267 100644
> --- a/hw/ide/core.c
> +++ b/hw/ide/core.c
> @@ -63,19 +63,19 @@ static const int smart_attributes[][12] = {
>  { 190,  0x03, 0x00, 0x45, 0x45, 0x1f, 0x00, 0x1f, 0x1f, 0x00, 0x00, 
> 0x32},
>  };
>
> -const char *IDE_DMA_CMD_lookup[IDE_DMA__COUNT] = {
> +static const char *IDE_DMA_CMD_lookup[] = {
>  [IDE_DMA_READ] = "DMA READ",
>  [IDE_DMA_WRITE] = "DMA WRITE",
>  [IDE_DMA_TRIM] = "DMA TRIM",
> -[IDE_DMA_ATAPI] = "DMA ATAPI"
> +[IDE_DMA_ATAPI] = "DMA ATAPI",
>  };
>
>  static const char *IDE_DMA_CMD_str(enum ide_dma_cmd enval)
>  {
> -if ((unsigned)enval < IDE_DMA__COUNT) {
> -return IDE_DMA_CMD_lookup[enval];
> +if (!IDE_DMA_CMD_lookup[enval]) {
> +return "DMA UNKNOWN CMD";
>  }
> -return "DMA UNKNOWN CMD";
> +return IDE_DMA_CMD_lookup[enval];
>  }
>
>  static void ide_dummy_transfer_stop(IDEState *s);
> diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h
> index fc0aa81a88..e864fe8caf 100644
> --- a/include/hw/ide/internal.h
> +++ b/include/hw/ide/internal.h
> @@ -352,11 +352,8 @@ enum ide_dma_cmd {
>  IDE_DMA_WRITE,
>  IDE_DMA_TRIM,
>  IDE_DMA_ATAPI,
> -IDE_DMA__COUNT
>  };
>
> -extern const char *IDE_DMA_CMD_lookup[IDE_DMA__COUNT];
> -
>  #define ide_cmd_is_read(s) \
>  ((s)->dma_cmd == IDE_DMA_READ)
>
> --
> 2.38.1
>




Re: [PATCH 9/9] hw/ide/ahci: fix broken SError handling

2023-05-17 Thread John Snow
On Fri, Apr 28, 2023 at 9:23 AM Niklas Cassel  wrote:
>
> From: Niklas Cassel 
>
> When encountering an NCQ error, you should not write the NCQ tag to the
> SError register. This is completely wrong.

Mea culpa ... !

>
> The SError register has a clear definition, where each bit represents a
> different error, see PxSERR definition in AHCI 1.3.1.
>
> If we write a random value (like the NCQ tag) in SError, e.g. Linux will
> read SError, and will trigger arbitrary error handling depending on the
> NCQ tag that happened to be executing.
>
> In case of success, ncq_cb() will call ncq_finish().
> In case of error, ncq_cb() will call ncq_err() (which will clear
> ncq_tfs->used), and then call ncq_finish(), thus using ncq_tfs->used is
> sufficient to tell if finished should get set or not.
>
> Signed-off-by: Niklas Cassel 

The bulk of this series looks good, I think I'd be happy to take it,
the commit messages are extremely well written so if a regression
happens to surface, we'll be able to track down what went awry.

Want to rebase and resend it?

--js

> ---
>  hw/ide/ahci.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
> index 4950d3575e..09a14408e3 100644
> --- a/hw/ide/ahci.c
> +++ b/hw/ide/ahci.c
> @@ -1011,7 +1011,6 @@ static void ncq_err(NCQTransferState *ncq_tfs)
>
>  ide_state->error = ABRT_ERR;
>  ide_state->status = READY_STAT | ERR_STAT;
> -ncq_tfs->drive->port_regs.scr_err |= (1 << ncq_tfs->tag);
> -qemu_sglist_destroy(&ncq_tfs->sglist);
>  ncq_tfs->used = 0;
>  }
> @@ -1021,7 +1020,7 @@ static void ncq_finish(NCQTransferState *ncq_tfs)
>  /* If we didn't error out, set our finished bit. Errored commands
>   * do not get a bit set for the SDB FIS ACT register, nor do they
>   * clear the outstanding bit in scr_act (PxSACT). */
> -if (!(ncq_tfs->drive->port_regs.scr_err & (1 << ncq_tfs->tag))) {
> +if (ncq_tfs->used) {
>  ncq_tfs->drive->finished |= (1 << ncq_tfs->tag);
>  }
>
> --
> 2.40.0
>




Re: [PATCH 7/9] hw/ide/ahci: trigger either error IRQ or regular IRQ, not both

2023-05-17 Thread John Snow
On Fri, Apr 28, 2023 at 9:23 AM Niklas Cassel  wrote:
>
> From: Niklas Cassel 
>
> According to AHCI 1.3.1, 5.3.8.1 RegFIS:Entry, if ERR_STAT is set,
> we jump to state ERR:FatalTaskfile, which will raise a TFES IRQ
> unconditionally, regardless if the I bit is set in the FIS or not.
>
> Thus, we should never raise a normal IRQ after having sent an error
> IRQ.
>
> Signed-off-by: Niklas Cassel 

ACK - and thanks for the spec pointers.

> ---
>  hw/ide/ahci.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
> index 2a59d0e0f5..d88961b4c0 100644
> --- a/hw/ide/ahci.c
> +++ b/hw/ide/ahci.c
> @@ -891,11 +891,10 @@ static bool ahci_write_fis_d2h(AHCIDevice *ad, bool 
> d2h_fis_i)
>  pr->tfdata = (ad->port.ifs[0].error << 8) |
>  ad->port.ifs[0].status;
>
> +/* TFES IRQ is always raised if ERR_STAT is set, regardless of I bit. */
>  if (d2h_fis[2] & ERR_STAT) {
>  ahci_trigger_irq(ad->hba, ad, AHCI_PORT_IRQ_BIT_TFES);
> -}
> -
> -if (d2h_fis_i) {
> +} else if (d2h_fis_i) {
>  ahci_trigger_irq(ad->hba, ad, AHCI_PORT_IRQ_BIT_DHRS);
>  }
>
> --
> 2.40.0
>




Re: [PATCH 6/9] hw/ide/ahci: PxSACT and PxCI is cleared when PxCMD.ST is cleared

2023-05-17 Thread John Snow
On Fri, Apr 28, 2023 at 9:23 AM Niklas Cassel  wrote:
>
> From: Niklas Cassel 
>
> According to AHCI 1.3.1 definition of PxSACT:
> This field is cleared when PxCMD.ST is written from a '1' to a '0' by
> software. This field is not cleared by a COMRESET or a software reset.
>
> According to AHCI 1.3.1 definition of PxCI:
> This field is also cleared when PxCMD.ST is written from a '1' to a '0'
> by software.
>
> Clearing PxCMD.ST is part of the error recovery procedure, see
> AHCI 1.3.1, section "6.2 Error Recovery".
>
> If we don't clear PxCI on error recovery, the previous command will
> incorrectly still be marked as pending after error recovery.
>
> Signed-off-by: Niklas Cassel 

ACK.

> ---
>  hw/ide/ahci.c | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
> index 366929132b..2a59d0e0f5 100644
> --- a/hw/ide/ahci.c
> +++ b/hw/ide/ahci.c
> @@ -329,6 +329,11 @@ static void ahci_port_write(AHCIState *s, int port, int 
> offset, uint32_t val)
>  ahci_check_irq(s);
>  break;
>  case AHCI_PORT_REG_CMD:
> +if ((pr->cmd & PORT_CMD_START) && !(val & PORT_CMD_START)) {
> +pr->scr_act = 0;
> +pr->cmd_issue = 0;
> +}
> +
>  /* Block any Read-only fields from being set;
>   * including LIST_ON and FIS_ON.
>   * The spec requires to set ICC bits to zero after the ICC change
> --
> 2.40.0
>




Re: [PATCH 3/9] hw/ide/ahci: write D2H FIS on when processing NCQ command

2023-05-17 Thread John Snow
On Fri, Apr 28, 2023 at 9:22 AM Niklas Cassel  wrote:
>
> From: Niklas Cassel 
>
> The way that BUSY + PxCI is cleared for NCQ (FPDMA QUEUED) commands is
> described in SATA 3.5a Gold:
>
> 11.15 FPDMA QUEUED command protocol
> DFPDMAQ2: ClearInterfaceBsy
> "Transmit Register Device to Host FIS with the BSY bit cleared to zero
> and the DRQ bit cleared to zero and Interrupt bit cleared to zero to
> mark interface ready for the next command."
>
> PxCI is currently cleared by handle_cmd(), but we don't write the D2H
> FIS to the FIS Receive Area that actually caused PxCI to be cleared.
>
> Similar to how ahci_pio_transfer() calls ahci_write_fis_pio() with an
> additional parameter to write a PIO Setup FIS without raising an IRQ,
> add a parameter to ahci_write_fis_d2h() so that ahci_write_fis_d2h()
> also can write the FIS to the FIS Receive Area without raising an IRQ.
>
> Change process_ncq_command() to call ahci_write_fis_d2h() without
> raising an IRQ (similar to ahci_pio_transfer()), such that the FIS
> Receive Area is in sync with the PxTFD shadow register.
>
> E.g. Linux reads status and error fields from the FIS Receive Area
> directly, so it is wise to keep the FIS Receive Area and the PxTFD
> shadow register in sync.

I think for some time I wondered if this mattered, because I wasn't
sure when the guest CPU would actually regain control to check an
intermediate state in the memory area before we wrote the next FIS.
But, trusting your quoted blurb, I think this is more obviously
correct.

ACK

(Although, there seems to be a conflict on latest origin/master - can
you rebase, please?)

>
> Signed-off-by: Niklas Cassel 
> ---
>  hw/ide/ahci.c | 17 +++--
>  1 file changed, 11 insertions(+), 6 deletions(-)
>
> diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
> index a36e3fb77c..62aebc8de7 100644
> --- a/hw/ide/ahci.c
> +++ b/hw/ide/ahci.c
> @@ -43,7 +43,7 @@
>  static void check_cmd(AHCIState *s, int port);
>  static int handle_cmd(AHCIState *s, int port, uint8_t slot);
>  static void ahci_reset_port(AHCIState *s, int port);
> -static bool ahci_write_fis_d2h(AHCIDevice *ad);
> +static bool ahci_write_fis_d2h(AHCIDevice *ad, bool d2h_fis_i);
>  static void ahci_init_d2h(AHCIDevice *ad);
>  static int ahci_dma_prepare_buf(const IDEDMA *dma, int32_t limit);
>  static bool ahci_map_clb_address(AHCIDevice *ad);
> @@ -618,7 +618,7 @@ static void ahci_init_d2h(AHCIDevice *ad)
>  return;
>  }
>
> -if (ahci_write_fis_d2h(ad)) {
> +if (ahci_write_fis_d2h(ad, true)) {
>  ad->init_d2h_sent = true;
>  /* We're emulating receiving the first Reg H2D Fis from the device;
>   * Update the SIG register, but otherwise proceed as normal. */
> @@ -850,7 +850,7 @@ static void ahci_write_fis_pio(AHCIDevice *ad, uint16_t 
> len, bool pio_fis_i)
>  }
>  }
>
> -static bool ahci_write_fis_d2h(AHCIDevice *ad)
> +static bool ahci_write_fis_d2h(AHCIDevice *ad, bool d2h_fis_i)
>  {
>  AHCIPortRegs *pr = >port_regs;
>  uint8_t *d2h_fis;
> @@ -864,7 +864,7 @@ static bool ahci_write_fis_d2h(AHCIDevice *ad)
>  d2h_fis = >res_fis[RES_FIS_RFIS];
>
>  d2h_fis[0] = SATA_FIS_TYPE_REGISTER_D2H;
> -d2h_fis[1] = (1 << 6); /* interrupt bit */
> +d2h_fis[1] = d2h_fis_i ? (1 << 6) : 0; /* interrupt bit */
>  d2h_fis[2] = s->status;
>  d2h_fis[3] = s->error;
>
> @@ -890,7 +890,10 @@ static bool ahci_write_fis_d2h(AHCIDevice *ad)
>  ahci_trigger_irq(ad->hba, ad, AHCI_PORT_IRQ_BIT_TFES);
>  }
>
> -ahci_trigger_irq(ad->hba, ad, AHCI_PORT_IRQ_BIT_DHRS);
> +if (d2h_fis_i) {
> +ahci_trigger_irq(ad->hba, ad, AHCI_PORT_IRQ_BIT_DHRS);
> +}
> +
>  return true;
>  }
>
> @@ -1120,6 +1123,8 @@ static void process_ncq_command(AHCIState *s, int port, 
> const uint8_t *cmd_fis,
>  return;
>  }
>
> +ahci_write_fis_d2h(ad, false);
> +
>  ncq_tfs->used = 1;
>  ncq_tfs->drive = ad;
>  ncq_tfs->slot = slot;
> @@ -1506,7 +1511,7 @@ static void ahci_cmd_done(const IDEDMA *dma)
>  }
>
>  /* update d2h status */
> -ahci_write_fis_d2h(ad);
> +ahci_write_fis_d2h(ad, true);
>
>  if (ad->port_regs.cmd_issue && !ad->check_bh) {
>  ad->check_bh = qemu_bh_new(ahci_check_cmd_bh, ad);
> --
> 2.40.0
>




Re: [PATCH 2/9] hw/ide/core: set ERR_STAT in unsupported command completion

2023-05-17 Thread John Snow
On Fri, Apr 28, 2023 at 9:22 AM Niklas Cassel  wrote:
>
> From: Niklas Cassel 
>
> Currently, the first time sending an unsupported command
> (e.g. READ LOG DMA EXT) will not have ERR_STAT set in the completion.
> Sending the unsupported command again, will correctly have ERR_STAT set.
>
> When ide_cmd_permitted() returns false, it calls ide_abort_command().
> ide_abort_command() first calls ide_transfer_stop(), which will call
> ide_transfer_halt() and ide_cmd_done(), after that ide_abort_command()
> sets ERR_STAT in status.
>
> ide_cmd_done() for AHCI will call ahci_write_fis_d2h() which writes the
> current status in the FIS, and raises an IRQ. (The status here will not
> have ERR_STAT set!).
>
> Thus, we cannot call ide_transfer_stop() before setting ERR_STAT, as
> ide_transfer_stop() will result in the FIS being written and an IRQ
> being raised.
>
> The reason why it works the second time, is that ERR_STAT will still
> be set from the previous command, so when writing the FIS, the
> completion will correctly have ERR_STAT set.
>
> Set ERR_STAT before writing the FIS (calling cmd_done), so that we will
> raise an error IRQ correctly when receiving an unsupported command.
>
> Signed-off-by: Niklas Cassel 
> ---
>  hw/ide/core.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/hw/ide/core.c b/hw/ide/core.c
> index 45d14a25e9..c144d1155d 100644
> --- a/hw/ide/core.c
> +++ b/hw/ide/core.c
> @@ -531,9 +531,9 @@ BlockAIOCB *ide_issue_trim(
>
>  void ide_abort_command(IDEState *s)
>  {
> -ide_transfer_stop(s);
>  s->status = READY_STAT | ERR_STAT;
>  s->error = ABRT_ERR;
> +ide_transfer_stop(s);
>  }
>
>  static void ide_set_retry(IDEState *s)
> --
> 2.40.0
>

Seems OK at a glance. Does this change the behavior of
ide_transfer_stop at all? I guess we've been using this order of
operations since 2008 at least. I didn't know C then.

ACK




Re: [PATCH] migration: for snapshots, hold the BQL during setup callbacks

2023-05-17 Thread Peter Xu
On Wed, May 10, 2023 at 08:31:13AM +0200, Juan Quintela wrote:
> Peter Xu  wrote:
> 
> Hi
> 
> [Adding Kevin to the party]
> 
> > On Fri, May 05, 2023 at 03:46:52PM +0200, Fiona Ebner wrote:
> >> To fix it, ensure that the BQL is held during setup. To avoid changing
> >> the behavior for migration too, introduce conditionals for the setup
> >> callbacks that need the BQL and only take the lock if it's not already
> >> held.
> >
> > The major complexity of this patch is the "conditionally taking" part.
> 
> Yeap.
> 
> I don't want that bit.
> 
> This is another case of:
> - I have a problem
> - I will use recursive mutexes to solve it
> 
> Now you have two problems O:-)
> 
> > Pure question: what is the benefit of not holding BQL always during
> > save_setup(), if after all we have this coroutine issue to be solved?
> 
> Dunno.
> 
> I would like that paolo commented on this.  I "reviewed the code" 10
> years ago.  I don't remember at all why we wanted to change that.
> 
> > I can understand that it helps us to avoid taking BQL too long, but we'll
> > need to take it anyway during ramblock dirty track initializations, and so
> > far IIUC it's the major time to be consumed during setup().
> >
> > Commit message of 9b0950375277467 says, "Only the migration_bitmap_sync()
> > call needs the iothread lock". Firstly I think it's also covering
> > enablement of dirty tracking:
> >
> > +qemu_mutex_lock_iothread();
> > +qemu_mutex_lock_ramlist();
> > +bytes_transferred = 0;
> > +reset_ram_globals();
> > +
> >  memory_global_dirty_log_start();
> >  migration_bitmap_sync();
> > +qemu_mutex_unlock_iothread();
> >
> > And I think enablement itself can be slow too, maybe even slower than
> > migration_bitmap_sync() especially with KVM_DIRTY_LOG_INITIALLY_SET
> > supported in the kernel.
> >
> > Meanwhile I always got confused on why we need to sync dirty bitmap when
> > setup at all.  Say, what if we drop migration_bitmap_sync() here?  After
> > all, shouldn't all pages be dirty from start (ram_list_init_bitmaps())?
> 
> How do you convince KVM (or the other lists) to start doing dirty
> tracking?  Doing a bitmap sync.

I think memory_global_dirty_log_start() kicks off the tracking already.

Take KVM as example, normally the case is KVM_MEM_LOG_DIRTY_PAGES is set
there, then ioctl(KVM_SET_USER_MEMORY_REGION) will start dirty tracking for
all of the guest memory slots necessary (including wr-protect all pages).

KVM_DIRTY_LOG_INITIALLY_SET is slightly special, it'll skip that procedure
during ioctl(KVM_SET_USER_MEMORY_REGION), but that also means the kernel
assumed the userapp (QEMU) marked all pages dirty initially (always the
case for QEMU, I think..).  Hence in this case the sync doesn't help either
because we'll simply get no new dirty bits in this shot..

> 
> And yeap, probably there is a better way of doing it.
> 
> > This is slightly off-topic, but I'd like to know if someone can help
> > answer.
> >
> > My whole point is still questioning whether we can unconditionally take bql
> > during save_setup().
> 
> I agree with you.
> 
> > I could have missed something, though, where we want to do in setup() but
> > avoid holding BQL.  Help needed on figuring this out (and if there is, IMHO
> > it'll be worthwhile to put that into comment of save_setup() hook).
> 
> I am more towards revert completely
> 9b0950375277467fd74a9075624477ae43b9bb22
> 
> and call it a day.  On migration we don't use coroutines on the sending
> side (I mean the migration code, the block layer uses coroutines for
> everything/anything).
> 
> Paolo, Stefan any clues for not run setup with the BKL?
> 
> Later, Juan.
> 

-- 
Peter Xu




Re: [PULL 18/18] tested: add test for nested aio_poll() in poll handlers

2023-05-17 Thread Richard Henderson

On 5/17/23 09:51, Kevin Wolf wrote:

From: Stefan Hajnoczi 

Signed-off-by: Stefan Hajnoczi 
Message-Id: <20230502184134.534703-3-stefa...@redhat.com>
Tested-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
  tests/unit/test-nested-aio-poll.c | 130 ++
  tests/unit/meson.build|   1 +
  2 files changed, 131 insertions(+)
  create mode 100644 tests/unit/test-nested-aio-poll.c


This new test fails on windows:

https://gitlab.com/qemu-project/qemu/-/jobs/4304413315#L3375
https://gitlab.com/qemu-project/qemu/-/jobs/4304413313#L3357


r~



Re: [PATCH 1/9] hw/ide/ahci: remove stray backslash

2023-05-17 Thread John Snow
On Fri, Apr 28, 2023 at 9:22 AM Niklas Cassel  wrote:
>
> From: Niklas Cassel 
>
> This backslash obviously does not belong here, so remove it.

Reviewed-by: John Snow 

>
> Signed-off-by: Niklas Cassel 
> ---
>  hw/ide/ahci.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
> index 55902e1df7..a36e3fb77c 100644
> --- a/hw/ide/ahci.c
> +++ b/hw/ide/ahci.c
> @@ -690,7 +690,7 @@ static void ahci_reset_port(AHCIState *s, int port)
>
>  s->dev[port].port_state = STATE_RUN;
>  if (ide_state->drive_kind == IDE_CD) {
> -ahci_set_signature(d, SATA_SIGNATURE_CDROM);\
> +ahci_set_signature(d, SATA_SIGNATURE_CDROM);
>  ide_state->status = SEEK_STAT | WRERR_STAT | READY_STAT;
>  } else {
>  ahci_set_signature(d, SATA_SIGNATURE_DISK);
> --
> 2.40.0
>




Re: [PATCH 0/9] misc AHCI cleanups

2023-05-17 Thread John Snow
On Fri, Apr 28, 2023 at 9:22 AM Niklas Cassel  wrote:
>
> From: Niklas Cassel 
>
> Hello John,
>

Hi Niklas!

I haven't been actively involved with AHCI for a while, so I am not
sure I can find the time to give this a deep scrub. I'm going to
assume based on '@wdc.com` that you probably know a thing or two more
about AHCI than I do, though. Can you tell me what testing you've
performed on this? As long as it doesn't cause any obvious
regressions, we might be able to push it through, but it might not be
up to me anymore. I can give it a review on technical merit, but with
regards to "correctness" I have to admit I am flying blind.

(I haven't looked at the patches yet, but if in your commit messages
you can point to the relevant sections of the relevant specifications,
that'd help immensely.)

> Here comes some misc AHCI cleanups.
>
> Most are related to error handling.

I've always found this the most difficult part to verify. In a real
system, the registers between AHCI and the actual hard disk are
*physically separate*, and they update at specific times based on the
transmission of the FIS packets. The model in QEMU doesn't bother with
a perfect reproduction of that, and so it's been a little tough to
verify correctness. I tried to improve it a bit back in the day, but
my understanding has always been a bit limited :)

Are there any vendor tools you're aware of that have test suites we
could use to verify behavior? Our AHCI tests are very rudimentary - I
wrote them straight out of undergrad as my first project at RH - and
likely gloss over and misunderstand many things about the
ATA/SATA/AHCI specifications.

>
> Please review.
>
> (I'm also working on a second series which will add support for
> READ LOG EXT and READ LOG DMA EXT, but I will send that one out
> once it is ready.)

Wow!

A question for you: is it worth solidifying which ATA specification we
conform to? I don't believe we adhere to any one specific model,
because a lot of the code is shared between PATA and SATA, and we "in
theory" support IDE hard drives for fairly old guest operating systems
that may or may not predate the DMA extensions. As a result, the
actual device emulation is kind of a mish-mash of different ATA
specifications, generally whichever version provided the
least-abstracted detail and was easy to implement.

If you're adding the logging features, that seems to push us towards
the newer end of the spectrum, but I'm not sure if this causes any
problems for guest operating systems doing probing to guess what kind
of device they're talking to.

Any input?

>
>
> Kind regards,
> Niklas
>
>
> Niklas Cassel (9):
>   hw/ide/ahci: remove stray backslash
>   hw/ide/core: set ERR_STAT in unsupported command completion
>   hw/ide/ahci: write D2H FIS on when processing NCQ command
>   hw/ide/ahci: simplify and document PxCI handling
>   hw/ide/ahci: PxCI should not get cleared when ERR_STAT is set
>   hw/ide/ahci: PxSACT and PxCI is cleared when PxCMD.ST is cleared
>   hw/ide/ahci: trigger either error IRQ or regular IRQ, not both
>   hw/ide/ahci: fix ahci_write_fis_sdb()
>   hw/ide/ahci: fix broken SError handling
>
>  hw/ide/ahci.c | 111 +++---
>  hw/ide/core.c |   2 +-
>  2 files changed, 80 insertions(+), 33 deletions(-)
>
> --
> 2.40.0
>




[PULL 16/18] iotests/245: Check if 'compress' driver is available

2023-05-17 Thread Kevin Wolf
Skip TestBlockdevReopen.test_insert_compress_filter() if the 'compress'
driver isn't available.

In order to make the test succeed when the case is skipped, we also need
to remove any output from it (which would be missing in the case where
we skip it). This is done by replacing qemu_io_log() with qemu_io(). In
case of failure, qemu_io() raises an exception with the output of the
qemu-io binary in its message, so we don't actually lose anything.

Signed-off-by: Kevin Wolf 
Message-Id: <20230511143801.255021-1-kw...@redhat.com>
Signed-off-by: Kevin Wolf 
---
 tests/qemu-iotests/245 | 7 ---
 tests/qemu-iotests/245.out | 9 +
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245
index edaf29094b..92b28c79be 100755
--- a/tests/qemu-iotests/245
+++ b/tests/qemu-iotests/245
@@ -611,6 +611,7 @@ class TestBlockdevReopen(iotests.QMPTestCase):
 self.reopen(hd0_opts, {'file': 'hd0-file'})
 
 # Insert (and remove) a compress filter
+@iotests.skip_if_unsupported(['compress'])
 def test_insert_compress_filter(self):
 # Add an image to the VM: hd (raw) -> hd0 (qcow2) -> hd0-file (file)
 opts = {'driver': 'raw', 'node-name': 'hd', 'file': hd_opts(0)}
@@ -650,9 +651,9 @@ class TestBlockdevReopen(iotests.QMPTestCase):
 
 # Check the first byte of the first three L2 entries and verify that
 # the second one is compressed (0x40) while the others are not (0x80)
-iotests.qemu_io_log('-f', 'raw', '-c', 'read -P 0x80 0x4 1',
- '-c', 'read -P 0x40 0x40008 1',
- '-c', 'read -P 0x80 0x40010 1', 
hd_path[0])
+iotests.qemu_io('-f', 'raw', '-c', 'read -P 0x80 0x4 1',
+ '-c', 'read -P 0x40 0x40008 1',
+ '-c', 'read -P 0x80 0x40010 1', 
hd_path[0])
 
 # Swap the disk images of two active block devices
 def test_swap_files(self):
diff --git a/tests/qemu-iotests/245.out b/tests/qemu-iotests/245.out
index a4e04a3266..0970ced62a 100644
--- a/tests/qemu-iotests/245.out
+++ b/tests/qemu-iotests/245.out
@@ -10,14 +10,7 @@
 {"return": {}}
 {"data": {"id": "stream0", "type": "stream"}, "event": "BLOCK_JOB_PENDING", 
"timestamp": {"microseconds": "USECS", "seconds": "SECS"}}
 {"data": {"device": "stream0", "len": 3145728, "offset": 3145728, "speed": 0, 
"type": "stream"}, "event": "BLOCK_JOB_COMPLETED", "timestamp": 
{"microseconds": "USECS", "seconds": "SECS"}}
-read 1/1 bytes at offset 262144
-1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read 1/1 bytes at offset 262152
-1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-read 1/1 bytes at offset 262160
-1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-
-
+
 --
 Ran 26 tests
 
-- 
2.40.1




[PULL 11/18] qemu-img: Take graph lock more selectively

2023-05-17 Thread Kevin Wolf
If we take a reader lock, we can't call any functions that take a writer
lock internally without causing deadlocks once the reader lock is
actually enforced in the main thread, too. Take the reader lock only
where it is actually needed.

Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-5-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 qemu-img.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 9f9f0a7629..27f48051b0 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2938,8 +2938,6 @@ static BlockGraphInfoList *collect_image_info_list(bool 
image_opts,
 }
 bs = blk_bs(blk);
 
-GRAPH_RDLOCK_GUARD_MAINLOOP();
-
 /*
  * Note that the returned BlockGraphInfo object will not have
  * information about this image's backing node, because we have opened
@@ -2947,7 +2945,10 @@ static BlockGraphInfoList *collect_image_info_list(bool 
image_opts,
  * duplicate the backing chain information that we obtain by walking
  * the chain manually here.
  */
+bdrv_graph_rdlock_main_loop();
bdrv_query_block_graph_info(bs, &info, &err);
+bdrv_graph_rdunlock_main_loop();
+
 if (err) {
 error_report_err(err);
 blk_unref(blk);
-- 
2.40.1




[PULL 12/18] test-bdrv-drain: Take graph lock more selectively

2023-05-17 Thread Kevin Wolf
If we take a reader lock, we can't call any functions that take a writer
lock internally without causing deadlocks once the reader lock is
actually enforced in the main thread, too. Take the reader lock only
where it is actually needed.

Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-6-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 tests/unit/test-bdrv-drain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index 9a4c5e59d6..ae4299ccfa 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -1004,8 +1004,6 @@ static void coroutine_fn test_co_delete_by_drain(void 
*opaque)
 void *buffer = g_malloc(65536);
 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buffer, 65536);
 
-GRAPH_RDLOCK_GUARD();
-
 /* Pretend some internal write operation from parent to child.
  * Important: We have to read from the child, not from the parent!
  * Draining works by first propagating it all up the tree to the
@@ -1014,7 +1012,9 @@ static void coroutine_fn test_co_delete_by_drain(void 
*opaque)
  * everything will be drained before we go back down the tree, but
  * we do not want that.  We want to be in the middle of draining
  * when this following requests returns. */
+bdrv_graph_co_rdlock();
bdrv_co_preadv(tts->wait_child, 0, 65536, &qiov, 0);
+bdrv_graph_co_rdunlock();
 
 g_assert_cmpint(bs->refcnt, ==, 1);
 
-- 
2.40.1




[PULL 06/18] blockdev: qmp_transaction: drop extra generic layer

2023-05-17 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

Let's simplify things:

First, actions generally don't need access to common BlkActionState
structure. The only exclusion are backup actions that need
block_job_txn.

Next, for transaction actions of Transaction API is more native to
allocated state structure in the action itself.

So, do the following transformation:

1. Let all actions be represented by a function with corresponding
   structure as arguments.

2. Instead of array-map marshaller, let's make a function, that calls
   corresponding action directly.

3. BlkActionOps and BlkActionState structures become unused

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20230510150624.310640-7-vsement...@yandex-team.ru>
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 blockdev.c | 265 +
 1 file changed, 85 insertions(+), 180 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 4bf15566b2..5d56b79df4 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1188,54 +1188,8 @@ out_aio_context:
 return NULL;
 }
 
-/* New and old BlockDriverState structs for atomic group operations */
-
-typedef struct BlkActionState BlkActionState;
-
-/**
- * BlkActionOps:
- * Table of operations that define an Action.
- *
- * @instance_size: Size of state struct, in bytes.
- * @prepare: Prepare the work, must NOT be NULL.
- * @commit: Commit the changes, can be NULL.
- * @abort: Abort the changes on fail, can be NULL.
- * @clean: Clean up resources after all transaction actions have called
- * commit() or abort(). Can be NULL.
- *
- * Only prepare() may fail. In a single transaction, only one of commit() or
- * abort() will be called. clean() will always be called if it is present.
- *
- * Always run under BQL.
- */
-typedef struct BlkActionOps {
-size_t instance_size;
-void (*action)(BlkActionState *common, Transaction *tran, Error **errp);
-} BlkActionOps;
-
-/**
- * BlkActionState:
- * Describes one Action's state within a Transaction.
- *
- * @action: QAPI-defined enum identifying which Action to perform.
- * @ops: Table of ActionOps this Action can perform.
- * @block_job_txn: Transaction which this action belongs to.
- * @entry: List membership for all Actions in this Transaction.
- *
- * This structure must be arranged as first member in a subclassed type,
- * assuming that the compiler will also arrange it to the same offsets as the
- * base class.
- */
-struct BlkActionState {
-TransactionAction *action;
-const BlkActionOps *ops;
-JobTxn *block_job_txn;
-QTAILQ_ENTRY(BlkActionState) entry;
-};
-
 /* internal snapshot private data */
 typedef struct InternalSnapshotState {
-BlkActionState common;
 BlockDriverState *bs;
 QEMUSnapshotInfo sn;
 bool created;
@@ -1248,7 +1202,7 @@ TransactionActionDrv internal_snapshot_drv = {
 .clean = internal_snapshot_clean,
 };
 
-static void internal_snapshot_action(BlkActionState *common,
+static void internal_snapshot_action(BlockdevSnapshotInternal *internal,
  Transaction *tran, Error **errp)
 {
 Error *local_err = NULL;
@@ -1258,16 +1212,10 @@ static void internal_snapshot_action(BlkActionState 
*common,
 QEMUSnapshotInfo old_sn, *sn;
 bool ret;
 int64_t rt;
-BlockdevSnapshotInternal *internal;
-InternalSnapshotState *state;
+InternalSnapshotState *state = g_new0(InternalSnapshotState, 1);
 AioContext *aio_context;
 int ret1;
 
-g_assert(common->action->type ==
- TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC);
-internal = common->action->u.blockdev_snapshot_internal_sync.data;
-state = DO_UPCAST(InternalSnapshotState, common, common);
-
tran_add(tran, &internal_snapshot_drv, state);
 
 device = internal->device;
@@ -1393,7 +1341,6 @@ static void internal_snapshot_clean(void *opaque)
 
 /* external snapshot private data */
 typedef struct ExternalSnapshotState {
-BlkActionState common;
 BlockDriverState *old_bs;
 BlockDriverState *new_bs;
 bool overlay_appended;
@@ -1408,8 +1355,8 @@ TransactionActionDrv external_snapshot_drv = {
 .clean = external_snapshot_clean,
 };
 
-static void external_snapshot_action(BlkActionState *common, Transaction *tran,
- Error **errp)
+static void external_snapshot_action(TransactionAction *action,
+ Transaction *tran, Error **errp)
 {
 int ret;
 int flags = 0;
@@ -1422,9 +1369,7 @@ static void external_snapshot_action(BlkActionState 
*common, Transaction *tran,
 const char *snapshot_ref;
 /* File name of the new image (for 'blockdev-snapshot-sync') */
 const char *new_image_file;
-ExternalSnapshotState *state =
- DO_UPCAST(ExternalSnapshotState, common, common);
-TransactionAction *action = common->action;
+ExternalSnapshotState *state = g_new0(ExternalSnapshotState, 1);
 AioContext 

[PULL 08/18] block: Call .bdrv_co_create(_opts) unlocked

2023-05-17 Thread Kevin Wolf
These are functions that modify the graph, so they must be able to take
a writer lock. This is impossible if they already hold the reader lock.
If they need a reader lock for some of their operations, they should
take it internally.

Many of them go through blk_*(), which will always take the lock itself.
Direct calls of bdrv_*() need to take the reader lock. Note that while
locking for bdrv_co_*() calls is checked by TSA, this is not the case
for the mixed_coroutine_fns bdrv_*(). Holding the lock is still required
when they are called from coroutine context like here!

This effectively reverts 4ec8df0183, but adds some internal locking
instead.

Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-2-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 include/block/block-global-state.h |  8 +++
 include/block/block_int-common.h   |  4 ++--
 block.c|  1 -
 block/create.c |  1 -
 block/crypto.c | 25 ++--
 block/parallels.c  |  6 ++---
 block/qcow.c   |  6 ++---
 block/qcow2.c  | 37 +++---
 block/qed.c|  6 ++---
 block/raw-format.c |  2 +-
 block/vdi.c| 11 +
 block/vhdx.c   |  8 ---
 block/vmdk.c   | 27 ++
 block/vpc.c|  6 ++---
 14 files changed, 78 insertions(+), 70 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index 2d93423d35..f347199bff 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -58,14 +58,14 @@ BlockDriver *bdrv_find_protocol(const char *filename,
 Error **errp);
 BlockDriver *bdrv_find_format(const char *format_name);
 
-int coroutine_fn GRAPH_RDLOCK
+int coroutine_fn GRAPH_UNLOCKED
 bdrv_co_create(BlockDriver *drv, const char *filename, QemuOpts *opts,
Error **errp);
 
-int co_wrapper_bdrv_rdlock bdrv_create(BlockDriver *drv, const char *filename,
-   QemuOpts *opts, Error **errp);
+int co_wrapper bdrv_create(BlockDriver *drv, const char *filename,
+   QemuOpts *opts, Error **errp);
 
-int coroutine_fn GRAPH_RDLOCK
+int coroutine_fn GRAPH_UNLOCKED
 bdrv_co_create_file(const char *filename, QemuOpts *opts, Error **errp);
 
 BlockDriverState *bdrv_new(void);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index dbec0e3bb4..6492a1e538 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -250,10 +250,10 @@ struct BlockDriver {
 BlockDriverState *bs, QDict *options, int flags, Error **errp);
 void (*bdrv_close)(BlockDriverState *bs);
 
-int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_create)(
+int coroutine_fn GRAPH_UNLOCKED_PTR (*bdrv_co_create)(
 BlockdevCreateOptions *opts, Error **errp);
 
-int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_create_opts)(
+int coroutine_fn GRAPH_UNLOCKED_PTR (*bdrv_co_create_opts)(
 BlockDriver *drv, const char *filename, QemuOpts *opts, Error **errp);
 
 int (*bdrv_amend_options)(BlockDriverState *bs,
diff --git a/block.c b/block.c
index f04a6ad4e8..a2f8d5a0c0 100644
--- a/block.c
+++ b/block.c
@@ -533,7 +533,6 @@ int coroutine_fn bdrv_co_create(BlockDriver *drv, const 
char *filename,
 int ret;
 GLOBAL_STATE_CODE();
 ERRP_GUARD();
-assert_bdrv_graph_readable();
 
 if (!drv->bdrv_co_create_opts) {
 error_setg(errp, "Driver '%s' does not support image creation",
diff --git a/block/create.c b/block/create.c
index bf67b9947c..6b23a21675 100644
--- a/block/create.c
+++ b/block/create.c
@@ -43,7 +43,6 @@ static int coroutine_fn blockdev_create_run(Job *job, Error 
**errp)
 int ret;
 
 GLOBAL_STATE_CODE();
-GRAPH_RDLOCK_GUARD();
 
job_progress_set_remaining(&s->common, 1);
 ret = s->drv->bdrv_co_create(s->opts, errp);
diff --git a/block/crypto.c b/block/crypto.c
index 30093cff9b..6ee8d46d30 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -99,12 +99,10 @@ struct BlockCryptoCreateData {
 };
 
 
-static int block_crypto_create_write_func(QCryptoBlock *block,
-  size_t offset,
-  const uint8_t *buf,
-  size_t buflen,
-  void *opaque,
-  Error **errp)
+static int coroutine_fn GRAPH_UNLOCKED
+block_crypto_create_write_func(QCryptoBlock *block, size_t offset,
+   const uint8_t *buf, size_t buflen, void *opaque,
+   Error **errp)
 {
 struct BlockCryptoCreateData *data = opaque;
 ssize_t ret;
@@ -117,10 +115,9 

[PULL 17/18] aio-posix: do not nest poll handlers

2023-05-17 Thread Kevin Wolf
From: Stefan Hajnoczi 

QEMU's event loop supports nesting, which means that event handler
functions may themselves call aio_poll(). The condition that triggered a
handler must be reset before the nested aio_poll() call, otherwise the
same handler will be called and immediately re-enter aio_poll. This
leads to an infinite loop and stack exhaustion.

Poll handlers are especially prone to this issue, because they typically
reset their condition by finishing the processing of pending work.
Unfortunately it is during the processing of pending work that nested
aio_poll() calls typically occur and the condition has not yet been
reset.

Disable a poll handler during ->io_poll_ready() so that a nested
aio_poll() call cannot invoke ->io_poll_ready() again. As a result, the
disabled poll handler and its associated fd handler do not run during
the nested aio_poll(). Calling aio_set_fd_handler() from inside nested
aio_poll() could cause it to run again. If the fd handler is pending
inside nested aio_poll(), then it will also run again.

In theory fd handlers can be affected by the same issue, but they are
more likely to reset the condition before calling nested aio_poll().

This is a special case and it's somewhat complex, but I don't see a way
around it as long as nested aio_poll() is supported.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2186181
Fixes: c38270692593 ("block: Mark bdrv_co_io_(un)plug() and callers 
GRAPH_RDLOCK")
Cc: Kevin Wolf 
Cc: Emanuele Giuseppe Esposito 
Cc: Paolo Bonzini 
Signed-off-by: Stefan Hajnoczi 
Message-Id: <20230502184134.534703-2-stefa...@redhat.com>
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 util/aio-posix.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index a8be940f76..34bc2a64d8 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -353,8 +353,19 @@ static bool aio_dispatch_handler(AioContext *ctx, 
AioHandler *node)
 poll_ready && revents == 0 &&
 aio_node_check(ctx, node->is_external) &&
 node->io_poll_ready) {
+/*
+ * Remove temporarily to avoid infinite loops when ->io_poll_ready()
+ * calls aio_poll() before clearing the condition that made the poll
+ * handler become ready.
+ */
+QLIST_SAFE_REMOVE(node, node_poll);
+
 node->io_poll_ready(node->opaque);
 
+if (!QLIST_IS_INSERTED(node, node_poll)) {
+QLIST_INSERT_HEAD(>poll_aio_handlers, node, node_poll);
+}
+
 /*
  * Return early since revents was zero. aio_notify() does not count as
  * progress.
-- 
2.40.1




[PULL 14/18] blockjob: Adhere to rate limit even when reentered early

2023-05-17 Thread Kevin Wolf
When jobs are sleeping, for example to enforce a given rate limit, they
can be reentered early, in particular in order to get paused, to update
the rate limit or to get cancelled.

Before this patch, they behave in this case as if they had fully
completed their rate limiting delay. This means that requests are sped
up beyond their limit, violating the constraints that the user gave us.

Change the block jobs to sleep in a loop until the necessary delay is
completed, while still allowing cancelling them immediately as well as
pausing (handled by the pause point in job_sleep_ns()) and updating the
rate limit.

This change is also motivated by iotests cases being prone to fail
because drain operations pause and unpause them so often that block jobs
complete earlier than they are supposed to. In particular, the next
commit would fail iotests 030 without this change.

Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-8-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 include/block/blockjob_int.h | 14 ++
 block/commit.c   |  7 ++-
 block/mirror.c   | 23 ++-
 block/stream.c   |  7 ++-
 blockjob.c   | 22 --
 5 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index f008446285..104824040c 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -126,12 +126,18 @@ void block_job_user_resume(Job *job);
  */
 
 /**
- * block_job_ratelimit_get_delay:
+ * block_job_ratelimit_processed_bytes:
  *
- * Calculate and return delay for the next request in ns. See the documentation
- * of ratelimit_calculate_delay() for details.
+ * To be called after some work has been done. Adjusts the delay for the next
+ * request. See the documentation of ratelimit_calculate_delay() for details.
  */
-int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n);
+void block_job_ratelimit_processed_bytes(BlockJob *job, uint64_t n);
+
+/**
+ * Put the job to sleep (assuming that it wasn't canceled) to throttle it to 
the
+ * right speed according to its rate limiting.
+ */
+void block_job_ratelimit_sleep(BlockJob *job);
 
 /**
  * block_job_error_action:
diff --git a/block/commit.c b/block/commit.c
index 2b20fd0fd4..aa45beb0f0 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -116,7 +116,6 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 {
 CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
 int64_t offset;
-uint64_t delay_ns = 0;
 int ret = 0;
 int64_t n = 0; /* bytes */
 QEMU_AUTO_VFREE void *buf = NULL;
@@ -149,7 +148,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 /* Note that even when no rate limit is applied we need to yield
  * with no pending I/O here so that bdrv_drain_all() returns.
  */
-job_sleep_ns(>common.job, delay_ns);
+block_job_ratelimit_sleep(>common);
 if (job_is_cancelled(>common.job)) {
 break;
 }
@@ -184,9 +183,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 job_progress_update(>common.job, n);
 
 if (copy) {
-delay_ns = block_job_ratelimit_get_delay(>common, n);
-} else {
-delay_ns = 0;
+block_job_ratelimit_processed_bytes(>common, n);
 }
 }
 
diff --git a/block/mirror.c b/block/mirror.c
index 717442ca4d..b7d92d1378 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -471,12 +471,11 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t 
offset,
 return bytes_handled;
 }
 
-static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
+static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
 BlockDriverState *source = s->mirror_top_bs->backing->bs;
 MirrorOp *pseudo_op;
 int64_t offset;
-uint64_t delay_ns = 0, ret = 0;
 /* At least the first dirty chunk is mirrored in one iteration. */
 int nb_chunks = 1;
 bool write_zeroes_ok = bdrv_can_write_zeroes_with_unmap(blk_bs(s->target));
@@ -608,16 +607,13 @@ static uint64_t coroutine_fn 
mirror_iteration(MirrorBlockJob *s)
 assert(io_bytes);
 offset += io_bytes;
 nb_chunks -= DIV_ROUND_UP(io_bytes, s->granularity);
-delay_ns = block_job_ratelimit_get_delay(>common, io_bytes_acct);
+block_job_ratelimit_processed_bytes(>common, io_bytes_acct);
 }
 
-ret = delay_ns;
 fail:
 QTAILQ_REMOVE(>ops_in_flight, pseudo_op, next);
 qemu_co_queue_restart_all(_op->waiting_requests);
 g_free(pseudo_op);
-
-return ret;
 }
 
 static void mirror_free_init(MirrorBlockJob *s)
@@ -1011,7 +1007,6 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
 assert(!s->dbi);
 s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap);
 for (;;) {
-uint64_t delay_ns = 0;
 int64_t cnt, delta;
 

[PULL 01/18] blockdev: refactor transaction to use Transaction API

2023-05-17 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

We are going to add more block-graph modifying transaction actions,
and block-graph modifying functions are already based on Transaction
API.

Next, we'll need to separately update permissions after several
graph-modifying actions, and this would be simple with help of
Transaction API.

So, now let's just transform what we have into new-style transaction
actions.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20230510150624.310640-2-vsement...@yandex-team.ru>
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 blockdev.c | 309 ++---
 1 file changed, 178 insertions(+), 131 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index d141ca7a2d..3c72d40f51 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1210,10 +1210,7 @@ typedef struct BlkActionState BlkActionState;
  */
 typedef struct BlkActionOps {
 size_t instance_size;
-void (*prepare)(BlkActionState *common, Error **errp);
-void (*commit)(BlkActionState *common);
-void (*abort)(BlkActionState *common);
-void (*clean)(BlkActionState *common);
+void (*action)(BlkActionState *common, Transaction *tran, Error **errp);
 } BlkActionOps;
 
 /**
@@ -1245,6 +1242,12 @@ typedef struct InternalSnapshotState {
 bool created;
 } InternalSnapshotState;
 
+static void internal_snapshot_abort(void *opaque);
+static void internal_snapshot_clean(void *opaque);
+TransactionActionDrv internal_snapshot_drv = {
+.abort = internal_snapshot_abort,
+.clean = internal_snapshot_clean,
+};
 
 static int action_check_completion_mode(BlkActionState *s, Error **errp)
 {
@@ -1259,8 +1262,8 @@ static int action_check_completion_mode(BlkActionState 
*s, Error **errp)
 return 0;
 }
 
-static void internal_snapshot_prepare(BlkActionState *common,
-  Error **errp)
+static void internal_snapshot_action(BlkActionState *common,
+ Transaction *tran, Error **errp)
 {
 Error *local_err = NULL;
 const char *device;
@@ -1279,6 +1282,8 @@ static void internal_snapshot_prepare(BlkActionState 
*common,
 internal = common->action->u.blockdev_snapshot_internal_sync.data;
 state = DO_UPCAST(InternalSnapshotState, common, common);
 
+tran_add(tran, _snapshot_drv, state);
+
 /* 1. parse input */
 device = internal->device;
 name = internal->name;
@@ -1363,10 +1368,9 @@ out:
 aio_context_release(aio_context);
 }
 
-static void internal_snapshot_abort(BlkActionState *common)
+static void internal_snapshot_abort(void *opaque)
 {
-InternalSnapshotState *state =
- DO_UPCAST(InternalSnapshotState, common, common);
+InternalSnapshotState *state = opaque;
 BlockDriverState *bs = state->bs;
 QEMUSnapshotInfo *sn = >sn;
 AioContext *aio_context;
@@ -1390,10 +1394,9 @@ static void internal_snapshot_abort(BlkActionState 
*common)
 aio_context_release(aio_context);
 }
 
-static void internal_snapshot_clean(BlkActionState *common)
+static void internal_snapshot_clean(void *opaque)
 {
-InternalSnapshotState *state = DO_UPCAST(InternalSnapshotState,
- common, common);
+g_autofree InternalSnapshotState *state = opaque;
 AioContext *aio_context;
 
 if (!state->bs) {
@@ -1416,8 +1419,17 @@ typedef struct ExternalSnapshotState {
 bool overlay_appended;
 } ExternalSnapshotState;
 
-static void external_snapshot_prepare(BlkActionState *common,
-  Error **errp)
+static void external_snapshot_commit(void *opaque);
+static void external_snapshot_abort(void *opaque);
+static void external_snapshot_clean(void *opaque);
+TransactionActionDrv external_snapshot_drv = {
+.commit = external_snapshot_commit,
+.abort = external_snapshot_abort,
+.clean = external_snapshot_clean,
+};
+
+static void external_snapshot_action(BlkActionState *common, Transaction *tran,
+ Error **errp)
 {
 int ret;
 int flags = 0;
@@ -1436,6 +1448,8 @@ static void external_snapshot_prepare(BlkActionState 
*common,
 AioContext *aio_context;
 uint64_t perm, shared;
 
+tran_add(tran, _snapshot_drv, state);
+
 /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar
  * purpose but a different set of parameters */
 switch (action->type) {
@@ -1588,10 +1602,9 @@ out:
 aio_context_release(aio_context);
 }
 
-static void external_snapshot_commit(BlkActionState *common)
+static void external_snapshot_commit(void *opaque)
 {
-ExternalSnapshotState *state =
- DO_UPCAST(ExternalSnapshotState, common, common);
+ExternalSnapshotState *state = opaque;
 AioContext *aio_context;
 
 aio_context = bdrv_get_aio_context(state->old_bs);
@@ -1607,10 +1620,9 @@ static void external_snapshot_commit(BlkActionState 
*common)
 aio_context_release(aio_context);
 

[PULL 09/18] block/export: Fix null pointer dereference in error path

2023-05-17 Thread Kevin Wolf
There are some error paths in blk_exp_add() that jump to 'fail:' before
'exp' is even created. So we can't just unconditionally access exp->blk.

Add a NULL check, and switch from exp->blk to blk, which is available
earlier, just to be extra sure that we really cover all cases where
BlockDevOps could have been set for it (in practice, this only happens
in drv->create() today, so this part of the change isn't strictly
necessary).

Fixes: Coverity CID 1509238
Fixes: de79b52604e43fdeba6cee4f5af600b62169f2d2
Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-3-kw...@redhat.com>
Reviewed-by: Eric Blake 
Tested-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 block/export/export.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/export/export.c b/block/export/export.c
index 62c7c22d45..a5c8f42f53 100644
--- a/block/export/export.c
+++ b/block/export/export.c
@@ -192,8 +192,10 @@ BlockExport *blk_exp_add(BlockExportOptions *export, Error 
**errp)
 return exp;
 
 fail:
-blk_set_dev_ops(exp->blk, NULL, NULL);
-blk_unref(blk);
+if (blk) {
+blk_set_dev_ops(blk, NULL, NULL);
+blk_unref(blk);
+}
 aio_context_release(ctx);
 if (exp) {
 g_free(exp->id);
-- 
2.40.1




[PULL 15/18] graph-lock: Honour read locks even in the main thread

2023-05-17 Thread Kevin Wolf
There are some conditions under which we don't actually need to do
anything for taking a reader lock: Writing the graph is only possible
from the main context while holding the BQL. So if a reader is running
in the main context under the BQL and knows that it won't be interrupted
until the next writer runs, we don't actually need to do anything.

This is the case if the reader code neither has a nested event loop
(this is forbidden anyway while you hold the lock) nor is a coroutine
(because a writer could run when the coroutine has yielded).

These conditions are exactly what bdrv_graph_rdlock_main_loop() asserts.
They are not fulfilled in bdrv_graph_co_rdlock(), which always runs in a
coroutine.

This deletes the shortcuts in bdrv_graph_co_rdlock() that skip taking
the reader lock in the main thread.

Reported-by: Fiona Ebner 
Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-9-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 block/graph-lock.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/block/graph-lock.c b/block/graph-lock.c
index 377884c3a9..9c42bd9799 100644
--- a/block/graph-lock.c
+++ b/block/graph-lock.c
@@ -162,11 +162,6 @@ void coroutine_fn bdrv_graph_co_rdlock(void)
 BdrvGraphRWlock *bdrv_graph;
 bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
 
-/* Do not lock if in main thread */
-if (qemu_in_main_thread()) {
-return;
-}
-
 for (;;) {
 qatomic_set(_graph->reader_count,
 bdrv_graph->reader_count + 1);
@@ -230,11 +225,6 @@ void coroutine_fn bdrv_graph_co_rdunlock(void)
 BdrvGraphRWlock *bdrv_graph;
 bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
 
-/* Do not lock if in main thread */
-if (qemu_in_main_thread()) {
-return;
-}
-
 qatomic_store_release(_graph->reader_count,
   bdrv_graph->reader_count - 1);
 /* make sure writer sees reader_count before we check has_writer */
-- 
2.40.1




[PULL 03/18] blockdev: qmp_transaction: refactor loop to classic for

2023-05-17 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Kevin Wolf 
Message-Id: <20230510150624.310640-4-vsement...@yandex-team.ru>
Signed-off-by: Kevin Wolf 
---
 blockdev.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 75e313f403..dd0e98bbe1 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2377,7 +2377,7 @@ void qmp_transaction(TransactionActionList *actions,
  struct TransactionProperties *properties,
  Error **errp)
 {
-TransactionActionList *act = actions;
+TransactionActionList *act;
 bool has_properties = !!properties;
 JobTxn *block_job_txn = NULL;
 Error *local_err = NULL;
@@ -2397,14 +2397,11 @@ void qmp_transaction(TransactionActionList *actions,
 bdrv_drain_all();
 
 /* We don't do anything in this loop that commits us to the operations */
-while (NULL != act) {
-TransactionAction *dev_info = NULL;
+for (act = actions; act; act = act->next) {
+TransactionAction *dev_info = act->value;
 const BlkActionOps *ops;
 BlkActionState *state;
 
-dev_info = act->value;
-act = act->next;
-
 assert(dev_info->type < ARRAY_SIZE(actions_map));
 
 ops = _map[dev_info->type];
-- 
2.40.1




[PULL 02/18] blockdev: transactions: rename some things

2023-05-17 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

Look at qmp_transaction(): dev_list is not an obvious name for a list of
actions. Looking at the QAPI spec, this argument is called "actions". Let's
follow the common practice of using the same argument names in the QAPI
scheme and in the code.

To be consistent, rename props to properties for the same reason.

Next, we have to rename global map of actions, to not conflict with new
name for function argument.

Also rename the dev_entry loop variable according to the new name of the
list.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Kevin Wolf 
Message-Id: <20230510150624.310640-3-vsement...@yandex-team.ru>
Signed-off-by: Kevin Wolf 
---
 blockdev.c | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 3c72d40f51..75e313f403 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2291,7 +2291,7 @@ static void abort_commit(void *opaque)
 g_assert_not_reached(); /* this action never succeeds */
 }
 
-static const BlkActionOps actions[] = {
+static const BlkActionOps actions_map[] = {
 [TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT] = {
 .instance_size = sizeof(ExternalSnapshotState),
 .action  = external_snapshot_action,
@@ -2373,12 +2373,12 @@ static TransactionProperties 
*get_transaction_properties(
  *
  * Always run under BQL.
  */
-void qmp_transaction(TransactionActionList *dev_list,
- struct TransactionProperties *props,
+void qmp_transaction(TransactionActionList *actions,
+ struct TransactionProperties *properties,
  Error **errp)
 {
-TransactionActionList *dev_entry = dev_list;
-bool has_props = !!props;
+TransactionActionList *act = actions;
+bool has_properties = !!properties;
 JobTxn *block_job_txn = NULL;
 Error *local_err = NULL;
 Transaction *tran = tran_new();
@@ -2388,8 +2388,8 @@ void qmp_transaction(TransactionActionList *dev_list,
 /* Does this transaction get canceled as a group on failure?
  * If not, we don't really need to make a JobTxn.
  */
-props = get_transaction_properties(props);
-if (props->completion_mode != ACTION_COMPLETION_MODE_INDIVIDUAL) {
+properties = get_transaction_properties(properties);
+if (properties->completion_mode != ACTION_COMPLETION_MODE_INDIVIDUAL) {
 block_job_txn = job_txn_new();
 }
 
@@ -2397,24 +2397,24 @@ void qmp_transaction(TransactionActionList *dev_list,
 bdrv_drain_all();
 
 /* We don't do anything in this loop that commits us to the operations */
-while (NULL != dev_entry) {
+while (NULL != act) {
 TransactionAction *dev_info = NULL;
 const BlkActionOps *ops;
 BlkActionState *state;
 
-dev_info = dev_entry->value;
-dev_entry = dev_entry->next;
+dev_info = act->value;
+act = act->next;
 
-assert(dev_info->type < ARRAY_SIZE(actions));
+assert(dev_info->type < ARRAY_SIZE(actions_map));
 
-ops = [dev_info->type];
+ops = _map[dev_info->type];
 assert(ops->instance_size > 0);
 
 state = g_malloc0(ops->instance_size);
 state->ops = ops;
 state->action = dev_info;
 state->block_job_txn = block_job_txn;
-state->txn_props = props;
+state->txn_props = properties;
 
 state->ops->action(state, tran, _err);
 if (local_err) {
@@ -2432,8 +2432,8 @@ delete_and_fail:
 /* failure, and it is all-or-none; roll back all operations */
 tran_abort(tran);
 exit:
-if (!has_props) {
-qapi_free_TransactionProperties(props);
+if (!has_properties) {
+qapi_free_TransactionProperties(properties);
 }
 job_txn_unref(block_job_txn);
 }
-- 
2.40.1




[PULL 10/18] qcow2: Unlock the graph in qcow2_do_open() where necessary

2023-05-17 Thread Kevin Wolf
qcow2_do_open() calls a few no_co_wrappers that wrap functions taking
the graph lock internally as a writer. Therefore, it can't hold the
reader lock across these calls, it causes deadlocks. Drop the lock
temporarily around the calls.

Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-4-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 block/qcow2.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 73f36447f9..b00b4e7575 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1619,9 +1619,11 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int 
flags,
 
 if (open_data_file) {
 /* Open external data file */
+bdrv_graph_co_rdunlock();
 s->data_file = bdrv_co_open_child(NULL, options, "data-file", bs,
   _of_bds, BDRV_CHILD_DATA,
   true, errp);
+bdrv_graph_co_rdlock();
 if (*errp) {
 ret = -EINVAL;
 goto fail;
@@ -1629,10 +1631,12 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int 
flags,
 
 if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
 if (!s->data_file && s->image_data_file) {
+bdrv_graph_co_rdunlock();
 s->data_file = bdrv_co_open_child(s->image_data_file, options,
   "data-file", bs,
   _of_bds,
   BDRV_CHILD_DATA, false, 
errp);
+bdrv_graph_co_rdlock();
 if (!s->data_file) {
 ret = -EINVAL;
 goto fail;
@@ -1857,7 +1861,9 @@ qcow2_do_open(BlockDriverState *bs, QDict *options, int 
flags,
  fail:
 g_free(s->image_data_file);
 if (open_data_file && has_data_file(bs)) {
+bdrv_graph_co_rdunlock();
 bdrv_unref_child(bs, s->data_file);
+bdrv_graph_co_rdlock();
 s->data_file = NULL;
 }
 g_free(s->unknown_header_fields);
-- 
2.40.1




[PULL 18/18] tests: add test for nested aio_poll() in poll handlers

2023-05-17 Thread Kevin Wolf
From: Stefan Hajnoczi 

Signed-off-by: Stefan Hajnoczi 
Message-Id: <20230502184134.534703-3-stefa...@redhat.com>
Tested-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 tests/unit/test-nested-aio-poll.c | 130 ++
 tests/unit/meson.build|   1 +
 2 files changed, 131 insertions(+)
 create mode 100644 tests/unit/test-nested-aio-poll.c

diff --git a/tests/unit/test-nested-aio-poll.c 
b/tests/unit/test-nested-aio-poll.c
new file mode 100644
index 00..9bbe18b839
--- /dev/null
+++ b/tests/unit/test-nested-aio-poll.c
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Test that poll handlers are not re-entrant in nested aio_poll()
+ *
+ * Copyright Red Hat
+ *
+ * Poll handlers are usually level-triggered. That means they continue firing
+ * until the condition is reset (e.g. a virtqueue becomes empty). If a poll
+ * handler calls nested aio_poll() before the condition is reset, then infinite
+ * recursion occurs.
+ *
+ * aio_poll() is supposed to prevent this by disabling poll handlers in nested
+ * aio_poll() calls. This test case checks that this is indeed what happens.
+ */
+#include "qemu/osdep.h"
+#include "block/aio.h"
+#include "qapi/error.h"
+
+typedef struct {
+AioContext *ctx;
+
+/* This is the EventNotifier that drives the test */
+EventNotifier poll_notifier;
+
+/* This EventNotifier is only used to wake aio_poll() */
+EventNotifier dummy_notifier;
+
+bool nested;
+} TestData;
+
+static void io_read(EventNotifier *notifier)
+{
+fprintf(stderr, "%s %p\n", __func__, notifier);
+event_notifier_test_and_clear(notifier);
+}
+
+static bool io_poll_true(void *opaque)
+{
+fprintf(stderr, "%s %p\n", __func__, opaque);
+return true;
+}
+
+static bool io_poll_false(void *opaque)
+{
+fprintf(stderr, "%s %p\n", __func__, opaque);
+return false;
+}
+
+static void io_poll_ready(EventNotifier *notifier)
+{
+TestData *td = container_of(notifier, TestData, poll_notifier);
+
+fprintf(stderr, "> %s\n", __func__);
+
+g_assert(!td->nested);
+td->nested = true;
+
+/* Wake the following nested aio_poll() call */
+event_notifier_set(>dummy_notifier);
+
+/* This nested event loop must not call io_poll()/io_poll_ready() */
+g_assert(aio_poll(td->ctx, true));
+
+td->nested = false;
+
+fprintf(stderr, "< %s\n", __func__);
+}
+
+/* dummy_notifier never triggers */
+static void io_poll_never_ready(EventNotifier *notifier)
+{
+g_assert_not_reached();
+}
+
+static void test(void)
+{
+TestData td = {
+.ctx = aio_context_new(_abort),
+};
+
+qemu_set_current_aio_context(td.ctx);
+
+/* Enable polling */
+aio_context_set_poll_params(td.ctx, 100, 2, 2, _abort);
+
+/*
+ * The GSource is unused but this has the side-effect of changing the fdmon
+ * that AioContext uses.
+ */
+aio_get_g_source(td.ctx);
+
+/* Make the event notifier active (set) right away */
+event_notifier_init(_notifier, 1);
+aio_set_event_notifier(td.ctx, _notifier, false,
+   io_read, io_poll_true, io_poll_ready);
+
+/* This event notifier will be used later */
+event_notifier_init(_notifier, 0);
+aio_set_event_notifier(td.ctx, _notifier, false,
+   io_read, io_poll_false, io_poll_never_ready);
+
+/* Consume aio_notify() */
+g_assert(!aio_poll(td.ctx, false));
+
+/*
+ * Run the io_read() handler. This has the side-effect of activating
+ * polling in future aio_poll() calls.
+ */
+g_assert(aio_poll(td.ctx, true));
+
+/* The second time around the io_poll()/io_poll_ready() handler runs */
+g_assert(aio_poll(td.ctx, true));
+
+/* Run io_poll()/io_poll_ready() one more time to show it keeps working */
+g_assert(aio_poll(td.ctx, true));
+
+aio_set_event_notifier(td.ctx, _notifier, false,
+   NULL, NULL, NULL);
+aio_set_event_notifier(td.ctx, _notifier, false, NULL, NULL, NULL);
+event_notifier_cleanup(_notifier);
+event_notifier_cleanup(_notifier);
+aio_context_unref(td.ctx);
+}
+
+int main(int argc, char **argv)
+{
+g_test_init(, , NULL);
+g_test_add_func("/nested-aio-poll", test);
+return g_test_run();
+}
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index 3bc78d8660..a314f82baa 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -67,6 +67,7 @@ if have_block
 'test-coroutine': [testblock],
 'test-aio': [testblock],
 'test-aio-multithread': [testblock],
+'test-nested-aio-poll': [testblock],
 'test-throttle': [testblock],
 'test-thread-pool': [testblock],
 'test-hbitmap': [testblock],
-- 
2.40.1




[PULL 13/18] test-bdrv-drain: Call bdrv_co_unref() in coroutine context

2023-05-17 Thread Kevin Wolf
bdrv_unref() is a no_coroutine_fn, so calling it from coroutine context
is invalid. Use bdrv_co_unref() instead.

Signed-off-by: Kevin Wolf 
Message-Id: <20230510203601.418015-7-kw...@redhat.com>
Reviewed-by: Eric Blake 
Signed-off-by: Kevin Wolf 
---
 tests/unit/test-bdrv-drain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index ae4299ccfa..08bb0f9984 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -1019,7 +1019,7 @@ static void coroutine_fn test_co_delete_by_drain(void 
*opaque)
 g_assert_cmpint(bs->refcnt, ==, 1);
 
 if (!dbdd->detach_instead_of_delete) {
-blk_unref(blk);
+blk_co_unref(blk);
 } else {
 BdrvChild *c, *next_c;
 QLIST_FOREACH_SAFE(c, >children, next, next_c) {
-- 
2.40.1




[PULL 04/18] blockdev: transaction: refactor handling transaction properties

2023-05-17 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

Only backup supports GROUPED mode. Make this logic clearer, and
avoid passing an extra thing to each action.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20230510150624.310640-5-vsement...@yandex-team.ru>
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 blockdev.c | 96 +-
 1 file changed, 22 insertions(+), 74 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index dd0e98bbe1..a2ebaa5afc 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1230,7 +1230,6 @@ struct BlkActionState {
 TransactionAction *action;
 const BlkActionOps *ops;
 JobTxn *block_job_txn;
-TransactionProperties *txn_props;
 QTAILQ_ENTRY(BlkActionState) entry;
 };
 
@@ -1249,19 +1248,6 @@ TransactionActionDrv internal_snapshot_drv = {
 .clean = internal_snapshot_clean,
 };
 
-static int action_check_completion_mode(BlkActionState *s, Error **errp)
-{
-if (s->txn_props->completion_mode != ACTION_COMPLETION_MODE_INDIVIDUAL) {
-error_setg(errp,
-   "Action '%s' does not support Transaction property "
-   "completion-mode = %s",
-   TransactionActionKind_str(s->action->type),
-   ActionCompletionMode_str(s->txn_props->completion_mode));
-return -1;
-}
-return 0;
-}
-
 static void internal_snapshot_action(BlkActionState *common,
  Transaction *tran, Error **errp)
 {
@@ -1284,15 +1270,9 @@ static void internal_snapshot_action(BlkActionState 
*common,
 
 tran_add(tran, _snapshot_drv, state);
 
-/* 1. parse input */
 device = internal->device;
 name = internal->name;
 
-/* 2. check for validation */
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
-
 bs = qmp_get_root_bs(device, errp);
 if (!bs) {
 return;
@@ -1476,9 +1456,6 @@ static void external_snapshot_action(BlkActionState 
*common, Transaction *tran,
 }
 
 /* start processing */
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
 
 state->old_bs = bdrv_lookup_bs(device, node_name, errp);
 if (!state->old_bs) {
@@ -2022,10 +1999,6 @@ static void block_dirty_bitmap_add_action(BlkActionState 
*common,
 
 tran_add(tran, _dirty_bitmap_add_drv, state);
 
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
-
 action = common->action->u.block_dirty_bitmap_add.data;
 /* AIO context taken and released within qmp_block_dirty_bitmap_add */
 qmp_block_dirty_bitmap_add(action->node, action->name,
@@ -2072,10 +2045,6 @@ static void 
block_dirty_bitmap_clear_action(BlkActionState *common,
 
 tran_add(tran, _dirty_bitmap_clear_drv, state);
 
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
-
 action = common->action->u.block_dirty_bitmap_clear.data;
 state->bitmap = block_dirty_bitmap_lookup(action->node,
   action->name,
@@ -2123,10 +2092,6 @@ static void 
block_dirty_bitmap_enable_action(BlkActionState *common,
 
 tran_add(tran, _dirty_bitmap_enable_drv, state);
 
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
-
 action = common->action->u.block_dirty_bitmap_enable.data;
 state->bitmap = block_dirty_bitmap_lookup(action->node,
   action->name,
@@ -2168,10 +2133,6 @@ static void 
block_dirty_bitmap_disable_action(BlkActionState *common,
 
 tran_add(tran, _dirty_bitmap_disable_drv, state);
 
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
-
 action = common->action->u.block_dirty_bitmap_disable.data;
 state->bitmap = block_dirty_bitmap_lookup(action->node,
   action->name,
@@ -2213,10 +2174,6 @@ static void 
block_dirty_bitmap_merge_action(BlkActionState *common,
 
 tran_add(tran, _dirty_bitmap_merge_drv, state);
 
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
-
 action = common->action->u.block_dirty_bitmap_merge.data;
 
 state->bitmap = block_dirty_bitmap_merge(action->node, action->target,
@@ -2241,10 +2198,6 @@ static void 
block_dirty_bitmap_remove_action(BlkActionState *common,
 
 tran_add(tran, _dirty_bitmap_remove_drv, state);
 
-if (action_check_completion_mode(common, errp) < 0) {
-return;
-}
-
 action = common->action->u.block_dirty_bitmap_remove.data;
 
 state->bitmap = block_dirty_bitmap_remove(action->node, action->name,
@@ -2348,25 +2301,6 @@ static const BlkActionOps actions_map[] = {
  */
 };
 
-/**
- * Allocate a TransactionProperties structure if necessary, and fill
- * that structure with desired defaults if they are unset.
- */
-static TransactionProperties *get_transaction_properties(
-TransactionProperties *props)
-{
-  

[PULL 05/18] blockdev: use state.bitmap in block-dirty-bitmap-add action

2023-05-17 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

Other bitmap related actions use the .bitmap pointer in .abort action,
let's do same here:

1. It helps further refactoring, as bitmap-add is the only bitmap
   action that uses state.action in .abort

2. It must be safe: transaction actions rely on the fact that on
   .abort() the state is the same as at the end of .prepare(), so that
   in .abort() we could precisely rollback the changes done by
   .prepare().
   The only way to remove the bitmap during transaction should be
   block-dirty-bitmap-remove action, but it postpones actual removal to
   .commit(), so we are OK on any rollback path. (Note also that
   bitmap-remove is the only bitmap action that has .commit() phase,
   except for simple g_free the state on .clean())

3. Again, other bitmap actions behave this way: keep the bitmap pointer
   during the transaction.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20230510150624.310640-6-vsement...@yandex-team.ru>
[kwolf: Also remove the now unused BlockDirtyBitmapState.prepared]
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 blockdev.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index a2ebaa5afc..4bf15566b2 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1979,7 +1979,6 @@ typedef struct BlockDirtyBitmapState {
 BdrvDirtyBitmap *bitmap;
 BlockDriverState *bs;
 HBitmap *backup;
-bool prepared;
 bool was_enabled;
 } BlockDirtyBitmapState;
 
@@ -2008,7 +2007,8 @@ static void block_dirty_bitmap_add_action(BlkActionState 
*common,
&local_err);
 
 if (!local_err) {
-state->prepared = true;
+state->bitmap = block_dirty_bitmap_lookup(action->node, action->name,
+  NULL, _abort);
 } else {
 error_propagate(errp, local_err);
 }
@@ -2016,15 +2016,10 @@ static void 
block_dirty_bitmap_add_action(BlkActionState *common,
 
 static void block_dirty_bitmap_add_abort(void *opaque)
 {
-BlockDirtyBitmapAdd *action;
 BlockDirtyBitmapState *state = opaque;
 
-action = state->common.action->u.block_dirty_bitmap_add.data;
-/* Should not be able to fail: IF the bitmap was added via .prepare(),
- * then the node reference and bitmap name must have been valid.
- */
-if (state->prepared) {
-qmp_block_dirty_bitmap_remove(action->node, action->name, 
&error_abort);
+if (state->bitmap) {
+bdrv_release_dirty_bitmap(state->bitmap);
 }
 }
 
-- 
2.40.1




[PULL 07/18] docs/interop/qcow2.txt: fix description about "zlib" clusters

2023-05-17 Thread Kevin Wolf
From: Akihiro Suda 

"zlib" clusters are actually raw deflate (RFC1951) clusters without
zlib headers.

Signed-off-by: Akihiro Suda 
Message-Id: <168424874322.11954.134094204635185952...@git.sr.ht>
Reviewed-by: Eric Blake 
Reviewed-by: Kevin Wolf 
Signed-off-by: Kevin Wolf 
---
 docs/interop/qcow2.txt | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt
index f7dc304ff6..e7f036c286 100644
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@@ -214,14 +214,18 @@ version 2.
 type.
 
 If the incompatible bit "Compression type" is set: the 
field
-must be present and non-zero (which means non-zlib
+must be present and non-zero (which means non-deflate
 compression type). Otherwise, this field must not be 
present
-or must be zero (which means zlib).
+or must be zero (which means deflate).
 
 Available compression type values:
-0: zlib 
+0: deflate 
 1: zstd 
 
+The deflate compression type is called "zlib"
+ in QEMU. However, clusters with the
+deflate compression type do not have zlib headers.
+
 
 === Header padding ===
 
-- 
2.40.1




[PULL 00/18] Block layer patches

2023-05-17 Thread Kevin Wolf
The following changes since commit 6972ef1440a9d685482d78672620a7482f2bd09a:

  Merge tag 'pull-tcg-20230516-3' of https://gitlab.com/rth7680/qemu into 
staging (2023-05-16 21:30:27 -0700)

are available in the Git repository at:

  https://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 75b2591bbce5dc9f3da89f140b7bdc00e92fa8ec:

  tests: add test for nested aio_poll() in poll handlers (2023-05-17 18:01:22 
+0200)


Block layer patches

- qcow2 spec: Rename "zlib" compression to "deflate"
- Honour graph read lock even in the main thread + prerequisite fixes
- aio-posix: do not nest poll handlers (fixes infinite recursion)
- Refactor QMP blockdev transactions
- iotests/245: Check if 'compress' driver is available


Akihiro Suda (1):
  docs/interop/qcow2.txt: fix description about "zlib" clusters

Kevin Wolf (9):
  block: Call .bdrv_co_create(_opts) unlocked
  block/export: Fix null pointer dereference in error path
  qcow2: Unlock the graph in qcow2_do_open() where necessary
  qemu-img: Take graph lock more selectively
  test-bdrv-drain: Take graph lock more selectively
  test-bdrv-drain: Call bdrv_co_unref() in coroutine context
  blockjob: Adhere to rate limit even when reentered early
  graph-lock: Honour read locks even in the main thread
  iotests/245: Check if 'compress' driver is available

Stefan Hajnoczi (2):
  aio-posix: do not nest poll handlers
  tests: add test for nested aio_poll() in poll handlers

Vladimir Sementsov-Ogievskiy (6):
  blockdev: refactor transaction to use Transaction API
  blockdev: transactions: rename some things
  blockdev: qmp_transaction: refactor loop to classic for
  blockdev: transaction: refactor handling transaction properties
  blockdev: use state.bitmap in block-dirty-bitmap-add action
  blockdev: qmp_transaction: drop extra generic layer

 docs/interop/qcow2.txt |  10 +-
 include/block/block-global-state.h |   8 +-
 include/block/block_int-common.h   |   4 +-
 include/block/blockjob_int.h   |  14 +-
 block.c|   1 -
 block/commit.c |   7 +-
 block/create.c |   1 -
 block/crypto.c |  25 +-
 block/export/export.c  |   6 +-
 block/graph-lock.c |  10 -
 block/mirror.c |  23 +-
 block/parallels.c  |   6 +-
 block/qcow.c   |   6 +-
 block/qcow2.c  |  43 ++-
 block/qed.c|   6 +-
 block/raw-format.c |   2 +-
 block/stream.c |   7 +-
 block/vdi.c|  11 +-
 block/vhdx.c   |   8 +-
 block/vmdk.c   |  27 +-
 block/vpc.c|   6 +-
 blockdev.c | 606 +++--
 blockjob.c |  22 +-
 qemu-img.c |   5 +-
 tests/unit/test-bdrv-drain.c   |   6 +-
 tests/unit/test-nested-aio-poll.c  | 130 
 util/aio-posix.c   |  11 +
 tests/qemu-iotests/245 |   7 +-
 tests/qemu-iotests/245.out |   9 +-
 tests/unit/meson.build |   1 +
 30 files changed, 541 insertions(+), 487 deletions(-)
 create mode 100644 tests/unit/test-nested-aio-poll.c




Re: [PATCH 0/2] aio-posix: do not nest poll handlers

2023-05-17 Thread Kevin Wolf
Am 02.05.2023 um 20:41 hat Stefan Hajnoczi geschrieben:
> The following stack exhaustion was reported in
> https://bugzilla.redhat.com/show_bug.cgi?id=2186181:
> 
>   ...
>   #51 0x55884fca7451 aio_poll (qemu-kvm + 0x9d6451)
>   #52 0x55884fab9cbd bdrv_poll_co (qemu-kvm + 0x7e8cbd)
>   #53 0x55884fab654b blk_io_plug (qemu-kvm + 0x7e554b)
>   #54 0x55884f927fef virtio_blk_handle_vq (qemu-kvm + 0x656fef)
>   #55 0x55884f96d384 virtio_queue_host_notifier_aio_poll_ready (qemu-kvm 
> + 0x69c384)
>   #56 0x55884fca671b aio_dispatch_handler (qemu-kvm + 0x9d571b)
>   #57 0x55884fca7451 aio_poll (qemu-kvm + 0x9d6451)
>   #58 0x55884fab9cbd bdrv_poll_co (qemu-kvm + 0x7e8cbd)
>   #59 0x55884fab654b blk_io_plug (qemu-kvm + 0x7e554b)
>   #60 0x55884f927fef virtio_blk_handle_vq (qemu-kvm + 0x656fef)
>   #61 0x55884f96d384 virtio_queue_host_notifier_aio_poll_ready (qemu-kvm 
> + 0x69c384)
>   #62 0x55884fca671b aio_dispatch_handler (qemu-kvm + 0x9d571b)
>   #63 0x55884fca7451 aio_poll (qemu-kvm + 0x9d6451)
>   ...
> 
> This happens because some block layer APIs in QEMU 8.0 run in coroutines in
> order to take the graph rdlock. Existing virtqueue handler functions weren't
> written with this in mind.
> 
> A simplified example of the problem is:
> 
>   void my_fd_handler(void *opaque)
>   {
>   do_something();
>   event_notifier_test_and_clear(opaque);
>   do_something_else();
>   }
> 
> When do_something() calls aio_poll(), my_fd_handler() will be entered again
> immediately because the fd is still readable and stack exhaustion will occur.
> 
> When do_something_else() calls aio_poll(), there is no stack exhaustion since
> the event notifier has been cleared and the fd is not readable.
> 
> The actual bug is more involved. The handler in question is a poll handler, 
> not
> an fd handler, but the principle is the same.
> 
> I haven't been able to reproduce the bug, but I have included a test case that
> demonstrates the problem.

Thanks, applied to the block branch.

Kevin




Re: [PATCH qemu] docs/interop/qcow2.txt: fix description about "zlib" clusters

2023-05-17 Thread Kevin Wolf
Am 16.05.2023 um 16:32 hat ~akihirosuda geschrieben:
> From: Akihiro Suda 
> 
> "zlib" clusters are actually raw deflate (RFC1951) clusters without
> zlib headers.
> 
> Signed-off-by: Akihiro Suda 

Thanks, applied to the block branch.

Kevin




[PATCH 3/3] iotests: Test commit with iothreads and ongoing I/O

2023-05-17 Thread Kevin Wolf
This tests exercises graph locking, draining, and graph modifications
with AioContext switches a lot. Amongst others, it serves as a
regression test for bdrv_graph_wrlock() deadlocking because it is called
with a locked AioContext and for AioContext handling in the NBD server.

Signed-off-by: Kevin Wolf 
---
 tests/qemu-iotests/iotests.py |  4 ++
 .../qemu-iotests/tests/graph-changes-while-io | 56 +--
 .../tests/graph-changes-while-io.out  |  4 +-
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 3e82c634cf..7073579a7d 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -462,6 +462,10 @@ def qmp(self, cmd: str, args: Optional[Dict[str, object]] 
= None) \
 assert self._qmp is not None
 return self._qmp.cmd(cmd, args)
 
+def get_qmp(self) -> QEMUMonitorProtocol:
+assert self._qmp is not None
+return self._qmp
+
 def stop(self, kill_signal=15):
 self._p.send_signal(kill_signal)
 self._p.wait()
diff --git a/tests/qemu-iotests/tests/graph-changes-while-io 
b/tests/qemu-iotests/tests/graph-changes-while-io
index 7664f33689..750e7d4d38 100755
--- a/tests/qemu-iotests/tests/graph-changes-while-io
+++ b/tests/qemu-iotests/tests/graph-changes-while-io
@@ -22,19 +22,19 @@
 import os
 from threading import Thread
 import iotests
-from iotests import imgfmt, qemu_img, qemu_img_create, QMPTestCase, \
-QemuStorageDaemon
+from iotests import imgfmt, qemu_img, qemu_img_create, qemu_io, \
+QMPTestCase, QemuStorageDaemon
 
 
 top = os.path.join(iotests.test_dir, 'top.img')
 nbd_sock = os.path.join(iotests.sock_dir, 'nbd.sock')
 
 
-def do_qemu_img_bench() -> None:
+def do_qemu_img_bench(count: int = 200) -> None:
 """
 Do some I/O requests on `nbd_sock`.
 """
-qemu_img('bench', '-f', 'raw', '-c', '200',
+qemu_img('bench', '-f', 'raw', '-c', str(count),
  f'nbd+unix:///node0?socket={nbd_sock}')
 
 
@@ -84,6 +84,54 @@ class TestGraphChangesWhileIO(QMPTestCase):
 
 bench_thr.join()
 
+def test_commit_while_io(self) -> None:
+# Run qemu-img bench in the background
+bench_thr = Thread(target=do_qemu_img_bench, args=(20, ))
+bench_thr.start()
+
+qemu_io('-c', 'write 0 64k', top)
+qemu_io('-c', 'write 128k 64k', top)
+
+result = self.qsd.qmp('blockdev-add', {
+'driver': imgfmt,
+'node-name': 'overlay',
+'backing': None,
+'file': {
+'driver': 'file',
+'filename': top
+}
+})
+self.assert_qmp(result, 'return', {})
+
+result = self.qsd.qmp('blockdev-snapshot', {
+'node': 'node0',
+'overlay': 'overlay',
+})
+self.assert_qmp(result, 'return', {})
+
+# While qemu-img bench is running, repeatedly commit overlay to node0
+while bench_thr.is_alive():
+result = self.qsd.qmp('block-commit', {
+'job-id': 'job0',
+'device': 'overlay',
+})
+self.assert_qmp(result, 'return', {})
+
+result = self.qsd.qmp('block-job-cancel', {
+'device': 'job0',
+})
+self.assert_qmp(result, 'return', {})
+
+cancelled = False
+while not cancelled:
+for event in self.qsd.get_qmp().get_events(wait=10.0):
+if event['event'] != 'JOB_STATUS_CHANGE':
+continue
+if event['data']['status'] == 'null':
+cancelled = True
+
+bench_thr.join()
+
 if __name__ == '__main__':
 # Format must support raw backing files
 iotests.main(supported_fmts=['qcow', 'qcow2', 'qed'],
diff --git a/tests/qemu-iotests/tests/graph-changes-while-io.out 
b/tests/qemu-iotests/tests/graph-changes-while-io.out
index ae1213e6f8..fbc63e62f8 100644
--- a/tests/qemu-iotests/tests/graph-changes-while-io.out
+++ b/tests/qemu-iotests/tests/graph-changes-while-io.out
@@ -1,5 +1,5 @@
-.
+..
 --
-Ran 1 tests
+Ran 2 tests
 
 OK
-- 
2.40.1




[PATCH 2/3] nbd/server: Fix drained_poll to wake coroutine in right AioContext

2023-05-17 Thread Kevin Wolf
nbd_drained_poll() generally runs in the main thread, not whatever
iothread the NBD server coroutine is meant to run in, so it can't
directly reenter the coroutines to wake them up.

The code seems to have the right intention, it specifies the correct
AioContext when it calls qemu_aio_coroutine_enter(). However, this
functions doesn't schedule the coroutine to run in that AioContext, but
it assumes it is already called in the home thread of the AioContext.

To fix this, add a new thread-safe qio_channel_wake_read() that can be
called in the main thread to wake up the coroutine in its AioContext,
and use this in nbd_drained_poll().

Cc: qemu-sta...@nongnu.org
Signed-off-by: Kevin Wolf 
---
 include/io/channel.h | 10 ++
 io/channel.c | 33 +++--
 nbd/server.c |  3 +--
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/include/io/channel.h b/include/io/channel.h
index 446a566e5e..229bf36910 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -757,6 +757,16 @@ void qio_channel_detach_aio_context(QIOChannel *ioc);
 void coroutine_fn qio_channel_yield(QIOChannel *ioc,
 GIOCondition condition);
 
+/**
+ * qio_channel_wake_read:
+ * @ioc: the channel object
+ *
+ * If qio_channel_yield() is currently waiting for the channel to become
+ * readable, interrupt it and reenter immediately. This function is safe to 
call
+ * from any thread.
+ */
+void qio_channel_wake_read(QIOChannel *ioc);
+
 /**
  * qio_channel_wait:
  * @ioc: the channel object
diff --git a/io/channel.c b/io/channel.c
index 375a130a39..72f0066af5 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -19,6 +19,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "block/aio-wait.h"
 #include "io/channel.h"
 #include "qapi/error.h"
 #include "qemu/main-loop.h"
@@ -514,7 +515,11 @@ int qio_channel_flush(QIOChannel *ioc,
 static void qio_channel_restart_read(void *opaque)
 {
 QIOChannel *ioc = opaque;
-Coroutine *co = ioc->read_coroutine;
+Coroutine *co = qatomic_xchg(&ioc->read_coroutine, NULL);
+
+if (!co) {
+return;
+}
 
 /* Assert that aio_co_wake() reenters the coroutine directly */
 assert(qemu_get_current_aio_context() ==
@@ -525,7 +530,11 @@ static void qio_channel_restart_read(void *opaque)
 static void qio_channel_restart_write(void *opaque)
 {
 QIOChannel *ioc = opaque;
-Coroutine *co = ioc->write_coroutine;
+Coroutine *co = qatomic_xchg(&ioc->write_coroutine, NULL);
+
+if (!co) {
+return;
+}
 
 /* Assert that aio_co_wake() reenters the coroutine directly */
 assert(qemu_get_current_aio_context() ==
@@ -568,7 +577,11 @@ void qio_channel_detach_aio_context(QIOChannel *ioc)
 void coroutine_fn qio_channel_yield(QIOChannel *ioc,
 GIOCondition condition)
 {
+AioContext *ioc_ctx = ioc->ctx ?: qemu_get_aio_context();
+
 assert(qemu_in_coroutine());
+assert(in_aio_context_home_thread(ioc_ctx));
+
 if (condition == G_IO_IN) {
 assert(!ioc->read_coroutine);
 ioc->read_coroutine = qemu_coroutine_self();
@@ -580,18 +593,26 @@ void coroutine_fn qio_channel_yield(QIOChannel *ioc,
 }
 qio_channel_set_aio_fd_handlers(ioc);
 qemu_coroutine_yield();
+assert(in_aio_context_home_thread(ioc_ctx));
 
 /* Allow interrupting the operation by reentering the coroutine other than
  * through the aio_fd_handlers. */
-if (condition == G_IO_IN && ioc->read_coroutine) {
-ioc->read_coroutine = NULL;
+if (condition == G_IO_IN) {
+assert(ioc->read_coroutine == NULL);
 qio_channel_set_aio_fd_handlers(ioc);
-} else if (condition == G_IO_OUT && ioc->write_coroutine) {
-ioc->write_coroutine = NULL;
+} else if (condition == G_IO_OUT) {
+assert(ioc->write_coroutine == NULL);
 qio_channel_set_aio_fd_handlers(ioc);
 }
 }
 
+void qio_channel_wake_read(QIOChannel *ioc)
+{
+Coroutine *co = qatomic_xchg(&ioc->read_coroutine, NULL);
+if (co) {
+aio_co_wake(co);
+}
+}
 
 static gboolean qio_channel_wait_complete(QIOChannel *ioc,
   GIOCondition condition,
diff --git a/nbd/server.c b/nbd/server.c
index e239c2890f..2664d43bff 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1599,8 +1599,7 @@ static bool nbd_drained_poll(void *opaque)
  * enter it here so we don't depend on the client to wake it up.
  */
 if (client->recv_coroutine != NULL && client->read_yielding) {
-qemu_aio_coroutine_enter(exp->common.ctx,
- client->recv_coroutine);
+qio_channel_wake_read(client->ioc);
 }
 
 return true;
-- 
2.40.1




[PATCH 1/3] graph-lock: Disable locking for now

2023-05-17 Thread Kevin Wolf
In QEMU 8.0, we've been seeing deadlocks in bdrv_graph_wrlock(). They
come from callers that hold an AioContext lock, which is not allowed
during polling. In theory, we could temporarily release the lock, but
callers are inconsistent about whether they hold a lock, and if they do,
some are also confused about which one they hold. While all of this is
fixable, it's not trivial, and the best course of action for 8.0.1 is
probably just disabling the graph locking code temporarily.

We don't currently rely on graph locking yet. It is supposed to replace
the AioContext lock eventually to enable multiqueue support, but as long
as we still have the AioContext lock, it is sufficient without the graph
lock. Once the AioContext lock goes away, the deadlock doesn't exist any
more either and this commit can be reverted. (Of course, it can also be
reverted while the AioContext lock still exists if the callers have been
fixed.)

Cc: qemu-sta...@nongnu.org
Signed-off-by: Kevin Wolf 
---
 block/graph-lock.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/block/graph-lock.c b/block/graph-lock.c
index 9c42bd9799..9806fd4ecb 100644
--- a/block/graph-lock.c
+++ b/block/graph-lock.c
@@ -30,8 +30,10 @@ BdrvGraphLock graph_lock;
 /* Protects the list of aiocontext and orphaned_reader_count */
 static QemuMutex aio_context_list_lock;
 
+#if 0
 /* Written and read with atomic operations. */
 static int has_writer;
+#endif
 
 /*
  * A reader coroutine could move from an AioContext to another.
@@ -88,6 +90,7 @@ void unregister_aiocontext(AioContext *ctx)
 g_free(ctx->bdrv_graph);
 }
 
+#if 0
 static uint32_t reader_count(void)
 {
 BdrvGraphRWlock *brdv_graph;
@@ -105,10 +108,17 @@ static uint32_t reader_count(void)
 assert((int32_t)rd >= 0);
 return rd;
 }
+#endif
 
 void bdrv_graph_wrlock(void)
 {
 GLOBAL_STATE_CODE();
+/*
+ * TODO Some callers hold an AioContext lock when this is called, which
+ * causes deadlocks. Reenable once the AioContext locking is cleaned up (or
+ * AioContext locks are gone).
+ */
+#if 0
 assert(!qatomic_read(&has_writer));
 
 /* Make sure that constantly arriving new I/O doesn't cause starvation */
@@ -139,11 +149,13 @@ void bdrv_graph_wrlock(void)
 } while (reader_count() >= 1);
 
 bdrv_drain_all_end();
+#endif
 }
 
 void bdrv_graph_wrunlock(void)
 {
 GLOBAL_STATE_CODE();
+#if 0
 QEMU_LOCK_GUARD(&aio_context_list_lock);
 assert(qatomic_read(&has_writer));
 
@@ -155,10 +167,13 @@ void bdrv_graph_wrunlock(void)
 
 /* Wake up all coroutine that are waiting to read the graph */
 qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
+#endif
 }
 
 void coroutine_fn bdrv_graph_co_rdlock(void)
 {
+/* TODO Reenable when wrlock is reenabled */
+#if 0
 BdrvGraphRWlock *bdrv_graph;
 bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
 
@@ -218,10 +233,12 @@ void coroutine_fn bdrv_graph_co_rdlock(void)
 qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
 }
 }
+#endif
 }
 
 void coroutine_fn bdrv_graph_co_rdunlock(void)
 {
+#if 0
 BdrvGraphRWlock *bdrv_graph;
 bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
 
@@ -239,6 +256,7 @@ void coroutine_fn bdrv_graph_co_rdunlock(void)
 if (qatomic_read(&has_writer)) {
 aio_wait_kick();
 }
+#endif
 }
 
 void bdrv_graph_rdlock_main_loop(void)
@@ -264,5 +282,8 @@ void assert_bdrv_graph_readable(void)
 void assert_bdrv_graph_writable(void)
 {
 assert(qemu_in_main_thread());
+/* TODO Reenable when wrlock is reenabled */
+#if 0
 assert(qatomic_read(&has_writer));
+#endif
 }
-- 
2.40.1




[PATCH 0/3] block/graph-lock: Disable locking for now

2023-05-17 Thread Kevin Wolf
tl;dr is that graph locking introduced deadlocks in 8.0, and disabling
it for now fixes them again. See patch 1 for the details.

I still intend the fix this properly before we remove the AioContext
lock (which is when the deadlock would be automatically solved), but
it's not trivial enough for something that would be ready now and
backportable to stable versions. Let's try the real thing again in 8.1
and fix 8.0 with this stopgap solution.

Patch 2 is a prerequisite for the test case. Instead of reproducing the
deadlock problem (which it unfortunately doesn't do reliably anyway, the
timing seems hard to get right), I got NBD server crashes without it. I
actually made some more NBD changes to fix the crashes before this one,
but it seems to be stable with only this. Maybe the rest only fixed
symptoms of the same root cause, I'll have another look at them.

Kevin Wolf (3):
  graph-lock: Disable locking for now
  nbd/server: Fix drained_poll to wake coroutine in right AioContext
  iotests: Test commit with iothreads and ongoing I/O

 include/io/channel.h  | 10 
 block/graph-lock.c| 21 +++
 io/channel.c  | 33 +--
 nbd/server.c  |  3 +-
 tests/qemu-iotests/iotests.py |  4 ++
 .../qemu-iotests/tests/graph-changes-while-io | 56 +--
 .../tests/graph-changes-while-io.out  |  4 +-
 7 files changed, 117 insertions(+), 14 deletions(-)

-- 
2.40.1




Re: [PATCH v3 1/1] block/blkio: use qemu_open() to support fd passing for virtio-blk

2023-05-17 Thread Stefan Hajnoczi
On Wed, May 17, 2023 at 09:19:26AM +0200, Stefano Garzarella wrote:
> CCing Markus for some advice.
> 
> On Tue, May 16, 2023 at 11:04:21AM -0500, Jonathon Jongsma wrote:
> > On 5/15/23 5:10 AM, Stefano Garzarella wrote:
> > > On Thu, May 11, 2023 at 11:03:22AM -0500, Jonathon Jongsma wrote:
> > > > On 5/11/23 4:15 AM, Stefano Garzarella wrote:
> > > > > The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports the new
> > > > > 'fd' property. Let's expose this to the user, so the management layer
> > > > > can pass the file descriptor of an already opened vhost-vdpa character
> > > > > device. This is useful especially when the device can only be accessed
> > > > > with certain privileges.
> > > > > 
> > > > > If the libblkio virtio-blk driver supports fd passing, let's always
> > > > > use qemu_open() to open the `path`, so we can handle fd passing
> > > > > from the management layer through the "/dev/fdset/N" special path.
> > > > > 
> > > > > Signed-off-by: Stefano Garzarella 
> > > > > ---
> > > > > 
> > > > > Notes:
> > > > >     v3:
> > > > >     - use qemu_open() on `path` to simplify libvirt code [Jonathon]
> > > > 
> > > > 
> > > > Thanks
> > > > 
> > > > The one drawback now is that it doesn't seem possible for
> > > > libvirt to introspect whether or not qemu supports passing an fd
> > > > to the driver or not.
> > > 
> > > Yep, this was because the libblkio library did not support this new way.
> > > 
> > > > When I was writing my initial patch (before I realized that it
> > > > was missing fd-passing), I just checked for the existence of the
> > > > virtio-blk-vhost-vdpa device. But we actually need to know both
> > > > that this device exists and supports fd passing.
> > > 
> > > Yep, this was one of the advantages of using the new `fd` parameter.
> > > Can't libvirt handle the later failure?
> > 
> > Not very well. libvirt tries to provide useful errors to the user. So
> > for example if the qemu executable doesn't support a device, we would
> > want to provide an error indicating that the device is not supported
> > rather than a possibly-inscrutable qemu error.
> > 
> > For example, in this scenario, we would want an error such as:
> > 
> > error: unsupported configuration: vhostvdpa disk is not supported with
> > this QEMU binary
> > 
> > Instead of:
> > 
> > error: internal error: qemu unexpectedly closed the monitor:
> > 2023-05-16T15:17:36.666129Z qemu-system-x86_64: -blockdev 
> > {"driver":"virtio-blk-vhost-vdpa","path":"/dev/fdset/0","node-name":"libvirt-1-storage","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}:
> > blkio_connect failed: Failed to connect to vDPA device: Input/output
> > error
> > 
> > And we can only do that if we can determine that the binary has the
> > proper support for fds.
> 
> I see the problem, thanks for explaining this!
> 
> > 
> > > 
> > > > As far as I can tell, versions 7.2.0 and 8.0.0 include this
> > > > device but won't accept fds.
> > > 
> > > Right.
> > > 
> > > How do you suggest to proceed?
> > 
> > I need some way to determine that the particular qemu binary can accept
> > a /dev/fdset/ path for vdpa block devices. libvirt uses a variety of
> > methods to determine capabilities for a given qemu binary, including
> > querying the qmp schema, commands, object types, specific device/object
> > properties, etc. For example, right now I can determine (via querying
> > the qmp schema) whether virtio-blk-vhost-vdpa is a valid type for the
> > blockdev-add command by querying the qmp schema. I need something more
> > than that but I'm not sure how to do it without introducing a separate
> > 'fd' parameter. Any ideas?
> 
> The only thing I can think of is to make a mix between v2 and v3. I mean add
> both the new `fd` parameter, and support qemu_open() on `path`.
> 
> That way libvirt (or other users) can check that fd passing is supported and
> use `fd` or fdset with `path`.
> 
> Obviously I would have liked to implement only one of the two methods, but
> if this helps, maybe it makes sense to support both.
> 
> What do you think?

Markus: Is a preferred way to make this new path handling behavior
introspectable? I vaguely remember a way for QMP clients to query
strings that describe QMP behavior that's not otherwise
introspectable...

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v3 1/1] block/blkio: use qemu_open() to support fd passing for virtio-blk

2023-05-17 Thread Stefano Garzarella

CCing Markus for some advice.

On Tue, May 16, 2023 at 11:04:21AM -0500, Jonathon Jongsma wrote:

On 5/15/23 5:10 AM, Stefano Garzarella wrote:

On Thu, May 11, 2023 at 11:03:22AM -0500, Jonathon Jongsma wrote:

On 5/11/23 4:15 AM, Stefano Garzarella wrote:

The virtio-blk-vhost-vdpa driver in libblkio 1.3.0 supports the new
'fd' property. Let's expose this to the user, so the management layer
can pass the file descriptor of an already opened vhost-vdpa character
device. This is useful especially when the device can only be accessed
with certain privileges.

If the libblkio virtio-blk driver supports fd passing, let's always
use qemu_open() to open the `path`, so we can handle fd passing
from the management layer through the "/dev/fdset/N" special path.

Signed-off-by: Stefano Garzarella 
---

Notes:
    v3:
    - use qemu_open() on `path` to simplify libvirt code [Jonathon]



Thanks

The one drawback now is that it doesn't seem possible for libvirt 
to introspect whether or not qemu supports passing an fd to the 
driver or not.


Yep, this was because the libblkio library did not support this new way.

When I was writing my initial patch (before I realized that it was 
missing fd-passing), I just checked for the existence of the 
virtio-blk-vhost-vdpa device. But we actually need to know both 
that this device exists and supports fd passing.


Yep, this was one of the advantages of using the new `fd` parameter.
Can't libvirt handle the later failure?


Not very well. libvirt tries to provide useful errors to the user. So 
for example if the qemu executable doesn't support a device, we would 
want to provide an error indicating that the device is not supported 
rather than a possibly-inscrutable qemu error.


For example, in this scenario, we would want an error such as:

error: unsupported configuration: vhostvdpa disk is not supported with 
this QEMU binary


Instead of:

error: internal error: qemu unexpectedly closed the monitor: 
2023-05-16T15:17:36.666129Z qemu-system-x86_64: -blockdev {"driver":"virtio-blk-vhost-vdpa","path":"/dev/fdset/0","node-name":"libvirt-1-storage","cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}: 
blkio_connect failed: Failed to connect to vDPA device: Input/output 
error


And we can only do that if we can determine that the binary has the 
proper support for fds.


I see the problem, thanks for explaining this!





As far as I can tell, versions 7.2.0 and 8.0.0 include this device 
but won't accept fds.


Right.

How do you suggest to proceed?


I need some way to determine that the particular qemu binary can 
accept a /dev/fdset/ path for vdpa block devices. libvirt uses a 
variety of methods to determine capabilities for a given qemu binary, 
including querying the qmp schema, commands, object types, specific 
device/object properties, etc. For example, right now I can determine 
(via querying the qmp schema) whether virtio-blk-vhost-vdpa is a valid 
type for the blockdev-add command by querying the qmp schema. I need 
something more than that but I'm not sure how to do it without 
introducing a separate 'fd' parameter. Any ideas?


The only thing I can think of is to make a mix between v2 and v3. I mean 
add both the new `fd` parameter, and support qemu_open() on `path`.


That way libvirt (or other users) can check that fd passing is supported 
and use `fd` or fdset with `path`.


Obviously I would have liked to implement only one of the two methods, 
but if this helps, maybe it makes sense to support both.


What do you think?

Thanks,
Stefano