[PATCH v2 05/14] block: Rename refresh_total_sectors to bdrv_refresh_total_sectors

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

The name is not good, not the least because we are going to convert this
to a generated co_wrapper, which adds a _co infix after the first part
of the name.

No functional change intended.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block_int-io.h | 2 +-
 block.c  | 8 
 block/io.c   | 8 +---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
index 8bc061ebb8..6b285fb520 100644
--- a/include/block/block_int-io.h
+++ b/include/block/block_int-io.h
@@ -120,7 +120,7 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, 
int64_t src_offset,
BdrvRequestFlags read_flags,
BdrvRequestFlags write_flags);
 
-int refresh_total_sectors(BlockDriverState *bs, int64_t hint);
+int bdrv_refresh_total_sectors(BlockDriverState *bs, int64_t hint);
 
 BdrvChild *bdrv_cow_child(BlockDriverState *bs);
 BdrvChild *bdrv_filter_child(BlockDriverState *bs);
diff --git a/block.c b/block.c
index e89844557b..7c279fc0e6 100644
--- a/block.c
+++ b/block.c
@@ -1034,7 +1034,7 @@ static int find_image_format(BlockBackend *file, const 
char *filename,
  * Set the current 'total_sectors' value
  * Return 0 on success, -errno on error.
  */
-int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
+int bdrv_refresh_total_sectors(BlockDriverState *bs, int64_t hint)
 {
 BlockDriver *drv = bs->drv;
 IO_CODE();
@@ -1651,7 +1651,7 @@ static int bdrv_open_driver(BlockDriverState *bs, 
BlockDriver *drv,
 bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
 bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;
 
-ret = refresh_total_sectors(bs, bs->total_sectors);
+ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "Could not refresh total sector count");
 return ret;
@@ -5808,7 +5808,7 @@ int64_t bdrv_nb_sectors(BlockDriverState *bs)
 return -ENOMEDIUM;
 
 if (drv->has_variable_length) {
-int ret = refresh_total_sectors(bs, bs->total_sectors);
+int ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
 if (ret < 0) {
 return ret;
 }
@@ -6590,7 +6590,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp)
 bdrv_dirty_bitmap_skip_store(bm, false);
 }
 
-ret = refresh_total_sectors(bs, bs->total_sectors);
+ret = bdrv_refresh_total_sectors(bs, bs->total_sectors);
 if (ret < 0) {
 bs->open_flags |= BDRV_O_INACTIVE;
 error_setg_errno(errp, -ret, "Could not refresh total sector 
count");
diff --git a/block/io.c b/block/io.c
index 03becd32d2..e5e51563a5 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3473,15 +3473,17 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, 
int64_t offset, bool exact,
 goto out;
 }
 
-ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
+ret = bdrv_refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "Could not refresh total sector count");
 } else {
 offset = bs->total_sectors * BDRV_SECTOR_SIZE;
 }
-/* It's possible that truncation succeeded but refresh_total_sectors
+/*
+ * It's possible that truncation succeeded but bdrv_refresh_total_sectors
  * failed, but the latter doesn't affect how we should finish the request.
- * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
+ * Pass 0 as the last parameter so that dirty bitmaps etc. are handled.
+ */
 bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, , 0);
 
 out:
-- 
2.38.1




[PATCH v2 14/14] block: Rename bdrv_load/save_vmstate() to bdrv_co_load/save_vmstate()

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

Since these functions always run in coroutine context, adjust
their name to include "_co_", just like all other BlockDriver callbacks.

No functional change intended.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block_int-common.h |  4 ++--
 block/io.c   |  8 
 block/qcow2.c| 12 ++--
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 93d4350f24..fd8ccaefee 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -704,10 +704,10 @@ struct BlockDriver {
  Error **errp);
 BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs);
 
-int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_save_vmstate)(
+int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_save_vmstate)(
 BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
-int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_load_vmstate)(
+int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_load_vmstate)(
 BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
 /* removable device specific */
diff --git a/block/io.c b/block/io.c
index cd1cea2515..445c757c2a 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2719,8 +2719,8 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos)
 
 bdrv_inc_in_flight(bs);
 
-if (drv->bdrv_load_vmstate) {
-ret = drv->bdrv_load_vmstate(bs, qiov, pos);
+if (drv->bdrv_co_load_vmstate) {
+ret = drv->bdrv_co_load_vmstate(bs, qiov, pos);
 } else if (child_bs) {
 ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
 } else {
@@ -2752,8 +2752,8 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos)
 
 bdrv_inc_in_flight(bs);
 
-if (drv->bdrv_save_vmstate) {
-ret = drv->bdrv_save_vmstate(bs, qiov, pos);
+if (drv->bdrv_co_save_vmstate) {
+ret = drv->bdrv_co_save_vmstate(bs, qiov, pos);
 } else if (child_bs) {
 ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
 } else {
diff --git a/block/qcow2.c b/block/qcow2.c
index 460579b70a..ce644dfc59 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -5286,8 +5286,8 @@ static int64_t 
qcow2_check_vmstate_request(BlockDriverState *bs,
 return pos;
 }
 
-static coroutine_fn int qcow2_save_vmstate(BlockDriverState *bs,
-   QEMUIOVector *qiov, int64_t pos)
+static coroutine_fn int qcow2_co_save_vmstate(BlockDriverState *bs,
+  QEMUIOVector *qiov, int64_t pos)
 {
 int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
 if (offset < 0) {
@@ -5298,8 +5298,8 @@ static coroutine_fn int 
qcow2_save_vmstate(BlockDriverState *bs,
 return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0);
 }
 
-static coroutine_fn int qcow2_load_vmstate(BlockDriverState *bs,
-   QEMUIOVector *qiov, int64_t pos)
+static coroutine_fn int qcow2_co_load_vmstate(BlockDriverState *bs,
+  QEMUIOVector *qiov, int64_t pos)
 {
 int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
 if (offset < 0) {
@@ -6080,8 +6080,8 @@ BlockDriver bdrv_qcow2 = {
 .bdrv_co_get_info   = qcow2_co_get_info,
 .bdrv_get_specific_info = qcow2_get_specific_info,
 
-.bdrv_save_vmstate= qcow2_save_vmstate,
-.bdrv_load_vmstate= qcow2_load_vmstate,
+.bdrv_co_save_vmstate   = qcow2_co_save_vmstate,
+.bdrv_co_load_vmstate   = qcow2_co_load_vmstate,
 
 .is_format  = true,
 .supports_backing   = true,
-- 
2.38.1




[PATCH v2 02/14] block: Convert bdrv_io_plug() to co_wrapper

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

BlockDriver->bdrv_io_plug is categorized as IO callback, and it
currently doesn't run in a coroutine. We should let it take a graph
rdlock since the callback traverses the block nodes graph, which however
is only possible in a coroutine.

The only caller of this function is blk_io_plug(), therefore make
blk_io_plug() a co_wrapper, so that we're always running in a coroutine
where the lock can be taken.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h  |  3 ++-
 include/block/block_int-common.h  |  2 +-
 include/sysemu/block-backend-io.h |  4 +++-
 block/block-backend.c |  4 ++--
 block/file-posix.c| 10 +-
 block/io.c|  8 
 block/nvme.c  |  4 ++--
 7 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index 2ed6214909..d96168375e 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -217,7 +217,8 @@ void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine 
*co);
 
 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
 
-void bdrv_io_plug(BlockDriverState *bs);
+void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs);
+
 void bdrv_io_unplug(BlockDriverState *bs);
 
 bool coroutine_fn bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index c34c525fa6..a76bb76290 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -729,7 +729,7 @@ struct BlockDriver {
 void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
 
 /* io queue for linux-aio */
-void (*bdrv_io_plug)(BlockDriverState *bs);
+void coroutine_fn (*bdrv_co_io_plug)(BlockDriverState *bs);
 void (*bdrv_io_unplug)(BlockDriverState *bs);
 
 /**
diff --git a/include/sysemu/block-backend-io.h 
b/include/sysemu/block-backend-io.h
index 7ec6d978d4..70b73f7d11 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -73,7 +73,9 @@ void blk_iostatus_set_err(BlockBackend *blk, int error);
 int blk_get_max_iov(BlockBackend *blk);
 int blk_get_max_hw_iov(BlockBackend *blk);
 
-void blk_io_plug(BlockBackend *blk);
+void coroutine_fn blk_co_io_plug(BlockBackend *blk);
+void co_wrapper blk_io_plug(BlockBackend *blk);
+
 void blk_io_unplug(BlockBackend *blk);
 AioContext *blk_get_aio_context(BlockBackend *blk);
 BlockAcctStats *blk_get_stats(BlockBackend *blk);
diff --git a/block/block-backend.c b/block/block-backend.c
index ba7bf1d6bc..2bca5729e1 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2315,13 +2315,13 @@ void blk_add_insert_bs_notifier(BlockBackend *blk, 
Notifier *notify)
 notifier_list_add(>insert_bs_notifiers, notify);
 }
 
-void blk_io_plug(BlockBackend *blk)
+void coroutine_fn blk_co_io_plug(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
 IO_CODE();
 
 if (bs) {
-bdrv_io_plug(bs);
+bdrv_co_io_plug(bs);
 }
 }
 
diff --git a/block/file-posix.c b/block/file-posix.c
index b9647c5ffc..c8551c8110 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2136,7 +2136,7 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState 
*bs, int64_t offset,
 return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
 }
 
-static void raw_aio_plug(BlockDriverState *bs)
+static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
 {
 BDRVRawState __attribute__((unused)) *s = bs->opaque;
 #ifdef CONFIG_LINUX_AIO
@@ -3321,7 +3321,7 @@ BlockDriver bdrv_file = {
 .bdrv_co_copy_range_from = raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
 .bdrv_refresh_limits = raw_refresh_limits,
-.bdrv_io_plug = raw_aio_plug,
+.bdrv_co_io_plug= raw_co_io_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
@@ -3693,7 +3693,7 @@ static BlockDriver bdrv_host_device = {
 .bdrv_co_copy_range_from = raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
 .bdrv_refresh_limits = raw_refresh_limits,
-.bdrv_io_plug = raw_aio_plug,
+.bdrv_co_io_plug= raw_co_io_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
@@ -3817,7 +3817,7 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
 .bdrv_refresh_limits = raw_refresh_limits,
-.bdrv_io_plug = raw_aio_plug,
+.bdrv_co_io_plug= raw_co_io_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
 
@@ -3947,7 +3947,7 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
 

[PATCH v2 01/14] block-coroutine-wrapper: support void functions

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

Just omit the various 'return' when the return type is void.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 scripts/block-coroutine-wrapper.py | 20 +++-
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/scripts/block-coroutine-wrapper.py 
b/scripts/block-coroutine-wrapper.py
index 6e087fa0b7..0c5d7782b1 100644
--- a/scripts/block-coroutine-wrapper.py
+++ b/scripts/block-coroutine-wrapper.py
@@ -85,6 +85,16 @@ def __init__(self, return_type: str, name: str, args: str,
 ctx = 'qemu_get_aio_context()'
 self.ctx = ctx
 
+self.get_result = 's->ret = '
+self.ret = 'return s.ret;'
+self.co_ret = 'return '
+self.return_field = self.return_type + " ret;"
+if self.return_type == 'void':
+self.get_result = ''
+self.ret = ''
+self.co_ret = ''
+self.return_field = ''
+
 def gen_list(self, format: str) -> str:
 return ', '.join(format.format_map(arg.__dict__) for arg in self.args)
 
@@ -131,7 +141,7 @@ def create_mixed_wrapper(func: FuncDecl) -> str:
 {{
 if (qemu_in_coroutine()) {{
 {graph_assume_lock}
-return {name}({ func.gen_list('{name}') });
+{func.co_ret}{name}({ func.gen_list('{name}') });
 }} else {{
 {struct_name} s = {{
 .poll_state.ctx = {func.ctx},
@@ -143,7 +153,7 @@ def create_mixed_wrapper(func: FuncDecl) -> str:
 s.poll_state.co = qemu_coroutine_create({name}_entry, );
 
 bdrv_poll_co(_state);
-return s.ret;
+{func.ret}
 }}
 }}"""
 
@@ -168,7 +178,7 @@ def create_co_wrapper(func: FuncDecl) -> str:
 s.poll_state.co = qemu_coroutine_create({name}_entry, );
 
 bdrv_poll_co(_state);
-return s.ret;
+{func.ret}
 }}"""
 
 
@@ -195,7 +205,7 @@ def gen_wrapper(func: FuncDecl) -> str:
 
 typedef struct {struct_name} {{
 BdrvPollCo poll_state;
-{func.return_type} ret;
+{func.return_field}
 { func.gen_block('{decl};') }
 }} {struct_name};
 
@@ -204,7 +214,7 @@ def gen_wrapper(func: FuncDecl) -> str:
 {struct_name} *s = opaque;
 
 {graph_lock}
-s->ret = {name}({ func.gen_list('s->{name}') });
+{func.get_result}{name}({ func.gen_list('s->{name}') });
 {graph_unlock}
 s->poll_state.in_progress = false;
 
-- 
2.38.1




[PATCH v2 07/14] block-backend: use bdrv_getlength instead of blk_getlength

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

The only difference is that blk_ checks if the block is available,
but this check is already performed above in blk_check_byte_request().

This is in preparation for the graph rdlock, which will be taken
by both the callers of blk_check_byte_request() and blk_getlength().

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 block/block-backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 6e5e2693b3..37b51f409f 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1253,7 +1253,7 @@ static int blk_check_byte_request(BlockBackend *blk, 
int64_t offset,
 }
 
 if (!blk->allow_write_beyond_eof) {
-len = blk_getlength(blk);
+len = bdrv_getlength(blk_bs(blk));
 if (len < 0) {
 return len;
 }
-- 
2.38.1




[PATCH v2 12/14] block: Convert bdrv_lock_medium() to co_wrapper

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

bdrv_lock_medium() is categorized as an I/O function, and it currently
doesn't run in a coroutine. We should let it take a graph rdlock since
it traverses the block nodes graph, which however is only possible in a
coroutine.

The only caller of this function is blk_lock_medium(). Therefore make
blk_lock_medium() a co_wrapper, so that it always creates a new
coroutine, and then make bdrv_lock_medium() a coroutine_fn where the
lock can be taken.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h  | 2 +-
 include/block/block_int-common.h  | 2 +-
 include/sysemu/block-backend-io.h | 4 +++-
 block.c   | 6 +++---
 block/block-backend.c | 4 ++--
 block/copy-on-read.c  | 6 +++---
 block/file-posix.c| 8 
 block/filter-compress.c   | 7 ---
 block/raw-format.c| 6 +++---
 9 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index f3d49ea05f..7e76bb647a 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -143,7 +143,7 @@ int bdrv_get_flags(BlockDriverState *bs);
 bool coroutine_fn bdrv_co_is_inserted(BlockDriverState *bs);
 bool co_wrapper bdrv_is_inserted(BlockDriverState *bs);
 
-void bdrv_lock_medium(BlockDriverState *bs, bool locked);
+void coroutine_fn bdrv_co_lock_medium(BlockDriverState *bs, bool locked);
 void coroutine_fn bdrv_co_eject(BlockDriverState *bs, bool eject_flag);
 
 const char *bdrv_get_format_name(BlockDriverState *bs);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 1631a26427..1174f13a2f 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -713,7 +713,7 @@ struct BlockDriver {
 /* removable device specific */
 bool coroutine_fn (*bdrv_co_is_inserted)(BlockDriverState *bs);
 void coroutine_fn (*bdrv_co_eject)(BlockDriverState *bs, bool eject_flag);
-void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
+void coroutine_fn (*bdrv_co_lock_medium)(BlockDriverState *bs, bool 
locked);
 
 /* to control generic scsi devices */
 BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
diff --git a/include/sysemu/block-backend-io.h 
b/include/sysemu/block-backend-io.h
index 00209625e1..780c1e5f77 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -58,7 +58,9 @@ bool coroutine_fn blk_co_is_inserted(BlockBackend *blk);
 bool co_wrapper_mixed blk_is_inserted(BlockBackend *blk);
 
 bool blk_is_available(BlockBackend *blk);
-void blk_lock_medium(BlockBackend *blk, bool locked);
+
+void coroutine_fn blk_co_lock_medium(BlockBackend *blk, bool locked);
+void co_wrapper blk_lock_medium(BlockBackend *blk, bool locked);
 
 void coroutine_fn blk_co_eject(BlockBackend *blk, bool eject_flag);
 void co_wrapper blk_eject(BlockBackend *blk, bool eject_flag);
diff --git a/block.c b/block.c
index d8415c9519..b7ccb44184 100644
--- a/block.c
+++ b/block.c
@@ -6834,14 +6834,14 @@ void coroutine_fn bdrv_co_eject(BlockDriverState *bs, 
bool eject_flag)
  * Lock or unlock the media (if it is locked, the user won't be able
  * to eject it manually).
  */
-void bdrv_lock_medium(BlockDriverState *bs, bool locked)
+void coroutine_fn bdrv_co_lock_medium(BlockDriverState *bs, bool locked)
 {
 BlockDriver *drv = bs->drv;
 IO_CODE();
 trace_bdrv_lock_medium(bs, locked);
 
-if (drv && drv->bdrv_lock_medium) {
-drv->bdrv_lock_medium(bs, locked);
+if (drv && drv->bdrv_co_lock_medium) {
+drv->bdrv_co_lock_medium(bs, locked);
 }
 }
 
diff --git a/block/block-backend.c b/block/block-backend.c
index 7320081814..11e46ecb51 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1999,13 +1999,13 @@ bool blk_is_available(BlockBackend *blk)
 return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
 }
 
-void blk_lock_medium(BlockBackend *blk, bool locked)
+void coroutine_fn blk_co_lock_medium(BlockBackend *blk, bool locked)
 {
 BlockDriverState *bs = blk_bs(blk);
 IO_CODE();
 
 if (bs) {
-bdrv_lock_medium(bs, locked);
+bdrv_co_lock_medium(bs, locked);
 }
 }
 
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 43f09514dc..5032b78efc 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -222,9 +222,9 @@ static void coroutine_fn cor_co_eject(BlockDriverState *bs, 
bool eject_flag)
 }
 
 
-static void cor_lock_medium(BlockDriverState *bs, bool locked)
+static void coroutine_fn cor_co_lock_medium(BlockDriverState *bs, bool locked)
 {
-bdrv_lock_medium(bs->file->bs, locked);
+bdrv_co_lock_medium(bs->file->bs, locked);
 }
 
 
@@ -258,7 +258,7 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
 
 .bdrv_co_eject  = cor_co_eject,
-

[PATCH v2 04/14] block: Convert bdrv_is_inserted() to co_wrapper

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

bdrv_is_inserted() is categorized as an I/O function, and it currently
doesn't run in a coroutine. We should let it take a graph rdlock since
it traverses the block nodes graph, which however is only possible in a
coroutine.

Therefore turn it into a co_wrapper to move the actual function into a
coroutine where the lock can be taken.

At the same time, add also blk_is_inserted as co_wrapper_mixed, since it
is called in both coroutine and non-coroutine contexts.

Because now this function creates a new coroutine and polls, we need to
take the AioContext lock where it is missing, for the only reason that
internally c_w_mixed_bdrv_rdlock calls AIO_WAIT_WHILE and it expects to
release the AioContext lock. Once the rwlock is ultimated and placed in
every place it needs to be, we will poll using AIO_WAIT_WHILE_UNLOCKED
and remove the AioContext lock.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h  |  5 -
 include/block/block_int-common.h  |  2 +-
 include/sysemu/block-backend-io.h |  5 -
 block.c   |  8 
 block/block-backend.c |  4 ++--
 block/file-posix.c|  8 
 block/io.c| 12 ++--
 blockdev.c|  8 +++-
 8 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index 3bf201f7f4..fe031360c4 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -133,7 +133,10 @@ bool bdrv_is_read_only(BlockDriverState *bs);
 bool bdrv_is_writable(BlockDriverState *bs);
 bool bdrv_is_sg(BlockDriverState *bs);
 int bdrv_get_flags(BlockDriverState *bs);
-bool bdrv_is_inserted(BlockDriverState *bs);
+
+bool coroutine_fn bdrv_co_is_inserted(BlockDriverState *bs);
+bool co_wrapper bdrv_is_inserted(BlockDriverState *bs);
+
 void bdrv_lock_medium(BlockDriverState *bs, bool locked);
 void bdrv_eject(BlockDriverState *bs, bool eject_flag);
 const char *bdrv_get_format_name(BlockDriverState *bs);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 253df92509..e6229c64e6 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -708,7 +708,7 @@ struct BlockDriver {
 BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
 /* removable device specific */
-bool (*bdrv_is_inserted)(BlockDriverState *bs);
+bool coroutine_fn (*bdrv_co_is_inserted)(BlockDriverState *bs);
 void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
 void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
 
diff --git a/include/sysemu/block-backend-io.h 
b/include/sysemu/block-backend-io.h
index d8cc8d74f5..4358fc6476 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -53,7 +53,10 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long 
int req, void *buf,
 
 void blk_inc_in_flight(BlockBackend *blk);
 void blk_dec_in_flight(BlockBackend *blk);
-bool blk_is_inserted(BlockBackend *blk);
+
+bool coroutine_fn blk_co_is_inserted(BlockBackend *blk);
+bool co_wrapper_mixed blk_is_inserted(BlockBackend *blk);
+
 bool blk_is_available(BlockBackend *blk);
 void blk_lock_medium(BlockBackend *blk, bool locked);
 void blk_eject(BlockBackend *blk, bool eject_flag);
diff --git a/block.c b/block.c
index 9c2ac757e4..e89844557b 100644
--- a/block.c
+++ b/block.c
@@ -6781,7 +6781,7 @@ out:
 /**
  * Return TRUE if the media is present
  */
-bool bdrv_is_inserted(BlockDriverState *bs)
+bool coroutine_fn bdrv_co_is_inserted(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
 BdrvChild *child;
@@ -6790,11 +6790,11 @@ bool bdrv_is_inserted(BlockDriverState *bs)
 if (!drv) {
 return false;
 }
-if (drv->bdrv_is_inserted) {
-return drv->bdrv_is_inserted(bs);
+if (drv->bdrv_co_is_inserted) {
+return drv->bdrv_co_is_inserted(bs);
 }
 QLIST_FOREACH(child, >children, next) {
-if (!bdrv_is_inserted(child->bs)) {
+if (!bdrv_co_is_inserted(child->bs)) {
 return false;
 }
 }
diff --git a/block/block-backend.c b/block/block-backend.c
index 71c7ef4004..6e3b3b9fe8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1983,12 +1983,12 @@ void blk_activate(BlockBackend *blk, Error **errp)
 bdrv_activate(bs, errp);
 }
 
-bool blk_is_inserted(BlockBackend *blk)
+bool coroutine_fn blk_co_is_inserted(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
 IO_CODE();
 
-return bs && bdrv_is_inserted(bs);
+return bs && bdrv_co_is_inserted(bs);
 }
 
 bool blk_is_available(BlockBackend *blk)
diff --git a/block/file-posix.c b/block/file-posix.c
index dd1b8ec52a..b29cd02b30 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -3761,7 +3761,7 @@ out:
 return prio;
 }
 
-static bool cdrom_is_inserted(BlockDriverState *bs)

[PATCH v2 13/14] block: Convert bdrv_debug_event() to co_wrapper_mixed

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

bdrv_debug_event() is categorized as an I/O function, and it currently
doesn't run in a coroutine. We should let it take a graph rdlock since
it traverses the block nodes graph, which however is only possible in a
coroutine.

Therefore turn it into a co_wrapper_mixed to move the actual function
into a coroutine where the lock can be taken.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h |  5 -
 include/block/block_int-common.h |  3 ++-
 block.c  |  6 +++---
 block/blkdebug.c |  5 +++--
 block/io.c   | 22 +++---
 5 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index 7e76bb647a..9737fc63cb 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -188,7 +188,10 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t 
size);
 void bdrv_enable_copy_on_read(BlockDriverState *bs);
 void bdrv_disable_copy_on_read(BlockDriverState *bs);
 
-void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);
+void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs,
+  BlkdebugEvent event);
+void co_wrapper_mixed bdrv_debug_event(BlockDriverState *bs,
+   BlkdebugEvent event);
 
 #define BLKDBG_EVENT(child, evt) \
 do { \
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 1174f13a2f..93d4350f24 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -729,7 +729,8 @@ struct BlockDriver {
 int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_check)(
 BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix);
 
-void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
+void coroutine_fn (*bdrv_co_debug_event)(BlockDriverState *bs,
+ BlkdebugEvent event);
 
 /* io queue for linux-aio */
 void coroutine_fn (*bdrv_co_io_plug)(BlockDriverState *bs);
diff --git a/block.c b/block.c
index b7ccb44184..eebb4560c4 100644
--- a/block.c
+++ b/block.c
@@ -6350,14 +6350,14 @@ BlockStatsSpecific 
*bdrv_get_specific_stats(BlockDriverState *bs)
 return drv->bdrv_get_specific_stats(bs);
 }
 
-void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event)
+void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent 
event)
 {
 IO_CODE();
-if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) {
+if (!bs || !bs->drv || !bs->drv->bdrv_co_debug_event) {
 return;
 }
 
-bs->drv->bdrv_debug_event(bs, event);
+bs->drv->bdrv_co_debug_event(bs, event);
 }
 
 static BlockDriverState *bdrv_find_debug_node(BlockDriverState *bs)
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 2294b0227b..69aa7aa43d 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -835,7 +835,8 @@ static void process_rule(BlockDriverState *bs, struct 
BlkdebugRule *rule,
 }
 }
 
-static void blkdebug_debug_event(BlockDriverState *bs, BlkdebugEvent event)
+static void coroutine_fn
+blkdebug_co_debug_event(BlockDriverState *bs, BlkdebugEvent event)
 {
 BDRVBlkdebugState *s = bs->opaque;
 struct BlkdebugRule *rule, *next;
@@ -1085,7 +1086,7 @@ static BlockDriver bdrv_blkdebug = {
 .bdrv_co_pdiscard   = blkdebug_co_pdiscard,
 .bdrv_co_block_status   = blkdebug_co_block_status,
 
-.bdrv_debug_event   = blkdebug_debug_event,
+.bdrv_co_debug_event= blkdebug_co_debug_event,
 .bdrv_debug_breakpoint  = blkdebug_debug_breakpoint,
 .bdrv_debug_remove_breakpoint
 = blkdebug_debug_remove_breakpoint,
diff --git a/block/io.c b/block/io.c
index c43637f5c1..cd1cea2515 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1250,7 +1250,7 @@ static int coroutine_fn 
bdrv_co_do_copy_on_readv(BdrvChild *child,
 goto err;
 }
 
-bdrv_debug_event(bs, BLKDBG_COR_WRITE);
+bdrv_co_debug_event(bs, BLKDBG_COR_WRITE);
 if (drv->bdrv_co_pwrite_zeroes &&
 buffer_is_zero(bounce_buffer, pnum)) {
 /* FIXME: Should we (perhaps conditionally) be setting
@@ -1495,10 +1495,10 @@ static coroutine_fn int bdrv_padding_rmw_read(BdrvChild 
*child,
 qemu_iovec_init_buf(_qiov, pad->buf, bytes);
 
 if (pad->head) {
-bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
+bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
 }
 if (pad->merge_reads && pad->tail) {
-bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
+bdrv_co_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
 }
 ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
   align, _qiov, 0, 0);
@@ -1506,10 +1506,10 @@ static coroutine_fn int 

[PATCH v2 09/14] block: Convert bdrv_get_allocated_file_size() to co_wrapper

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

bdrv_get_allocated_file_size() is categorized as an I/O function, and it
currently doesn't run in a coroutine. We should let it take a graph
rdlock since it traverses the block nodes graph, which however is only
possible in a coroutine.

Therefore turn it into a co_wrapper to move the actual function into a
coroutine where the lock can be taken.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h |  4 +++-
 include/block/block_int-common.h |  4 +++-
 block.c  | 12 ++--
 block/file-posix.c   | 14 +-
 block/file-win32.c   | 10 --
 block/gluster.c  | 11 ++-
 block/nfs.c  |  4 ++--
 block/null.c |  7 ---
 block/qcow2-refcount.c   |  2 +-
 block/vmdk.c |  9 +
 10 files changed, 39 insertions(+), 38 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index 4dcb5f73fa..0718554590 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -79,7 +79,9 @@ int64_t co_wrapper_mixed bdrv_nb_sectors(BlockDriverState 
*bs);
 int64_t coroutine_fn bdrv_co_getlength(BlockDriverState *bs);
 int64_t co_wrapper_mixed bdrv_getlength(BlockDriverState *bs);
 
-int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
+int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs);
+int64_t co_wrapper bdrv_get_allocated_file_size(BlockDriverState *bs);
+
 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
BlockDriverState *in_bs, Error **errp);
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 6336c7239a..cc03c599e7 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -685,7 +685,9 @@ struct BlockDriver {
  bool exact, PreallocMode prealloc,
  BdrvRequestFlags flags, Error **errp);
 int64_t coroutine_fn (*bdrv_co_getlength)(BlockDriverState *bs);
-int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
+int64_t coroutine_fn (*bdrv_co_get_allocated_file_size)(
+BlockDriverState *bs);
+
 BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs,
   Error **errp);
 
diff --git a/block.c b/block.c
index dd22a206f9..82c6cb6dcd 100644
--- a/block.c
+++ b/block.c
@@ -5719,7 +5719,7 @@ exit:
 }
 
 /**
- * Implementation of BlockDriver.bdrv_get_allocated_file_size() that
+ * Implementation of BlockDriver.bdrv_co_get_allocated_file_size() that
  * sums the size of all data-bearing children.  (This excludes backing
  * children.)
  */
@@ -5732,7 +5732,7 @@ static int64_t 
bdrv_sum_allocated_file_size(BlockDriverState *bs)
 if (child->role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
BDRV_CHILD_FILTERED))
 {
-child_size = bdrv_get_allocated_file_size(child->bs);
+child_size = bdrv_co_get_allocated_file_size(child->bs);
 if (child_size < 0) {
 return child_size;
 }
@@ -5747,7 +5747,7 @@ static int64_t 
bdrv_sum_allocated_file_size(BlockDriverState *bs)
  * Length of a allocated file in bytes. Sparse files are counted by actual
  * allocated space. Return < 0 if error or unknown.
  */
-int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
+int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
 IO_CODE();
@@ -5755,8 +5755,8 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 if (!drv) {
 return -ENOMEDIUM;
 }
-if (drv->bdrv_get_allocated_file_size) {
-return drv->bdrv_get_allocated_file_size(bs);
+if (drv->bdrv_co_get_allocated_file_size) {
+return drv->bdrv_co_get_allocated_file_size(bs);
 }
 
 if (drv->bdrv_file_open) {
@@ -5768,7 +5768,7 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 return -ENOTSUP;
 } else if (drv->is_filter) {
 /* Filter drivers default to the size of their filtered child */
-return bdrv_get_allocated_file_size(bdrv_filter_bs(bs));
+return bdrv_co_get_allocated_file_size(bdrv_filter_bs(bs));
 } else {
 /* Other drivers default to summing their children's sizes */
 return bdrv_sum_allocated_file_size(bs);
diff --git a/block/file-posix.c b/block/file-posix.c
index d70e4ec95a..a22f93e4b2 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2468,7 +2468,7 @@ static int64_t coroutine_fn 
raw_co_getlength(BlockDriverState *bs)
 }
 #endif
 
-static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
+static int64_t coroutine_fn 

[PATCH v2 00/14] block: Move more functions to coroutines

2023-01-13 Thread Kevin Wolf
This series converts some IO_CODE() functions to coroutine_fn because
they access the graph and will need to hold the graph lock in the
future. IO_CODE() functions can be called from iothreads, so taking the
graph lock requires the function to run in coroutine context.

Pretty much all of the changes in this series were posted by Emanuele
before as part of "Protect the block layer with a rwlock: part 3". The
major difference is that in the old version, the patches did two things
at once: Converting functions to coroutine_fn, and adding the locking to
them. This series does only the coroutine conversion. The locking part
will be in another series which now comes with TSA annotations and makes
the locking related changes big enough to have separate patches.

v2:
- In each patch converting a BlockDriver callback to be called in
  coroutine, also immediately rename it and its implementation to
  include co_ in its name, as well as mark the implementations
  coroutine_fn [Vladimir]
- Moved bdrv_is_inserted() earlier in the series because
  raw_is_inserted() calls raw_getlength(), so it needs to be converted
  first to avoid calling a coroutine_fn from a non-coroutine_fn in an
  intermediate state.
- The final patch only renames bdrv_load/save_vmstate() any more, which
  was already converted to coroutine_fn earlier.
- Since pretty much every patch was touched in this, I refrained from
  picking up any Reviewed-by for v1

Emanuele Giuseppe Esposito (14):
  block-coroutine-wrapper: support void functions
  block: Convert bdrv_io_plug() to co_wrapper
  block: Convert bdrv_io_unplug() to co_wrapper
  block: Convert bdrv_is_inserted() to co_wrapper
  block: Rename refresh_total_sectors to bdrv_refresh_total_sectors
  block: Convert bdrv_refresh_total_sectors() to co_wrapper_mixed
  block-backend: use bdrv_getlength instead of blk_getlength
  block: use bdrv_co_refresh_total_sectors when possible
  block: Convert bdrv_get_allocated_file_size() to co_wrapper
  block: Convert bdrv_get_info() to co_wrapper_mixed
  block: Convert bdrv_eject() to co_wrapper
  block: Convert bdrv_lock_medium() to co_wrapper
  block: Convert bdrv_debug_event() to co_wrapper_mixed
  block: Rename bdrv_load/save_vmstate() to bdrv_co_load/save_vmstate()

 include/block/block-io.h   |  36 +++---
 include/block/block_int-common.h   |  26 ---
 include/block/block_int-io.h   |   5 +-
 include/sysemu/block-backend-io.h  |  31 +++--
 block.c|  82 +-
 block/blkdebug.c   |  11 +--
 block/blkio.c  |  15 ++--
 block/blklogwrites.c   |   6 +-
 block/blkreplay.c  |   6 +-
 block/blkverify.c  |   6 +-
 block/block-backend.c  |  36 +-
 block/commit.c |   4 +-
 block/copy-on-read.c   |  18 ++---
 block/crypto.c |  14 ++--
 block/curl.c   |  10 +--
 block/file-posix.c | 107 ++---
 block/file-win32.c |  18 +++--
 block/filter-compress.c|  20 +++---
 block/gluster.c|  23 ---
 block/io.c |  76 ++--
 block/iscsi.c  |  17 ++---
 block/mirror.c |   6 +-
 block/nbd.c|   8 +--
 block/nfs.c|   4 +-
 block/null.c   |  13 ++--
 block/nvme.c   |  14 ++--
 block/preallocate.c|  16 ++---
 block/qcow.c   |   5 +-
 block/qcow2-refcount.c |   2 +-
 block/qcow2.c  |  17 ++---
 block/qed.c|  11 +--
 block/quorum.c |   8 +--
 block/raw-format.c |  25 +++
 block/rbd.c|   9 +--
 block/replication.c|   6 +-
 block/ssh.c|   4 +-
 block/throttle.c   |   6 +-
 block/vdi.c|   7 +-
 block/vhdx.c   |   5 +-
 block/vmdk.c   |  14 ++--
 block/vpc.c|   5 +-
 blockdev.c |   8 ++-
 hw/scsi/scsi-disk.c|   5 ++
 tests/unit/test-block-iothread.c   |   3 +
 scripts/block-coroutine-wrapper.py |  20 --
 block/meson.build  |   1 +
 46 files changed, 443 insertions(+), 346 deletions(-)

-- 
2.38.1




[PATCH v2 10/14] block: Convert bdrv_get_info() to co_wrapper_mixed

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

bdrv_get_info() is categorized as an I/O function, and it currently
doesn't run in a coroutine. We should let it take a graph rdlock since
it traverses the block nodes graph, which however is only possible in a
coroutine.

Therefore turn it into a co_wrapper to move the actual function into a
coroutine where the lock can be taken.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h | 5 -
 include/block/block_int-common.h | 3 ++-
 block.c  | 8 
 block/blkio.c| 5 +++--
 block/crypto.c   | 8 
 block/file-posix.c   | 7 ---
 block/io.c   | 8 
 block/iscsi.c| 7 ---
 block/mirror.c   | 2 +-
 block/qcow.c | 5 +++--
 block/qcow2.c| 5 +++--
 block/qed.c  | 5 +++--
 block/raw-format.c   | 7 ---
 block/rbd.c  | 5 +++--
 block/vdi.c  | 7 ---
 block/vhdx.c | 5 +++--
 block/vmdk.c | 5 +++--
 block/vpc.c  | 5 +++--
 18 files changed, 59 insertions(+), 43 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index 0718554590..e27dc9787b 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -151,7 +151,10 @@ bool bdrv_supports_compressed_writes(BlockDriverState *bs);
 const char *bdrv_get_node_name(const BlockDriverState *bs);
 const char *bdrv_get_device_name(const BlockDriverState *bs);
 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs);
-int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+
+int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+int co_wrapper_mixed bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+
 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
   Error **errp);
 BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index cc03c599e7..a6ac8afd5b 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -697,7 +697,8 @@ struct BlockDriver {
 int64_t offset, int64_t bytes, QEMUIOVector *qiov,
 size_t qiov_offset);
 
-int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
+int coroutine_fn (*bdrv_co_get_info)(BlockDriverState *bs,
+ BlockDriverInfo *bdi);
 
 ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs,
  Error **errp);
diff --git a/block.c b/block.c
index 82c6cb6dcd..2707069ab6 100644
--- a/block.c
+++ b/block.c
@@ -6300,7 +6300,7 @@ void bdrv_get_backing_filename(BlockDriverState *bs,
 pstrcpy(filename, filename_size, bs->backing_file);
 }
 
-int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
 int ret;
 BlockDriver *drv = bs->drv;
@@ -6309,15 +6309,15 @@ int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo 
*bdi)
 if (!drv) {
 return -ENOMEDIUM;
 }
-if (!drv->bdrv_get_info) {
+if (!drv->bdrv_co_get_info) {
 BlockDriverState *filtered = bdrv_filter_bs(bs);
 if (filtered) {
-return bdrv_get_info(filtered, bdi);
+return bdrv_co_get_info(filtered, bdi);
 }
 return -ENOTSUP;
 }
 memset(bdi, 0, sizeof(*bdi));
-ret = drv->bdrv_get_info(bs, bdi);
+ret = drv->bdrv_co_get_info(bs, bdi);
 if (ret < 0) {
 return ret;
 }
diff --git a/block/blkio.c b/block/blkio.c
index 2a3ab5a570..fc83e0fe13 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -878,7 +878,8 @@ static int coroutine_fn blkio_truncate(BlockDriverState 
*bs, int64_t offset,
 return 0;
 }
 
-static int blkio_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+static int coroutine_fn
+blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
 return 0;
 }
@@ -998,7 +999,7 @@ static void blkio_refresh_limits(BlockDriverState *bs, 
Error **errp)
 .bdrv_close  = blkio_close, \
 .bdrv_co_getlength   = blkio_co_getlength, \
 .bdrv_co_truncate= blkio_truncate, \
-.bdrv_get_info   = blkio_get_info, \
+.bdrv_co_get_info= blkio_co_get_info, \
 .bdrv_attach_aio_context = blkio_attach_aio_context, \
 .bdrv_detach_aio_context = blkio_detach_aio_context, \
 .bdrv_co_pdiscard= blkio_co_pdiscard, \
diff --git a/block/crypto.c b/block/crypto.c
index 6d6c006887..b70cec97c7 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -737,13 +737,13 @@ fail:
 return ret;
 }
 

[PATCH v2 03/14] block: Convert bdrv_io_unplug() to co_wrapper

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

BlockDriver->bdrv_io_unplug is categorized as IO callback, and it
currently doesn't run in a coroutine. We should let it take a graph
rdlock since the callback traverses the block nodes graph, which however
is only possible in a coroutine.

The only caller of this function is blk_io_unplug(), therefore make
blk_io_unplug() a co_wrapper, so that we're always running in a
coroutine where the lock can be taken.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h  |  3 +--
 include/block/block_int-common.h  |  2 +-
 include/sysemu/block-backend-io.h |  4 +++-
 block/blkio.c |  4 ++--
 block/block-backend.c |  4 ++--
 block/file-posix.c| 10 +-
 block/io.c|  8 
 block/nvme.c  |  4 ++--
 8 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index d96168375e..3bf201f7f4 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -218,8 +218,7 @@ void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine 
*co);
 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
 
 void coroutine_fn bdrv_co_io_plug(BlockDriverState *bs);
-
-void bdrv_io_unplug(BlockDriverState *bs);
+void coroutine_fn bdrv_co_io_unplug(BlockDriverState *bs);
 
 bool coroutine_fn bdrv_co_can_store_new_dirty_bitmap(BlockDriverState *bs,
  const char *name,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index a76bb76290..253df92509 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -730,7 +730,7 @@ struct BlockDriver {
 
 /* io queue for linux-aio */
 void coroutine_fn (*bdrv_co_io_plug)(BlockDriverState *bs);
-void (*bdrv_io_unplug)(BlockDriverState *bs);
+void coroutine_fn (*bdrv_co_io_unplug)(BlockDriverState *bs);
 
 /**
  * bdrv_drain_begin is called if implemented in the beginning of a
diff --git a/include/sysemu/block-backend-io.h 
b/include/sysemu/block-backend-io.h
index 70b73f7d11..d8cc8d74f5 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -76,7 +76,9 @@ int blk_get_max_hw_iov(BlockBackend *blk);
 void coroutine_fn blk_co_io_plug(BlockBackend *blk);
 void co_wrapper blk_io_plug(BlockBackend *blk);
 
-void blk_io_unplug(BlockBackend *blk);
+void coroutine_fn blk_co_io_unplug(BlockBackend *blk);
+void co_wrapper blk_io_unplug(BlockBackend *blk);
+
 AioContext *blk_get_aio_context(BlockBackend *blk);
 BlockAcctStats *blk_get_stats(BlockBackend *blk);
 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
diff --git a/block/blkio.c b/block/blkio.c
index 5eae3adfaf..1ff51ff4f3 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -477,7 +477,7 @@ static int coroutine_fn 
blkio_co_pwrite_zeroes(BlockDriverState *bs,
 return cod.ret;
 }
 
-static void blkio_io_unplug(BlockDriverState *bs)
+static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
 {
 BDRVBlkioState *s = bs->opaque;
 
@@ -1006,7 +1006,7 @@ static void blkio_refresh_limits(BlockDriverState *bs, 
Error **errp)
 .bdrv_co_pwritev = blkio_co_pwritev, \
 .bdrv_co_flush_to_disk   = blkio_co_flush, \
 .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
-.bdrv_io_unplug  = blkio_io_unplug, \
+.bdrv_co_io_unplug   = blkio_co_io_unplug, \
 .bdrv_refresh_limits = blkio_refresh_limits, \
 .bdrv_register_buf   = blkio_register_buf, \
 .bdrv_unregister_buf = blkio_unregister_buf, \
diff --git a/block/block-backend.c b/block/block-backend.c
index 2bca5729e1..71c7ef4004 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2325,13 +2325,13 @@ void coroutine_fn blk_co_io_plug(BlockBackend *blk)
 }
 }
 
-void blk_io_unplug(BlockBackend *blk)
+void coroutine_fn blk_co_io_unplug(BlockBackend *blk)
 {
 BlockDriverState *bs = blk_bs(blk);
 IO_CODE();
 
 if (bs) {
-bdrv_io_unplug(bs);
+bdrv_co_io_unplug(bs);
 }
 }
 
diff --git a/block/file-posix.c b/block/file-posix.c
index c8551c8110..dd1b8ec52a 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2153,7 +2153,7 @@ static void coroutine_fn raw_co_io_plug(BlockDriverState 
*bs)
 #endif
 }
 
-static void raw_aio_unplug(BlockDriverState *bs)
+static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
 {
 BDRVRawState __attribute__((unused)) *s = bs->opaque;
 #ifdef CONFIG_LINUX_AIO
@@ -3322,7 +3322,7 @@ BlockDriver bdrv_file = {
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
 .bdrv_refresh_limits = raw_refresh_limits,
 .bdrv_co_io_plug= raw_co_io_plug,
-.bdrv_io_unplug = raw_aio_unplug,
+.bdrv_co_io_unplug  = raw_co_io_unplug,
 .bdrv_attach_aio_context = 

[PATCH v2 08/14] block: use bdrv_co_refresh_total_sectors when possible

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

In some places we are sure we are always running in a
coroutine, therefore it's useless to call the generated_co_wrapper,
instead call directly the _co_ function.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 block/block-backend.c | 6 +++---
 block/io.c| 4 ++--
 block/preallocate.c   | 6 +++---
 block/qed.c   | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 37b51f409f..fc08400544 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1235,8 +1235,8 @@ void blk_set_disable_request_queuing(BlockBackend *blk, 
bool disable)
 blk->disable_request_queuing = disable;
 }
 
-static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
-  int64_t bytes)
+static coroutine_fn int blk_check_byte_request(BlockBackend *blk,
+   int64_t offset, int64_t bytes)
 {
 int64_t len;
 
@@ -1253,7 +1253,7 @@ static int blk_check_byte_request(BlockBackend *blk, 
int64_t offset,
 }
 
 if (!blk->allow_write_beyond_eof) {
-len = bdrv_getlength(blk_bs(blk));
+len = bdrv_co_getlength(blk_bs(blk));
 if (len < 0) {
 return len;
 }
diff --git a/block/io.c b/block/io.c
index e5e51563a5..fdc5ba9fb6 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3443,7 +3443,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, 
int64_t offset, bool exact,
 if (new_bytes && backing) {
 int64_t backing_len;
 
-backing_len = bdrv_getlength(backing->bs);
+backing_len = bdrv_co_getlength(backing->bs);
 if (backing_len < 0) {
 ret = backing_len;
 error_setg_errno(errp, -ret, "Could not get backing file size");
@@ -3473,7 +3473,7 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, 
int64_t offset, bool exact,
 goto out;
 }
 
-ret = bdrv_refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
+ret = bdrv_co_refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "Could not refresh total sector count");
 } else {
diff --git a/block/preallocate.c b/block/preallocate.c
index 94aa824e09..5815d7a78b 100644
--- a/block/preallocate.c
+++ b/block/preallocate.c
@@ -286,7 +286,7 @@ static bool coroutine_fn handle_write(BlockDriverState *bs, 
int64_t offset,
 }
 
 if (s->data_end < 0) {
-s->data_end = bdrv_getlength(bs->file->bs);
+s->data_end = bdrv_co_getlength(bs->file->bs);
 if (s->data_end < 0) {
 return false;
 }
@@ -308,7 +308,7 @@ static bool coroutine_fn handle_write(BlockDriverState *bs, 
int64_t offset,
 }
 
 if (s->file_end < 0) {
-s->file_end = bdrv_getlength(bs->file->bs);
+s->file_end = bdrv_co_getlength(bs->file->bs);
 if (s->file_end < 0) {
 return false;
 }
@@ -380,7 +380,7 @@ preallocate_co_truncate(BlockDriverState *bs, int64_t 
offset,
 
 if (s->data_end >= 0 && offset > s->data_end) {
 if (s->file_end < 0) {
-s->file_end = bdrv_getlength(bs->file->bs);
+s->file_end = bdrv_co_getlength(bs->file->bs);
 if (s->file_end < 0) {
 error_setg(errp, "failed to get file length");
 return s->file_end;
diff --git a/block/qed.c b/block/qed.c
index c8f9045b72..16bf0cb080 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -424,7 +424,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
*bs, QDict *options,
 }
 
 /* Round down file size to the last cluster */
-file_size = bdrv_getlength(bs->file->bs);
+file_size = bdrv_co_getlength(bs->file->bs);
 if (file_size < 0) {
 error_setg(errp, "Failed to get file length");
 return file_size;
-- 
2.38.1




[PATCH v2 06/14] block: Convert bdrv_refresh_total_sectors() to co_wrapper_mixed

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

BlockDriver->bdrv_getlength is categorized as IO callback, and it
currently doesn't run in a coroutine. We should let it take a graph
rdlock since the callback traverses the block nodes graph, which however
is only possible in a coroutine.

Therefore turn it into a co_wrapper to move the actual function into a
coroutine where the lock can be taken.

Because now this function creates a new coroutine and polls, we need to
take the AioContext lock where it is missing, for the only reason that
internally co_wrapper calls AIO_WAIT_WHILE and it expects to release the
AioContext lock.

This is especially messy when a co_wrapper creates a coroutine and polls
in bdrv_open_driver, because this function has so many callers in so
many context that it can easily lead to deadlocks. Therefore the new
rule for bdrv_open_driver is that the caller must always hold the
AioContext lock of the given bs (except if it is a coroutine), because
the function calls bdrv_refresh_total_sectors() which is now a
co_wrapper.

Once the rwlock is ultimated and placed in every place it needs to be,
we will poll using AIO_WAIT_WHILE_UNLOCKED and remove the AioContext
lock.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h  |  8 --
 include/block/block_int-common.h  |  2 +-
 include/block/block_int-io.h  |  5 +++-
 include/sysemu/block-backend-io.h | 10 ++--
 block.c   | 32 +--
 block/blkdebug.c  |  6 ++---
 block/blkio.c |  6 ++---
 block/blklogwrites.c  |  6 ++---
 block/blkreplay.c |  6 ++---
 block/blkverify.c |  6 ++---
 block/block-backend.c | 10 +---
 block/commit.c|  4 +--
 block/copy-on-read.c  |  6 ++---
 block/crypto.c|  6 ++---
 block/curl.c  | 10 
 block/file-posix.c| 42 +++
 block/file-win32.c|  8 +++---
 block/filter-compress.c   |  6 ++---
 block/gluster.c   | 12 -
 block/iscsi.c | 10 
 block/mirror.c|  4 +--
 block/nbd.c   |  8 +++---
 block/null.c  |  6 ++---
 block/nvme.c  |  6 ++---
 block/preallocate.c   | 10 
 block/qed.c   |  4 +--
 block/quorum.c|  8 +++---
 block/raw-format.c|  6 ++---
 block/rbd.c   |  4 +--
 block/replication.c   |  6 ++---
 block/ssh.c   |  4 +--
 block/throttle.c  |  6 ++---
 hw/scsi/scsi-disk.c   |  5 
 tests/unit/test-block-iothread.c  |  3 +++
 block/meson.build |  1 +
 35 files changed, 161 insertions(+), 121 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index fe031360c4..4dcb5f73fa 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -73,8 +73,12 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t 
offset, bool exact,
   PreallocMode prealloc, BdrvRequestFlags 
flags,
   Error **errp);
 
-int64_t bdrv_nb_sectors(BlockDriverState *bs);
-int64_t bdrv_getlength(BlockDriverState *bs);
+int64_t coroutine_fn bdrv_co_nb_sectors(BlockDriverState *bs);
+int64_t co_wrapper_mixed bdrv_nb_sectors(BlockDriverState *bs);
+
+int64_t coroutine_fn bdrv_co_getlength(BlockDriverState *bs);
+int64_t co_wrapper_mixed bdrv_getlength(BlockDriverState *bs);
+
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
BlockDriverState *in_bs, Error **errp);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index e6229c64e6..6336c7239a 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -684,7 +684,7 @@ struct BlockDriver {
 int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset,
  bool exact, PreallocMode prealloc,
  BdrvRequestFlags flags, Error **errp);
-int64_t (*bdrv_getlength)(BlockDriverState *bs);
+int64_t coroutine_fn (*bdrv_co_getlength)(BlockDriverState *bs);
 int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
 BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs,
   Error **errp);
diff --git a/include/block/block_int-io.h b/include/block/block_int-io.h
index 6b285fb520..d1559a501f 100644
--- a/include/block/block_int-io.h
+++ b/include/block/block_int-io.h
@@ -120,7 +120,10 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, 
int64_t 

[PATCH v2 11/14] block: Convert bdrv_eject() to co_wrapper

2023-01-13 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

bdrv_eject() is categorized as an I/O function, and it currently
doesn't run in a coroutine. We should let it take a graph rdlock since
it traverses the block nodes graph, which however is only possible in a
coroutine.

The only caller of this function is blk_eject(). Therefore make
blk_eject() a co_wrapper, so that it always creates a new coroutine, and
then make bdrv_eject() coroutine_fn where the lock can be taken.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
---
 include/block/block-io.h  | 3 ++-
 include/block/block_int-common.h  | 2 +-
 include/sysemu/block-backend-io.h | 4 +++-
 block.c   | 6 +++---
 block/block-backend.c | 4 ++--
 block/copy-on-read.c  | 6 +++---
 block/file-posix.c| 8 
 block/filter-compress.c   | 7 ---
 block/raw-format.c| 6 +++---
 9 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index e27dc9787b..f3d49ea05f 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -144,7 +144,8 @@ bool coroutine_fn bdrv_co_is_inserted(BlockDriverState *bs);
 bool co_wrapper bdrv_is_inserted(BlockDriverState *bs);
 
 void bdrv_lock_medium(BlockDriverState *bs, bool locked);
-void bdrv_eject(BlockDriverState *bs, bool eject_flag);
+void coroutine_fn bdrv_co_eject(BlockDriverState *bs, bool eject_flag);
+
 const char *bdrv_get_format_name(BlockDriverState *bs);
 
 bool bdrv_supports_compressed_writes(BlockDriverState *bs);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index a6ac8afd5b..1631a26427 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -712,7 +712,7 @@ struct BlockDriver {
 
 /* removable device specific */
 bool coroutine_fn (*bdrv_co_is_inserted)(BlockDriverState *bs);
-void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
+void coroutine_fn (*bdrv_co_eject)(BlockDriverState *bs, bool eject_flag);
 void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
 
 /* to control generic scsi devices */
diff --git a/include/sysemu/block-backend-io.h 
b/include/sysemu/block-backend-io.h
index a1eac6c00a..00209625e1 100644
--- a/include/sysemu/block-backend-io.h
+++ b/include/sysemu/block-backend-io.h
@@ -59,7 +59,9 @@ bool co_wrapper_mixed blk_is_inserted(BlockBackend *blk);
 
 bool blk_is_available(BlockBackend *blk);
 void blk_lock_medium(BlockBackend *blk, bool locked);
-void blk_eject(BlockBackend *blk, bool eject_flag);
+
+void coroutine_fn blk_co_eject(BlockBackend *blk, bool eject_flag);
+void co_wrapper blk_eject(BlockBackend *blk, bool eject_flag);
 
 int64_t coroutine_fn blk_co_getlength(BlockBackend *blk);
 int64_t co_wrapper_mixed blk_getlength(BlockBackend *blk);
diff --git a/block.c b/block.c
index 2707069ab6..d8415c9519 100644
--- a/block.c
+++ b/block.c
@@ -6820,13 +6820,13 @@ bool coroutine_fn bdrv_co_is_inserted(BlockDriverState 
*bs)
 /**
  * If eject_flag is TRUE, eject the media. Otherwise, close the tray
  */
-void bdrv_eject(BlockDriverState *bs, bool eject_flag)
+void coroutine_fn bdrv_co_eject(BlockDriverState *bs, bool eject_flag)
 {
 BlockDriver *drv = bs->drv;
 IO_CODE();
 
-if (drv && drv->bdrv_eject) {
-drv->bdrv_eject(bs, eject_flag);
+if (drv && drv->bdrv_co_eject) {
+drv->bdrv_co_eject(bs, eject_flag);
 }
 }
 
diff --git a/block/block-backend.c b/block/block-backend.c
index fc08400544..7320081814 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2009,14 +2009,14 @@ void blk_lock_medium(BlockBackend *blk, bool locked)
 }
 }
 
-void blk_eject(BlockBackend *blk, bool eject_flag)
+void coroutine_fn blk_co_eject(BlockBackend *blk, bool eject_flag)
 {
 BlockDriverState *bs = blk_bs(blk);
 char *id;
 IO_CODE();
 
 if (bs) {
-bdrv_eject(bs, eject_flag);
+bdrv_co_eject(bs, eject_flag);
 }
 
 /* Whether or not we ejected on the backend,
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 41777b87a4..43f09514dc 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -216,9 +216,9 @@ static int coroutine_fn 
cor_co_pwritev_compressed(BlockDriverState *bs,
 }
 
 
-static void cor_eject(BlockDriverState *bs, bool eject_flag)
+static void coroutine_fn cor_co_eject(BlockDriverState *bs, bool eject_flag)
 {
-bdrv_eject(bs->file->bs, eject_flag);
+bdrv_co_eject(bs->file->bs, eject_flag);
 }
 
 
@@ -257,7 +257,7 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_co_pdiscard   = cor_co_pdiscard,
 .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
 
-.bdrv_eject = cor_eject,
+.bdrv_co_eject  = cor_co_eject,
 .bdrv_lock_medium   = cor_lock_medium,
 
 .has_variable_length= true,
diff 

Re: [PATCH 1/4] qcow2: Fix theoretical corruption in store_bitmap() error path

2023-01-13 Thread Philippe Mathieu-Daudé

On 13/1/23 11:45, Kevin Wolf wrote:

Am 13.01.2023 um 08:30 hat Philippe Mathieu-Daudé geschrieben:

On 12/1/23 20:14, Kevin Wolf wrote:

In order to write the bitmap table to the image file, it is converted to
big endian. If the write fails, it is passed to clear_bitmap_table() to
free all of the clusters it had allocated before. However, if we don't
convert it back to native endianness first, we'll free things at a wrong
offset.

In practical terms, the offsets will be so high that we won't actually
free any allocated clusters, but just run into an error, but in theory
this can cause image corruption.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Kevin Wolf 
---
   block/qcow2-bitmap.c | 5 +++--
   1 file changed, 3 insertions(+), 2 deletions(-)



Maybe add a comment here remembering to bswap back to native endianness?


-static inline void bitmap_table_to_be(uint64_t *bitmap_table, size_t size)
+static inline void bitmap_table_bswap_be(uint64_t *bitmap_table, size_t size)
   {


This function uses cpu_to_be64(), semantically we convert back calling
be64_to_cpu(), but technically both functions end up being the same.


Yes, but we don't seem to have any public "neutral" functions, it's
always either from or to.


Alternatively:

  for (i = 0; i < size; ++i) {
-bitmap_table[i] = cpu_to_be64(bitmap_table[i]);
+bswap64s(_table[i]);
  }


Doesn't that swap even on big endian hosts, resulting incorrectly in a
little endian table?


Oops yes you are right... sorry!


The closest thing we have that I can see is the be_bswap() macro in
bswap.h, but it's undefined again at the end of the header.


Indeed.

Regards,

Phil.



Re: [PATCH v6 13/13] docs/devel: Align VFIO migration docs to v2 protocol

2023-01-13 Thread Cédric Le Goater

On 1/12/23 09:50, Avihai Horon wrote:

Now that VFIO migration protocol v2 has been implemented and v1 protocol
has been removed, update the documentation according to v2 protocol.

Signed-off-by: Avihai Horon 



Reviewed-by: Cédric Le Goater 

Thanks,

C.




---
  docs/devel/vfio-migration.rst | 68 ---
  1 file changed, 30 insertions(+), 38 deletions(-)

diff --git a/docs/devel/vfio-migration.rst b/docs/devel/vfio-migration.rst
index 9ff6163c88..1d50c2fe5f 100644
--- a/docs/devel/vfio-migration.rst
+++ b/docs/devel/vfio-migration.rst
@@ -7,46 +7,39 @@ the guest is running on source host and restoring this saved 
state on the
  destination host. This document details how saving and restoring of VFIO
  devices is done in QEMU.
  
-Migration of VFIO devices consists of two phases: the optional pre-copy phase,

-and the stop-and-copy phase. The pre-copy phase is iterative and allows to
-accommodate VFIO devices that have a large amount of data that needs to be
-transferred. The iterative pre-copy phase of migration allows for the guest to
-continue whilst the VFIO device state is transferred to the destination, this
-helps to reduce the total downtime of the VM. VFIO devices can choose to skip
-the pre-copy phase of migration by returning pending_bytes as zero during the
-pre-copy phase.
+Migration of VFIO devices currently consists of a single stop-and-copy phase.
+During the stop-and-copy phase the guest is stopped and the entire VFIO device
+data is transferred to the destination.
+
+The pre-copy phase of migration is currently not supported for VFIO devices.
+Support for VFIO pre-copy will be added later on.
  
  A detailed description of the UAPI for VFIO device migration can be found in

-the comment for the ``vfio_device_migration_info`` structure in the header
-file linux-headers/linux/vfio.h.
+the comment for the ``vfio_device_mig_state`` structure in the header file
+linux-headers/linux/vfio.h.
  
  VFIO implements the device hooks for the iterative approach as follows:
  
-* A ``save_setup`` function that sets up the migration region and sets _SAVING

-  flag in the VFIO device state.
+* A ``save_setup`` function that sets up migration on the source.
  
-* A ``load_setup`` function that sets up the migration region on the

-  destination and sets _RESUMING flag in the VFIO device state.
+* A ``load_setup`` function that sets the VFIO device on the destination in
+  _RESUMING state.
  
  * A ``save_live_pending`` function that reads pending_bytes from the vendor

driver, which indicates the amount of data that the vendor driver has yet to
save for the VFIO device.
  
-* A ``save_live_iterate`` function that reads the VFIO device's data from the

-  vendor driver through the migration region during iterative phase.
-
  * A ``save_state`` function to save the device config space if it is present.
  
-* A ``save_live_complete_precopy`` function that resets _RUNNING flag from the

-  VFIO device state and iteratively copies the remaining data for the VFIO
-  device until the vendor driver indicates that no data remains (pending bytes
-  is zero).
+* A ``save_live_complete_precopy`` function that sets the VFIO device in
+  _STOP_COPY state and iteratively copies the data for the VFIO device until
+  the vendor driver indicates that no data remains.
  
  * A ``load_state`` function that loads the config section and the data

-  sections that are generated by the save functions above
+  sections that are generated by the save functions above.
  
  * ``cleanup`` functions for both save and load that perform any migration

-  related cleanup, including unmapping the migration region
+  related cleanup.
  
  
  The VFIO migration code uses a VM state change handler to change the VFIO

@@ -71,13 +64,13 @@ tracking can identify dirtied pages, but any page pinned by 
the vendor driver
  can also be written by the device. There is currently no device or IOMMU
  support for dirty page tracking in hardware.
  
-By default, dirty pages are tracked when the device is in pre-copy as well as

-stop-and-copy phase. So, a page pinned by the vendor driver will be copied to
-the destination in both phases. Copying dirty pages in pre-copy phase helps
-QEMU to predict if it can achieve its downtime tolerances. If QEMU during
-pre-copy phase keeps finding dirty pages continuously, then it understands
-that even in stop-and-copy phase, it is likely to find dirty pages and can
-predict the downtime accordingly.
+By default, dirty pages are tracked during pre-copy as well as stop-and-copy
+phase. So, a page pinned by the vendor driver will be copied to the destination
+in both phases. Copying dirty pages in pre-copy phase helps QEMU to predict if
+it can achieve its downtime tolerances. If QEMU during pre-copy phase keeps
+finding dirty pages continuously, then it understands that even in 
stop-and-copy
+phase, it is likely to find dirty pages and can predict the downtime
+accordingly.

Re: [PATCH v6 04/33] hw/pci/pci: Factor out pci_bus_map_irqs() from pci_bus_irqs()

2023-01-13 Thread Bernhard Beschow



Am 13. Januar 2023 10:13:29 UTC schrieb "Philippe Mathieu-Daudé" 
:
>On 9/1/23 18:23, Bernhard Beschow wrote:
>> pci_bus_irqs() coupled together the assignment of pci_set_irq_fn and
>> pci_map_irq_fn to a PCI bus. This coupling gets in the way when the
>> pci_map_irq_fn is board-specific while the pci_set_irq_fn is device-
>> specific.
>> 
>> For example, both of QEMU's PIIX south bridge models have different
>> pci_map_irq_fn implementations which are board-specific rather than
>> device-specific. These implementations should therefore reside in board
>> code. The pci_set_irq_fn's, however, should stay in the device models
>> because they access memory internal to the model.
>> 
>> Factoring out pci_bus_map_irqs() from pci_bus_irqs() allows the
>> assignments to be decoupled, resolving the problem described above.
>> 
>> Note also how pci_vpb_realize() which gets touched in this commit
>> assigns different pci_map_irq_fn's depending on the board.
>> 
>> Signed-off-by: Bernhard Beschow 
>> Reviewed-by: Michael S. Tsirkin 
>> Reviewed-by: Philippe Mathieu-Daudé 
>> ---
>>   include/hw/pci/pci.h|  3 ++-
>>   hw/i386/pc_q35.c|  4 ++--
>>   hw/isa/piix3.c  |  8 
>>   hw/isa/piix4.c  |  3 ++-
>>   hw/pci-host/raven.c |  3 ++-
>>   hw/pci-host/versatile.c |  3 ++-
>>   hw/pci/pci.c| 12 +---
>>   hw/remote/machine.c |  3 ++-
>>   8 files changed, 25 insertions(+), 14 deletions(-)
>> 
>> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
>> index 7048a373d1..85ee458cd2 100644
>> --- a/include/hw/pci/pci.h
>> +++ b/include/hw/pci/pci.h
>> @@ -282,8 +282,9 @@ PCIBus *pci_root_bus_new(DeviceState *parent, const char 
>> *name,
>>MemoryRegion *address_space_io,
>>uint8_t devfn_min, const char *typename);
>>   void pci_root_bus_cleanup(PCIBus *bus);
>> -void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq, pci_map_irq_fn 
>> map_irq,
>> +void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq,
>> void *irq_opaque, int nirq);
>> +void pci_bus_map_irqs(PCIBus *bus, pci_map_irq_fn map_irq);
>
>I'm squashing:
>
>-- >8 --
>diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
>index fe1fdfb5f7..46171f22f7 100644
>--- a/hw/remote/vfio-user-obj.c
>+++ b/hw/remote/vfio-user-obj.c
>@@ -667,4 +667,4 @@ void vfu_object_set_bus_irq(PCIBus *pci_bus)
>
>-pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
>- max_bdf);
>+pci_bus_irqs(pci_bus, vfu_object_set_irq, , pci_bus, max_bdf);
>+pci_bus_map_irqs(pci_bus, vfu_object_map_irq);
> }
>---
>
>to fix:
>
>../hw/remote/vfio-user-obj.c: In function ‘vfu_object_set_bus_irq’:
>../hw/remote/vfio-user-obj.c:668:67: error: passing argument 4 of 
>‘pci_bus_irqs’ makes integer from pointer without a cast 
>[-Werror=int-conversion]
> pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
>   ^~~
>In file included from include/hw/pci/pci_device.h:4,
> from include/hw/remote/iohub.h:14,
> from include/hw/remote/machine.h:18,
> from ../hw/remote/vfio-user-obj.c:43:
>include/hw/pci/pci.h:286:41: note: expected ‘int’ but argument is of type 
>‘PCIBus *’ {aka ‘struct PCIBus *’}
>   void *irq_opaque, int nirq);
> ^~~~
>../hw/remote/vfio-user-obj.c:668:5: error: too many arguments to function 
>‘pci_bus_irqs’
> pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
> ^~~~
>In file included from include/hw/pci/pci_device.h:4,
> from include/hw/remote/iohub.h:14,
> from include/hw/remote/machine.h:18,
> from ../hw/remote/vfio-user-obj.c:43:
>include/hw/pci/pci.h:285:6: note: declared here
> void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq,
>  ^~~~
>

Thanks!

I've missed enabling vfio-user-server for my builds. Fixed now.

Best regards,
Bernhard



Re: [PATCH v6 00/33] Consolidate PIIX south bridges

2023-01-13 Thread Bernhard Beschow



Am 13. Januar 2023 08:46:53 UTC schrieb "Philippe Mathieu-Daudé" 
:
>Hi Bernhard,
>
>On 9/1/23 18:23, Bernhard Beschow wrote:
>> This series consolidates the implementations of the PIIX3 and PIIX4 south
>> bridges and is an extended version of [1]. The motivation is to share as much
>> code as possible and to bring both device models to feature parity such that
>> perhaps PIIX4 can become a drop-in-replacement for PIIX3 in the pc machine. 
>> This
>> could resolve the "Frankenstein" PIIX4-PM problem in PIIX3 discussed on this
>> list before.
>
>> Bernhard Beschow (30):
>>hw/pci/pci: Factor out pci_bus_map_irqs() from pci_bus_irqs()
>>hw/isa/piix3: Decouple INTx-to-LNKx routing which is board-specific
>>hw/isa/piix4: Decouple INTx-to-LNKx routing which is board-specific
>>hw/mips/Kconfig: Track Malta's PIIX dependencies via Kconfig
>>hw/usb/hcd-uhci: Introduce TYPE_ defines for device models
>>hw/intc/i8259: Make using the isa_pic singleton more type-safe
>>hw/intc/i8259: Introduce i8259 proxy TYPE_ISA_PIC
>>hw/i386/pc: Create RTC controllers in south bridges
>>hw/i386/pc: No need for rtc_state to be an out-parameter
>>hw/i386/pc_piix: Allow for setting properties before realizing PIIX3
>>  south bridge
>>hw/isa/piix3: Create USB controller in host device
>>hw/isa/piix3: Create power management controller in host device
>>hw/isa/piix3: Create TYPE_ISA_PIC in host device
>>hw/isa/piix3: Create IDE controller in host device
>>hw/isa/piix3: Wire up ACPI interrupt internally
>>hw/isa/piix3: Resolve redundant PIIX_NUM_PIC_IRQS
>>hw/isa/piix3: Rename pci_piix3_props for sharing with PIIX4
>>hw/isa/piix3: Rename piix3_reset() for sharing with PIIX4
>>hw/isa/piix3: Drop the "3" from PIIX base class
>>hw/isa/piix4: Make PIIX4's ACPI and USB functions optional
>>hw/isa/piix4: Remove unused inbound ISA interrupt lines
>>hw/isa/piix4: Use TYPE_ISA_PIC device
>>hw/isa/piix4: Reuse struct PIIXState from PIIX3
>>hw/isa/piix4: Rename reset control operations to match PIIX3
>>hw/isa/piix3: Merge hw/isa/piix4.c
>>hw/isa/piix: Harmonize names of reset control memory regions
>>hw/isa/piix: Reuse PIIX3 base class' realize method in PIIX4
>>hw/isa/piix: Rename functions to be shared for interrupt triggering
>>hw/isa/piix: Consolidate IRQ triggering
>>hw/isa/piix: Share PIIX3's base class with PIIX4
>> 
>> Philippe Mathieu-Daudé (3):
>>hw/mips/malta: Introduce PIIX4_PCI_DEVFN definition
>>hw/mips/malta: Set PIIX4 IRQ routes in embedded bootloader
>>hw/isa/piix4: Correct IRQRC[A:D] reset values
>
>I'm queuing the first 10 patches for now to alleviate the size of this
>series, and I'll respin a v7 with the rest to avoid making you suffer
>any longer :/ Thanks for insisting in this effort and I apologize it
>is taking me so long...

Okay... What's the further plan? Is there anything missing?

Thanks,
Bernhard
>
>Regards,
>
>Phil.



Re: completion timeouts with pin-based interrupts in QEMU hw/nvme

2023-01-13 Thread Keith Busch
On Fri, Jan 13, 2023 at 12:32:29PM +, Peter Maydell wrote:
> On Fri, 13 Jan 2023 at 08:55, Klaus Jensen  wrote:
> >
> > +CC qemu pci maintainers
> >
> > Michael, Marcel,
> >
> > Do you have any comments on this thread? As you can see one solution is
> > to simply deassert prior to asserting, the other is to reintroduce a
> > pci_irq_pulse(). Both seem to solve the issue.
> 
> Both seem to be missing any analysis of "this is what is
> happening, this is where we differ from hardware, this
> is why this is the correct fix". We shouldn't put in
> random "this seems to happen to cause the guest to boot"
> fixes, please.

It looks like these are being treated as edge instead of level
interrupts so the work-around is to create more edges. I would
definitely prefer a real fix, whether that's in the kernel or CPU
emulation or somewhere else, but I'm not sure I'll have time to see it
to completion.

FWIW, x86 seems to handle legacy IRQs with NVMe as expected. It's
actually easy enough for the DEASSERT to take so long that kernel
reports the irq as "spurious" because it's spinning too often on the
level.



Re: [PATCH 02/14] block: Convert bdrv_io_plug() to co_wrapper

2023-01-13 Thread Kevin Wolf
Am 19.12.2022 um 13:26 hat Emanuele Giuseppe Esposito geschrieben:
> Am 16/12/2022 um 17:12 schrieb Vladimir Sementsov-Ogievskiy:
> > On 12/13/22 11:53, Kevin Wolf wrote:
> >> --- a/include/block/block_int-common.h
> >> +++ b/include/block/block_int-common.h
> >> @@ -729,7 +729,7 @@ struct BlockDriver {
> >>   void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent
> >> event);
> >>     /* io queue for linux-aio */
> >> -    void (*bdrv_io_plug)(BlockDriverState *bs);
> >> +    void coroutine_fn (*bdrv_io_plug)(BlockDriverState *bs);
> > 
> > But you didn't add coroutine_fn to realizations of this callback. Seems,
> > it should be done
> 
> This has to be done by Paolo & Alberto Faria work to add missing
> coroutine_fn and various annotations in a systematic way. It has to be
> done using tools like vrc, and not manually because it's not feasible.
> Not covered by this serie.

I didn't want to change your patches too much (and I'm sure it has
nothing at all to do with laziness!), but Vladimir is really right.
Doing the renames in the final patch is a bit sloppy, just as not adding
coroutine_fn is.

I'll fix it for v2 to do all of the changes in the respective patches.

Kevin




Re: [PATCH v6 09/13] vfio/migration: Implement VFIO migration protocol v2

2023-01-13 Thread Cédric Le Goater

On 1/12/23 09:50, Avihai Horon wrote:

Implement the basic mandatory part of VFIO migration protocol v2.
This includes all functionality that is necessary to support
VFIO_MIGRATION_STOP_COPY part of the v2 protocol.

The two protocols, v1 and v2, will co-exist and in the following patches
v1 protocol code will be removed.

There are several main differences between v1 and v2 protocols:
- VFIO device state is now represented as a finite state machine instead
   of a bitmap.

- Migration interface with kernel is now done using VFIO_DEVICE_FEATURE
   ioctl and normal read() and write() instead of the migration region.

- Pre-copy is made optional in v2 protocol. Support for pre-copy will be
   added later on.

Detailed information about VFIO migration protocol v2 and its difference
compared to v1 protocol can be found here [1].

[1]
https://lore.kernel.org/all/20220224142024.147653-10-yish...@nvidia.com/

Signed-off-by: Avihai Horon 



LGTM,

Reviewed-by: Cédric Le Goater 

Still a small issue below,


---
  include/hw/vfio/vfio-common.h |   5 +
  hw/vfio/common.c  |  19 +-
  hw/vfio/migration.c   | 455 +++---
  hw/vfio/trace-events  |   7 +
  4 files changed, 447 insertions(+), 39 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bbaf72ba00..2ec3346fea 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -66,6 +66,11 @@ typedef struct VFIOMigration {
  int vm_running;
  Notifier migration_state;
  uint64_t pending_bytes;
+enum vfio_device_mig_state device_state;
+int data_fd;
+void *data_buffer;
+size_t data_buffer_size;
+bool v2;
  } VFIOMigration;
  
  typedef struct VFIOAddressSpace {

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 550b2d7ded..dcaa77d2a8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -355,10 +355,18 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer 
*container)
  return false;
  }
  
-if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&

+if (!migration->v2 &&
+(vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
  (migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING)) {
  return false;
  }
+
+if (migration->v2 &&
+(vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
+(migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+ migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+return false;
+}
  }
  }
  return true;
@@ -385,7 +393,14 @@ static bool 
vfio_devices_all_running_and_mig_active(VFIOContainer *container)
  return false;
  }
  
-if (migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING) {

+if (!migration->v2 &&
+migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING) {
+continue;
+}
+
+if (migration->v2 &&
+(migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+ migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P)) {
  continue;
  } else {
  return false;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 9df859f4d3..08f53189fa 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -10,6 +10,7 @@
  #include "qemu/osdep.h"
  #include "qemu/main-loop.h"
  #include "qemu/cutils.h"
+#include "qemu/units.h"
  #include 
  #include 
  
@@ -44,8 +45,103 @@

  #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xef13ULL)
  #define VFIO_MIG_FLAG_DEV_DATA_STATE(0xef14ULL)
  
+/*

+ * This is an arbitrary size based on migration of mlx5 devices, where 
typically
+ * total device migration size is on the order of 100s of MB. Testing with
+ * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
+ */
+#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)
+
  static int64_t bytes_transferred;
  
+static const char *mig_state_to_str(enum vfio_device_mig_state state)

+{
+switch (state) {
+case VFIO_DEVICE_STATE_ERROR:
+return "ERROR";
+case VFIO_DEVICE_STATE_STOP:
+return "STOP";
+case VFIO_DEVICE_STATE_RUNNING:
+return "RUNNING";
+case VFIO_DEVICE_STATE_STOP_COPY:
+return "STOP_COPY";
+case VFIO_DEVICE_STATE_RESUMING:
+return "RESUMING";
+case VFIO_DEVICE_STATE_RUNNING_P2P:
+return "RUNNING_P2P";
+default:
+return "UNKNOWN STATE";
+}
+}
+
+static int vfio_migration_set_state(VFIODevice *vbasedev,
+enum vfio_device_mig_state new_state,
+enum vfio_device_mig_state recover_state)
+{
+VFIOMigration *migration = vbasedev->migration;
+  

Re: [PATCH v6 10/13] vfio/migration: Optimize vfio_save_pending()

2023-01-13 Thread Cédric Le Goater

On 1/12/23 09:50, Avihai Horon wrote:

During pre-copy phase of migration vfio_save_pending() is called
repeatedly and queries the VFIO device for its pending data size.

As long as pending RAM size is over the threshold, migration can't
converge and be completed. Therefore, during this time there is no
point in querying the VFIO device pending data size.

Avoid these unnecessary queries by issuing them in a RAM pre-copy
notifier instead of vfio_save_pending().

This way the VFIO device is queried only when RAM pending data is
below the threshold, when there is an actual chance for migration to
converge.

Signed-off-by: Avihai Horon 



Reviewed-by: Cédric Le Goater 

Thanks,

C.




---
  include/hw/vfio/vfio-common.h |  2 ++
  hw/vfio/migration.c   | 56 +++
  hw/vfio/trace-events  |  1 +
  3 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 2ec3346fea..113f8d9208 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -65,11 +65,13 @@ typedef struct VFIOMigration {
  uint32_t device_state_v1;
  int vm_running;
  Notifier migration_state;
+NotifierWithReturn migration_data;
  uint64_t pending_bytes;
  enum vfio_device_mig_state device_state;
  int data_fd;
  void *data_buffer;
  size_t data_buffer_size;
+uint64_t stop_copy_size;
  bool v2;
  } VFIOMigration;
  
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c

index 08f53189fa..04f4397212 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -655,29 +655,19 @@ static void vfio_v1_save_cleanup(void *opaque)
  trace_vfio_save_cleanup(vbasedev->name);
  }
  
-/*

- * Migration size of VFIO devices can be as little as a few KBs or as big as
- * many GBs. This value should be big enough to cover the worst case.
- */
-#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
  static void vfio_save_pending(void *opaque, uint64_t threshold_size,
uint64_t *res_precopy_only,
uint64_t *res_compatible,
uint64_t *res_postcopy_only)
  {
  VFIODevice *vbasedev = opaque;
-uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
+VFIOMigration *migration = vbasedev->migration;
  
-/*

- * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
- * reported so downtime limit won't be violated.
- */
-vfio_query_stop_copy_size(vbasedev, _copy_size);
-*res_precopy_only += stop_copy_size;
+*res_precopy_only += migration->stop_copy_size;
  
  trace_vfio_save_pending(vbasedev->name, *res_precopy_only,

  *res_postcopy_only, *res_compatible,
-stop_copy_size);
+migration->stop_copy_size);
  }
  
  static void vfio_v1_save_pending(void *opaque, uint64_t threshold_size,

@@ -1104,6 +1094,40 @@ static void vfio_migration_state_notifier(Notifier 
*notifier, void *data)
  }
  }
  
+/*

+ * Migration size of VFIO devices can be as little as a few KBs or as big as
+ * many GBs. This value should be big enough to cover the worst case.
+ */
+#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
+static int vfio_migration_data_notifier(NotifierWithReturn *n, void *data)
+{
+VFIOMigration *migration = container_of(n, VFIOMigration, migration_data);
+VFIODevice *vbasedev = migration->vbasedev;
+PrecopyNotifyData *pnd = data;
+
+if (pnd->reason != PRECOPY_NOTIFY_AFTER_BITMAP_SYNC) {
+return 0;
+}
+
+/* No need to get pending size when finishing migration */
+if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
+return 0;
+}
+
+if (vfio_query_stop_copy_size(vbasedev, >stop_copy_size)) {
+/*
+ * Failed to get pending migration size. Report big pending size so
+ * downtime limit won't be violated.
+ */
+migration->stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
+}
+
+trace_vfio_migration_data_notifier(vbasedev->name,
+   migration->stop_copy_size);
+
+return 0;
+}
+
  static void vfio_migration_exit(VFIODevice *vbasedev)
  {
  VFIOMigration *migration = vbasedev->migration;
@@ -1225,6 +1249,9 @@ static int vfio_migration_init(VFIODevice *vbasedev)
  
  migration->vm_state = qdev_add_vm_change_state_handler(

  vbasedev->dev, vfio_vmstate_change, vbasedev);
+
+migration->migration_data.notify = vfio_migration_data_notifier;
+precopy_add_notifier(>migration_data);
  } else {
  register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1,
   _vfio_v1_handlers, vbasedev);
@@ -1283,6 +1310,9 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
  if (vbasedev->migration) {
  VFIOMigration *migration = vbasedev->migration;
  
+if (migration->v2) {

+

Re: [PATCH v6 12/13] vfio: Alphabetize migration section of VFIO trace-events file

2023-01-13 Thread Cédric Le Goater

On 1/12/23 09:50, Avihai Horon wrote:

Sort the migration section of VFIO trace events file alphabetically
and move two misplaced traces to common.c section.

Signed-off-by: Avihai Horon 




Reviewed-by: Cédric Le Goater 

Thanks,

C.


---
  hw/vfio/trace-events | 22 +++---
  1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 60c49b2ecf..db9cb94952 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -119,6 +119,8 @@ vfio_region_sparse_mmap_header(const char *name, int index, int 
nr_areas) "Devic
  vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) 
"sparse entry %d [0x%lx - 0x%lx]"
  vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) 
"%s index %d, %08x/%0x8"
  vfio_dma_unmap_overflow_workaround(void) ""
+vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container 
fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
+vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 
0x%"PRIx64" - 0x%"PRIx64
  
  # platform.c

  vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group 
#%d"
@@ -148,20 +150,18 @@ vfio_display_edid_update(uint32_t prefx, uint32_t prefy) 
"%ux%u"
  vfio_display_edid_write_error(void) ""
  
  # migration.c

+vfio_load_cleanup(const char *name) " (%s)"
+vfio_load_device_config_state(const char *name) " (%s)"
+vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
+vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 
0x%"PRIx64" ret %d"
+vfio_migration_data_notifier(const char *name, uint64_t stopcopy_size) " (%s) 
stopcopy size 0x%"PRIx64
  vfio_migration_probe(const char *name) " (%s)"
  vfio_migration_set_state(const char *name, const char *state) " (%s) state %s"
-vfio_vmstate_change(const char *name, int running, const char *reason, const char 
*dev_state) " (%s) running %d reason %s device state %s"
  vfio_migration_state_notifier(const char *name, const char *state) " (%s) state 
%s"
-vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer 
size 0x%"PRIx64
+vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
  vfio_save_cleanup(const char *name) " (%s)"
+vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
  vfio_save_device_config_state(const char *name) " (%s)"
  vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible, uint64_t stopcopy_size) 
" (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64" stopcopy size 
0x%"PRIx64
-vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
-vfio_load_device_config_state(const char *name) " (%s)"
-vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
-vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 
0x%"PRIx64" ret %d"
-vfio_load_cleanup(const char *name) " (%s)"
-vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container 
fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
-vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 
0x%"PRIx64" - 0x%"PRIx64
-vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
-vfio_migration_data_notifier(const char *name, uint64_t stopcopy_size) " (%s) 
stopcopy size 0x%"PRIx64
+vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer 
size 0x%"PRIx64
+vfio_vmstate_change(const char *name, int running, const char *reason, const char 
*dev_state) " (%s) running %d reason %s device state %s"





Re: [PATCH v6 06/13] vfio/common: Change vfio_devices_all_running_and_saving() logic to equivalent one

2023-01-13 Thread Cédric Le Goater

On 1/12/23 09:50, Avihai Horon wrote:

vfio_devices_all_running_and_saving() is used to check if migration is
in pre-copy phase. This is done by checking if migration is in setup or
active states and if all VFIO devices are in pre-copy state, i.e.
_SAVING | _RUNNING.

In VFIO migration protocol v2 pre-copy support is made optional. Hence,
a matching v2 protocol pre-copy state can't be used here.

As preparation for adding v2 protocol, change
vfio_devices_all_running_and_saving() logic such that it doesn't use the
VFIO pre-copy state.

The new equivalent logic checks if migration is in active state and if
all VFIO devices are in running state [1]. No functional changes
intended.

[1] Note that checking if migration is in setup or active states and if
all VFIO devices are in running state doesn't guarantee that we are in
pre-copy phase, thus we check if migration is only in active state.

Signed-off-by: Avihai Horon 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  hw/vfio/common.c | 17 ++---
  1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f6dd571549..3a35f4afad 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -40,6 +40,7 @@
  #include "trace.h"
  #include "qapi/error.h"
  #include "migration/migration.h"
+#include "migration/misc.h"
  #include "sysemu/tpm.h"
  
  VFIOGroupList vfio_group_list =

@@ -363,13 +364,16 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer 
*container)
  return true;
  }
  
-static bool vfio_devices_all_running_and_saving(VFIOContainer *container)

+/*
+ * Check if all VFIO devices are running and migration is active, which is
+ * essentially equivalent to the migration being in pre-copy phase.
+ */
+static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
  {
  VFIOGroup *group;
  VFIODevice *vbasedev;
-MigrationState *ms = migrate_get_current();
  
-if (!migration_is_setup_or_active(ms->state)) {

+if (!migration_is_active(migrate_get_current())) {
  return false;
  }
  
@@ -381,8 +385,7 @@ static bool vfio_devices_all_running_and_saving(VFIOContainer *container)

  return false;
  }
  
-if ((migration->device_state & VFIO_DEVICE_STATE_V1_SAVING) &&

-(migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING)) {
+if (migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING) {
  continue;
  } else {
  return false;
@@ -461,7 +464,7 @@ static int vfio_dma_unmap(VFIOContainer *container,
  };
  
  if (iotlb && container->dirty_pages_supported &&

-vfio_devices_all_running_and_saving(container)) {
+vfio_devices_all_running_and_mig_active(container)) {
  return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
  }
  
@@ -488,7 +491,7 @@ static int vfio_dma_unmap(VFIOContainer *container,

  return -errno;
  }
  
-if (iotlb && vfio_devices_all_running_and_saving(container)) {

+if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
  cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
  tcg_enabled() ? DIRTY_CLIENTS_ALL 
:
  DIRTY_CLIENTS_NOCODE);





Re: [PATCH v6 11/13] vfio/migration: Remove VFIO migration protocol v1

2023-01-13 Thread Cédric Le Goater

On 1/12/23 09:50, Avihai Horon wrote:

Now that v2 protocol implementation has been added, remove the
deprecated v1 implementation.

Signed-off-by: Avihai Horon 




Reviewed-by: Cédric Le Goater 

Thanks,

C.



---
  include/hw/vfio/vfio-common.h |   5 -
  hw/vfio/common.c  |  19 +-
  hw/vfio/migration.c   | 703 +-
  hw/vfio/trace-events  |   9 -
  4 files changed, 24 insertions(+), 712 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 113f8d9208..2aba45887c 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -61,18 +61,13 @@ typedef struct VFIORegion {
  typedef struct VFIOMigration {
  struct VFIODevice *vbasedev;
  VMChangeStateEntry *vm_state;
-VFIORegion region;
-uint32_t device_state_v1;
-int vm_running;
  Notifier migration_state;
  NotifierWithReturn migration_data;
-uint64_t pending_bytes;
  enum vfio_device_mig_state device_state;
  int data_fd;
  void *data_buffer;
  size_t data_buffer_size;
  uint64_t stop_copy_size;
-bool v2;
  } VFIOMigration;
  
  typedef struct VFIOAddressSpace {

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index dcaa77d2a8..9a0dbee6b4 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -355,14 +355,7 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer 
*container)
  return false;
  }
  
-if (!migration->v2 &&

-(vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
-(migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING)) {
-return false;
-}
-
-if (migration->v2 &&
-(vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
+if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
  (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
   migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P)) {
  return false;
@@ -393,14 +386,8 @@ static bool 
vfio_devices_all_running_and_mig_active(VFIOContainer *container)
  return false;
  }
  
-if (!migration->v2 &&

-migration->device_state_v1 & VFIO_DEVICE_STATE_V1_RUNNING) {
-continue;
-}
-
-if (migration->v2 &&
-(migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
- migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P)) {
+if (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P) {
  continue;
  } else {
  return false;
diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 04f4397212..7688c83127 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -142,220 +142,6 @@ static int vfio_migration_set_state(VFIODevice *vbasedev,
  return 0;
  }
  
-static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count,

-  off_t off, bool iswrite)
-{
-int ret;
-
-ret = iswrite ? pwrite(vbasedev->fd, val, count, off) :
-pread(vbasedev->fd, val, count, off);
-if (ret < count) {
-error_report("vfio_mig_%s %d byte %s: failed at offset 0x%"
- HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count,
- vbasedev->name, off, strerror(errno));
-return (ret < 0) ? ret : -EINVAL;
-}
-return 0;
-}
-
-static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count,
-   off_t off, bool iswrite)
-{
-int ret, done = 0;
-__u8 *tbuf = buf;
-
-while (count) {
-int bytes = 0;
-
-if (count >= 8 && !(off % 8)) {
-bytes = 8;
-} else if (count >= 4 && !(off % 4)) {
-bytes = 4;
-} else if (count >= 2 && !(off % 2)) {
-bytes = 2;
-} else {
-bytes = 1;
-}
-
-ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite);
-if (ret) {
-return ret;
-}
-
-count -= bytes;
-done += bytes;
-off += bytes;
-tbuf += bytes;
-}
-return done;
-}
-
-#define vfio_mig_read(f, v, c, o)   vfio_mig_rw(f, (__u8 *)v, c, o, false)
-#define vfio_mig_write(f, v, c, o)  vfio_mig_rw(f, (__u8 *)v, c, o, true)
-
-#define VFIO_MIG_STRUCT_OFFSET(f)   \
- offsetof(struct vfio_device_migration_info, f)
-/*
- * Change the device_state register for device @vbasedev. Bits set in @mask
- * are preserved, bits set in @value are set, and bits not set in either @mask
- * or @value are cleared in device_state. If the register cannot be accessed,
- * the resulting state would be invalid, or the device enters an error 

Re: [PATCH 0/1] hw/ide: share bmdma read and write functions

2023-01-13 Thread Liav Albani

  
  


On 1/11/23 01:07, Bernhard Beschow
  wrote:


  

Am 9. Januar 2023 19:24:16 UTC schrieb John Snow :

  
On Tue, Sep 6, 2022 at 10:27 AM Bernhard Beschow  wrote:


  
Am 19. Februar 2022 08:08:17 UTC schrieb Liav Albani :

  
This is a preparation before I send v3 of ich6-ide controller emulation patch.
I figured that it's more trivial to split the changes this way, by extracting
the bmdma functions from via.c and piix.c and sharing them together. Then,
I could easily put these into use when I send v3 of the ich6-ide patch by just
using the already separated functions. This was suggested by BALATON Zoltan when
he submitted a code review on my ich6-ide controller emulation patch.

  
  
Ping. Any news?



*cough*.

Has this been folded into subsequent series, or does this still need attention?

  
  
Both piix and via still have their own bmdma implementations. This patch might be worth having.

Best regards,
Bernhard


I see. Since you are still interested, I will try to see what was
  the outcome of that patch as I really don't remember if it passed
  the CI tests, etc. If applicable, I will send this as v2, or if
  it's already approved, then I guess we could just let it be merged
  to the tree?


Best regards,
Liav


  

  



  

  
Liav Albani (1):
 hw/ide: share bmdma read and write functions between piix.c and via.c

hw/ide/pci.c | 47 
hw/ide/piix.c| 50 ++-
hw/ide/via.c | 51 ++--
include/hw/ide/pci.h |  4 
4 files changed, 55 insertions(+), 97 deletions(-)


  
  




  

  




Re: [PATCH] block-backend: fix virtio-scsi assertion failure with blk_drain_noref()

2023-01-13 Thread Kevin Wolf
Am 04.01.2023 um 20:56 hat Stefan Hajnoczi geschrieben:
> scsi_device_purge_requests() is called from I/O code by virtio-scsi TMF
> emulation code. It must not call Global State APIs like blk_drain()
> because that results in an assertion failure.
> 
> blk_drain() is a Global State API because it uses bdrv_unref(). Actually
> ref/unref is unnecessary in device emulation code because the drive=
> qdev property holds the reference.

Just for the record: We came to the conclusion that this isn't true.

We're not doing blk_ref/unref here, but bdrv_ref/unref. The function
calls bdrv_drained_end(bs), so the node must still exist at that point
and the ref/unref is required to ensure this.

The root node of the BlockBackend can change, so without the ref/unref
pair the last reference to the node can go away even if the BlockBackend
still exists.

So we'll need a different solution.

Kevin




Re: completion timeouts with pin-based interrupts in QEMU hw/nvme

2023-01-13 Thread Klaus Jensen
On Jan 13 12:42, Peter Maydell wrote:
> On Fri, 13 Jan 2023 at 12:37, Klaus Jensen  wrote:
> > There are a fair amount of uses of pci_irq_pulse() still left in the
> > tree.
> 
> Are there? I feel like I'm missing something here:
> $ git grep pci_irq_pulse
> include/hw/pci/pci.h:static inline void pci_irq_pulse(PCIDevice *pci_dev)
> $
> 
> ...looks at first sight like an unused function we could delete.
> 

Oh! My mistake! I grepped for irq_pulse which matched qemu_irq_pulse(),
doh.


signature.asc
Description: PGP signature


Re: completion timeouts with pin-based interrupts in QEMU hw/nvme

2023-01-13 Thread Peter Maydell
On Fri, 13 Jan 2023 at 12:37, Klaus Jensen  wrote:
> There are a fair amount of uses of pci_irq_pulse() still left in the
> tree.

Are there? I feel like I'm missing something here:
$ git grep pci_irq_pulse
include/hw/pci/pci.h:static inline void pci_irq_pulse(PCIDevice *pci_dev)
$

...looks at first sight like an unused function we could delete.

thanks
-- PMM



Re: completion timeouts with pin-based interrupts in QEMU hw/nvme

2023-01-13 Thread Klaus Jensen
On Jan 13 12:32, Peter Maydell wrote:
> On Fri, 13 Jan 2023 at 08:55, Klaus Jensen  wrote:
> >
> > +CC qemu pci maintainers
> >
> > Michael, Marcel,
> >
> > Do you have any comments on this thread? As you can see one solution is
> > to simply deassert prior to asserting, the other is to reintroduce a
> > pci_irq_pulse(). Both seem to solve the issue.
> 
> Both seem to be missing any analysis of "this is what is
> happening, this is where we differ from hardware, this
> is why this is the correct fix". We shouldn't put in
> random "this seems to happen to cause the guest to boot"
> fixes, please.
> 

No, I'd like to get to the bottom of this, which is why I'm reaching out
to the pci maintainers to get an idea about if this is a general/known
issue with pin based interrupts on pci.

There are a fair amount of uses of pci_irq_pulse() still left in the
tree.


signature.asc
Description: PGP signature


Re: completion timeouts with pin-based interrupts in QEMU hw/nvme

2023-01-13 Thread Peter Maydell
On Fri, 13 Jan 2023 at 08:55, Klaus Jensen  wrote:
>
> +CC qemu pci maintainers
>
> Michael, Marcel,
>
> Do you have any comments on this thread? As you can see one solution is
> to simply deassert prior to asserting, the other is to reintroduce a
> pci_irq_pulse(). Both seem to solve the issue.

Both seem to be missing any analysis of "this is what is
happening, this is where we differ from hardware, this
is why this is the correct fix". We shouldn't put in
random "this seems to happen to cause the guest to boot"
fixes, please.

thanks
-- PMM



Re: [PATCH v1 1/1] virtio-block: switch to blk_get_max_hw_transfer

2023-01-13 Thread Kevin Wolf
Am 12.01.2023 um 21:28 hat Ilya Dryomov geschrieben:
> On Thu, Dec 9, 2021 at 10:34 AM Or Ozeri  wrote:
> >
> > The blk_get_max_hw_transfer API was recently added in 6.1.0.
> > It allows querying an underlying block device its max transfer capability.
> > This commit changes virtio-blk to use this.
> >
> > Signed-off-by: Or Ozeri 
> > ---
> >  hw/block/virtio-blk.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > index f139cd7cc9..1ba9a06888 100644
> > --- a/hw/block/virtio-blk.c
> > +++ b/hw/block/virtio-blk.c
> > @@ -458,7 +458,7 @@ static void virtio_blk_submit_multireq(BlockBackend 
> > *blk, MultiReqBuffer *mrb)
> >  return;
> >  }
> >
> > -max_transfer = blk_get_max_transfer(mrb->reqs[0]->dev->blk);
> > +max_transfer = blk_get_max_hw_transfer(mrb->reqs[0]->dev->blk);
> >
> >  qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
> >_compare);
> 
> Hi Or,
> 
> Superficially, this makes sense to me.

I'm not sure I understand. This is not a passthrough device (unlike
scsi-generic), so why should we consider the hardware limits rather than
the kernel/other backend limits for read/write requests?

See the documentation of both fields:

/*
 * Maximal transfer length in bytes.  Need not be power of 2, but
 * must be multiple of opt_transfer and bl.request_alignment, or 0
 * for no 32-bit limit.  For now, anything larger than INT_MAX is
 * clamped down.
 */
uint32_t max_transfer;

/*
 * Maximal hardware transfer length in bytes.  Applies whenever
 * transfers to the device bypass the kernel I/O scheduler, for
 * example with SG_IO.  If larger than max_transfer or if zero,
 * blk_get_max_hw_transfer will fall back to max_transfer.
 */
uint64_t max_hw_transfer;

Is the real problem that max_transfer isn't right?

Kevin




Re: [PATCH 0/4] qemu-img: Fix exit code for errors closing the image

2023-01-13 Thread Kevin Wolf
Am 13.01.2023 um 08:30 hat Markus Armbruster geschrieben:
> Drive-by comment...
> 
> Kevin Wolf  writes:
> 
> > This series addresses the problem described in these bug reports:
> > https://gitlab.com/qemu-project/qemu/-/issues/1330
> > https://bugzilla.redhat.com/show_bug.cgi?id=2147617
> >
> > qcow2 can fail when writing back dirty bitmaps in qcow2_inactivate().
> > However, when the function is called through blk_unref(), in the case of
> > such errors, while an error message is written to stderr, the callers
> > never see an error return. Specifically, 'qemu-img bitmap/commit' are
> > reported to exit with an exit code 0 despite the errors.
> 
> After having tead the "potential alternative" below, I figure this
> failure happens within blk_unref().  But I can't see a call chain.  Am I
> confused?

When I put an abort() into the error path:

#0  0x76aa156c in __pthread_kill_implementation () from /lib64/libc.so.6
#1  0x76a54d76 in raise () from /lib64/libc.so.6
#2  0x76a287f3 in abort () from /lib64/libc.so.6
#3  0x556108f3 in qcow2_inactivate (bs=0x55879a30) at 
../block/qcow2.c:2705
#4  0x55610a08 in qcow2_do_close (bs=0x55879a30, 
close_data_file=true) at ../block/qcow2.c:2741
#5  0x55610b38 in qcow2_close (bs=0x55879a30) at 
../block/qcow2.c:2770
#6  0x555a1b4e in bdrv_close (bs=0x55879a30) at ../block.c:4939
#7  0x555a2ad4 in bdrv_delete (bs=0x55879a30) at ../block.c:5330
#8  0x555a5b49 in bdrv_unref (bs=0x55879a30) at ../block.c:6850
#9  0x5559d6c5 in bdrv_root_unref_child (child=0x55873300) at 
../block.c:3207
#10 0x555c7beb in blk_remove_bs (blk=0x558796e0) at 
../block/block-backend.c:895
#11 0x555c6c3f in blk_delete (blk=0x558796e0) at 
../block/block-backend.c:479
#12 0x555c6fb0 in blk_unref (blk=0x558796e0) at 
../block/block-backend.c:537
#13 0x55587dc9 in img_bitmap (argc=7, argv=0x7fffd760) at 
../qemu-img.c:4820
#14 0x55589807 in main (argc=7, argv=0x7fffd760) at 
../qemu-img.c:5450

> > The solution taken here is inactivating the images first, which can
> > still return errors, but already performs all of the write operations.
> > Only then the images are actually blk_unref()-ed.
> >
> > If we agree that this is the way to go (as a potential alternative,
> > allowing blk_unref() to fail would require changes in all kinds of
> > places, many of which probably wouldn't even know what to do with the
> > error),
> 
> blk_unref() could fail only when it destroys @blk (refcnt goes to zero).
> Correct?

I think so, yes.

> We have a bunch of "unref" functions in the tree, and, as far as I can
> tell from a quick grep, none of them can fail.  Supports your apparent
> preference for not changing blk_unref().
> 
> > then I suppose doing the same for other qemu-img subcommands
> > would make sense, too.
> 
> I was about to ask whether there could be more silent failures like the
> ones in commit and bitmap.  This suggests there are.
> 
> Say we do the same for all known such failures.  Would any remaining (or
> new) such failures be programming errors?

Let's be honest: What I'm proposing here is not pretty and not a full
solution, it only covers the simplest part of the problem, which happens
to be the part that has shown up in practice.

If you have a good idea how to solve the general problem, I'm all ears.

I haven't checked other qemu-img subcommands, but I don't see why they
wouldn't be able to run into an error in .bdrv_close. They could be
fixed the same way.

The next level in difficulty might be QMP block-delete. It's still easy
because like in qemu-img, we know that we're freeing the last reference,
and so we could actually do the same here. Well, more or less the same
at least: Obviously not inactivate_all(), but just for a single node. We
also need to do this recursively for children, except only for those
that would actually go away together with our parent node and aren't
referenced elsewhere. Even if we manage to implement this correctly,
what do we do with the error? Would returning a QMP error imply that we
didn't actually close the image and it's still valid (and not
inactivated)?

Too easy? Let's make it a bit harder. Let's say a commit job completes
and we're now removing the intermediate nodes. One of these images could
in theory fail in .bdrv_close. We have successfully committed the data,
the new graph is ready and in good state. Just one of the old images
we're throwing out runs into ENOSPC in its .bdrv_close. Where do we
report that error? We don't even necessarily have a QMP command here, we
could only let the whole block job fail, which is probably not a good
way to let libvirt know what was happening. Also, we can't just
unconditionally inactivate the image beforehand there, it might still be
in use by other references.  Which may actually be dropped while we're
draining the node in 

Re: [PATCH 1/4] qcow2: Fix theoretical corruption in store_bitmap() error path

2023-01-13 Thread Kevin Wolf
Am 13.01.2023 um 08:30 hat Philippe Mathieu-Daudé geschrieben:
> On 12/1/23 20:14, Kevin Wolf wrote:
> > In order to write the bitmap table to the image file, it is converted to
> > big endian. If the write fails, it is passed to clear_bitmap_table() to
> > free all of the clusters it had allocated before. However, if we don't
> > convert it back to native endianness first, we'll free things at a wrong
> > offset.
> > 
> > In practical terms, the offsets will be so high that we won't actually
> > free any allocated clusters, but just run into an error, but in theory
> > this can cause image corruption.
> > 
> > Cc: qemu-sta...@nongnu.org
> > Signed-off-by: Kevin Wolf 
> > ---
> >   block/qcow2-bitmap.c | 5 +++--
> >   1 file changed, 3 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
> > index bcad567c0c..3dff99ba06 100644
> > --- a/block/qcow2-bitmap.c
> > +++ b/block/qcow2-bitmap.c
> > @@ -115,7 +115,7 @@ static int update_header_sync(BlockDriverState *bs)
> >   return bdrv_flush(bs->file->bs);
> >   }
> 
> Maybe add a comment here remembering to bswap back to native endianness?
> 
> > -static inline void bitmap_table_to_be(uint64_t *bitmap_table, size_t size)
> > +static inline void bitmap_table_bswap_be(uint64_t *bitmap_table, size_t 
> > size)
> >   {
> 
> This function uses cpu_to_be64(), semantically we convert back calling
> be64_to_cpu(), but technically both functions end up being the same.

Yes, but we don't seem to have any public "neutral" functions, it's
always either from or to.

> Alternatively:
> 
>  for (i = 0; i < size; ++i) {
> -bitmap_table[i] = cpu_to_be64(bitmap_table[i]);
> +bswap64s(_table[i]);
>  }

Doesn't that swap even on big endian hosts, resulting incorrectly in a
little endian table?

The closest thing we have that I can see is the be_bswap() macro in
bswap.h, but it's undefined again at the end of the header.

> > @@ -1401,9 +1401,10 @@ static int store_bitmap(BlockDriverState *bs, 
> > Qcow2Bitmap *bm, Error **errp)
> >   goto fail;
> >   }
> > -bitmap_table_to_be(tb, tb_size);
> > +bitmap_table_bswap_be(tb, tb_size);
> >   ret = bdrv_pwrite(bs->file, tb_offset, tb_size * sizeof(tb[0]), tb, 
> > 0);
> >   if (ret < 0) {
> > +bitmap_table_bswap_be(tb, tb_size);
> >   error_setg_errno(errp, -ret, "Failed to write bitmap '%s' to 
> > file",
> >bm_name);
> >   goto fail;
> 
> Pre-existing, but consider using g_autofree for 'tb'.
> 
> Reviewed-by: Philippe Mathieu-Daudé 

Thanks!

Kevin




Re: [PATCH v6 04/33] hw/pci/pci: Factor out pci_bus_map_irqs() from pci_bus_irqs()

2023-01-13 Thread Philippe Mathieu-Daudé

On 9/1/23 18:23, Bernhard Beschow wrote:

pci_bus_irqs() coupled together the assignment of pci_set_irq_fn and
pci_map_irq_fn to a PCI bus. This coupling gets in the way when the
pci_map_irq_fn is board-specific while the pci_set_irq_fn is device-
specific.

For example, both of QEMU's PIIX south bridge models have different
pci_map_irq_fn implementations which are board-specific rather than
device-specific. These implementations should therefore reside in board
code. The pci_set_irq_fn's, however, should stay in the device models
because they access memory internal to the model.

Factoring out pci_bus_map_irqs() from pci_bus_irqs() allows the
assignments to be decoupled, resolving the problem described above.

Note also how pci_vpb_realize() which gets touched in this commit
assigns different pci_map_irq_fn's depending on the board.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Michael S. Tsirkin 
Reviewed-by: Philippe Mathieu-Daudé 
---
  include/hw/pci/pci.h|  3 ++-
  hw/i386/pc_q35.c|  4 ++--
  hw/isa/piix3.c  |  8 
  hw/isa/piix4.c  |  3 ++-
  hw/pci-host/raven.c |  3 ++-
  hw/pci-host/versatile.c |  3 ++-
  hw/pci/pci.c| 12 +---
  hw/remote/machine.c |  3 ++-
  8 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 7048a373d1..85ee458cd2 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -282,8 +282,9 @@ PCIBus *pci_root_bus_new(DeviceState *parent, const char 
*name,
   MemoryRegion *address_space_io,
   uint8_t devfn_min, const char *typename);
  void pci_root_bus_cleanup(PCIBus *bus);
-void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
+void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq,
void *irq_opaque, int nirq);
+void pci_bus_map_irqs(PCIBus *bus, pci_map_irq_fn map_irq);


I'm squashing:

-- >8 --
diff --git a/hw/remote/vfio-user-obj.c b/hw/remote/vfio-user-obj.c
index fe1fdfb5f7..46171f22f7 100644
--- a/hw/remote/vfio-user-obj.c
+++ b/hw/remote/vfio-user-obj.c
@@ -667,4 +667,4 @@ void vfu_object_set_bus_irq(PCIBus *pci_bus)

-pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
- max_bdf);
+pci_bus_irqs(pci_bus, vfu_object_set_irq, , pci_bus, max_bdf);
+pci_bus_map_irqs(pci_bus, vfu_object_map_irq);
 }
---

to fix:

../hw/remote/vfio-user-obj.c: In function ‘vfu_object_set_bus_irq’:
../hw/remote/vfio-user-obj.c:668:67: error: passing argument 4 of 
‘pci_bus_irqs’ makes integer from pointer without a cast 
[-Werror=int-conversion]

 pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
   ^~~
In file included from include/hw/pci/pci_device.h:4,
 from include/hw/remote/iohub.h:14,
 from include/hw/remote/machine.h:18,
 from ../hw/remote/vfio-user-obj.c:43:
include/hw/pci/pci.h:286:41: note: expected ‘int’ but argument is of 
type ‘PCIBus *’ {aka ‘struct PCIBus *’}

   void *irq_opaque, int nirq);
 ^~~~
../hw/remote/vfio-user-obj.c:668:5: error: too many arguments to 
function ‘pci_bus_irqs’

 pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
 ^~~~
In file included from include/hw/pci/pci_device.h:4,
 from include/hw/remote/iohub.h:14,
 from include/hw/remote/machine.h:18,
 from ../hw/remote/vfio-user-obj.c:43:
include/hw/pci/pci.h:285:6: note: declared here
 void pci_bus_irqs(PCIBus *bus, pci_set_irq_fn set_irq,
  ^~~~




Re: completion timeouts with pin-based interrupts in QEMU hw/nvme

2023-01-13 Thread Klaus Jensen
+CC qemu pci maintainers

Michael, Marcel,

Do you have any comments on this thread? As you can see one solution is
to simply deassert prior to asserting, the other is to reintroduce a
pci_irq_pulse(). Both seem to solve the issue.

On Jan 12 14:10, Klaus Jensen wrote:
> Hi all (linux-nvme, qemu-devel, maintainers),
> 
> On QEMU riscv64, which does not use MSI/MSI-X and thus relies on
> pin-based interrupts, I'm seeing occasional completion timeouts, i.e.
> 
>   nvme nvme0: I/O 333 QID 1 timeout, completion polled
> 
> To rule out issues with shadow doorbells (which have been a source of
> frustration in the past), those are disabled. FWIW I'm also seeing the
> issue with shadow doorbells.
> 
>   diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>   index f25cc2c235e9..28d8e7f4b56c 100644
>   --- a/hw/nvme/ctrl.c
>   +++ b/hw/nvme/ctrl.c
>   @@ -7407,7 +7407,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>id->mdts = n->params.mdts;
>id->ver = cpu_to_le32(NVME_SPEC_VER);
>id->oacs =
>   -cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | 
> NVME_OACS_DBBUF);
>   +cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
>id->cntrltype = 0x1;
> 
>/*
> 
> 
> I captured a trace from QEMU when this happens:
> 
> pci_nvme_mmio_write addr 0x1008 data 0x4e size 4
> pci_nvme_mmio_doorbell_sq sqid 1 new_tail 78
> pci_nvme_io_cmd cid 4428 nsid 0x1 sqid 1 opc 0x2 opname 'NVME_NVM_CMD_READ'
> pci_nvme_read cid 4428 nsid 1 nlb 32 count 16384 lba 0x1324
> pci_nvme_map_prp trans_len 4096 len 16384 prp1 0x80aca000 prp2 0x82474100 
> num_prps 5
> pci_nvme_map_addr addr 0x80aca000 len 4096
> pci_nvme_map_addr addr 0x80ac9000 len 4096
> pci_nvme_map_addr addr 0x80ac8000 len 4096
> pci_nvme_map_addr addr 0x80ac7000 len 4096
> pci_nvme_io_cmd cid 4429 nsid 0x1 sqid 1 opc 0x2 opname 'NVME_NVM_CMD_READ'
> pci_nvme_read cid 4429 nsid 1 nlb 224 count 114688 lba 0x1242
> pci_nvme_map_prp trans_len 4096 len 114688 prp1 0x80ae6000 prp2 0x82474000 
> num_prps 29
> pci_nvme_map_addr addr 0x80ae6000 len 4096
> pci_nvme_map_addr addr 0x80ae5000 len 4096
> pci_nvme_map_addr addr 0x80ae4000 len 4096
> pci_nvme_map_addr addr 0x80ae3000 len 4096
> pci_nvme_map_addr addr 0x80ae2000 len 4096
> pci_nvme_map_addr addr 0x80ae1000 len 4096
> pci_nvme_map_addr addr 0x80ae len 4096
> pci_nvme_map_addr addr 0x80adf000 len 4096
> pci_nvme_map_addr addr 0x80ade000 len 4096
> pci_nvme_map_addr addr 0x80add000 len 4096
> pci_nvme_map_addr addr 0x80adc000 len 4096
> pci_nvme_map_addr addr 0x80adb000 len 4096
> pci_nvme_map_addr addr 0x80ada000 len 4096
> pci_nvme_map_addr addr 0x80ad9000 len 4096
> pci_nvme_map_addr addr 0x80ad8000 len 4096
> pci_nvme_map_addr addr 0x80ad7000 len 4096
> pci_nvme_map_addr addr 0x80ad6000 len 4096
> pci_nvme_map_addr addr 0x80ad5000 len 4096
> pci_nvme_map_addr addr 0x80ad4000 len 4096
> pci_nvme_map_addr addr 0x80ad3000 len 4096
> pci_nvme_map_addr addr 0x80ad2000 len 4096
> pci_nvme_map_addr addr 0x80ad1000 len 4096
> pci_nvme_map_addr addr 0x80ad len 4096
> pci_nvme_map_addr addr 0x80acf000 len 4096
> pci_nvme_map_addr addr 0x80ace000 len 4096
> pci_nvme_map_addr addr 0x80acd000 len 4096
> pci_nvme_map_addr addr 0x80acc000 len 4096
> pci_nvme_map_addr addr 0x80acb000 len 4096
> pci_nvme_rw_cb cid 4428 blk 'd0'
> pci_nvme_rw_complete_cb cid 4428 blk 'd0'
> pci_nvme_enqueue_req_completion cid 4428 cqid 1 dw0 0x0 dw1 0x0 status 0x0
> [1]: pci_nvme_irq_pin pulsing IRQ pin
> pci_nvme_rw_cb cid 4429 blk 'd0'
> pci_nvme_rw_complete_cb cid 4429 blk 'd0'
> pci_nvme_enqueue_req_completion cid 4429 cqid 1 dw0 0x0 dw1 0x0 status 0x0
> [2]: pci_nvme_irq_pin pulsing IRQ pin
> [3]: pci_nvme_mmio_write addr 0x100c data 0x4d size 4
> [4]: pci_nvme_mmio_doorbell_cq cqid 1 new_head 77
>  TIMEOUT HERE (30s) ---
> [5]: pci_nvme_mmio_read addr 0x1c size 4
> [6]: pci_nvme_mmio_write addr 0x100c data 0x4e size 4
> [7]: pci_nvme_mmio_doorbell_cq cqid 1 new_head 78
> --- Interrupt deasserted (cq->tail == cq->head)
> [   31.757821] nvme nvme0: I/O 333 QID 1 timeout, completion polled
> 
> Following the timeout, everything returns to "normal" and device/driver
> happily continues.
> 
> The pin-based interrupt logic in hw/nvme seems sound enough to me, so I
> am wondering if there is something going on with the kernel driver (but
> I certainly do not rule out that hw/nvme is at fault here, since
> pin-based interrupts has also been a source of several issues in the
> past).
> 
> What I'm thinking is that following the interrupt in [1], the driver
> picks up completion for cid 4428 but does not find cid 4429 in the queue
> since it has not been posted yet. Before getting a cq head doorbell
> write (which would cause the pin to be deasserted), the device posts the
> completion for cid 4429 which just keeps the interrupt asserted in [2].
> The trace then shows the cq head doorbell update in [3,4] for cid 4428
> and then we hit 

Re: [PATCH v6 00/33] Consolidate PIIX south bridges

2023-01-13 Thread Philippe Mathieu-Daudé

Hi Bernhard,

On 9/1/23 18:23, Bernhard Beschow wrote:

This series consolidates the implementations of the PIIX3 and PIIX4 south
bridges and is an extended version of [1]. The motivation is to share as much
code as possible and to bring both device models to feature parity such that
perhaps PIIX4 can become a drop-in-replacement for PIIX3 in the pc machine. This
could resolve the "Frankenstein" PIIX4-PM problem in PIIX3 discussed on this
list before.



Bernhard Beschow (30):
   hw/pci/pci: Factor out pci_bus_map_irqs() from pci_bus_irqs()
   hw/isa/piix3: Decouple INTx-to-LNKx routing which is board-specific
   hw/isa/piix4: Decouple INTx-to-LNKx routing which is board-specific
   hw/mips/Kconfig: Track Malta's PIIX dependencies via Kconfig
   hw/usb/hcd-uhci: Introduce TYPE_ defines for device models
   hw/intc/i8259: Make using the isa_pic singleton more type-safe
   hw/intc/i8259: Introduce i8259 proxy TYPE_ISA_PIC
   hw/i386/pc: Create RTC controllers in south bridges
   hw/i386/pc: No need for rtc_state to be an out-parameter
   hw/i386/pc_piix: Allow for setting properties before realizing PIIX3
 south bridge
   hw/isa/piix3: Create USB controller in host device
   hw/isa/piix3: Create power management controller in host device
   hw/isa/piix3: Create TYPE_ISA_PIC in host device
   hw/isa/piix3: Create IDE controller in host device
   hw/isa/piix3: Wire up ACPI interrupt internally
   hw/isa/piix3: Resolve redundant PIIX_NUM_PIC_IRQS
   hw/isa/piix3: Rename pci_piix3_props for sharing with PIIX4
   hw/isa/piix3: Rename piix3_reset() for sharing with PIIX4
   hw/isa/piix3: Drop the "3" from PIIX base class
   hw/isa/piix4: Make PIIX4's ACPI and USB functions optional
   hw/isa/piix4: Remove unused inbound ISA interrupt lines
   hw/isa/piix4: Use TYPE_ISA_PIC device
   hw/isa/piix4: Reuse struct PIIXState from PIIX3
   hw/isa/piix4: Rename reset control operations to match PIIX3
   hw/isa/piix3: Merge hw/isa/piix4.c
   hw/isa/piix: Harmonize names of reset control memory regions
   hw/isa/piix: Reuse PIIX3 base class' realize method in PIIX4
   hw/isa/piix: Rename functions to be shared for interrupt triggering
   hw/isa/piix: Consolidate IRQ triggering
   hw/isa/piix: Share PIIX3's base class with PIIX4

Philippe Mathieu-Daudé (3):
   hw/mips/malta: Introduce PIIX4_PCI_DEVFN definition
   hw/mips/malta: Set PIIX4 IRQ routes in embedded bootloader
   hw/isa/piix4: Correct IRQRC[A:D] reset values


I'm queuing the first 10 patches for now to alleviate the size of this
series, and I'll respin a v7 with the rest to avoid making you suffer
any longer :/ Thanks for insisting in this effort and I apologize it
is taking me so long...

Regards,

Phil.



Re: Questions about how block devices use snapshots

2023-01-13 Thread Zhiyong Ye

Hi Kevin,

Thank you for your patience and explanations. I learned a lot from our 
discussions and thank you again for your help.


Regards

Zhiyong

On 1/12/23 7:47 PM, Kevin Wolf wrote:

Am 11.01.2023 um 17:21 hat Zhiyong Ye geschrieben:

Hi Kevin,

Can I ask again how base.img + diff.qcow2 can be re-merged into one image
via qemu-img or hmp command when modified.img is discarded?


You can either use 'qemu-img commit' to copy all of the data from
diff.qcow2 back into base.img (this is probably what you want), or
'qemu-img rebase' to copy all of the data from base.img into diff.qcow2.

Kevin