[PATCH v2 2/3] block/nbd: only enter connection coroutine if it's present

2021-01-28 Thread Roman Kagan
When an NBD block driver state is moved from one aio_context to another
(e.g. when doing a drain in a migration thread),
nbd_client_attach_aio_context_bh is executed, which enters the connection
coroutine.

However, the assumption that ->connection_co is always present here
appears incorrect: the connection may have encountered an error other
than -EIO in the underlying transport, and thus may have decided to quit
rather than keep trying to reconnect, and therefore it may have
terminated the connection coroutine.  As a result an attempt to reassign
the client in this state (NBD_CLIENT_QUIT) to a different aio_context
leads to a null pointer dereference:

  #0  qio_channel_detach_aio_context (ioc=0x0)
  at /build/qemu-gYtjVn/qemu-5.0.1/io/channel.c:452
  #1  0x562a242824b3 in bdrv_detach_aio_context (bs=0x562a268d6a00)
  at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6151
  #2  bdrv_set_aio_context_ignore (bs=bs@entry=0x562a268d6a00,
  new_context=new_context@entry=0x562a260c9580,
  ignore=ignore@entry=0x7feeadc9b780)
  at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6230
  #3  0x562a24282969 in bdrv_child_try_set_aio_context
  (bs=bs@entry=0x562a268d6a00, ctx=0x562a260c9580,
  ignore_child=<optimized out>, errp=<optimized out>)
  at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6332
  #4  0x562a242bb7db in blk_do_set_aio_context (blk=0x562a2735d0d0,
  new_context=0x562a260c9580,
  update_root_node=update_root_node@entry=true, errp=errp@entry=0x0)
  at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:1989
  #5  0x562a242be0bd in blk_set_aio_context (blk=<optimized out>,
  new_context=<optimized out>, errp=errp@entry=0x0)
  at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:2010
  #6  0x562a23fbd953 in virtio_blk_data_plane_stop (vdev=<optimized out>)
  at /build/qemu-gYtjVn/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292
  #7  0x562a241fc7bf in virtio_bus_stop_ioeventfd (bus=0x562a260dbf08)
  at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio-bus.c:245
  #8  0x562a23fefb2e in virtio_vmstate_change (opaque=0x562a260dbf90,
  running=0, state=<optimized out>)
  at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio.c:3220
  #9  0x562a2402ebfd in vm_state_notify (running=running@entry=0,
  state=state@entry=RUN_STATE_FINISH_MIGRATE)
  at /build/qemu-gYtjVn/qemu-5.0.1/softmmu/vl.c:1275
  #10 0x562a23f7bc02 in do_vm_stop (state=RUN_STATE_FINISH_MIGRATE,
  send_stop=<optimized out>)
  at /build/qemu-gYtjVn/qemu-5.0.1/cpus.c:1032
  #11 0x562a24209765 in migration_completion (s=0x562a260e83a0)
  at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:2914
  #12 migration_iteration_run (s=0x562a260e83a0)
  at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3275
  #13 migration_thread (opaque=opaque@entry=0x562a260e83a0)
  at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3439
  #14 0x562a2435ca96 in qemu_thread_start (args=<optimized out>)
  at /build/qemu-gYtjVn/qemu-5.0.1/util/qemu-thread-posix.c:519
  #15 0x7feed31466ba in start_thread (arg=0x7feeadc9c700)
  at pthread_create.c:333
  #16 0x7feed2e7c41d in __GI___sysctl (name=0x0, nlen=608471908,
  oldval=0x562a2452b138, oldlenp=0x0, newval=0x562a2452c5e0
  <__func__.28102>, newlen=0)
  at ../sysdeps/unix/sysv/linux/sysctl.c:30
  #17 0x in ?? ()

Fix it by checking that the connection coroutine is non-null before
trying to enter it.  If it is null, no entering is needed, as the
connection is probably going down anyway.
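
For reference, the client states mentioned above look roughly like this (a
paraphrase of block/nbd.c from this period; names and comments are an
approximation, not the verbatim source):

    typedef enum NBDClientState {
        NBD_CLIENT_CONNECTING_WAIT,    /* reconnecting; requests wait */
        NBD_CLIENT_CONNECTING_NOWAIT,  /* reconnecting; requests fail fast */
        NBD_CLIENT_CONNECTED,          /* normal operation */
        NBD_CLIENT_QUIT                /* fatal error: no more reconnect
                                          attempts, connection_co terminates */
    } NBDClientState;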

Signed-off-by: Roman Kagan 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index bcd6641e90..b3cbbeb4b0 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -250,13 +250,15 @@ static void nbd_client_attach_aio_context_bh(void *opaque)
 BlockDriverState *bs = opaque;
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
 
-/*
- * The node is still drained, so we know the coroutine has yielded in
- * nbd_read_eof(), the only place where bs->in_flight can reach 0, or it is
- * entered for the first time. Both places are safe for entering the
- * coroutine.
- */
-qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
+if (s->connection_co) {
+/*
+ * The node is still drained, so we know the coroutine has yielded in
+ * nbd_read_eof(), the only place where bs->in_flight can reach 0, or
+ * it is entered for the first time. Both places are safe for entering
+ * the coroutine.
+ */
+qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
+}
 bdrv_dec_in_flight(bs);
 }
 
-- 
2.29.2




[PATCH v2 3/3] nbd: make nbd_read* return -EIO on error

2021-01-28 Thread Roman Kagan
NBD reconnect logic considers the error code from the functions that
read NBD messages to tell if reconnect should be attempted or not: it is
attempted on -EIO, otherwise the client transitions to NBD_CLIENT_QUIT
state (see nbd_channel_error).  This error code is propagated from the
primitives like nbd_read.
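
The decision on the caller side is roughly the following (a simplified sketch
of the logic in nbd_channel_error(); the types here are reduced to what the
sketch needs and are assumptions, not the verbatim QEMU definitions):

    typedef enum {
        NBD_CLIENT_CONNECTING_WAIT,
        NBD_CLIENT_CONNECTING_NOWAIT,
        NBD_CLIENT_CONNECTED,
        NBD_CLIENT_QUIT,
    } NBDClientStateSketch;

    typedef struct {
        NBDClientStateSketch state;
        unsigned reconnect_delay;   /* 0: requests don't wait for reconnect */
    } NBDClientSketch;

    static void nbd_channel_error_sketch(NBDClientSketch *s, int ret)
    {
        if (ret == -EIO) {
            /* Transport error: attempt to reconnect if we were connected. */
            if (s->state == NBD_CLIENT_CONNECTED) {
                s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT
                                              : NBD_CLIENT_CONNECTING_NOWAIT;
            }
        } else {
            /* Any other error is treated as fatal: give up, no reconnect. */
            s->state = NBD_CLIENT_QUIT;
        }
    }

So unless the low-level read primitives report transport failures as -EIO,
the reconnect path is never taken.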

The problem, however, is that nbd_read itself turns every error into -1
rather than -EIO.  As a result, if the NBD server happens to die while
sending the message, the client in QEMU receives less data than it
expects, considers this a fatal error, and doesn't attempt to
reestablish the connection.

Fix it by turning every negative return from qio_channel_read_all into
-EIO returned from nbd_read.  Apparently that was the original behavior,
but got broken later.  Also adjust nbd_readXX to follow.

Fixes: e6798f06a6 ("nbd: generalize usage of nbd_read")
Signed-off-by: Roman Kagan 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/nbd.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 4a52a43ef5..5f34d23bb0 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -364,7 +364,7 @@ static inline int nbd_read(QIOChannel *ioc, void *buffer, 
size_t size,
 if (desc) {
 error_prepend(errp, "Failed to read %s: ", desc);
 }
-return -1;
+return ret;
 }
 
 return 0;
@@ -375,8 +375,9 @@ static inline int nbd_read##bits(QIOChannel *ioc,   
\
  uint##bits##_t *val,   \
  const char *desc, Error **errp)\
 {   \
-if (nbd_read(ioc, val, sizeof(*val), desc, errp) < 0) { \
-return -1;  \
+int ret = nbd_read(ioc, val, sizeof(*val), desc, errp); \
+if (ret < 0) {  \
+return ret; \
 }   \
 *val = be##bits##_to_cpu(*val); \
 return 0;   \
-- 
2.29.2




[PATCH v2 0/3] block/nbd: fix crashers in reconnect while migrating

2021-01-28 Thread Roman Kagan
During the final phase of migration the NBD reconnection logic may
encounter situations it doesn't expect during regular operation.

This series addresses some of them that make qemu crash.  They are
reproducible when a vm with a secondary drive attached via nbd with
non-zero "reconnect-delay" runs a stress load (fio with big queue depth)
in the guest on that drive and is migrated (e.g. to a file), while the
nbd server is SIGKILL-ed and restarted every second.

See the individual patches for specific crash conditions and more
detailed explanations.

v1 -> v2:
- fix corrupted backtraces in log messages
- add r-b by Vladimir

Roman Kagan (3):
  block/nbd: only detach existing iochannel from aio_context
  block/nbd: only enter connection coroutine if it's present
  nbd: make nbd_read* return -EIO on error

 include/block/nbd.h |  7 ---
 block/nbd.c | 25 +
 2 files changed, 21 insertions(+), 11 deletions(-)

-- 
2.29.2




[PATCH v2 1/3] block/nbd: only detach existing iochannel from aio_context

2021-01-28 Thread Roman Kagan
When the reconnect in NBD client is in progress, the iochannel used for
NBD connection doesn't exist.  Therefore an attempt to detach it from
the aio_context of the parent BlockDriverState results in a NULL pointer
dereference.

The problem is triggerable, in particular, when an outgoing migration is
about to finish, and stopping the dataplane tries to move the
BlockDriverState from the iothread aio_context to the main loop.  If the
NBD connection is lost before this point, and the NBD client has entered
the reconnect procedure, QEMU crashes:

  #0  qemu_aio_coroutine_enter (ctx=0x5618056c7580, co=0x0)
  at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-coroutine.c:109
  #1  0x5618034b1b68 in nbd_client_attach_aio_context_bh (
  opaque=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block/nbd.c:164
  #2  0x56180353116b in aio_wait_bh (opaque=0x7f60e1e63700)
  at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:55
  #3  0x561803530633 in aio_bh_call (bh=0x7f60d40a7e80)
  at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:136
  #4  aio_bh_poll (ctx=ctx@entry=0x5618056c7580)
  at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:164
  #5  0x561803533e5a in aio_poll (ctx=ctx@entry=0x5618056c7580,
  blocking=blocking@entry=true)
  at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-posix.c:650
  #6  0x56180353128d in aio_wait_bh_oneshot (ctx=0x5618056c7580,
  cb=<optimized out>, opaque=<optimized out>)
  at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:71
  #7  0x56180345c50a in bdrv_attach_aio_context (new_context=0x5618056c7580,
  bs=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6172
  #8  bdrv_set_aio_context_ignore (bs=bs@entry=0x561805ed4c00,
  new_context=new_context@entry=0x5618056c7580,
  ignore=ignore@entry=0x7f60e1e63780)
  at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6237
  #9  0x56180345c969 in bdrv_child_try_set_aio_context (
  bs=bs@entry=0x561805ed4c00, ctx=0x5618056c7580,
  ignore_child=<optimized out>, errp=<optimized out>)
  at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6332
  #10 0x5618034957db in blk_do_set_aio_context (blk=0x56180695b3f0,
  new_context=0x5618056c7580, update_root_node=update_root_node@entry=true,
  errp=errp@entry=0x0)
  at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:1989
  #11 0x5618034980bd in blk_set_aio_context (blk=<optimized out>,
  new_context=<optimized out>, errp=errp@entry=0x0)
  at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:2010
  #12 0x561803197953 in virtio_blk_data_plane_stop (vdev=<optimized out>)
  at /build/qemu-6MF7tq/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292
  #13 0x5618033d67bf in virtio_bus_stop_ioeventfd (bus=0x5618056d9f08)
  at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio-bus.c:245
  #14 0x5618031c9b2e in virtio_vmstate_change (opaque=0x5618056d9f90,
  running=0, state=<optimized out>)
  at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio.c:3220
  #15 0x561803208bfd in vm_state_notify (running=running@entry=0,
  state=state@entry=RUN_STATE_FINISH_MIGRATE)
  at /build/qemu-6MF7tq/qemu-5.0.1/softmmu/vl.c:1275
  #16 0x561803155c02 in do_vm_stop (state=RUN_STATE_FINISH_MIGRATE,
  send_stop=<optimized out>) at /build/qemu-6MF7tq/qemu-5.0.1/cpus.c:1032
  #17 0x5618033e3765 in migration_completion (s=0x5618056e6960)
  at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:2914
  #18 migration_iteration_run (s=0x5618056e6960)
  at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3275
  #19 migration_thread (opaque=opaque@entry=0x5618056e6960)
  at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3439
  #20 0x561803536ad6 in qemu_thread_start (args=<optimized out>)
  at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-thread-posix.c:519
  #21 0x7f61085d06ba in start_thread ()
 from /lib/x86_64-linux-gnu/libpthread.so.0
  #22 0x7f610830641d in sysctl () from /lib/x86_64-linux-gnu/libc.so.6
  #23 0x in ?? ()

Fix it by checking that the iochannel is non-null before trying to
detach it from the aio_context.  If it is null, no detaching is needed,
and it will get reattached in the proper aio_context once the connection
is reestablished.

Signed-off-by: Roman Kagan 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/nbd.c b/block/nbd.c
index 42e10c7c93..bcd6641e90 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -235,7 +235,14 @@ static void nbd_client_detach_aio_context(BlockDriverState 
*bs)
 
 /* Timer is deleted in nbd_client_co_drain_begin() */
 assert(!s->reconnect_delay_timer);
-qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
+/*
+ * If reconnect is in progress we may have no ->ioc.  It will be
+ * re-instantiated in the proper aio context once the connection is
+ * reestablished.
+ */
+if (s->ioc) {
+qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
+}
 }
 
 static void nbd_client_attach_aio_context_bh(void *opaque)
-- 
2.29.2




Re: [PATCH 0/3] block/nbd: fix crashers in reconnect while migrating

2021-01-28 Thread Roman Kagan
On Fri, Jan 29, 2021 at 08:51:39AM +0300, Vladimir Sementsov-Ogievskiy wrote:
> 28.01.2021 23:14, Roman Kagan wrote:
> > During the final phase of migration the NBD reconnection logic may
> > encounter situations it doesn't expect during regular operation.
> > 
> > This series addresses some of them that make qemu crash.  They are
> > reproducible when a vm with a secondary drive attached via nbd with
> > non-zero "reconnect-delay" runs a stress load (fio with big queue depth)
> > in the guest on that drive and is migrated (e.g. to a file), while the
> > nbd server is SIGKILL-ed and restarted every second.
> > 
> > See the individual patches for specific crash conditions and more
> > detailed explanations.
> > 
> > Roman Kagan (3):
> >block/nbd: only detach existing iochannel from aio_context
> >block/nbd: only enter connection coroutine if it's present
> >nbd: make nbd_read* return -EIO on error
> > 
> >   include/block/nbd.h |  7 ---
> >   block/nbd.c | 25 +
> >   2 files changed, 21 insertions(+), 11 deletions(-)
> > 
> 
> Thanks a lot for fixing!
> 
> Do you have some reproducer scripts? Could you post them or maybe add
> an iotest?

I don't have it scripted, just ad hoc command lines.  I'll look into
making up a test.  Can you perhaps suggest an existing test to base it
on?

Thanks,
Roman.



Re: [PATCH 1/3] block/nbd: only detach existing iochannel from aio_context

2021-01-28 Thread Roman Kagan
On Fri, Jan 29, 2021 at 08:37:13AM +0300, Vladimir Sementsov-Ogievskiy wrote:
> 28.01.2021 23:14, Roman Kagan wrote:
> > When the reconnect in NBD client is in progress, the iochannel used for
> > NBD connection doesn't exist.  Therefore an attempt to detach it from
> > the aio_context of the parent BlockDriverState results in a NULL pointer
> > dereference.
> > 
> > The problem is triggerable, in particular, when an outgoing migration is
> > about to finish, and stopping the dataplane tries to move the
> > BlockDriverState from the iothread aio_context to the main loop.  If the
> > NBD connection is lost before this point, and the NBD client has entered
> > the reconnect procedure, QEMU crashes:
> > 
> >  at /build/qemu-gYtjVn/qemu-5.0.1/io/channel.c:452
> >  at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6151
> >  new_context=new_context@entry=0x562a260c9580,
> >  ignore=ignore@entry=0x7feeadc9b780)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6230
> >  (bs=bs@entry=0x562a268d6a00, ctx=0x562a260c9580,
> >  ignore_child=, errp=)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6332
> >  new_context=0x562a260c9580,
> >  update_root_node=update_root_node@entry=true, errp=errp@entry=0x0)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:1989
> >  new_context=, errp=errp@entry=0x0)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:2010
> >  out>)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292
> >  at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio-bus.c:245
> >  running=0, state=)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio.c:3220
> >  state=state@entry=RUN_STATE_FINISH_MIGRATE)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/softmmu/vl.c:1275
> >  send_stop=)
> >  at /build/qemu-gYtjVn/qemu-5.0.1/cpus.c:1032
> >  at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:2914
> >  at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3275
> >  at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3439
> >  at /build/qemu-gYtjVn/qemu-5.0.1/util/qemu-thread-posix.c:519
> >  at pthread_create.c:333
> >  oldval=0x562a2452b138, oldlenp=0x0, newval=0x562a2452c5e0
> >  <__func__.28102>, newlen=0)
> >  at ../sysdeps/unix/sysv/linux/sysctl.c:30
> 
> function names are somehow lost :(

Oops.  The frames in gdb backtraces are prefixed with a hash sign, which
git commit interpreted as comments; I should have indented the backtrace
when pasting.

Will respin with fixed log messages, thanks.

Roman.



[PATCH] hw/block/nvme: refactor zone state check for read

2021-01-28 Thread Klaus Jensen
From: Gollu Appalanaidu 

Align with nvme_check_zone_write.

Remove the unnecessary intermediate status variable and the return at the
end of the function; if an error occurs, return immediately where applicable.

Signed-off-by: Gollu Appalanaidu 
Reviewed-by: Klaus Jensen 
---
 hw/block/nvme.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8ac997735041..1dbd6ec129ae 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1210,7 +1210,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 
 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
 {
-uint16_t status;
+uint64_t zslba = zone->d.zslba;
 
 switch (nvme_get_zone_state(zone)) {
 case NVME_ZONE_STATE_EMPTY:
@@ -1219,16 +1219,13 @@ static uint16_t nvme_check_zone_state_for_read(NvmeZone 
*zone)
 case NVME_ZONE_STATE_FULL:
 case NVME_ZONE_STATE_CLOSED:
 case NVME_ZONE_STATE_READ_ONLY:
-status = NVME_SUCCESS;
-break;
+return NVME_SUCCESS;
 case NVME_ZONE_STATE_OFFLINE:
-status = NVME_ZONE_OFFLINE;
-break;
+trace_pci_nvme_err_zone_is_offline(zslba);
+return NVME_ZONE_OFFLINE;
 default:
 assert(false);
 }
-
-return status;
 }
 
 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
@@ -1241,10 +1238,10 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
uint64_t slba,
 
 status = nvme_check_zone_state_for_read(zone);
 if (status) {
-;
+return status;
 } else if (unlikely(end > bndry)) {
 if (!ns->params.cross_zone_read) {
-status = NVME_ZONE_BOUNDARY_ERROR;
+return NVME_ZONE_BOUNDARY_ERROR;
 } else {
 /*
  * Read across zone boundary - check that all subsequent
@@ -1254,7 +1251,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
uint64_t slba,
 zone++;
 status = nvme_check_zone_state_for_read(zone);
 if (status) {
-break;
+return status;
 }
 } while (end > nvme_zone_rd_boundary(ns, zone));
 }
-- 
2.30.0




Re: [PATCH 0/3] block/nbd: fix crashers in reconnect while migrating

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 23:14, Roman Kagan wrote:

During the final phase of migration the NBD reconnection logic may
encounter situations it doesn't expect during regular operation.

This series addresses some of them that make qemu crash.  They are
reproducible when a vm with a secondary drive attached via nbd with
non-zero "reconnect-delay" runs a stress load (fio with big queue depth)
in the guest on that drive and is migrated (e.g. to a file), while the
nbd server is SIGKILL-ed and restarted every second.

See the individual patches for specific crash conditions and more
detailed explanations.

Roman Kagan (3):
   block/nbd: only detach existing iochannel from aio_context
   block/nbd: only enter connection coroutine if it's present
   nbd: make nbd_read* return -EIO on error

  include/block/nbd.h |  7 ---
  block/nbd.c | 25 +
  2 files changed, 21 insertions(+), 11 deletions(-)



Thanks a lot for fixing!

Do you have some reproducer scripts? Could you post them or maybe add an
iotest?

--
Best regards,
Vladimir



Re: [PATCH 3/3] nbd: make nbd_read* return -EIO on error

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 23:14, Roman Kagan wrote:

NBD reconnect logic considers the error code from the functions that
read NBD messages to tell if reconnect should be attempted or not: it is
attempted on -EIO, otherwise the client transitions to NBD_CLIENT_QUIT
state (see nbd_channel_error).  This error code is propagated from the
primitives like nbd_read.

The problem, however, is that nbd_read itself turns every error into -1
rather than -EIO.  As a result, if the NBD server happens to die while
sending the message, the client in QEMU receives less data than it
expects, considers it as a fatal error, and wouldn't attempt
reestablishing the connection.

Fix it by turning every negative return from qio_channel_read_all into
-EIO returned from nbd_read.  Apparently that was the original behavior,
but got broken later.  Also adjust nbd_readXX to follow.

Fixes: e6798f06a6 ("nbd: generalize usage of nbd_read")
Signed-off-by: Roman Kagan 


Reviewed-by: Vladimir Sementsov-Ogievskiy 

It really looks like a bug in e6798f06a6: it changes the error code from -EIO
to -1 without any reasoning.


---
  include/block/nbd.h | 7 ---
  1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 4a52a43ef5..5f34d23bb0 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -364,7 +364,7 @@ static inline int nbd_read(QIOChannel *ioc, void *buffer, 
size_t size,
  if (desc) {
  error_prepend(errp, "Failed to read %s: ", desc);
  }
-return -1;
+return ret;
  }
  
  return 0;

@@ -375,8 +375,9 @@ static inline int nbd_read##bits(QIOChannel *ioc,   
\
   uint##bits##_t *val,   \
   const char *desc, Error **errp)\
  {   \
-if (nbd_read(ioc, val, sizeof(*val), desc, errp) < 0) { \
-return -1;  \
+int ret = nbd_read(ioc, val, sizeof(*val), desc, errp); \
+if (ret < 0) {  \
+return ret; \
  }   \
  *val = be##bits##_to_cpu(*val); \
  return 0;   \




--
Best regards,
Vladimir



Re: [PATCH 2/3] block/nbd: only enter connection coroutine if it's present

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 23:14, Roman Kagan wrote:

When an NBD block driver state is moved from one aio_context to another
(e.g. when doing a drain in a migration thread),
nbd_client_attach_aio_context_bh is executed that enters the connection
coroutine.

However, the assumption that ->connection_co is always present here
appears incorrect: the connection may have encountered an error other
than -EIO in the underlying transport, and thus may have decided to quit
rather than keep trying to reconnect, and therefore it may have
terminated the connection coroutine.  As a result an attempt to reassign
the client in this state (NBD_CLIENT_QUIT) to a different aio_context
leads to a null pointer dereference:

 at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-coroutine.c:109
 opaque=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block/nbd.c:164
 at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:55
 at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:136
 at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:164
 blocking=blocking@entry=true)
 at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-posix.c:650
 cb=, opaque=)
 at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:71
 bs=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6172
 new_context=new_context@entry=0x5618056c7580,
 ignore=ignore@entry=0x7f60e1e63780)
 at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6237
 bs=bs@entry=0x561805ed4c00, ctx=0x5618056c7580,
 ignore_child=, errp=)
 at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6332
 new_context=0x5618056c7580, update_root_node=update_root_node@entry=true,
 errp=errp@entry=0x0)
 at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:1989
 new_context=, errp=errp@entry=0x0)
 at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:2010
 at /build/qemu-6MF7tq/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292
 at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio-bus.c:245
 running=0, state=)
 at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio.c:3220
 state=state@entry=RUN_STATE_FINISH_MIGRATE)
 at /build/qemu-6MF7tq/qemu-5.0.1/softmmu/vl.c:1275
 send_stop=) at /build/qemu-6MF7tq/qemu-5.0.1/cpus.c:1032
 at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:2914
 at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3275
 at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3439
 at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-thread-posix.c:519
from /lib/x86_64-linux-gnu/libpthread.so.0

Fix it by checking that the connection coroutine is non-null before
trying to enter it.  If it is null, no entering is needed, as the
connection is probably going down anyway.

Signed-off-by: Roman Kagan 


Reviewed-by: Vladimir Sementsov-Ogievskiy 


---
  block/nbd.c | 16 +---
  1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index bcd6641e90..b3cbbeb4b0 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -250,13 +250,15 @@ static void nbd_client_attach_aio_context_bh(void *opaque)
  BlockDriverState *bs = opaque;
  BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
  
-/*

- * The node is still drained, so we know the coroutine has yielded in
- * nbd_read_eof(), the only place where bs->in_flight can reach 0, or it is
- * entered for the first time. Both places are safe for entering the
- * coroutine.
- */
-qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
+if (s->connection_co) {
+/*
+ * The node is still drained, so we know the coroutine has yielded in
+ * nbd_read_eof(), the only place where bs->in_flight can reach 0, or
+ * it is entered for the first time. Both places are safe for entering
+ * the coroutine.
+ */
+qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
+}
  bdrv_dec_in_flight(bs);
  }
  




--
Best regards,
Vladimir



Re: [PATCH 1/3] block/nbd: only detach existing iochannel from aio_context

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 23:14, Roman Kagan wrote:

When the reconnect in NBD client is in progress, the iochannel used for
NBD connection doesn't exist.  Therefore an attempt to detach it from
the aio_context of the parent BlockDriverState results in a NULL pointer
dereference.

The problem is triggerable, in particular, when an outgoing migration is
about to finish, and stopping the dataplane tries to move the
BlockDriverState from the iothread aio_context to the main loop.  If the
NBD connection is lost before this point, and the NBD client has entered
the reconnect procedure, QEMU crashes:

 at /build/qemu-gYtjVn/qemu-5.0.1/io/channel.c:452
 at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6151
 new_context=new_context@entry=0x562a260c9580,
 ignore=ignore@entry=0x7feeadc9b780)
 at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6230
 (bs=bs@entry=0x562a268d6a00, ctx=0x562a260c9580,
 ignore_child=, errp=)
 at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6332
 new_context=0x562a260c9580,
 update_root_node=update_root_node@entry=true, errp=errp@entry=0x0)
 at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:1989
 new_context=, errp=errp@entry=0x0)
 at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:2010
 out>)
 at /build/qemu-gYtjVn/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292
 at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio-bus.c:245
 running=0, state=)
 at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio.c:3220
 state=state@entry=RUN_STATE_FINISH_MIGRATE)
 at /build/qemu-gYtjVn/qemu-5.0.1/softmmu/vl.c:1275
 send_stop=)
 at /build/qemu-gYtjVn/qemu-5.0.1/cpus.c:1032
 at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:2914
 at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3275
 at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3439
 at /build/qemu-gYtjVn/qemu-5.0.1/util/qemu-thread-posix.c:519
 at pthread_create.c:333
 oldval=0x562a2452b138, oldlenp=0x0, newval=0x562a2452c5e0
 <__func__.28102>, newlen=0)
 at ../sysdeps/unix/sysv/linux/sysctl.c:30


function names are somehow lost :(



Fix it by checking that the iochannel is non-null before trying to
detach it from the aio_context.  If it is null, no detaching is needed,
and it will get reattached in the proper aio_context once the connection
is reestablished.

Signed-off-by: Roman Kagan 


Thanks!

Reviewed-by: Vladimir Sementsov-Ogievskiy 


---
  block/nbd.c | 9 -
  1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/nbd.c b/block/nbd.c
index 42e10c7c93..bcd6641e90 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -235,7 +235,14 @@ static void nbd_client_detach_aio_context(BlockDriverState 
*bs)
  
  /* Timer is deleted in nbd_client_co_drain_begin() */

  assert(!s->reconnect_delay_timer);
-qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
+/*
+ * If reconnect is in progress we may have no ->ioc.  It will be
+ * re-instantiated in the proper aio context once the connection is
+ * reestablished.
+ */
+if (s->ioc) {
+qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
+}
  }
  
  static void nbd_client_attach_aio_context_bh(void *opaque)





--
Best regards,
Vladimir



Re: [PULL 14/53] block: apply COR-filter to block-stream jobs

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 21:38, Philippe Mathieu-Daudé wrote:

Hi Andrey,

On 1/26/21 3:19 PM, Max Reitz wrote:

From: Andrey Shinkevich 

This patch completes the series with the COR-filter applied to
block-stream operations.

Adding the filter makes it possible in the future to implement discarding
copied regions in backing files during the block-stream job, to reduce
disk overuse (we need control of the permissions).

Also, the filter is now smart enough to do copy-on-read with a specified
base, so guest reads benefit even when block-streaming only part of the
backing chain.

Several iotests are slightly modified due to filter insertion.

Signed-off-by: Andrey Shinkevich 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20201216061703.70908-14-vsement...@virtuozzo.com>
Reviewed-by: Max Reitz 
Signed-off-by: Max Reitz 
---
  block/stream.c | 105 ++---
  tests/qemu-iotests/030 |   8 +--
  tests/qemu-iotests/141.out |   2 +-
  tests/qemu-iotests/245 |  20 ---
  4 files changed, 80 insertions(+), 55 deletions(-)

diff --git a/block/stream.c b/block/stream.c

...

@@ -228,7 +211,9 @@ void stream_start(const char *job_id, BlockDriverState *bs,
  bool bs_read_only;
  int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
  BlockDriverState *base_overlay;
+BlockDriverState *cor_filter_bs = NULL;
  BlockDriverState *above_base;
+QDict *opts;
  
  assert(!(base && bottom));

  assert(!(backing_file_str && bottom));
@@ -266,30 +251,62 @@ void stream_start(const char *job_id, BlockDriverState 
*bs,
  }
  }
  
-if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) {

-return;
-}
-
  /* Make sure that the image is opened in read-write mode */
  bs_read_only = bdrv_is_read_only(bs);
  if (bs_read_only) {
-if (bdrv_reopen_set_read_only(bs, false, errp) != 0) {
-bs_read_only = false;
-goto fail;
+int ret;
+/* Hold the chain during reopen */
+if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) {
+return;
+}
+
+ret = bdrv_reopen_set_read_only(bs, false, errp);
+
+/* failure, or cor-filter will hold the chain */
+bdrv_unfreeze_backing_chain(bs, above_base);
+
+if (ret < 0) {
+return;
  }
  }
  
-/* Prevent concurrent jobs trying to modify the graph structure here, we

- * already have our own plans. Also don't allow resize as the image size is
- * queried only at the job start and then cached. */
-s = block_job_create(job_id, &stream_job_driver, NULL, bs,
- basic_flags | BLK_PERM_GRAPH_MOD,
+opts = qdict_new();


Coverity reported (CID 1445793) that this resource could be leaked...


+
+qdict_put_str(opts, "driver", "copy-on-read");
+qdict_put_str(opts, "file", bdrv_get_node_name(bs));
+/* Pass the base_overlay node name as 'bottom' to COR driver */
+qdict_put_str(opts, "bottom", base_overlay->node_name);
+if (filter_node_name) {
+qdict_put_str(opts, "node-name", filter_node_name);
+}
+
+cor_filter_bs = bdrv_insert_node(bs, opts, BDRV_O_RDWR, errp);
+if (!cor_filter_bs) {


... probably here.

Should we call g_free(opts) here?



Actually, no.

bdrv_insert_node() calls bdrv_open(), which consumes the options even on failure.

I see the CID is already marked as a false positive?
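
For reference, the ownership pattern in question looks roughly like this (a
sketch under the assumption stated above, namely that bdrv_insert_node() /
bdrv_open() take ownership of the options QDict even on failure; note QDict is
a QObject, so explicit cleanup would be qobject_unref(), not g_free()):

    QDict *opts = qdict_new();

    qdict_put_str(opts, "driver", "copy-on-read");
    qdict_put_str(opts, "file", bdrv_get_node_name(bs));

    /*
     * bdrv_insert_node() hands 'opts' to bdrv_open(), which releases the
     * reference even when it fails, so the caller must not free it here.
     */
    cor_filter_bs = bdrv_insert_node(bs, opts, BDRV_O_RDWR, errp);
    if (!cor_filter_bs) {
        /* no qobject_unref(opts): the reference was already consumed */
        goto fail;
    }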




+goto fail;
+}
+
+if (!filter_node_name) {
+cor_filter_bs->implicit = true;
+}
+
+s = block_job_create(job_id, &stream_job_driver, NULL, cor_filter_bs,
+ BLK_PERM_CONSISTENT_READ,
   basic_flags | BLK_PERM_WRITE,
   speed, creation_flags, NULL, NULL, errp);
  if (!s) {
  goto fail;
  }
  
+/*

+ * Prevent concurrent jobs trying to modify the graph structure here, we
+ * already have our own plans. Also don't allow resize as the image size is
+ * queried only at the job start and then cached.
+ */
+if (block_job_add_bdrv(>common, "active node", bs, 0,
+   basic_flags | BLK_PERM_WRITE, _abort)) {
+goto fail;
+}
+
  /* Block all intermediate nodes between bs and base, because they will
   * disappear from the chain after this operation. The streaming job reads
   * every block only once, assuming that it doesn't change, so forbid 
writes
@@ -310,9 +327,9 @@ void stream_start(const char *job_id, BlockDriverState *bs,
  s->base_overlay = base_overlay;
  s->above_base = above_base;
  s->backing_file_str = g_strdup(backing_file_str);
+s->cor_filter_bs = cor_filter_bs;
  s->target_bs = bs;
  s->bs_read_only = bs_read_only;
-s->chain_frozen = true;
  
  s->on_error = on_error;

  trace_stream_start(bs, base, s);
@@ -320,8 +337,10 @@ void stream_start(const char *job_id, BlockDriverState *bs,
  

[PATCH] tests/Makefile.include: export PYTHON for check-block.sh

2021-01-28 Thread Vladimir Sementsov-Ogievskiy
check-block.sh, called by make check-block, relies on the PYTHON variable
being set.

Fixes: f203080bbd9f9e5b31041b1f2afcd6040c5aaec5
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---

Hi! As Peter reported, the build fails on platforms where python3 is not
/usr/bin/python3. This patch should help. At least it works for me if
I move /usr/bin/python3 to another location and configure with
--python=. And it doesn't work without the patch.
I don't know how this seemed to work for me before :\

 tests/Makefile.include | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/Makefile.include b/tests/Makefile.include
index 3a0524ce74..ceaf3f0d6e 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -138,6 +138,7 @@ check:
 ifeq ($(CONFIG_TOOLS)$(CONFIG_POSIX),yy)
 QEMU_IOTESTS_HELPERS-$(CONFIG_LINUX) = 
tests/qemu-iotests/socket_scm_helper$(EXESUF)
 check: check-block
+export PYTHON
 check-block: $(SRC_PATH)/tests/check-block.sh qemu-img$(EXESUF) \
qemu-io$(EXESUF) qemu-nbd$(EXESUF) $(QEMU_IOTESTS_HELPERS-y) \
$(filter qemu-system-%, $(ninja-targets))
-- 
2.29.2




[PATCH 1/3] block/nbd: only detach existing iochannel from aio_context

2021-01-28 Thread Roman Kagan
When the reconnect in NBD client is in progress, the iochannel used for
NBD connection doesn't exist.  Therefore an attempt to detach it from
the aio_context of the parent BlockDriverState results in a NULL pointer
dereference.

The problem is triggerable, in particular, when an outgoing migration is
about to finish, and stopping the dataplane tries to move the
BlockDriverState from the iothread aio_context to the main loop.  If the
NBD connection is lost before this point, and the NBD client has entered
the reconnect procedure, QEMU crashes:

at /build/qemu-gYtjVn/qemu-5.0.1/io/channel.c:452
at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6151
new_context=new_context@entry=0x562a260c9580,
ignore=ignore@entry=0x7feeadc9b780)
at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6230
(bs=bs@entry=0x562a268d6a00, ctx=0x562a260c9580,
ignore_child=, errp=)
at /build/qemu-gYtjVn/qemu-5.0.1/block.c:6332
new_context=0x562a260c9580,
update_root_node=update_root_node@entry=true, errp=errp@entry=0x0)
at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:1989
new_context=, errp=errp@entry=0x0)
at /build/qemu-gYtjVn/qemu-5.0.1/block/block-backend.c:2010
out>)
at /build/qemu-gYtjVn/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292
at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio-bus.c:245
running=0, state=)
at /build/qemu-gYtjVn/qemu-5.0.1/hw/virtio/virtio.c:3220
state=state@entry=RUN_STATE_FINISH_MIGRATE)
at /build/qemu-gYtjVn/qemu-5.0.1/softmmu/vl.c:1275
send_stop=)
at /build/qemu-gYtjVn/qemu-5.0.1/cpus.c:1032
at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:2914
at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3275
at /build/qemu-gYtjVn/qemu-5.0.1/migration/migration.c:3439
at /build/qemu-gYtjVn/qemu-5.0.1/util/qemu-thread-posix.c:519
at pthread_create.c:333
oldval=0x562a2452b138, oldlenp=0x0, newval=0x562a2452c5e0
<__func__.28102>, newlen=0)
at ../sysdeps/unix/sysv/linux/sysctl.c:30

Fix it by checking that the iochannel is non-null before trying to
detach it from the aio_context.  If it is null, no detaching is needed,
and it will get reattached in the proper aio_context once the connection
is reestablished.

Signed-off-by: Roman Kagan 
---
 block/nbd.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/nbd.c b/block/nbd.c
index 42e10c7c93..bcd6641e90 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -235,7 +235,14 @@ static void nbd_client_detach_aio_context(BlockDriverState 
*bs)
 
 /* Timer is deleted in nbd_client_co_drain_begin() */
 assert(!s->reconnect_delay_timer);
-qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
+/*
+ * If reconnect is in progress we may have no ->ioc.  It will be
+ * re-instantiated in the proper aio context once the connection is
+ * reestablished.
+ */
+if (s->ioc) {
+qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
+}
 }
 
 static void nbd_client_attach_aio_context_bh(void *opaque)
-- 
2.29.2




[PATCH 2/3] block/nbd: only enter connection coroutine if it's present

2021-01-28 Thread Roman Kagan
When an NBD block driver state is moved from one aio_context to another
(e.g. when doing a drain in a migration thread),
nbd_client_attach_aio_context_bh is executed that enters the connection
coroutine.

However, the assumption that ->connection_co is always present here
appears incorrect: the connection may have encountered an error other
than -EIO in the underlying transport, and thus may have decided to quit
rather than keep trying to reconnect, and therefore it may have
terminated the connection coroutine.  As a result an attempt to reassign
the client in this state (NBD_CLIENT_QUIT) to a different aio_context
leads to a null pointer dereference:

at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-coroutine.c:109
opaque=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block/nbd.c:164
at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:55
at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:136
at /build/qemu-6MF7tq/qemu-5.0.1/util/async.c:164
blocking=blocking@entry=true)
at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-posix.c:650
cb=, opaque=)
at /build/qemu-6MF7tq/qemu-5.0.1/util/aio-wait.c:71
bs=0x561805ed4c00) at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6172
new_context=new_context@entry=0x5618056c7580,
ignore=ignore@entry=0x7f60e1e63780)
at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6237
bs=bs@entry=0x561805ed4c00, ctx=0x5618056c7580,
ignore_child=, errp=)
at /build/qemu-6MF7tq/qemu-5.0.1/block.c:6332
new_context=0x5618056c7580, update_root_node=update_root_node@entry=true,
errp=errp@entry=0x0)
at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:1989
new_context=, errp=errp@entry=0x0)
at /build/qemu-6MF7tq/qemu-5.0.1/block/block-backend.c:2010
at /build/qemu-6MF7tq/qemu-5.0.1/hw/block/dataplane/virtio-blk.c:292
at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio-bus.c:245
running=0, state=)
at /build/qemu-6MF7tq/qemu-5.0.1/hw/virtio/virtio.c:3220
state=state@entry=RUN_STATE_FINISH_MIGRATE)
at /build/qemu-6MF7tq/qemu-5.0.1/softmmu/vl.c:1275
send_stop=) at /build/qemu-6MF7tq/qemu-5.0.1/cpus.c:1032
at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:2914
at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3275
at /build/qemu-6MF7tq/qemu-5.0.1/migration/migration.c:3439
at /build/qemu-6MF7tq/qemu-5.0.1/util/qemu-thread-posix.c:519
   from /lib/x86_64-linux-gnu/libpthread.so.0

Fix it by checking that the connection coroutine is non-null before
trying to enter it.  If it is null, no entering is needed, as the
connection is probably going down anyway.

Signed-off-by: Roman Kagan 
---
 block/nbd.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index bcd6641e90..b3cbbeb4b0 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -250,13 +250,15 @@ static void nbd_client_attach_aio_context_bh(void *opaque)
 BlockDriverState *bs = opaque;
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
 
-/*
- * The node is still drained, so we know the coroutine has yielded in
- * nbd_read_eof(), the only place where bs->in_flight can reach 0, or it is
- * entered for the first time. Both places are safe for entering the
- * coroutine.
- */
-qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
+if (s->connection_co) {
+/*
+ * The node is still drained, so we know the coroutine has yielded in
+ * nbd_read_eof(), the only place where bs->in_flight can reach 0, or
+ * it is entered for the first time. Both places are safe for entering
+ * the coroutine.
+ */
+qemu_aio_coroutine_enter(bs->aio_context, s->connection_co);
+}
 bdrv_dec_in_flight(bs);
 }
 
-- 
2.29.2




[PATCH 3/3] nbd: make nbd_read* return -EIO on error

2021-01-28 Thread Roman Kagan
NBD reconnect logic considers the error code from the functions that
read NBD messages to tell if reconnect should be attempted or not: it is
attempted on -EIO, otherwise the client transitions to NBD_CLIENT_QUIT
state (see nbd_channel_error).  This error code is propagated from the
primitives like nbd_read.

The problem, however, is that nbd_read itself turns every error into -1
rather than -EIO.  As a result, if the NBD server happens to die while
sending the message, the client in QEMU receives less data than it
expects, considers it as a fatal error, and wouldn't attempt
reestablishing the connection.

Fix it by turning every negative return from qio_channel_read_all into
-EIO returned from nbd_read.  Apparently that was the original behavior,
but got broken later.  Also adjust nbd_readXX to follow.

Fixes: e6798f06a6 ("nbd: generalize usage of nbd_read")
Signed-off-by: Roman Kagan 
---
 include/block/nbd.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 4a52a43ef5..5f34d23bb0 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -364,7 +364,7 @@ static inline int nbd_read(QIOChannel *ioc, void *buffer, 
size_t size,
 if (desc) {
 error_prepend(errp, "Failed to read %s: ", desc);
 }
-return -1;
+return ret;
 }
 
 return 0;
@@ -375,8 +375,9 @@ static inline int nbd_read##bits(QIOChannel *ioc,   
\
  uint##bits##_t *val,   \
  const char *desc, Error **errp)\
 {   \
-if (nbd_read(ioc, val, sizeof(*val), desc, errp) < 0) { \
-return -1;  \
+int ret = nbd_read(ioc, val, sizeof(*val), desc, errp); \
+if (ret < 0) {  \
+return ret; \
 }   \
 *val = be##bits##_to_cpu(*val); \
 return 0;   \
-- 
2.29.2




[PATCH 0/3] block/nbd: fix crashers in reconnect while migrating

2021-01-28 Thread Roman Kagan
During the final phase of migration the NBD reconnection logic may
encounter situations it doesn't expect during regular operation.

This series addresses some of them that make qemu crash.  They are
reproducible when a vm with a secondary drive attached via nbd with
non-zero "reconnect-delay" runs a stress load (fio with big queue depth)
in the guest on that drive and is migrated (e.g. to a file), while the
nbd server is SIGKILL-ed and restarted every second.

See the individual patches for specific crash conditions and more
detailed explanations.

Roman Kagan (3):
  block/nbd: only detach existing iochannel from aio_context
  block/nbd: only enter connection coroutine if it's present
  nbd: make nbd_read* return -EIO on error

 include/block/nbd.h |  7 ---
 block/nbd.c | 25 +
 2 files changed, 21 insertions(+), 11 deletions(-)

-- 
2.29.2




Re: [PULL 0/8] Block layer patches

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 21:19, Peter Maydell wrote:

On Wed, 27 Jan 2021 at 19:58, Kevin Wolf  wrote:


The following changes since commit bf159f0bdc7b8e7aa8342dedb3829ca744c1b612:

   Merge remote-tracking branch 
'remotes/edgar/tags/edgar/xilinx-next-2021-01-27.for-upstream' into staging 
(2021-01-27 17:40:25 +)

are available in the Git repository at:

   git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to a44be0334beae3a9affb4a3a92cc6852993d7a84:

   iotests: rename and move 169 and 199 tests (2021-01-27 20:53:14 +0100)


Block layer patches:

- Fix crash on write to read-only devices
- iotests: Rewrite 'check' in Python, get rid of 'groups' and allow
   non-numeric test case names




I somehow failed to notice before applying this, but this
breaks 'make check' on the netbsd and freebsd VMs.

As usual the build log is pretty opaque, but this looks like the
probable culprit:

env: python3: No such file or directory
gmake: *** [/home/qemu/qemu-test.nU2bcG/src/tests/Makefile.include:144:
check-block] Error 1

The python in the netbsd VM is /usr/pkg/bin/python3.7. Something
seems to be ignoring the --python= passed into our configure
and assuming that "python3" is always a valid executable.



Seems, that's shows that the fact that "PYTHON" variable in check-block.sh 
works for me doesn't mean that it works everywhere.. My fault, I look closer tomorrow.

--
Best regards,
Vladimir



Re: [PATCH] iotests/297: pylint: ignore too many statements

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 23:04, Vladimir Sementsov-Ogievskiy wrote:

Ignore two complaints, which now cause 297 to fail on testenv.py and
testrunner.py.

Fixes: 2e5a2f57db481f18fcf70be2a36b1417370b8476
Fixes: d74c754c924ca34e90b7c96ce2f5609d82c0e628
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---


Forgot to note:

I intentionally don't add exclusions to pylintrc, as I think these warnings are
reasonable, and it's good that vim ALE shows them. Still, adding them to
pylintrc works too if you prefer.


  tests/qemu-iotests/297 | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297
index a37910b42d..3e3e0e34aa 100755
--- a/tests/qemu-iotests/297
+++ b/tests/qemu-iotests/297
@@ -73,7 +73,9 @@ def run_linters():
  env['PYTHONPATH'] += os.pathsep + qemu_module_path
  except KeyError:
  env['PYTHONPATH'] = qemu_module_path
-subprocess.run(('pylint-3', '--score=n', '--notes=FIXME,XXX', *files),
+subprocess.run(('pylint-3', '--score=n', '--notes=FIXME,XXX',
+'--disable=too-many-return-statements',
+'--disable=too-many-statements', *files),
 env=env, check=False)
  
  print('=== mypy ===')





--
Best regards,
Vladimir



[PATCH] iotests/297: pylint: ignore too many statements

2021-01-28 Thread Vladimir Sementsov-Ogievskiy
Ignore two complaints, which now cause 297 to fail on testenv.py and
testrunner.py.

Fixes: 2e5a2f57db481f18fcf70be2a36b1417370b8476
Fixes: d74c754c924ca34e90b7c96ce2f5609d82c0e628
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/297 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/297 b/tests/qemu-iotests/297
index a37910b42d..3e3e0e34aa 100755
--- a/tests/qemu-iotests/297
+++ b/tests/qemu-iotests/297
@@ -73,7 +73,9 @@ def run_linters():
 env['PYTHONPATH'] += os.pathsep + qemu_module_path
 except KeyError:
 env['PYTHONPATH'] = qemu_module_path
-subprocess.run(('pylint-3', '--score=n', '--notes=FIXME,XXX', *files),
+subprocess.run(('pylint-3', '--score=n', '--notes=FIXME,XXX',
+'--disable=too-many-return-statements',
+'--disable=too-many-statements', *files),
env=env, check=False)
 
 print('=== mypy ===')
-- 
2.29.2




Re: [PULL 14/53] block: apply COR-filter to block-stream jobs

2021-01-28 Thread Philippe Mathieu-Daudé
Hi Andrey,

On 1/26/21 3:19 PM, Max Reitz wrote:
> From: Andrey Shinkevich 
> 
> This patch completes the series with the COR-filter applied to
> block-stream operations.
> 
> Adding the filter makes it possible in the future to implement discarding
> copied regions in backing files during the block-stream job, to reduce
> disk overuse (we need control of the permissions).
> 
> Also, the filter is now smart enough to do copy-on-read with a specified
> base, so guest reads benefit even when block-streaming only part of the
> backing chain.
> 
> Several iotests are slightly modified due to filter insertion.
> 
> Signed-off-by: Andrey Shinkevich 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> Message-Id: <20201216061703.70908-14-vsement...@virtuozzo.com>
> Reviewed-by: Max Reitz 
> Signed-off-by: Max Reitz 
> ---
>  block/stream.c | 105 ++---
>  tests/qemu-iotests/030 |   8 +--
>  tests/qemu-iotests/141.out |   2 +-
>  tests/qemu-iotests/245 |  20 ---
>  4 files changed, 80 insertions(+), 55 deletions(-)
> 
> diff --git a/block/stream.c b/block/stream.c
...
> @@ -228,7 +211,9 @@ void stream_start(const char *job_id, BlockDriverState 
> *bs,
>  bool bs_read_only;
>  int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
>  BlockDriverState *base_overlay;
> +BlockDriverState *cor_filter_bs = NULL;
>  BlockDriverState *above_base;
> +QDict *opts;
>  
>  assert(!(base && bottom));
>  assert(!(backing_file_str && bottom));
> @@ -266,30 +251,62 @@ void stream_start(const char *job_id, BlockDriverState 
> *bs,
>  }
>  }
>  
> -if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) {
> -return;
> -}
> -
>  /* Make sure that the image is opened in read-write mode */
>  bs_read_only = bdrv_is_read_only(bs);
>  if (bs_read_only) {
> -if (bdrv_reopen_set_read_only(bs, false, errp) != 0) {
> -bs_read_only = false;
> -goto fail;
> +int ret;
> +/* Hold the chain during reopen */
> +if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) {
> +return;
> +}
> +
> +ret = bdrv_reopen_set_read_only(bs, false, errp);
> +
> +/* failure, or cor-filter will hold the chain */
> +bdrv_unfreeze_backing_chain(bs, above_base);
> +
> +if (ret < 0) {
> +return;
>  }
>  }
>  
> -/* Prevent concurrent jobs trying to modify the graph structure here, we
> - * already have our own plans. Also don't allow resize as the image size 
> is
> - * queried only at the job start and then cached. */
> -s = block_job_create(job_id, &stream_job_driver, NULL, bs,
> - basic_flags | BLK_PERM_GRAPH_MOD,
> +opts = qdict_new();

Coverity reported (CID 1445793) that this resource could be leaked...

> +
> +qdict_put_str(opts, "driver", "copy-on-read");
> +qdict_put_str(opts, "file", bdrv_get_node_name(bs));
> +/* Pass the base_overlay node name as 'bottom' to COR driver */
> +qdict_put_str(opts, "bottom", base_overlay->node_name);
> +if (filter_node_name) {
> +qdict_put_str(opts, "node-name", filter_node_name);
> +}
> +
> +cor_filter_bs = bdrv_insert_node(bs, opts, BDRV_O_RDWR, errp);
> +if (!cor_filter_bs) {

... probably here.

Should we call g_free(opts) here?

> +goto fail;
> +}
> +
> +if (!filter_node_name) {
> +cor_filter_bs->implicit = true;
> +}
> +
> +s = block_job_create(job_id, &stream_job_driver, NULL, cor_filter_bs,
> + BLK_PERM_CONSISTENT_READ,
>   basic_flags | BLK_PERM_WRITE,
>   speed, creation_flags, NULL, NULL, errp);
>  if (!s) {
>  goto fail;
>  }
>  
> +/*
> + * Prevent concurrent jobs trying to modify the graph structure here, we
> + * already have our own plans. Also don't allow resize as the image size 
> is
> + * queried only at the job start and then cached.
> + */
> +if (block_job_add_bdrv(>common, "active node", bs, 0,
> +   basic_flags | BLK_PERM_WRITE, _abort)) {
> +goto fail;
> +}
> +
>  /* Block all intermediate nodes between bs and base, because they will
>   * disappear from the chain after this operation. The streaming job reads
>   * every block only once, assuming that it doesn't change, so forbid 
> writes
> @@ -310,9 +327,9 @@ void stream_start(const char *job_id, BlockDriverState 
> *bs,
>  s->base_overlay = base_overlay;
>  s->above_base = above_base;
>  s->backing_file_str = g_strdup(backing_file_str);
> +s->cor_filter_bs = cor_filter_bs;
>  s->target_bs = bs;
>  s->bs_read_only = bs_read_only;
> -s->chain_frozen = true;
>  
>  s->on_error = on_error;
>  trace_stream_start(bs, base, s);
> @@ -320,8 +337,10 @@ void stream_start(const char *job_id, 

Re: [PULL 0/8] Block layer patches

2021-01-28 Thread Peter Maydell
On Wed, 27 Jan 2021 at 19:58, Kevin Wolf  wrote:
>
> The following changes since commit bf159f0bdc7b8e7aa8342dedb3829ca744c1b612:
>
>   Merge remote-tracking branch 
> 'remotes/edgar/tags/edgar/xilinx-next-2021-01-27.for-upstream' into staging 
> (2021-01-27 17:40:25 +)
>
> are available in the Git repository at:
>
>   git://repo.or.cz/qemu/kevin.git tags/for-upstream
>
> for you to fetch changes up to a44be0334beae3a9affb4a3a92cc6852993d7a84:
>
>   iotests: rename and move 169 and 199 tests (2021-01-27 20:53:14 +0100)
>
> 
> Block layer patches:
>
> - Fix crash on write to read-only devices
> - iotests: Rewrite 'check' in Python, get rid of 'groups' and allow
>   non-numeric test case names
>
> 

I somehow failed to notice before applying this, but this
breaks 'make check' on the netbsd and freebsd VMs.

As usual the build log is pretty opaque, but this looks like the
probable culprit:

env: python3: No such file or directory
gmake: *** [/home/qemu/qemu-test.nU2bcG/src/tests/Makefile.include:144:
check-block] Error 1

The python in the netbsd VM is /usr/pkg/bin/python3.7. Something
seems to be ignoring the --python= passed into our configure
and assuming that "python3" is always a valid executable.

thanks
-- PMM



Re: [PATCH v2 15/36] block: use topological sort for permission update

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 20:13, Kevin Wolf wrote:

On 28.01.2021 at 10:34, Vladimir Sementsov-Ogievskiy wrote:

27.01.2021 21:38, Kevin Wolf wrote:

On 27.11.2020 at 15:45, Vladimir Sementsov-Ogievskiy wrote:

-static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
-   uint64_t cumulative_perms,
-   uint64_t cumulative_shared_perms,
-   GSList *ignore_children, Error **errp)
+static int bdrv_node_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
+uint64_t cumulative_perms,
+uint64_t cumulative_shared_perms,
+GSList *ignore_children, Error **errp)
   {
   BlockDriver *drv = bs->drv;
   BdrvChild *c;
@@ -2166,21 +2193,43 @@ static int bdrv_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
   /* Check all children */
   QLIST_FOREACH(c, &bs->children, next) {
   uint64_t cur_perm, cur_shared;
-GSList *cur_ignore_children;
   bdrv_child_perm(bs, c->bs, c, c->role, q,
   cumulative_perms, cumulative_shared_perms,
    &cur_perm, &cur_shared);
+bdrv_child_set_perm_safe(c, cur_perm, cur_shared, NULL);


This "added" line is actually old code. What is removed here is the
recursive call of bdrv_check_update_perm(). This is what the code below
will have to replace.


yes, we'll use explicit loop instead of recursion




+}
+
+return 0;
+}
+
+static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
+   uint64_t cumulative_perms,
+   uint64_t cumulative_shared_perms,
+   GSList *ignore_children, Error **errp)
+{
+int ret;
+BlockDriverState *root = bs;
+g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, root);
+
+for ( ; list; list = list->next) {
+bs = list->data;
+
+if (bs != root) {
+if (!bdrv_check_parents_compliance(bs, ignore_children, errp)) {
+return -EINVAL;
+}


At this point bs still had the old permissions, but we don't access
them. As we're going in topological order, the parents have already been
updated if they were a child covered in bdrv_node_check_perm(), so we're
checking the relevant values. Good.

What about the root node? If I understand correctly, the parents of the
root nodes wouldn't have been checked in the old code. In the new state,
the parent BdrvChild already has to contain the new permission.

In bdrv_refresh_perms(), we already check parent conflicts, so no change
for all callers going through it. Good.

bdrv_reopen_multiple() is less obvious. It passes permissions from the
BDRVReopenState, without applying the permissions first.


It will be changed in the series


Do we check the
old parent permissions instead of the new state here?


We use given (new) cumulative permissions for bs, and recalculate
permissions for bs subtree.


Where do we actually set them? I would expect a
bdrv_child_set_perm_safe() call somewhere, but I can't see it in the
call path from bdrv_reopen_multiple().


You mean parent BdrvChild objects? Then this question applies as well to 
pre-patch code.

So, we just call bdrv_check_perm() for bs in bdrv_reopen_multiple(). I think
the answer is like this:

If state->perm and state->shared_perm are different from the actual cumulative
permissions (before the reopen), then we must have the parent(s) of the node in
the same bs_queue. Then the corresponding children are updated as part of
another bdrv_check_perm() call from the same loop in bdrv_reopen_multiple().

Let's check how state->perm and state->shared_perm are set:

bdrv_reopen_queue_child()

/* This needs to be overwritten in bdrv_reopen_prepare() */
bs_entry->state.perm = UINT64_MAX;
bs_entry->state.shared_perm = 0;


...
 
bdrv_reopen_prepare()


   bdrv_reopen_perm(queue, reopen_state->bs,
                    &reopen_state->perm, &reopen_state->shared_perm);

and bdrv_reopen_perm() calculates cumulative permissions, taking permissions
from the queue for parents that exist in the queue.

Not sure how much it correct, keeping in mind that we may look at a node in 
queue, for which bdrv_reopen_perm was not yet called, but the idea is clean.




It follows the old behavior. The only thing that changes is that pre-patch we
do a DFS recursion starting from bs (and probably visit some nodes several
times), while after the patch we first do a topological sort of the bs subtree
and go through the list. The order of nodes is better and we visit each node
once.


It's not the only thing that changes. Maybe this is what makes the patch
hard to understand, because it seems to do two steps at once:

1. Change the order in which nodes are processed

2. Replace bdrv_check_update_perm() with bdrv_check_parents_compliance()


hmm, yes. But we do bdrv_check_parents_compliance() only for nodes inside 
subtree, for all except 

Re: [PATCH 1/1] docs: fix mistake in dirty bitmap feature description

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

28.01.2021 20:13, Denis V. Lunev wrote:

The original specification says that the l1 table size is 64 * l1_size, which
is obviously wrong. The size of an l1 entry is 64 _bits_, not bytes.
Thus 64 is to be replaced with 8, as the specification gives the size in bytes.

There is also a minor tweak: the field name is renamed from l1 to l1_table,
which matches the later text.

Signed-off-by: Denis V. Lunev 
CC: Stefan Hajnoczi 
CC: Vladimir Sementsov-Ogievskiy 


Reviewed-by: Vladimir Sementsov-Ogievskiy 


---
  docs/interop/parallels.txt | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/interop/parallels.txt b/docs/interop/parallels.txt
index e9271eba5d..f15bf35bd1 100644
--- a/docs/interop/parallels.txt
+++ b/docs/interop/parallels.txt
@@ -208,7 +208,7 @@ of its data area are:
28 - 31:l1_size
The number of entries in the L1 table of the bitmap.
  
-  variable:   l1 (64 * l1_size bytes)

+  variable:   l1_table (8 * l1_size bytes)
L1 offset table (in bytes)


I don't remember why this "(in bytes)" is here... What is in bytes? The L1 table
size? But the described field is not the L1 table size, it is the L1 table itself.
It's not in bytes, it's just the L1 table :)

So, I'd also drop "(in bytes)" while we're here. Or drop the whole line "L1 offset
table (in bytes)" altogether.

  
  A dirty bitmap is stored using a one-level structure for the mapping to host





--
Best regards,
Vladimir



Re: [PATCH v2 15/36] block: use topological sort for permission update

2021-01-28 Thread Kevin Wolf
On 28.01.2021 at 10:34, Vladimir Sementsov-Ogievskiy wrote:
> 27.01.2021 21:38, Kevin Wolf wrote:
> > On 27.11.2020 at 15:45, Vladimir Sementsov-Ogievskiy wrote:
> > > -static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
> > > -   uint64_t cumulative_perms,
> > > -   uint64_t cumulative_shared_perms,
> > > -   GSList *ignore_children, Error **errp)
> > > +static int bdrv_node_check_perm(BlockDriverState *bs, BlockReopenQueue 
> > > *q,
> > > +uint64_t cumulative_perms,
> > > +uint64_t cumulative_shared_perms,
> > > +GSList *ignore_children, Error **errp)
> > >   {
> > >   BlockDriver *drv = bs->drv;
> > >   BdrvChild *c;
> > > @@ -2166,21 +2193,43 @@ static int bdrv_check_perm(BlockDriverState *bs, 
> > > BlockReopenQueue *q,
> > >   /* Check all children */
> > >   QLIST_FOREACH(c, &bs->children, next) {
> > >   uint64_t cur_perm, cur_shared;
> > > -GSList *cur_ignore_children;
> > >   bdrv_child_perm(bs, c->bs, c, c->role, q,
> > >   cumulative_perms, cumulative_shared_perms,
> > >   &cur_perm, &cur_shared);
> > > +bdrv_child_set_perm_safe(c, cur_perm, cur_shared, NULL);
> > 
> > This "added" line is actually old code. What is removed here is the
> > recursive call of bdrv_check_update_perm(). This is what the code below
> > will have to replace.
> 
> yes, we'll use explicit loop instead of recursion
> 
> > 
> > > +}
> > > +
> > > +return 0;
> > > +}
> > > +
> > > +static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
> > > +   uint64_t cumulative_perms,
> > > +   uint64_t cumulative_shared_perms,
> > > +   GSList *ignore_children, Error **errp)
> > > +{
> > > +int ret;
> > > +BlockDriverState *root = bs;
> > > +g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, root);
> > > +
> > > +for ( ; list; list = list->next) {
> > > +bs = list->data;
> > > +
> > > +if (bs != root) {
> > > +if (!bdrv_check_parents_compliance(bs, ignore_children, 
> > > errp)) {
> > > +return -EINVAL;
> > > +}
> > 
> > At this point bs still had the old permissions, but we don't access
> > them. As we're going in topological order, the parents have already been
> > updated if they were a child covered in bdrv_node_check_perm(), so we're
> > checking the relevant values. Good.
> > 
> > What about the root node? If I understand correctly, the parents of the
> > root nodes wouldn't have been checked in the old code. In the new state,
> > the parent BdrvChild already has to contain the new permission.
> > 
> > In bdrv_refresh_perms(), we already check parent conflicts, so no change
> > for all callers going through it. Good.
> > 
> > bdrv_reopen_multiple() is less obvious. It passes permissions from the
> > BDRVReopenState, without applying the permissions first.
> 
> It will be changed in the series
> 
> > Do we check the
> > old parent permissions instead of the new state here?
> 
> We use the given (new) cumulative permissions for bs, and recalculate
> permissions for the bs subtree.

Where do we actually set them? I would expect a
bdrv_child_set_perm_safe() call somewhere, but I can't see it in the
call path from bdrv_reopen_multiple().

> It follows the old behavior. The only thing that changes is that pre-patch
> we do a DFS recursion starting from bs (and probably visit some nodes
> several times), while after the patch we first do a topological sort of the
> bs subtree and go through the list. The order of nodes is better and we
> visit each node once.

It's not the only thing that changes. Maybe this is what makes the patch
hard to understand, because it seems to do two steps at once:

1. Change the order in which nodes are processed

2. Replace bdrv_check_update_perm() with bdrv_check_parents_compliance()

In step 2, the point I mentioned above is important (new permissions
must already be set in the BdrvChild objects).

The switch to bdrv_check_parents_compliance() also means that error
messages become a bit worse because we don't know any more which of the
conflicting nodes is the new one, so we can't provide two different
messages any more. This is probably unavoidable, though.

> > 
> > > +bdrv_get_cumulative_perm(bs, &cumulative_perms,
> > > + &cumulative_shared_perms);
> > > +}
> > > -cur_ignore_children = 
> > > g_slist_prepend(g_slist_copy(ignore_children), c);
> > > -ret = bdrv_check_update_perm(c->bs, q, cur_perm, cur_shared,
> > > - cur_ignore_children, errp);
> > > -g_slist_free(cur_ignore_children);
> > > +ret = bdrv_node_check_perm(bs, q, cumulative_perms,
> > > + 

[PATCH 1/1] docs: fix mistake in dirty bitmap feature description

2021-01-28 Thread Denis V. Lunev
The original specification says that the l1 table size is 64 * l1_size, which
is obviously wrong. The size of an l1 entry is 64 _bits_, not bytes.
Thus 64 is to be replaced with 8, as the specification gives the size in bytes.

There is also a minor tweak: the field name is renamed from l1 to l1_table,
which matches the later text.

Signed-off-by: Denis V. Lunev 
CC: Stefan Hajnoczi 
CC: Vladimir Sementsov-Ogievskiy 
---
 docs/interop/parallels.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/interop/parallels.txt b/docs/interop/parallels.txt
index e9271eba5d..f15bf35bd1 100644
--- a/docs/interop/parallels.txt
+++ b/docs/interop/parallels.txt
@@ -208,7 +208,7 @@ of its data area are:
   28 - 31:l1_size
   The number of entries in the L1 table of the bitmap.
 
-  variable:   l1 (64 * l1_size bytes)
+  variable:   l1_table (8 * l1_size bytes)
   L1 offset table (in bytes)
 
 A dirty bitmap is stored using a one-level structure for the mapping to host
-- 
2.27.0




Re: [PATCH] MAINTAINERS: suggest myself as co-maintainer for Block Jobs

2021-01-28 Thread Max Reitz

On 28.01.21 15:41, Vladimir Sementsov-Ogievskiy wrote:

I have been developing QEMU backup for several years, and finally the new
backup architecture, including the generic block-copy engine and the
backup-top filter, has landed upstream, great thanks to the reviewers and
especially to Max Reitz!

I also have plans to move the other block jobs onto block-copy, so that
we finally have one generic block copying path, fast and well-formed.

So, I now suggest bringing all parts of the backup architecture into the
"Block Jobs" subsystem (actually, aio_task is shared with qcow2 and
qemu-co-shared-resource can be reused somewhere else, but I'd keep an
eye on them in the context of block jobs) and adding myself as co-maintainer.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  MAINTAINERS | 9 +
  1 file changed, 9 insertions(+)


Great! :)

Reviewed-by: Max Reitz 




Re: [PATCH] MAINTAINERS: suggest myself as co-maintainer for Block Jobs

2021-01-28 Thread John Snow

On 1/28/21 10:09 AM, Markus Armbruster wrote:

Vladimir Sementsov-Ogievskiy  writes:


I have been developing QEMU backup for several years, and finally the new
backup architecture, including the generic block-copy engine and the
backup-top filter, has landed upstream, great thanks to the reviewers and
especially to Max Reitz!

I also have plans to move the other block jobs onto block-copy, so that
we finally have one generic block copying path, fast and well-formed.

So, I now suggest bringing all parts of the backup architecture into the
"Block Jobs" subsystem (actually, aio_task is shared with qcow2 and
qemu-co-shared-resource can be reused somewhere else, but I'd keep an
eye on them in the context of block jobs) and adding myself as co-maintainer.

Signed-off-by: Vladimir Sementsov-Ogievskiy 


With pleasure:
Reviewed-by: Markus Armbruster 



Absolutely! Glad to see it.

Reviewed-by: John Snow 




Re: [PATCH] MAINTAINERS: suggest myself as co-maintainer for Block Jobs

2021-01-28 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> I have been developing QEMU backup for several years, and finally the new
> backup architecture, including the generic block-copy engine and the
> backup-top filter, has landed upstream, great thanks to the reviewers and
> especially to Max Reitz!
>
> I also have plans to move the other block jobs onto block-copy, so that
> we finally have one generic block copying path, fast and well-formed.
>
> So, I now suggest bringing all parts of the backup architecture into the
> "Block Jobs" subsystem (actually, aio_task is shared with qcow2 and
> qemu-co-shared-resource can be reused somewhere else, but I'd keep an
> eye on them in the context of block jobs) and adding myself as co-maintainer.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

With pleasure:
Reviewed-by: Markus Armbruster 




[PATCH] MAINTAINERS: suggest myself as co-maintainer for Block Jobs

2021-01-28 Thread Vladimir Sementsov-Ogievskiy
I have been developing QEMU backup for several years, and finally the new
backup architecture, including the generic block-copy engine and the
backup-top filter, has landed upstream, great thanks to the reviewers and
especially to Max Reitz!

I also have plans to move the other block jobs onto block-copy, so that
we finally have one generic block copying path, fast and well-formed.

So, I now suggest bringing all parts of the backup architecture into the
"Block Jobs" subsystem (actually, aio_task is shared with qcow2 and
qemu-co-shared-resource can be reused somewhere else, but I'd keep an
eye on them in the context of block jobs) and adding myself as co-maintainer.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 MAINTAINERS | 9 +
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 34359a99b8..9957f39eba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2207,6 +2207,7 @@ F: scsi/*
 
 Block Jobs
 M: John Snow 
+M: Vladimir Sementsov-Ogievskiy 
 L: qemu-block@nongnu.org
 S: Supported
 F: blockjob.c
@@ -2219,6 +2220,14 @@ F: block/commit.c
 F: block/stream.c
 F: block/mirror.c
 F: qapi/job.json
+F: block/block-copy.c
+F: include/block/block-copy.c
+F: block/backup-top.h
+F: block/backup-top.c
+F: include/block/aio_task.h
+F: block/aio_task.c
+F: util/qemu-co-shared-resource.c
+F: include/qemu/co-shared-resource.h
 T: git https://gitlab.com/jsnow/qemu.git jobs
 
 Block QAPI, monitor, command line
-- 
2.29.2




[PATCH] qemu-img: add seek and -n option to dd command

2021-01-28 Thread Peter Lieven
Signed-off-by: Peter Lieven 

diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
index b615aa8419..7d4564c2b8 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -209,6 +209,10 @@ Parameters to dd subcommand:
 
 .. program:: qemu-img-dd
 
+.. option:: -n
+
+  Skip the creation of the output file
+
 .. option:: bs=BLOCK_SIZE
 
   Defines the block size
@@ -229,6 +233,10 @@ Parameters to dd subcommand:
 
   Sets the number of input blocks to skip
 
+.. option:: seek=BLOCKS
+
+  Sets the number of blocks to seek into the output
+
 Parameters to snapshot subcommand:
 
 .. program:: qemu-img-snapshot
diff --git a/qemu-img.c b/qemu-img.c
index 8597d069af..d7f390e382 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -213,12 +213,17 @@ static void QEMU_NORETURN help(void)
"  '-s' run in Strict mode - fail on different image size or sector 
allocation\n"
"\n"
"Parameters to dd subcommand:\n"
+   "  '-n' skips the target volume creation (useful if the volume is 
created\n"
+   "   prior to running qemu-img). Note that he behaviour is not 
identical to\n"
+   "   original dd option conv=nocreat. The output is neither 
truncated nor\n"
+   "   is it possible to write past the end of an existing file.\n"
"  'bs=BYTES' read and write up to BYTES bytes at a time "
"(default: 512)\n"
"  'count=N' copy only N input blocks\n"
"  'if=FILE' read from FILE\n"
"  'of=FILE' write to FILE\n"
-   "  'skip=N' skip N bs-sized blocks at the start of input\n";
+   "  'skip=N' skip N bs-sized blocks at the start of input\n"
+   "  'seek=N' seek N bs-sized blocks into the output\n";
 
 printf("%s\nSupported formats:", help_msg);
 bdrv_iterate_format(format_print, NULL, false);
@@ -4885,6 +4890,7 @@ static int img_bitmap(int argc, char **argv)
 #define C_IF  04
 #define C_OF  010
#define C_SKIP    020
+#define C_SEEK    040
 
 struct DdInfo {
 unsigned int flags;
@@ -4964,6 +4970,19 @@ static int img_dd_skip(const char *arg,
 return 0;
 }
 
+static int img_dd_seek(const char *arg,
+   struct DdIo *in, struct DdIo *out,
+   struct DdInfo *dd)
+{
+out->offset = cvtnum("seek", arg);
+
+if (out->offset < 0) {
+return 1;
+}
+
+return 0;
+}
+
 static int img_dd(int argc, char **argv)
 {
 int ret = 0;
@@ -4980,7 +4999,7 @@ static int img_dd(int argc, char **argv)
 const char *fmt = NULL;
 int64_t size = 0;
 int64_t block_count = 0, out_pos, in_pos;
-bool force_share = false;
+bool force_share = false, skip_create = false;
 struct DdInfo dd = {
 .flags = 0,
 .count = 0,
@@ -5004,6 +5023,7 @@ static int img_dd(int argc, char **argv)
 { "if", img_dd_if, C_IF },
 { "of", img_dd_of, C_OF },
 { "skip", img_dd_skip, C_SKIP },
+{ "seek", img_dd_seek, C_SEEK },
 { NULL, NULL, 0 }
 };
 const struct option long_options[] = {
@@ -5014,7 +5034,7 @@ static int img_dd(int argc, char **argv)
 { 0, 0, 0, 0 }
 };
 
-while ((c = getopt_long(argc, argv, ":hf:O:U", long_options, NULL))) {
+while ((c = getopt_long(argc, argv, ":hf:O:Un", long_options, NULL))) {
 if (c == EOF) {
 break;
 }
@@ -5037,6 +5057,9 @@ static int img_dd(int argc, char **argv)
 case 'U':
 force_share = true;
 break;
+case 'n':
+skip_create = true;
+break;
 case OPTION_OBJECT:
  if (!qemu_opts_parse_noisily(&qemu_object_opts, optarg, true)) {
 ret = -1;
@@ -5116,22 +5139,25 @@ static int img_dd(int argc, char **argv)
 ret = -1;
 goto out;
 }
-if (!drv->create_opts) {
-error_report("Format driver '%s' does not support image creation",
- drv->format_name);
-ret = -1;
-goto out;
-}
-if (!proto_drv->create_opts) {
-error_report("Protocol driver '%s' does not support image creation",
- proto_drv->format_name);
-ret = -1;
-goto out;
-}
-create_opts = qemu_opts_append(create_opts, drv->create_opts);
-create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
 
-opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
+if (!skip_create) {
+if (!drv->create_opts) {
+error_report("Format driver '%s' does not support image creation",
+ drv->format_name);
+ret = -1;
+goto out;
+}
+if (!proto_drv->create_opts) {
+error_report("Protocol driver '%s' does not support image 
creation",
+ proto_drv->format_name);
+ret = -1;
+goto out;
+}
+create_opts = qemu_opts_append(create_opts, drv->create_opts);
+  

Re: [PULL 0/8] Block layer patches

2021-01-28 Thread Peter Maydell
On Wed, 27 Jan 2021 at 19:58, Kevin Wolf  wrote:
>
> The following changes since commit bf159f0bdc7b8e7aa8342dedb3829ca744c1b612:
>
>   Merge remote-tracking branch 
> 'remotes/edgar/tags/edgar/xilinx-next-2021-01-27.for-upstream' into staging 
> (2021-01-27 17:40:25 +)
>
> are available in the Git repository at:
>
>   git://repo.or.cz/qemu/kevin.git tags/for-upstream
>
> for you to fetch changes up to a44be0334beae3a9affb4a3a92cc6852993d7a84:
>
>   iotests: rename and move 169 and 199 tests (2021-01-27 20:53:14 +0100)
>
> 
> Block layer patches:
>
> - Fix crash on write to read-only devices
> - iotests: Rewrite 'check' in Python, get rid of 'groups' and allow
>   non-numeric test case names



Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/6.0
for any user-visible changes.

-- PMM



RE: [PATCH v4] blockjob: Fix crash with IOthread when block commit after snapshot

2021-01-28 Thread 仇大玉
Any comments?

-Original Message-
From: 08005...@163.com <08005...@163.com> 
Sent: 28 January 2021 9:31
To: kw...@redhat.com; mre...@redhat.com; js...@redhat.com
Cc: qemu-block@nongnu.org; qemu-de...@nongnu.org; 仇大玉 
Subject: [PATCH v4] blockjob: Fix crash with IOthread when block commit after 
snapshot

From: Michael Qiu 

v4: rebase to latest code

v3: reformat the commit log, remove duplicate content

v2: modify the coredump backtrace within the commit log with the newest
qemu from the master branch

Currently, if the guest has workloads, the IO thread will acquire the
aio_context lock before doing io_submit, which leads to a segmentation fault
when doing a block commit after a snapshot. Just like below:

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7f7c7d91f700 (LWP 99907)] 0x5576d0f65aab in 
bdrv_mirror_top_pwritev at ../block/mirror.c:1437
1437../block/mirror.c: No such file or directory.
(gdb) p s->job
$17 = (MirrorBlockJob *) 0x0
(gdb) p s->stop
$18 = false

(gdb) bt

Switch to qemu main thread:
/lib/../lib64/libpthread.so.0
/lib/../lib64/libpthread.so.0
../util/qemu-thread-posix.c:79
qapi/qapi-commands-block-core.c:346
../qapi/qmp-dispatch.c:110
/lib/../lib64/libglib-2.0.so.0

In the IO thread, when doing bdrv_mirror_top_pwritev, the job is NULL and the
stop field is false. This means the MirrorBDSOpaque "s" object has not been
initialized yet; this object is initialized by block_job_create(), but the
initialization is stuck acquiring the lock.

The root cause is that qemu does a release/acquire while holding the lock; at
the same time, the IO thread gets the lock right after the release stage, and
the crash occurs.

Actually, in this situation, job->job.aio_context will not be equal to
qemu_get_aio_context(), and will be the same as bs->aio_context, so there is
no need to release the lock, because bdrv_root_attach_child() will not change
the context.

This patch fixes this issue.

Signed-off-by: Michael Qiu 
---
 blockjob.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index 98ac8af982..51a09f3b60 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -214,13 +214,15 @@ int block_job_add_bdrv(BlockJob *job, const char *name, 
BlockDriverState *bs,
 BdrvChild *c;
 
 bdrv_ref(bs);
-if (job->job.aio_context != qemu_get_aio_context()) {
+if (bdrv_get_aio_context(bs) != job->job.aio_context &&
+job->job.aio_context != qemu_get_aio_context()) {
 aio_context_release(job->job.aio_context);
 }
  c = bdrv_root_attach_child(bs, name, &child_job, 0,
job->job.aio_context, perm, shared_perm, job,
errp);
-if (job->job.aio_context != qemu_get_aio_context()) {
+if (bdrv_get_aio_context(bs) != job->job.aio_context &&
+job->job.aio_context != qemu_get_aio_context()) {
 aio_context_acquire(job->job.aio_context);
 }
 if (c == NULL) {
--
2.22.0




Re: [PATCH v2 15/36] block: use topological sort for permission update

2021-01-28 Thread Vladimir Sementsov-Ogievskiy

27.01.2021 21:38, Kevin Wolf wrote:

On 27.11.2020 at 15:45, Vladimir Sementsov-Ogievskiy wrote:

Rewrite bdrv_check_perm(), bdrv_abort_perm_update() and bdrv_set_perm()
to update nodes in topological sort order instead of simple DFS. With
topologically sorted nodes, we update a node only when all its parents are
already updated. With DFS it's not so.

Consider the following example:

 A -+
 |  |
 |  v
 |  B
 |  |
 v  |
 C<-+

A is parent for B and C, B is parent for C.

Obviously, to update permissions, we should go in the order A B C, so that
when we update C, all parent permissions are already updated.


I wondered for a moment why this order is obvious. Taking a permission
on A may mean that we need to take the permission on C, too.

The answer is (or so I think) that the whole operation is atomic so the
half-updated state will never be visible to a caller, but this is about
calculating the right permissions. Permissions a node needs on its
children may depend on what its parents requested, but parent
permissions never depend on what children request.



yes, that's about these relations
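
As a standalone illustration of that ordering argument (a toy model of the
A/B/C graph from the commit message, not QEMU code), a post-order DFS with
prepend produces exactly the parents-before-children order relied on here:

#include <stdio.h>
#include <stdbool.h>

/* Toy graph from the commit message: A -> B, A -> C, B -> C. */
enum { A, B, C, N };
static const char *names[N] = { "A", "B", "C" };
static const bool is_child[N][N] = {
    [A] = { [B] = true, [C] = true },
    [B] = { [C] = true },
};
static bool found[N];

/* Post-order DFS, prepending each finished node: topological order. */
static void topological_dfs(int node, int order[N], int *len)
{
    if (found[node]) {
        return;
    }
    found[node] = true;
    for (int i = 0; i < N; i++) {
        if (is_child[node][i]) {
            topological_dfs(i, order, len);
        }
    }
    for (int i = (*len)++; i > 0; i--) {   /* prepend this node */
        order[i] = order[i - 1];
    }
    order[0] = node;
}

int main(void)
{
    int order[N], len = 0;

    topological_dfs(A, order, &len);
    for (int i = 0; i < len; i++) {
        printf("%s ", names[order[i]]);    /* prints: A B C */
    }
    printf("\n");
    return 0;
}

C is only emitted after both of its parents, which is the property the
permission update needs.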




But with the current
approach (simple recursion) we can update in the sequence A C B C (C is
updated twice). On the first update of C, we consider the old B permissions,
so we do the wrong thing. If it succeeds, all is OK; on the second C update we
will finish with the correct graph. But if the wrong thing fails, we break the
whole process for no reason (it's possible that the updated B permission will
be less strict, but we will never check it).

Also, the new approach gives a way to simultaneously and correctly update
several nodes: we just need to run bdrv_topological_dfs() several times
to add all nodes and their subtrees into one topologically sorted list
(the next patch will update bdrv_replace_node() in this manner).

Test test_parallel_perm_update() is now passing, so move it out of
debugging "if".

We also need to support ignore_children in
bdrv_check_parents_compliance().

For test 283, the order of the parents compliance check is changed.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block.c | 103 +---
  tests/test-bdrv-graph-mod.c |   4 +-
  tests/qemu-iotests/283.out  |   2 +-
  3 files changed, 86 insertions(+), 23 deletions(-)

diff --git a/block.c b/block.c
index 92bfcbedc9..81ccf51605 100644
--- a/block.c
+++ b/block.c
@@ -1994,7 +1994,9 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, 
Error **errp)
  return false;
  }
  
-static bool bdrv_check_parents_compliance(BlockDriverState *bs, Error **errp)

+static bool bdrv_check_parents_compliance(BlockDriverState *bs,
+  GSList *ignore_children,
+  Error **errp)
  {
  BdrvChild *a, *b;
  
@@ -2005,7 +2007,9 @@ static bool bdrv_check_parents_compliance(BlockDriverState *bs, Error **errp)

   */
   QLIST_FOREACH(a, &bs->parents, next_parent) {
   QLIST_FOREACH(b, &bs->parents, next_parent) {
-if (a == b) {
+if (a == b || g_slist_find(ignore_children, a) ||
+g_slist_find(ignore_children, b))


'a' should be checked in the outer loop, no reason to repeat the same
check all the time in the inner loop.


+{
  continue;
  }
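
For what it's worth, one possible shape of the restructuring suggested above,
reusing the identifiers from this hunk and assuming the loop body stays the
bdrv_a_allow_b() check (a sketch, not a tested change):

    QLIST_FOREACH(a, &bs->parents, next_parent) {
        if (g_slist_find(ignore_children, a)) {
            continue;    /* 'a' is filtered once, in the outer loop */
        }
        QLIST_FOREACH(b, &bs->parents, next_parent) {
            if (a == b || g_slist_find(ignore_children, b)) {
                continue;
            }
            if (!bdrv_a_allow_b(a, b, errp)) {
                return false;
            }
        }
    }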
  
@@ -2034,6 +2038,29 @@ static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,

  }
  }
  
+static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,

+BlockDriverState *bs)


It would be good to have a comment that explains the details of the
contract.

In particular, this seems to require that @list is already topologically
sorted, and it's complete in the sense that if a node is in the list,
all of its children are in the list, too.


Right, will add
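
For illustration, such a contract comment could look roughly like this (only a
sketch of the wording, not taken from a later revision of the series):

/*
 * bdrv_topological_dfs:
 *
 * Return a topologically sorted list (each parent before its children)
 * covering the subtree of @bs, prepended to @list.
 *
 * @list must itself already be topologically sorted and complete: if a
 * node is in @list, all of its children are in @list too.  @found must
 * contain exactly the nodes already present in @list; pass list = NULL
 * and found = NULL for the outermost call.
 */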




+{
+BdrvChild *child;
+g_autoptr(GHashTable) local_found = NULL;
+
+if (!found) {
+assert(!list);
+found = local_found = g_hash_table_new(NULL, NULL);
+}
+
+if (g_hash_table_contains(found, bs)) {
+return list;
+}
+g_hash_table_add(found, bs);
+
+QLIST_FOREACH(child, &bs->children, next) {
+list = bdrv_topological_dfs(list, found, child->bs);
+}
+
+return g_slist_prepend(list, bs);
+}
+
  static void bdrv_child_set_perm_commit(void *opaque)
  {
  BdrvChild *c = opaque;
@@ -2098,10 +2125,10 @@ static void bdrv_child_set_perm_safe(BdrvChild *c, 
uint64_t perm,
   * A call to this function must always be followed by a call to 
bdrv_set_perm()
   * or bdrv_abort_perm_update().
   */


One big source of confusion for me when trying to understand this was
that bdrv_check_perm() is a misnomer since commit f962e96150e and the
above comment isn't really accurate any more.

The function doesn't only check the validity of the new permissions in
advance of actually making the change, but it already updates the
permissions of all child nodes