[PATCH] hw/block/nvme: add compare command

2020-11-23 Thread Klaus Jensen
From: Gollu Appalanaidu 

Add the Compare command.

This implementation uses a bounce buffer to read in the data from
storage and then compare with the host supplied buffer.

Signed-off-by: Gollu Appalanaidu 
[k.jensen: rebased]
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 100 +-
 hw/block/trace-events |   2 +
 2 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f7f888402b06..f88710ca3948 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -999,6 +999,50 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
 nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+struct nvme_compare_ctx {
+QEMUIOVector iov;
+uint8_t *bounce;
+size_t len;
+};
+
+static void nvme_compare_cb(void *opaque, int ret)
+{
+NvmeRequest *req = opaque;
+NvmeNamespace *ns = req->ns;
+struct nvme_compare_ctx *ctx = req->opaque;
+g_autofree uint8_t *buf = NULL;
+uint16_t status;
+
+trace_pci_nvme_compare_cb(nvme_cid(req));
+
+if (!ret) {
+block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+} else {
+block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
+nvme_aio_err(req, ret);
+goto out;
+}
+
+buf = g_malloc(ctx->len);
+
+status = nvme_dma(nvme_ctrl(req), buf, ctx->len, DMA_DIRECTION_TO_DEVICE,
+  req);
+if (status) {
+goto out;
+}
+
+if (memcmp(buf, ctx->bounce, ctx->len)) {
+req->status = NVME_CMP_FAILURE;
+}
+
+out:
+qemu_iovec_destroy(&ctx->iov);
+g_free(ctx->bounce);
+g_free(ctx);
+
+nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeNamespace *ns = req->ns;
@@ -1072,6 +1116,57 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
 return status;
 }
 
+static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
+{
+NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+NvmeNamespace *ns = req->ns;
+BlockBackend *blk = ns->blkconf.blk;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+size_t len = nvme_l2b(ns, nlb);
+int64_t offset = nvme_l2b(ns, slba);
+uint8_t *bounce = NULL;
+struct nvme_compare_ctx *ctx = NULL;
+uint16_t status;
+
+trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
+
+status = nvme_check_mdts(n, len);
+if (status) {
+trace_pci_nvme_err_mdts(nvme_cid(req), len);
+return status;
+}
+
+status = nvme_check_bounds(ns, slba, nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
+return status;
+}
+
+if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
+status = nvme_check_dulbe(ns, slba, nlb);
+if (status) {
+return status;
+}
+}
+
+bounce = g_malloc(len);
+
+ctx = g_new(struct nvme_compare_ctx, 1);
+ctx->bounce = bounce;
+ctx->len = len;
+
+req->opaque = ctx;
+
+qemu_iovec_init(&ctx->iov, 1);
+qemu_iovec_add(&ctx->iov, bounce, len);
+
+block_acct_start(blk_get_stats(blk), &req->acct, len, BLOCK_ACCT_READ);
+blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req);
+
+return NVME_NO_COMPLETE;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
 block_acct_start(blk_get_stats(req->ns->blkconf.blk), >acct, 0,
@@ -1201,6 +1296,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 case NVME_CMD_WRITE:
 case NVME_CMD_READ:
 return nvme_rw(n, req);
+case NVME_CMD_COMPARE:
+return nvme_compare(n, req);
 case NVME_CMD_DSM:
 return nvme_dsm(n, req);
 default:
@@ -2927,7 +3024,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->cqes = (0x4 << 4) | 0x4;
 id->nn = cpu_to_le32(n->num_namespaces);
 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-   NVME_ONCS_FEATURES | NVME_ONCS_DSM);
+   NVME_ONCS_FEATURES | NVME_ONCS_DSM |
+   NVME_ONCS_COMPARE);
 
 id->vwc = 0x1;
 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 1ffe0b3f76b5..68a4c8ed35e0 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -46,6 +46,8 @@ pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t 
slba, uint32_t nlb)
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, 
bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed 
%d"
 pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid 
%"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
 pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t 
nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
+pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t 

[PATCH 2/2] hw/block/nvme: add simple copy command

2020-11-23 Thread Klaus Jensen
From: Klaus Jensen 

Add support for TP 4065a ("Simple Copy Command"), v2020.05.04
("Ratified").

The implementation uses a bounce buffer to first read in the source
logical blocks, then issue a write of that bounce buffer. The default
maximum number of source logical blocks is 128, translating to 512 KiB
for 4k logical blocks which aligns with the default value of MDTS.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h|   4 +
 hw/block/nvme.h   |   1 +
 hw/block/nvme-ns.c|   8 ++
 hw/block/nvme.c   | 225 +-
 hw/block/trace-events |   6 ++
 5 files changed, 243 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 44bf6271b744..745d288b09cf 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -21,6 +21,10 @@
 
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
+
+uint16_t mssrl;
+uint32_t mcl;
+uint8_t  msrc;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 574333caa3f9..f549abeeb930 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -62,6 +62,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
 case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
 case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
 case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
+case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY";
 default:return "NVME_NVM_CMD_UNKNOWN";
 }
 }
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 2d69b5177b51..eb28757c2f17 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -59,6 +59,11 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 
 id_ns->npda = id_ns->npdg = npdg - 1;
 
+/* simple copy */
+id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
+id_ns->mcl = cpu_to_le32(ns->params.mcl);
+id_ns->msrc = ns->params.msrc;
+
 return 0;
 }
 
@@ -150,6 +155,9 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
+DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128),
+DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 255),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f7f888402b06..82233f80541e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -999,6 +999,109 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
 nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+struct nvme_copy_ctx {
+int copies;
+uint8_t *bounce;
+uint32_t nlb;
+};
+
+struct nvme_copy_in_ctx {
+NvmeRequest *req;
+QEMUIOVector iov;
+};
+
+static void nvme_copy_cb(void *opaque, int ret)
+{
+NvmeRequest *req = opaque;
+NvmeNamespace *ns = req->ns;
+struct nvme_copy_ctx *ctx = req->opaque;
+
+trace_pci_nvme_copy_cb(nvme_cid(req));
+
+if (!ret) {
+block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+} else {
+block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
+nvme_aio_err(req, ret);
+}
+
+g_free(ctx->bounce);
+g_free(ctx);
+
+nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static void nvme_copy_in_complete(NvmeRequest *req)
+{
+NvmeNamespace *ns = req->ns;
+NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+struct nvme_copy_ctx *ctx = req->opaque;
+uint64_t sdlba = le64_to_cpu(copy->sdlba);
+uint16_t status;
+
+trace_pci_nvme_copy_in_complete(nvme_cid(req));
+
+block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+
+status = nvme_check_bounds(ns, sdlba, ctx->nlb);
+if (status) {
+trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
+req->status = status;
+
+g_free(ctx->bounce);
+g_free(ctx);
+
+nvme_enqueue_req_completion(nvme_cq(req), req);
+
+return;
+}
+
+qemu_iovec_init(&req->iov, 1);
+qemu_iovec_add(&req->iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
+
+block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct,
+ nvme_l2b(ns, ctx->nlb), BLOCK_ACCT_WRITE);
+
+req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
+ &req->iov, 0, nvme_copy_cb, req);
+}
+
+static void nvme_aio_copy_in_cb(void *opaque, int ret)
+{
+struct nvme_copy_in_ctx *in_ctx = opaque;
+NvmeRequest *req = in_ctx->req;
+NvmeNamespace *ns = req->ns;
+struct nvme_copy_ctx *ctx = req->opaque;
+
+qemu_iovec_destroy(&in_ctx->iov);
+g_free(in_ctx);
+
+trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
+
+if (ret) {
+nvme_aio_err(req, ret);
+}
+
+ctx->copies--;
+
+if (ctx->copies) {
+return;
+}
+
+if (req->status) {
+

[PATCH 1/2] nvme: updated shared header for copy command

2020-11-23 Thread Klaus Jensen
From: Klaus Jensen 

Add new data structures and types for the Simple Copy command.

Signed-off-by: Klaus Jensen 
Cc: Stefan Hajnoczi 
Cc: Fam Zheng 
---
 include/block/nvme.h | 45 ++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index e95ff6ca9b37..b56efd6a89af 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -472,6 +472,7 @@ enum NvmeIoCommands {
 NVME_CMD_COMPARE= 0x05,
 NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
+NVME_CMD_COPY   = 0x19,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -603,6 +604,35 @@ typedef struct QEMU_PACKED NvmeDsmRange {
 uint64_tslba;
 } NvmeDsmRange;
 
+enum {
+NVME_COPY_FORMAT_0 = 0x0,
+};
+
+typedef struct NvmeCopyCmd {
+uint8_t opcode;
+uint8_t flags;
+uint16_tcid;
+uint32_tnsid;
+uint32_trsvd2[4];
+NvmeCmdDptr dptr;
+uint64_tsdlba;
+uint32_tcdw12;
+uint32_tcdw13;
+uint32_tilbrt;
+uint16_tlbat;
+uint16_tlbatm;
+} NvmeCopyCmd;
+
+typedef struct NvmeCopySourceRange {
+uint8_t  rsvd0[8];
+uint64_t slba;
+uint16_t nlb;
+uint8_t  rsvd18[6];
+uint32_t eilbrt;
+uint16_t elbatm;
+uint16_t elbat;
+} NvmeCopySourceRange;
+
 enum NvmeAsyncEventRequest {
 NVME_AER_TYPE_ERROR = 0,
 NVME_AER_TYPE_SMART = 1,
@@ -680,6 +710,7 @@ enum NvmeStatusCodes {
 NVME_CONFLICTING_ATTRS  = 0x0180,
 NVME_INVALID_PROT_INFO  = 0x0181,
 NVME_WRITE_TO_RO= 0x0182,
+NVME_CMD_SIZE_LIMIT = 0x0183,
 NVME_WRITE_FAULT= 0x0280,
 NVME_UNRECOVERED_READ   = 0x0281,
 NVME_E2E_GUARD_ERROR= 0x0282,
@@ -831,7 +862,7 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint8_t nvscc;
 uint8_t rsvd531;
 uint16_tacwu;
-uint8_t rsvd534[2];
+uint16_tocfs;
 uint32_tsgls;
 uint8_t rsvd540[228];
 uint8_t subnqn[256];
@@ -854,6 +885,11 @@ enum NvmeIdCtrlOncs {
 NVME_ONCS_FEATURES  = 1 << 4,
 NVME_ONCS_RESRVATIONS   = 1 << 5,
 NVME_ONCS_TIMESTAMP = 1 << 6,
+NVME_ONCS_COPY  = 1 << 8,
+};
+
+enum NvmeIdCtrlOcfs {
+NVME_OCFS_COPY_FORMAT_0 = 1 << NVME_COPY_FORMAT_0,
 };
 
 enum NvmeIdCtrlFrmw {
@@ -995,7 +1031,10 @@ typedef struct QEMU_PACKED NvmeIdNs {
 uint16_tnpdg;
 uint16_tnpda;
 uint16_tnows;
-uint8_t rsvd74[30];
+uint16_tmssrl;
+uint32_tmcl;
+uint8_t msrc;
+uint8_t rsvd81[23];
 uint8_t nguid[16];
 uint64_teui64;
 NvmeLBAFlbaf[16];
@@ -1059,6 +1098,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
 QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
+QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRange) != 32);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
@@ -1066,6 +1106,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
+QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
-- 
2.29.2




[PATCH 0/2] hw/block/nvme: add simple copy command

2020-11-23 Thread Klaus Jensen
From: Klaus Jensen 

Add support for TP 4065 ("Simple Copy Command").

Klaus Jensen (2):
  nvme: updated shared header for copy command
  hw/block/nvme: add simple copy command

 hw/block/nvme-ns.h|   4 +
 hw/block/nvme.h   |   1 +
 include/block/nvme.h  |  45 -
 hw/block/nvme-ns.c|   8 ++
 hw/block/nvme.c   | 225 +-
 hw/block/trace-events |   6 ++
 6 files changed, 286 insertions(+), 3 deletions(-)

-- 
2.29.2




Re: [PATCH v8 0/5] hw/block/nvme: dulbe and dsm support

2020-11-23 Thread Klaus Jensen
On Nov 12 20:59, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This adds support for the Deallocated or Unwritten Logical Block error
> recovery feature as well as the Dataset Management command.
> 
> v8:
>   - Move req->opaque clearing to nvme_req_clear.
>   - Add two preparation/cleanup patches.
> 
> v7:
>   - Handle negative return value from bdrv_block_status.
>   - bdrv_get_info may not be supported on all block drivers, so do not
> consider it a fatal error.
> 
> v6:
>   - Skip the allocation of the discards integer and just use the opaque
> value directly (Philippe)
>   - Split changes to include/block/nvme.h into a separate patch
> (Philippe)
>   - Clean up some convoluted checks on the discards value (Philippe)
>   - Use unambiguous units in the commit messages (Philippe)
>   - Stack allocate the range array (Keith)
> 
> v5:
>   - Restore status code from callback (Keith)
> 
> v4:
>   - Removed mixed declaration and code (Keith)
>   - Set NPDG and NPDA and account for the blockdev cluster size.
> 
> Klaus Jensen (5):
>   hw/block/nvme: remove superfluous NvmeCtrl parameter
>   hw/block/nvme: pull aio error handling
>   hw/block/nvme: add dulbe support
>   nvme: add namespace I/O optimization fields to shared header
>   hw/block/nvme: add the dataset management command
> 
>  hw/block/nvme-ns.h|   4 +
>  hw/block/nvme.h   |   2 +
>  include/block/nvme.h  |  12 +-
>  hw/block/nvme-ns.c|  34 +-
>  hw/block/nvme.c   | 258 --
>  hw/block/trace-events |   4 +
>  6 files changed, 276 insertions(+), 38 deletions(-)
> 
> -- 
> 2.29.2
> 

Thanks for the reviews everyone; applied to nvme-next.


signature.asc
Description: PGP signature


[PATCH 19/21] block: add bdrv_remove_backing transaction action

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 42 --
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index cf7b859a81..6ea926e2c1 100644
--- a/block.c
+++ b/block.c
@@ -2974,12 +2974,19 @@ BdrvChild *bdrv_attach_child(BlockDriverState 
*parent_bs,
 return child;
 }
 
+static void bdrv_child_free(void *opaque)
+{
+BdrvChild *c = opaque;
+
+g_free(c->name);
+g_free(c);
+}
+
 static void bdrv_remove_empty_child(BdrvChild *child)
 {
 assert(!child->bs);
 QLIST_SAFE_REMOVE(child, next);
-g_free(child->name);
-g_free(child);
+bdrv_child_free(child);
 }
 
 typedef struct BdrvAttachChildNopermState {
@@ -4852,6 +4859,37 @@ static bool should_update_child(BdrvChild *c, 
BlockDriverState *to)
 return ret;
 }
 
+/* this doesn't restore original child bs, only the child itself */
+static void bdrv_remove_backing_abort(void *opaque)
+{
+BdrvChild *c = opaque;
+BlockDriverState *parent_bs = c->opaque;
+
+QLIST_INSERT_HEAD(&parent_bs->children, c, next);
+parent_bs->backing = c;
+}
+
+static BdrvActionDrv bdrv_remove_backing_drv = {
+.abort = bdrv_remove_backing_abort,
+.commit = bdrv_child_free,
+};
+
+__attribute__((unused))
+static void bdrv_remove_backing(BlockDriverState *bs, GSList **tran)
+{
+if (!bs->backing) {
+return;
+}
+
+if (bs->backing->bs) {
+bdrv_replace_child_safe(bs->backing, NULL, tran);
+}
+
+tran_prepend(tran, &bdrv_remove_backing_drv, bs->backing);
+QLIST_SAFE_REMOVE(bs->backing, next);
+bs->backing = NULL;
+}
+
 static int bdrv_replace_node_noperm(BlockDriverState *from,
 BlockDriverState *to,
 bool auto_skip, GSList **tran, Error 
**errp)
-- 
2.21.3




[PATCH 21/21] block/backup-top: drop .active

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
We don't need this workaround anymore: bdrv_append is already smart
enough and we can use new bdrv_drop_filter().

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/backup-top.c | 38 +-
 tests/qemu-iotests/283.out |  2 +-
 2 files changed, 2 insertions(+), 38 deletions(-)

diff --git a/block/backup-top.c b/block/backup-top.c
index 650ed6195c..84eb73aeb7 100644
--- a/block/backup-top.c
+++ b/block/backup-top.c
@@ -37,7 +37,6 @@
 typedef struct BDRVBackupTopState {
 BlockCopyState *bcs;
 BdrvChild *target;
-bool active;
 int64_t cluster_size;
 } BDRVBackupTopState;
 
@@ -127,21 +126,6 @@ static void backup_top_child_perm(BlockDriverState *bs, 
BdrvChild *c,
   uint64_t perm, uint64_t shared,
   uint64_t *nperm, uint64_t *nshared)
 {
-BDRVBackupTopState *s = bs->opaque;
-
-if (!s->active) {
-/*
- * The filter node may be in process of bdrv_append(), which firstly do
- * bdrv_set_backing_hd() and then bdrv_replace_node(). This means that
- * we can't unshare BLK_PERM_WRITE during bdrv_append() operation. So,
- * let's require nothing during bdrv_append() and refresh permissions
- * after it (see bdrv_backup_top_append()).
- */
-*nperm = 0;
-*nshared = BLK_PERM_ALL;
-return;
-}
-
 if (!(role & BDRV_CHILD_FILTERED)) {
 /*
  * Target child
@@ -229,18 +213,6 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState 
*source,
 }
 appended = true;
 
-/*
- * bdrv_append() finished successfully, now we can require permissions
- * we want.
- */
-state->active = true;
-bdrv_child_refresh_perms(top, top->backing, &local_err);
-if (local_err) {
-error_prepend(&local_err,
-  "Cannot set permissions for backup-top filter: ");
-goto fail;
-}
-
 state->cluster_size = cluster_size;
 state->bcs = block_copy_state_new(top->backing, state->target,
   cluster_size, write_flags, &local_err);
@@ -256,7 +228,6 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState 
*source,
 
 fail:
 if (appended) {
-state->active = false;
 bdrv_backup_top_drop(top);
 } else {
 bdrv_unref(top);
@@ -272,16 +243,9 @@ void bdrv_backup_top_drop(BlockDriverState *bs)
 {
 BDRVBackupTopState *s = bs->opaque;
 
-bdrv_drained_begin(bs);
+bdrv_drop_filter(bs, &error_abort);
 
 block_copy_state_free(s->bcs);
 
-s->active = false;
-bdrv_child_refresh_perms(bs, bs->backing, &error_abort);
-bdrv_replace_node(bs, bs->backing->bs, &error_abort);
-bdrv_set_backing_hd(bs, NULL, &error_abort);
-
-bdrv_drained_end(bs);
-
 bdrv_unref(bs);
 }
diff --git a/tests/qemu-iotests/283.out b/tests/qemu-iotests/283.out
index fbb7d0f619..a34e4e3f92 100644
--- a/tests/qemu-iotests/283.out
+++ b/tests/qemu-iotests/283.out
@@ -5,4 +5,4 @@
 {"execute": "blockdev-add", "arguments": {"driver": "blkdebug", "image": 
"base", "node-name": "other", "take-child-perms": ["write"]}}
 {"return": {}}
 {"execute": "blockdev-backup", "arguments": {"device": "source", "sync": 
"full", "target": "target"}}
-{"error": {"class": "GenericError", "desc": "Cannot set permissions for 
backup-top filter: Conflicts with use by source as 'image', which does not 
allow 'write' on base"}}
+{"error": {"class": "GenericError", "desc": "Cannot append backup-top filter: 
Conflicts with use by source as 'image', which does not allow 'write' on base"}}
-- 
2.21.3




[PATCH 20/21] block: introduce bdrv_drop_filter()

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Using bdrv_replace_node() for removing filter is not good enough: it
keeps child reference of the filter, which may conflict with original
top node during permission update.

Instead let's create new interface, which will do all graph
modifications first and then update permissions.

Let's modify bdrv_replace_node_common(), allowing it additionally drop
backing chain child link pointing to new node. This is quite
appropriate for bdrv_drop_intermediate() and makes possible to add
new bdrv_drop_filter() as a simple wrapper.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block.h |  1 +
 block.c   | 35 ---
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 6c1efce0c3..981a07e29d 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -348,6 +348,7 @@ int bdrv_append(BlockDriverState *bs_new, BlockDriverState 
*bs_top,
 Error **errp);
 int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
   Error **errp);
+int bdrv_drop_filter(BlockDriverState *bs, Error **errp);
 
 int bdrv_parse_aio(const char *mode, int *flags);
 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
diff --git a/block.c b/block.c
index 6ea926e2c1..ee0143aff2 100644
--- a/block.c
+++ b/block.c
@@ -4923,15 +4923,30 @@ static int bdrv_replace_node_noperm(BlockDriverState 
*from,
  *
  * With auto_skip=false the error is returned if from has a parent which should
  * not be updated.
+ *
+ * With detach_subchain to must be in a backing chain of from. In this case
+ * backing link of the cow-parent of @to is removed.
  */
 static int bdrv_replace_node_common(BlockDriverState *from,
 BlockDriverState *to,
-bool auto_skip, Error **errp)
+bool auto_skip, bool detach_subchain,
+Error **errp)
 {
 int ret = -EPERM;
 GSList *tran = NULL;
 g_autoptr(GHashTable) found = NULL;
 g_autoptr(GSList) refresh_list = NULL;
+BlockDriverState *to_cow_parent;
+
+if (detach_subchain) {
+assert(bdrv_chain_contains(from, to));
+for (to_cow_parent = from;
+ bdrv_filter_or_cow_bs(to_cow_parent) != to;
+ to_cow_parent = bdrv_filter_or_cow_bs(to_cow_parent))
+{
+;
+}
+}
 
 /* Make sure that @from doesn't go away until we have successfully attached
  * all of its parents to @to. */
@@ -4952,6 +4967,10 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,
 goto out;
 }
 
+if (detach_subchain) {
+bdrv_remove_backing(to_cow_parent, &tran);
+}
+
 found = g_hash_table_new(NULL, NULL);
 
 refresh_list = bdrv_topological_dfs(refresh_list, found, to);
@@ -4971,7 +4990,13 @@ out:
 int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
   Error **errp)
 {
-return bdrv_replace_node_common(from, to, true, errp);
+return bdrv_replace_node_common(from, to, true, false, errp);
+}
+
+int bdrv_drop_filter(BlockDriverState *bs, Error **errp)
+{
+return bdrv_replace_node_common(bs, bdrv_filter_or_cow_bs(bs), true, true,
+errp);
 }
 
 /*
@@ -5303,7 +5328,11 @@ int bdrv_drop_intermediate(BlockDriverState *top, 
BlockDriverState *base,
 updated_children = g_slist_prepend(updated_children, c);
 }
 
-bdrv_replace_node_common(top, base, false, &local_err);
+/*
+ * It seems correct to pass detach_subchain=true here, but it triggers one
+ * more yet not fixed bug
+ */
+bdrv_replace_node_common(top, base, false, false, &local_err);
 if (local_err) {
 error_report_err(local_err);
 goto exit;
-- 
2.21.3




[PATCH 09/21] block: add bdrv_drv_set_perm transaction action

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Refactor calling driver callbacks to a separate transaction action to
be used later.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 71 -
 1 file changed, 55 insertions(+), 16 deletions(-)

diff --git a/block.c b/block.c
index 56263407e8..799c475dda 100644
--- a/block.c
+++ b/block.c
@@ -2094,6 +2094,54 @@ static void bdrv_child_set_perm_safe(BdrvChild *c, 
uint64_t perm,
 }
 }
 
+static void bdrv_drv_set_perm_commit(void *opaque)
+{
+BlockDriverState *bs = opaque;
+uint64_t cumulative_perms, cumulative_shared_perms;
+
+if (bs->drv->bdrv_set_perm) {
+bdrv_get_cumulative_perm(bs, &cumulative_perms,
+ &cumulative_shared_perms);
+bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
+}
+}
+
+static void bdrv_drv_set_perm_abort(void *opaque)
+{
+BlockDriverState *bs = opaque;
+
+if (bs->drv->bdrv_abort_perm_update) {
+bs->drv->bdrv_abort_perm_update(bs);
+}
+}
+
+BdrvActionDrv bdrv_drv_set_perm_drv = {
+.abort = bdrv_drv_set_perm_abort,
+.commit = bdrv_drv_set_perm_commit,
+};
+
+static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
+ uint64_t shared_perm, GSList **tran,
+ Error **errp)
+{
+if (!bs->drv) {
+return 0;
+}
+
+if (bs->drv->bdrv_check_perm) {
+int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
+if (ret < 0) {
+return ret;
+}
+}
+
+if (tran) {
+tran_prepend(tran, &bdrv_drv_set_perm_drv, bs);
+}
+
+return 0;
+}
+
 /*
  * Check whether permissions on this node can be changed in a way that
  * @cumulative_perms and @cumulative_shared_perms are the new cumulative
@@ -2108,6 +2156,7 @@ static int bdrv_node_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 uint64_t cumulative_shared_perms,
 GSList *ignore_children, Error **errp)
 {
+int ret;
 BlockDriver *drv = bs->drv;
 BdrvChild *c;
 
@@ -2153,12 +2202,10 @@ static int bdrv_node_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 return 0;
 }
 
-if (drv->bdrv_check_perm) {
-int ret = drv->bdrv_check_perm(bs, cumulative_perms,
-   cumulative_shared_perms, errp);
-if (ret < 0) {
-return ret;
-}
+ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, 
NULL,
+errp);
+if (ret < 0) {
+return ret;
 }
 
 /* Drivers that never have children can omit .bdrv_child_perm() */
@@ -2226,9 +2273,7 @@ static void bdrv_node_abort_perm_update(BlockDriverState 
*bs)
 return;
 }
 
-if (drv->bdrv_abort_perm_update) {
-drv->bdrv_abort_perm_update(bs);
-}
+bdrv_drv_set_perm_abort(bs);
 
 QLIST_FOREACH(c, &bs->children, next) {
 bdrv_child_set_perm_abort(c);
@@ -2246,7 +2291,6 @@ static void bdrv_abort_perm_update(BlockDriverState *bs)
 
 static void bdrv_node_set_perm(BlockDriverState *bs)
 {
-uint64_t cumulative_perms, cumulative_shared_perms;
 BlockDriver *drv = bs->drv;
 BdrvChild *c;
 
@@ -2254,12 +2298,7 @@ static void bdrv_node_set_perm(BlockDriverState *bs)
 return;
 }
 
-bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
-
-/* Update this node */
-if (drv->bdrv_set_perm) {
-drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
-}
+bdrv_drv_set_perm_commit(bs);
 
 /* Drivers that never have children can omit .bdrv_child_perm() */
 if (!drv->bdrv_child_perm) {
-- 
2.21.3




[PATCH 17/21] block: bdrv_append(): return status

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Return int status to avoid extra error propagation schemes.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block.h   |  4 ++--
 block.c | 15 ---
 block/commit.c  |  6 ++
 block/mirror.c  |  6 ++
 blockdev.c  |  6 +++---
 tests/test-bdrv-graph-mod.c |  6 +++---
 6 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index ab812e14d8..6c1efce0c3 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -344,8 +344,8 @@ int bdrv_create(BlockDriver *drv, const char* filename,
 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);
 
 BlockDriverState *bdrv_new(void);
-void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
- Error **errp);
+int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
+Error **errp);
 int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
   Error **errp);
 
diff --git a/block.c b/block.c
index a75c5b4aea..f2e714a81d 100644
--- a/block.c
+++ b/block.c
@@ -3443,7 +3443,6 @@ static BlockDriverState 
*bdrv_append_temp_snapshot(BlockDriverState *bs,
 int64_t total_size;
 QemuOpts *opts = NULL;
 BlockDriverState *bs_snapshot = NULL;
-Error *local_err = NULL;
 int ret;
 
 /* if snapshot, we create a temporary backing file and open it
@@ -3485,9 +3484,8 @@ static BlockDriverState 
*bdrv_append_temp_snapshot(BlockDriverState *bs,
 goto out;
 }
 
-bdrv_append(bs_snapshot, bs, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
+ret = bdrv_append(bs_snapshot, bs, errp);
+if (ret < 0) {
 bs_snapshot = NULL;
 goto out;
 }
@@ -4952,22 +4950,25 @@ int bdrv_replace_node(BlockDriverState *from, 
BlockDriverState *to,
  * Recent update: bdrv_append does NOT eat bs_new reference for now. Drop this
  * comment several moths later.
  */
-void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
- Error **errp)
+int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
+Error **errp)
 {
 Error *local_err = NULL;
 
 bdrv_set_backing_hd(bs_new, bs_top, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
-return;
+return -EPERM;
 }
 
 bdrv_replace_node(bs_top, bs_new, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 bdrv_set_backing_hd(bs_new, NULL, &error_abort);
+return -EPERM;
 }
+
+return 0;
 }
 
 static void bdrv_delete(BlockDriverState *bs)
diff --git a/block/commit.c b/block/commit.c
index 61924bcf66..b89bb20b75 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -254,7 +254,6 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 BlockDriverState *iter;
 BlockDriverState *commit_top_bs = NULL;
 BlockDriverState *filtered_base;
-Error *local_err = NULL;
 int64_t base_size, top_size;
 uint64_t base_perms, iter_shared_perms;
 int ret;
@@ -312,11 +311,10 @@ void commit_start(const char *job_id, BlockDriverState 
*bs,
 
 commit_top_bs->total_sectors = top->total_sectors;
 
-bdrv_append(commit_top_bs, top, &local_err);
+ret = bdrv_append(commit_top_bs, top, errp);
 bdrv_unref(commit_top_bs); /* referenced by new parents or failed */
-if (local_err) {
+if (ret < 0) {
 commit_top_bs = NULL;
-error_propagate(errp, local_err);
 goto fail;
 }
 
diff --git a/block/mirror.c b/block/mirror.c
index 13f7ecc998..c3fbe3e8bd 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1560,7 +1560,6 @@ static BlockJob *mirror_start_job(
 BlockDriverState *mirror_top_bs;
 bool target_is_backing;
 uint64_t target_perms, target_shared_perms;
-Error *local_err = NULL;
 int ret;
 
 if (granularity == 0) {
@@ -1606,12 +1605,11 @@ static BlockJob *mirror_start_job(
 mirror_top_bs->opaque = bs_opaque;
 
 bdrv_drained_begin(bs);
-bdrv_append(mirror_top_bs, bs, &local_err);
+ret = bdrv_append(mirror_top_bs, bs, errp);
 bdrv_drained_end(bs);
 
-if (local_err) {
+if (ret < 0) {
 bdrv_unref(mirror_top_bs);
-error_propagate(errp, local_err);
 return NULL;
 }
 
diff --git a/blockdev.c b/blockdev.c
index 96c96f8ba6..2af35d0958 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1432,6 +1432,7 @@ typedef struct ExternalSnapshotState {
 static void external_snapshot_prepare(BlkActionState *common,
   Error **errp)
 {
+int ret;
 int flags = 0;
 QDict *options = NULL;
 Error *local_err = NULL;
@@ -1587,9 +1588,8 @@ static void external_snapshot_prepare(BlkActionState 
*common,
 goto out;
 }
 
-bdrv_append(state->new_bs, state->old_bs, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
+ret = 

[PATCH 06/21] block: rewrite bdrv_child_try_set_perm() using bdrv_refresh_perms()

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
We are going to drop recursive bdrv_child_* functions, so stop use them
in bdrv_child_try_set_perm() as a first step.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index e12acd5029..a9e4d2b57c 100644
--- a/block.c
+++ b/block.c
@@ -2372,11 +2372,16 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t 
perm, uint64_t shared,
 Error **errp)
 {
 Error *local_err = NULL;
+GSList *tran = NULL;
 int ret;
 
-ret = bdrv_child_check_perm(c, NULL, perm, shared, NULL, &local_err);
+bdrv_child_set_perm_safe(c, perm, shared, &tran);
+
+ret = bdrv_refresh_perms(c->bs, &local_err);
+
+tran_finalize(tran, ret);
+
 if (ret < 0) {
-bdrv_child_abort_perm_update(c);
 if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
 /* tighten permissions */
 error_propagate(errp, local_err);
@@ -2390,12 +2395,9 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
 error_free(local_err);
 ret = 0;
 }
-return ret;
 }
 
-bdrv_child_set_perm(c);
-
-return 0;
+return ret;
 }
 
 int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
-- 
2.21.3




[PATCH 02/21] tests/test-bdrv-graph-mod: add test_parallel_perm_update

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Add test to show that simple DFS recursion order is not correct for
permission update. Correct order is topological-sort order, which will
be introduced later.

Consider the block driver which has two filter children: one active
with exclusive write access and one inactive with no specific
permissions.

And, these two children have a common base child, like this:

┌─┐ ┌──┐
│ fl2 │ ◀── │ top  │
└─┘ └──┘
  │   │
  │   │ w
  │   ▼
  │ ┌──┐
  │ │ fl1  │
  │ └──┘
  │   │
  │   │ w
  │   ▼
  │ ┌──┐
  └───▶ │ base │
└──┘

So, exclusive write is propagated.

Assume, we want to make fl2 active instead of fl1.
So, we set some option for top driver and do permission update.

If the permission update (remember, it's DFS) goes first through the
top->fl1->base branch it will succeed: it first drops the exclusive write
permissions and then applies them to the other BdrvChild.
But if the permission update goes first through the top->fl2->base branch it
will fail, because when we try to update the fl2->base child, the old, not
yet updated fl1->base child will be in conflict.

Now the test fails, so it runs only with the -d flag. To run it, do

  ./test-bdrv-graph-mod -d -p /bdrv-graph-mod/parallel-perm-update

from /tests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/test-bdrv-graph-mod.c | 64 +
 1 file changed, 64 insertions(+)

diff --git a/tests/test-bdrv-graph-mod.c b/tests/test-bdrv-graph-mod.c
index 3b9e6f242f..27e3361a60 100644
--- a/tests/test-bdrv-graph-mod.c
+++ b/tests/test-bdrv-graph-mod.c
@@ -232,6 +232,68 @@ static void test_parallel_exclusive_write(void)
 bdrv_unref(top);
 }
 
+static void write_to_file_perms(BlockDriverState *bs, BdrvChild *c,
+ BdrvChildRole role,
+ BlockReopenQueue *reopen_queue,
+ uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared)
+{
+if (bs->file && c == bs->file) {
+*nperm = BLK_PERM_WRITE;
+*nshared = BLK_PERM_ALL & ~BLK_PERM_WRITE;
+} else {
+*nperm = 0;
+*nshared = BLK_PERM_ALL;
+}
+}
+
+static BlockDriver bdrv_write_to_file = {
+.format_name = "tricky-perm",
+.bdrv_child_perm = write_to_file_perms,
+};
+
+static void test_parallel_perm_update(void)
+{
+BlockDriverState *top = no_perm_node("top");
+BlockDriverState *tricky =
+bdrv_new_open_driver(_write_to_file, "tricky", BDRV_O_RDWR,
+ _abort);
+BlockDriverState *base = no_perm_node("base");
+BlockDriverState *fl1 = pass_through_node("fl1");
+BlockDriverState *fl2 = pass_through_node("fl2");
+BdrvChild *c_fl1, *c_fl2;
+
+bdrv_attach_child(top, tricky, "file", _of_bds, BDRV_CHILD_DATA,
+  _abort);
+c_fl1 = bdrv_attach_child(tricky, fl1, "first", _of_bds,
+  BDRV_CHILD_FILTERED, _abort);
+c_fl2 = bdrv_attach_child(tricky, fl2, "second", _of_bds,
+  BDRV_CHILD_FILTERED, _abort);
+bdrv_attach_child(fl1, base, "backing", _of_bds, BDRV_CHILD_FILTERED,
+  _abort);
+bdrv_attach_child(fl2, base, "backing", _of_bds, BDRV_CHILD_FILTERED,
+  _abort);
+bdrv_ref(base);
+
+/* Select fl1 as first child to be active */
+tricky->file = c_fl1;
+bdrv_child_refresh_perms(top, top->children.lh_first, _abort);
+
+assert(c_fl1->perm & BLK_PERM_WRITE);
+
+/* Now, try to switch active child and update permissions */
+tricky->file = c_fl2;
+bdrv_child_refresh_perms(top, top->children.lh_first, _abort);
+
+assert(c_fl2->perm & BLK_PERM_WRITE);
+
+/* Switch once more, to not care about real child order in the list */
+tricky->file = c_fl1;
+bdrv_child_refresh_perms(top, top->children.lh_first, _abort);
+
+assert(c_fl1->perm & BLK_PERM_WRITE);
+}
+
 int main(int argc, char *argv[])
 {
 int i;
@@ -256,6 +318,8 @@ int main(int argc, char *argv[])
 if (debug) {
 g_test_add_func("/bdrv-graph-mod/parallel-exclusive-write",
 test_parallel_exclusive_write);
+g_test_add_func("/bdrv-graph-mod/parallel-perm-update",
+test_parallel_perm_update);
 }
 
 return g_test_run();
-- 
2.21.3




[PATCH 16/21] block: bdrv_append(): don't consume reference

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
We have too many comments for this feature. It seems better just not to
do it. Most real users (tests don't count) have to create an additional
reference anyway.

Drop also comment in external_snapshot_prepare:
 - bdrv_append doesn't "remove" old bs in common sense, it sounds
   strange
 - the fact that bdrv_append can fail is obvious from the context
 - the fact that we must rollback all changes in transaction abort is
   known (it's the direct role of abort)

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 19 +++
 block/backup-top.c  |  1 -
 block/commit.c  |  1 +
 block/mirror.c  |  3 ---
 blockdev.c  |  4 
 tests/test-bdrv-drain.c |  2 +-
 tests/test-bdrv-graph-mod.c |  2 ++
 7 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/block.c b/block.c
index 1327254b8e..a75c5b4aea 100644
--- a/block.c
+++ b/block.c
@@ -3485,11 +3485,6 @@ static BlockDriverState 
*bdrv_append_temp_snapshot(BlockDriverState *bs,
 goto out;
 }
 
-/* bdrv_append() consumes a strong reference to bs_snapshot
- * (i.e. it will call bdrv_unref() on it) even on error, so in
- * order to be able to return one, we have to increase
- * bs_snapshot's refcount here */
-bdrv_ref(bs_snapshot);
 bdrv_append(bs_snapshot, bs, _err);
 if (local_err) {
 error_propagate(errp, local_err);
@@ -4954,10 +4949,8 @@ int bdrv_replace_node(BlockDriverState *from, 
BlockDriverState *to,
  *
  * This function does not create any image files.
  *
- * bdrv_append() takes ownership of a bs_new reference and unrefs it because
- * that's what the callers commonly need. bs_new will be referenced by the old
- * parents of bs_top after bdrv_append() returns. If the caller needs to keep a
- * reference of its own, it must call bdrv_ref().
+ * Recent update: bdrv_append does NOT eat bs_new reference for now. Drop this
+ * comment several months later.
  */
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
  Error **errp)
@@ -4967,20 +4960,14 @@ void bdrv_append(BlockDriverState *bs_new, 
BlockDriverState *bs_top,
 bdrv_set_backing_hd(bs_new, bs_top, _err);
 if (local_err) {
 error_propagate(errp, local_err);
-goto out;
+return;
 }
 
 bdrv_replace_node(bs_top, bs_new, _err);
 if (local_err) {
 error_propagate(errp, local_err);
 bdrv_set_backing_hd(bs_new, NULL, _abort);
-goto out;
 }
-
-/* bs_new is now referenced by its new parents, we don't need the
- * additional reference any more. */
-out:
-bdrv_unref(bs_new);
 }
 
 static void bdrv_delete(BlockDriverState *bs)
diff --git a/block/backup-top.c b/block/backup-top.c
index fe6883cc97..650ed6195c 100644
--- a/block/backup-top.c
+++ b/block/backup-top.c
@@ -222,7 +222,6 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState 
*source,
 
 bdrv_drained_begin(source);
 
-bdrv_ref(top);
 bdrv_append(top, source, _err);
 if (local_err) {
 error_prepend(_err, "Cannot append backup-top filter: ");
diff --git a/block/commit.c b/block/commit.c
index 71db7ba747..61924bcf66 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -313,6 +313,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
 commit_top_bs->total_sectors = top->total_sectors;
 
 bdrv_append(commit_top_bs, top, _err);
+bdrv_unref(commit_top_bs); /* referenced by new parents or failed */
 if (local_err) {
 commit_top_bs = NULL;
 error_propagate(errp, local_err);
diff --git a/block/mirror.c b/block/mirror.c
index 8e1ad6eceb..13f7ecc998 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1605,9 +1605,6 @@ static BlockJob *mirror_start_job(
 bs_opaque = g_new0(MirrorBDSOpaque, 1);
 mirror_top_bs->opaque = bs_opaque;
 
-/* bdrv_append takes ownership of the mirror_top_bs reference, need to keep
- * it alive until block_job_create() succeeds even if bs has no parent. */
-bdrv_ref(mirror_top_bs);
 bdrv_drained_begin(bs);
 bdrv_append(mirror_top_bs, bs, _err);
 bdrv_drained_end(bs);
diff --git a/blockdev.c b/blockdev.c
index b5f11c524b..96c96f8ba6 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1587,10 +1587,6 @@ static void external_snapshot_prepare(BlkActionState 
*common,
 goto out;
 }
 
-/* This removes our old bs and adds the new bs. This is an operation that
- * can fail, so we need to do it in .prepare; undoing it for abort is
- * always possible. */
-bdrv_ref(state->new_bs);
 bdrv_append(state->new_bs, state->old_bs, _err);
 if (local_err) {
 error_propagate(errp, local_err);
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 8a29e33e00..892f7f47d8 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -1478,7 +1478,6 @@ static void test_append_to_drained(void)
 g_assert_cmpint(base_s->drain_count, ==, 1);
 

[PATCH 13/21] block: fix bdrv_replace_node_common

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
The ignore_children thing doesn't help to track all propagated permissions
of children we want to ignore. The simplest way to correctly update
permissions is to update the graph first and then do the permission update.
In this case we just refresh permissions for the whole subgraph (in
topological-sort defined order) and everything is correctly calculated
automatically without any ignore_children.

So, refactor bdrv_replace_node_common to first do graph update and then
refresh the permissions.

Test test_parallel_exclusive_write() now pass, so move it out of
debugging "if".

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 42 ++---
 tests/test-bdrv-graph-mod.c | 18 +++-
 2 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/block.c b/block.c
index 5c94f5a428..08501350b7 100644
--- a/block.c
+++ b/block.c
@@ -2178,7 +2178,6 @@ static BdrvActionDrv bdrv_replace_child_drv = {
  *
  * Note: real unref of old_bs is done only on commit.
  */
-__attribute__((unused))
 static void bdrv_replace_child_safe(BdrvChild *child, BlockDriverState *new_bs,
 GSList **tran)
 {
@@ -4787,8 +4786,9 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,
 {
 int ret = -EPERM;
 BdrvChild *c, *next;
-GSList *list = NULL, *p;
-uint64_t perm = 0, shared = BLK_PERM_ALL;
+GSList *tran = NULL;
+g_autoptr(GHashTable) found = NULL;
+g_autoptr(GSList) refresh_list = NULL;
 
 /* Make sure that @from doesn't go away until we have successfully attached
  * all of its parents to @to. */
@@ -4798,7 +4798,12 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,
 assert(bdrv_get_aio_context(from) == bdrv_get_aio_context(to));
 bdrv_drained_begin(from);
 
-/* Put all parents into @list and calculate their cumulative permissions */
+/*
+ * Do the replacement without permission update.
+ * Replacement may influence the permissions, we should calculate new
+ * permissions based on new graph. If we fail, we'll roll-back the
+ * replacement.
+ */
 QLIST_FOREACH_SAFE(c, >parents, next_parent, next) {
 assert(c->bs == from);
 if (!should_update_child(c, to)) {
@@ -4814,34 +4819,19 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,
c->name, from->node_name);
 goto out;
 }
-list = g_slist_prepend(list, c);
-perm |= c->perm;
-shared &= c->shared_perm;
-}
-
-/* Check whether the required permissions can be granted on @to, ignoring
- * all BdrvChild in @list so that they can't block themselves. */
-ret = bdrv_check_update_perm(to, NULL, perm, shared, list, errp);
-if (ret < 0) {
-bdrv_abort_perm_update(to);
-goto out;
+bdrv_replace_child_safe(c, to, );
 }
 
-/* Now actually perform the change. We performed the permission check for
- * all elements of @list at once, so set the permissions all at once at the
- * very end. */
-for (p = list; p != NULL; p = p->next) {
-c = p->data;
+found = g_hash_table_new(NULL, NULL);
 
-bdrv_ref(to);
-bdrv_replace_child_noperm(c, to);
-bdrv_unref(from);
-}
+refresh_list = bdrv_topological_dfs(refresh_list, found, to);
+refresh_list = bdrv_topological_dfs(refresh_list, found, from);
 
-bdrv_set_perm(to);
+ret = bdrv_list_refresh_perms(refresh_list, errp);
 
 out:
-g_slist_free(list);
+tran_finalize(tran, ret);
+
 bdrv_drained_end(from);
 bdrv_unref(from);
 
diff --git a/tests/test-bdrv-graph-mod.c b/tests/test-bdrv-graph-mod.c
index 594a1df58b..d0e0f56ec3 100644
--- a/tests/test-bdrv-graph-mod.c
+++ b/tests/test-bdrv-graph-mod.c
@@ -292,20 +292,11 @@ static void test_parallel_perm_update(void)
 bdrv_child_refresh_perms(top, top->children.lh_first, _abort);
 
 assert(c_fl1->perm & BLK_PERM_WRITE);
+bdrv_unref(top);
 }
 
 int main(int argc, char *argv[])
 {
-int i;
-bool debug = false;
-
-for (i = 1; i < argc; i++) {
-if (!strcmp(argv[i], "-d")) {
-debug = true;
-break;
-}
-}
-
 bdrv_init();
 qemu_init_main_loop(_abort);
 
@@ -316,11 +307,8 @@ int main(int argc, char *argv[])
 test_should_update_child);
 g_test_add_func("/bdrv-graph-mod/parallel-perm-update",
 test_parallel_perm_update);
-
-if (debug) {
-g_test_add_func("/bdrv-graph-mod/parallel-exclusive-write",
-test_parallel_exclusive_write);
-}
+g_test_add_func("/bdrv-graph-mod/parallel-exclusive-write",
+test_parallel_exclusive_write);
 
 return g_test_run();
 }
-- 
2.21.3




[PATCH 12/21] block: return value from bdrv_replace_node()

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Functions with an errp argument are not recommended to be void-functions.
Improve bdrv_replace_node().

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block.h |  4 ++--
 block.c   | 14 --
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index db37a35cee..ab812e14d8 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -346,8 +346,8 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, 
Error **errp);
 BlockDriverState *bdrv_new(void);
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
  Error **errp);
-void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
-   Error **errp);
+int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
+  Error **errp);
 
 int bdrv_parse_aio(const char *mode, int *flags);
 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
diff --git a/block.c b/block.c
index 1b10b6fb5e..5c94f5a428 100644
--- a/block.c
+++ b/block.c
@@ -4781,14 +4781,14 @@ static bool should_update_child(BdrvChild *c, 
BlockDriverState *to)
  * With auto_skip=false the error is returned if from has a parent which should
  * not be updated.
  */
-static void bdrv_replace_node_common(BlockDriverState *from,
- BlockDriverState *to,
- bool auto_skip, Error **errp)
+static int bdrv_replace_node_common(BlockDriverState *from,
+BlockDriverState *to,
+bool auto_skip, Error **errp)
 {
+int ret = -EPERM;
 BdrvChild *c, *next;
 GSList *list = NULL, *p;
 uint64_t perm = 0, shared = BLK_PERM_ALL;
-int ret;
 
 /* Make sure that @from doesn't go away until we have successfully attached
  * all of its parents to @to. */
@@ -4844,10 +4844,12 @@ out:
 g_slist_free(list);
 bdrv_drained_end(from);
 bdrv_unref(from);
+
+return ret;
 }
 
-void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
-   Error **errp)
+int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
+  Error **errp)
 {
 return bdrv_replace_node_common(from, to, true, errp);
 }
-- 
2.21.3




[PATCH 11/21] block: add bdrv_replace_child_safe() transaction action

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
To be used in the following commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 54 ++
 1 file changed, 54 insertions(+)

diff --git a/block.c b/block.c
index d799afeedd..1b10b6fb5e 100644
--- a/block.c
+++ b/block.c
@@ -82,6 +82,9 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
BdrvChildRole child_role,
Error **errp);
 
+static void bdrv_replace_child_noperm(BdrvChild *child,
+  BlockDriverState *new_bs);
+
 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;
 
@@ -2142,6 +2145,57 @@ static int bdrv_drv_set_perm(BlockDriverState *bs, 
uint64_t perm,
 return 0;
 }
 
+typedef struct BdrvReplaceChildState {
+BdrvChild *child;
+BlockDriverState *old_bs;
+} BdrvReplaceChildState;
+
+static void bdrv_replace_child_commit(void *opaque)
+{
+BdrvReplaceChildState *s = opaque;
+
+bdrv_unref(s->old_bs);
+}
+
+static void bdrv_replace_child_abort(void *opaque)
+{
+BdrvReplaceChildState *s = opaque;
+BlockDriverState *new_bs = s->child->bs;
+
+/* old_bs reference is transparently moved from @s to @s->child */
+bdrv_replace_child_noperm(s->child, s->old_bs);
+bdrv_unref(new_bs);
+}
+
+static BdrvActionDrv bdrv_replace_child_drv = {
+.commit = bdrv_replace_child_commit,
+.abort = bdrv_replace_child_abort,
+.clean = g_free,
+};
+
+/*
+ * bdrv_replace_child_safe
+ *
+ * Note: real unref of old_bs is done only on commit.
+ */
+__attribute__((unused))
+static void bdrv_replace_child_safe(BdrvChild *child, BlockDriverState *new_bs,
+GSList **tran)
+{
+BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
+*s = (BdrvReplaceChildState) {
+.child = child,
+.old_bs = child->bs,
+};
+tran_prepend(tran, _replace_child_drv, s);
+
+if (new_bs) {
+bdrv_ref(new_bs);
+}
+bdrv_replace_child_noperm(child, new_bs);
+/* old_bs reference is transparently moved from @child to @s */
+}
+
 /*
  * Check whether permissions on this node can be changed in a way that
  * @cumulative_perms and @cumulative_shared_perms are the new cumulative
-- 
2.21.3




[PATCH 15/21] block: split out bdrv_replace_node_noperm()

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Split part of bdrv_replace_node_common() to be used separately.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 47 ++-
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/block.c b/block.c
index 5f6ad1d016..1327254b8e 100644
--- a/block.c
+++ b/block.c
@@ -4859,6 +4859,33 @@ static bool should_update_child(BdrvChild *c, 
BlockDriverState *to)
 return ret;
 }
 
+static int bdrv_replace_node_noperm(BlockDriverState *from,
+BlockDriverState *to,
+bool auto_skip, GSList **tran, Error 
**errp)
+{
+BdrvChild *c, *next;
+
+QLIST_FOREACH_SAFE(c, >parents, next_parent, next) {
+assert(c->bs == from);
+if (!should_update_child(c, to)) {
+if (auto_skip) {
+continue;
+}
+error_setg(errp, "Should not change '%s' link to '%s'",
+   c->name, from->node_name);
+return -EPERM;
+}
+if (c->frozen) {
+error_setg(errp, "Cannot change '%s' link to '%s'",
+   c->name, from->node_name);
+return -EPERM;
+}
+bdrv_replace_child_safe(c, to, tran);
+}
+
+return 0;
+}
+
 /*
  * With auto_skip=true bdrv_replace_node_common skips updating from parents
  * if it creates a parent-child relation loop or if parent is block-job.
@@ -4871,7 +4898,6 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,
 bool auto_skip, Error **errp)
 {
 int ret = -EPERM;
-BdrvChild *c, *next;
 GSList *tran = NULL;
 g_autoptr(GHashTable) found = NULL;
 g_autoptr(GSList) refresh_list = NULL;
@@ -4890,22 +4916,9 @@ static int bdrv_replace_node_common(BlockDriverState 
*from,
  * permissions based on new graph. If we fail, we'll roll-back the
  * replacement.
  */
-QLIST_FOREACH_SAFE(c, >parents, next_parent, next) {
-assert(c->bs == from);
-if (!should_update_child(c, to)) {
-if (auto_skip) {
-continue;
-}
-error_setg(errp, "Should not change '%s' link to '%s'",
-   c->name, from->node_name);
-goto out;
-}
-if (c->frozen) {
-error_setg(errp, "Cannot change '%s' link to '%s'",
-   c->name, from->node_name);
-goto out;
-}
-bdrv_replace_child_safe(c, to, );
+ret = bdrv_replace_node_noperm(from, to, auto_skip, , errp);
+if (ret < 0) {
+goto out;
 }
 
 found = g_hash_table_new(NULL, NULL);
-- 
2.21.3




[PATCH 14/21] block: add bdrv_attach_child_noperm() transaction action

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
The code partly duplicates bdrv_root_attach_child() and
bdrv_attach_child(). Still, refactoring these two functions by renaming
them to *_common with a new noperm argument would be more complicated. When
all operations are moved to the new graph update paradigm (update permissions
only on the updated graph) the duplication should go away.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 94 ++---
 1 file changed, 90 insertions(+), 4 deletions(-)

diff --git a/block.c b/block.c
index 08501350b7..5f6ad1d016 100644
--- a/block.c
+++ b/block.c
@@ -2974,16 +2974,102 @@ BdrvChild *bdrv_attach_child(BlockDriverState 
*parent_bs,
 return child;
 }
 
-static void bdrv_detach_child(BdrvChild *child)
+static void bdrv_remove_empty_child(BdrvChild *child)
 {
+assert(!child->bs);
 QLIST_SAFE_REMOVE(child, next);
-
-bdrv_replace_child(child, NULL);
-
 g_free(child->name);
 g_free(child);
 }
 
+typedef struct BdrvAttachChildNopermState {
+BdrvChild *child;
+AioContext *old_aio_context; /* NULL if not changed */
+} BdrvAttachChildNopermState;
+
+static void bdrv_attach_child_noperm_abort(void *opaque)
+{
+BdrvAttachChildNopermState *s = opaque;
+BlockDriverState *bs = s->child->bs;
+
+bdrv_replace_child_noperm(s->child, NULL);
+bdrv_remove_empty_child(s->child);
+
+/*
+ * refcnt was positive prior to bdrv_ref() in bdrv_attach_child_noperm(),
+ * so bs should not be deleted now.
+ */
+assert(bs->refcnt > 1);
+bdrv_unref(bs);
+if (s->old_aio_context) {
+bdrv_try_set_aio_context(bs, s->old_aio_context, NULL);
+}
+}
+
+static BdrvActionDrv bdrv_attach_child_noperm_drv = {
+.abort = bdrv_attach_child_noperm_abort,
+.clean = g_free,
+};
+
+__attribute__((unused))
+static BdrvChild *bdrv_attach_child_noperm(BlockDriverState *parent_bs,
+   BlockDriverState *child_bs,
+   const char *child_name,
+   BdrvChildRole child_role,
+   GSList **tran,
+   Error **errp)
+{
+int ret;
+BdrvChild *child;
+uint64_t perm, shared_perm;
+AioContext *parent_ctx = bdrv_get_aio_context(parent_bs);
+AioContext *child_ctx = bdrv_get_aio_context(child_bs);
+BdrvAttachChildNopermState *s;
+
+if (child_ctx != parent_ctx) {
+ret = bdrv_try_set_aio_context(child_bs, parent_ctx, errp);
+if (ret < 0) {
+return NULL;
+}
+}
+
+bdrv_get_cumulative_perm(parent_bs, , _perm);
+
+assert(parent_bs->drv);
+bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
+perm, shared_perm, , _perm);
+
+child = g_new(BdrvChild, 1);
+*child = (BdrvChild) {
+.bs = NULL,
+.name   = g_strdup(child_name),
+.klass  = _of_bds,
+.role   = child_role,
+.perm   = perm,
+.shared_perm= shared_perm,
+.opaque = parent_bs,
+};
+bdrv_ref(child_bs);
+bdrv_replace_child_noperm(child, child_bs);
+
+QLIST_INSERT_HEAD(_bs->children, child, next);
+
+s = g_new(BdrvAttachChildNopermState, 1);
+*s = (BdrvAttachChildNopermState) {
+.child = child,
+.old_aio_context = child_ctx == parent_ctx ? NULL : child_ctx,
+};
+tran_prepend(tran, _attach_child_noperm_drv, s);
+
+return child;
+}
+
+static void bdrv_detach_child(BdrvChild *child)
+{
+bdrv_replace_child(child, NULL);
+bdrv_remove_empty_child(child);
+}
+
 /* Callers must ensure that child->frozen is false. */
 void bdrv_root_unref_child(BdrvChild *child)
 {
-- 
2.21.3




[PATCH 1/2] block: make bdrv_drop_intermediate() less wrong

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
First, the permission update loop tries to do iterations transactionally,
but the whole update is not transactional: nobody rolls back successful
loop iterations when some iteration fails.

Second, in the iteration we have nested permission update:
c->klass->update_filename may point to bdrv_child_cb_update_filename()
which calls bdrv_backing_update_filename(), which may do node reopen to
RW.

Permission update system is not prepared to nested updates, at least it
has intermediate permission-update state stored in BdrvChild
structures: has_backup_perm, backup_perm and backup_shared_perm.

So, let's first do bdrv_replace_node() (which is more transactional
than the open-coded update in bdrv_drop_intermediate()) and then call
update_filename() separately. We still do not roll back changes in case
of update_filename() failure, but it's not much worse than the pre-patch
behavior.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 36 +++-
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/block.c b/block.c
index 9538af4884..bd9f4e534b 100644
--- a/block.c
+++ b/block.c
@@ -4958,36 +4958,30 @@ int bdrv_drop_intermediate(BlockDriverState *top, 
BlockDriverState *base,
 backing_file_str = base->filename;
 }
 
-QLIST_FOREACH_SAFE(c, >parents, next_parent, next) {
-/* Check whether we are allowed to switch c from top to base */
-GSList *ignore_children = g_slist_prepend(NULL, c);
-ret = bdrv_check_update_perm(base, NULL, c->perm, c->shared_perm,
- ignore_children, NULL, _err);
-g_slist_free(ignore_children);
-if (ret < 0) {
-error_report_err(local_err);
-goto exit;
-}
+bdrv_replace_node(top, base, _err);
+if (local_err) {
+error_report_err(local_err);
+goto exit;
+}
 
-/* If so, update the backing file path in the image file */
+QLIST_FOREACH_SAFE(c, >parents, next_parent, next) {
 if (c->klass->update_filename) {
 ret = c->klass->update_filename(c, base, backing_file_str,
 _err);
 if (ret < 0) {
-bdrv_abort_perm_update(base);
+/*
+ * TODO: Actually, we want to rollback all previous iterations
+ * of this loop, and (which is almost impossible) previous
+ * bdrv_replace_node()...
+ *
+ * Note, that c->klass->update_filename may lead to permission
+ * update, so it's a bad idea to call it inside permission
+ * update transaction of bdrv_replace_node.
+ */
 error_report_err(local_err);
 goto exit;
 }
 }
-
-/*
- * Do the actual switch in the in-memory graph.
- * Completes bdrv_check_update_perm() transaction internally.
- * c->frozen is false, we have checked that above.
- */
-bdrv_ref(base);
-bdrv_replace_child(c, base);
-bdrv_unref(top);
 }
 
 if (update_inherits_from) {
-- 
2.21.3




[PATCH 18/21] block: adapt bdrv_append() for inserting filters

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
bdrv_append is not very good for inserting filters: it does extra
permission update as part of bdrv_set_backing_hd(). During this update
filter may conflict with other parents of top_bs.

Instead, let's first do all graph modifications and after it update
permissions.

Note: bdrv_append() still works only for backing-child based
filters. It's something to improve later.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 50 +++---
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index f2e714a81d..cf7b859a81 100644
--- a/block.c
+++ b/block.c
@@ -4953,22 +4953,50 @@ int bdrv_replace_node(BlockDriverState *from, 
BlockDriverState *to,
 int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
 Error **errp)
 {
-Error *local_err = NULL;
+int ret;
+GSList *tran = NULL;
+AioContext *bs_new_ctx = bdrv_get_aio_context(bs_new);
+AioContext *bs_top_ctx = bdrv_get_aio_context(bs_top);
 
-bdrv_set_backing_hd(bs_new, bs_top, _err);
-if (local_err) {
-error_propagate(errp, local_err);
-return -EPERM;
+assert(!bs_new->backing);
+
+if (bs_new_ctx != bs_top_ctx) {
+ret = bdrv_try_set_aio_context(bs_new, bs_top_ctx, NULL);
+if (ret < 0) {
+ret = bdrv_try_set_aio_context(bs_top, bs_new_ctx, errp);
+}
+if (ret < 0) {
+return ret;
+}
 }
 
-bdrv_replace_node(bs_top, bs_new, _err);
-if (local_err) {
-error_propagate(errp, local_err);
-bdrv_set_backing_hd(bs_new, NULL, _abort);
-return -EPERM;
+bs_new->backing = bdrv_attach_child_noperm(bs_new, bs_top, "backing",
+   bdrv_backing_role(bs_new),
+   , errp);
+if (!bs_new->backing) {
+ret = -EINVAL;
+goto out;
 }
 
-return 0;
+ret = bdrv_replace_node_noperm(bs_top, bs_new, true, , errp);
+if (ret < 0) {
+goto out;
+}
+
+ret = bdrv_refresh_perms(bs_new, errp);
+out:
+tran_finalize(tran, ret);
+if (ret < 0) {
+bs_new->backing = NULL;
+if (bs_new_ctx != bdrv_get_aio_context(bs_new)) {
+bdrv_try_set_aio_context(bs_new, bs_new_ctx, _abort);
+}
+if (bs_top_ctx != bdrv_get_aio_context(bs_top)) {
+bdrv_try_set_aio_context(bs_top, bs_top_ctx, _abort);
+}
+}
+
+return ret;
 }
 
 static void bdrv_delete(BlockDriverState *bs)
-- 
2.21.3




[PATCH 08/21] block: use topological sort for permission update

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Rewrite bdrv_check_perm(), bdrv_abort_perm_update() and bdrv_set_perm()
to update nodes in topological sort order instead of simple DFS. With
topologically sorted nodes, we update a node only when all its parents
already updated. With DFS it's not so.

Consider the following example:

A -+
|  |
|  v
|  B
|  |
v  |
C<-+

A is parent for B and C, B is parent for C.

Obviously, to update permissions, we should go in order A B C, so, when
we update C, all parent permissions already updated. But with current
approach (simple recursion) we can update in sequence A C B C (C is
updated twice). On first update of C, we consider old B permissions, so
doing wrong thing. If it succeed, all is OK, on second C update we will
finish with correct graph. But if the wrong thing failed, we break the
whole process for no reason (it's possible that updated B permission
will be less strict, but we will never check it).

Also new approach gives a way to simultaneously and correctly update
several nodes, we just need to run bdrv_topological_dfs() several times
to add all nodes and their subtrees into one topologically sorted list
(next patch will update bdrv_replace_node() in this manner).

Test test_parallel_perm_update() is now passing, so move it out of
debugging "if".

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 108 
 tests/test-bdrv-graph-mod.c |   4 +-
 tests/qemu-iotests/283.out  |   2 +-
 3 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/block.c b/block.c
index 22498b5288..56263407e8 100644
--- a/block.c
+++ b/block.c
@@ -1973,13 +1973,17 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, 
Error **errp)
 return false;
 }
 
-static bool bdrv_check_parents_compliance(BlockDriverState *bs, Error **errp)
+static bool bdrv_check_parents_compliance(BlockDriverState *bs,
+  GSList *ignore_children,
+  Error **errp)
 {
 BdrvChild *a, *b;
 
 QLIST_FOREACH(a, >parents, next_parent) {
 QLIST_FOREACH(b, >parents, next_parent) {
-if (a == b) {
+if (a == b || g_slist_find(ignore_children, a) ||
+g_slist_find(ignore_children, b))
+{
 continue;
 }
 
@@ -2012,6 +2016,29 @@ static void bdrv_child_perm(BlockDriverState *bs, 
BlockDriverState *child_bs,
 }
 }
 
+static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
+BlockDriverState *bs)
+{
+BdrvChild *child;
+g_autoptr(GHashTable) local_found = NULL;
+
+if (!found) {
+assert(!list);
+found = local_found = g_hash_table_new(NULL, NULL);
+}
+
+if (g_hash_table_contains(found, bs)) {
+return list;
+}
+g_hash_table_add(found, bs);
+
+QLIST_FOREACH(child, >children, next) {
+list = bdrv_topological_dfs(list, found, child->bs);
+}
+
+return g_slist_prepend(list, bs);
+}
+
 static void bdrv_child_set_perm_commit(void *opaque)
 {
 BdrvChild *c = opaque;
@@ -2076,14 +2103,13 @@ static void bdrv_child_set_perm_safe(BdrvChild *c, 
uint64_t perm,
  * A call to this function must always be followed by a call to bdrv_set_perm()
  * or bdrv_abort_perm_update().
  */
-static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
-   uint64_t cumulative_perms,
-   uint64_t cumulative_shared_perms,
-   GSList *ignore_children, Error **errp)
+static int bdrv_node_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
+uint64_t cumulative_perms,
+uint64_t cumulative_shared_perms,
+GSList *ignore_children, Error **errp)
 {
 BlockDriver *drv = bs->drv;
 BdrvChild *c;
-int ret;
 
 /* Write permissions never work with read-only images */
 if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
@@ -2128,8 +2154,8 @@ static int bdrv_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 }
 
 if (drv->bdrv_check_perm) {
-ret = drv->bdrv_check_perm(bs, cumulative_perms,
-   cumulative_shared_perms, errp);
+int ret = drv->bdrv_check_perm(bs, cumulative_perms,
+   cumulative_shared_perms, errp);
 if (ret < 0) {
 return ret;
 }
@@ -2144,21 +2170,43 @@ static int bdrv_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 /* Check all children */
 QLIST_FOREACH(c, >children, next) {
 uint64_t cur_perm, cur_shared;
-GSList *cur_ignore_children;
 
 bdrv_child_perm(bs, c->bs, c, c->role, q,
 cumulative_perms, cumulative_shared_perms,
 &cur_perm, &cur_shared);
+bdrv_child_set_perm_safe(c, 

[PATCH 07/21] block: inline bdrv_child_*() permission functions calls

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Each of them has only one caller. Open-coding simplifies further
permission-update system changes.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 59 +
 1 file changed, 17 insertions(+), 42 deletions(-)

diff --git a/block.c b/block.c
index a9e4d2b57c..22498b5288 100644
--- a/block.c
+++ b/block.c
@@ -1893,11 +1893,11 @@ static int bdrv_fill_options(QDict **options, const 
char *filename,
 return 0;
 }
 
-static int bdrv_child_check_perm(BdrvChild *c, BlockReopenQueue *q,
- uint64_t perm, uint64_t shared,
- GSList *ignore_children, Error **errp);
-static void bdrv_child_abort_perm_update(BdrvChild *c);
-static void bdrv_child_set_perm(BdrvChild *c);
+static int bdrv_check_update_perm(BlockDriverState *bs, BlockReopenQueue *q,
+  uint64_t new_used_perm,
+  uint64_t new_shared_perm,
+  GSList *ignore_children,
+  Error **errp);
 
 typedef struct BlockReopenQueueEntry {
  bool prepared;
@@ -2144,15 +2144,21 @@ static int bdrv_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 /* Check all children */
 QLIST_FOREACH(c, >children, next) {
 uint64_t cur_perm, cur_shared;
+GSList *cur_ignore_children;
 
 bdrv_child_perm(bs, c->bs, c, c->role, q,
 cumulative_perms, cumulative_shared_perms,
 &cur_perm, &cur_shared);
-ret = bdrv_child_check_perm(c, q, cur_perm, cur_shared, 
ignore_children,
-errp);
+
+cur_ignore_children = g_slist_prepend(g_slist_copy(ignore_children), 
c);
+ret = bdrv_check_update_perm(c->bs, q, cur_perm, cur_shared,
+ cur_ignore_children, errp);
+g_slist_free(cur_ignore_children);
 if (ret < 0) {
 return ret;
 }
+
+bdrv_child_set_perm_safe(c, cur_perm, cur_shared, NULL);
 }
 
 return 0;
@@ -2179,7 +2185,8 @@ static void bdrv_abort_perm_update(BlockDriverState *bs)
 }
 
 QLIST_FOREACH(c, >children, next) {
-bdrv_child_abort_perm_update(c);
+bdrv_child_set_perm_abort(c);
+bdrv_abort_perm_update(c->bs);
 }
 }
 
@@ -2208,7 +2215,8 @@ static void bdrv_set_perm(BlockDriverState *bs)
 
 /* Update all children */
 QLIST_FOREACH(c, >children, next) {
-bdrv_child_set_perm(c);
+bdrv_child_set_perm_commit(c);
+bdrv_set_perm(c->bs);
 }
 }
 
@@ -2316,39 +2324,6 @@ static int bdrv_check_update_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
ignore_children, errp);
 }
 
-/* Needs to be followed by a call to either bdrv_child_set_perm() or
- * bdrv_child_abort_perm_update(). */
-static int bdrv_child_check_perm(BdrvChild *c, BlockReopenQueue *q,
- uint64_t perm, uint64_t shared,
- GSList *ignore_children, Error **errp)
-{
-int ret;
-
-ignore_children = g_slist_prepend(g_slist_copy(ignore_children), c);
-ret = bdrv_check_update_perm(c->bs, q, perm, shared, ignore_children, 
errp);
-g_slist_free(ignore_children);
-
-if (ret < 0) {
-return ret;
-}
-
-bdrv_child_set_perm_safe(c, perm, shared, NULL);
-
-return 0;
-}
-
-static void bdrv_child_set_perm(BdrvChild *c)
-{
-bdrv_child_set_perm_commit(c);
-bdrv_set_perm(c->bs);
-}
-
-static void bdrv_child_abort_perm_update(BdrvChild *c)
-{
-bdrv_child_set_perm_abort(c);
-bdrv_abort_perm_update(c->bs);
-}
-
 static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
 {
 int ret;
-- 
2.21.3




[PATCH 10/21] block: add bdrv_list_* permission update functions

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Add new interface, allowing use of existing node list. It will be used
to fix bdrv_replace_node() in the following commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 112 ++--
 1 file changed, 76 insertions(+), 36 deletions(-)

diff --git a/block.c b/block.c
index 799c475dda..d799afeedd 100644
--- a/block.c
+++ b/block.c
@@ -2154,7 +2154,8 @@ static int bdrv_drv_set_perm(BlockDriverState *bs, 
uint64_t perm,
 static int bdrv_node_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
 uint64_t cumulative_perms,
 uint64_t cumulative_shared_perms,
-GSList *ignore_children, Error **errp)
+GSList *ignore_children,
+GSList **tran, Error **errp)
 {
 int ret;
 BlockDriver *drv = bs->drv;
@@ -2202,7 +2203,7 @@ static int bdrv_node_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 return 0;
 }
 
-ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, 
NULL,
+ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, 
tran,
 errp);
 if (ret < 0) {
 return ret;
@@ -2221,36 +,53 @@ static int bdrv_node_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 bdrv_child_perm(bs, c->bs, c, c->role, q,
 cumulative_perms, cumulative_shared_perms,
 &cur_perm, &cur_shared);
-bdrv_child_set_perm_safe(c, cur_perm, cur_shared, NULL);
+bdrv_child_set_perm_safe(c, cur_perm, cur_shared, tran);
 }
 
 return 0;
 }
 
-static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
-   uint64_t cumulative_perms,
-   uint64_t cumulative_shared_perms,
-   GSList *ignore_children, Error **errp)
+/*
+ * If use_cumulative_perms is true, use cumulative_perms and
+ * cumulative_shared_perms for first element of the list. Otherwise just 
refresh
+ * all permissions.
+ */
+static int bdrv_check_perm_common(GSList *list, BlockReopenQueue *q,
+  bool use_cumulative_perms,
+  uint64_t cumulative_perms,
+  uint64_t cumulative_shared_perms,
+  GSList *ignore_children,
+  GSList **tran, Error **errp)
 {
 int ret;
-BlockDriverState *root = bs;
-g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, root);
+BlockDriverState *bs;
 
-for ( ; list; list = list->next) {
+if (use_cumulative_perms) {
 bs = list->data;
 
-if (bs != root) {
-if (!bdrv_check_parents_compliance(bs, ignore_children, errp)) {
-return -EINVAL;
-}
+ret = bdrv_node_check_perm(bs, q, cumulative_perms,
+   cumulative_shared_perms,
+   ignore_children, tran, errp);
+if (ret < 0) {
+return ret;
+}
+
+list = list->next;
+}
 
-bdrv_get_cumulative_perm(bs, _perms,
- _shared_perms);
+for ( ; list; list = list->next) {
+bs = list->data;
+
+if (!bdrv_check_parents_compliance(bs, ignore_children, errp)) {
+return -EINVAL;
 }
 
+bdrv_get_cumulative_perm(bs, &cumulative_perms,
+ &cumulative_shared_perms);
+
 ret = bdrv_node_check_perm(bs, q, cumulative_perms,
cumulative_shared_perms,
-   ignore_children, errp);
+   ignore_children, tran, errp);
 if (ret < 0) {
 return ret;
 }
@@ -2259,6 +2277,22 @@ static int bdrv_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 return 0;
 }
 
+static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,
+   uint64_t cumulative_perms,
+   uint64_t cumulative_shared_perms,
+   GSList *ignore_children, Error **errp)
+{
+g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
+return bdrv_check_perm_common(list, q, true, cumulative_perms,
+  cumulative_shared_perms, ignore_children,
+  NULL, errp);
+}
+
+static int bdrv_list_check_perm(GSList *list, GSList **tran, Error **errp)
+{
+return bdrv_check_perm_common(list, NULL, false, 0, 0, NULL, tran, errp);
+}
+
 /*
  * Notifies drivers that after a previous bdrv_check_perm() call, the
  * permission update is not performed and any preparations made for it (e.g.
@@ -2280,15 +2314,19 @@ static void 
bdrv_node_abort_perm_update(BlockDriverState *bs)
 }
 }
 
-static void 

[PATCH 05/21] block: refactor bdrv_child* permission functions

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Split out non-recursive parts, and refactor as block graph transaction
action.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 79 ++---
 1 file changed, 59 insertions(+), 20 deletions(-)

diff --git a/block.c b/block.c
index 0d0f065db4..e12acd5029 100644
--- a/block.c
+++ b/block.c
@@ -48,6 +48,7 @@
 #include "qemu/timer.h"
 #include "qemu/cutils.h"
 #include "qemu/id.h"
+#include "qemu/transactions.h"
 #include "block/coroutines.h"
 
 #ifdef CONFIG_BSD
@@ -2011,6 +2012,61 @@ static void bdrv_child_perm(BlockDriverState *bs, 
BlockDriverState *child_bs,
 }
 }
 
+static void bdrv_child_set_perm_commit(void *opaque)
+{
+BdrvChild *c = opaque;
+
+c->has_backup_perm = false;
+}
+
+static void bdrv_child_set_perm_abort(void *opaque)
+{
+BdrvChild *c = opaque;
+/*
+ * We may have child->has_backup_perm unset at this point, as in case of
+ * _check_ stage of permission update failure we may _check_ not the whole
+ * subtree.  Still, _abort_ is called on the whole subtree anyway.
+ */
+if (c->has_backup_perm) {
+c->perm = c->backup_perm;
+c->shared_perm = c->backup_shared_perm;
+c->has_backup_perm = false;
+}
+}
+
+static BdrvActionDrv bdrv_child_set_pem_drv = {
+.abort = bdrv_child_set_perm_abort,
+.commit = bdrv_child_set_perm_commit,
+};
+
+/*
+ * With tran=NULL needs to be followed by direct call to either
+ * bdrv_child_set_perm_commit() or bdrv_child_set_perm_abort().
+ *
+ * With non-NULL tran needs to be followed by tran_abort() or tran_commit()
+ * instead.
+ */
+static void bdrv_child_set_perm_safe(BdrvChild *c, uint64_t perm,
+ uint64_t shared, GSList **tran)
+{
+if (!c->has_backup_perm) {
+c->has_backup_perm = true;
+c->backup_perm = c->perm;
+c->backup_shared_perm = c->shared_perm;
+}
+/*
+ * Note: it's OK if c->has_backup_perm was already set, as we can find the
+ * same c twice during check_perm procedure
+ */
+
+c->perm = perm;
+c->shared_perm = shared;
+
+if (tran) {
+tran_prepend(tran, &bdrv_child_set_pem_drv, c);
+}
+}
+
 /*
  * Check whether permissions on this node can be changed in a way that
  * @cumulative_perms and @cumulative_shared_perms are the new cumulative
@@ -2276,37 +2332,20 @@ static int bdrv_child_check_perm(BdrvChild *c, 
BlockReopenQueue *q,
 return ret;
 }
 
-if (!c->has_backup_perm) {
-c->has_backup_perm = true;
-c->backup_perm = c->perm;
-c->backup_shared_perm = c->shared_perm;
-}
-/*
- * Note: it's OK if c->has_backup_perm was already set, as we can find the
- * same child twice during check_perm procedure
- */
-
-c->perm = perm;
-c->shared_perm = shared;
+bdrv_child_set_perm_safe(c, perm, shared, NULL);
 
 return 0;
 }
 
 static void bdrv_child_set_perm(BdrvChild *c)
 {
-c->has_backup_perm = false;
-
+bdrv_child_set_perm_commit(c);
 bdrv_set_perm(c->bs);
 }
 
 static void bdrv_child_abort_perm_update(BdrvChild *c)
 {
-if (c->has_backup_perm) {
-c->perm = c->backup_perm;
-c->shared_perm = c->backup_shared_perm;
-c->has_backup_perm = false;
-}
-
+bdrv_child_set_perm_abort(c);
 bdrv_abort_perm_update(c->bs);
 }
 
-- 
2.21.3




[PATCH 04/21] block: bdrv_refresh_perms: check parents compliance

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Add additional check that node parents do not interfere with each
other. This should not hurt existing callers and allows in further
patch use bdrv_refresh_perms() to update a subtree of changed
BdrvChild (check that change is correct).

New check will substitute bdrv_check_update_perm() in following
permissions refactoring, so keep error messages the same to avoid
unit test result changes.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 62 -
 1 file changed, 53 insertions(+), 9 deletions(-)

diff --git a/block.c b/block.c
index 0dd28f0902..0d0f065db4 100644
--- a/block.c
+++ b/block.c
@@ -1945,6 +1945,56 @@ bool bdrv_is_writable(BlockDriverState *bs)
 return bdrv_is_writable_after_reopen(bs, NULL);
 }
 
+static char *bdrv_child_user_desc(BdrvChild *c)
+{
+if (c->klass->get_parent_desc) {
+return c->klass->get_parent_desc(c);
+}
+
+return g_strdup("another user");
+}
+
+static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
+{
+g_autofree char *user = NULL;
+g_autofree char *perm_names = NULL;
+
+if ((b->perm & a->shared_perm) == b->perm) {
+return true;
+}
+
+perm_names = bdrv_perm_names(b->perm & ~a->shared_perm);
+user = bdrv_child_user_desc(a);
+error_setg(errp, "Conflicts with use by %s as '%s', which does not "
+   "allow '%s' on %s",
+   user, a->name, perm_names, bdrv_get_node_name(b->bs));
+
+return false;
+}
+
+static bool bdrv_check_parents_compliance(BlockDriverState *bs, Error **errp)
+{
+BdrvChild *a, *b;
+
+QLIST_FOREACH(a, &bs->parents, next_parent) {
+QLIST_FOREACH(b, &bs->parents, next_parent) {
+if (a == b) {
+continue;
+}
+
+if (!bdrv_a_allow_b(a, b, errp)) {
+return false;
+}
+
+if (!bdrv_a_allow_b(b, a, errp)) {
+return false;
+}
+}
+}
+
+return true;
+}
+
 static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
 BdrvChild *c, BdrvChildRole role,
 BlockReopenQueue *reopen_queue,
@@ -2122,15 +2172,6 @@ void bdrv_get_cumulative_perm(BlockDriverState *bs, 
uint64_t *perm,
 *shared_perm = cumulative_shared_perms;
 }
 
-static char *bdrv_child_user_desc(BdrvChild *c)
-{
-if (c->klass->get_parent_desc) {
-return c->klass->get_parent_desc(c);
-}
-
-return g_strdup("another user");
-}
-
 char *bdrv_perm_names(uint64_t perm)
 {
 struct perm_name {
@@ -2274,6 +2315,9 @@ static int bdrv_refresh_perms(BlockDriverState *bs, Error 
**errp)
 int ret;
 uint64_t perm, shared_perm;
 
+if (!bdrv_check_parents_compliance(bs, errp)) {
+return -EPERM;
+}
+bdrv_get_cumulative_perm(bs, &perm, &shared_perm);
 ret = bdrv_check_perm(bs, NULL, perm, shared_perm, NULL, errp);
 if (ret < 0) {
-- 
2.21.3




[PATCH 03/21] util: add transactions.c

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Add simple transaction API to use in further update of block graph
operations.

Supposed usage is:

- "prepare" is main function of the action and it should make the main
  effect of the action to be visible for the following actions, keeping
  possibility of roll-back, saving necessary things in action state,
  which is prepended to the list. So, driver struct doesn't include
  "prepare" field, as it is supposed to be called directly.

- commit/rollback is supposed to be called for the list of action
  states, to commit/rollback all the actions in reverse order

- When possible "commit" should not make visible effect for other
  actions, which make possible transparent logical interaction between
  actions.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/qemu/transactions.h | 46 +
 util/transactions.c | 81 +
 util/meson.build|  1 +
 3 files changed, 128 insertions(+)
 create mode 100644 include/qemu/transactions.h
 create mode 100644 util/transactions.c

diff --git a/include/qemu/transactions.h b/include/qemu/transactions.h
new file mode 100644
index 00..db8fced65f
--- /dev/null
+++ b/include/qemu/transactions.h
@@ -0,0 +1,46 @@
+/*
+ * Simple transactions API
+ *
+ * Copyright (c) 2020 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_TRANSACTIONS_H
+#define QEMU_TRANSACTIONS_H
+
+#include 
+
+typedef struct BdrvActionDrv {
+void (*abort)(void *opeque);
+void (*commit)(void *opeque);
+void (*clean)(void *opeque);
+} BdrvActionDrv;
+
+void tran_prepend(GSList **list, BdrvActionDrv *drv, void *opaque);
+void tran_abort(GSList *backup);
+void tran_commit(GSList *backup);
+static inline void tran_finalize(GSList *backup, int ret)
+{
+if (ret < 0) {
+tran_abort(backup);
+} else {
+tran_commit(backup);
+}
+}
+
+#endif /* QEMU_TRANSACTIONS_H */
diff --git a/util/transactions.c b/util/transactions.c
new file mode 100644
index 00..845d77fc16
--- /dev/null
+++ b/util/transactions.c
@@ -0,0 +1,81 @@
+/*
+ * Simple transactions API
+ *
+ * Copyright (c) 2020 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/transactions.h"
+
+typedef struct BdrvAction {
+BdrvActionDrv *drv;
+void *opaque;
+} BdrvAction;
+
+void tran_prepend(GSList **list, BdrvActionDrv *drv, void *opaque)
+{
+BdrvAction *act;
+
+act = g_new(BdrvAction, 1);
+*act = (BdrvAction) {
+.drv = drv,
+.opaque = opaque
+};
+
+*list = g_slist_prepend(*list, act);
+}
+
+void tran_abort(GSList *list)
+{
+GSList *p;
+
+for (p = list; p != NULL; p = p->next) {
+BdrvAction *act = p->data;
+
+if (act->drv->abort) {
+act->drv->abort(act->opaque);
+}
+
+if (act->drv->clean) {
+act->drv->clean(act->opaque);
+}
+}
+
+g_slist_free_full(list, g_free);
+}
+
+void tran_commit(GSList *list)
+{
+GSList *p;
+
+for (p = list; p != NULL; p = p->next) {
+BdrvAction *act = p->data;
+
+if (act->drv->commit) {
+act->drv->commit(act->opaque);
+}
+
+if (act->drv->clean) {
+act->drv->clean(act->opaque);
+}
+}
+
+g_slist_free_full(list, g_free);
+}
diff --git a/util/meson.build b/util/meson.build
index f359af0d46..8c7c28bd40 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -41,6 +41,7 @@ util_ss.add(files('qsp.c'))
 util_ss.add(files('range.c'))
 util_ss.add(files('stats64.c'))
 util_ss.add(files('systemd.c'))

[PATCH 01/21] tests/test-bdrv-graph-mod: add test_parallel_exclusive_write

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Add the test that shows that concept of ignore_children is incomplete.
Actually, when we want to update something, ignoring permission of some
existing BdrvChild, we should ignore also the propagated effect of this
child to the other children. But that's not done. Better approach
(update permissions on already updated graph) will be implemented
later.

Now the test fails, so it's added with the -d argument to not break make
check.

Test fails with

 "Conflicts with use by fl1 as 'backing', which does not allow 'write' on base"

because when updating permissions we can ignore original top->fl1
BdrvChild. But we don't ignore exclusive write permission in fl1->base
BdrvChild, which is propagated. Correct thing to do is make graph
change first and then do permission update from the top node.

To run test do

  ./test-bdrv-graph-mod -d -p /bdrv-graph-mod/parallel-exclusive-write

from /tests.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/test-bdrv-graph-mod.c | 62 +
 1 file changed, 62 insertions(+)

diff --git a/tests/test-bdrv-graph-mod.c b/tests/test-bdrv-graph-mod.c
index 8cff13830e..3b9e6f242f 100644
--- a/tests/test-bdrv-graph-mod.c
+++ b/tests/test-bdrv-graph-mod.c
@@ -44,6 +44,21 @@ static BlockDriver bdrv_no_perm = {
 .bdrv_child_perm = no_perm_default_perms,
 };
 
+static void exclusive_write_perms(BlockDriverState *bs, BdrvChild *c,
+  BdrvChildRole role,
+  BlockReopenQueue *reopen_queue,
+  uint64_t perm, uint64_t shared,
+  uint64_t *nperm, uint64_t *nshared)
+{
+*nperm = BLK_PERM_WRITE;
+*nshared = BLK_PERM_ALL & ~BLK_PERM_WRITE;
+}
+
+static BlockDriver bdrv_exclusive_writer = {
+.format_name = "exclusive-writer",
+.bdrv_child_perm = exclusive_write_perms,
+};
+
 static BlockDriverState *no_perm_node(const char *name)
 {
 return bdrv_new_open_driver(&bdrv_no_perm, name, BDRV_O_RDWR, 
&error_abort);
@@ -55,6 +70,12 @@ static BlockDriverState *pass_through_node(const char *name)
 BDRV_O_RDWR, _abort);
 }
 
+static BlockDriverState *exclusive_writer_node(const char *name)
+{
+return bdrv_new_open_driver(&bdrv_exclusive_writer, name,
+BDRV_O_RDWR, &error_abort);
+}
+
 /*
  * test_update_perm_tree
  *
@@ -185,8 +206,44 @@ static void test_should_update_child(void)
 blk_unref(root);
 }
 
+/*
+ * test_parallel_exclusive_write
+ *
+ * Check that when we replace a node, the old permissions of the node being
+ * removed don't break the replacement.
+ */
+static void test_parallel_exclusive_write(void)
+{
+BlockDriverState *top = exclusive_writer_node("top");
+BlockDriverState *base = no_perm_node("base");
+BlockDriverState *fl1 = pass_through_node("fl1");
+BlockDriverState *fl2 = pass_through_node("fl2");
+
+bdrv_attach_child(top, fl1, "backing", &child_of_bds, BDRV_CHILD_DATA,
+  &error_abort);
+bdrv_attach_child(fl1, base, "backing", &child_of_bds, BDRV_CHILD_FILTERED,
+  &error_abort);
+bdrv_attach_child(fl2, base, "backing", &child_of_bds, BDRV_CHILD_FILTERED,
+  &error_abort);
+bdrv_ref(base);
+
+bdrv_replace_node(fl1, fl2, &error_abort);
+
+bdrv_unref(top);
+}
+
 int main(int argc, char *argv[])
 {
+int i;
+bool debug = false;
+
+for (i = 1; i < argc; i++) {
+if (!strcmp(argv[i], "-d")) {
+debug = true;
+break;
+}
+}
+
 bdrv_init();
 qemu_init_main_loop(&error_abort);
 
@@ -196,5 +253,10 @@ int main(int argc, char *argv[])
 g_test_add_func("/bdrv-graph-mod/should-update-child",
 test_should_update_child);
 
+if (debug) {
+g_test_add_func("/bdrv-graph-mod/parallel-exclusive-write",
+test_parallel_exclusive_write);
+}
+
 return g_test_run();
 }
-- 
2.21.3




[PATCH 2/2] block: assert that permission commit sets same permissions

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
On permission update commit we must set same permissions as on _check_.
Let's add assertions. Next step may be to drop permission parameters
from _set_.

Note that prior to previous commit, fixing bdrv_drop_intermediate(),
new assertion in bdrv_child_set_perm() crashes on iotests 30 and 40.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index bd9f4e534b..0f4da59a6c 100644
--- a/block.c
+++ b/block.c
@@ -2105,9 +2105,10 @@ static void bdrv_abort_perm_update(BlockDriverState *bs)
 }
 }
 
-static void bdrv_set_perm(BlockDriverState *bs, uint64_t cumulative_perms,
-  uint64_t cumulative_shared_perms)
+static void bdrv_set_perm(BlockDriverState *bs, uint64_t _cumulative_perms,
+  uint64_t _cumulative_shared_perms)
 {
+uint64_t cumulative_perms, cumulative_shared_perms;
 BlockDriver *drv = bs->drv;
 BdrvChild *c;
 
@@ -2115,6 +2116,10 @@ static void bdrv_set_perm(BlockDriverState *bs, uint64_t 
cumulative_perms,
 return;
 }
 
+bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
+assert(_cumulative_perms == cumulative_perms);
+assert(_cumulative_shared_perms == cumulative_shared_perms);
+
 /* Update this node */
 if (drv->bdrv_set_perm) {
 drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
@@ -2301,6 +2306,8 @@ static void bdrv_child_set_perm(BdrvChild *c, uint64_t 
perm, uint64_t shared)
 
 c->has_backup_perm = false;
 
+assert(c->perm == perm);
+assert(c->shared_perm == shared);
 c->perm = perm;
 c->shared_perm = shared;
 
-- 
2.21.3




[PATCH RFC 00/21] block: update graph permissions update

2020-11-23 Thread Vladimir Sementsov-Ogievskiy
Hi all!

Here is a proposal of updating graph changing procedures.

The thing that brought me here is a question about "activating" filters after
insertion, which is done in mirror_top and backup_top. The problem is
that we can't simply avoid a permission conflict when inserting the
filter: during insertion, the old permissions of the relations to be removed
conflict with the new permissions of the newly created relations. And the
current solution is supporting an additional "inactive" mode for the filter
when it doesn't require any permissions.

I suggest to change the order of operations: let's first do all graph
relations modifications and then refresh permissions. Of course we'll
need a way to restore old graph if refresh fails.

Another problem with permission update is that we update permissions in
order of DFS which is not always correct. Better is update node when all
its parents already updated and require correct permissions. This needs
a topological sort of nodes prior to permission update, see more in
patches later.

Key patches here:

01,02 - add failing tests to illustrate conceptual problems of current
permission update system.
[Here is side suggestion: we usually add tests after fix, so careful
 reviewer has to change order of patches to check that test fails before
 fix. I add tests in a way they may be simply executed but not yet take
 part in make check. It seems more native: first show the problem, then
 fix it. And when fixed, make tests available for make check]

08 - topological sort implemented for permission update, one of new tests
now pass

13 - improve bdrv_replace_node. second new test now pass

21 - drop .active field and activation procedure for backup-top!

Series marked as RFC as they are non-complete. Some things like
bdrv_replace_node() and bdrv_append() are moved into new paradigm
(update graph first) some not (like bdrv_reopen_multiple()). Because of
it we have to still support old interfaces (like ignore_children).

Still, I'd be very grateful for some feedback before investigating more
time to this thing.

Note, that this series does nothing with another graph-update problem
discussed under "[PATCH RFC 0/5] Fix accidental crash in iotest 30".


The series based on block-next Max's branch and can be found here:

git: https://src.openvz.org/scm/~vsementsov/qemu.git
tag: up-block-topologic-perm-v1

Vladimir Sementsov-Ogievskiy (21):
  tests/test-bdrv-graph-mod: add test_parallel_exclusive_write
  tests/test-bdrv-graph-mod: add test_parallel_perm_update
  util: add transactions.c
  block: bdrv_refresh_perms: check parents compliance
  block: refactor bdrv_child* permission functions
  block: rewrite bdrv_child_try_set_perm() using bdrv_refresh_perms()
  block: inline bdrv_child_*() permission functions calls
  block: use topological sort for permission update
  block: add bdrv_drv_set_perm transaction action
  block: add bdrv_list_* permission update functions
  block: add bdrv_replace_child_safe() transaction action
  block: return value from bdrv_replace_node()
  block: fix bdrv_replace_node_common
  block: add bdrv_attach_child_noperm() transaction action
  block: split out bdrv_replace_node_noperm()
  block: bdrv_append(): don't consume reference
  block: bdrv_append(): return status
  block: adapt bdrv_append() for inserting filters
  block: add bdrv_remove_backing transaction action
  block: introduce bdrv_drop_filter()
  block/backup-top: drop .active

 include/block/block.h   |   9 +-
 include/qemu/transactions.h |  46 +++
 block.c | 789 
 block/backup-top.c  |  39 +-
 block/commit.c  |   7 +-
 block/mirror.c  |   9 +-
 blockdev.c  |  10 +-
 tests/test-bdrv-drain.c |   2 +-
 tests/test-bdrv-graph-mod.c | 122 +-
 util/transactions.c |  81 
 tests/qemu-iotests/283.out  |   2 +-
 util/meson.build|   1 +
 12 files changed, 872 insertions(+), 245 deletions(-)
 create mode 100644 include/qemu/transactions.h
 create mode 100644 util/transactions.c

-- 
2.21.3




Re: [PATCH 1/1] Fix qcow2 corruption on discard

2020-11-23 Thread Maxim Levitsky
On Mon, 2020-11-23 at 19:20 +0100, Alberto Garcia wrote:
> On Mon 23 Nov 2020 05:09:41 PM CET, Kevin Wolf wrote:
> > > Commit 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
> > > introduced a subtle change to code in zero_in_l2_slice:
> > > 
> > > It swapped the order of
> > > 
> > > 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> > > 2. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> > > 3. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> > > 
> > > To
> > > 
> > > 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> > > 2. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> > > 3. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> 
> Ouch :( Good catch!
> 
> > > -qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> > >  if (unmap) {
> > >  qcow2_free_any_cluster(bs, old_l2_entry, 
> > > QCOW2_DISCARD_REQUEST);
> > >  }
> > >  set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
> > > +qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> > 
> > Good catch, but I think your order is wrong, too. We need the original
> > order from before 205fa50750:
> > 
> > 1. qcow2_cache_entry_mark_dirty()
> >set_l2_entry() + set_l2_bitmap()
> > 
> > 2. qcow2_free_any_cluster()
> 
> I agree with Kevin on this.

I also agree, I haven't thought about this.

Best regards,
Maxim Levitsky
> 
> Berto
> 





Re: [PATCH 1/1] Fix qcow2 corruption on discard

2020-11-23 Thread Alberto Garcia
On Mon 23 Nov 2020 05:09:41 PM CET, Kevin Wolf wrote:
>> Commit 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
>> introduced a subtle change to code in zero_in_l2_slice:
>> 
>> It swapped the order of
>> 
>> 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
>> 2. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
>> 3. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
>> 
>> To
>> 
>> 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
>> 2. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
>> 3. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);

Ouch :( Good catch!

>> -qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
>>  if (unmap) {
>>  qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
>>  }
>>  set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
>> +qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
>
> Good catch, but I think your order is wrong, too. We need the original
> order from before 205fa50750:
>
> 1. qcow2_cache_entry_mark_dirty()
>set_l2_entry() + set_l2_bitmap()
>
> 2. qcow2_free_any_cluster()

I agree with Kevin on this.

Berto



Re: [PATCH 1/1] Fix qcow2 corruption on discard

2020-11-23 Thread Maxim Levitsky
On Mon, 2020-11-23 at 18:38 +0100, Kevin Wolf wrote:
> Am 23.11.2020 um 16:49 hat Maxim Levitsky geschrieben:
> > Commit 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
> > introduced a subtle change to code in zero_in_l2_slice:
> > 
> > It swapped the order of
> > 
> > 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> > 2. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> > 3. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> > 
> > To
> > 
> > 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> > 2. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> > 3. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> > 
> > It seems harmless, however the call to qcow2_free_any_clusters
> > can trigger a cache flush which can mark the L2 table as clean,
> > and assuming that this was the last write to it,
> > a stale version of it will remain on the disk.
> 
> Do you have more details on this last paragraph? I'm trying to come up
> with a reproducer, but I don't see how qcow2_free_any_clusters() could
> flush the L2 table cache. (It's easy to get it to flush the refcount
> block cache, but that's useless for a reproducer.)
> 
> The only way I see to flush any cache with it is in update_refcount()
> the qcow2_cache_set_dependency() call. This will always flush the cache
> that the L2 cache depends on - which will never be the L2 cache itself,
> but always either the refcount cache or nothing.
> 
> There are more options in alloc_refcount_block() if we're allocating a
> new refcount block, but in the context of freeing clusters we'll never
> need to do that.
> 
> Whatever I tried, at the end of zero_in_l2_slice(), I have a dirty L2
> table and a dirty refcount block in the cache, with a dependency that
> makes sure that the L2 table will be written out first.
> 
> If you don't have the information yet, can you try to debug your manual
> reproducer a bit more to find out how this happens?
I'll do this tomorrow.
Best regards,
Maxim Levitsky

> 
> Kevin
> 
> > Now we have a valid L2 entry pointing to a freed cluster. Oops.
> > 
> > Fixes: 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
> > Signed-off-by: Maxim Levitsky 
> > ---
> >  block/qcow2-cluster.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> > index 485b4cb92e..267b46a4ca 100644
> > --- a/block/qcow2-cluster.c
> > +++ b/block/qcow2-cluster.c
> > @@ -2010,11 +2010,11 @@ static int zero_in_l2_slice(BlockDriverState *bs, 
> > uint64_t offset,
> >  continue;
> >  }
> >  
> > -qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> >  if (unmap) {
> >  qcow2_free_any_cluster(bs, old_l2_entry, 
> > QCOW2_DISCARD_REQUEST);
> >  }
> >  set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
> > +qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> >  if (has_subclusters(s)) {
> >  set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
> >  }
> > -- 
> > 2.26.2
> > 





Re: [PATCH 1/1] Fix qcow2 corruption on discard

2020-11-23 Thread Kevin Wolf
Am 23.11.2020 um 16:49 hat Maxim Levitsky geschrieben:
> Commit 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
> introduced a subtle change to code in zero_in_l2_slice:
> 
> It swapped the order of
> 
> 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> 2. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> 3. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> 
> To
> 
> 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> 2. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> 3. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> 
> It seems harmless, however the call to qcow2_free_any_clusters
> can trigger a cache flush which can mark the L2 table as clean,
> and assuming that this was the last write to it,
> a stale version of it will remain on the disk.

Do you have more details on this last paragraph? I'm trying to come up
with a reproducer, but I don't see how qcow2_free_any_clusters() could
flush the L2 table cache. (It's easy to get it to flush the refcount
block cache, but that's useless for a reproducer.)

The only way I see to flush any cache with it is in update_refcount()
the qcow2_cache_set_dependency() call. This will always flush the cache
that the L2 cache depends on - which will never be the L2 cache itself,
but always either the refcount cache or nothing.

There are more options in alloc_refcount_block() if we're allocating a
new refcount block, but in the context of freeing clusters we'll never
need to do that.

Whatever I tried, at the end of zero_in_l2_slice(), I have a dirty L2
table and a dirty refcount block in the cache, with a dependency that
makes sure that the L2 table will be written out first.

If you don't have the information yet, can you try to debug your manual
reproducer a bit more to find out how this happens?

Kevin

> Now we have a valid L2 entry pointing to a freed cluster. Oops.
> 
> Fixes: 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
> Signed-off-by: Maxim Levitsky 
> ---
>  block/qcow2-cluster.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 485b4cb92e..267b46a4ca 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -2010,11 +2010,11 @@ static int zero_in_l2_slice(BlockDriverState *bs, 
> uint64_t offset,
>  continue;
>  }
>  
> -qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
>  if (unmap) {
>  qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
>  }
>  set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
> +qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
>  if (has_subclusters(s)) {
>  set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
>  }
> -- 
> 2.26.2
> 




Re: [PATCH v2 0/2] Increase amount of data for monitor to read

2020-11-23 Thread Andrey Shinkevich

On 23.11.2020 18:44, Andrey Shinkevich wrote:

The subject was discussed here:
https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html

This series is a solution for the issue with QMP monitor buffered input.
A JSON little parser is introduced to separate QMP commands read from the
input buffer so that incoming requests do not overwhelm the monitor queue.
A side effect raised in the test #247 was managed in the first patch.
It may be considered as a workaround. Any sane fix suggested will be
appreciated.

Note:
This series goes after the Vladimir's one:
'[PATCH v3 00/25] backup performance: block_status + async"'
To make the test #129 passed, the following patch should be applied first:
'[PATCH v3 01/25] iotests: 129 don't check backup "busy"'.

v2:
   02: The static JSONthrottle object was made a member of the Chardev 
structure.
   The fd_chr_read functions were merged.
   The monitor thread synchronization was added to protect the input queue
   from overflow.

Andrey Shinkevich (2):
   iotests: add another bash sleep command to 247
   monitor: increase amount of data for monitor to read

  chardev/char-fd.c  | 35 +--
  chardev/char-socket.c  | 42 +++---
  chardev/char.c | 41 +
  include/chardev/char.h | 15 +++
  monitor/monitor.c  |  2 +-
  tests/qemu-iotests/247 |  2 ++
  tests/qemu-iotests/247.out |  1 +
  7 files changed, 132 insertions(+), 6 deletions(-)



...and with the extended number of QMP commands

time (echo "{ 'execute': 'qmp_capabilities' }"; for i in {1..1}; do 
echo "{ 'execute': 'query-block-jobs' } {"execute":"query-status"} { 
'execute': 'query-block-jobs' } {"execute":"query-status"} { 'execute': 
'query-block-jobs' } {"execute":"query-status"} { 'execute': 
'query-block-jobs' } {"execute":"query-status"}"; done; echo "{ 
'execute': 'quit' }" ) | ./build/qemu-system-x86_64 -qmp stdio > /dev/null


on master:
real0m10.112s
user0m10.168s
sys 0m4.793s

after the patch applied:
real0m4.140s
user0m4.079s
sys 0m0.785s

Andrey



Re: [PATCH v2 0/2] Increase amount of data for monitor to read

2020-11-23 Thread Andrey Shinkevich

On 23.11.2020 18:44, Andrey Shinkevich wrote:

The subject was discussed here:
https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html

This series is a solution for the issue with QMP monitor buffered input.
A JSON little parser is introduced to separate QMP commands read from the
input buffer so that incoming requests do not overwhelm the monitor queue.
A side effect raised in the test #247 was managed in the first patch.
It may be considered as a workaround. Any sane fix suggested will be
appreciated.

Note:
This series goes after the Vladimir's one:
'[PATCH v3 00/25] backup performance: block_status + async"'
To make the test #129 passed, the following patch should be applied first:
'[PATCH v3 01/25] iotests: 129 don't check backup "busy"'.

v2:
   02: The static JSONthrottle object was made a member of the Chardev 
structure.
   The fd_chr_read functions were merged.
   The monitor thread synchronization was added to protect the input queue
   from overflow.

Andrey Shinkevich (2):
   iotests: add another bash sleep command to 247
   monitor: increase amount of data for monitor to read

  chardev/char-fd.c  | 35 +--
  chardev/char-socket.c  | 42 +++---
  chardev/char.c | 41 +
  include/chardev/char.h | 15 +++
  monitor/monitor.c  |  2 +-
  tests/qemu-iotests/247 |  2 ++
  tests/qemu-iotests/247.out |  1 +
  7 files changed, 132 insertions(+), 6 deletions(-)




The Vladimir's modified test case

$ time (echo "{ 'execute': 'qmp_capabilities' }"; for i in {1..1}; 
do echo "{ 'execute': 'query-block-jobs' } {"execute":"query-status"} { 
'execute': 'query-block-jobs' } {"execute":"query-status"}"; done; echo 
"{ 'execute': 'quit' }" ) | ./build/qemu-system-x86_64 -qmp stdio > 
/dev/null


shows the following performance

on master:
real0m5.188s
user0m5.310s
sys 0m2.539s

after the patch applied:
real0m2.227s
user0m2.483s
sys 0m0.480s

Andrey



Re: [PATCH 1/1] Fix qcow2 corruption on discard

2020-11-23 Thread Kevin Wolf
Am 23.11.2020 um 16:49 hat Maxim Levitsky geschrieben:
> Commit 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
> introduced a subtle change to code in zero_in_l2_slice:
> 
> It swapped the order of
> 
> 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> 2. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> 3. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> 
> To
> 
> 1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
> 2. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
> 3. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
> 
> It seems harmless, however the call to qcow2_free_any_clusters
> can trigger a cache flush which can mark the L2 table as clean,
> and assuming that this was the last write to it,
> a stale version of it will remain on the disk.
> 
> Now we have a valid L2 entry pointing to a freed cluster. Oops.
> 
> Fixes: 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
> Signed-off-by: Maxim Levitsky 
> ---
>  block/qcow2-cluster.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 485b4cb92e..267b46a4ca 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -2010,11 +2010,11 @@ static int zero_in_l2_slice(BlockDriverState *bs, 
> uint64_t offset,
>  continue;
>  }
>  
> -qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
>  if (unmap) {
>  qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
>  }
>  set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
> +qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);

Good catch, but I think your order is wrong, too. We need the original
order from before 205fa50750:

1. qcow2_cache_entry_mark_dirty()
   set_l2_entry() + set_l2_bitmap()

2. qcow2_free_any_cluster()

The order between qcow2_cache_entry_mark_dirty() and set_l2_entry()
shouldn't matter, but it's important that we update the refcount table
only after the L2 table has been updated to not reference the cluster
any more.

Otherwise a crash could lead to a situation where the cluster is
allocated (because it has refcount 0), but it was still in use in an L2
table. This is a classic corruption scenario.

Kevin




[PATCH 0/1] Fix qcow2 corruption after addition of subcluster support

2020-11-23 Thread Maxim Levitsky
This weekend, I discovered that one of my VMs started to act weird.

Due to this, I found out that it and most of the other VMs I have,
have grown an qcow2 corruption.

So after some bisecting, digging through dumps, and debugging,
I think I found the root cause and a fix.

In addition to that I would like to raise few points:

1. I had to use qcow2-dump from (*)
 (it is also on github but without source. weird...)
 to examine the L1/L2 tables and refcount tables.

 It seems that there were few attempts (**), (***) to make an official tool that
 would dump at least L1/L2/refcount tables, but nothing got accepted
 so far.

 I think that an official tool to dump at least basic qcow2 structure
 would be very helpful to discover/debug qcow2 corruptions.
 I had to study again the qcow2 format for this, so I can help with that.

2. 'qemu-img check -r all' is happy to create clusters that are referenced
 from multiple L2 entries.

 This isn't technically wrong, since write through any of these l2 entries
 will COW the cluster.

 However I would be happy to know that my images don't have such clusters,
 so I would like qemu-img check to at least notify about this.
 Can we add some -check-weird-but-legal flag to it to check this?

Few notes about the condition for this corruption to occur:

I have a bunch of VMs which are running each using two qcow2 files,
base and a snapshot on top of it, which I 'qemu-img commit' once in a while.
Discard is enabled to avoid wasting disk space.

Since discard is enabled, 'qemu-img commit' often discards data on the base 
disk.
The corruption happens after such a commit, and manifests in a stale L2
entry that was supposed to be discarded but now points to an unused cluster.

I wasn't able to reproduce this on small test case so far.

Best regards,
Maxim Levitsky

(*)https://lists.gnu.org/archive/html/qemu-devel/2017-08/msg02760.html
(**) 
https://patchwork.kernel.org/project/qemu-devel/patch/20180328133845.20632-1-be...@igalia.com/
(***) 
https://patchwork.kernel.org/project/qemu-devel/cover/1578990137-308222-1-git-send-email-andrey.shinkev...@virtuozzo.com/

Maxim Levitsky (1):
  Fix qcow2 corruption on discard

 block/qcow2-cluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

-- 
2.26.2





[PATCH 1/1] Fix qcow2 corruption on discard

2020-11-23 Thread Maxim Levitsky
Commit 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
introduced a subtle change to code in zero_in_l2_slice:

It swapped the order of

1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
2. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);
3. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);

To

1. qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
2. qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
3. set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO);

It seems harmless, however the call to qcow2_free_any_clusters
can trigger a cache flush which can mark the L2 table as clean,
and assuming that this was the last write to it,
a stale version of it will remain on the disk.

Now we have a valid L2 entry pointing to a freed cluster. Oops.

Fixes: 205fa50750 ("qcow2: Add subcluster support to zero_in_l2_slice()")
Signed-off-by: Maxim Levitsky 
---
 block/qcow2-cluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 485b4cb92e..267b46a4ca 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -2010,11 +2010,11 @@ static int zero_in_l2_slice(BlockDriverState *bs, 
uint64_t offset,
 continue;
 }
 
-qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
 if (unmap) {
 qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
 }
 set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
+qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
 if (has_subclusters(s)) {
 set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
 }
-- 
2.26.2




[PATCH v2 1/2] iotests: add another bash sleep command to 247

2020-11-23 Thread Andrey Shinkevich via
This patch paves the way for the one that follows. The following patch
makes the QMP monitor to read up to 4K from stdin at once. That results
in running the bash 'sleep' command before the _qemu_proc_exec() starts
in subshell. Another 'sleep' command with an unobtrusive 'query-status'
plays as a workaround.

Signed-off-by: Andrey Shinkevich 
---
 tests/qemu-iotests/247 | 2 ++
 tests/qemu-iotests/247.out | 1 +
 2 files changed, 3 insertions(+)

diff --git a/tests/qemu-iotests/247 b/tests/qemu-iotests/247
index 87e37b3..7d316ec 100755
--- a/tests/qemu-iotests/247
+++ b/tests/qemu-iotests/247
@@ -59,6 +59,8 @@ TEST_IMG="$TEST_IMG.4" _make_test_img $size
 {"execute":"block-commit",
  "arguments":{"device":"format-4", "top-node": "format-2", 
"base-node":"format-0", "job-id":"job0"}}
 EOF
+sleep 1
+echo '{"execute":"query-status"}'
 if [ "${VALGRIND_QEMU}" == "y" ]; then
 sleep 10
 else
diff --git a/tests/qemu-iotests/247.out b/tests/qemu-iotests/247.out
index e909e83..13d9547 100644
--- a/tests/qemu-iotests/247.out
+++ b/tests/qemu-iotests/247.out
@@ -17,6 +17,7 @@ QMP_VERSION
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"BLOCK_JOB_COMPLETED", "data": {"device": "job0", "len": 134217728, "offset": 
134217728, "speed": 0, "type": "commit"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "concluded", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "null", "id": "job0"}}
+{"return": {"status": "running", "singlestep": false, "running": true}}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"SHUTDOWN", "data": {"guest": false, "reason": "host-qmp-quit"}}
 *** done
-- 
1.8.3.1




[PATCH v2 2/2] monitor: increase amount of data for monitor to read

2020-11-23 Thread Andrey Shinkevich via
QMP and HMP monitors read one byte at a time from the socket or stdin,
which is very inefficient. With 100+ VMs on the host, this results in
multiple extra system calls and CPU overuse.
This patch increases the amount of read data up to 4096 bytes that fits
the buffer size on the channel level.
A JSON little parser is introduced to throttle QMP commands read from
the buffer so that incoming requests do not overflow the monitor input
queue.

Suggested-by: Denis V. Lunev 
Signed-off-by: Andrey Shinkevich 
---
 chardev/char-fd.c  | 35 +--
 chardev/char-socket.c  | 42 +++---
 chardev/char.c | 41 +
 include/chardev/char.h | 15 +++
 monitor/monitor.c  |  2 +-
 5 files changed, 129 insertions(+), 6 deletions(-)

diff --git a/chardev/char-fd.c b/chardev/char-fd.c
index 1cd62f2..15bc8f4 100644
--- a/chardev/char-fd.c
+++ b/chardev/char-fd.c
@@ -33,6 +33,8 @@
 #include "chardev/char-fd.h"
 #include "chardev/char-io.h"
 
+#include "monitor/monitor-internal.h"
+
 /* Called with chr_write_lock held.  */
 static int fd_chr_write(Chardev *chr, const uint8_t *buf, int len)
 {
@@ -45,8 +47,12 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition 
cond, void *opaque)
 {
 Chardev *chr = CHARDEV(opaque);
 FDChardev *s = FD_CHARDEV(opaque);
+CharBackend *be = chr->be;
+Monitor *mon = (Monitor *)be->opaque;
 int len;
 uint8_t buf[CHR_READ_BUF_LEN];
+uint8_t *cursor;
+int load, size, pos;
 ssize_t ret;
 
 len = sizeof(buf);
@@ -62,10 +68,35 @@ static gboolean fd_chr_read(QIOChannel *chan, GIOCondition 
cond, void *opaque)
 if (ret == 0) {
 remove_fd_in_watch(chr);
 qemu_chr_be_event(chr, CHR_EVENT_CLOSED);
+chr->json_thl = (const JSONthrottle){0};
 return FALSE;
 }
-if (ret > 0) {
-qemu_chr_be_write(chr, buf, ret);
+if (ret < 0) {
+return TRUE;
+}
+load = ret;
+cursor = buf;
+
+while (load > 0) {
+size = load;
+if (monitor_is_qmp(mon)) {
+/* Find the end position of a JSON command in the input buffer */
+pos = qemu_chr_end_position((const char *) cursor, size,
+>json_thl);
+if (pos >= 0) {
+size = pos + 1;
+}
+}
+
+qemu_chr_be_write(chr, cursor, size);
+cursor += size;
+load -= size;
+
+if (load > 0) {
+while (qatomic_mb_read(>suspend_cnt)) {
+g_usleep(40);
+}
+}
 }
 
 return TRUE;
diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 213a4c8..30ad1d4 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -38,6 +38,8 @@
 #include "chardev/char-io.h"
 #include "qom/object.h"
 
+#include "monitor/monitor-internal.h"
+
 /***/
 /* TCP Net console */
 
@@ -522,7 +524,11 @@ static gboolean tcp_chr_read(QIOChannel *chan, 
GIOCondition cond, void *opaque)
 {
 Chardev *chr = CHARDEV(opaque);
 SocketChardev *s = SOCKET_CHARDEV(opaque);
+CharBackend *be = chr->be;
+Monitor *mon = (Monitor *)be->opaque;
 uint8_t buf[CHR_READ_BUF_LEN];
+uint8_t *cursor;
+int load, pos;
 int len, size;
 
 if ((s->state != TCP_CHARDEV_STATE_CONNECTED) ||
@@ -537,12 +543,42 @@ static gboolean tcp_chr_read(QIOChannel *chan, 
GIOCondition cond, void *opaque)
 if (size == 0 || (size == -1 && errno != EAGAIN)) {
 /* connection closed */
 tcp_chr_disconnect(chr);
-} else if (size > 0) {
+chr->json_thl = (const JSONthrottle){0};
+return TRUE;
+}
+if (size < 0) {
+return TRUE;
+}
+load = size;
+cursor = buf;
+
+while (load > 0) {
+size = load;
+if (monitor_is_qmp(mon)) {
+/* Find the end position of a JSON command in the input buffer */
+pos = qemu_chr_end_position((const char *) cursor, size,
+>json_thl);
+if (pos >= 0) {
+size = pos + 1;
+}
+}
+len = size;
+
 if (s->do_telnetopt) {
-tcp_chr_process_IAC_bytes(chr, s, buf, );
+tcp_chr_process_IAC_bytes(chr, s, cursor, );
 }
 if (size > 0) {
-qemu_chr_be_write(chr, buf, size);
+qemu_chr_be_write(chr, cursor, size);
+cursor += size;
+load -= size;
+} else {
+cursor += len;
+load -= len;
+}
+if (load > 0) {
+while (qatomic_mb_read(>suspend_cnt)) {
+g_usleep(40);
+}
 }
 }
 
diff --git a/chardev/char.c b/chardev/char.c
index aa42821..75c7bc7 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -1178,6 +1178,47 @@ GSource 

[PATCH v2 0/2] Increase amount of data for monitor to read

2020-11-23 Thread Andrey Shinkevich via
The subject was discussed here:
https://lists.gnu.org/archive/html/qemu-devel/2017-05/msg00206.html

This series is a solution for the issue with QMP monitor buffered input.
A JSON little parser is introduced to separate QMP commands read from the
input buffer so that incoming requests do not overwhelm the monitor queue.
A side effect raised in the test #247 was managed in the first patch.
It may be considered as a workaround. Any sane fix suggested will be
appreciated.

Note:
This series goes after the Vladimir's one:
'[PATCH v3 00/25] backup performance: block_status + async"'
To make the test #129 passed, the following patch should be applied first:
'[PATCH v3 01/25] iotests: 129 don't check backup "busy"'.

v2:
  02: The static JSONthrottle object was made a member of the Chardev structure.
  The fd_chr_read functions were merged.
  The monitor thread synchronization was added to protect the input queue
  from overflow.

Andrey Shinkevich (2):
  iotests: add another bash sleep command to 247
  monitor: increase amount of data for monitor to read

 chardev/char-fd.c  | 35 +--
 chardev/char-socket.c  | 42 +++---
 chardev/char.c | 41 +
 include/chardev/char.h | 15 +++
 monitor/monitor.c  |  2 +-
 tests/qemu-iotests/247 |  2 ++
 tests/qemu-iotests/247.out |  1 +
 7 files changed, 132 insertions(+), 6 deletions(-)

-- 
1.8.3.1




Re: [PULL 00/33] Block patches

2020-11-23 Thread Peter Maydell
On Mon, 23 Nov 2020 at 12:55, Philippe Mathieu-Daudé  wrote:
>
> On 11/4/20 9:59 PM, Peter Maydell wrote:
> > On Wed, 4 Nov 2020 at 15:18, Stefan Hajnoczi  wrote:
> >>
> >> The following changes since commit 
> >> 8507c9d5c9a62de2a0e281b640f995e26eac46af:
> >>
> >>   Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into 
> >> staging (2020-11-03 15:59:44 +)
> >>
> >> are available in the Git repository at:
> >>
> >>   https://gitlab.com/stefanha/qemu.git tags/block-pull-request
> >>
> >> for you to fetch changes up to fc107d86840b3364e922c26cf7631b7fd38ce523:
> >>
> >>   util/vfio-helpers: Assert offset is aligned to page size (2020-11-03 
> >> 19:06:23 +)
> >>
> >> 
> >> Pull request for 5.2
> >>
> >> NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
> >> improvements. Elena Afanasova's ioeventfd fixes are also included.
>
> There is a problem with this pull request, the fix hasn't
> been merged...

Sorry, this must have been a slip-up on my end. I have
now merged and pushed this pullreq to master.

-- PMM



Re: [PATCH RFC 0/5] Fix accidental crash in iotest 30

2020-11-23 Thread Vladimir Sementsov-Ogievskiy

23.11.2020 14:10, Kevin Wolf wrote:

Am 23.11.2020 um 11:29 hat Vladimir Sementsov-Ogievskiy geschrieben:

23.11.2020 13:10, Kevin Wolf wrote:

Am 20.11.2020 um 19:19 hat Vladimir Sementsov-Ogievskiy geschrieben:

20.11.2020 20:22, Kevin Wolf wrote:

Am 20.11.2020 um 17:43 hat Vladimir Sementsov-Ogievskiy geschrieben:

20.11.2020 19:36, Kevin Wolf wrote:

Am 20.11.2020 um 17:16 hat Vladimir Sementsov-Ogievskiy geschrieben:

Hi all!

As Peter recently noted, iotest 30 accidentally fails.

I found that Qemu crashes due to interleaving of graph-update
operations of parallel mirror and stream block-jobs.


I haven't found the time yet to properly look into this or your other
thread where you had a similar question, but there is one thing I'm
wondering: Why can the nested job even make progress and run its
completion handler?

When we modify the graph, we should have drained the subtree in
question, so in theory while one job finishes and modifies the graph,
there should be no way for the other job to make progress and get
interleaved - it shouldn't be able to start I/O requests and much less
to run its completion handler and modify the graph.

Are we missing drained sections somewhere or do they fail to achieve
what I think they should achieve?



It all looks like both jobs are reached their finish simultaneously.
So, all progress is done in both jobs. And they go concurrently to
completion procedures which interleaves. So, there no more io through
blk, which is restricted by drained sections.


They can't be truly simultaneous because they run in the same thread.
During job completion, this is the main thread.


No, they not truly simultaneous, but completions may interleave
through nested aio_poll loops.



However as soon as job_is_completed() returns true, it seems we're not
pausing the job any more when one of its nodes gets drained.

Possibly also relevant: The job->busy = false in job_exit(). The comment
there says it's a lie, but we might deadlock otherwise.

This problem will probably affect other callers, too, which drain a
subtree and then resonably expect that nobody will modify the graph
until they end the drained section. So I think the problem that we need
to address is that jobs run their completion handlers even though they
are supposed to be paused by a drain.


Hmm. I always thought about drained section as about thing that stops
IO requests, not other operations.. And we do graph modifications in
drained section to avoid in-flight IO requests during graph
modification.


Is there any use for an operation that only stops I/O, but doesn't
prevent graph changes?

I always understood it as a request to have exclusive access to a
subtree, so that nobody else would touch it.


I'm not saying that your graph modification locks are a bad idea, but
they are probably not a complete solution.



Hmm. What do you mean? It's of course not complete, as I didn't
protect every graph modification procedure.. But if we do protect all
such things and do graph modifications always under this mutex, it
should work I think.


What I mean is that not only graph modifications can conflict with each
other, but most callers of drain_begin/end will probably not be prepared
for the graph changing under their feet, even if they don't actively
change the graph themselves.



Understand now.. Right.. Anyway, it looks as we need some kind of
mutex. As the user of drained section of course wants to do graph
modifications and even IO (for example update backing-link in
metadata). The first thing that comes to mind is to protect all
outer-most drained sections by global CoMutex and assert in
drain_begin/drain_end that the mutex is locked.

Hmm, it also looks like RW-lock, and simple IO is "read" and something
under drained section is "write".


In a way, drain _is_ the implementation of a lock. But as you've shown,
it's a buggy implementation.

What I was looking at was basically fixing the one instance of a bug
while leaving the design as it is.

My impression is that you're looking at this more radically and want to
rearchitecture the whole drain mechanism so that such bugs would be much
less likely to start with. Maybe this is a good idea, but it's probably
also a lot more effort.

Basically, for making use of more traditional locks, the naive approach
would be changing blk/bdrv_inc/dec_in_flight() so that it takes/releases
an actual coroutine lock. As you suggest, probably a CoRwLock.

I see a few non-trivial questions already for this part:

* What about requests for which bs->in_flight is increased more than
   once? Do we need some sort of recursive lock for them?


Is there reasonable example? I'd avoid recursive locks if possible, it doesn't 
make things simpler.



* How do you know whether you should take a reader or a writer lock? For
   drains called from coroutine context, maybe you could store the caller
   that "owns" the drain section in the BDS, but what about non-coroutine
   drains?


Intuitively, 

Re: [PULL 00/33] Block patches

2020-11-23 Thread Philippe Mathieu-Daudé
On 11/4/20 9:59 PM, Peter Maydell wrote:
> On Wed, 4 Nov 2020 at 15:18, Stefan Hajnoczi  wrote:
>>
>> The following changes since commit 8507c9d5c9a62de2a0e281b640f995e26eac46af:
>>
>>   Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into 
>> staging (2020-11-03 15:59:44 +)
>>
>> are available in the Git repository at:
>>
>>   https://gitlab.com/stefanha/qemu.git tags/block-pull-request
>>
>> for you to fetch changes up to fc107d86840b3364e922c26cf7631b7fd38ce523:
>>
>>   util/vfio-helpers: Assert offset is aligned to page size (2020-11-03 
>> 19:06:23 +)
>>
>> 
>> Pull request for 5.2
>>
>> NVMe fixes to solve IOMMU issues on non-x86 and error message/tracing
>> improvements. Elena Afanasova's ioeventfd fixes are also included.

There is a problem with this pull request, the fix hasn't
been merged...

> Applied, thanks.
> 
> Please update the changelog at https://wiki.qemu.org/ChangeLog/5.2
> for any user-visible changes.
> 
> -- PMM
> 




Re: [raw] Guest stuck during live live-migration

2020-11-23 Thread Kevin Wolf
[ Cc: qemu-block ]

Am 23.11.2020 um 10:36 hat Quentin Grolleau geschrieben:
> Hello,
> 
> In our company, we are hosting a large number of VMs, hosted behind Openstack 
> (so libvirt/qemu).
> A large majority of our VMs are running with local data only, stored on NVME, 
> and most of them are RAW disks.
> 
> With Qemu 4.0 (can be even with older versions) we see strange live-migration 
> behavior:

First of all, 4.0 is relatively old. Generally it is worth retrying with
the most recent code (git master or 5.2.0-rc2) before having a closer
look at problems, because it is frustrating to spend considerable time
debugging an issue and then find out it has already been fixed a year
ago.

> - some Vms live migrate at very high speed without issue (> 6 Gbps)
> - some Vms are running correctly, but migrating at a strange low speed 
> (3Gbps)
> - some Vms are migrating at a very low speed (1Gbps, sometime less) and 
> during the migration the guest is completely I/O stuck
> 
> When this issue happens, the VM is completely blocked; iostat in the VM shows us 
> a latency of 30 secs

Can you get the stack backtraces of all QEMU threads while the VM is
blocked (e.g. with gdb or pstack)?

> First we thought it was related to a hardware issue, so we checked it, 
> comparing different hardware, but no issue was found there
> 
> So one of my colleague had the idea to limit with "tc" the bandwidth on the 
> interface the migration was done, and it worked the VM didn't lose any ping 
> nor being  I/O  stuck
> Important point : Once the Vm have been migrate (with the limitation ) one 
> time, if we migrate it again right after, the migration will be done at full 
> speed (8-9Gb/s) without freezing the Vm

Since you say you're using local storage, I assume that you're doing
both a VM live migration and storage migration at the same time. These
are separate connections, storage is migrated using a NBD connection.

Did you limit the bandwith for both connections, or if it was just one
of them, which one was it?

> It only happens on existing VMs; we tried to replicate with a fresh instance 
> with exactly the same spec and nothing was happening
> 
> We tried to replicate the workload inside the VM but there was no way to 
> replicate the case. So it was not related to the workload nor to the server 
> that hosts the Vm
> 
> So we thought about the disk of the instance : the raw file.
> 
> We also tried to strace -c the process during the live-migration and it was 
> doing a lot of "lseek"
> 
> and we found this :
> https://lists.gnu.org/archive/html/qemu-devel/2017-02/msg00462.html

This case is different in that it used qcow2 (which should behave much
better today).

It also used ZFS, which you didn't mention. Is the problematic image
stored on ZFS? If not, which filesystem is it?

> So i rebuilt Qemu with this patch and the live-migration went well, at high 
> speed and with no VM freeze
> ( https://github.com/qemu/qemu/blob/master/block/file-posix.c#L2601 )
> 
> Do you have a way to avoid the "lseek" mechanism, as it consumes more 
> resources to find the holes in the disk and doesn't leave any for the VM?

If you can provide the stack trace during the hang, we might be able to
tell why we're even trying to find holes.

Please also provide your QEMU command line.

At the moment, my assumption is that this is during a mirror block job
which is migrating the disk to your destination server. Not looking for
holes would mean that a sparse source file would become fully allocated
on the destination, which is usually not wanted (also we would
potentially transfer a lot more data over the network).

Can you give us a snippet from your strace that shows the individual
lseek syscalls? Depending on which ranges are queried, maybe we could
optimise things by caching the previous result.

Also, a final remark, I know of some cases (on XFS) where lseeks were
slow because the image file was heavily fragmented. Defragmenting the
file resolved the problem, so this may be another thing to try.

On XFS, newer QEMU versions set an extent size hint on newly created
image files (during qemu-img create), which can reduce fragmentation
considerably.

Kevin

> Server hosting the VM :
> - Bi-Xeon hosts With NVME storage and 10 Go Network card
> - Qemu 4.0 And Libvirt 5.4
> - Kernel 4.18.0.25
> 
> Guest having the issue :
> - raw image with Debian 8
> 
> Here the qemu img on the disk :
> > qemu-img info disk
> image: disk
> file format: raw
> virtual size: 400G (429496729600 bytes)
> disk size: 400G
> 
> 
> Quentin GROLLEAU
> 




Re: [PATCH 0/4] vhost-user: avoid g_return_val_if() in get/set_config()

2020-11-23 Thread Stefano Garzarella

On Wed, Nov 18, 2020 at 09:16:40AM +, Stefan Hajnoczi wrote:

Markus Armbruster pointed out that g_return_val_if() is meant for programming
errors. It must not be used for input validation since it can be compiled out.
Use explicit if statements instead.

This patch series converts vhost-user device backends that use
g_return_val_if() in get/set_config().

Stefan Hajnoczi (4):
 contrib/vhost-user-blk: avoid g_return_val_if() input validation
 contrib/vhost-user-gpu: avoid g_return_val_if() input validation
 contrib/vhost-user-input: avoid g_return_val_if() input validation
 block/export: avoid g_return_val_if() input validation

block/export/vhost-user-blk-server.c| 4 +++-
contrib/vhost-user-blk/vhost-user-blk.c | 4 +++-
contrib/vhost-user-gpu/vhost-user-gpu.c | 4 +++-
contrib/vhost-user-input/main.c | 4 +++-
4 files changed, 12 insertions(+), 4 deletions(-)



Reviewed-by: Stefano Garzarella 




Re: [PATCH RFC 0/5] Fix accidental crash in iotest 30

2020-11-23 Thread Kevin Wolf
Am 23.11.2020 um 11:29 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 23.11.2020 13:10, Kevin Wolf wrote:
> > Am 20.11.2020 um 19:19 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > 20.11.2020 20:22, Kevin Wolf wrote:
> > > > Am 20.11.2020 um 17:43 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > > > 20.11.2020 19:36, Kevin Wolf wrote:
> > > > > > Am 20.11.2020 um 17:16 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > > > > > Hi all!
> > > > > > > 
> > > > > > > As Peter recently noted, iotest 30 accidentally fails.
> > > > > > > 
> > > > > > > I found that Qemu crashes due to interleaving of graph-update
> > > > > > > operations of parallel mirror and stream block-jobs.
> > > > > > 
> > > > > > I haven't found the time yet to properly look into this or your 
> > > > > > other
> > > > > > thread where you had a similar question, but there is one thing I'm
> > > > > > wondering: Why can the nested job even make progress and run its
> > > > > > completion handler?
> > > > > > 
> > > > > > When we modify the graph, we should have drained the subtree in
> > > > > > question, so in theory while one job finishes and modifies the 
> > > > > > graph,
> > > > > > there should be no way for the other job to make progress and get
> > > > > > interleaved - it shouldn't be able to start I/O requests and much 
> > > > > > less
> > > > > > to run its completion handler and modify the graph.
> > > > > > 
> > > > > > Are we missing drained sections somewhere or do they fail to achieve
> > > > > > what I think they should achieve?
> > > > > > 
> > > > > 
> > > > > It all looks like both jobs have reached their finish simultaneously.
> > > > > So, all progress is done in both jobs. And they go concurrently to
> > > > > completion procedures, which interleave. So, there is no more io through
> > > > > blk, which is restricted by drained sections.
> > > > 
> > > > They can't be truly simultaneous because they run in the same thread.
> > > > During job completion, this is the main thread.
> > > 
> > > No, they are not truly simultaneous, but completions may interleave
> > > through nested aio_poll loops.
> > > 
> > > > 
> > > > However as soon as job_is_completed() returns true, it seems we're not
> > > > pausing the job any more when one of its nodes gets drained.
> > > > 
> > > > Possibly also relevant: The job->busy = false in job_exit(). The comment
> > > > there says it's a lie, but we might deadlock otherwise.
> > > > 
> > > > This problem will probably affect other callers, too, which drain a
> > > > subtree and then reasonably expect that nobody will modify the graph
> > > > until they end the drained section. So I think the problem that we need
> > > > to address is that jobs run their completion handlers even though they
> > > > are supposed to be paused by a drain.
> > > 
> > > Hmm. I always thought about drained section as about thing that stops
> > > IO requests, not other operations.. And we do graph modifications in
> > > drained section to avoid in-flight IO requests during graph
> > > modification.
> > 
> > Is there any use for an operation that only stops I/O, but doesn't
> > prevent graph changes?
> > 
> > I always understood it as a request to have exclusive access to a
> > subtree, so that nobody else would touch it.
> > 
> > > > I'm not saying that your graph modification locks are a bad idea, but
> > > > they are probably not a complete solution.
> > > > 
> > > 
> > > Hmm. What do you mean? It's of course not complete, as I didn't
> > > protect every graph modification procedure.. But if we do protect all
> > > such things and do graph modifications always under this mutex, it
> > > should work I think.
> > 
> > What I mean is that not only graph modifications can conflict with each
> > other, but most callers of drain_begin/end will probably not be prepared
> > for the graph changing under their feet, even if they don't actively
> > change the graph themselves.
> > 
> 
> Understand now.. Right.. Anyway, it looks as we need some kind of
> mutex. As the user of drained section of course wants to do graph
> modifications and even IO (for example update backing-link in
> metadata). The first thing that comes to mind is to protect all
> outer-most drained sections by global CoMutex and assert in
> drain_begin/drain_end that the mutex is locked.
> 
> Hmm, it also looks like RW-lock, and simple IO is "read" and something
> under drained section is "write".

In a way, drain _is_ the implementation of a lock. But as you've shown,
it's a buggy implementation.

What I was looking at was basically fixing the one instance of a bug
while leaving the design as it is.

My impression is that you're looking at this more radically and want to
rearchitecture the whole drain mechanism so that such bugs would be much
less likely to start with. Maybe this is a good idea, but it's probably
also a lot more effort.

Basically, for making use of more traditional locks, the naive approach
would be changing 

Re: [PATCH RFC 0/5] Fix accidental crash in iotest 30

2020-11-23 Thread Vladimir Sementsov-Ogievskiy

23.11.2020 13:10, Kevin Wolf wrote:

Am 20.11.2020 um 19:19 hat Vladimir Sementsov-Ogievskiy geschrieben:

20.11.2020 20:22, Kevin Wolf wrote:

Am 20.11.2020 um 17:43 hat Vladimir Sementsov-Ogievskiy geschrieben:

20.11.2020 19:36, Kevin Wolf wrote:

Am 20.11.2020 um 17:16 hat Vladimir Sementsov-Ogievskiy geschrieben:

Hi all!

As Peter recently noted, iotest 30 accidentally fails.

I found that Qemu crashes due to interleaving of graph-update
operations of parallel mirror and stream block-jobs.


I haven't found the time yet to properly look into this or your other
thread where you had a similar question, but there is one thing I'm
wondering: Why can the nested job even make progress and run its
completion handler?

When we modify the graph, we should have drained the subtree in
question, so in theory while one job finishes and modifies the graph,
there should be no way for the other job to make progress and get
interleaved - it shouldn't be able to start I/O requests and much less
to run its completion handler and modify the graph.

Are we missing drained sections somewhere or do they fail to achieve
what I think they should achieve?



It all looks like both jobs have reached their finish simultaneously.
So, all progress is done in both jobs. And they go concurrently to
completion procedures, which interleave. So, there is no more io through
blk, which is restricted by drained sections.


They can't be truly simultaneous because they run in the same thread.
During job completion, this is the main thread.


No, they are not truly simultaneous, but completions may interleave
through nested aio_poll loops.



However as soon as job_is_completed() returns true, it seems we're not
pausing the job any more when one of its nodes gets drained.

Possibly also relevant: The job->busy = false in job_exit(). The comment
there says it's a lie, but we might deadlock otherwise.

This problem will probably affect other callers, too, which drain a
subtree and then reasonably expect that nobody will modify the graph
until they end the drained section. So I think the problem that we need
to address is that jobs run their completion handlers even though they
are supposed to be paused by a drain.


Hmm. I always thought about drained section as about thing that stops
IO requests, not other operations.. And we do graph modifications in
drained section to avoid in-flight IO requests during graph
modification.


Is there any use for an operation that only stops I/O, but doesn't
prevent graph changes?

I always understood it as a request to have exclusive access to a
subtree, so that nobody else would touch it.


I'm not saying that your graph modification locks are a bad idea, but
they are probably not a complete solution.



Hmm. What do you mean? It's of course not complete, as I didn't
protect every graph modification procedure.. But if we do protect all
such things and do graph modifications always under this mutex, it
should work I think.


What I mean is that not only graph modifications can conflict with each
other, but most callers of drain_begin/end will probably not be prepared
for the graph changing under their feet, even if they don't actively
change the graph themselves.



Understand now.. Right.. Anyway, it looks as we need some kind of mutex. As the 
user of drained section of course wants to do graph modifications and even IO 
(for example update backing-link in metadata). The first thing that comes to 
mind is to protect all outer-most drained sections by global CoMutex and assert 
in drain_begin/drain_end that the mutex is locked.

Hmm, it also looks like RW-lock, and simple IO is "read" and something under drained 
section is "write".


--
Best regards,
Vladimir



Re: [PATCH RFC 0/5] Fix accidental crash in iotest 30

2020-11-23 Thread Kevin Wolf
Am 20.11.2020 um 19:19 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 20.11.2020 20:22, Kevin Wolf wrote:
> > Am 20.11.2020 um 17:43 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > 20.11.2020 19:36, Kevin Wolf wrote:
> > > > Am 20.11.2020 um 17:16 hat Vladimir Sementsov-Ogievskiy geschrieben:
> > > > > Hi all!
> > > > > 
> > > > > As Peter recently noted, iotest 30 accidentally fails.
> > > > > 
> > > > > I found that Qemu crashes due to interleaving of graph-update
> > > > > operations of parallel mirror and stream block-jobs.
> > > > 
> > > > I haven't found the time yet to properly look into this or your other
> > > > thread where you had a similar question, but there is one thing I'm
> > > > wondering: Why can the nested job even make progress and run its
> > > > completion handler?
> > > > 
> > > > When we modify the graph, we should have drained the subtree in
> > > > question, so in theory while one job finishes and modifies the graph,
> > > > there should be no way for the other job to make progress and get
> > > > interleaved - it shouldn't be able to start I/O requests and much less
> > > > to run its completion handler and modify the graph.
> > > > 
> > > > Are we missing drained sections somewhere or do they fail to achieve
> > > > what I think they should achieve?
> > > > 
> > > 
> > > It all looks like both jobs have reached their finish simultaneously.
> > > So, all progress is done in both jobs. And they go concurrently to
> > > completion procedures, which interleave. So, there is no more io through
> > > blk, which is restricted by drained sections.
> > 
> > They can't be truly simultaneous because they run in the same thread.
> > During job completion, this is the main thread.
> 
> No, they are not truly simultaneous, but completions may interleave
> through nested aio_poll loops.
> 
> > 
> > However as soon as job_is_completed() returns true, it seems we're not
> > pausing the job any more when one of its nodes gets drained.
> > 
> > Possibly also relevant: The job->busy = false in job_exit(). The comment
> > there says it's a lie, but we might deadlock otherwise.
> > 
> > This problem will probably affect other callers, too, which drain a
> subtree and then reasonably expect that nobody will modify the graph
> > until they end the drained section. So I think the problem that we need
> > to address is that jobs run their completion handlers even though they
> > are supposed to be paused by a drain.
> 
> Hmm. I always thought about drained section as about thing that stops
> IO requests, not other operations.. And we do graph modifications in
> drained section to avoid in-flight IO requests during graph
> modification.

Is there any use for an operation that only stops I/O, but doesn't
prevent graph changes?

I always understood it as a request to have exclusive access to a
subtree, so that nobody else would touch it.

> > I'm not saying that your graph modification locks are a bad idea, but
> > they are probably not a complete solution.
> > 
> 
> Hmm. What do you mean? It's of course not complete, as I didn't
> protect every graph modification procedure.. But if we do protect all
> such things and do graph modifications always under this mutex, it
> should work I think.

What I mean is that not only graph modifications can conflict with each
other, but most callers of drain_begin/end will probably not be prepared
for the graph changing under their feet, even if they don't actively
change the graph themselves.

Kevin