[Qemu-block] [PULL 2/7] block/nfs: cache allocated filesize for read-only files

2015-09-25 Thread Jeff Cody
From: Peter Lieven 

If the file is read-only it's not expected to grow, so
save the blocking call to nfs_fstat_async and use
the value saved at connection time. Also, importantly,
the monitor (and thus the main loop) will not hang
if block device info is queried and the NFS share
is unresponsive.

Signed-off-by: Peter Lieven 
Reviewed-by: Jeff Cody 
Reviewed-by: Max Reitz 
Message-id: 1440671441-7978-1-git-send-email...@kamp.de
Signed-off-by: Jeff Cody 
---
 block/nfs.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index 02eb4e4..887a98e 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -43,6 +43,7 @@ typedef struct NFSClient {
 int events;
 bool has_zero_init;
 AioContext *aio_context;
+blkcnt_t st_blocks;
 } NFSClient;
 
 typedef struct NFSRPC {
@@ -374,6 +375,7 @@ static int64_t nfs_client_open(NFSClient *client, const 
char *filename,
 }
 
 ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
+client->st_blocks = st.st_blocks;
 client->has_zero_init = S_ISREG(st.st_mode);
 goto out;
 fail:
@@ -464,6 +466,11 @@ static int64_t 
nfs_get_allocated_file_size(BlockDriverState *bs)
 NFSRPC task = {0};
 struct stat st;
 
+if (bdrv_is_read_only(bs) &&
+!(bs->open_flags & BDRV_O_NOCACHE)) {
+return client->st_blocks * 512;
+}
+
 task.st = &st;
 if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
 &task) != 0) {
@@ -484,6 +491,34 @@ static int nfs_file_truncate(BlockDriverState *bs, int64_t 
offset)
 return nfs_ftruncate(client->context, client->fh, offset);
 }
 
+/* Note that this will not re-establish a connection with the NFS server
+ * - it is effectively a NOP.  */
+static int nfs_reopen_prepare(BDRVReopenState *state,
+  BlockReopenQueue *queue, Error **errp)
+{
+NFSClient *client = state->bs->opaque;
+struct stat st;
+int ret = 0;
+
+if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) {
+error_setg(errp, "Cannot open a read-only mount as read-write");
+return -EACCES;
+}
+
+/* Update cache for read-only reopens */
+if (!(state->flags & BDRV_O_RDWR)) {
+ret = nfs_fstat(client->context, client->fh, &st);
+if (ret < 0) {
+error_setg(errp, "Failed to fstat file: %s",
+   nfs_get_error(client->context));
+return ret;
+}
+client->st_blocks = st.st_blocks;
+}
+
+return 0;
+}
+
 static BlockDriver bdrv_nfs = {
 .format_name= "nfs",
 .protocol_name  = "nfs",
@@ -499,6 +534,7 @@ static BlockDriver bdrv_nfs = {
 .bdrv_file_open = nfs_file_open,
 .bdrv_close = nfs_file_close,
 .bdrv_create= nfs_file_create,
+.bdrv_reopen_prepare= nfs_reopen_prepare,
 
 .bdrv_co_readv  = nfs_co_readv,
 .bdrv_co_writev = nfs_co_writev,
-- 
1.9.3




Re: [Qemu-block] [Qemu-devel] [PATCH] block: disable I/O limits at the beginning of bdrv_close()

2015-09-25 Thread Alberto Garcia
On Fri 25 Sep 2015 04:22:26 PM CEST, Eric Blake wrote:

>> Disabling I/O limits from a BDS also drains all pending throttled
>> requests, so it should be done at the beginning of bdrv_close() with
>> the rest of the bdrv_drain() calls before the BlockDriver is closed.
>
> Can this be abused? If I have a guest running in a cloud where the
> cloud provider has put severe throttling limits on me, but lets me
> hotplug to my heart's content, couldn't I just repeatedly plug/unplug
> the disk to get around the throttling (every time I unplug, all writes
> flush at full speed, then I immediately replug to start batching up a
> new set of writes).  In other words, shouldn't the draining still be
> throttled, to prevent my abuse?

I didn't think about this case, and I don't know how practical this is,
but note that bdrv_drain() (which is already at the beginning of
bdrv_close()) flushes the I/O queue explicitly bypassing the limits, so
other cases where a user can trigger a bdrv_drain() would also be
vulnerable to this.

Berto



[Qemu-block] [PULL 3/7] sheepdog: add reopen support

2015-09-25 Thread Jeff Cody
From: Liu Yuan 

With reopen supported, block-commit (and offline commit) is now supported for
image files whose base image uses the Sheepdog protocol driver.

Cc: qemu-de...@nongnu.org
Cc: Jeff Cody 
Cc: Kevin Wolf 
Cc: Stefan Hajnoczi 
Signed-off-by: Liu Yuan 
Message-id: 1440730438-24676-1-git-send-email-namei.u...@gmail.com
Signed-off-by: Jeff Cody 
---
 block/sheepdog.c | 76 
 1 file changed, 76 insertions(+)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 67ca788..255372eea 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -377,6 +377,11 @@ typedef struct BDRVSheepdogState {
 QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
 } BDRVSheepdogState;
 
+typedef struct BDRVSheepdogReopenState {
+int fd;
+int cache_flags;
+} BDRVSheepdogReopenState;
+
 static const char * sd_strerror(int err)
 {
 int i;
@@ -1486,6 +1491,68 @@ out:
 return ret;
 }
 
+static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
+ Error **errp)
+{
+BDRVSheepdogState *s = state->bs->opaque;
+BDRVSheepdogReopenState *re_s;
+int ret = 0;
+
+re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);
+
+re_s->cache_flags = SD_FLAG_CMD_CACHE;
+if (state->flags & BDRV_O_NOCACHE) {
+re_s->cache_flags = SD_FLAG_CMD_DIRECT;
+}
+
+re_s->fd = get_sheep_fd(s, errp);
+if (re_s->fd < 0) {
+ret = re_s->fd;
+return ret;
+}
+
+return ret;
+}
+
+static void sd_reopen_commit(BDRVReopenState *state)
+{
+BDRVSheepdogReopenState *re_s = state->opaque;
+BDRVSheepdogState *s = state->bs->opaque;
+
+if (s->fd) {
+aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+closesocket(s->fd);
+}
+
+s->fd = re_s->fd;
+s->cache_flags = re_s->cache_flags;
+
+g_free(state->opaque);
+state->opaque = NULL;
+
+return;
+}
+
+static void sd_reopen_abort(BDRVReopenState *state)
+{
+BDRVSheepdogReopenState *re_s = state->opaque;
+BDRVSheepdogState *s = state->bs->opaque;
+
+if (re_s == NULL) {
+return;
+}
+
+if (re_s->fd) {
+aio_set_fd_handler(s->aio_context, re_s->fd, NULL, NULL, NULL);
+closesocket(re_s->fd);
+}
+
+g_free(state->opaque);
+state->opaque = NULL;
+
+return;
+}
+
 static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
 Error **errp)
 {
@@ -2702,6 +2769,9 @@ static BlockDriver bdrv_sheepdog = {
 .instance_size  = sizeof(BDRVSheepdogState),
 .bdrv_needs_filename = true,
 .bdrv_file_open = sd_open,
+.bdrv_reopen_prepare= sd_reopen_prepare,
+.bdrv_reopen_commit = sd_reopen_commit,
+.bdrv_reopen_abort  = sd_reopen_abort,
 .bdrv_close = sd_close,
 .bdrv_create= sd_create,
 .bdrv_has_zero_init = bdrv_has_zero_init_1,
@@ -2735,6 +2805,9 @@ static BlockDriver bdrv_sheepdog_tcp = {
 .instance_size  = sizeof(BDRVSheepdogState),
 .bdrv_needs_filename = true,
 .bdrv_file_open = sd_open,
+.bdrv_reopen_prepare= sd_reopen_prepare,
+.bdrv_reopen_commit = sd_reopen_commit,
+.bdrv_reopen_abort  = sd_reopen_abort,
 .bdrv_close = sd_close,
 .bdrv_create= sd_create,
 .bdrv_has_zero_init = bdrv_has_zero_init_1,
@@ -2768,6 +2841,9 @@ static BlockDriver bdrv_sheepdog_unix = {
 .instance_size  = sizeof(BDRVSheepdogState),
 .bdrv_needs_filename = true,
 .bdrv_file_open = sd_open,
+.bdrv_reopen_prepare= sd_reopen_prepare,
+.bdrv_reopen_commit = sd_reopen_commit,
+.bdrv_reopen_abort  = sd_reopen_abort,
 .bdrv_close = sd_close,
 .bdrv_create= sd_create,
 .bdrv_has_zero_init = bdrv_has_zero_init_1,
-- 
1.9.3




[Qemu-block] [PULL 6/7] sheepdog: use per AIOCB dirty indexes for non overlapping requests

2015-09-25 Thread Jeff Cody
From: Hitoshi Mitake 

In the commit 96b14ff85acf, requests for overlapping areas are
serialized. However, it cannot handle a case of non overlapping
requests. In such a case, min_dirty_data_idx and max_dirty_data_idx
can be overwritten by the requests and an invalid inode update can
happen, e.g. when create(1, 2) and create(3, 4) are issued in
parallel.

This patch lets SheepdogAIOCB have dirty data indexes instead of
BDRVSheepdogState for avoiding the above situation.

This patch also does trivial renaming for better description:
overwrapping -> overlapping

Cc: Teruaki Ishizaki 
Cc: Vasiliy Tolstov 
Cc: Jeff Cody 
Signed-off-by: Hitoshi Mitake 
Tested-by: Vasiliy Tolstov 
Message-id: 1441076590-8015-2-git-send-email-mitake.hito...@lab.ntt.co.jp
Signed-off-by: Jeff Cody 
---
 block/sheepdog.c | 63 +++-
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 255372eea..08a09e9 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -318,7 +318,7 @@ enum AIOCBState {
 AIOCB_DISCARD_OBJ,
 };
 
-#define AIOCBOverwrapping(x, y) \
+#define AIOCBOverlapping(x, y) \
 (!(x->max_affect_data_idx < y->min_affect_data_idx  \
|| y->max_affect_data_idx < x->min_affect_data_idx))
 
@@ -342,6 +342,15 @@ struct SheepdogAIOCB {
 uint32_t min_affect_data_idx;
 uint32_t max_affect_data_idx;
 
+/*
+ * The difference between affect_data_idx and dirty_data_idx:
+ * affect_data_idx represents range of index of all request types.
+ * dirty_data_idx represents range of index updated by COW requests.
+ * dirty_data_idx is used for updating an inode object.
+ */
+uint32_t min_dirty_data_idx;
+uint32_t max_dirty_data_idx;
+
 QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
 };
 
@@ -351,9 +360,6 @@ typedef struct BDRVSheepdogState {
 
 SheepdogInode inode;
 
-uint32_t min_dirty_data_idx;
-uint32_t max_dirty_data_idx;
-
 char name[SD_MAX_VDI_LEN];
 bool is_snapshot;
 uint32_t cache_flags;
@@ -373,7 +379,7 @@ typedef struct BDRVSheepdogState {
 QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
 QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
 
-CoQueue overwrapping_queue;
+CoQueue overlapping_queue;
 QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
 } BDRVSheepdogState;
 
@@ -561,6 +567,9 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, 
QEMUIOVector *qiov,
 acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
   acb->nb_sectors * BDRV_SECTOR_SIZE) / 
object_size;
 
+acb->min_dirty_data_idx = UINT32_MAX;
+acb->max_dirty_data_idx = 0;
+
 return acb;
 }
 
@@ -824,8 +833,8 @@ static void coroutine_fn aio_read_response(void *opaque)
  */
 if (rsp.result == SD_RES_SUCCESS) {
 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
-s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
-s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
+acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
 }
 }
 break;
@@ -1471,13 +1480,11 @@ static int sd_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 
 memcpy(&s->inode, buf, sizeof(s->inode));
-s->min_dirty_data_idx = UINT32_MAX;
-s->max_dirty_data_idx = 0;
 
 bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
 pstrcpy(s->name, sizeof(s->name), vdi);
 qemu_co_mutex_init(&s->lock);
-qemu_co_queue_init(&s->overwrapping_queue);
+qemu_co_queue_init(&s->overlapping_queue);
 qemu_opts_del(opts);
 g_free(buf);
 return 0;
@@ -1989,16 +1996,16 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB 
*acb)
 AIOReq *aio_req;
 uint32_t offset, data_len, mn, mx;
 
-mn = s->min_dirty_data_idx;
-mx = s->max_dirty_data_idx;
+mn = acb->min_dirty_data_idx;
+mx = acb->max_dirty_data_idx;
 if (mn <= mx) {
 /* we need to update the vdi object. */
 offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
 mn * sizeof(s->inode.data_vdi_id[0]);
 data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
 
-s->min_dirty_data_idx = UINT32_MAX;
-s->max_dirty_data_idx = 0;
+acb->min_dirty_data_idx = UINT32_MAX;
+acb->max_dirty_data_idx = 0;
 
 iov.iov_base = &s->inode;
 iov.iov_len = sizeof(s->inode);
@@ -2224,12 +2231,12 @@ out:
 return 1;
 }
 
-static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB 
*aiocb)
+static 

[Qemu-block] [PULL 1/7] block/nfs: fix calculation of allocated file size

2015-09-25 Thread Jeff Cody
From: Peter Lieven 

st.st_blocks is always counted in 512 byte units. Do not
use st.st_blksize as the multiplier, which may be larger.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Lieven 
Reviewed-by: Max Reitz 
Reviewed-by: Jeff Cody 
Message-id: 1440067607-14547-1-git-send-email...@kamp.de
Signed-off-by: Jeff Cody 
---
 block/nfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nfs.c b/block/nfs.c
index c026ff6..02eb4e4 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -475,7 +475,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState 
*bs)
 aio_poll(client->aio_context, true);
 }
 
-return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize);
+return (task.ret < 0 ? task.ret : st.st_blocks * 512);
 }
 
 static int nfs_file_truncate(BlockDriverState *bs, int64_t offset)
-- 
1.9.3




Re: [Qemu-block] [Qemu-devel] [PATCH] block: disable I/O limits at the beginning of bdrv_close()

2015-09-25 Thread Eric Blake
On 09/25/2015 07:41 AM, Alberto Garcia wrote:
> Disabling I/O limits from a BDS also drains all pending throttled
> requests, so it should be done at the beginning of bdrv_close() with
> the rest of the bdrv_drain() calls before the BlockDriver is closed.

Can this be abused? If I have a guest running in a cloud where the cloud
provider has put severe throttling limits on me, but lets me hotplug to
my heart's content, couldn't I just repeatedly plug/unplug the disk to
get around the throttling (every time I unplug, all writes flush at full
speed, then I immediately replug to start batching up a new set of
writes).  In other words, shouldn't the draining still be throttled, to
prevent my abuse?


-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PULL 0/7] Block patches

2015-09-25 Thread Jeff Cody
The following changes since commit eb9d0ea063fc7bdfab76b84085602a9e48d13ec7:

  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20150924' 
into staging (2015-09-24 01:32:11 +0100)

are available in the git repository at:


  g...@github.com:codyprime/qemu-kvm-jtc.git tags/block-pull-request

for you to fetch changes up to e6fd57ea297ec3aad32b24090c5d3757a99df3fe:

  sheepdog: refine discard support (2015-09-25 10:25:19 -0400)


Block patches


Hitoshi Mitake (2):
  sheepdog: use per AIOCB dirty indexes for non overlapping requests
  sheepdog: refine discard support

Liu Yuan (1):
  sheepdog: add reopen support

Peter Lieven (2):
  block/nfs: fix calculation of allocated file size
  block/nfs: cache allocated filesize for read-only files

Wen Congyang (2):
  block: Introduce a new API bdrv_co_no_copy_on_readv()
  Backup: don't do copy-on-read in before_write_notifier

 block/backup.c|  20 --
 block/io.c|  12 +++-
 block/nfs.c   |  38 +++-
 block/sheepdog.c  | 168 +++---
 include/block/block.h |   9 ++-
 trace-events  |   1 +
 6 files changed, 200 insertions(+), 48 deletions(-)

-- 
1.9.3




[Qemu-block] [PULL 5/7] Backup: don't do copy-on-read in before_write_notifier

2015-09-25 Thread Jeff Cody
From: Wen Congyang 

We will copy data in before_write_notifier to do backup.
It is a nested I/O request, so we cannot do copy-on-read.

The steps to reproduce it:
1. -drive copy-on-read=on,...  // qemu option
2. drive_backup -f disk0 /path_to_backup.img // monitor command

Signed-off-by: Wen Congyang 
Tested-by: Jeff Cody 
Message-id: 1441682913-14320-3-git-send-email-we...@cn.fujitsu.com
Signed-off-by: Jeff Cody 
---
 block/backup.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 965654d..5696431 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -89,7 +89,8 @@ static void cow_request_end(CowRequest *req)
 
 static int coroutine_fn backup_do_cow(BlockDriverState *bs,
   int64_t sector_num, int nb_sectors,
-  bool *error_is_read)
+  bool *error_is_read,
+  bool is_write_notifier)
 {
 BackupBlockJob *job = (BackupBlockJob *)bs->job;
 CowRequest cow_request;
@@ -129,8 +130,14 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
 iov.iov_len = n * BDRV_SECTOR_SIZE;
 qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
-&bounce_qiov);
+if (is_write_notifier) {
+ret = bdrv_co_no_copy_on_readv(bs,
+   start * BACKUP_SECTORS_PER_CLUSTER,
+   n, &bounce_qiov);
+} else {
+ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
+&bounce_qiov);
+}
 if (ret < 0) {
 trace_backup_do_cow_read_fail(job, start, ret);
 if (error_is_read) {
@@ -190,7 +197,7 @@ static int coroutine_fn backup_before_write_notify(
 assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
+return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
 }
 
 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -303,7 +310,8 @@ static int coroutine_fn 
backup_run_incremental(BackupBlockJob *job)
 return ret;
 }
 ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
-BACKUP_SECTORS_PER_CLUSTER, 
&error_is_read);
+BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
+false);
 if ((ret < 0) &&
 backup_error_action(job, error_is_read, -ret) ==
 BLOCK_ERROR_ACTION_REPORT) {
@@ -408,7 +416,7 @@ static void coroutine_fn backup_run(void *opaque)
 }
 /* FULL sync mode we copy the whole drive. */
 ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
-BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
 if (ret < 0) {
 /* Depending on error action, fail now or retry cluster */
 BlockErrorAction action =
-- 
1.9.3




[Qemu-block] [PULL 7/7] sheepdog: refine discard support

2015-09-25 Thread Jeff Cody
From: Hitoshi Mitake 

This patch refines discard support of the sheepdog driver. The
existing discard mechanism was implemented on SD_OP_DISCARD_OBJ, which
was introduced before fine grained reference counting on newer
sheepdog. It doesn't care about relations of snapshots and clones and
discards objects unconditionally.

With this patch, the driver just updates the inode object to update the
reference. Removing the object is done on the sheep process side.

Cc: Teruaki Ishizaki 
Cc: Vasiliy Tolstov 
Cc: Jeff Cody 
Signed-off-by: Hitoshi Mitake 
Tested-by: Vasiliy Tolstov 
Message-id: 1441076590-8015-3-git-send-email-mitake.hito...@lab.ntt.co.jp
Signed-off-by: Jeff Cody 
---
 block/sheepdog.c | 29 -
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 08a09e9..e7e58b7 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -28,7 +28,6 @@
 #define SD_OP_READ_OBJ   0x02
 #define SD_OP_WRITE_OBJ  0x03
 /* 0x04 is used internally by Sheepdog */
-#define SD_OP_DISCARD_OBJ0x05
 
 #define SD_OP_NEW_VDI0x11
 #define SD_OP_LOCK_VDI   0x12
@@ -861,10 +860,6 @@ static void coroutine_fn aio_read_response(void *opaque)
 rsp.result = SD_RES_SUCCESS;
 s->discard_supported = false;
 break;
-case SD_RES_SUCCESS:
-idx = data_oid_to_idx(aio_req->oid);
-s->inode.data_vdi_id[idx] = 0;
-break;
 default:
 break;
 }
@@ -1179,7 +1174,13 @@ static void coroutine_fn 
add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 hdr.flags = SD_FLAG_CMD_WRITE | flags;
 break;
 case AIOCB_DISCARD_OBJ:
-hdr.opcode = SD_OP_DISCARD_OBJ;
+hdr.opcode = SD_OP_WRITE_OBJ;
+hdr.flags = SD_FLAG_CMD_WRITE | flags;
+s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
+offset = offsetof(SheepdogInode,
+  data_vdi_id[data_oid_to_idx(oid)]);
+oid = vid_to_vdi_oid(s->inode.vdi_id);
+wlen = datalen = sizeof(uint32_t);
 break;
 }
 
@@ -2214,7 +2215,9 @@ static int coroutine_fn sd_co_rw_vector(void *p)
 }
 
 aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
-old_oid, done);
+old_oid,
+acb->aiocb_type == AIOCB_DISCARD_OBJ ?
+0 : done);
 QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 
 add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
@@ -2650,15 +2653,23 @@ static coroutine_fn int sd_co_discard(BlockDriverState 
*bs, int64_t sector_num,
   int nb_sectors)
 {
 SheepdogAIOCB *acb;
-QEMUIOVector dummy;
 BDRVSheepdogState *s = bs->opaque;
 int ret;
+QEMUIOVector discard_iov;
+struct iovec iov;
+uint32_t zero = 0;
 
 if (!s->discard_supported) {
 return 0;
 }
 
-acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors);
+memset(&discard_iov, 0, sizeof(discard_iov));
+memset(&iov, 0, sizeof(iov));
+iov.iov_base = &zero;
+iov.iov_len = sizeof(zero);
+discard_iov.iov = &iov;
+discard_iov.niov = 1;
+acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors);
 acb->aiocb_type = AIOCB_DISCARD_OBJ;
 acb->aio_done_func = sd_finish_aiocb;
 
-- 
1.9.3




Re: [Qemu-block] ide-test fails on PPC64 big-endian host

2015-09-25 Thread John Snow


On 09/25/2015 03:20 PM, Peter Maydell wrote:
> Hi. I was looking at adding ppc64be to the set of machines I do build
> tests on before merging patches, but right now "make check" fails on
> this host.
> 
> ERROR:/home/pm215/qemu/tests/ide-test.c:721:cdrom_pio_impl: assertion
> failed ((data) & (DRQ | DRDY) == (DRQ | DRDY)): (0x0040 ==
> 0x0048)
> GTester: last random seed: R02S58b8c55d2bcc2ad0ddd605d5ce8483ee
> **
> ERROR:/home/pm215/qemu/tests/ide-test.c:721:cdrom_pio_impl: assertion
> failed ((data) & (DRQ | DRDY) == (DRQ | DRDY)): (0x0040 ==
> 0x0048)
> GTester: last random seed: R02S3bf67d6406a1f4ea5d8ca81bff345065
> **
> ERROR:/home/pm215/qemu/tests/ide-test.c:788:test_cdrom_dma: assertion
> failed (memcmp(pattern, rx, len) == 0): (1 == 0)
> GTester: last random seed: R02S2732ddb1755a1620a021eb6c59bd6281
> 
> The obvious guess is that something in the IDE code or its test
> harness has an accidental little-endian dependency. Would anybody
> care to investigate? :-)
> 
> thanks
> -- PMM
> 

Sigh, sorry. I'm a habitual offender of breaking ppcBE with my IDE
tests. I'll fix it.

:(



Re: [Qemu-block] [PULL 0/7] Block patches

2015-09-25 Thread Peter Maydell
On 25 September 2015 at 08:50, Jeff Cody  wrote:
> The following changes since commit eb9d0ea063fc7bdfab76b84085602a9e48d13ec7:
>
>   Merge remote-tracking branch 
> 'remotes/pmaydell/tags/pull-target-arm-20150924' into staging (2015-09-24 
> 01:32:11 +0100)
>
> are available in the git repository at:
>
>
>   g...@github.com:codyprime/qemu-kvm-jtc.git tags/block-pull-request
>
> for you to fetch changes up to e6fd57ea297ec3aad32b24090c5d3757a99df3fe:
>
>   sheepdog: refine discard support (2015-09-25 10:25:19 -0400)
>
> 
> Block patches
> 
>

Applied, thanks.

-- PMM



[Qemu-block] [PATCH v10 04/10] block: make bdrv_put_ref_bh_schedule() as a public API

2015-09-25 Thread Wen Congyang
Signed-off-by: Wen Congyang 
---
 block.c   | 25 +
 blockdev.c| 37 ++---
 include/block/block.h |  1 +
 3 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/block.c b/block.c
index 328c52f..f9a985c 100644
--- a/block.c
+++ b/block.c
@@ -3597,6 +3597,31 @@ void bdrv_unref(BlockDriverState *bs)
 }
 }
 
+typedef struct {
+QEMUBH *bh;
+BlockDriverState *bs;
+} BDRVPutRefBH;
+
+static void bdrv_put_ref_bh(void *opaque)
+{
+BDRVPutRefBH *s = opaque;
+
+bdrv_unref(s->bs);
+qemu_bh_delete(s->bh);
+g_free(s);
+}
+
+/* Release a BDS reference in a BH */
+void bdrv_put_ref_bh_schedule(BlockDriverState *bs)
+{
+BDRVPutRefBH *s;
+
+s = g_new(BDRVPutRefBH, 1);
+s->bh = qemu_bh_new(bdrv_put_ref_bh, s);
+s->bs = bs;
+qemu_bh_schedule(s->bh);
+}
+
 struct BdrvOpBlocker {
 Error *reason;
 QLIST_ENTRY(BdrvOpBlocker) list;
diff --git a/blockdev.c b/blockdev.c
index 3289cc3..11bc992 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -278,37 +278,6 @@ static void bdrv_format_print(void *opaque, const char 
*name)
 error_printf(" %s", name);
 }
 
-typedef struct {
-QEMUBH *bh;
-BlockDriverState *bs;
-} BDRVPutRefBH;
-
-static void bdrv_put_ref_bh(void *opaque)
-{
-BDRVPutRefBH *s = opaque;
-
-bdrv_unref(s->bs);
-qemu_bh_delete(s->bh);
-g_free(s);
-}
-
-/*
- * Release a BDS reference in a BH
- *
- * It is not safe to use bdrv_unref() from a callback function when the callers
- * still need the BlockDriverState.  In such cases we schedule a BH to release
- * the reference.
- */
-static void bdrv_put_ref_bh_schedule(BlockDriverState *bs)
-{
-BDRVPutRefBH *s;
-
-s = g_new(BDRVPutRefBH, 1);
-s->bh = qemu_bh_new(bdrv_put_ref_bh, s);
-s->bs = bs;
-qemu_bh_schedule(s->bh);
-}
-
 static int parse_block_error_action(const char *buf, bool is_read, Error 
**errp)
 {
 if (!strcmp(buf, "ignore")) {
@@ -2534,6 +2503,12 @@ static void block_job_cb(void *opaque, int ret)
 block_job_event_completed(bs->job, msg);
 }
 
+
+/*
+ * It is not safe to use bdrv_unref() from a callback function when the
+ * callers still need the BlockDriverState. In such cases we schedule
+ * a BH to release the reference.
+ */
 bdrv_put_ref_bh_schedule(bs);
 }
 
diff --git a/include/block/block.h b/include/block/block.h
index e4be19f..5154388 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -505,6 +505,7 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild 
*child);
 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
  BlockDriverState *child_bs,
  const BdrvChildRole *child_role);
+void bdrv_put_ref_bh_schedule(BlockDriverState *bs);
 
 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
-- 
2.4.3




[Qemu-block] [PATCH v10 03/10] Allow creating backup jobs when opening BDS

2015-09-25 Thread Wen Congyang
When opening BDS, we need to create backup jobs for
image-fleecing.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Jeff Cody 
---
 block/Makefile.objs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 58ef2ef..fa05f37 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -22,10 +22,10 @@ block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
 block-obj-y += write-threshold.o
+block-obj-y += backup.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
-common-obj-y += backup.o
 
 iscsi.o-cflags := $(LIBISCSI_CFLAGS)
 iscsi.o-libs   := $(LIBISCSI_LIBS)
-- 
2.4.3




[Qemu-block] [PATCH v10 06/10] Add new block driver interfaces to control block replication

2015-09-25 Thread Wen Congyang
Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Cc: Luiz Capitulino 
Cc: Michael Roth 
Reviewed-by: Paolo Bonzini 
---
 block.c   | 43 +++
 include/block/block.h |  5 +
 include/block/block_int.h | 14 ++
 qapi/block-core.json  | 13 +
 4 files changed, 75 insertions(+)

diff --git a/block.c b/block.c
index f9a985c..5cb916b 100644
--- a/block.c
+++ b/block.c
@@ -4253,3 +4253,46 @@ void bdrv_del_child(BlockDriverState *parent_bs, 
BlockDriverState *child_bs,
 
 parent_bs->drv->bdrv_del_child(parent_bs, child_bs, errp);
 }
+
+void bdrv_start_replication(BlockDriverState *bs, ReplicationMode mode,
+Error **errp)
+{
+BlockDriver *drv = bs->drv;
+
+if (drv && drv->bdrv_start_replication) {
+drv->bdrv_start_replication(bs, mode, errp);
+} else if (bs->file) {
+bdrv_start_replication(bs->file, mode, errp);
+} else {
+error_setg(errp, "The BDS %s doesn't support starting block"
+   " replication", bs->filename);
+}
+}
+
+void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp)
+{
+BlockDriver *drv = bs->drv;
+
+if (drv && drv->bdrv_do_checkpoint) {
+drv->bdrv_do_checkpoint(bs, errp);
+} else if (bs->file) {
+bdrv_do_checkpoint(bs->file, errp);
+} else {
+error_setg(errp, "The BDS %s doesn't support block checkpoint",
+   bs->filename);
+}
+}
+
+void bdrv_stop_replication(BlockDriverState *bs, bool failover, Error **errp)
+{
+BlockDriver *drv = bs->drv;
+
+if (drv && drv->bdrv_stop_replication) {
+drv->bdrv_stop_replication(bs, failover, errp);
+} else if (bs->file) {
+bdrv_stop_replication(bs->file, failover, errp);
+} else {
+error_setg(errp, "The BDS %s doesn't support stopping block"
+   " replication", bs->filename);
+}
+}
diff --git a/include/block/block.h b/include/block/block.h
index 5154388..40ef59f 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -611,4 +611,9 @@ void bdrv_add_child(BlockDriverState *parent, 
BlockDriverState *child,
 void bdrv_del_child(BlockDriverState *parent, BlockDriverState *child,
 Error **errp);
 
+void bdrv_start_replication(BlockDriverState *bs, ReplicationMode mode,
+Error **errp);
+void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp);
+void bdrv_stop_replication(BlockDriverState *bs, bool failover, Error **errp);
+
 #endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 636d0c9..ee4b8fa 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -293,6 +293,20 @@ struct BlockDriver {
 void (*bdrv_del_child)(BlockDriverState *parent, BlockDriverState *child,
Error **errp);
 
+void (*bdrv_start_replication)(BlockDriverState *bs, ReplicationMode mode,
+   Error **errp);
+/* Drop Disk buffer when doing checkpoint. */
+void (*bdrv_do_checkpoint)(BlockDriverState *bs, Error **errp);
+/*
+ * After failover, we should flush Disk buffer into secondary disk
+ * and stop block replication.
+ *
+ * If the guest is shut down, we should drop Disk buffer and stop
+ * block replication.
+ */
+void (*bdrv_stop_replication)(BlockDriverState *bs, bool failover,
+  Error **errp);
+
 QLIST_ENTRY(BlockDriver) list;
 };
 
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 000ae47..d5a177b 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1797,6 +1797,19 @@
 '*read-pattern': 'QuorumReadPattern' } }
 
 ##
+# @ReplicationMode
+#
+# An enumeration of replication modes.
+#
+# @primary: Primary mode, the vm's state will be sent to secondary QEMU.
+#
+# @secondary: Secondary mode, receive the vm's state from primary QEMU.
+#
+# Since: 2.5
+##
+{ 'enum' : 'ReplicationMode', 'data' : [ 'primary', 'secondary' ] }
+
+##
 # @BlockdevOptions
 #
 # Options for creating a block device.
-- 
2.4.3




[Qemu-block] [PATCH v10 07/10] quorum: implement block driver interfaces for block replication

2015-09-25 Thread Wen Congyang
Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Reviewed-by: Alberto Garcia 
---
 block/quorum.c | 77 ++
 1 file changed, 77 insertions(+)

diff --git a/block/quorum.c b/block/quorum.c
index 111a57b..d647ab4 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -85,6 +85,8 @@ typedef struct BDRVQuorumState {
 */
 
 QuorumReadPattern read_pattern;
+
+int replication_index; /* store which child supports block replication */
 } BDRVQuorumState;
 
 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -945,6 +947,7 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 
 g_free(opened);
+s->replication_index = -1;
 goto exit;
 
 close_exit:
@@ -1093,6 +1096,76 @@ static void quorum_refresh_filename(BlockDriverState *bs)
 bs->full_open_options = opts;
 }
 
+static void quorum_start_replication(BlockDriverState *bs, ReplicationMode 
mode,
+ Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+int count = 0, i, index;
+Error *local_err = NULL;
+
+/*
+ * TODO: support REPLICATION_MODE_SECONDARY if we allow secondary
+ * QEMU becoming primary QEMU.
+ */
+if (mode != REPLICATION_MODE_PRIMARY) {
+error_setg(errp, "The replication mode for quorum should be 
'primary'");
+return;
+}
+
+if (s->read_pattern != QUORUM_READ_PATTERN_FIFO) {
+error_setg(errp, "Block replication needs read pattern 'fifo'");
+return;
+}
+
+for (i = 0; i < s->num_children; i++) {
+bdrv_start_replication(s->bs[i], mode, &local_err);
+if (local_err) {
+error_free(local_err);
+local_err = NULL;
+} else {
+count++;
+index = i;
+}
+}
+
+if (count == 0) {
+error_setg(errp, "No child supports block replication");
+} else if (count > 1) {
+for (i = 0; i < s->num_children; i++) {
+bdrv_stop_replication(s->bs[i], false, NULL);
+}
+error_setg(errp, "Too many children support block replication");
+} else {
+s->replication_index = index;
+}
+}
+
+static void quorum_do_checkpoint(BlockDriverState *bs, Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+
+if (s->replication_index < 0) {
+error_setg(errp, "Block replication is not running");
+return;
+}
+
+bdrv_do_checkpoint(s->bs[s->replication_index], errp);
+}
+
+static void quorum_stop_replication(BlockDriverState *bs, bool failover,
+Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+
+if (s->replication_index < 0) {
+error_setg(errp, "Block replication is not running");
+return;
+}
+
+bdrv_stop_replication(s->bs[s->replication_index], failover, errp);
+s->replication_index = -1;
+}
+
 static BlockDriver bdrv_quorum = {
 .format_name= "quorum",
 .protocol_name  = "quorum",
@@ -1119,6 +1192,10 @@ static BlockDriver bdrv_quorum = {
 
 .is_filter  = true,
 .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
+
+.bdrv_start_replication = quorum_start_replication,
+.bdrv_do_checkpoint = quorum_do_checkpoint,
+.bdrv_stop_replication  = quorum_stop_replication,
 };
 
 static void bdrv_quorum_init(void)
-- 
2.4.3




Re: [Qemu-block] [Qemu-devel] [PATCH 11/16] Add new block driver interfaces to control block replication

2015-09-25 Thread Wen Congyang
On 09/03/2015 12:33 AM, Eric Blake wrote:
> On 09/02/2015 02:51 AM, Wen Congyang wrote:
>> Signed-off-by: Wen Congyang 
>> Signed-off-by: zhanghailiang 
>> Signed-off-by: Gonglei 
>> Cc: Luiz Capitulino 
>> Cc: Michael Roth 
>> Reviewed-by: Paolo Bonzini 
>> ---
>>  block.c   | 43 +++
>>  include/block/block.h |  5 +
>>  include/block/block_int.h | 14 ++
>>  qapi/block-core.json  | 15 +++
>>  4 files changed, 77 insertions(+)
>>
> 
> Just an interface review for now:
> 
>> +++ b/qapi/block-core.json
>> @@ -1810,6 +1810,21 @@
>>'data': { '*export': 'str' } }
>>  
>>  ##
>> +# @ReplicationMode
>> +#
>> +# An enumeration of replication modes.
>> +#
>> +# @unprotected: Replication is not started or after failover.
> 
> Maybe:
> 
> Replication is either not started, or has experienced failover.

This is internal state; the mode is used to tell QEMU
which side it is on.

Thanks
Wen Congyang

> 
>> +#
>> +# @primary: Primary mode, the vm's state will be sent to secondary QEMU.
>> +#
>> +# @secondary: Secondary mode, receive the vm's state from primary QEMU.
>> +#
>> +# Since: 2.4
> 
> You've missed 2.4; this should be 2.5.
> 
>> +##
>> +{ 'enum' : 'ReplicationMode', 'data' : [ 'primary', 'secondary' ] }
> 
> Where is 'unprotected' in this list?
> 




[Qemu-block] [PATCH v10 02/10] Backup: clear all bitmap when doing block checkpoint

2015-09-25 Thread Wen Congyang
Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Reviewed-by: Jeff Cody 
---
 block/backup.c   | 14 ++
 blockjob.c   | 11 +++
 include/block/blockjob.h | 12 
 3 files changed, 37 insertions(+)

diff --git a/block/backup.c b/block/backup.c
index c61e4c3..5e5995e 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -214,11 +214,25 @@ static void backup_iostatus_reset(BlockJob *job)
 }
 }
 
+static void backup_do_checkpoint(BlockJob *job, Error **errp)
+{
+BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+
+if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
+error_setg(errp, "The backup job only supports block checkpoint in"
+   " sync=none mode");
+return;
+}
+
+hbitmap_reset_all(backup_job->bitmap);
+}
+
 static const BlockJobDriver backup_job_driver = {
 .instance_size  = sizeof(BackupBlockJob),
 .job_type   = BLOCK_JOB_TYPE_BACKUP,
 .set_speed  = backup_set_speed,
 .iostatus_reset = backup_iostatus_reset,
+.do_checkpoint  = backup_do_checkpoint,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
diff --git a/blockjob.c b/blockjob.c
index ca4be94..ea4c44a 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -405,3 +405,14 @@ void block_job_defer_to_main_loop(BlockJob *job,
 
 qemu_bh_schedule(data->bh);
 }
+
+void block_job_do_checkpoint(BlockJob *job, Error **errp)
+{
+if (!job->driver->do_checkpoint) {
+error_setg(errp, "The job %s doesn't support block checkpoint",
+   BlockJobType_lookup[job->driver->job_type]);
+return;
+}
+
+job->driver->do_checkpoint(job, errp);
+}
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index dd9d5e6..0b4f386 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -50,6 +50,9 @@ typedef struct BlockJobDriver {
  * manually.
  */
 void (*complete)(BlockJob *job, Error **errp);
+
+/** Optional callback for job types that support checkpoint. */
+void (*do_checkpoint)(BlockJob *job, Error **errp);
 } BlockJobDriver;
 
 /**
@@ -356,4 +359,13 @@ void block_job_defer_to_main_loop(BlockJob *job,
   BlockJobDeferToMainLoopFn *fn,
   void *opaque);
 
+/**
+ * block_job_do_checkpoint:
+ * @job: The job.
+ * @errp: Error object.
+ *
+ * Do block checkpoint on the specified job.
+ */
+void block_job_do_checkpoint(BlockJob *job, Error **errp);
+
 #endif
-- 
2.4.3




[Qemu-block] [PATCH v10 10/10] Add a new API to start/stop replication, do checkpoint to all BDSes

2015-09-25 Thread Wen Congyang
Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block.c   | 83 +++
 include/block/block.h |  4 +++
 2 files changed, 87 insertions(+)

diff --git a/block.c b/block.c
index 5cb916b..5891c4d 100644
--- a/block.c
+++ b/block.c
@@ -4296,3 +4296,86 @@ void bdrv_stop_replication(BlockDriverState *bs, bool 
failover, Error **errp)
" replication", bs->filename);
 }
 }
+
+void bdrv_start_replication_all(ReplicationMode mode, Error **errp)
+{
+BlockDriverState *bs = NULL, *temp = NULL;
+Error *local_err = NULL;
+
+while ((bs = bdrv_next(bs))) {
+if (!QLIST_EMPTY(&bs->parents)) {
+/* It is not top BDS */
+continue;
+}
+
+if (bdrv_is_read_only(bs) || !bdrv_is_inserted(bs)) {
+continue;
+}
+
+bdrv_start_replication(bs, mode, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+goto fail;
+}
+}
+
+return;
+
+fail:
+while ((temp = bdrv_next(temp)) && bs != temp) {
+bdrv_stop_replication(temp, false, NULL);
+}
+}
+
+void bdrv_do_checkpoint_all(Error **errp)
+{
+BlockDriverState *bs = NULL;
+Error *local_err = NULL;
+
+while ((bs = bdrv_next(bs))) {
+if (!QLIST_EMPTY(&bs->parents)) {
+/* It is not top BDS */
+continue;
+}
+
+if (bdrv_is_read_only(bs) || !bdrv_is_inserted(bs)) {
+continue;
+}
+
+bdrv_do_checkpoint(bs, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+return;
+}
+}
+}
+
+void bdrv_stop_replication_all(bool failover, Error **errp)
+{
+BlockDriverState *bs = NULL;
+Error *local_err = NULL;
+
+while ((bs = bdrv_next(bs))) {
+if (!QLIST_EMPTY(&bs->parents)) {
+/* It is not top BDS */
+continue;
+}
+
+if (bdrv_is_read_only(bs) || !bdrv_is_inserted(bs)) {
+continue;
+}
+
+bdrv_stop_replication(bs, failover, &local_err);
+if (!errp) {
+/*
+ * The caller doesn't care the result, they just
+ * want to stop all block's replication.
+ */
+continue;
+}
+if (local_err) {
+error_propagate(errp, local_err);
+return;
+}
+}
+}
diff --git a/include/block/block.h b/include/block/block.h
index 40ef59f..eb6a4a2 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -616,4 +616,8 @@ void bdrv_start_replication(BlockDriverState *bs, 
ReplicationMode mode,
 void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp);
 void bdrv_stop_replication(BlockDriverState *bs, bool failover, Error **errp);
 
+void bdrv_start_replication_all(ReplicationMode mode, Error **errp);
+void bdrv_do_checkpoint_all(Error **errp);
+void bdrv_stop_replication_all(bool failover, Error **errp);
+
 #endif
-- 
2.4.3




[Qemu-block] [PATCH v10 09/10] support replication driver in blockdev-add

2015-09-25 Thread Wen Congyang
Signed-off-by: Wen Congyang 
---
 qapi/block-core.json | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index d5a177b..0907a72 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -219,7 +219,7 @@
 #   'qcow2', 'raw', 'tftp', 'vdi', 'vmdk', 'vpc', 'vvfat'
 #   2.2: 'archipelago' added, 'cow' dropped
 #   2.3: 'host_floppy' deprecated
-#   2.5: 'host_floppy' dropped
+#   2.5: 'host_floppy' dropped, 'replication' added
 #
 # @backing_file: #optional the name of the backing file (for copy-on-write)
 #
@@ -1375,6 +1375,7 @@
 # Drivers that are supported in block device operations.
 #
 # @host_device, @host_cdrom: Since 2.1
+# @replication: Since 2.5
 #
 # Since: 2.0
 ##
@@ -1382,8 +1383,8 @@
   'data': [ 'archipelago', 'blkdebug', 'blkverify', 'bochs', 'cloop',
 'dmg', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device',
 'http', 'https', 'null-aio', 'null-co', 'parallels',
-'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'tftp', 'vdi', 'vhdx',
-'vmdk', 'vpc', 'vvfat' ] }
+'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'replication',
+'tftp', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
 
 ##
 # @BlockdevOptionsBase
@@ -1810,6 +1811,19 @@
 { 'enum' : 'ReplicationMode', 'data' : [ 'primary', 'secondary' ] }
 
 ##
+# @BlockdevOptionsReplication
+#
+# Driver specific block device options for replication
+#
+# @mode: the replication mode
+#
+# Since: 2.5
+##
+{ 'struct': 'BlockdevOptionsReplication',
+  'base': 'BlockdevOptionsGenericFormat',
+  'data': { 'mode': 'ReplicationMode'  } }
+
+##
 # @BlockdevOptions
 #
 # Options for creating a block device.
@@ -1846,6 +1860,7 @@
   'quorum': 'BlockdevOptionsQuorum',
   'raw':'BlockdevOptionsGenericFormat',
 # TODO rbd: Wait for structured options
+  'replication':'BlockdevOptionsReplication',
 # TODO sheepdog: Wait for structured options
 # TODO ssh: Should take InetSocketAddress for 'host'?
   'tftp':   'BlockdevOptionsFile',
-- 
2.4.3




[Qemu-block] [PATCH v10 00/10] Block replication for continuous checkpoints

2015-09-25 Thread Wen Congyang
Block replication is a very important feature which is used for
continuous checkpoints(for example: COLO).

You can get the detailed information about block replication from here:
http://wiki.qemu.org/Features/BlockReplication

Usage:
Please refer to docs/block-replication.txt

This patch series is based on the following patch series:
1. http://lists.nongnu.org/archive/html/qemu-devel/2015-09/msg05514.html
2. http://lists.nongnu.org/archive/html/qemu-devel/2015-09/msg04900.html

You can get the patch here:
https://github.com/coloft/qemu/tree/wency/block-replication-v10

You can get the patch with framework here:
https://github.com/coloft/qemu/tree/wency/colo_framework_v9.5

TODO:
1. Continuous block replication. It will be started after basic functions
   are accepted.

Change Log:
V10:
1. Use blockdev-remove-medium and blockdev-insert-medium to replace backing
   reference.
2. Address the comments from Eric Blake
V9:
1. Update the error messages
2. Rebase to the newest qemu
3. Split child add/delete support. These patches are sent in another patchset.
V8:
1. Address Alberto Garcia's comments
V7:
1. Implement adding/removing quorum child. Remove the option non-connect.
2. Simplify the backing reference option according to Stefan Hajnoczi's suggestion
V6:
1. Rebase to the newest qemu.
V5:
1. Address the comments from Gong Lei
2. Speed the failover up. The secondary vm can take over very quickly even
   if there are too many I/O requests.
V4:
1. Introduce a new driver replication to avoid touch nbd and qcow2.
V3:
1: use error_setg() instead of error_set()
2. Add a new block job API
3. Active disk, hidden disk and nbd target uses the same AioContext
4. Add a testcase to test new hbitmap API
V2:
1. Redesign the secondary qemu(use image-fleecing)
2. Use Error objects to return error message
3. Address the comments from Max Reitz and Eric Blake

Wen Congyang (10):
  allow writing to the backing file
  Backup: clear all bitmap when doing block checkpoint
  Allow creating backup jobs when opening BDS
  block: make bdrv_put_ref_bh_schedule() as a public API
  docs: block replication's description
  Add new block driver interfaces to control block replication
  quorum: implement block driver interfaces for block replication
  Implement new driver for block replication
  support replication driver in blockdev-add
  Add a new API to start/stop replication, do checkpoint to all BDSes

 block.c| 192 +-
 block/Makefile.objs|   3 +-
 block/backup.c |  14 ++
 block/quorum.c |  77 
 block/replication.c| 471 +
 blockdev.c |  37 +---
 blockjob.c |  11 ++
 docs/block-replication.txt | 259 +
 include/block/block.h  |  10 +
 include/block/block_int.h  |  14 ++
 include/block/blockjob.h   |  12 ++
 qapi/block-core.json   |  34 +++-
 12 files changed, 1098 insertions(+), 36 deletions(-)
 create mode 100644 block/replication.c
 create mode 100644 docs/block-replication.txt

-- 
2.4.3




[Qemu-block] [PATCH v10 05/10] docs: block replication's description

2015-09-25 Thread Wen Congyang
Signed-off-by: Wen Congyang 
Signed-off-by: Yang Hongyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 docs/block-replication.txt | 259 +
 1 file changed, 259 insertions(+)
 create mode 100644 docs/block-replication.txt

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
new file mode 100644
index 000..eab62df
--- /dev/null
+++ b/docs/block-replication.txt
@@ -0,0 +1,259 @@
+Block replication
+
+Copyright Fujitsu, Corp. 2015
+Copyright (c) 2015 Intel Corporation
+Copyright (c) 2015 HUAWEI TECHNOLOGIES CO., LTD.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+Block replication is used for continuous checkpoints. It is designed
+for COLO (COarse-grain LOck-stepping) where the Secondary VM is running.
+It can also be applied for FT/HA (Fault-tolerance/High Assurance) scenario,
+where the Secondary VM is not running.
+
+This document gives an overview of block replication's design.
+
+== Background ==
+High availability solutions such as micro checkpoint and COLO will do
+consecutive checkpoints. The VM state of Primary VM and Secondary VM is
+identical right after a VM checkpoint, but becomes different as the VM
+executes till the next checkpoint. To support disk contents checkpoint,
+the modified disk contents in the Secondary VM must be buffered, and are
+only dropped at next checkpoint time. To reduce the network transportation
+effort at the time of checkpoint, the disk modification operations of
+Primary disk are asynchronously forwarded to the Secondary node.
+
+== Workflow ==
+The following is the image of block replication workflow:
+
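++----------------------+            +------------------------+
+|Primary Write Requests|            |Secondary Write Requests|
++----------------------+            +------------------------+
+          |                                       |
+          |                                      (4)
+          |                                       V
+          |                              /-------------\
+          |      Copy and Forward        |             |
+          |---------(1)----------+       | Disk Buffer |
+          |                      |       |             |
+          |                     (3)      \-------------/
+          |                 speculative      ^
+          |                write through    (2)
+          |                      |           |
+          V                      V           |
+   +--------------+           +----------------+
+   | Primary Disk |           | Secondary Disk |
+   +--------------+           +----------------+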
+
+1) Primary write requests will be copied and forwarded to Secondary
+   QEMU.
+2) Before Primary write requests are written to Secondary disk, the
+   original sector content will be read from Secondary disk and
+   buffered in the Disk buffer, but it will not overwrite the existing
+   sector content (it could be from either "Secondary Write Requests" or
+   previous COW of "Primary Write Requests") in the Disk buffer.
+3) Primary write requests will be written to Secondary disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+
+ virtio-blk   ||
+ ^||.--
+ |||| Secondary
+1 Quorum  ||'--
+ /  \ ||
+/\||
+   Primary2 filter
+ disk ^
 virtio-blk
+  |
  ^
+3 NBD  --->  3 NBD 
  |
+client|| server
  2 filter
+  ||^  
  ^
+. |||  
  |
+Primary | ||  Secondary disk <- hidden-disk 5 
<- active-disk 4
+' |||  backing^   backing
+  ||| |
+  ||| |
+  ||

[Qemu-block] [PATCH v10 01/10] allow writing to the backing file

2015-09-25 Thread Wen Congyang
For block replication, we have such backing chain:
secondary disk <-- hidden disk <-- active disk
secondary disk is top BDS(use backing reference), so it can be opened in
read-write mode. But hidden disk is read only, and we need to write to
hidden disk(backup job will write data to it).

TODO: support opening backing file in read-write mode if the BDS is
created by QMP command blockdev-add.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
---
 block.c | 41 -
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 073d8d6..328c52f 100644
--- a/block.c
+++ b/block.c
@@ -738,6 +738,15 @@ static const BdrvChildRole child_backing = {
 .inherit_flags = bdrv_backing_flags,
 };
 
+static int bdrv_backing_rw_flags(int flags)
+{
+return bdrv_backing_flags(flags) | BDRV_O_RDWR;
+}
+
+static const BdrvChildRole child_backing_rw = {
+.inherit_flags = bdrv_backing_rw_flags,
+};
+
 static int bdrv_open_flags(BlockDriverState *bs, int flags)
 {
 int open_flags = flags | BDRV_O_CACHE_WB;
@@ -1150,6 +1159,20 @@ out:
 bdrv_refresh_limits(bs, NULL);
 }
 
+#define ALLOW_WRITE_BACKING_FILE"allow-write-backing-file"
+static QemuOptsList backing_file_opts = {
+.name = "backing_file",
+.head = QTAILQ_HEAD_INITIALIZER(backing_file_opts.head),
+.desc = {
+{
+.name = ALLOW_WRITE_BACKING_FILE,
+.type = QEMU_OPT_BOOL,
+.help = "allow writes to backing file",
+},
+{ /* end of list */ }
+},
+};
+
 /*
  * Opens the backing file for a BlockDriverState if not yet open
  *
@@ -1164,6 +1187,9 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 int ret = 0;
 BlockDriverState *backing_hd;
 Error *local_err = NULL;
+QemuOpts *opts = NULL;
+bool child_rw = false;
+const BdrvChildRole *child_role = NULL;
 
 if (bs->backing_hd != NULL) {
 QDECREF(options);
@@ -1176,6 +1202,18 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 }
 
 bs->open_flags &= ~BDRV_O_NO_BACKING;
+
+opts = qemu_opts_create(&backing_file_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+QDECREF(options);
+goto free_exit;
+}
+child_rw = qemu_opt_get_bool(opts, ALLOW_WRITE_BACKING_FILE, false);
+child_role = child_rw ? &child_backing_rw : &child_backing;
+
 if (qdict_haskey(options, "file.filename")) {
 backing_filename[0] = '\0';
 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
@@ -1208,7 +1246,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 assert(bs->backing_hd == NULL);
 ret = bdrv_open_inherit(&backing_hd,
 *backing_filename ? backing_filename : NULL,
-NULL, options, 0, bs, &child_backing, &local_err);
+NULL, options, 0, bs, child_role, &local_err);
 if (ret < 0) {
 bdrv_unref(backing_hd);
 backing_hd = NULL;
@@ -1222,6 +1260,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 bdrv_set_backing_hd(bs, backing_hd);
 
 free_exit:
+qemu_opts_del(opts);
 g_free(backing_filename);
 return ret;
 }
-- 
2.4.3




[Qemu-block] [PATCH] block: disable I/O limits at the beginning of bdrv_close()

2015-09-25 Thread Alberto Garcia
Disabling I/O limits from a BDS also drains all pending throttled
requests, so it should be done at the beginning of bdrv_close() with
the rest of the bdrv_drain() calls before the BlockDriver is closed.

Signed-off-by: Alberto Garcia 
---
 block.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/block.c b/block.c
index 6268e37..1f90b47 100644
--- a/block.c
+++ b/block.c
@@ -1907,6 +1907,12 @@ void bdrv_close(BlockDriverState *bs)
 if (bs->job) {
 block_job_cancel_sync(bs->job);
 }
+
+/* Disable I/O limits and drain all pending throttled requests */
+if (bs->io_limits_enabled) {
+bdrv_io_limits_disable(bs);
+}
+
 bdrv_drain(bs); /* complete I/O */
 bdrv_flush(bs);
 bdrv_drain(bs); /* in case flush left pending I/O */
@@ -1958,11 +1964,6 @@ void bdrv_close(BlockDriverState *bs)
 blk_dev_change_media_cb(bs->blk, false);
 }
 
-/*throttling disk I/O limits*/
-if (bs->io_limits_enabled) {
-bdrv_io_limits_disable(bs);
-}
-
 QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
 g_free(ban);
 }
-- 
2.5.3




Re: [Qemu-block] [PATCH 05/16] block: Convert bs->file to BdrvChild

2015-09-25 Thread Alberto Garcia
On Thu 17 Sep 2015 03:48:09 PM CEST, Kevin Wolf  wrote:

> @@ -1929,6 +1925,11 @@ void bdrv_close(BlockDriverState *bs)
>  bdrv_unref(backing_hd);
>  }
>  
> +if (bs->file != NULL) {
> +bdrv_unref(bs->file->bs);
> +bs->file = NULL;
> +}
> +
>  QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
>  /* TODO Remove bdrv_unref() from drivers' close function and use
>   * bdrv_unref_child() here */
> @@ -1953,11 +1954,6 @@ void bdrv_close(BlockDriverState *bs)
>  bs->options = NULL;
>  QDECREF(bs->full_open_options);
>  bs->full_open_options = NULL;
> -
> -if (bs->file != NULL) {
> -bdrv_unref(bs->file);
> -bs->file = NULL;
> -}
>  }

You are moving bdrv_unref(bs->file) up in the function and this seems to
be causing a problem, by turning this:

bs->drv->bdrv_close(bs);
bs->drv = NULL;
bdrv_unref(bs->file);

into this:

bs->drv->bdrv_close(bs);
bdrv_unref(bs->file);
bs->drv = NULL;

In the latter case, closing bs->file calls aio_poll(). This can trigger
new requests on bs, which at that point has a valid pointer to a
BlockDriver that has already been closed. If throttling is enabled on bs
those requests might be queued for later. At the point in which those
requests are scheduled bs->drv is already NULL, crashing QEMU. I can
reproduce this easily with x-data-plane=on.

I sent a separate patch that moves the bdrv_io_limits_disable() call to
the beginning of bdrv_close(). That solves the crash, but I guess that
this patch should also be changed.

Berto