[Qemu-block] [PATCH v5 07/13] nbd-server: Clear can_read when device io blocker is set

2015-05-20 Thread Fam Zheng
So that NBD export cannot submit IO during bdrv_drain_all().

Signed-off-by: Fam Zheng f...@redhat.com
---
 nbd.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/nbd.c b/nbd.c
index 06b501b..b5026af 100644
--- a/nbd.c
+++ b/nbd.c
@@ -160,6 +160,8 @@ struct NBDExport {
 uint32_t nbdflags;
 QTAILQ_HEAD(, NBDClient) clients;
 QTAILQ_ENTRY(NBDExport) next;
+Notifier blocker_notifier;
+bool io_blocked;
 
 AioContext *ctx;
 };
@@ -1053,6 +1055,22 @@ static void blk_aio_detach(void *opaque)
 exp-ctx = NULL;
 }
 
+static void nbd_op_blocker_changed(Notifier *notifier, void *data)
+{
+BlockOpEvent *event = data;
+NBDClient *client;
+NBDExport *exp = container_of(notifier, NBDExport, blocker_notifier);
+
+if (event-type != BLOCK_OP_TYPE_DEVICE_IO) {
+return;
+}
+exp-io_blocked = event-blocking;
+
+QTAILQ_FOREACH(client, exp-clients, next) {
+nbd_update_can_read(client);
+}
+}
+
 NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
   uint32_t nbdflags, void (*close)(NBDExport *),
   Error **errp)
@@ -1081,6 +1099,8 @@ NBDExport *nbd_export_new(BlockBackend *blk, off_t 
dev_offset, off_t size,
  * access since the export could be available before migration handover.
  */
 blk_invalidate_cache(blk, NULL);
+exp-blocker_notifier.notify = nbd_op_blocker_changed;
+blk_op_blocker_add_notifier(blk, exp-blocker_notifier);
 return exp;
 
 fail:
@@ -1132,6 +1152,7 @@ void nbd_export_close(NBDExport *exp)
 nbd_export_set_name(exp, NULL);
 nbd_export_put(exp);
 if (exp-blk) {
+notifier_remove(exp-blocker_notifier);
 blk_remove_aio_context_notifier(exp-blk, blk_aio_attached,
 blk_aio_detach, exp);
 blk_unref(exp-blk);
@@ -1455,6 +1476,9 @@ static void nbd_update_can_read(NBDClient *client)
 bool can_read = client-recv_coroutine ||
 client-nb_requests  MAX_NBD_REQUESTS;
 
+if (client-exp  client-exp-io_blocked) {
+can_read = false;
+}
 if (can_read != client-can_read) {
 client-can_read = can_read;
 nbd_set_handlers(client);
-- 
2.4.1




[Qemu-block] [PATCH v5 02/13] block: Add op blocker notifier list

2015-05-20 Thread Fam Zheng
BDS users can register a notifier and get notified about op blocker
changes.

Signed-off-by: Fam Zheng f...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 35 +++
 include/block/block.h |  8 
 include/block/block_int.h |  3 +++
 3 files changed, 46 insertions(+)

diff --git a/block.c b/block.c
index 7904098..9714603 100644
--- a/block.c
+++ b/block.c
@@ -3375,6 +3375,19 @@ struct BdrvOpBlocker {
 QLIST_ENTRY(BdrvOpBlocker) list;
 };
 
+/* Add a notifier which will be notified when the first blocker of some type is
+ * added to bs, or when the last blocker is removed. The notifier handler
+ * should receive a BlockOpEvent data.
+ *
+ * Caller must hold bs-aio_context; the notifier handler is also called with
+ * it held.
+ */
+void bdrv_op_blocker_add_notifier(BlockDriverState *bs,
+  Notifier *notifier)
+{
+notifier_list_add(bs-op_blocker_notifiers, notifier);
+}
+
 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
 {
 BdrvOpBlocker *blocker;
@@ -3391,26 +3404,48 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, 
BlockOpType op, Error **errp)
 return false;
 }
 
+static void bdrv_op_blocker_notify(BlockDriverState *bs, BlockOpType op,
+   Error *reason, bool blocking)
+{
+BlockOpEvent event = (BlockOpEvent) {
+op = op,
+reason = reason,
+blocking = blocking,
+};
+
+notifier_list_notify(bs-op_blocker_notifiers, event);
+}
+
 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
 {
+bool blocked;
 BdrvOpBlocker *blocker;
 assert((int) op = 0  op  BLOCK_OP_TYPE_MAX);
 
 blocker = g_new0(BdrvOpBlocker, 1);
 blocker-reason = reason;
+blocked = !QLIST_EMPTY(bs-op_blockers[op]);
 QLIST_INSERT_HEAD(bs-op_blockers[op], blocker, list);
+if (!blocked) {
+bdrv_op_blocker_notify(bs, op, reason, true);
+}
 }
 
 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
 {
+bool blocked;
 BdrvOpBlocker *blocker, *next;
 assert((int) op = 0  op  BLOCK_OP_TYPE_MAX);
+blocked = !QLIST_EMPTY(bs-op_blockers[op]);
 QLIST_FOREACH_SAFE(blocker, bs-op_blockers[op], list, next) {
 if (blocker-reason == reason) {
 QLIST_REMOVE(blocker, list);
 g_free(blocker);
 }
 }
+if (blocked  QLIST_EMPTY(bs-op_blockers[op])) {
+bdrv_op_blocker_notify(bs, op, reason, false);
+}
 }
 
 void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
diff --git a/include/block/block.h b/include/block/block.h
index 906fb31..3420b2c 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -163,6 +163,12 @@ typedef enum BlockOpType {
 BLOCK_OP_TYPE_MAX,
 } BlockOpType;
 
+typedef struct {
+BlockOpType type;
+Error *reason;
+bool blocking;
+} BlockOpEvent;
+
 void bdrv_iostatus_enable(BlockDriverState *bs);
 void bdrv_iostatus_reset(BlockDriverState *bs);
 void bdrv_iostatus_disable(BlockDriverState *bs);
@@ -491,6 +497,8 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs);
 void bdrv_ref(BlockDriverState *bs);
 void bdrv_unref(BlockDriverState *bs);
 
+void bdrv_op_blocker_add_notifier(BlockDriverState *bs,
+  Notifier *notifier);
 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
 void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
 void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index db29b74..29d1c84 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -418,6 +418,9 @@ struct BlockDriverState {
 /* operation blockers */
 QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
 
+/* Callback when one list in op_blockers has empty status change. */
+NotifierList op_blocker_notifiers;
+
 /* long-running background operation */
 BlockJob *job;
 
-- 
2.4.1




[Qemu-block] [PATCH v5 00/13] Fix transactional snapshot with dataplane and NBD export

2015-05-20 Thread Fam Zheng
v5: Patch 13: set blocker before bdrv_drain().

v4: virtio-scsi-dataplane: Use assert in ctrl/event queue handler. [Paolo]
Protect mirror complete in new patch 13. [Wen]
Add Max's rev-by in 02, 03, 04.
Fix 05, 06 per Max's comments.

Reported by Paolo.

Unlike the iohandler in main loop, iothreads currently process the event
notifier used by virtio-blk ioeventfd in nested aio_poll. This is dangerous
without proper protection, because guest requests could sneak to block layer
where they mustn't.

For example, a QMP transaction may involve multiple bdrv_drain_all() in
handling the list of AioContext it works on. If an aio_poll in one of the
bdrv_drain_all() happens to process a guest VQ kick, and dispatches the
ioeventfd event to virtio-blk, a new guest write is then submitted, and voila,
the transaction semantics is violated.

This series avoids this problem by disabling virtio-blk handlers during
bdrv_drain_all() and transactions.

- Patches 1~3 add the block layer op blocker change notifier code.
- Patches 4,5 secure virtio-blk dataplane.
- Patch 6 protects virtio-scsi dataplane.
- Patch 7 secures nbd export.
- Patch 8~11 protect each transaction type from being voilated by new IO
  generated in nested aio_poll.
- Patch 12 protects bdrv_drain and bdrv_drain_all.
- Patch 13 protects mirror complete.

Fam Zheng (13):
  block: Add op blocker type device IO
  block: Add op blocker notifier list
  block-backend: Add blk_op_blocker_add_notifier
  virtio-blk: Move complete_request to 'ops' structure
  virtio-blk: Don't handle output when there is device IO op blocker
  virtio-scsi-dataplane: Add device IO op blocker listener
  nbd-server: Clear can_read when device io blocker is set
  blockdev: Block device IO during internal snapshot transaction
  blockdev: Block device IO during external snapshot transaction
  blockdev: Block device IO during drive-backup transaction
  blockdev: Block device IO during blockdev-backup transaction
  block: Block device IO during bdrv_drain and bdrv_drain_all
  block/mirror: Block device IO during mirror exit

 block.c | 35 ++
 block/block-backend.c   |  6 +++
 block/io.c  | 22 ++-
 block/mirror.c  |  9 -
 blockdev.c  | 49 
 blockjob.c  |  1 +
 hw/block/dataplane/virtio-blk.c | 37 ---
 hw/block/virtio-blk.c   | 65 ++--
 hw/scsi/virtio-scsi-dataplane.c | 82 +++--
 hw/scsi/virtio-scsi.c   |  4 ++
 include/block/block.h   |  9 +
 include/block/block_int.h   |  3 ++
 include/hw/virtio/virtio-blk.h  | 17 +++--
 include/hw/virtio/virtio-scsi.h |  3 ++
 include/sysemu/block-backend.h  |  2 +
 migration/block.c   |  1 +
 nbd.c   | 24 
 17 files changed, 328 insertions(+), 41 deletions(-)

-- 
2.4.1




[Qemu-block] [PATCH v5 04/13] virtio-blk: Move complete_request to 'ops' structure

2015-05-20 Thread Fam Zheng
Should more ops be added to differentiate code between dataplane and
non-dataplane, the new saved_ops approach will be cleaner than messing
with N pointers.

Signed-off-by: Fam Zheng f...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 hw/block/dataplane/virtio-blk.c | 13 -
 hw/block/virtio-blk.c   |  8 ++--
 include/hw/virtio/virtio-blk.h  |  9 +++--
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 3ecc8bd..e287e09 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -51,8 +51,7 @@ struct VirtIOBlockDataPlane {
 
 /* Operation blocker on BDS */
 Error *blocker;
-void (*saved_complete_request)(struct VirtIOBlockReq *req,
-   unsigned char status);
+const VirtIOBlockOps *saved_ops;
 };
 
 /* Raise an interrupt to signal guest, if necessary */
@@ -88,6 +87,10 @@ static void complete_request_vring(VirtIOBlockReq *req, 
unsigned char status)
 qemu_bh_schedule(s-bh);
 }
 
+static const VirtIOBlockOps virtio_blk_data_plane_ops = {
+.complete_request = complete_request_vring,
+};
+
 static void handle_notify(EventNotifier *e)
 {
 VirtIOBlockDataPlane *s = container_of(e, VirtIOBlockDataPlane,
@@ -270,8 +273,8 @@ void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
 }
 s-host_notifier = *virtio_queue_get_host_notifier(vq);
 
-s-saved_complete_request = vblk-complete_request;
-vblk-complete_request = complete_request_vring;
+s-saved_ops = vblk-ops;
+vblk-ops = virtio_blk_data_plane_ops;
 
 s-starting = false;
 s-started = true;
@@ -314,7 +317,7 @@ void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
 return;
 }
 s-stopping = true;
-vblk-complete_request = s-saved_complete_request;
+vblk-ops = s-saved_ops;
 trace_virtio_blk_data_plane_stop(s);
 
 aio_context_acquire(s-ctx);
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index e6afe97..f4a9d19 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -59,9 +59,13 @@ static void virtio_blk_complete_request(VirtIOBlockReq *req,
 virtio_notify(vdev, s-vq);
 }
 
+static const VirtIOBlockOps virtio_blk_ops = (VirtIOBlockOps) {
+.complete_request = virtio_blk_complete_request,
+};
+
 static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
 {
-req-dev-complete_request(req, status);
+req-dev-ops-complete_request(req, status);
 }
 
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
@@ -905,7 +909,7 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 s-sector_mask = (s-conf.conf.logical_block_size / BDRV_SECTOR_SIZE) - 1;
 
 s-vq = virtio_add_queue(vdev, 128, virtio_blk_handle_output);
-s-complete_request = virtio_blk_complete_request;
+s-ops = virtio_blk_ops;
 virtio_blk_data_plane_create(vdev, conf, s-dataplane, err);
 if (err != NULL) {
 error_propagate(errp, err);
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 6bf5905..28b3436 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -44,6 +44,12 @@ struct VirtIOBlkConf
 struct VirtIOBlockDataPlane;
 
 struct VirtIOBlockReq;
+
+typedef struct {
+/* Function to push to vq and notify guest */
+void (*complete_request)(struct VirtIOBlockReq *req, unsigned char status);
+} VirtIOBlockOps;
+
 typedef struct VirtIOBlock {
 VirtIODevice parent_obj;
 BlockBackend *blk;
@@ -54,8 +60,7 @@ typedef struct VirtIOBlock {
 unsigned short sector_mask;
 bool original_wce;
 VMChangeStateEntry *change;
-/* Function to push to vq and notify guest */
-void (*complete_request)(struct VirtIOBlockReq *req, unsigned char status);
+const VirtIOBlockOps *ops;
 Notifier migration_state_notifier;
 struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
-- 
2.4.1




[Qemu-block] [PATCH v5 03/13] block-backend: Add blk_op_blocker_add_notifier

2015-05-20 Thread Fam Zheng
Forward the call to bdrv_op_blocker_add_notifier.

Signed-off-by: Fam Zheng f...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block/block-backend.c  | 6 ++
 include/sysemu/block-backend.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 93e46f3..16efe60 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -802,6 +802,12 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason)
 bdrv_op_unblock_all(blk-bs, reason);
 }
 
+void blk_op_blocker_add_notifier(BlockBackend *blk,
+ Notifier *notifier)
+{
+bdrv_op_blocker_add_notifier(blk-bs, notifier);
+}
+
 AioContext *blk_get_aio_context(BlockBackend *blk)
 {
 return bdrv_get_aio_context(blk-bs);
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index b4a4d5e..cde9651 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -136,6 +136,8 @@ int blk_get_flags(BlockBackend *blk);
 int blk_get_max_transfer_length(BlockBackend *blk);
 void blk_set_guest_block_size(BlockBackend *blk, int align);
 void *blk_blockalign(BlockBackend *blk, size_t size);
+void blk_op_blocker_add_notifier(BlockBackend *blk,
+ Notifier *notifier);
 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp);
 void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason);
 void blk_op_block_all(BlockBackend *blk, Error *reason);
-- 
2.4.1




Re: [Qemu-block] [PATCH v3 00/13] main-loop: Get rid of fd_read_poll and qemu_set_fd_handler2

2015-05-20 Thread Fam Zheng
On Wed, 05/20 08:26, Paolo Bonzini wrote:
 
 
 On 19/05/2015 17:02, Stefan Hajnoczi wrote:
  1. Convert everything like you converted qemu-nbd.c.  This is a 
  conservative approach and we can be confident that behavior is 
  unchanged.
 
 So, that means whenever you change receive_disabled you call a new
 callback on the peer?  In addition, whenever the count of
 receive-disabled ports switches from zero to non-zero or vice versa,
 hubs need to inform all its ports.
 
 There are just two places that set/clear receive_disabled,
 qemu_deliver_packet and qemu_flush_or_purge_queued_packets, but I
 think a new API is needed to implement the callback for hubs
 (qemu_send_enable/qemu_send_disable).
 

I think .can_receive is the harder one, I'm not sure it's feasible - each
device has its own set of conditions, so it will be a huge change.

Fam



[Qemu-block] [PATCH v5 01/13] block: Add op blocker type device IO

2015-05-20 Thread Fam Zheng
It blocks device IO.

All bdrv_op_block_all/blk_op_block_all callers are taken care of:

- virtio_blk_data_plane_create
- virtio_scsi_hotplug

  Device creation, unblock it.

- bdrv_set_backing_hd

  Backing hd is not used by device, so blocking is OK.

- backup_start

  Blocking target when backup is running, unblock it.

- mirror_complete

  Blocking s-to_replace until mirror_exit, OK.

- block_job_complete

  The block job may be long running. Unblock it.

- init_blk_migration

  The block migration may be long running, Unblock it.

Signed-off-by: Fam Zheng f...@redhat.com
---
 blockjob.c  | 1 +
 hw/block/dataplane/virtio-blk.c | 1 +
 hw/scsi/virtio-scsi.c   | 1 +
 include/block/block.h   | 1 +
 migration/block.c   | 1 +
 5 files changed, 5 insertions(+)

diff --git a/blockjob.c b/blockjob.c
index 2755465..e39bdde 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -51,6 +51,7 @@ void *block_job_create(const BlockJobDriver *driver, 
BlockDriverState *bs,
BlockJobType_lookup[driver-job_type]);
 bdrv_op_block_all(bs, job-blocker);
 bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job-blocker);
+bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, job-blocker);
 
 job-driver= driver;
 job-bs= bs;
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 3db139b..3ecc8bd 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -209,6 +209,7 @@ void virtio_blk_data_plane_create(VirtIODevice *vdev, 
VirtIOBlkConf *conf,
 blk_op_unblock(conf-conf.blk, BLOCK_OP_TYPE_MIRROR, s-blocker);
 blk_op_unblock(conf-conf.blk, BLOCK_OP_TYPE_STREAM, s-blocker);
 blk_op_unblock(conf-conf.blk, BLOCK_OP_TYPE_REPLACE, s-blocker);
+blk_op_unblock(conf-conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s-blocker);
 
 *dataplane = s;
 }
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index e242fef..5e15fa6 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -772,6 +772,7 @@ static void virtio_scsi_hotplug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 return;
 }
 blk_op_block_all(sd-conf.blk, s-blocker);
+blk_op_unblock(sd-conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s-blocker);
 aio_context_acquire(s-ctx);
 blk_set_aio_context(sd-conf.blk, s-ctx);
 aio_context_release(s-ctx);
diff --git a/include/block/block.h b/include/block/block.h
index 7d1a717..906fb31 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -159,6 +159,7 @@ typedef enum BlockOpType {
 BLOCK_OP_TYPE_RESIZE,
 BLOCK_OP_TYPE_STREAM,
 BLOCK_OP_TYPE_REPLACE,
+BLOCK_OP_TYPE_DEVICE_IO,
 BLOCK_OP_TYPE_MAX,
 } BlockOpType;
 
diff --git a/migration/block.c b/migration/block.c
index ddb59cc..b833bac 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -379,6 +379,7 @@ static void init_blk_migration(QEMUFile *f)
 alloc_aio_bitmap(bmds);
 error_setg(bmds-blocker, block device is in use by migration);
 bdrv_op_block_all(bs, bmds-blocker);
+bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, bmds-blocker);
 bdrv_ref(bs);
 
 block_mig_state.total_sector_sum += sectors;
-- 
2.4.1




[Qemu-block] [PATCH v5 06/13] virtio-scsi-dataplane: Add device IO op blocker listener

2015-05-20 Thread Fam Zheng
When a disk is attached to scsi-bus, virtio_scsi_hotplug will take care
of protecting the block device with op blockers. Currently we haven't
enabled block jobs (like what's done in virtio_blk_data_plane_create),
but it is necessary to honor device IO op blocker first before we do.
This is useful to make sure that guest IO requests are paused during qmp
transactions (such as multi-disk snapshot or backup).

A counter is added to the virtio-scsi device, which keeps track of
currently blocked disks. If it goes from 0 to 1, the ioeventfds are
disabled; when it goes back to 0, they are re-enabled.

Also in device initialization, push the enabling of ioeventfds to before
return, so the virtio_scsi_clear_aio is not needed there. Rename it,
pair with an enabling variant, fix one coding style issue, then use it
in the device pause points.

Signed-off-by: Fam Zheng f...@redhat.com
---
 hw/scsi/virtio-scsi-dataplane.c | 82 +++--
 hw/scsi/virtio-scsi.c   |  3 ++
 include/hw/virtio/virtio-scsi.h |  3 ++
 3 files changed, 68 insertions(+), 20 deletions(-)

diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 5575648..e220c12 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -40,7 +40,6 @@ void virtio_scsi_set_iothread(VirtIOSCSI *s, IOThread 
*iothread)
 
 static VirtIOSCSIVring *virtio_scsi_vring_init(VirtIOSCSI *s,
VirtQueue *vq,
-   EventNotifierHandler *handler,
int n)
 {
 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
@@ -60,7 +59,6 @@ static VirtIOSCSIVring *virtio_scsi_vring_init(VirtIOSCSI *s,
 r = g_slice_new(VirtIOSCSIVring);
 r-host_notifier = *virtio_queue_get_host_notifier(vq);
 r-guest_notifier = *virtio_queue_get_guest_notifier(vq);
-aio_set_event_notifier(s-ctx, r-host_notifier, handler);
 
 r-parent = s;
 
@@ -71,7 +69,6 @@ static VirtIOSCSIVring *virtio_scsi_vring_init(VirtIOSCSI *s,
 return r;
 
 fail_vring:
-aio_set_event_notifier(s-ctx, r-host_notifier, NULL);
 k-set_host_notifier(qbus-parent, n, false);
 g_slice_free(VirtIOSCSIVring, r);
 return NULL;
@@ -104,6 +101,9 @@ void virtio_scsi_vring_push_notify(VirtIOSCSIReq *req)
 }
 }
 
+static void virtio_scsi_start_ioeventfd(VirtIOSCSI *s);
+static void virtio_scsi_stop_ioeventfd(VirtIOSCSI *s);
+
 static void virtio_scsi_iothread_handle_ctrl(EventNotifier *notifier)
 {
 VirtIOSCSIVring *vring = container_of(notifier,
@@ -111,6 +111,7 @@ static void virtio_scsi_iothread_handle_ctrl(EventNotifier 
*notifier)
 VirtIOSCSI *s = VIRTIO_SCSI(vring-parent);
 VirtIOSCSIReq *req;
 
+assert(!s-pause_counter);
 event_notifier_test_and_clear(notifier);
 while ((req = virtio_scsi_pop_req_vring(s, vring))) {
 virtio_scsi_handle_ctrl_req(s, req);
@@ -124,6 +125,7 @@ static void virtio_scsi_iothread_handle_event(EventNotifier 
*notifier)
 VirtIOSCSI *s = vring-parent;
 VirtIODevice *vdev = VIRTIO_DEVICE(s);
 
+assert(!s-pause_counter);
 event_notifier_test_and_clear(notifier);
 
 if (!(vdev-status  VIRTIO_CONFIG_S_DRIVER_OK)) {
@@ -143,6 +145,7 @@ static void virtio_scsi_iothread_handle_cmd(EventNotifier 
*notifier)
 VirtIOSCSIReq *req, *next;
 QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
 
+assert(!s-pause_counter);
 event_notifier_test_and_clear(notifier);
 while ((req = virtio_scsi_pop_req_vring(s, vring))) {
 if (virtio_scsi_handle_cmd_req_prepare(s, req)) {
@@ -155,8 +158,56 @@ static void virtio_scsi_iothread_handle_cmd(EventNotifier 
*notifier)
 }
 }
 
+void virtio_scsi_dataplane_blocker_notify(Notifier *notifier,
+  void *data)
+{
+VirtIOSCSI *s = container_of(notifier, VirtIOSCSI, blocker_notifier);
+BlockOpEvent *event = data;
+
+if (event-type != BLOCK_OP_TYPE_DEVICE_IO) {
+return;
+}
+if (event-blocking) {
+s-pause_counter++;
+if (s-pause_counter == 1) {
+virtio_scsi_stop_ioeventfd(s);
+}
+} else {
+s-pause_counter--;
+if (s-pause_counter == 0) {
+virtio_scsi_start_ioeventfd(s);
+}
+}
+assert(s-pause_counter = 0);
+}
+
 /* assumes s-ctx held */
-static void virtio_scsi_clear_aio(VirtIOSCSI *s)
+static void virtio_scsi_start_ioeventfd(VirtIOSCSI *s)
+{
+VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+int i;
+
+if (!s-dataplane_started || s-dataplane_stopping) {
+return;
+}
+if (s-ctrl_vring) {
+aio_set_event_notifier(s-ctx, s-ctrl_vring-host_notifier,
+   virtio_scsi_iothread_handle_ctrl);
+}
+if (s-event_vring) {
+aio_set_event_notifier(s-ctx, s-event_vring-host_notifier,
+   

Re: [Qemu-block] [PATCH v5 13/13] block/mirror: Block device IO during mirror exit

2015-05-20 Thread Paolo Bonzini


On 20/05/2015 08:16, Fam Zheng wrote:
  
  static void mirror_exit(BlockJob *job, void *opaque)
 @@ -328,6 +330,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
  MirrorExitData *data = opaque;
  AioContext *replace_aio_context = NULL;
  
 +bdrv_op_unblock(s-common.bs, BLOCK_OP_TYPE_DEVICE_IO, data-blocker);
 +error_free(data-blocker);
  if (s-to_replace) {
  replace_aio_context = bdrv_get_aio_context(s-to_replace);
  aio_context_acquire(replace_aio_context);

Why here and not after the if (s-should_complete  data-ret == 0) {
... }?  The commit message says unblock it after bdrv_swap().

This is the only remaining issue.

Paolo



Re: [Qemu-block] [PATCH v5 13/13] block/mirror: Block device IO during mirror exit

2015-05-20 Thread Fam Zheng
On Wed, 05/20 08:32, Paolo Bonzini wrote:
 
 
 On 20/05/2015 08:16, Fam Zheng wrote:
   
   static void mirror_exit(BlockJob *job, void *opaque)
  @@ -328,6 +330,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
   MirrorExitData *data = opaque;
   AioContext *replace_aio_context = NULL;
   
  +bdrv_op_unblock(s-common.bs, BLOCK_OP_TYPE_DEVICE_IO, data-blocker);
  +error_free(data-blocker);
   if (s-to_replace) {
   replace_aio_context = bdrv_get_aio_context(s-to_replace);
   aio_context_acquire(replace_aio_context);
 
 Why here and not after the if (s-should_complete  data-ret == 0) {
 ... }?  The commit message says unblock it after bdrv_swap().
 
 This is the only remaining issue.

Ouch, I thought I did that... It absolutely has to go after bdrv_swap().

Fam



[Qemu-block] [PATCH v5 05/13] virtio-blk: Don't handle output when there is device IO op blocker

2015-05-20 Thread Fam Zheng
virtio-blk now listens to op blocker change of the associated block
backend.

Up on setting op blocker on BLOCK_OP_TYPE_DEVICE_IO:

  non-dataplane:
   1) Set VirtIOBlock.paused
   2) In virtio_blk_handle_output, do nothing if VirtIOBlock.paused

  dataplane:
   1) Clear the host event notifier
   2) In handle_notify, do nothing if VirtIOBlock.paused

Up on removing the op blocker:

  non-dataplane:
   1) Clear VirtIOBlock.paused
   2) Schedule a BH on the AioContext of the backend, which calls
   virtio_blk_handle_output, so that the previous unhandled kicks can
   make progress

  dataplane:
   1) Set the host event notifier
   2) Notify the host event notifier so that unhandled events are
   processed

Signed-off-by: Fam Zheng f...@redhat.com
---
 hw/block/dataplane/virtio-blk.c | 25 -
 hw/block/virtio-blk.c   | 59 +++--
 include/hw/virtio/virtio-blk.h  |  8 +-
 3 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index e287e09..a5e8e35 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -87,8 +87,28 @@ static void complete_request_vring(VirtIOBlockReq *req, 
unsigned char status)
 qemu_bh_schedule(s-bh);
 }
 
+static void virtio_blk_data_plane_pause(VirtIOBlock *vblk)
+{
+VirtIOBlockDataPlane *s = vblk-dataplane;
+
+event_notifier_test_and_clear(s-host_notifier);
+aio_set_event_notifier(s-ctx, s-host_notifier, NULL);
+}
+
+static void handle_notify(EventNotifier *e);
+static void virtio_blk_data_plane_resume(VirtIOBlock *vblk)
+{
+VirtIOBlockDataPlane *s = vblk-dataplane;
+
+aio_set_event_notifier(s-ctx, s-host_notifier, handle_notify);
+
+event_notifier_set(s-host_notifier);
+}
+
 static const VirtIOBlockOps virtio_blk_data_plane_ops = {
-.complete_request = complete_request_vring,
+.complete_request   = complete_request_vring,
+.pause  = virtio_blk_data_plane_pause,
+.resume = virtio_blk_data_plane_resume,
 };
 
 static void handle_notify(EventNotifier *e)
@@ -98,6 +118,9 @@ static void handle_notify(EventNotifier *e)
 VirtIOBlock *vblk = VIRTIO_BLK(s-vdev);
 
 event_notifier_test_and_clear(s-host_notifier);
+if (vblk-paused) {
+return;
+}
 blk_io_plug(s-conf-conf.blk);
 for (;;) {
 MultiReqBuffer mrb = {};
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index f4a9d19..b4b19b5 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -59,8 +59,38 @@ static void virtio_blk_complete_request(VirtIOBlockReq *req,
 virtio_notify(vdev, s-vq);
 }
 
+typedef struct {
+QEMUBH *bh;
+VirtIOBlock *s;
+} VirtIOBlockResumeData;
+
+static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq);
+static void virtio_blk_resume_bh_cb(void *opaque)
+{
+VirtIOBlockResumeData *data = opaque;
+qemu_bh_delete(data-bh);
+virtio_blk_handle_output(VIRTIO_DEVICE(data-s), data-s-vq);
+}
+
+static void virtio_blk_pause(VirtIOBlock *vblk)
+{
+/* TODO: stop ioeventfd */
+}
+
+static void virtio_blk_resume(VirtIOBlock *vblk)
+{
+VirtIOBlockResumeData *data = g_new(VirtIOBlockResumeData, 1);
+data-bh = aio_bh_new(blk_get_aio_context(vblk-blk),
+virtio_blk_resume_bh_cb, data);
+data-s = vblk;
+data-s-paused = false;
+qemu_bh_schedule(data-bh);
+}
+
 static const VirtIOBlockOps virtio_blk_ops = (VirtIOBlockOps) {
-.complete_request = virtio_blk_complete_request,
+.complete_request   = virtio_blk_complete_request,
+.pause  = virtio_blk_pause,
+.resume = virtio_blk_resume,
 };
 
 static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
@@ -597,6 +627,9 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, 
VirtQueue *vq)
 VirtIOBlockReq *req;
 MultiReqBuffer mrb = {};
 
+if (s-paused) {
+return;
+}
 /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
  * dataplane here instead of waiting for .set_status().
  */
@@ -787,7 +820,7 @@ static void virtio_blk_save(QEMUFile *f, void *opaque)
 
 virtio_save(vdev, f);
 }
-
+
 static void virtio_blk_save_device(VirtIODevice *vdev, QEMUFile *f)
 {
 VirtIOBlock *s = VIRTIO_BLK(vdev);
@@ -875,6 +908,25 @@ static void virtio_blk_migration_state_changed(Notifier 
*notifier, void *data)
 }
 }
 
+static void virtio_blk_op_blocker_changed(Notifier *notifier, void *opaque)
+{
+BlockOpEvent *event = opaque;
+VirtIOBlock *s = container_of(notifier, VirtIOBlock,
+  op_blocker_notifier);
+
+if (event-type != BLOCK_OP_TYPE_DEVICE_IO) {
+return;
+}
+if (event-blocking == s-paused) {
+return;
+}
+if (event-blocking) {
+s-ops-pause(s);
+} else {
+s-ops-resume(s);
+}

[Qemu-block] [PATCH v5 09/13] blockdev: Block device IO during external snapshot transaction

2015-05-20 Thread Fam Zheng
Signed-off-by: Fam Zheng f...@redhat.com
---
 blockdev.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 7f763d9..923fc90 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1404,6 +1404,7 @@ typedef struct ExternalSnapshotState {
 BlockDriverState *old_bs;
 BlockDriverState *new_bs;
 AioContext *aio_context;
+Error *blocker;
 } ExternalSnapshotState;
 
 static void external_snapshot_prepare(BlkTransactionState *common,
@@ -1525,6 +1526,9 @@ static void external_snapshot_prepare(BlkTransactionState 
*common,
 if (ret != 0) {
 error_propagate(errp, local_err);
 }
+
+error_setg(state-blocker, snapshot in progress);
+bdrv_op_block(state-old_bs, BLOCK_OP_TYPE_DEVICE_IO, state-blocker);
 }
 
 static void external_snapshot_commit(BlkTransactionState *common)
@@ -1541,8 +1545,6 @@ static void external_snapshot_commit(BlkTransactionState 
*common)
  * don't want to abort all of them if one of them fails the reopen */
 bdrv_reopen(state-new_bs, state-new_bs-open_flags  ~BDRV_O_RDWR,
 NULL);
-
-aio_context_release(state-aio_context);
 }
 
 static void external_snapshot_abort(BlkTransactionState *common)
@@ -1552,6 +1554,17 @@ static void external_snapshot_abort(BlkTransactionState 
*common)
 if (state-new_bs) {
 bdrv_unref(state-new_bs);
 }
+}
+
+static void external_snapshot_clean(BlkTransactionState *common)
+{
+ExternalSnapshotState *state =
+ DO_UPCAST(ExternalSnapshotState, common, common);
+
+if (state-old_bs) {
+bdrv_op_unblock(state-old_bs, BLOCK_OP_TYPE_DEVICE_IO, 
state-blocker);
+error_free(state-blocker);
+}
 if (state-aio_context) {
 aio_context_release(state-aio_context);
 }
@@ -1716,6 +1729,7 @@ static const BdrvActionOps actions[] = {
 .prepare  = external_snapshot_prepare,
 .commit   = external_snapshot_commit,
 .abort = external_snapshot_abort,
+.clean = external_snapshot_clean,
 },
 [TRANSACTION_ACTION_KIND_DRIVE_BACKUP] = {
 .instance_size = sizeof(DriveBackupState),
-- 
2.4.1




[Qemu-block] [PATCH v5 08/13] blockdev: Block device IO during internal snapshot transaction

2015-05-20 Thread Fam Zheng
Signed-off-by: Fam Zheng f...@redhat.com
---
 blockdev.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 5eaf77e..7f763d9 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1262,6 +1262,7 @@ typedef struct InternalSnapshotState {
 BlockDriverState *bs;
 AioContext *aio_context;
 QEMUSnapshotInfo sn;
+Error *blocker;
 } InternalSnapshotState;
 
 static void internal_snapshot_prepare(BlkTransactionState *common,
@@ -1300,6 +1301,10 @@ static void 
internal_snapshot_prepare(BlkTransactionState *common,
 state-aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(state-aio_context);
 
+state-bs = bs;
+error_setg(state-blocker, internal snapshot in progress);
+bdrv_op_block(bs, BLOCK_OP_TYPE_DEVICE_IO, state-blocker);
+
 if (!bdrv_is_inserted(bs)) {
 error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device);
 return;
@@ -1354,9 +1359,6 @@ static void internal_snapshot_prepare(BlkTransactionState 
*common,
  name, device);
 return;
 }
-
-/* 4. succeed, mark a snapshot is created */
-state-bs = bs;
 }
 
 static void internal_snapshot_abort(BlkTransactionState *common)
@@ -1387,6 +1389,10 @@ static void internal_snapshot_clean(BlkTransactionState 
*common)
 InternalSnapshotState *state = DO_UPCAST(InternalSnapshotState,
  common, common);
 
+if (state-bs) {
+bdrv_op_unblock(state-bs, BLOCK_OP_TYPE_DEVICE_IO, state-blocker);
+error_free(state-blocker);
+}
 if (state-aio_context) {
 aio_context_release(state-aio_context);
 }
-- 
2.4.1




[Qemu-block] [PATCH v5 12/13] block: Block device IO during bdrv_drain and bdrv_drain_all

2015-05-20 Thread Fam Zheng
We don't want new requests from guest, so block the operation around the
nested poll.

It also avoids looping forever when iothread is submitting a lot of requests.

Signed-off-by: Fam Zheng f...@redhat.com
---
 block/io.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index 1ce62c4..b23a83f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -286,12 +286,21 @@ static bool bdrv_drain_one(BlockDriverState *bs)
  *
  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
  * AioContext.
+ *
+ * Devices are paused to avoid looping forever because otherwise they could
+ * keep submitting more requests.
  */
 void bdrv_drain(BlockDriverState *bs)
 {
+Error *blocker = NULL;
+
+error_setg(blocker, bdrv_drain in progress);
+bdrv_op_block(bs, BLOCK_OP_TYPE_DEVICE_IO, blocker);
 while (bdrv_drain_one(bs)) {
 /* Keep iterating */
 }
+bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, blocker);
+error_free(blocker);
 }
 
 /*
@@ -303,14 +312,20 @@ void bdrv_drain(BlockDriverState *bs)
  * Note that completion of an asynchronous I/O operation can trigger any
  * number of other I/O operations on other devices---for example a coroutine
  * can be arbitrarily complex and a constant flow of I/O can come until the
- * coroutine is complete.  Because of this, it is not possible to have a
- * function to drain a single device's I/O queue.
+ * coroutine is complete.  Because of this, we must call bdrv_drain_one in a
+ * loop.
+ *
+ * We explicitly pause block jobs and devices to prevent them from submitting
+ * more requests.
  */
 void bdrv_drain_all(void)
 {
 /* Always run first iteration so any pending completion BHs run */
 bool busy = true;
 BlockDriverState *bs = NULL;
+Error *blocker = NULL;
+
+error_setg(blocker, bdrv_drain_all in progress);
 
 while ((bs = bdrv_next(bs))) {
 AioContext *aio_context = bdrv_get_aio_context(bs);
@@ -319,6 +334,7 @@ void bdrv_drain_all(void)
 if (bs-job) {
 block_job_pause(bs-job);
 }
+bdrv_op_block(bs, BLOCK_OP_TYPE_DEVICE_IO, blocker);
 aio_context_release(aio_context);
 }
 
@@ -343,8 +359,10 @@ void bdrv_drain_all(void)
 if (bs-job) {
 block_job_resume(bs-job);
 }
+bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, blocker);
 aio_context_release(aio_context);
 }
+error_free(blocker);
 }
 
 /**
-- 
2.4.1




[Qemu-block] [PATCH v5 13/13] block/mirror: Block device IO during mirror exit

2015-05-20 Thread Fam Zheng
When mirror should complete, the source and target are in sync.  But we
call bdrv_swap() only a while later in the main loop bh. If the guest
writes something before that, target will not get the new data.

Block device IO before bdrv_drain and unblock it after bdrw_swap().

Reported-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Fam Zheng f...@redhat.com
---
 block/mirror.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/mirror.c b/block/mirror.c
index 58f391a..f400456 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -320,6 +320,8 @@ static void mirror_drain(MirrorBlockJob *s)
 
 typedef struct {
 int ret;
+/* Use to pause device IO during mirror exit */
+Error *blocker;
 } MirrorExitData;
 
 static void mirror_exit(BlockJob *job, void *opaque)
@@ -328,6 +330,8 @@ static void mirror_exit(BlockJob *job, void *opaque)
 MirrorExitData *data = opaque;
 AioContext *replace_aio_context = NULL;
 
+bdrv_op_unblock(s-common.bs, BLOCK_OP_TYPE_DEVICE_IO, data-blocker);
+error_free(data-blocker);
 if (s-to_replace) {
 replace_aio_context = bdrv_get_aio_context(s-to_replace);
 aio_context_acquire(replace_aio_context);
@@ -376,6 +380,8 @@ static void coroutine_fn mirror_run(void *opaque)
  checking for a NULL string */
 int ret = 0;
 int n;
+data = g_malloc0(sizeof(*data));
+error_setg(data-blocker, mirror job exiting);
 
 if (block_job_is_cancelled(s-common)) {
 goto immediate_exit;
@@ -521,6 +527,7 @@ static void coroutine_fn mirror_run(void *opaque)
  * mirror_populate runs.
  */
 trace_mirror_before_drain(s, cnt);
+bdrv_op_block(bs, BLOCK_OP_TYPE_DEVICE_IO, data-blocker);
 bdrv_drain(bs);
 cnt = bdrv_get_dirty_count(s-dirty_bitmap);
 }
@@ -543,6 +550,7 @@ static void coroutine_fn mirror_run(void *opaque)
 s-common.cancelled = false;
 break;
 }
+bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, data-blocker);
 last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 }
 
@@ -563,7 +571,6 @@ immediate_exit:
 bdrv_release_dirty_bitmap(bs, s-dirty_bitmap);
 bdrv_iostatus_disable(s-target);
 
-data = g_malloc(sizeof(*data));
 data-ret = ret;
 block_job_defer_to_main_loop(s-common, mirror_exit, data);
 }
-- 
2.4.1




Re: [Qemu-block] [Qemu-devel] [PATCH 7/8] fdc: Fix MSR.RQM flag

2015-05-20 Thread Kevin Wolf
Am 19.05.2015 um 22:40 hat John Snow geschrieben:
 
 
 On 05/19/2015 11:36 AM, Kevin Wolf wrote:
  The RQM bit in MSR should be set whenever the guest is supposed to
  access the FIFO, and it should be cleared in all other cases. This is
  important so the guest can't continue writing/reading the FIFO beyond
  the length that it's suppossed to access (see CVE-2015-3456).
  
  Commit e9077462 fixed the CVE by adding code that avoids the buffer
  overflow; however it doesn't correct the wrong behaviour of the floppy
  controller which should already have cleared RQM.
  
  Currently, RQM stays set all the time and during all phases while a
  command is being processed. This is error-prone because the command has
  to explicitly clear the flag if it doesn't need data (and indeed, the
  two buggy commands that are the culprits for the CVE just forgot to do
  that).
  
  This patch clears RQM immediately as soon as all bytes that are expected
  have been received. If the the FIFO is used in the next phase, the flag
  has to be set explicitly there.
  
  This alone should have been enough to fix the CVE, but now we have two
  lines of defense - even better.
  
  Signed-off-by: Kevin Wolf kw...@redhat.com
  ---
   hw/block/fdc.c | 13 -
   1 file changed, 12 insertions(+), 1 deletion(-)
  
  diff --git a/hw/block/fdc.c b/hw/block/fdc.c
  index 8d322e0..c6a046e 100644
  --- a/hw/block/fdc.c
  +++ b/hw/block/fdc.c
  @@ -1165,7 +1165,9 @@ static void fdctrl_to_command_phase(FDCtrl *fdctrl)
   fdctrl-phase = FD_PHASE_COMMAND;
   fdctrl-data_dir = FD_DIR_WRITE;
   fdctrl-data_pos = 0;
  +fdctrl-data_len = 1; /* Accept command byte, adjust for params later 
  */
   fdctrl-msr = ~(FD_MSR_CMDBUSY | FD_MSR_DIO);
  +fdctrl-msr |= FD_MSR_RQM;
   }
   
   /* Update the state to allow the guest to read out the command status.
  @@ -1380,7 +1382,7 @@ static void fdctrl_start_transfer(FDCtrl *fdctrl, int 
  direction)
   }
   }
   FLOPPY_DPRINTF(start non-DMA transfer\n);
  -fdctrl-msr |= FD_MSR_NONDMA;
  +fdctrl-msr |= FD_MSR_NONDMA | FD_MSR_RQM;
   if (direction != FD_DIR_WRITE)
   fdctrl-msr |= FD_MSR_DIO;
   /* IO based transfer: calculate len */
  @@ -1560,6 +1562,7 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
   }
   
   if (++fdctrl-data_pos == fdctrl-data_len) {
  +fdctrl-msr = ~FD_MSR_RQM;
 
 Doesn't stop_transfer set this flag back right away?

It does, by switching to the result phase.

I think it's clearer to disable the bit anywhere where the FIFO has
received as many bytes as it's supposed to, even if the next phase is
started immediately and reenables it.

In real hardware, sending a byte causes the FDC to disable RQM, then
process the byte (which means completing command execution for this code
path), then reenable RQM if needed.

Currently our code is completely synchronous, so we could ignore this
detail because the state between clearing and setting RQM isn't
observable by the guest. If we ever introduce something asynchronous in
the path, we will need this though - and modelling real hardware more
precisely has never hurt anyway.

Kevin



[Qemu-block] [PATCH v4 5/5] raw-posix: Introduce hdev_is_sg()

2015-05-20 Thread Dimitris Aragiorgis
Until now, an SG device was identified only by checking if its path
started with /dev/sg. Then, hdev_open() set bs-sg accordingly.
This is very fragile, e.g. it fails with symlinks or relative paths.
We should rely on the actual properties of the device instead of the
specified file path.

Test for an SG device (e.g. /dev/sg0) by ensuring that all of the
following holds:

 - The device supports the SG_GET_VERSION_NUM ioctl
 - The device supports the SG_GET_SCSI_ID ioctl
 - The specified file name corresponds to a character device

Signed-off-by: Dimitris Aragiorgis dim...@arrikto.com
---
 block/raw-posix.c |   39 ---
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index ace228f..d84e5a0 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -57,6 +57,7 @@
 #include linux/fd.h
 #include linux/fs.h
 #include linux/hdreg.h
+#include scsi/sg.h
 #ifdef __s390__
 #include asm/dasd.h
 #endif
@@ -2081,15 +2082,38 @@ static void hdev_parse_filename(const char *filename, 
QDict *options,
 qdict_put_obj(options, filename, QOBJECT(qstring_from_str(filename)));
 }
 
+static int hdev_is_sg(BlockDriverState *bs)
+{
+
+#if defined(__linux__)
+
+struct stat st;
+struct sg_scsi_id scsiid;
+int sg_version;
+
+if (!bdrv_ioctl(bs, SG_GET_VERSION_NUM, sg_version) 
+!bdrv_ioctl(bs, SG_GET_SCSI_ID, scsiid) 
+stat(bs-filename, st) = 0  S_ISCHR(st.st_mode)) {
+DPRINTF(SG device found: type=%d, version=%d\n,
+scsiid.scsi_type, sg_version);
+return 1;
+}
+
+#endif
+
+return 0;
+}
+
 static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
  Error **errp)
 {
 BDRVRawState *s = bs-opaque;
 Error *local_err = NULL;
 int ret;
-const char *filename = qdict_get_str(options, filename);
 
 #if defined(__APPLE__)  defined(__MACH__)
+const char *filename = qdict_get_str(options, filename);
+
 if (strstart(filename, /dev/cdrom, NULL)) {
 kern_return_t kernResult;
 io_iterator_t mediaIterator;
@@ -2118,16 +2142,6 @@ static int hdev_open(BlockDriverState *bs, QDict 
*options, int flags,
 #endif
 
 s-type = FTYPE_FILE;
-#if defined(__linux__)
-{
-char resolved_path[ MAXPATHLEN ], *temp;
-
-temp = realpath(filename, resolved_path);
-if (temp  strstart(temp, /dev/sg, NULL)) {
-bs-sg = 1;
-}
-}
-#endif
 
 ret = raw_open_common(bs, options, flags, 0, local_err);
 if (ret  0) {
@@ -2137,6 +2151,9 @@ static int hdev_open(BlockDriverState *bs, QDict 
*options, int flags,
 return ret;
 }
 
+/* Since this does ioctl the device must be already opened */
+bs-sg = hdev_is_sg(bs);
+
 if (flags  BDRV_O_RDWR) {
 ret = check_hdev_writable(s);
 if (ret  0) {
-- 
1.7.10.4




Re: [Qemu-block] [Qemu-devel] [PATCH 6/8] fdc: Disentangle phases in fdctrl_read_data()

2015-05-20 Thread John Snow


On 05/20/2015 04:25 AM, Kevin Wolf wrote:
 Am 19.05.2015 um 22:40 hat John Snow geschrieben:


 On 05/19/2015 11:36 AM, Kevin Wolf wrote:
 This commit makes similar improvements as have already been made to the
 write function: Instead of relying on a flag in the MSR to distinguish
 controller phases, use the explicit phase that we store now. Assertions
 of the right MSR flags are added.

 Signed-off-by: Kevin Wolf kw...@redhat.com
 ---
  hw/block/fdc.c | 33 +++--
  1 file changed, 23 insertions(+), 10 deletions(-)

 diff --git a/hw/block/fdc.c b/hw/block/fdc.c
 index cbf7abf..8d322e0 100644
 --- a/hw/block/fdc.c
 +++ b/hw/block/fdc.c
 @@ -1533,9 +1533,16 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
  FLOPPY_DPRINTF(error: controller not ready for reading\n);
  return 0;
  }
 +
 +/* If data_len spans multiple sectors, the current position in the FIFO
 + * wraps around while fdctrl-data_pos is the real position in the 
 whole
 + * request. */
  pos = fdctrl-data_pos;
  pos %= FD_SECTOR_LEN;
 -if (fdctrl-msr  FD_MSR_NONDMA) {
 +
 +switch (fdctrl-phase) {
 +case FD_PHASE_EXECUTION:
 +assert(fdctrl-msr  FD_MSR_NONDMA);
  if (pos == 0) {
  if (fdctrl-data_pos != 0)
  if (!fdctrl_seek_to_next_sect(fdctrl, cur_drv)) {
 @@ -1551,20 +1558,26 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
  memset(fdctrl-fifo, 0, FD_SECTOR_LEN);
  }
  }
 -}
 -retval = fdctrl-fifo[pos];
 -if (++fdctrl-data_pos == fdctrl-data_len) {
 -fdctrl-data_pos = 0;

 I suppose data_pos is now reset by either stop_transfer (via
 to_result_phase) or to_command_phase, so this is OK.
 
 Yes, that was redundant code.
 
 -/* Switch from transfer mode to status mode
 - * then from status mode to command mode
 - */
 -if (fdctrl-msr  FD_MSR_NONDMA) {
 +
 +if (++fdctrl-data_pos == fdctrl-data_len) {
  fdctrl_stop_transfer(fdctrl, 0x00, 0x00, 0x00);
 -} else {
 +}
 +break;
 +
 +case FD_PHASE_RESULT:
 +assert(!(fdctrl-msr  FD_MSR_NONDMA));
 +if (++fdctrl-data_pos == fdctrl-data_len) {

 Not a terribly big fan of moving this pointer independently inside of
 each case statement, but I guess the alternative does look a lot worse.
 Having things separated by phases is a lot easier to follow.
 
 I'm not too happy about it either, but I couldn't think of anything
 better. Having two different switches almost immediately after each
 other, with only the if line in between, would look really awkward and
 be hard to read. And the old code isn't nice either.
 
 If you have any idea for a better solution, let me know.
 
 Kevin
 

I'm all complaints and no solutions. I believe I gave you my R-b anyway. :)



Re: [Qemu-block] [PATCH] MAINTAINERS: Add header files to Block Layer Core section

2015-05-20 Thread Alberto Garcia
On Wed 20 May 2015 12:05:55 PM CEST, Kevin Wolf kw...@redhat.com wrote:
 Suggested-by: Markus Armbruster arm...@redhat.com
 Signed-off-by: Kevin Wolf kw...@redhat.com

Reviewed-by: Alberto Garcia be...@igalia.com

Berto



Re: [Qemu-block] [Qemu-devel] [PATCH v4 11/11] qmp-commands.hx: Update the supported 'transaction' operations

2015-05-20 Thread Kashyap Chamarthy
On Tue, May 19, 2015 at 11:37:32AM -0400, John Snow wrote:
  On Mon, May 18, 2015 at 06:22:22PM +0200, Max Reitz wrote:
  On 12.05.2015 01:04, John Snow wrote:

[. . .]

  diff --git a/qmp-commands.hx b/qmp-commands.hx
  index 7506774..363126a 100644
  --- a/qmp-commands.hx
  +++ b/qmp-commands.hx
  @@ -1238,11 +1238,14 @@ SQMP
   transaction
   ---
  -Atomically operate on one or more block devices.  The only supported 
  operations
  -for now are drive-backup, internal and external snapshotting.  A list of
  -dictionaries is accepted, that contains the actions to be performed.
  -If there is any failure performing any of the operations, all operations
  -for the group are abandoned.
  +Atomically operate on one or more block devices.  Operations that are
  +currently supported: drive-backup, blockdev-backup,
  +blockdev-snapshot-sync, blockdev-snapshot-internal-sync, abort,
  +block-dirty-bitmap-add, block-dirty-bitmap-clear
 
  Hm, seven operations... Worth making it a real list?
  
  I don't have a preference. FWIW, I think it still retains the
  readability. And, not sure if it's worth the churn.
  
  (refer to the
  +qemu/qapi-schema.json file for minimum required QEMU versions for these
  +operations).  A list of dictionaries is accepted, that contains the
  +actions to be performed.  If there is any failure performing any of the
  +operations, all operations for the group are abandoned.
   For external snapshots, the dictionary contains the device, the file to 
  use for
   the new snapshot, and the format.  The default format, if not specified, 
  is
 
  
 
 I have to respin the series anyway, so if you want Kashyap, you can
 rewrite this and send it to me privately for inclusion

Done. Sent you (and Max) the revised patch. Max provided his R-b. Max
wondered whether you'd include this in your series on #qemu. I'll assume
you'll do.

Thanks.

-- 
/kashyap



Re: [Qemu-block] [Qemu-devel] [PATCH 3/8] fdc: Introduce fdctrl-phase

2015-05-20 Thread John Snow


On 05/20/2015 05:24 AM, Peter Maydell wrote:
 On 20 May 2015 at 09:43, Kevin Wolf kw...@redhat.com wrote:
 Am 20.05.2015 um 10:06 hat Peter Maydell geschrieben:
 That handles migration, which is good. But I still think that
 storing the same information in two places in the device
 state (phase field and the register fields) is error-prone.

 That's actually my point. The registers are accessed everywhere in the
 code, whereas phase transitions are in very few well-defined places
 (there are exactly four of them, iirc). If they get out of sync, chances
 are that the bug is in the register value, not in the phase. When we
 know what phase we're in, we can assert the bits and actually catch such
 bugs.

 If we want to switch to having a phase field we should calculate
 the relevant register bits on demand based on the phase, rather
 than keeping both copies of the state in sync manually.

 That doesn't work, unfortunately. Some register bits imply a specific
 phase (assuming correct code), but you can't derive the exact bits just
 from the phase.
 
 Having now dug out a copy of the 82078 spec, I agree that the state
 isn't derivable purely from the register values in the general case.
 The controller clearly has a state machine internally but it doesn't
 surface that in the register state except indirectly.
 
 -- PMM
 

So even if /currently/ we can reconstitute it from the register values,
we may eventually be unable to.

post_load will work for now, but I fear the case (in ten years) when
someone else cleans up FDC code but fails to realize that the phase is
not explicitly migrated.



[Qemu-block] [PATCH] MAINTAINERS: Split Block QAPI, monitor, command line off core

2015-05-20 Thread Markus Armbruster
Kevin and Stefan asked me to take care of this part.

Signed-off-by: Markus Armbruster arm...@redhat.com
---
 MAINTAINERS | 8 
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index b3552b2..8df0c6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -812,6 +812,14 @@ F: block/stream.h
 F: block/mirror.c
 T: git git://github.com/codyprime/qemu-kvm-jtc.git block
 
+Block QAPI, monitor, command line
+M: Markus Armbruster arm...@redhat.com
+S: Supported
+F: blockdev.c
+F: block/qapi.c
+F: qapi/block*.json
+T: git git://repo.or.cz/qemu/armbru.git block-next
+
 Character Devices
 M: Paolo Bonzini pbonz...@redhat.com
 S: Maintained
-- 
1.9.3




Re: [Qemu-block] [Qemu-devel] [PATCH 7/8] fdc: Fix MSR.RQM flag

2015-05-20 Thread John Snow


On 05/20/2015 04:14 AM, Kevin Wolf wrote:
 Am 19.05.2015 um 22:40 hat John Snow geschrieben:


 On 05/19/2015 11:36 AM, Kevin Wolf wrote:
 The RQM bit in MSR should be set whenever the guest is supposed to
 access the FIFO, and it should be cleared in all other cases. This is
 important so the guest can't continue writing/reading the FIFO beyond
 the length that it's suppossed to access (see CVE-2015-3456).

 Commit e9077462 fixed the CVE by adding code that avoids the buffer
 overflow; however it doesn't correct the wrong behaviour of the floppy
 controller which should already have cleared RQM.

 Currently, RQM stays set all the time and during all phases while a
 command is being processed. This is error-prone because the command has
 to explicitly clear the flag if it doesn't need data (and indeed, the
 two buggy commands that are the culprits for the CVE just forgot to do
 that).

 This patch clears RQM immediately as soon as all bytes that are expected
 have been received. If the the FIFO is used in the next phase, the flag
 has to be set explicitly there.

 This alone should have been enough to fix the CVE, but now we have two
 lines of defense - even better.

 Signed-off-by: Kevin Wolf kw...@redhat.com
 ---
  hw/block/fdc.c | 13 -
  1 file changed, 12 insertions(+), 1 deletion(-)

 diff --git a/hw/block/fdc.c b/hw/block/fdc.c
 index 8d322e0..c6a046e 100644
 --- a/hw/block/fdc.c
 +++ b/hw/block/fdc.c
 @@ -1165,7 +1165,9 @@ static void fdctrl_to_command_phase(FDCtrl *fdctrl)
  fdctrl-phase = FD_PHASE_COMMAND;
  fdctrl-data_dir = FD_DIR_WRITE;
  fdctrl-data_pos = 0;
 +fdctrl-data_len = 1; /* Accept command byte, adjust for params later 
 */
  fdctrl-msr = ~(FD_MSR_CMDBUSY | FD_MSR_DIO);
 +fdctrl-msr |= FD_MSR_RQM;
  }
  
  /* Update the state to allow the guest to read out the command status.
 @@ -1380,7 +1382,7 @@ static void fdctrl_start_transfer(FDCtrl *fdctrl, int 
 direction)
  }
  }
  FLOPPY_DPRINTF(start non-DMA transfer\n);
 -fdctrl-msr |= FD_MSR_NONDMA;
 +fdctrl-msr |= FD_MSR_NONDMA | FD_MSR_RQM;
  if (direction != FD_DIR_WRITE)
  fdctrl-msr |= FD_MSR_DIO;
  /* IO based transfer: calculate len */
 @@ -1560,6 +1562,7 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
  }
  
  if (++fdctrl-data_pos == fdctrl-data_len) {
 +fdctrl-msr = ~FD_MSR_RQM;

 Doesn't stop_transfer set this flag back right away?
 
 It does, by switching to the result phase.
 
 I think it's clearer to disable the bit anywhere where the FIFO has
 received as many bytes as it's supposed to, even if the next phase is
 started immediately and reenables it.
 
 In real hardware, sending a byte causes the FDC to disable RQM, then
 process the byte (which means completing command execution for this code
 path), then reenable RQM if needed.
 
 Currently our code is completely synchronous, so we could ignore this
 detail because the state between clearing and setting RQM isn't
 observable by the guest. If we ever introduce something asynchronous in
 the path, we will need this though - and modelling real hardware more
 precisely has never hurt anyway.
 
 Kevin
 

OK, just amend the commit message to explain that clearing the bits here
is to accommodate a possible asynchronous refactor, or to be more
explicit, or etc etc etc.

--js




Re: [Qemu-block] [Qemu-devel] [PATCH v4 06/11] block: add refcount to Job object

2015-05-20 Thread Stefan Hajnoczi
On Tue, May 19, 2015 at 06:15:23PM -0400, John Snow wrote:
 On 05/18/2015 11:45 AM, Stefan Hajnoczi wrote:
  On Mon, May 11, 2015 at 07:04:21PM -0400, John Snow wrote:
  If we want to get at the job after the life of the job,
  we'll need a refcount for this object.
 
  This may occur for example if we wish to inspect the actions
  taken by a particular job after a transactional group of jobs
  runs, and further actions are required.
 
  Signed-off-by: John Snow js...@redhat.com
  Reviewed-by: Max Reitz mre...@redhat.com
  ---
   blockjob.c   | 18 --
   include/block/blockjob.h | 21 +
   2 files changed, 37 insertions(+), 2 deletions(-)
  
  I think the only reason for this refcount is so that
  backup_transaction_complete() can be called.  It accesses
  BackupBlockJob-sync_bitmap so the BackupBlockJob instance needs to be
  alive.
  
  The bitmap refcount is incremented in blockdev.c, not block/backup.c, so
  it is fine to drop backup_transaction_complete() and decrement the
  bitmap refcount in blockdev.c instead.
  
  If you do that then there is no need to add a refcount to block job.
  This would simplify things.
  
 
 So you are suggesting that I cache the bitmap reference (instead of the
 job reference) and then just increment/decrement it directly in
 .prepare, .abort and .cb as needed.
 
 You did find the disparity with the reference count for the bitmap, at
 least: that is kind of gross. I was coincidentally thinking of punting
 it back into a backup_transaction_start to keep more code /out/ of
 blockdev...
 
 I'll sit on this one for a few more minutes. I'll try to get rid of the
 job refcnt, but I also want to keep the transaction actions as tidy as I
 can.
 
 Maybe it's too much abstraction for a simple task, but I wanted to make
 sure I wasn't hacking in transaction callbacks in a manner where they'd
 only be useful to me, for only this one case. It's conceivable that if
 anyone else attempts to use this callback hijacking mechanism that
 they'll need to find a way to modify objects within the Job without
 pulling everything up to the transaction actions, too.

Hmm...I think this could be achieved without refcounting in
transactions, actions, or block jobs.

Create a separate MultiCompletion struct with a counter and list of
callbacks:

typedef struct MultiCompletionCB {
BlockCompletionFunc cb;
void *opaque;
QSLIST_ENTRY(MultiCompletionCB) list;
} MultiCompletionCB;

typedef struct {
unsigned refcnt; /* callbacks invoked when this reaches zero */
QSLIST_HEAD(, MultiCompletionCB) callbacks;
int ret;
} MultiCompletion;

void multicomp_add_cb(MultiCompletion *mc, BlockCompletionFunc cb, void 
*opaque);

/* Note the first argument is MultiCompletion* but this prototype
 * allows it to be used as a blockjob cb.
 */
void multicomp_complete(void *opaque, int ret)
{
MultiCompletion *mc = opaque;

if (ret  0) {
mc-ret = ret;
}

if (--mc-refcnt == 0) {
MultiCompletionCB *cb_data;
while ((cb_data = QSLIST_FIRST(mc-callbacks)) != NULL) {
cb_data-cb(cb_data-opaque, mc-ret);

QSLIST_REMOVE_HEAD(mc-callbacks, list);
g_free(cb_data);
}

g_free(mc);
}
}

qmp_transaction() creates a MultiCompletion and passes it to action
.prepare().  If an action wishes to join the MultiCompletion, it calls
multicomp_add_cb() after creating a block job:

static void backup_completion(void *opaque, int ret)
{
HBitmap *bmap = opaque;
if (ret  0) {
...merge bitmap back in...
} else {
...drop frozen bitmap...
}
}

...
backup_start(..., multicomp_completion, mc);
multicomp_add_cb(mc, backup_completion, bmap);

The multicomp_complete() function gets called by a blockjob cb function.

This approach is more reusable since MultiCompletion is independent of
transactions or block jobs.  It also doesn't hold on to transactions,
actions, or block jobs after they have served their purpose.

Is this along the lines you were thinking about?

Stefan


pgp8guvPLI0mt.pgp
Description: PGP signature


Re: [Qemu-block] [Qemu-devel] [PATCH 6/8] fdc: Disentangle phases in fdctrl_read_data()

2015-05-20 Thread Kevin Wolf
Am 19.05.2015 um 22:40 hat John Snow geschrieben:
 
 
 On 05/19/2015 11:36 AM, Kevin Wolf wrote:
  This commit makes similar improvements as have already been made to the
  write function: Instead of relying on a flag in the MSR to distinguish
  controller phases, use the explicit phase that we store now. Assertions
  of the right MSR flags are added.
  
  Signed-off-by: Kevin Wolf kw...@redhat.com
  ---
   hw/block/fdc.c | 33 +++--
   1 file changed, 23 insertions(+), 10 deletions(-)
  
  diff --git a/hw/block/fdc.c b/hw/block/fdc.c
  index cbf7abf..8d322e0 100644
  --- a/hw/block/fdc.c
  +++ b/hw/block/fdc.c
  @@ -1533,9 +1533,16 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
   FLOPPY_DPRINTF(error: controller not ready for reading\n);
   return 0;
   }
  +
  +/* If data_len spans multiple sectors, the current position in the FIFO
  + * wraps around while fdctrl-data_pos is the real position in the 
  whole
  + * request. */
   pos = fdctrl-data_pos;
   pos %= FD_SECTOR_LEN;
  -if (fdctrl-msr  FD_MSR_NONDMA) {
  +
  +switch (fdctrl-phase) {
  +case FD_PHASE_EXECUTION:
  +assert(fdctrl-msr  FD_MSR_NONDMA);
   if (pos == 0) {
   if (fdctrl-data_pos != 0)
   if (!fdctrl_seek_to_next_sect(fdctrl, cur_drv)) {
  @@ -1551,20 +1558,26 @@ static uint32_t fdctrl_read_data(FDCtrl *fdctrl)
   memset(fdctrl-fifo, 0, FD_SECTOR_LEN);
   }
   }
  -}
  -retval = fdctrl-fifo[pos];
  -if (++fdctrl-data_pos == fdctrl-data_len) {
  -fdctrl-data_pos = 0;
 
 I suppose data_pos is now reset by either stop_transfer (via
 to_result_phase) or to_command_phase, so this is OK.

Yes, that was redundant code.

  -/* Switch from transfer mode to status mode
  - * then from status mode to command mode
  - */
  -if (fdctrl-msr  FD_MSR_NONDMA) {
  +
  +if (++fdctrl-data_pos == fdctrl-data_len) {
   fdctrl_stop_transfer(fdctrl, 0x00, 0x00, 0x00);
  -} else {
  +}
  +break;
  +
  +case FD_PHASE_RESULT:
  +assert(!(fdctrl-msr  FD_MSR_NONDMA));
  +if (++fdctrl-data_pos == fdctrl-data_len) {
 
 Not a terribly big fan of moving this pointer independently inside of
 each case statement, but I guess the alternative does look a lot worse.
 Having things separated by phases is a lot easier to follow.

I'm not too happy about it either, but I couldn't think of anything
better. Having two different switches almost immediately after each
other, with only the if line in between, would look really awkward and
be hard to read. And the old code isn't nice either.

If you have any idea for a better solution, let me know.

Kevin



[Qemu-block] [PATCH v4 3/5] raw-posix: DPRINTF instead of DEBUG_BLOCK_PRINT

2015-05-20 Thread Dimitris Aragiorgis
Building the QEMU tools fails if we #define DEBUG_BLOCK inside
block/raw-posix.c. Here instead of adding qemu-log.o in block-obj-y
so that DEBUG_BLOCK_PRINT can be used, we substitute the latter with
a simple DPRINTF() (that does not cause bit-rot).

Signed-off-by: Dimitris Aragiorgis dim...@arrikto.com
---
 block/raw-posix.c |   26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index fb58440..438bf0b 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -97,12 +97,18 @@
 //#define DEBUG_FLOPPY
 
 //#define DEBUG_BLOCK
-#if defined(DEBUG_BLOCK)
-#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
-{ qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
+
+#ifdef DEBUG_BLOCK
+# define DEBUG_BLOCK_PRINT 1
 #else
-#define DEBUG_BLOCK_PRINT(formatCstr, ...)
+# define DEBUG_BLOCK_PRINT 0
 #endif
+#define DPRINTF(fmt, ...) \
+do { \
+if (DEBUG_BLOCK_PRINT) { \
+printf(fmt, ## __VA_ARGS__); \
+} \
+} while (0)
 
 /* OS X does not have O_DSYNC */
 #ifndef O_DSYNC
@@ -1016,6 +1022,7 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb)
 static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes)
 {
 struct xfs_flock64 fl;
+int err;
 
 memset(fl, 0, sizeof(fl));
 fl.l_whence = SEEK_SET;
@@ -1023,8 +1030,9 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t 
offset, uint64_t bytes)
 fl.l_len = bytes;
 
 if (xfsctl(NULL, s-fd, XFS_IOC_ZERO_RANGE, fl)  0) {
-DEBUG_BLOCK_PRINT(cannot write zero range (%s)\n, strerror(errno));
-return -errno;
+err = errno;
+DPRINTF(cannot write zero range (%s)\n, strerror(errno));
+return -err;
 }
 
 return 0;
@@ -1033,6 +1041,7 @@ static int xfs_write_zeroes(BDRVRawState *s, int64_t 
offset, uint64_t bytes)
 static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 {
 struct xfs_flock64 fl;
+int err;
 
 memset(fl, 0, sizeof(fl));
 fl.l_whence = SEEK_SET;
@@ -1040,8 +1049,9 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, 
uint64_t bytes)
 fl.l_len = bytes;
 
 if (xfsctl(NULL, s-fd, XFS_IOC_UNRESVSP64, fl)  0) {
-DEBUG_BLOCK_PRINT(cannot punch hole (%s)\n, strerror(errno));
-return -errno;
+err = errno;
+DPRINTF(cannot punch hole (%s)\n, strerror(errno));
+return -err;
 }
 
 return 0;
-- 
1.7.10.4




[Qemu-block] [PATCH v4 0/5] Some fixes related to scsi-generic

2015-05-20 Thread Dimitris Aragiorgis
Hi all,

These four patches make slight changes to the way QEMU handles SCSI
generic devices to fix a number of small problems.

I am sending them against the master branch, since I don't know if they
can be considered bugfixes.

Thanks,
dimara

v4 (rebased to current master):
* Avoid errno clobbering with DPRINTF + strerror() (Eric's comment)
* Use {} in #define macro even if it is not necessary (single commands)

v3 (rebased to current master):
* Avoid bit-rot in DPRINTF (adopt Eric's suggestion)
* Address Kevin's comments (DEBUF_FLOPPY, line  80 chars, SG device)
* Mention Kevin's comment wrt disk flush in the corresponding commit

v2:
* remove duplicate check for sg inside iscsi_co_flush()
* remove DEBUG_BLOCK_PRINT in block/raw-posix.c
* use DPRINTF for debugging in block/raw-posix.c

PS: Paolo suggested to use a tracepoint inside hdev_is_sg() but I chose DPRINTF
instead. It would make sense to add a tracepoint for bdrv_is_sg() (just like
most bdrv_* commands) but this is too much for now since it just returns the
bs-sg flag (and is not an actual driver function). If you insist I'll change
it in v3.

Dimitris Aragiorgis (5):
  block: Use bdrv_is_sg() everywhere
  Fix migration in case of scsi-generic
  raw-posix: DPRINTF instead of DEBUG_BLOCK_PRINT
  raw-posix: Use DPRINTF for DEBUG_FLOPPY
  raw-posix: Introduce hdev_is_sg()

 block.c   |6 ++--
 block/io.c|3 +-
 block/iscsi.c |4 ---
 block/raw-posix.c |   91 +++--
 4 files changed, 58 insertions(+), 46 deletions(-)

-- 
1.7.10.4




Re: [Qemu-block] [Qemu-devel] [PATCH 3/8] fdc: Introduce fdctrl-phase

2015-05-20 Thread Kevin Wolf
Am 20.05.2015 um 10:06 hat Peter Maydell geschrieben:
 On 20 May 2015 at 08:54, Kevin Wolf kw...@redhat.com wrote:
  Am 19.05.2015 um 22:57 hat Peter Maydell geschrieben:
  Yeah, if there's genuinely an underlying state machine that's
  not completely visible in registers you need to actually model it.
  You should probably then model the register bits by calculating
  them from the state rather than by changing them as you go along
  in parallel with moving the state machine around.
 
  I think the combination of registers is actually enough to reconstruct
  what state we're in, so it is derived (otherwise I would have fixed a
  bug that I'm not aware of). Adding logic to derive it in a post-load
  handler should be good enough.
 
 That handles migration, which is good. But I still think that
 storing the same information in two places in the device
 state (phase field and the register fields) is error-prone.

That's actually my point. The registers are accessed everywhere in the
code, whereas phase transitions are in very few well-defined places
(there are exactly four of them, iirc). If they get out of sync, chances
are that the bug is in the register value, not in the phase. When we
know what phase we're in, we can assert the bits and actually catch such
bugs.

 If we want to switch to having a phase field we should calculate
 the relevant register bits on demand based on the phase, rather
 than keeping both copies of the state in sync manually.

That doesn't work, unfortunately. Some register bits imply a specific
phase (assuming correct code), but you can't derive the exact bits just
from the phase.

In fact, I'm not even sure if we would still be able to derive the phase
from registers alone if we ever converted the FDC to use AIO instead of
synchronous I/O.

Kevin



[Qemu-block] [PATCH v4 1/5] block: Use bdrv_is_sg() everywhere

2015-05-20 Thread Dimitris Aragiorgis
Instead of checking bs-sg use bdrv_is_sg() consistently throughout
the code.

Signed-off-by: Dimitris Aragiorgis dim...@arrikto.com
Reviewed-by: Paolo Bonzini pbonz...@redhat.com
---
 block.c   |6 +++---
 block/iscsi.c |2 +-
 block/raw-posix.c |4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 7904098..d651b57 100644
--- a/block.c
+++ b/block.c
@@ -566,7 +566,7 @@ static int find_image_format(BlockDriverState *bs, const 
char *filename,
 int ret = 0;
 
 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
-if (bs-sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
+if (bdrv_is_sg(bs) || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
 *pdrv = bdrv_raw;
 return ret;
 }
@@ -598,7 +598,7 @@ static int refresh_total_sectors(BlockDriverState *bs, 
int64_t hint)
 BlockDriver *drv = bs-drv;
 
 /* Do not attempt drv-bdrv_getlength() on scsi-generic devices */
-if (bs-sg)
+if (bdrv_is_sg(bs))
 return 0;
 
 /* query actual device if possible, otherwise just trust the hint */
@@ -890,7 +890,7 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockDriverState *file,
 }
 
 assert(bdrv_opt_mem_align(bs) != 0);
-assert((bs-request_alignment != 0) || bs-sg);
+assert((bs-request_alignment != 0) || bdrv_is_sg(bs));
 return 0;
 
 free_and_fail:
diff --git a/block/iscsi.c b/block/iscsi.c
index 8fca1d3..502a81f 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -627,7 +627,7 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs)
 IscsiLun *iscsilun = bs-opaque;
 struct IscsiTask iTask;
 
-if (bs-sg) {
+if (bdrv_is_sg(bs)) {
 return 0;
 }
 
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 24d8582..fb58440 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -302,9 +302,9 @@ static void raw_probe_alignment(BlockDriverState *bs, int 
fd, Error **errp)
 BDRVRawState *s = bs-opaque;
 char *buf;
 
-/* For /dev/sg devices the alignment is not really used.
+/* For SCSI generic devices the alignment is not really used.
With buffered I/O, we don't have any restrictions. */
-if (bs-sg || !s-needs_alignment) {
+if (bdrv_is_sg(bs) || !s-needs_alignment) {
 bs-request_alignment = 1;
 s-buf_align = 1;
 return;
-- 
1.7.10.4




[Qemu-block] [PATCH v4 2/5] Fix migration in case of scsi-generic

2015-05-20 Thread Dimitris Aragiorgis
During migration, QEMU uses fsync()/fdatasync() on the open file
descriptor for read-write block devices to flush data just before
stopping the VM.

However, fsync() on a scsi-generic device returns -EINVAL which
causes the migration to fail. This patch skips flushing data in case
of an SG device, since submitting SCSI commands directly via an SG
character device (e.g. /dev/sg0) bypasses the page cache completely,
anyway.

Note that fsync() not only flushes the page cache but also the disk
cache. The scsi-generic device never sends flushes, and for
migration it assumes that the same SCSI device is used by the
destination host, so it does not issue any SCSI SYNCHRONIZE CACHE
(10) command.

Finally, remove the bdrv_is_sg() test from iscsi_co_flush() since
this is now redundant (we flush the underlying protocol at the end
of bdrv_co_flush() which, with this patch, we never reach).

Signed-off-by: Dimitris Aragiorgis dim...@arrikto.com
---
 block/io.c|3 ++-
 block/iscsi.c |4 
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index 1ce62c4..922dc07 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2231,7 +2231,8 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
 int ret;
 
-if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
+if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
+bdrv_is_sg(bs)) {
 return 0;
 }
 
diff --git a/block/iscsi.c b/block/iscsi.c
index 502a81f..965978b 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -627,10 +627,6 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState 
*bs)
 IscsiLun *iscsilun = bs-opaque;
 struct IscsiTask iTask;
 
-if (bdrv_is_sg(bs)) {
-return 0;
-}
-
 if (!iscsilun-force_next_flush) {
 return 0;
 }
-- 
1.7.10.4




[Qemu-block] [PATCH v4 4/5] raw-posix: Use DPRINTF for DEBUG_FLOPPY

2015-05-20 Thread Dimitris Aragiorgis
Get rid of several #ifdef DEBUG_FLOPPY and substitute them with
DPRINTF.

Signed-off-by: Dimitris Aragiorgis dim...@arrikto.com
---
 block/raw-posix.c |   22 +-
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 438bf0b..ace228f 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -94,8 +94,6 @@
 #include xfs/xfs.h
 #endif
 
-//#define DEBUG_FLOPPY
-
 //#define DEBUG_BLOCK
 
 #ifdef DEBUG_BLOCK
@@ -2167,16 +2165,12 @@ static int fd_open(BlockDriverState *bs)
 (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s-fd_open_time) = 
FD_OPEN_TIMEOUT) {
 qemu_close(s-fd);
 s-fd = -1;
-#ifdef DEBUG_FLOPPY
-printf(Floppy closed\n);
-#endif
+DPRINTF(Floppy closed\n);
 }
 if (s-fd  0) {
 if (s-fd_got_error 
 (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s-fd_error_time)  
FD_OPEN_TIMEOUT) {
-#ifdef DEBUG_FLOPPY
-printf(No floppy (open delayed)\n);
-#endif
+DPRINTF(No floppy (open delayed)\n);
 return -EIO;
 }
 s-fd = qemu_open(bs-filename, s-open_flags  ~O_NONBLOCK);
@@ -2185,14 +2179,10 @@ static int fd_open(BlockDriverState *bs)
 s-fd_got_error = 1;
 if (last_media_present)
 s-fd_media_changed = 1;
-#ifdef DEBUG_FLOPPY
-printf(No floppy\n);
-#endif
+DPRINTF(No floppy\n);
 return -EIO;
 }
-#ifdef DEBUG_FLOPPY
-printf(Floppy opened\n);
-#endif
+DPRINTF(Floppy opened\n);
 }
 if (!last_media_present)
 s-fd_media_changed = 1;
@@ -2460,9 +2450,7 @@ static int floppy_media_changed(BlockDriverState *bs)
 fd_open(bs);
 ret = s-fd_media_changed;
 s-fd_media_changed = 0;
-#ifdef DEBUG_FLOPPY
-printf(Floppy changed=%d\n, ret);
-#endif
+DPRINTF(Floppy changed=%d\n, ret);
 return ret;
 }
 
-- 
1.7.10.4




Re: [Qemu-block] [PATCH v3 00/13] main-loop: Get rid of fd_read_poll and qemu_set_fd_handler2

2015-05-20 Thread Stefan Hajnoczi
On Wed, May 20, 2015 at 8:39 AM, Paolo Bonzini pbonz...@redhat.com wrote:
 On 20/05/2015 08:38, Fam Zheng wrote:
 On Wed, 05/20 08:26, Paolo Bonzini wrote:


 On 19/05/2015 17:02, Stefan Hajnoczi wrote:
 1. Convert everything like you converted qemu-nbd.c.  This is a
 conservative approach and we can be confident that behavior is
 unchanged.

 So, that means whenever you change receive_disabled you call a new
 callback on the peer?  In addition, whenever the count of
 receive-disabled ports switches from zero to non-zero or vice versa,
 hubs need to inform all its ports.

 There are just two places that set/clear receive_disabled,
 qemu_deliver_packet and qemu_flush_or_purge_queued_packets, but I
 think a new API is needed to implement the callback for hubs
 (qemu_send_enable/qemu_send_disable).

 I think .can_receive is the harder one, I'm not sure it's feasible - each
 device has its own set of conditions, so it will be a huge change.

 The 1-0 transition is easy because it happens when message delivery
 fails.  I know the network code very little, but I think queuing exactly
 one packet in this case should be acceptable.  If this is true, the
 network code can detect the 1-0 transition automatically.

That's correct.  One packet is queued the first time
qemu_net_queue_send_iov() goes false.

 The 0-1 transition should be easy in principle, because NICs are
 supposed to call qemu_flush_queued_packets when it happens.  Not that
 they do, but you can find some very old and partial work in branch
 rx-flush of my github repo.

That sounds good.  Failure to call qemu_flush_queued_packets is
already a problem today so this change would benefit some of the
existing NICs.

Stefan



[Qemu-block] [PATCH] MAINTAINERS: Add header files to Block Layer Core section

2015-05-20 Thread Kevin Wolf
Suggested-by: Markus Armbruster arm...@redhat.com
Signed-off-by: Kevin Wolf kw...@redhat.com
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index b3552b2..9ff7c36 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -785,6 +785,7 @@ S: Supported
 F: block*
 F: block/
 F: hw/block/
+F: include/block/
 F: qemu-img*
 F: qemu-io*
 F: tests/qemu-iotests/
-- 
1.8.3.1




Re: [Qemu-block] [Qemu-devel] [PATCH 3/8] fdc: Introduce fdctrl-phase

2015-05-20 Thread Peter Maydell
On 20 May 2015 at 09:43, Kevin Wolf kw...@redhat.com wrote:
 Am 20.05.2015 um 10:06 hat Peter Maydell geschrieben:
 That handles migration, which is good. But I still think that
 storing the same information in two places in the device
 state (phase field and the register fields) is error-prone.

 That's actually my point. The registers are accessed everywhere in the
 code, whereas phase transitions are in very few well-defined places
 (there are exactly four of them, iirc). If they get out of sync, chances
 are that the bug is in the register value, not in the phase. When we
 know what phase we're in, we can assert the bits and actually catch such
 bugs.

 If we want to switch to having a phase field we should calculate
 the relevant register bits on demand based on the phase, rather
 than keeping both copies of the state in sync manually.

 That doesn't work, unfortunately. Some register bits imply a specific
 phase (assuming correct code), but you can't derive the exact bits just
 from the phase.

Having now dug out a copy of the 82078 spec, I agree that the state
isn't derivable purely from the register values in the general case.
The controller clearly has a state machine internally but it doesn't
surface that in the register state except indirectly.

-- PMM



Re: [Qemu-block] [Qemu-devel] [PATCH 01/34] qdict: Add qdict_array_entries()

2015-05-20 Thread Alberto Garcia
On Fri 08 May 2015 10:06:35 PM CEST, Eric Blake wrote:

 +for (i = 0; i  UINT_MAX; i++) {
 +QObject *subqobj;
 +int subqdict_entries;
 +size_t slen = 32 + subqdict_len;
 +char indexstr[slen], prefix[slen];

 And more dependence on a working C99 compiler, thanks to variable
 length array (VLA).

 +size_t snprintf_ret;
 +
 +snprintf_ret = snprintf(indexstr, slen, %s%u, subqdict, i);
 +assert(snprintf_ret  slen);

 Since gcc may compile the allocation of indexstr into a malloc()
 anyways, would it be any simpler to just use g_strdup_printf()
 directly, instead of futzing around with VLA and snprintf() ourselves?
 It might mean less code, as some of the error checking is taken care
 of on your behalf.

Since the only difference between the two strings you are allocating is
the trailing dot, you could also save one malloc() by reusing the same
string and stripping the dot.

Alternatively you could allocate the memory outside the loop instead of
having to do it in every iteration. The size is always the same after
all.

Berto



Re: [Qemu-block] [Qemu-devel] [PATCH] MAINTAINERS: Split Block QAPI, monitor, command line off core

2015-05-20 Thread Eric Blake
On 05/20/2015 05:23 AM, Markus Armbruster wrote:
 Kevin and Stefan asked me to take care of this part.
 
 Signed-off-by: Markus Armbruster arm...@redhat.com
 ---
  MAINTAINERS | 8 
  1 file changed, 8 insertions(+)
 

Reviewed-by: Eric Blake ebl...@redhat.com

 diff --git a/MAINTAINERS b/MAINTAINERS
 index b3552b2..8df0c6a 100644
 --- a/MAINTAINERS
 +++ b/MAINTAINERS
 @@ -812,6 +812,14 @@ F: block/stream.h
  F: block/mirror.c
  T: git git://github.com/codyprime/qemu-kvm-jtc.git block
  
 +Block QAPI, monitor, command line
 +M: Markus Armbruster arm...@redhat.com
 +S: Supported
 +F: blockdev.c
 +F: block/qapi.c
 +F: qapi/block*.json
 +T: git git://repo.or.cz/qemu/armbru.git block-next
 +
  Character Devices
  M: Paolo Bonzini pbonz...@redhat.com
  S: Maintained
 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PULL 03/12] tests: Use qtest_add_data_func() consistently

2015-05-20 Thread Andreas Färber
Replace uses of g_test_add_data_func() for QTest test cases.

It is still valid to use it for any non-QTest test cases,
which are not run for multiple target binaries.

Suggested-by: John Snow js...@redhat.com
Reviewed-by: John Snow js...@redhat.com
Signed-off-by: Andreas Färber afaer...@suse.de
---
 tests/ahci-test.c   |  9 ++---
 tests/e1000-test.c  |  4 ++--
 tests/eepro100-test.c   |  5 ++---
 tests/endianness-test.c | 18 +-
 tests/pc-cpu-test.c | 13 ++---
 tests/qom-test.c|  4 ++--
 6 files changed, 23 insertions(+), 30 deletions(-)

diff --git a/tests/ahci-test.c b/tests/ahci-test.c
index 7c23bb2..9d12caa 100644
--- a/tests/ahci-test.c
+++ b/tests/ahci-test.c
@@ -1174,7 +1174,6 @@ static void test_io_interface(gconstpointer opaque)
 static void create_ahci_io_test(enum IOMode type, enum AddrMode addr,
 enum BuffLen len, enum OffsetType offset)
 {
-static const char *arch;
 char *name;
 AHCIIOTestOptions *opts = g_malloc(sizeof(AHCIIOTestOptions));
 
@@ -1183,17 +1182,13 @@ static void create_ahci_io_test(enum IOMode type, enum 
AddrMode addr,
 opts-io_type = type;
 opts-offset = offset;
 
-if (!arch) {
-arch = qtest_get_arch();
-}
-
-name = g_strdup_printf(/%s/ahci/io/%s/%s/%s/%s, arch,
+name = g_strdup_printf(ahci/io/%s/%s/%s/%s,
io_mode_str[type],
addr_mode_str[addr],
buff_len_str[len],
offset_str[offset]);
 
-g_test_add_data_func(name, opts, test_io_interface);
+qtest_add_data_func(name, opts, test_io_interface);
 g_free(name);
 }
 
diff --git a/tests/e1000-test.c b/tests/e1000-test.c
index 81f164d..7ca6d7e 100644
--- a/tests/e1000-test.c
+++ b/tests/e1000-test.c
@@ -44,8 +44,8 @@ int main(int argc, char **argv)
 for (i = 0; i  ARRAY_SIZE(models); i++) {
 char *path;
 
-path = g_strdup_printf(/%s/e1000/%s, qtest_get_arch(), models[i]);
-g_test_add_data_func(path, models[i], test_device);
+path = g_strdup_printf(e1000/%s, models[i]);
+qtest_add_data_func(path, models[i], test_device);
 }
 
 return g_test_run();
diff --git a/tests/eepro100-test.c b/tests/eepro100-test.c
index bf82526..8bfaccd 100644
--- a/tests/eepro100-test.c
+++ b/tests/eepro100-test.c
@@ -54,9 +54,8 @@ int main(int argc, char **argv)
 for (i = 0; i  ARRAY_SIZE(models); i++) {
 char *path;
 
-path = g_strdup_printf(/%s/eepro100/%s,
-   qtest_get_arch(), models[i]);
-g_test_add_data_func(path, models[i], test_device);
+path = g_strdup_printf(eepro100/%s, models[i]);
+qtest_add_data_func(path, models[i], test_device);
 }
 
 return g_test_run();
diff --git a/tests/endianness-test.c b/tests/endianness-test.c
index 92e17d2..7482ff7 100644
--- a/tests/endianness-test.c
+++ b/tests/endianness-test.c
@@ -298,17 +298,17 @@ int main(int argc, char **argv)
 if (strcmp(test_cases[i].arch, arch) != 0) {
 continue;
 }
-path = g_strdup_printf(/%s/endianness/%s,
-   arch, test_cases[i].machine);
-g_test_add_data_func(path, test_cases[i], test_endianness);
+path = g_strdup_printf(endianness/%s,
+   test_cases[i].machine);
+qtest_add_data_func(path, test_cases[i], test_endianness);
 
-path = g_strdup_printf(/%s/endianness/split/%s,
-   arch, test_cases[i].machine);
-g_test_add_data_func(path, test_cases[i], test_endianness_split);
+path = g_strdup_printf(endianness/split/%s,
+   test_cases[i].machine);
+qtest_add_data_func(path, test_cases[i], test_endianness_split);
 
-path = g_strdup_printf(/%s/endianness/combine/%s,
-   arch, test_cases[i].machine);
-g_test_add_data_func(path, test_cases[i], test_endianness_combine);
+path = g_strdup_printf(endianness/combine/%s,
+   test_cases[i].machine);
+qtest_add_data_func(path, test_cases[i], test_endianness_combine);
 }
 
 ret = g_test_run();
diff --git a/tests/pc-cpu-test.c b/tests/pc-cpu-test.c
index a0122d3..3505c7c 100644
--- a/tests/pc-cpu-test.c
+++ b/tests/pc-cpu-test.c
@@ -75,7 +75,6 @@ static void test_pc_without_cpu_add(gconstpointer data)
 
 static void add_pc_test_cases(void)
 {
-const char *arch = qtest_get_arch();
 QDict *response, *minfo;
 QList *list;
 const QListEntry *p;
@@ -119,15 +118,15 @@ static void add_pc_test_cases(void)
 (strcmp(mname, pc-0.12) == 0) ||
 (strcmp(mname, pc-0.11) == 0) ||
 (strcmp(mname, pc-0.10) == 0)) {
-path = g_strdup_printf(/%s/cpu/%s/init/%ux%ux%umaxcpus=%u,
-   arch, mname, data-sockets, 

Re: [Qemu-block] [PATCH COLO v4 00/15] Block replication for continuous checkpoints

2015-05-20 Thread Wen Congyang
On 05/21/2015 03:18 AM, Dr. David Alan Gilbert wrote:
 * Wen Congyang (we...@cn.fujitsu.com) wrote:
 Block replication is a very important feature which is used for
 continuous checkpoints(for example: COLO).

 Usage:
 Please refer to docs/block-replication.txt

 You can get the patch here:
 https://github.com/wencongyang/qemu-colo/commits/block-replication-v4

 You can get the patch with the other COLO patches here:
 https://github.com/wencongyang/qemu-colo/tree/colo_huawei_v4.7
 
 Hi,
   A couple of questions:
  1) I still trip the handle_aiocb_rw assertion occasionally;
   I see Kevin was asking for some detail on 
 http://lists.nongnu.org/archive/html/qemu-devel/2015-01/msg04507.html
  is that still the right fix?

I will send the newest fix today.

 
  2) The only stats I see on the block replication are the info block-jobs
 completed 'n' of 'm' - is 'n' there just the total number of blocks 
 written?
 Are there any more stats about?

There is nothing now. I add it to TODO list, and do it later.

 
 
 TODO:
 1. Test failover when the guest does too many I/O operations. If it
takes too much time, we need to optimize it.
 
 Limiting the size of the state needed for commit might help that; you can
 always trigger a new checkpoint if the size gets too big; it would also
 make it more realistic since it will probably fail at the moment if you
 fill up the RAM you're using to hold the hidden/active disks with a big
 write.

These codes have been changed in the newest version(will send it out soon).
In the newest codes, secondary vm read from active disk, and write to
secondary disk or active disk(only when the sectors are allocated in 
hidden/active
disk) after failover.

Thanks
Wen Congyang

 
 Dave
 
 2. Continuous block replication. It will be started after basic functions
are accepted.

 Changs Log:
 V4:
 1. Introduce a new driver replication to avoid touch nbd and qcow2.
 V3:
 1: use error_setg() instead of error_set()
 2. Add a new block job API
 3. Active disk, hidden disk and nbd target uses the same AioContext
 4. Add a testcase to test new hbitmap API
 V2:
 1. Redesign the secondary qemu(use image-fleecing)
 2. Use Error objects to return error message
 3. Address the comments from Max Reitz and Eric Blake

 Wen Congyang (15):
   docs: block replication's description
   allow writing to the backing file
   Allow creating backup jobs when opening BDS
   block: Parse backing_reference option to reference existing BDS
   Backup: clear all bitmap when doing block checkpoint
   Don't allow a disk use backing reference target
   Add new block driver interface to connect/disconnect the remote target
   NBD client: implement block driver interfaces to connect/disconnect
 NBD server
   Introduce a new -drive option to control whether to connect to remote
 target
   NBD client: connect to nbd server later
   Add new block driver interfaces to control block replication
   skip nbd_target when starting block replication
   quorum: implement block driver interfaces for block replication
   quorum: allow ignoring child errors
   Implement new driver for block replication

  block.c| 270 +++-
  block/Makefile.objs|   3 +-
  block/backup.c |  13 ++
  block/nbd.c|  67 --
  block/quorum.c | 142 -
  block/replication.c| 512 
 +
  blockdev.c |   8 +
  blockjob.c |  10 +
  docs/block-replication.txt | 179 
  include/block/block.h  |  10 +
  include/block/block_int.h  |  18 ++
  include/block/blockjob.h   |  12 ++
  qapi/block.json|  16 ++
  qemu-options.hx|   4 +
  tests/qemu-iotests/051 |  13 ++
  tests/qemu-iotests/051.out |  13 ++
  16 files changed, 1260 insertions(+), 30 deletions(-)
  create mode 100644 block/replication.c
  create mode 100644 docs/block-replication.txt

 -- 
 2.1.0

 --
 Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
 .
 




[Qemu-block] [PATCH COLO-Block v5 03/15] Allow creating backup jobs when opening BDS

2015-05-20 Thread Wen Congyang
When opening BDS, we need to create backup jobs for
image-fleecing.

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
Cc: Jeff Cody jc...@redhat.com
---
 block/Makefile.objs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 0d8c2a4..8dd9b9c 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -21,10 +21,10 @@ block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
 block-obj-y += write-threshold.o
+block-obj-y += backup.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
-common-obj-y += backup.o
 
 iscsi.o-cflags := $(LIBISCSI_CFLAGS)
 iscsi.o-libs   := $(LIBISCSI_LIBS)
-- 
2.1.0




[Qemu-block] [PATCH COLO-Block v5 12/15] skip nbd_target when starting block replication

2015-05-20 Thread Wen Congyang
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
---
 block.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/block.c b/block.c
index 74fd3d6..961e395 100644
--- a/block.c
+++ b/block.c
@@ -4221,6 +4221,10 @@ void bdrv_start_replication(BlockDriverState *bs, 
ReplicationMode mode,
 {
 BlockDriver *drv = bs-drv;
 
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKING_REFERENCE, NULL)) {
+return;
+}
+
 if (drv  drv-bdrv_start_replication) {
 drv-bdrv_start_replication(bs, mode, errp);
 } else if (bs-file) {
@@ -4234,6 +4238,10 @@ void bdrv_do_checkpoint(BlockDriverState *bs, Error 
**errp)
 {
 BlockDriver *drv = bs-drv;
 
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKING_REFERENCE, NULL)) {
+return;
+}
+
 if (drv  drv-bdrv_do_checkpoint) {
 drv-bdrv_do_checkpoint(bs, errp);
 } else if (bs-file) {
@@ -4247,6 +4255,10 @@ void bdrv_stop_replication(BlockDriverState *bs, bool 
failover, Error **errp)
 {
 BlockDriver *drv = bs-drv;
 
+if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKING_REFERENCE, NULL)) {
+return;
+}
+
 if (drv  drv-bdrv_stop_replication) {
 drv-bdrv_stop_replication(bs, failover, errp);
 } else if (bs-file) {
-- 
2.1.0




[Qemu-block] [PATCH COLO-Block v5 15/15] Implement new driver for block replication

2015-05-20 Thread Wen Congyang
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
---
 block/Makefile.objs |   1 +
 block/replication.c | 441 
 2 files changed, 442 insertions(+)
 create mode 100644 block/replication.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 8dd9b9c..8b1c587 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -22,6 +22,7 @@ block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
 block-obj-y += write-threshold.o
 block-obj-y += backup.o
+block-obj-y += replication.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
diff --git a/block/replication.c b/block/replication.c
new file mode 100644
index 000..da3b512
--- /dev/null
+++ b/block/replication.c
@@ -0,0 +1,441 @@
+#include qemu-common.h
+#include block/block_int.h
+#include block/blockjob.h
+#include block/nbd.h
+
+typedef struct BDRVReplicationState {
+ReplicationMode mode;
+int replication_state;
+char *export_name;
+NBDExport *exp;
+BlockDriverState *active_disk;
+BlockDriverState *hidden_disk;
+BlockDriverState *secondary_disk; /* nbd target */
+int error;
+} BDRVReplicationState;
+
+enum {
+BLOCK_REPLICATION_NONE, /* block replication is not started */
+BLOCK_REPLICATION_RUNNING,  /* block replication is running */
+BLOCK_REPLICATION_DONE, /* block replication is done(failover) */
+};
+
+#define COMMIT_CLUSTER_BITS 16
+#define COMMIT_CLUSTER_SIZE (1  COMMIT_CLUSTER_BITS)
+#define COMMIT_SECTORS_PER_CLUSTER (COMMIT_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+static void replication_stop(BlockDriverState *bs, bool failover, Error 
**errp);
+
+#define NBD_OPT_EXPORT  export
+#define REPLICATION_MODEmode
+static QemuOptsList replication_runtime_opts = {
+.name = replication,
+.head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
+.desc = {
+{
+.name = REPLICATION_MODE,
+.type = QEMU_OPT_STRING,
+},
+{
+.name = NBD_OPT_EXPORT,
+.type = QEMU_OPT_STRING,
+.help = The NBD server name,
+},
+{ /* end of list */ }
+},
+};
+
+static int replication_open(BlockDriverState *bs, QDict *options,
+int flags, Error **errp)
+{
+int ret;
+BDRVReplicationState *s = bs-opaque;;
+Error *local_err = NULL;
+QemuOpts *opts = NULL;
+const char *mode;
+
+ret = -EINVAL;
+opts = qemu_opts_create(replication_runtime_opts, NULL, 0, error_abort);
+qemu_opts_absorb_qdict(opts, options, local_err);
+if (local_err) {
+goto fail;
+}
+
+mode = qemu_opt_get(opts, REPLICATION_MODE);
+if (!mode) {
+error_setg(local_err, Missing the option mode);
+goto fail;
+}
+
+if (!strcmp(mode, primary)) {
+s-mode = REPLICATION_MODE_PRIMARY;
+} else if (!strcmp(mode, secondary)) {
+s-mode = REPLICATION_MODE_SECONDARY;
+} else {
+error_setg(local_err,
+   The option mode's value should be primary or secondary);
+goto fail;
+}
+
+if (s-mode == REPLICATION_MODE_SECONDARY) {
+s-export_name = g_strdup(qemu_opt_get(opts, NBD_OPT_EXPORT));
+if (!s-export_name) {
+error_setg(local_err, Missing the option export);
+goto fail;
+}
+}
+
+return 0;
+
+fail:
+qemu_opts_del(opts);
+/* propagate error */
+if (local_err) {
+error_propagate(errp, local_err);
+}
+return ret;
+}
+
+static void replication_close(BlockDriverState *bs)
+{
+BDRVReplicationState *s = bs-opaque;
+
+if (s-replication_state == BLOCK_REPLICATION_RUNNING) {
+replication_stop(bs, false, NULL);
+}
+
+g_free(s-export_name);
+}
+
+static int64_t replication_getlength(BlockDriverState *bs)
+{
+return bdrv_getlength(bs-file);
+}
+
+static int replication_get_io_status(BDRVReplicationState *s)
+{
+switch (s-replication_state) {
+case BLOCK_REPLICATION_NONE:
+return -EIO;
+case BLOCK_REPLICATION_RUNNING:
+return 0;
+case BLOCK_REPLICATION_DONE:
+return s-mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
+default:
+abort();
+}
+}
+
+static int replication_return_value(BDRVReplicationState *s, int ret)
+{
+if (s-mode == REPLICATION_MODE_SECONDARY) {
+return ret;
+}
+
+if (ret  0) {
+s-error = ret;
+ret = 0;
+}
+
+return ret;
+}
+
+static coroutine_fn int replication_co_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ int remaining_sectors,
+ QEMUIOVector *qiov)
+{
+BDRVReplicationState *s = bs-opaque;
+int ret;
+
+if (s-mode == REPLICATION_MODE_PRIMARY) {
+/* We 

[Qemu-block] [PATCH COLO-Block v5 05/15] Backup: clear all bitmap when doing block checkpoint

2015-05-20 Thread Wen Congyang
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
Cc: Jeff Cody jc...@redhat.com
---
 block/backup.c   | 13 +
 blockjob.c   | 10 ++
 include/block/blockjob.h | 12 
 3 files changed, 35 insertions(+)

diff --git a/block/backup.c b/block/backup.c
index d3f648d..d3d8ba7 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -210,11 +210,24 @@ static void backup_iostatus_reset(BlockJob *job)
 bdrv_iostatus_reset(s-target);
 }
 
+static void backup_do_checkpoint(BlockJob *job, Error **errp)
+{
+BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+
+if (backup_job-sync_mode != MIRROR_SYNC_MODE_NONE) {
+error_setg(errp, this feature or command is not currently supported);
+return;
+}
+
+hbitmap_reset_all(backup_job-bitmap);
+}
+
 static const BlockJobDriver backup_job_driver = {
 .instance_size  = sizeof(BackupBlockJob),
 .job_type   = BLOCK_JOB_TYPE_BACKUP,
 .set_speed  = backup_set_speed,
 .iostatus_reset = backup_iostatus_reset,
+.do_checkpoint  = backup_do_checkpoint,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
diff --git a/blockjob.c b/blockjob.c
index 2755465..9d2128a 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -399,3 +399,13 @@ void block_job_defer_to_main_loop(BlockJob *job,
 
 qemu_bh_schedule(data-bh);
 }
+
+void block_job_do_checkpoint(BlockJob *job, Error **errp)
+{
+if (!job-driver-do_checkpoint) {
+error_setg(errp, this feature or command is not currently supported);
+return;
+}
+
+job-driver-do_checkpoint(job, errp);
+}
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 57d8ef1..b832dc3 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -50,6 +50,9 @@ typedef struct BlockJobDriver {
  * manually.
  */
 void (*complete)(BlockJob *job, Error **errp);
+
+/** Optional callback for job types that support checkpoint. */
+void (*do_checkpoint)(BlockJob *job, Error **errp);
 } BlockJobDriver;
 
 /**
@@ -348,4 +351,13 @@ void block_job_defer_to_main_loop(BlockJob *job,
   BlockJobDeferToMainLoopFn *fn,
   void *opaque);
 
+/**
+ * block_job_do_checkpoint:
+ * @job: The job.
+ * @errp: Error object.
+ *
+ * Do block checkpoint on the specified job.
+ */
+void block_job_do_checkpoint(BlockJob *job, Error **errp);
+
 #endif
-- 
2.1.0




[Qemu-block] [PATCH COLO-Block v5 02/15] allow writing to the backing file

2015-05-20 Thread Wen Congyang
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
---
 block.c | 33 -
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 49d472f..adffa0e 100644
--- a/block.c
+++ b/block.c
@@ -1051,6 +1051,20 @@ out:
 bdrv_refresh_limits(bs, NULL);
 }
 
+#define ALLOW_WRITE_BACKING_FILEallow-write-backing-file
+static QemuOptsList backing_file_opts = {
+.name = backing_file,
+.head = QTAILQ_HEAD_INITIALIZER(backing_file_opts.head),
+.desc = {
+{
+.name = ALLOW_WRITE_BACKING_FILE,
+.type = QEMU_OPT_BOOL,
+.help = allow write to backing file,
+},
+{ /* end of list */ }
+},
+};
+
 /*
  * Opens the backing file for a BlockDriverState if not yet open
  *
@@ -1065,6 +1079,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 int ret = 0;
 BlockDriverState *backing_hd;
 Error *local_err = NULL;
+QemuOpts *opts = NULL;
+int flags;
 
 if (bs-backing_hd != NULL) {
 QDECREF(options);
@@ -1077,6 +1093,21 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 }
 
 bs-open_flags = ~BDRV_O_NO_BACKING;
+flags = bdrv_backing_flags(bs-open_flags);
+
+opts = qemu_opts_create(backing_file_opts, NULL, 0, error_abort);
+qemu_opts_absorb_qdict(opts, options, local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+QDECREF(options);
+goto free_exit;
+}
+if (qemu_opt_get_bool(opts, ALLOW_WRITE_BACKING_FILE, false)) {
+flags |= BDRV_O_RDWR;
+}
+qemu_opts_del(opts);
+
 if (qdict_haskey(options, file.filename)) {
 backing_filename[0] = '\0';
 } else if (bs-backing_file[0] == '\0'  qdict_size(options) == 0) {
@@ -1109,7 +1140,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 assert(bs-backing_hd == NULL);
 ret = bdrv_open(backing_hd,
 *backing_filename ? backing_filename : NULL, NULL, options,
-bdrv_backing_flags(bs-open_flags), NULL, local_err);
+flags, NULL, local_err);
 if (ret  0) {
 bdrv_unref(backing_hd);
 backing_hd = NULL;
-- 
2.1.0




[Qemu-block] [PATCH COLO-Block v5 00/15] Block replication for continuous checkpoints

2015-05-20 Thread Wen Congyang
Block replication is a very important feature which is used for
continuous checkpoints(for example: COLO).

Usage:
Please refer to docs/block-replication.txt

You can get the patch here:
https://github.com/wencongyang/qemu-colo/commits/block-replication-v5

The other newest COLO patchse will be sent soon.

TODO:
1. Continuous block replication. It will be started after basic functions
   are accepted.

Changs Log:
V5:
1. Address the comments from Gong Lei
2. Speed the failover up. The secondary vm can take over very quickly even
   if there are too many I/O requests.
V4:
1. Introduce a new driver replication to avoid touch nbd and qcow2.
V3:
1: use error_setg() instead of error_set()
2. Add a new block job API
3. Active disk, hidden disk and nbd target uses the same AioContext
4. Add a testcase to test new hbitmap API
V2:
1. Redesign the secondary qemu(use image-fleecing)
2. Use Error objects to return error message
3. Address the comments from Max Reitz and Eric Blake
Wen Congyang (15):
  docs: block replication's description
  allow writing to the backing file
  Allow creating backup jobs when opening BDS
  block: Parse backing_reference option to reference existing BDS
  Backup: clear all bitmap when doing block checkpoint
  Don't allow a disk use backing reference target
  Add new block driver interface to connect/disconnect the remote target
  NBD client: implement block driver interfaces to connect/disconnect
NBD server
  Introduce a new -drive option to control whether to connect to remote
target
  NBD client: connect to nbd server later
  Add new block driver interfaces to control block replication
  skip nbd_target when starting block replication
  quorum: implement block driver interfaces for block replication
  quorum: allow ignoring child errors
  Implement new driver for block replication

 block.c| 272 +++-
 block/Makefile.objs|   3 +-
 block/backup.c |  13 ++
 block/nbd.c|  69 +--
 block/quorum.c | 142 ++-
 block/replication.c| 441 +
 blockdev.c |   8 +
 blockjob.c |  10 +
 docs/block-replication.txt | 179 ++
 include/block/block.h  |  10 +
 include/block/block_int.h  |  18 ++
 include/block/blockjob.h   |  12 ++
 qapi/block.json|  16 ++
 qemu-options.hx|   4 +
 tests/qemu-iotests/051 |  13 ++
 tests/qemu-iotests/051.out |  13 ++
 16 files changed, 1193 insertions(+), 30 deletions(-)
 create mode 100644 block/replication.c
 create mode 100644 docs/block-replication.txt

-- 
2.1.0




[Qemu-block] [PATCH COLO-Block v5 01/15] docs: block replication's description

2015-05-20 Thread Wen Congyang
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: Yang Hongyang yan...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
---
 docs/block-replication.txt | 179 +
 1 file changed, 179 insertions(+)
 create mode 100644 docs/block-replication.txt

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
new file mode 100644
index 000..a29f51a
--- /dev/null
+++ b/docs/block-replication.txt
@@ -0,0 +1,179 @@
+Block replication
+
+Copyright Fujitsu, Corp. 2015
+Copyright (c) 2015 Intel Corporation
+Copyright (c) 2015 HUAWEI TECHNOLOGIES CO., LTD.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+Block replication is used for continuous checkpoints. It is designed
+for COLO (COurse-grain LOck-stepping) where the Secondary VM is running.
+It can also be applied for FT/HA (Fault-tolerance/High Assurance) scenario,
+where the Secondary VM is not running.
+
+This document gives an overview of block replication's design.
+
+== Background ==
+High availability solutions such as micro checkpoint and COLO will do
+consecutive checkpoints. The VM state of Primary VM and Secondary VM is
+identical right after a VM checkpoint, but becomes different as the VM
+executes till the next checkpoint. To support disk contents checkpoint,
+the modified disk contents in the Secondary VM must be buffered, and are
+only dropped at next checkpoint time. To reduce the network transportation
+effort at the time of checkpoint, the disk modification operations of
+Primary disk are asynchronously forwarded to the Secondary node.
+
+== Workflow ==
+The following is the image of block replication workflow:
+
++--+++
+|Primary Write Requests||Secondary Write Requests|
++--+++
+  |   |
+  |  (4)
+  |   V
+  |  /-\
+  |  Copy and Forward| |
+  |-(1)--+   | Disk Buffer |
+  |  |   | |
+  | (3)  \-/
+  | speculative  ^
+  |write through(2)
+  |  |   |
+  V  V   |
+   +--+   ++
+   | Primary Disk |   | Secondary Disk |
+   +--+   ++
+
+1) Primary write requests will be copied and forwarded to Secondary
+   QEMU.
+2) Before Primary write requests are written to Secondary disk, the
+   original sector content will be read from Secondary disk and
+   buffered in the Disk buffer, but it will not overwrite the existing
+   sector content(it could be from either Secondary Write Requests or
+   previous COW of Primary Write Requests) in the Disk buffer.
+3) Primary write requests will be written to Secondary disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+
+ virtio-blk   ||
+ ^||.--
+ |||| Secondary
+1 Quorum  ||'--
+ /  \ ||
+/\||
+   Primary2 filter
+ disk ^
 virtio-blk
+  |
  ^
+3 NBD  ---  3 NBD 
  |
+client|| server
  2 filter
+  ||^  
  ^
+. |||  
  |
+Primary | ||  Secondary disk - hidden-disk 5 
- active-disk 4
+' |||  backing^   backing
+  ||| |
+  ||| |
+  ||'-'

[Qemu-block] [PATCH COLO-Block v5 11/15] Add new block driver interfaces to control block replication

2015-05-20 Thread Wen Congyang
Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
Cc: Luiz Capitulino lcapitul...@redhat.com
Cc: Michael Roth mdr...@linux.vnet.ibm.com
Reviewed-by: Paolo Bonzini pbonz...@redhat.com
---
 block.c   | 40 
 include/block/block.h |  5 +
 include/block/block_int.h | 14 ++
 qapi/block.json   | 16 
 4 files changed, 75 insertions(+)

diff --git a/block.c b/block.c
index 9a8ef9f..74fd3d6 100644
--- a/block.c
+++ b/block.c
@@ -4215,3 +4215,43 @@ void bdrv_disconnect(BlockDriverState *bs)
 bdrv_disconnect(bs-file);
 }
 }
+
+void bdrv_start_replication(BlockDriverState *bs, ReplicationMode mode,
+Error **errp)
+{
+BlockDriver *drv = bs-drv;
+
+if (drv  drv-bdrv_start_replication) {
+drv-bdrv_start_replication(bs, mode, errp);
+} else if (bs-file) {
+bdrv_start_replication(bs-file, mode, errp);
+} else {
+error_setg(errp, this feature or command is not currently supported);
+}
+}
+
+void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp)
+{
+BlockDriver *drv = bs-drv;
+
+if (drv  drv-bdrv_do_checkpoint) {
+drv-bdrv_do_checkpoint(bs, errp);
+} else if (bs-file) {
+bdrv_do_checkpoint(bs-file, errp);
+} else {
+error_setg(errp, this feature or command is not currently supported);
+}
+}
+
+void bdrv_stop_replication(BlockDriverState *bs, bool failover, Error **errp)
+{
+BlockDriver *drv = bs-drv;
+
+if (drv  drv-bdrv_stop_replication) {
+drv-bdrv_stop_replication(bs, failover, errp);
+} else if (bs-file) {
+bdrv_stop_replication(bs-file, failover, errp);
+} else {
+error_setg(errp, this feature or command is not currently supported);
+}
+}
diff --git a/include/block/block.h b/include/block/block.h
index ed5b8e5..c98adfb 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -596,4 +596,9 @@ BlockAcctStats *bdrv_get_stats(BlockDriverState *bs);
 void bdrv_connect(BlockDriverState *bs, Error **errp);
 void bdrv_disconnect(BlockDriverState *bs);
 
+void bdrv_start_replication(BlockDriverState *bs, ReplicationMode mode,
+Error **errp);
+void bdrv_do_checkpoint(BlockDriverState *bs, Error **errp);
+void bdrv_stop_replication(BlockDriverState *bs, bool failover, Error **errp);
+
 #endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
index da29fa7..761e6ff 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -293,6 +293,20 @@ struct BlockDriver {
 void (*bdrv_connect)(BlockDriverState *bs, Error **errp);
 void (*bdrv_disconnect)(BlockDriverState *bs);
 
+void (*bdrv_start_replication)(BlockDriverState *bs, ReplicationMode mode,
+   Error **errp);
+/* Drop Disk buffer when doing checkpoint. */
+void (*bdrv_do_checkpoint)(BlockDriverState *bs, Error **errp);
+/*
+ * After failover, we should flush Disk buffer into secondary disk
+ * and stop block replication.
+ *
+ * If the guest is shutdown, we should drop Disk buffer and stop
+ * block representation.
+ */
+void (*bdrv_stop_replication)(BlockDriverState *bs, bool failover,
+  Error **errp);
+
 QLIST_ENTRY(BlockDriver) list;
 };
 
diff --git a/qapi/block.json b/qapi/block.json
index aad645c..04dc4c2 100644
--- a/qapi/block.json
+++ b/qapi/block.json
@@ -40,6 +40,22 @@
   'data': ['auto', 'none', 'lba', 'large', 'rechs']}
 
 ##
+# @ReplicationMode
+#
+# An enumeration of replication modes.
+#
+# @unprotected: Replication is not started or after failover.
+#
+# @primary: Primary mode, the vm's state will be sent to secondary QEMU.
+#
+# @secondary: Secondary mode, receive the vm's state from primary QEMU.
+#
+# Since: 2.4
+##
+{ 'enum' : 'ReplicationMode',
+  'data' : ['unprotected', 'primary', 'secondary']}
+
+##
 # @BlockdevSnapshotInternal
 #
 # @device: the name of the device to generate the snapshot from
-- 
2.1.0




[Qemu-block] [PATCH COLO-Block v5 04/15] block: Parse backing_reference option to reference existing BDS

2015-05-20 Thread Wen Congyang
Usage:
-drive file=xxx,id=Y, \
-drive 
file=,id=X,backing_reference.drive_id=Y,backing_reference.hidden-disk.*

It will create such backing chain:
   {virtio-blk dev 'Y'}  
{virtio-blk dev 'X'}
 |  
|
 |  
|
 v  
v

[base] - [mid] - ( Y )  - (hidden target) 
--- ( X )

 v  ^
 v  ^
 v  ^
 v  ^
  drive-backup sync=none 

X's backing file is hidden-disk, and hidden-disk's backing file is Y.
Disk Y may be opened or reopened in read-write mode, so A block backup
job is automatically created: source is Y and target is hidden disk.
Active disk X, hidden disk, and Y are all on the same AioContext.

Signed-off-by: Wen Congyang we...@cn.fujitsu.com
Signed-off-by: zhanghailiang zhang.zhanghaili...@huawei.com
Signed-off-by: Gonglei arei.gong...@huawei.com
---
 block.c| 154 -
 include/block/block.h  |   1 +
 include/block/block_int.h  |   1 +
 tests/qemu-iotests/051 |  13 
 tests/qemu-iotests/051.out |  13 
 5 files changed, 179 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index adffa0e..220ba9a 100644
--- a/block.c
+++ b/block.c
@@ -1157,6 +1157,119 @@ free_exit:
 return ret;
 }
 
+static void backing_reference_completed(void *opaque, int ret)
+{
+BlockDriverState *hidden_disk = opaque;
+
+assert(!hidden_disk-backing_reference);
+}
+
+static int bdrv_open_backing_reference_file(BlockDriverState *bs,
+QDict *options, Error **errp)
+{
+const char *backing_name;
+QDict *hidden_disk_options = NULL;
+BlockDriverState *backing_hd, *hidden_disk;
+BlockBackend *backing_blk;
+AioContext *aio_context;
+Error *local_err = NULL;
+int ret = 0;
+
+backing_name = qdict_get_try_str(options, drive_id);
+if (!backing_name) {
+error_setg(errp, Backing reference needs option drive_id);
+ret = -EINVAL;
+goto free_exit;
+}
+qdict_del(options, drive_id);
+
+qdict_extract_subqdict(options, hidden_disk_options, hidden-disk.);
+if (!qdict_size(hidden_disk_options)) {
+error_setg(errp, Backing reference needs option hidden-disk.*);
+ret = -EINVAL;
+goto free_exit;
+}
+
+if (qdict_size(options)) {
+const QDictEntry *entry = qdict_first(options);
+error_setg(errp, Backing reference used by '%s' doesn't support 
+   the option '%s', bdrv_get_device_name(bs), entry-key);
+ret = -EINVAL;
+goto free_exit;
+}
+
+backing_blk = blk_by_name(backing_name);
+if (!backing_blk) {
+error_setg(errp, Device '%s' not found, backing_name);
+ret = -ENOENT;
+goto free_exit;
+}
+
+backing_hd = blk_bs(backing_blk);
+/* Backing reference itself? */
+if (backing_hd == bs || bdrv_find_overlay(backing_hd, bs)) {
+error_setg(errp, Backing reference itself);
+ret = -EINVAL;
+goto free_exit;
+}
+
+if (bdrv_op_is_blocked(backing_hd, BLOCK_OP_TYPE_BACKING_REFERENCE,
+   errp)) {
+ret = -EBUSY;
+goto free_exit;
+}
+
+/* hidden-disk is bs's backing file */
+ret = bdrv_open_backing_file(bs, hidden_disk_options, errp);
+hidden_disk_options = NULL;
+if (ret  0) {
+goto free_exit;
+}
+
+hidden_disk = bs-backing_hd;
+if (!hidden_disk-drv || !hidden_disk-drv-supports_backing) {
+ret = -EINVAL;
+error_setg(errp, Hidden disk's driver doesn't support backing files);
+goto free_exit;
+}
+
+bdrv_set_backing_hd(hidden_disk, backing_hd);
+bdrv_ref(backing_hd);
+
+/*
+ * backing hd may be opened or reopened in read-write mode, so we
+ * should backup backing hd to hidden disk
+ */
+bdrv_op_unblock(hidden_disk, BLOCK_OP_TYPE_BACKUP_TARGET,
+bs-backing_blocker);
+bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
+hidden_disk-backing_blocker);
+
+bdrv_ref(hidden_disk);
+
+aio_context = bdrv_get_aio_context(bs);
+aio_context_acquire(aio_context);
+bdrv_set_aio_context(backing_hd, aio_context);
+backup_start(backing_hd, hidden_disk, 0, MIRROR_SYNC_MODE_NONE, NULL,
+ BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
+ backing_reference_completed, hidden_disk, local_err);
+aio_context_release(aio_context);
+if (local_err) 

Re: [Qemu-block] [Qemu-devel] [PATCH 11/34] block: Allow references for backing files

2015-05-20 Thread Wen Congyang
On 05/09/2015 01:21 AM, Kevin Wolf wrote:
 For bs-file, using references to existing BDSes has been possible for a
 while already. This patch enables the same for bs-backing_hd.

1. We reference to an existing BDSes, and some disk uses this blk. Do
we allow this?
2. bs-backing_hd-blk can be not NULL now? If we do an active commit
to this backing file(use mirror job), we will call bdrv_swap() in
mirror_exit(), and the function bdrv_swap() doesn't allow that
new_bs-blk(here is bs-backing_hd) is not NULL.

Thanks
Wen Congyang

 
 Signed-off-by: Kevin Wolf kw...@redhat.com
 ---
  block.c   | 42 --
  block/mirror.c|  2 +-
  include/block/block.h |  3 ++-
  3 files changed, 27 insertions(+), 20 deletions(-)
 
 diff --git a/block.c b/block.c
 index e93bf63..95dc51e 100644
 --- a/block.c
 +++ b/block.c
 @@ -1109,30 +1109,41 @@ out:
  /*
   * Opens the backing file for a BlockDriverState if not yet open
   *
 - * options is a QDict of options to pass to the block drivers, or NULL for an
 - * empty set of options. The reference to the QDict is transferred to this
 - * function (even on failure), so if the caller intends to reuse the 
 dictionary,
 - * it needs to use QINCREF() before calling bdrv_file_open.
 + * bdrev_key specifies the key for the image's BlockdevRef in the options 
 QDict.
 + * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
 + * itself, all options starting with ${bdref_key}. are considered part of 
 the
 + * BlockdevRef.
 + *
 + * TODO Can this be unified with bdrv_open_image()?
   */
 -int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error 
 **errp)
 +int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
 +   const char *bdref_key, Error **errp)
  {
  char *backing_filename = g_malloc0(PATH_MAX);
 +char *bdref_key_dot;
 +const char *reference = NULL;
  int ret = 0;
  BlockDriverState *backing_hd;
 +QDict *options;
  Error *local_err = NULL;
  
  if (bs-backing_hd != NULL) {
 -QDECREF(options);
  goto free_exit;
  }
  
  /* NULL means an empty set of options */
 -if (options == NULL) {
 -options = qdict_new();
 +if (parent_options == NULL) {
 +parent_options = qdict_new();
  }
  
  bs-open_flags = ~BDRV_O_NO_BACKING;
 -if (qdict_haskey(options, file.filename)) {
 +
 +bdref_key_dot = g_strdup_printf(%s., bdref_key);
 +qdict_extract_subqdict(parent_options, options, bdref_key_dot);
 +g_free(bdref_key_dot);
 +
 +reference = qdict_get_try_str(parent_options, bdref_key);
 +if (reference || qdict_haskey(options, file.filename)) {
  backing_filename[0] = '\0';
  } else if (bs-backing_file[0] == '\0'  qdict_size(options) == 0) {
  QDECREF(options);
 @@ -1155,20 +1166,17 @@ int bdrv_open_backing_file(BlockDriverState *bs, 
 QDict *options, Error **errp)
  goto free_exit;
  }
  
 -backing_hd = bdrv_new();
 -
  if (bs-backing_format[0] != '\0'  !qdict_haskey(options, driver)) {
  qdict_put(options, driver, qstring_from_str(bs-backing_format));
  }
  
  assert(bs-backing_hd == NULL);
 +backing_hd = NULL;
  ret = bdrv_open_inherit(backing_hd,
  *backing_filename ? backing_filename : NULL,
 -NULL, options, 0, bs, child_backing,
 +reference, options, 0, bs, child_backing,
  NULL, local_err);
  if (ret  0) {
 -bdrv_unref(backing_hd);
 -backing_hd = NULL;
  bs-open_flags |= BDRV_O_NO_BACKING;
  error_setg(errp, Could not open backing file: %s,
 error_get_pretty(local_err));
 @@ -1176,6 +1184,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
 *options, Error **errp)
  goto free_exit;
  }
  bdrv_set_backing_hd(bs, backing_hd);
 +qdict_del(parent_options, bdref_key);
  
  free_exit:
  g_free(backing_filename);
 @@ -1463,10 +1472,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, 
 const char *filename,
  
  /* If there is a backing file, use it */
  if ((flags  BDRV_O_NO_BACKING) == 0) {
 -QDict *backing_options;
 -
 -qdict_extract_subqdict(options, backing_options, backing.);
 -ret = bdrv_open_backing_file(bs, backing_options, local_err);
 +ret = bdrv_open_backing_file(bs, options, backing, local_err);
  if (ret  0) {
  goto close_and_fail;
  }
 diff --git a/block/mirror.c b/block/mirror.c
 index 58f391a..f1bc342 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
 @@ -592,7 +592,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
  Error *local_err = NULL;
  int ret;
  
 -ret = bdrv_open_backing_file(s-target, NULL, local_err);
 +ret = bdrv_open_backing_file(s-target, NULL, backing, local_err);
  if 

Re: [Qemu-block] [PATCH] MAINTAINERS: Split Block QAPI, monitor, command line off core

2015-05-20 Thread Kevin Wolf
Am 20.05.2015 um 13:23 hat Markus Armbruster geschrieben:
 Kevin and Stefan asked me to take care of this part.
 
 Signed-off-by: Markus Armbruster arm...@redhat.com

Thanks, applied to the block branch.

Kevin



Re: [Qemu-block] [PATCH 03/34] quorum: Use bdrv_open_image()

2015-05-20 Thread Alberto Garcia
On Fri 08 May 2015 07:21:35 PM CEST, Kevin Wolf wrote:

 Besides standardising on a single interface for opening child nodes,
 this simplifies the .bdrv_open() implementation of the quorum block
 driver by using block layer functionality for handling BlockdevRefs.

 Signed-off-by: Kevin Wolf kw...@redhat.com

Reviewed-by: Alberto Garcia be...@igalia.com

Berto



Re: [Qemu-block] [PATCH 05/34] block: Use macro for cache option names

2015-05-20 Thread Alberto Garcia
On Fri 08 May 2015 07:21:37 PM CEST, Kevin Wolf wrote:

 Signed-off-by: Kevin Wolf kw...@redhat.com

Reviewed-by: Alberto Garcia be...@igalia.com

Berto