[PATCH 06/19] virtio: decrement last_avail_idx with inuse before saving.
For regular migration inuse == 0 always as requests are flushed before save. However, event-tap log when enabled introduces an extra queue for requests which is not being flushed, thus the last inuse requests are left in the event-tap queue. Move the last_avail_idx value sent to the remote back to make it repeat the last inuse requests. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/virtio.c | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 31bd9e3..f05d1b6 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -673,12 +673,20 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f) qemu_put_be32(f, i); for (i = 0; i VIRTIO_PCI_QUEUE_MAX; i++) { +/* For regular migration inuse == 0 always as + * requests are flushed before save. However, + * event-tap log when enabled introduces an extra + * queue for requests which is not being flushed, + * thus the last inuse requests are left in the event-tap queue. + * Move the last_avail_idx value sent to the remote back + * to make it repeat the last inuse requests. */ +uint16_t last_avail = vdev-vq[i].last_avail_idx - vdev-vq[i].inuse; if (vdev-vq[i].vring.num == 0) break; qemu_put_be32(f, vdev-vq[i].vring.num); qemu_put_be64(f, vdev-vq[i].pa); -qemu_put_be16s(f, vdev-vq[i].last_avail_idx); +qemu_put_be16s(f, last_avail); if (vdev-binding-save_queue) vdev-binding-save_queue(vdev-binding_opaque, i, f); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 18/19] Introduce -k option to enable FT migration mode (Kemari).
When -k option is set to migrate command, it will turn on ft_mode to start FT migration mode (Kemari). Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hmp-commands.hx |7 --- migration.c |4 qmp-commands.hx |7 --- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index 38e1eb7..3560f32 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -754,13 +754,14 @@ ETEXI { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, diff --git a/migration.c b/migration.c index 7837c55..9d2abff 100644 --- a/migration.c +++ b/migration.c @@ -99,6 +99,10 @@ int do_migrate(Monitor *mon, const QDict *qdict, QObject **ret_data) return -1; } +if (qdict_get_try_bool(qdict, ft, 0)) { +ft_mode = FT_INIT; +} + if (strstart(uri, tcp:, p)) { s = tcp_start_outgoing_migration(mon, p, max_throttle, detach, blk, inc); diff --git a/qmp-commands.hx b/qmp-commands.hx index df40a3d..f81a4a2 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -431,13 +431,14 @@ EQMP { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src 
and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/19] net: insert event-tap to qemu_send_packet() and qemu_sendv_packet_async().
event-tap function is called only when it is on. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- net.c |9 + 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/net.c b/net.c index 9ba5be2..1176124 100644 --- a/net.c +++ b/net.c @@ -36,6 +36,7 @@ #include qemu-common.h #include qemu_socket.h #include hw/qdev.h +#include event-tap.h static QTAILQ_HEAD(, VLANState) vlans; static QTAILQ_HEAD(, VLANClientState) non_vlan_clients; @@ -559,6 +560,10 @@ ssize_t qemu_send_packet_async(VLANClientState *sender, void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size) { +if (event_tap_is_on()) { +return event_tap_send_packet(vc, buf, size); +} + qemu_send_packet_async(vc, buf, size, NULL); } @@ -657,6 +662,10 @@ ssize_t qemu_sendv_packet_async(VLANClientState *sender, { NetQueue *queue; +if (event_tap_is_on()) { +return event_tap_sendv_packet_async(sender, iov, iovcnt, sent_cb); +} + if (sender-link_down || (!sender-peer !sender-vlan)) { return calc_iov_length(iov, iovcnt); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/19] Insert event_tap_mmio() to cpu_physical_memory_rw() in exec.c.
Record mmio write event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- exec.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/exec.c b/exec.c index e950df2..c81fd09 100644 --- a/exec.c +++ b/exec.c @@ -33,6 +33,7 @@ #include osdep.h #include kvm.h #include qemu-timer.h +#include event-tap.h #if defined(CONFIG_USER_ONLY) #include qemu.h #include signal.h @@ -3632,6 +3633,9 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, io_index = (pd IO_MEM_SHIFT) (IO_MEM_NB_ENTRIES - 1); if (p) addr1 = (addr ~TARGET_PAGE_MASK) + p-region_offset; + +event_tap_mmio(addr, buf, len); + /* XXX: could force cpu_single_env to NULL to avoid potential bugs */ if (l = 4 ((addr1 3) == 0)) { -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/19] Introduce fault tolerant VM transaction QEMUFile and ft_mode.
This code implements VM transaction protocol. Like buffered_file, it sits between savevm and migration layer. With this architecture, VM transaction protocol is implemented mostly independent from other existing code. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.objs |1 + ft_trans_file.c | 624 +++ ft_trans_file.h | 72 +++ migration.c |3 + trace-events| 15 ++ 5 files changed, 715 insertions(+), 0 deletions(-) create mode 100644 ft_trans_file.c create mode 100644 ft_trans_file.h diff --git a/Makefile.objs b/Makefile.objs index 353b1a8..04148b5 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -100,6 +100,7 @@ common-obj-y += msmouse.o ps2.o common-obj-y += qdev.o qdev-properties.o common-obj-y += block-migration.o common-obj-y += pflib.o +common-obj-y += ft_trans_file.o common-obj-$(CONFIG_BRLAPI) += baum.o common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o diff --git a/ft_trans_file.c b/ft_trans_file.c new file mode 100644 index 000..2b42b95 --- /dev/null +++ b/ft_trans_file.c @@ -0,0 +1,624 @@ +/* + * Fault tolerant VM transaction QEMUFile + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This source code is based on buffered_file.c. + * Copyright IBM, Corp. 
2008 + * Authors: + * Anthony Liguorialigu...@us.ibm.com + */ + +#include qemu-common.h +#include qemu-error.h +#include hw/hw.h +#include qemu-timer.h +#include sysemu.h +#include qemu-char.h +#include trace.h +#include ft_trans_file.h + +typedef struct FtTransHdr +{ +uint16_t cmd; +uint16_t id; +uint32_t seq; +uint32_t payload_len; +} FtTransHdr; + +typedef struct QEMUFileFtTrans +{ +FtTransPutBufferFunc *put_buffer; +FtTransGetBufferFunc *get_buffer; +FtTransPutReadyFunc *put_ready; +FtTransGetReadyFunc *get_ready; +FtTransWaitForUnfreezeFunc *wait_for_unfreeze; +FtTransCloseFunc *close; +void *opaque; +QEMUFile *file; + +enum QEMU_VM_TRANSACTION_STATE state; +uint32_t seq; +uint16_t id; + +int has_error; + +bool freeze_output; +bool freeze_input; +bool rate_limit; +bool is_sender; +bool is_payload; + +uint8_t *buf; +size_t buf_max_size; +size_t put_offset; +size_t get_offset; + +FtTransHdr header; +size_t header_offset; +} QEMUFileFtTrans; + +#define IO_BUF_SIZE 32768 + +static void ft_trans_append(QEMUFileFtTrans *s, +const uint8_t *buf, size_t size) +{ +if (size (s-buf_max_size - s-put_offset)) { +trace_ft_trans_realloc(s-buf_max_size, size + 1024); +s-buf_max_size += size + 1024; +s-buf = qemu_realloc(s-buf, s-buf_max_size); +} + +trace_ft_trans_append(size); +memcpy(s-buf + s-put_offset, buf, size); +s-put_offset += size; +} + +static void ft_trans_flush(QEMUFileFtTrans *s) +{ +size_t offset = 0; + +if (s-has_error) { +error_report(flush when error %d, bailing, s-has_error); +return; +} + +while (offset s-put_offset) { +ssize_t ret; + +ret = s-put_buffer(s-opaque, s-buf + offset, s-put_offset - offset); +if (ret == -EAGAIN) { +break; +} + +if (ret = 0) { +error_report(error flushing data, %s, strerror(errno)); +s-has_error = FT_TRANS_ERR_FLUSH; +break; +} else { +offset += ret; +} +} + +trace_ft_trans_flush(offset, s-put_offset); +memmove(s-buf, s-buf + offset, s-put_offset - offset); +s-put_offset -= offset; +s-freeze_output = !!s-put_offset; +} + +static 
ssize_t ft_trans_put(void *opaque, void *buf, int size) +{ +QEMUFileFtTrans *s = opaque; +size_t offset = 0; +ssize_t len; + +/* flush buffered data before putting next */ +if (s-put_offset) { +ft_trans_flush(s); +} + +while (!s-freeze_output offset size) { +len = s-put_buffer(s-opaque, (uint8_t *)buf + offset, size - offset); + +if (len == -EAGAIN) { +trace_ft_trans_freeze_output(); +s-freeze_output = 1; +break; +} + +if (len = 0) { +error_report(putting data failed, %s, strerror(errno)); +s-has_error = 1; +offset = -EINVAL; +break; +} + +offset += len; +} + +if (s-freeze_output) { +ft_trans_append(s, buf + offset, size - offset); +offset = size; +} + +return offset; +} + +static int ft_trans_send_header(QEMUFileFtTrans *s, +enum QEMU_VM_TRANSACTION_STATE state, +uint32_t payload_len) +{ +int ret; +FtTransHdr
[PATCH 05/19] vl.c: add deleted flag for deleting the handler.
Make deleting handlers robust against deletion of any elements in a handler by using a deleted flag like in file descriptors. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c | 13 + 1 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vl.c b/vl.c index ed2cdfa..00155fb 100644 --- a/vl.c +++ b/vl.c @@ -1158,6 +1158,7 @@ static void nographic_update(void *opaque) struct vm_change_state_entry { VMChangeStateHandler *cb; void *opaque; +int deleted; QLIST_ENTRY (vm_change_state_entry) entries; }; @@ -1178,8 +1179,7 @@ VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, void qemu_del_vm_change_state_handler(VMChangeStateEntry *e) { -QLIST_REMOVE (e, entries); -qemu_free (e); +e-deleted = 1; } void vm_state_notify(int running, int reason) @@ -1188,8 +1188,13 @@ void vm_state_notify(int running, int reason) trace_vm_state_notify(running, reason); -for (e = vm_change_state_head.lh_first; e; e = e-entries.le_next) { -e-cb(e-opaque, running, reason); +QLIST_FOREACH(e, vm_change_state_head, entries) { +if (e-deleted) { +QLIST_REMOVE(e, entries); +qemu_free(e); +} else { +e-cb(e-opaque, running, reason); +} } } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/19] Introduce skip_header parameter to qemu_loadvm_state().
Introduce skip_header parameter to qemu_loadvm_state() so that it can be called iteratively without reading the header. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |2 +- savevm.c| 24 +--- sysemu.h|2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/migration.c b/migration.c index f0df5fc..dd3bf94 100644 --- a/migration.c +++ b/migration.c @@ -63,7 +63,7 @@ int qemu_start_incoming_migration(const char *uri) void process_incoming_migration(QEMUFile *f) { -if (qemu_loadvm_state(f) 0) { +if (qemu_loadvm_state(f, 0) 0) { fprintf(stderr, load of migration failed\n); exit(0); } diff --git a/savevm.c b/savevm.c index 6c4c72b..58e48e3 100644 --- a/savevm.c +++ b/savevm.c @@ -1716,7 +1716,7 @@ typedef struct LoadStateEntry { int version_id; } LoadStateEntry; -int qemu_loadvm_state(QEMUFile *f) +int qemu_loadvm_state(QEMUFile *f, int skip_header) { QLIST_HEAD(, LoadStateEntry) loadvm_handlers = QLIST_HEAD_INITIALIZER(loadvm_handlers); @@ -1729,17 +1729,19 @@ int qemu_loadvm_state(QEMUFile *f) return -EINVAL; } -v = qemu_get_be32(f); -if (v != QEMU_VM_FILE_MAGIC) -return -EINVAL; +if (!skip_header) { +v = qemu_get_be32(f); +if (v != QEMU_VM_FILE_MAGIC) +return -EINVAL; -v = qemu_get_be32(f); -if (v == QEMU_VM_FILE_VERSION_COMPAT) { -fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); -return -ENOTSUP; +v = qemu_get_be32(f); +if (v == QEMU_VM_FILE_VERSION_COMPAT) { +fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); +return -ENOTSUP; +} +if (v != QEMU_VM_FILE_VERSION) +return -ENOTSUP; } -if (v != QEMU_VM_FILE_VERSION) -return -ENOTSUP; while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; @@ -2062,7 +2064,7 @@ int load_vmstate(const char *name) return -EINVAL; } -ret = qemu_loadvm_state(f); +ret = qemu_loadvm_state(f, 0); qemu_fclose(f); if (ret 0) { diff --git a/sysemu.h b/sysemu.h index 23ae17e..c86b4e8 100644 --- a/sysemu.h +++ 
b/sysemu.h @@ -81,7 +81,7 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); -int qemu_loadvm_state(QEMUFile *f); +int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ void do_info_slirp(Monitor *mon); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/19] Introduce event-tap.
event-tap controls when to start an FT transaction, and provides proxy functions to be called from net/block devices. During an FT transaction, it queues up net/block requests, and flushes them when the transaction is completed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.target |1 + event-tap.c | 939 +++ event-tap.h | 44 +++ qemu-tool.c | 28 ++ trace-events| 10 + 5 files changed, 1022 insertions(+), 0 deletions(-) create mode 100644 event-tap.c create mode 100644 event-tap.h diff --git a/Makefile.target b/Makefile.target index b0ba95f..edbdbee 100644 --- a/Makefile.target +++ b/Makefile.target @@ -199,6 +199,7 @@ obj-y += rwhandler.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o LIBS+=-lz +obj-y += event-tap.o QEMU_CFLAGS += $(VNC_TLS_CFLAGS) QEMU_CFLAGS += $(VNC_SASL_CFLAGS) diff --git a/event-tap.c b/event-tap.c new file mode 100644 index 000..f44d835 --- /dev/null +++ b/event-tap.c @@ -0,0 +1,939 @@ +/* + * Event Tap functions for QEMU + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include qemu-common.h +#include qemu-error.h +#include block.h +#include block_int.h +#include ioport.h +#include osdep.h +#include sysemu.h +#include hw/hw.h +#include net.h +#include event-tap.h +#include trace.h + +enum EVENT_TAP_STATE { +EVENT_TAP_OFF, +EVENT_TAP_ON, +EVENT_TAP_SUSPEND, +EVENT_TAP_FLUSH, +EVENT_TAP_LOAD, +EVENT_TAP_REPLAY, +}; + +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF; + +typedef struct EventTapIOport { +uint32_t address; +uint32_t data; +int index; +} EventTapIOport; + +#define MMIO_BUF_SIZE 8 + +typedef struct EventTapMMIO { +uint64_t address; +uint8_t buf[MMIO_BUF_SIZE]; +int len; +} EventTapMMIO; + +typedef struct EventTapNetReq { +char *device_name; +int iovcnt; +int vlan_id; +bool vlan_needed; +bool async; +struct iovec *iov; +NetPacketSent *sent_cb; +} EventTapNetReq; + +#define MAX_BLOCK_REQUEST 32 + +typedef struct EventTapAIOCB EventTapAIOCB; + +typedef struct EventTapBlkReq { +char *device_name; +int num_reqs; +int num_cbs; +bool is_flush; +BlockRequest reqs[MAX_BLOCK_REQUEST]; +EventTapAIOCB *acb[MAX_BLOCK_REQUEST]; +} EventTapBlkReq; + +#define EVENT_TAP_IOPORT (1 0) +#define EVENT_TAP_MMIO (1 1) +#define EVENT_TAP_NET(1 2) +#define EVENT_TAP_BLK(1 3) + +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1) + +typedef struct EventTapLog { +int mode; +union { +EventTapIOport ioport; +EventTapMMIO mmio; +}; +union { +EventTapNetReq net_req; +EventTapBlkReq blk_req; +}; +QTAILQ_ENTRY(EventTapLog) node; +} EventTapLog; + +struct EventTapAIOCB { +BlockDriverAIOCB common; +BlockDriverAIOCB *acb; +bool is_canceled; +}; + +static EventTapLog *last_event_tap; + +static QTAILQ_HEAD(, EventTapLog) event_list; +static QTAILQ_HEAD(, EventTapLog) event_pool; + +static int (*event_tap_cb)(void); +static QEMUBH *event_tap_bh; +static VMChangeStateEntry *vmstate; + +static void event_tap_bh_cb(void *p) +{ +if (event_tap_cb) { +event_tap_cb(); +} + +qemu_bh_delete(event_tap_bh); +event_tap_bh = NULL; +} + +static void 
event_tap_schedule_bh(void) +{ +trace_event_tap_ignore_bh(!!event_tap_bh); + +/* if bh is already set, we ignore it for now */ +if (event_tap_bh) { +return; +} + +event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL); +qemu_bh_schedule(event_tap_bh); + +return; +} + +static void *event_tap_alloc_log(void) +{ +EventTapLog *log; + +if (QTAILQ_EMPTY(event_pool)) { +log = qemu_mallocz(sizeof(EventTapLog)); +} else { +log = QTAILQ_FIRST(event_pool); +QTAILQ_REMOVE(event_pool, log, node); +} + +return log; +} + +static void event_tap_free_net_req(EventTapNetReq *net_req); +static void event_tap_free_blk_req(EventTapBlkReq *blk_req); + +static void event_tap_free_log(EventTapLog *log) +{ +int mode = log-mode ~EVENT_TAP_TYPE_MASK; + +if (mode == EVENT_TAP_NET) { +event_tap_free_net_req(log-net_req); +} else if (mode == EVENT_TAP_BLK) { +event_tap_free_blk_req(log-blk_req); +} + +log-mode = 0; + +/* return the log to event_pool */ +QTAILQ_INSERT_HEAD(event_pool, log, node); +} + +static void event_tap_free_pool(void) +{ +EventTapLog *log, *next; + +QTAILQ_FOREACH_SAFE(log, event_pool, node, next) { +QTAILQ_REMOVE(event_pool, log, node); +qemu_free(log); +} +} + +static void event_tap_free_net_req(EventTapNetReq *net_req) +{ +int i; + +if (!net_req-async
[PATCH 14/19] block: insert event-tap to bdrv_aio_writev(), bdrv_aio_flush() and bdrv_flush().
The event-tap function is called only when event-tap is on and the request was sent from a device emulator. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- block.c | 15 +++ 1 files changed, 15 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index b476479..8ddce13 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include block_int.h #include module.h #include qemu-objects.h +#include event-tap.h #ifdef CONFIG_BSD #include sys/types.h @@ -1482,6 +1483,10 @@ int bdrv_flush(BlockDriverState *bs) } if (bs-drv bs-drv-bdrv_flush) { +if (*bs-device_name event_tap_is_on()) { +event_tap_bdrv_flush(); +} + return bs-drv-bdrv_flush(bs); } @@ -2117,6 +2122,11 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; +if (*bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); +} + if (bs-dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, opaque); @@ -2380,6 +2390,11 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, if (!drv) return NULL; + +if (*bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_flush(bs, cb, opaque); +} + return drv-bdrv_aio_flush(bs, cb, opaque); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 17/19] migration-tcp: modify tcp_accept_incoming_migration() to handle ft_mode, and add a hack not to close fd when ft_mode is enabled.
When ft_mode is set in the header, tcp_accept_incoming_migration() sets ft_trans_incoming() as a callback, and calls qemu_file_get_notify() to receive FT transactions iteratively. We also need a hack not to close fd before moving to ft_transaction mode, so that we can reuse the fd for it. vm_change_state_handler is added to turn off ft_mode when cont is pressed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 67 ++- 1 files changed, 66 insertions(+), 1 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index 55777c8..84076d6 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -18,6 +18,8 @@ #include sysemu.h #include buffered_file.h #include block.h +#include ft_trans_file.h +#include event-tap.h //#define DEBUG_MIGRATION_TCP @@ -29,6 +31,8 @@ do { } while (0) #endif +static VMChangeStateEntry *vmstate; + static int socket_errno(FdMigrationState *s) { return socket_error(); @@ -56,7 +60,8 @@ static int socket_read(FdMigrationState *s, const void * buf, size_t size) static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); -if (s-fd != -1) { +/* FIX ME: accessing ft_mode here isn't clean */ +if (s-fd != -1 ft_mode != FT_INIT) { close(s-fd); s-fd = -1; } @@ -150,6 +155,36 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, return s-mig_state; } +static void ft_trans_incoming(void *opaque) +{ +QEMUFile *f = opaque; + +qemu_file_get_notify(f); +if (qemu_file_has_error(f)) { +ft_mode = FT_ERROR; +qemu_fclose(f); +} +} + +static void ft_trans_reset(void *opaque, int running, int reason) +{ +QEMUFile *f = opaque; + +if (running) { +if (ft_mode != FT_ERROR) { +qemu_fclose(f); +} +ft_mode = FT_OFF; +qemu_del_vm_change_state_handler(vmstate); +} +} + +static void ft_trans_schedule_replay(QEMUFile *f) +{ +event_tap_schedule_replay(); +vmstate = qemu_add_vm_change_state_handler(ft_trans_reset, f); +} + static void tcp_accept_incoming_migration(void *opaque) { struct sockaddr_in addr; @@ -175,8 +210,38 @@ static void 
tcp_accept_incoming_migration(void *opaque) goto out; } +if (ft_mode == FT_INIT) { +autostart = 0; +} + process_incoming_migration(f); + +if (ft_mode == FT_INIT) { +int ret; + +socket_set_nodelay(c); + +f = qemu_fopen_ft_trans(s, c); +if (f == NULL) { +fprintf(stderr, could not qemu_fopen_ft_trans\n); +goto out; +} + +/* need to wait sender to setup */ +ret = qemu_ft_trans_begin(f); +if (ret 0) { +goto out; +} + +qemu_set_fd_handler2(c, NULL, ft_trans_incoming, NULL, f); +ft_trans_schedule_replay(f); +ft_mode = FT_TRANSACTION_RECV; + +return; +} + qemu_fclose(f); + out: close(c); out2: -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/19] Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer().
Currently buf size is fixed at 32KB. It would be useful if it could be flexible. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |2 ++ savevm.c | 20 +++- 2 files changed, 21 insertions(+), 1 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index 5e24329..a168a37 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -58,6 +58,8 @@ void qemu_fflush(QEMUFile *f); int qemu_fclose(QEMUFile *f); void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); +void *qemu_realloc_buffer(QEMUFile *f, int size); +void qemu_clear_buffer(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { diff --git a/savevm.c b/savevm.c index 6d83b0f..6c4c72b 100644 --- a/savevm.c +++ b/savevm.c @@ -171,7 +171,8 @@ struct QEMUFile { when reading */ int buf_index; int buf_size; /* 0 when writing */ -uint8_t buf[IO_BUF_SIZE]; +int buf_max_size; +uint8_t *buf; int has_error; }; @@ -422,6 +423,9 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, f-get_rate_limit = get_rate_limit; f-is_write = 0; +f-buf_max_size = IO_BUF_SIZE; +f-buf = qemu_malloc(sizeof(uint8_t) * f-buf_max_size); + return f; } @@ -452,6 +456,19 @@ void qemu_fflush(QEMUFile *f) } } +void *qemu_realloc_buffer(QEMUFile *f, int size) +{ +f-buf_max_size = size; +f-buf = qemu_realloc(f-buf, f-buf_max_size); + +return f-buf; +} + +void qemu_clear_buffer(QEMUFile *f) +{ +f-buf_size = f-buf_index = f-buf_offset = 0; +} + static void qemu_fill_buffer(QEMUFile *f) { int len; @@ -477,6 +494,7 @@ int qemu_fclose(QEMUFile *f) qemu_fflush(f); if (f-close) ret = f-close(f-opaque); +qemu_free(f-buf); qemu_free(f); return ret; } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 18/19] Introduce -k option to enable FT migration mode (Kemari).
2011/2/8 Paolo Bonzini pbonz...@redhat.com: On 02/08/2011 12:01 PM, Yoshiaki Tamura wrote: When -k option is set to migrate command, it will turn on ft_mode to start FT migration mode (Kemari). This could also use a kemari: prefix. Sorry, missed that comment. BTW, the help message would be something like: "put kemari: in front of URI to enable Fault Tolerance mode (Kemari protocol)". If there are comments on this, I would like to take them. Thanks, Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/19] savevm: introduce qemu_savevm_trans_{begin,commit}.
2011/2/2 Paolo Bonzini pbonz...@redhat.com: On 02/01/2011 07:21 PM, Yoshiaki Tamura wrote: Paolo, I refactored the savevm functions. Could you give me your comments? I didn't review it thoroughly, but the abstractions seem okay. Thanks. Since it got a bit messy, I wanted to hear your opinion. Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/19] savevm: introduce qemu_savevm_trans_{begin,commit}.
Paolo, I refactored the savevm functions. Could you give me your comments? Thanks, Yoshi diff --git a/savevm.c b/savevm.c index 5418280..90aae55 100644 --- a/savevm.c +++ b/savevm.c @@ -1602,29 +1602,68 @@ bool qemu_savevm_state_blocked(Monitor *mon) return false; } -int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, -int shared) +/* + * section: header to write + * inc: if true, forces to pass SECTION_PART instead of SECTION_START + * pause: if true, breaks the loop when live handler returned 0 + */ +static int qemu_savevm_state_live(Monitor *mon, QEMUFile *f, int section, + bool inc, bool pause) { SaveStateEntry *se; +int skip = 0, ret; QTAILQ_FOREACH(se, savevm_handlers, entry) { -if(se-set_params == NULL) { +int len, stage; + +if (se-save_live_state == NULL) { continue; - } - se-set_params(blk_enable, shared, se-opaque); +} + +/* Section type */ +qemu_put_byte(f, section); +qemu_put_be32(f, se-section_id); + +if (section == QEMU_VM_SECTION_START) { +/* ID string */ +len = strlen(se-idstr); +qemu_put_byte(f, len); +qemu_put_buffer(f, (uint8_t *)se-idstr, len); + +qemu_put_be32(f, se-instance_id); +qemu_put_be32(f, se-version_id); + +stage = inc ? 
QEMU_VM_SECTION_PART : QEMU_VM_SECTION_START; +} else { +assert(inc); +stage = section; +} + +ret = se-save_live_state(mon, f, stage, se-opaque); +if (!ret) { +skip++; +if (pause) { +break; +} +} } - -qemu_put_be32(f, QEMU_VM_FILE_MAGIC); -qemu_put_be32(f, QEMU_VM_FILE_VERSION); + +return skip; +} + +static void qemu_savevm_state_full(QEMUFile *f) +{ +SaveStateEntry *se; QTAILQ_FOREACH(se, savevm_handlers, entry) { int len; -if (se-save_live_state == NULL) +if (se-save_state == NULL se-vmsd == NULL) { continue; +} /* Section type */ -qemu_put_byte(f, QEMU_VM_SECTION_START); +qemu_put_byte(f, QEMU_VM_SECTION_FULL); qemu_put_be32(f, se-section_id); /* ID string */ @@ -1635,8 +1674,28 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, qemu_put_be32(f, se-instance_id); qemu_put_be32(f, se-version_id); -se-save_live_state(mon, f, QEMU_VM_SECTION_START, se-opaque); +vmstate_save(f, se); +} + +qemu_put_byte(f, QEMU_VM_EOF); +} + +int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, +int shared) +{ +SaveStateEntry *se; + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +if(se-set_params == NULL) { +continue; +} +se-set_params(blk_enable, shared, se-opaque); } + +qemu_put_be32(f, QEMU_VM_FILE_MAGIC); +qemu_put_be32(f, QEMU_VM_FILE_VERSION); + +qemu_savevm_state_live(mon, f, QEMU_VM_SECTION_START, 0, 0); if (qemu_file_has_error(f)) { qemu_savevm_state_cancel(mon, f); @@ -1648,29 +1707,16 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f) { -SaveStateEntry *se; int ret = 1; -QTAILQ_FOREACH(se, savevm_handlers, entry) { -if (se-save_live_state == NULL) -continue; - -/* Section type */ -qemu_put_byte(f, QEMU_VM_SECTION_PART); -qemu_put_be32(f, se-section_id); - -ret = se-save_live_state(mon, f, QEMU_VM_SECTION_PART, se-opaque); -if (!ret) { -/* Do not proceed to the next vmstate before this one reported - completion of the current stage. 
This serializes the migration - and reduces the probability that a faster changing state is - synchronized over and over again. */ -break; -} -} - -if (ret) +/* Do not proceed to the next vmstate before this one reported + completion of the current stage. This serializes the migration + and reduces the probability that a faster changing state is + synchronized over and over again. */ +ret = qemu_savevm_state_live(mon, f, QEMU_VM_SECTION_PART, 1, 1); +if (!ret) { return 1; +} if (qemu_file_has_error(f)) { qemu_savevm_state_cancel(mon, f); @@ -1682,46 +1728,40 @@ int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f) int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f) { -SaveStateEntry *se; - cpu_synchronize_all_states(); -QTAILQ_FOREACH(se, savevm_handlers, entry) { -if (se-save_live_state == NULL) -continue; - -/* Section type */ -qemu_put_byte(f, QEMU_VM_SECTION_END); -qemu_put_be32(f,
Re: [Qemu-devel] Re: [PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
2011/1/29 Paolo Bonzini pbonz...@redhat.com: On 01/28/2011 04:31 PM, Yoshiaki Tamura wrote: That's the hack I was imagining :) So your original patch is also a hack? :) TBH, yeah :) I didn't come up with a better idea that is not over-engineered. Maybe this is just an issue of preference, but I'm not sure adding kemari: is intuitive. If there were similar extensions having the same problem, I would have agreed quickly. I originally didn't have this idea, but wouldn't simply adding -kemari separate from -incoming be enough? No idea... Only, having migrate on one side and something other than -incoming on the other side seems strange. OK, then while keeping -incoming kemari:tcp:host:port as a strong solution, could you please explain why placing the original parser under tcp handler wasn't a good idea? With that, -incoming exec .*,ft_mode shouldn't be a problem. I chose -incoming tcp:host:port, ft_mode because qemu usually takes , for specifying variants for each option (e.g. -net nic,macaddr=). The problem was -incoming didn't have it yet. Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
2011/1/29 Paolo Bonzini pbonz...@redhat.com: On 01/29/2011 10:31 AM, Yoshiaki Tamura wrote: OK, then while keeping -incoming kemari:tcp:host:port as a strong solution, could you please explain why placing the original parser under tcp handler wasn't a good idea? With that, -incoming exec .*,ft_mode shouldn't be a problem. But a hypothetical -incoming unix.*,ft_mode would have to be implemented twice. You mean Kemari should be able to use with unix domain sockets, or other local communication patch? Since Kemari needs two remote hosts, I don't see why need to use unix domain sockets except for testing. Maybe I'm missing the point :) Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
2011/1/29 Paolo Bonzini pbonz...@redhat.com: On 01/29/2011 12:32 PM, Yoshiaki Tamura wrote: But a hypothetical -incoming unix.*,ft_mode would have to be implemented twice. You mean Kemari should be able to use with unix domain sockets, or other local communication patch? Since Kemari needs two remote hosts, I don't see why need to use unix domain sockets except for testing. Maybe I'm missing the point:) Well, I mentioned unix because it is basically the only other migration protocol implemented in QEMU, but the file descriptor backend could also be used. Kemari-over-SCTP could be an interesting application too for example. I'm not saying you should adjust the other patches for the implementation, but the syntax to invoke Kemari should be future-proof. Yes, I understand. I'm not negative to your suggestion at all, but trying to figure out what would be the best :) Let me take your suggestion to use -incoming kemari:tcp:host:port in the next spin, and see what others think. Thanks, Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
2011/1/28 Paolo Bonzini pbonz...@redhat.com: On 01/28/2011 08:21 AM, Yoshiaki Tamura wrote: + /* check ft_mode option */ + p = strstr(uri, ft_mode); + if (p !strcmp(p, ft_mode)) { + ft_mode = FT_INIT; + } + This works for TCP mode, but: 1) I am not sure what would happen with -incoming exec; Nothing happens if used with other protocols, but I assume you're mentioning that it's not clear from the code, which makes sense. 2) it is tricky! :) It works only because anything after the port is truncated by parse_host_port. Is there any reason why the code cannot be in tcp_start_incoming_migration, where we know the URI has the right scheme? Alternatively you could put it _before_ the scheme, like kemari:tcp:host:port. I placed it here just because string parsing was mainly done in this function except for parse_host_port. I have no objection placing the parsing at the beginning of tcp_start_incoming_migration. Thanks, Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 15/19] savevm: introduce qemu_savevm_trans_{begin,commit}.
2011/1/28 Paolo Bonzini pbonz...@redhat.com: On 01/28/2011 08:21 AM, Yoshiaki Tamura wrote: +int qemu_savevm_trans_begin(Monitor *mon, QEMUFile *f, int init) +{ + SaveStateEntry *se; + int skipped = 0; + + QTAILQ_FOREACH(se,savevm_handlers, entry) { + int len, stage, ret; + + if (se-save_live_state == NULL) { + continue; + } + + /* Section type */ + qemu_put_byte(f, QEMU_VM_SECTION_START); + qemu_put_be32(f, se-section_id); + + /* ID string */ + len = strlen(se-idstr); + qemu_put_byte(f, len); + qemu_put_buffer(f, (uint8_t *)se-idstr, len); + + qemu_put_be32(f, se-instance_id); + qemu_put_be32(f, se-version_id); + + stage = init ? QEMU_VM_SECTION_START : QEMU_VM_SECTION_PART; + ret = se-save_live_state(mon, f, stage, se-opaque); + if (!ret) { + skipped++; + } + } + + if (qemu_file_has_error(f)) { + return -EIO; + } + + return skipped; +} + Right now, this is very similar to qemu_savevm_state_begin and _iterate, but not quite. Perhaps you could abstract it to a single function that could be used everywhere live handlers are used. For example, /* section says which header to write; incremental == true forces to pass SECTION_PART instead of SECTION_START. In code: if (section == QEMU_VM_SECTION_START) { stage = incremental ? QEMU_VM_SECTION_PART : QEMU_VM_SECTION_START; } else { assert(incremental); stage = section; } */ int qemu_savevm_state_live(Monitor *mon, QEMUFile *f, int section, int incremental) Likewise, + QTAILQ_FOREACH(se, savevm_handlers, entry) { + int len; + + if (se-save_state == NULL se-vmsd == NULL) { + continue; + } + + /* Section type */ + qemu_put_byte(f, QEMU_VM_SECTION_FULL); + qemu_put_be32(f, se-section_id); + + /* ID string */ + len = strlen(se-idstr); + qemu_put_byte(f, len); + qemu_put_buffer(f, (uint8_t *)se-idstr, len); + + qemu_put_be32(f, se-instance_id); + qemu_put_be32(f, se-version_id); + + vmstate_save(f, se); + } this code is straight from qemu_savevm_state_complete and should be moved into its own function. 
Looks reasonable to avoid bit rot. Let me see what I can do. Thanks, Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
2011/1/28 Paolo Bonzini pbonz...@redhat.com: On 01/28/2011 02:53 PM, Yoshiaki Tamura wrote: 1) I am not sure what would happen with -incoming exec; Nothing happens if used with other protocols, but I assume you're mentioning that it's not clear from the code, which makes sense. I assume nothing just because the code for other protocols isn't using ft_mode. However, for -incoming exec the parsing code as it is now would trigger if the executed file ended with ft_mode. Hmm. Haven't thought about it. So now I think it should be at the beginning of the scheme for forward compatibility with everything. Is it possible to detect a migration scheme that does not support Kemari, and give an error in that case? Having a scheme like kemari:tcp:host:port looks quite challenging to me. We can of course add some quick hacks for it, but adding a nice layered architecture should be more appropriate. Similar to protocols and formats in block layer? At the same time, I want to avoid anything over engineered. Thanks, Yoshi Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
2011/1/29 Paolo Bonzini pbonz...@redhat.com: On 01/28/2011 04:05 PM, Yoshiaki Tamura wrote: Having a scheme like kemari:tcp:host:port looks quite challenging to me. We can of course add some quick hacks for it, but adding a nice layered architecture should be more appropriate. Similar to protocols and formats in block layer? At the same time, I want to avoid anything over engineered. I was simply thinking of if (strstart (uri, kemari:, p)) { ft_mode = FT_INIT; uri = p; } :) That's the hack I was imaging :) Maybe this is just an issue of preference, but I'm not sure adding kemari: to be intuitive. If there were similar extensions having the same problem, I would have agreed quickly. I originally didn't have this idea, but simply adding -kemari separate from -incoming isn't enough? Thanks, Yoshi I think the same could be done for outgoing migration instead of -k actually. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 16/19] migration: introduce migrate_ft_trans_{put,get}_ready(), and modify migrate_fd_put_ready() when ft_mode is on.
Introduce migrate_ft_trans_put_ready() which kicks the FT transaction cycle. When ft_mode is on, migrate_fd_put_ready() would open ft_trans_file and turn on event_tap. To end or cancel FT transaction, ft_mode and event_tap is turned off. migrate_ft_trans_get_ready() is called to receive ack from the receiver. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c | 267 ++- 1 files changed, 266 insertions(+), 1 deletions(-) diff --git a/migration.c b/migration.c index cd02b7e..aa30ecd 100644 --- a/migration.c +++ b/migration.c @@ -21,6 +21,7 @@ #include qemu_socket.h #include block-migration.h #include qemu-objects.h +#include event-tap.h //#define DEBUG_MIGRATION @@ -278,6 +279,14 @@ void migrate_fd_error(FdMigrationState *s) migrate_fd_cleanup(s); } +static void migrate_ft_trans_error(FdMigrationState *s) +{ +ft_mode = FT_ERROR; +qemu_savevm_state_cancel(s-mon, s-file); +migrate_fd_error(s); +event_tap_unregister(); +} + int migrate_fd_cleanup(FdMigrationState *s) { int ret = 0; @@ -313,6 +322,17 @@ void migrate_fd_put_notify(void *opaque) qemu_file_put_notify(s-file); } +static void migrate_fd_get_notify(void *opaque) +{ +FdMigrationState *s = opaque; + +qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL); +qemu_file_get_notify(s-file); +if (qemu_file_has_error(s-file)) { +migrate_ft_trans_error(s); +} +} + ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) { FdMigrationState *s = opaque; @@ -347,6 +367,10 @@ int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) ret = -(s-get_error(s)); } +if (ret == -EAGAIN) { +qemu_set_fd_handler2(s-fd, NULL, migrate_fd_get_notify, NULL, s); +} + return ret; } @@ -373,6 +397,236 @@ void migrate_fd_connect(FdMigrationState *s) migrate_fd_put_ready(s); } +static int migrate_ft_trans_commit(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_COMMIT ft_mode != FT_TRANSACTION_ATOMIC) { +fprintf(stderr, 
+migrate_ft_trans_commit: invalid ft_mode %d\n, ft_mode); +goto out; +} + +do { +if (ft_mode == FT_TRANSACTION_ATOMIC) { +if (qemu_ft_trans_begin(s-file) 0) { +fprintf(stderr, qemu_ft_trans_begin failed\n); +goto out; +} + +ret = qemu_savevm_trans_begin(s-mon, s-file, 0); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_begin failed\n); +goto out; +} + +ft_mode = FT_TRANSACTION_COMMIT; +if (ret) { +/* don't proceed until if fd isn't ready */ +goto out; +} +} + +/* make the VM state consistent by flushing outstanding events */ +vm_stop(0); + +/* send at full speed */ +qemu_file_set_rate_limit(s-file, 0); + +ret = qemu_savevm_trans_complete(s-mon, s-file); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_complete failed\n); +goto out; +} + +if (ret) { +/* don't proceed until if fd isn't ready */ +ret = 1; +goto out; +} + +ret = qemu_ft_trans_commit(s-file); +if (ret 0) { +fprintf(stderr, qemu_ft_trans_commit failed\n); +goto out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_RECV; +ret = 1; +goto out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto out; +} + +ft_mode = ret ? 
FT_TRANSACTION_BEGIN : FT_TRANSACTION_ATOMIC; +} while (ft_mode != FT_TRANSACTION_BEGIN); + +vm_start(); +ret = 0; + +out: +return ret; +} + +static int migrate_ft_trans_get_ready(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_RECV) { +fprintf(stderr, +migrate_ft_trans_get_ready: invalid ft_mode %d\n, ft_mode); +goto error_out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto error_out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_BEGIN; +} else { +ft_mode = FT_TRANSACTION_ATOMIC; + +ret = migrate_ft_trans_commit(s); +if (ret 0) { +goto error_out; +} +if (ret) { +goto out; +} +} + +vm_start(); +ret = 0; +goto out; + +error_out: +migrate_ft_trans_error(s); + +out
[PATCH 14/19] block: insert event-tap to bdrv_aio_writev(), bdrv_aio_flush() and bdrv_flush().
event-tap function is called only when it is on, and requests were sent from device emulators. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- block.c | 15 +++ 1 files changed, 15 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index ff2795b..e4df9b6 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include block_int.h #include module.h #include qemu-objects.h +#include event-tap.h #ifdef CONFIG_BSD #include sys/types.h @@ -1476,6 +1477,10 @@ int bdrv_flush(BlockDriverState *bs) } if (bs-drv bs-drv-bdrv_flush) { +if (*bs-device_name event_tap_is_on()) { +event_tap_bdrv_flush(); +} + return bs-drv-bdrv_flush(bs); } @@ -2111,6 +2116,11 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; +if (*bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); +} + if (bs-dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, opaque); @@ -2374,6 +2384,11 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, if (!drv) return NULL; + +if (*bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_flush(bs, cb, opaque); +} + return drv-bdrv_aio_flush(bs, cb, opaque); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/19] Kemari for KVM v0.2.8
Hi, This patch series is a revised version of Kemari for KVM, which incorporates the comments on the previous post. The current code is based on qemu.git 0bfe006c5380c5f8a485a55ded3329fbbc224396. The changes from v0.2.7 - v0.2.8 are: - fixed calling wrong cb in event-tap - add missing qemu_aio_release in event-tap The changes from v0.2.6 - v0.2.7 are: - add AIOCB, AIOPool and cancel functions (Kevin) - insert event-tap for bdrv_flush (Kevin) - add error handling when calling bdrv functions (Kevin) - fix usage of qemu_aio_flush and bdrv_flush (Kevin) - use bs in AIOCB on the primary (Kevin) - reorder event-tap functions to gather with block/net (Kevin) - fix checking bs->device_name (Kevin) The changes from v0.2.5 - v0.2.6 are: - use qemu_{put,get}_be32() to save/load niov in event-tap The changes from v0.2.4 - v0.2.5 are: - fixed braces and trailing spaces by using Blue's checkpatch.pl (Blue) - event-tap: don't try to send blk_req if it's a bdrv_aio_flush event The changes from v0.2.3 - v0.2.4 are: - call vm_start() before event_tap_flush_one() to avoid failure in virtio-net assertion - add vm_change_state_handler to turn off ft_mode - use qemu_iovec functions in event-tap - remove duplicated code in migration - remove unnecessary new line for error_report in ft_trans_file The changes from v0.2.2 - v0.2.3 are: - queue async net requests without copying (MST) -- if not async, contents of the packets are sent to the secondary - better description for option -k (MST) - fix memory transfer failure - fix ft transaction initiation failure The changes from v0.2.1 - v0.2.2 are: - decrement last_avail_idx with inuse before saving (MST) - remove qemu_aio_flush() and bdrv_flush_all() in migrate_ft_trans_commit() The changes from v0.2 - v0.2.1 are: - Move event-tap to net/block layer and use stubs (Blue, Paul, MST, Kevin) - Tap bdrv_aio_flush (Marcelo) - Remove multiwrite interface in event-tap (Stefan) - Fix event-tap to use pio/mmio to replay both net/block (Stefan) - Improve error
handling in event-tap (Stefan) - Fix leak in event-tap (Stefan) - Revise virtio last_avail_idx manipulation (MST) - Clean up migration.c hook (Marcelo) - Make deleting change state handler robust (Isaku, Anthony) The changes from v0.1.1 - v0.2 are: - Introduce a queue in event-tap to make VM sync live. - Change transaction receiver to a state machine for async receiving. - Replace net/block layer functions with event-tap proxy functions. - Remove dirty bitmap optimization for now. - convert DPRINTF() in ft_trans_file to trace functions. - convert fprintf() in ft_trans_file to error_report(). - improved error handling in ft_trans_file. - add a tmp pointer to qemu_del_vm_change_state_handler. The changes from v0.1 - v0.1.1 are: - events are tapped in net/block layer instead of device emulation layer. - Introduce a new option for -incoming to accept FT transaction. - Removed writev() support to QEMUFile and FdMigrationState for now. I would post this work in a different series. - Modified virtio-blk save/load handler to send inuse variable to correctly replay. - Removed configure --enable-ft-mode. - Removed unnecessary check for qemu_realloc(). The first 6 patches modify several functions of qemu to prepare for introducing Kemari specific components. The next 6 patches are the components of Kemari. They introduce event-tap and the FT transaction protocol file based on buffered file. The design document of FT transaction protocol can be found at, http://wiki.qemu.org/images/b/b1/Kemari_sender_receiver_0.5a.pdf Then the following 2 patches modify net/block layer functions with event-tap functions. Please note that if Kemari is off, event-tap will just pass through, and there is almost no intrusion into existing functions including normal live migration. Finally, the migration layer is modified to support Kemari in the last 5 patches. Again, there shouldn't be any effect if a user doesn't specify Kemari specific options.
The transaction is now async on both the sender and receiver sides. The sender side respects the max_downtime to decide when to switch from async to sync mode. The repository contains all patches I'm sending with this message. For those who want to try, please pull the following repository. It also includes dirty bitmap optimization which isn't ready for posting yet. To remove the dirty bitmap optimization, please look at HEAD~5 of the tree. git://kemari.git.sourceforge.net/gitroot/kemari/kemari next Thanks, Yoshi Yoshiaki Tamura (19): Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer(). Introduce read() to FdMigrationState. Introduce skip_header parameter to qemu_loadvm_state(). qemu-char: export socket_set_nodelay(). vl.c: add deleted flag for deleting the handler. virtio: decrement last_avail_idx with inuse before saving. Introduce fault tolerant VM transaction QEMUFile and ft_mode. savevm: introduce util functions to control
[PATCH 02/19] Introduce read() to FdMigrationState.
Currently FdMigrationState doesn't support read(), and this patch introduces it to get response from the other side. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 15 +++ migration.c | 13 + migration.h |3 +++ 3 files changed, 31 insertions(+), 0 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index b55f419..55777c8 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -39,6 +39,20 @@ static int socket_write(FdMigrationState *s, const void * buf, size_t size) return send(s-fd, buf, size, 0); } +static int socket_read(FdMigrationState *s, const void * buf, size_t size) +{ +ssize_t len; + +do { +len = recv(s-fd, (void *)buf, size, 0); +} while (len == -1 socket_error() == EINTR); +if (len == -1) { +len = -socket_error(); +} + +return len; +} + static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); @@ -94,6 +108,7 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, s-get_error = socket_errno; s-write = socket_write; +s-read = socket_read; s-close = tcp_close; s-mig_state.cancel = migrate_fd_cancel; s-mig_state.get_status = migrate_fd_get_status; diff --git a/migration.c b/migration.c index d593b1d..bee20f0 100644 --- a/migration.c +++ b/migration.c @@ -334,6 +334,19 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) return ret; } +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) +{ +FdMigrationState *s = opaque; +int ret; + +ret = s-read(s, data, size); +if (ret == -1) { +ret = -(s-get_error(s)); +} + +return ret; +} + void migrate_fd_connect(FdMigrationState *s) { int ret; diff --git a/migration.h b/migration.h index d13ed4f..7bf6747 100644 --- a/migration.h +++ b/migration.h @@ -47,6 +47,7 @@ struct FdMigrationState int (*get_error)(struct FdMigrationState*); int (*close)(struct FdMigrationState*); int (*write)(struct FdMigrationState*, const void *, size_t); +int (*read)(struct FdMigrationState *, const void *, size_t); void *opaque; }; @@ 
-115,6 +116,8 @@ void migrate_fd_put_notify(void *opaque); ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size); +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size); + void migrate_fd_connect(FdMigrationState *s); void migrate_fd_put_ready(void *opaque); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/19] qemu-char: export socket_set_nodelay().
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- qemu-char.c |2 +- qemu_socket.h |1 + 2 files changed, 2 insertions(+), 1 deletions(-) diff --git a/qemu-char.c b/qemu-char.c index edc9ad6..737d347 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -2116,7 +2116,7 @@ static void tcp_chr_telnet_init(int fd) send(fd, (char *)buf, 3, 0); } -static void socket_set_nodelay(int fd) +void socket_set_nodelay(int fd) { int val = 1; setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)val, sizeof(val)); diff --git a/qemu_socket.h b/qemu_socket.h index 897a8ae..b7f8465 100644 --- a/qemu_socket.h +++ b/qemu_socket.h @@ -36,6 +36,7 @@ int inet_aton(const char *cp, struct in_addr *ia); int qemu_socket(int domain, int type, int protocol); int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen); void socket_set_nonblock(int fd); +void socket_set_nodelay(int fd); int send_all(int fd, const void *buf, int len1); /* New, ipv6-ready socket helper functions, see qemu-sockets.c */ -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 08/19] savevm: introduce util functions to control ft_trans_file from savevm layer.
To utilize ft_trans_file function, savevm needs interfaces to be exported. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |5 ++ savevm.c | 149 ++ 2 files changed, 154 insertions(+), 0 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index 7f05830..52e807c 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -51,6 +51,7 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, QEMUFile *qemu_fopen(const char *filename, const char *mode); QEMUFile *qemu_fdopen(int fd, const char *mode); QEMUFile *qemu_fopen_socket(int fd); +QEMUFile *qemu_fopen_ft_trans(int s_fd, int c_fd); QEMUFile *qemu_popen(FILE *popen_file, const char *mode); QEMUFile *qemu_popen_cmd(const char *command, const char *mode); int qemu_stdio_fd(QEMUFile *f); @@ -60,6 +61,9 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); void *qemu_realloc_buffer(QEMUFile *f, int size); void qemu_clear_buffer(QEMUFile *f); +int qemu_ft_trans_begin(QEMUFile *f); +int qemu_ft_trans_commit(QEMUFile *f); +int qemu_ft_trans_cancel(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { @@ -94,6 +98,7 @@ void qemu_file_set_error(QEMUFile *f); * halted due to rate limiting or EAGAIN errors occur as it can be used to * resume output. 
*/ void qemu_file_put_notify(QEMUFile *f); +void qemu_file_get_notify(void *opaque); static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { diff --git a/savevm.c b/savevm.c index dc15c03..5418280 100644 --- a/savevm.c +++ b/savevm.c @@ -83,6 +83,7 @@ #include migration.h #include qemu_socket.h #include qemu-queue.h +#include ft_trans_file.h #define SELF_ANNOUNCE_ROUNDS 5 @@ -190,6 +191,13 @@ typedef struct QEMUFileSocket QEMUFile *file; } QEMUFileSocket; +typedef struct QEMUFileSocketTrans +{ +int fd; +QEMUFileSocket *s; +VMChangeStateEntry *e; +} QEMUFileSocketTrans; + static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) { QEMUFileSocket *s = opaque; @@ -205,6 +213,22 @@ static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) return len; } +static ssize_t socket_put_buffer(void *opaque, const void *buf, size_t size) +{ +QEMUFileSocket *s = opaque; +ssize_t len; + +do { +len = send(s-fd, (void *)buf, size, 0); +} while (len == -1 socket_error() == EINTR); + +if (len == -1) { +len = -socket_error(); +} + +return len; +} + static int socket_close(void *opaque) { QEMUFileSocket *s = opaque; @@ -212,6 +236,70 @@ static int socket_close(void *opaque) return 0; } +static int socket_trans_get_buffer(void *opaque, uint8_t *buf, int64_t pos, size_t size) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; +ssize_t len; + +len = socket_get_buffer(s, buf, pos, size); + +return len; +} + +static ssize_t socket_trans_put_buffer(void *opaque, const void *buf, size_t size) +{ +QEMUFileSocketTrans *t = opaque; + +return socket_put_buffer(t-s, buf, size); +} + + +static int socket_trans_get_ready(void *opaque) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; +QEMUFile *f = s-file; +int ret = 0; + +ret = qemu_loadvm_state(f, 1); +if (ret 0) { +fprintf(stderr, +socket_trans_get_ready: error while loading vmstate\n); +} + +return ret; +} + +static int socket_trans_close(void *opaque) +{ 
+QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; + +qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL); +qemu_set_fd_handler2(t-fd, NULL, NULL, NULL, NULL); +qemu_del_vm_change_state_handler(t-e); +close(s-fd); +close(t-fd); +qemu_free(s); +qemu_free(t); + +return 0; +} + +static void socket_trans_resume(void *opaque, int running, int reason) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; + +if (!running) { +return; +} + +qemu_announce_self(); +qemu_fclose(s-file); +} + static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size) { QEMUFileStdio *s = opaque; @@ -334,6 +422,26 @@ QEMUFile *qemu_fopen_socket(int fd) return s-file; } +QEMUFile *qemu_fopen_ft_trans(int s_fd, int c_fd) +{ +QEMUFileSocketTrans *t = qemu_mallocz(sizeof(QEMUFileSocketTrans)); +QEMUFileSocket *s = qemu_mallocz(sizeof(QEMUFileSocket)); + +t-s = s; +t-fd = s_fd; +t-e = qemu_add_vm_change_state_handler(socket_trans_resume, t); + +s-fd = c_fd; +s-file = qemu_fopen_ops_ft_trans(t, socket_trans_put_buffer, + socket_trans_get_buffer, NULL, + socket_trans_get_ready, + migrate_fd_wait_for_unfreeze, + socket_trans_close, 0
[PATCH 15/19] savevm: introduce qemu_savevm_trans_{begin,commit}.
Introduce qemu_savevm_state_{begin,commit} to send the memory and device info together, while avoiding cancelling memory state tracking. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- savevm.c | 93 ++ sysemu.h |2 + 2 files changed, 95 insertions(+), 0 deletions(-) diff --git a/savevm.c b/savevm.c index 5418280..73465ed 100644 --- a/savevm.c +++ b/savevm.c @@ -1726,6 +1726,99 @@ int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f) return 0; } +int qemu_savevm_trans_begin(Monitor *mon, QEMUFile *f, int init) +{ +SaveStateEntry *se; +int skipped = 0; + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int len, stage, ret; + +if (se-save_live_state == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_START); +qemu_put_be32(f, se-section_id); + +/* ID string */ +len = strlen(se-idstr); +qemu_put_byte(f, len); +qemu_put_buffer(f, (uint8_t *)se-idstr, len); + +qemu_put_be32(f, se-instance_id); +qemu_put_be32(f, se-version_id); + +stage = init ? QEMU_VM_SECTION_START : QEMU_VM_SECTION_PART; +ret = se-save_live_state(mon, f, stage, se-opaque); +if (!ret) { +skipped++; +} +} + +if (qemu_file_has_error(f)) { +return -EIO; +} + +return skipped; +} + +int qemu_savevm_trans_complete(Monitor *mon, QEMUFile *f) +{ +SaveStateEntry *se; + +cpu_synchronize_all_states(); + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int ret; + +if (se-save_live_state == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_PART); +qemu_put_be32(f, se-section_id); + +ret = se-save_live_state(mon, f, QEMU_VM_SECTION_PART, se-opaque); +if (!ret) { +/* do not proceed to the next vmstate. 
*/ +return 1; +} +} + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int len; + +if (se-save_state == NULL se-vmsd == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_FULL); +qemu_put_be32(f, se-section_id); + +/* ID string */ +len = strlen(se-idstr); +qemu_put_byte(f, len); +qemu_put_buffer(f, (uint8_t *)se-idstr, len); + +qemu_put_be32(f, se-instance_id); +qemu_put_be32(f, se-version_id); + +vmstate_save(f, se); +} + +qemu_put_byte(f, QEMU_VM_EOF); + +if (qemu_file_has_error(f)) { +return -EIO; +} + +return 0; +} + void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f) { SaveStateEntry *se; diff --git a/sysemu.h b/sysemu.h index 329415f..ee2c382 100644 --- a/sysemu.h +++ b/sysemu.h @@ -81,6 +81,8 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); +int qemu_savevm_trans_begin(Monitor *mon, QEMUFile *f, int init); +int qemu_savevm_trans_complete(Monitor *mon, QEMUFile *f); int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/19] Introduce event-tap.
event-tap controls when to start FT transaction, and provides proxy functions to called from net/block devices. While FT transaction, it queues up net/block requests, and flush them when the transaction gets completed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.target |1 + event-tap.c | 939 +++ event-tap.h | 44 +++ qemu-tool.c | 28 ++ trace-events|9 + 5 files changed, 1021 insertions(+), 0 deletions(-) create mode 100644 event-tap.c create mode 100644 event-tap.h diff --git a/Makefile.target b/Makefile.target index cd2abde..20f02d5 100644 --- a/Makefile.target +++ b/Makefile.target @@ -199,6 +199,7 @@ obj-y += rwhandler.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o LIBS+=-lz +obj-y += event-tap.o QEMU_CFLAGS += $(VNC_TLS_CFLAGS) QEMU_CFLAGS += $(VNC_SASL_CFLAGS) diff --git a/event-tap.c b/event-tap.c new file mode 100644 index 000..f44d835 --- /dev/null +++ b/event-tap.c @@ -0,0 +1,939 @@ +/* + * Event Tap functions for QEMU + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include qemu-common.h +#include qemu-error.h +#include block.h +#include block_int.h +#include ioport.h +#include osdep.h +#include sysemu.h +#include hw/hw.h +#include net.h +#include event-tap.h +#include trace.h + +enum EVENT_TAP_STATE { +EVENT_TAP_OFF, +EVENT_TAP_ON, +EVENT_TAP_SUSPEND, +EVENT_TAP_FLUSH, +EVENT_TAP_LOAD, +EVENT_TAP_REPLAY, +}; + +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF; + +typedef struct EventTapIOport { +uint32_t address; +uint32_t data; +int index; +} EventTapIOport; + +#define MMIO_BUF_SIZE 8 + +typedef struct EventTapMMIO { +uint64_t address; +uint8_t buf[MMIO_BUF_SIZE]; +int len; +} EventTapMMIO; + +typedef struct EventTapNetReq { +char *device_name; +int iovcnt; +int vlan_id; +bool vlan_needed; +bool async; +struct iovec *iov; +NetPacketSent *sent_cb; +} EventTapNetReq; + +#define MAX_BLOCK_REQUEST 32 + +typedef struct EventTapAIOCB EventTapAIOCB; + +typedef struct EventTapBlkReq { +char *device_name; +int num_reqs; +int num_cbs; +bool is_flush; +BlockRequest reqs[MAX_BLOCK_REQUEST]; +EventTapAIOCB *acb[MAX_BLOCK_REQUEST]; +} EventTapBlkReq; + +#define EVENT_TAP_IOPORT (1 0) +#define EVENT_TAP_MMIO (1 1) +#define EVENT_TAP_NET(1 2) +#define EVENT_TAP_BLK(1 3) + +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1) + +typedef struct EventTapLog { +int mode; +union { +EventTapIOport ioport; +EventTapMMIO mmio; +}; +union { +EventTapNetReq net_req; +EventTapBlkReq blk_req; +}; +QTAILQ_ENTRY(EventTapLog) node; +} EventTapLog; + +struct EventTapAIOCB { +BlockDriverAIOCB common; +BlockDriverAIOCB *acb; +bool is_canceled; +}; + +static EventTapLog *last_event_tap; + +static QTAILQ_HEAD(, EventTapLog) event_list; +static QTAILQ_HEAD(, EventTapLog) event_pool; + +static int (*event_tap_cb)(void); +static QEMUBH *event_tap_bh; +static VMChangeStateEntry *vmstate; + +static void event_tap_bh_cb(void *p) +{ +if (event_tap_cb) { +event_tap_cb(); +} + +qemu_bh_delete(event_tap_bh); +event_tap_bh = NULL; +} + +static void 
event_tap_schedule_bh(void) +{ +trace_event_tap_ignore_bh(!!event_tap_bh); + +/* if bh is already set, we ignore it for now */ +if (event_tap_bh) { +return; +} + +event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL); +qemu_bh_schedule(event_tap_bh); + +return; +} + +static void *event_tap_alloc_log(void) +{ +EventTapLog *log; + +if (QTAILQ_EMPTY(event_pool)) { +log = qemu_mallocz(sizeof(EventTapLog)); +} else { +log = QTAILQ_FIRST(event_pool); +QTAILQ_REMOVE(event_pool, log, node); +} + +return log; +} + +static void event_tap_free_net_req(EventTapNetReq *net_req); +static void event_tap_free_blk_req(EventTapBlkReq *blk_req); + +static void event_tap_free_log(EventTapLog *log) +{ +int mode = log-mode ~EVENT_TAP_TYPE_MASK; + +if (mode == EVENT_TAP_NET) { +event_tap_free_net_req(log-net_req); +} else if (mode == EVENT_TAP_BLK) { +event_tap_free_blk_req(log-blk_req); +} + +log-mode = 0; + +/* return the log to event_pool */ +QTAILQ_INSERT_HEAD(event_pool, log, node); +} + +static void event_tap_free_pool(void) +{ +EventTapLog *log, *next; + +QTAILQ_FOREACH_SAFE(log, event_pool, node, next) { +QTAILQ_REMOVE(event_pool, log, node); +qemu_free(log); +} +} + +static void event_tap_free_net_req(EventTapNetReq *net_req) +{ +int i; + +if (!net_req-async
[PATCH 07/19] Introduce fault tolerant VM transaction QEMUFile and ft_mode.
This code implements VM transaction protocol. Like buffered_file, it sits between savevm and migration layer. With this architecture, VM transaction protocol is implemented mostly independent from other existing code. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.objs |1 + ft_trans_file.c | 624 +++ ft_trans_file.h | 72 +++ migration.c |3 + trace-events| 16 ++ 5 files changed, 716 insertions(+), 0 deletions(-) create mode 100644 ft_trans_file.c create mode 100644 ft_trans_file.h diff --git a/Makefile.objs b/Makefile.objs index fda366d..1f10fbc 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -100,6 +100,7 @@ common-obj-y += msmouse.o ps2.o common-obj-y += qdev.o qdev-properties.o common-obj-y += block-migration.o common-obj-y += pflib.o +common-obj-y += ft_trans_file.o common-obj-$(CONFIG_BRLAPI) += baum.o common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o diff --git a/ft_trans_file.c b/ft_trans_file.c new file mode 100644 index 000..2b42b95 --- /dev/null +++ b/ft_trans_file.c @@ -0,0 +1,624 @@ +/* + * Fault tolerant VM transaction QEMUFile + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This source code is based on buffered_file.c. + * Copyright IBM, Corp. 
2008 + * Authors: + * Anthony Liguorialigu...@us.ibm.com + */ + +#include qemu-common.h +#include qemu-error.h +#include hw/hw.h +#include qemu-timer.h +#include sysemu.h +#include qemu-char.h +#include trace.h +#include ft_trans_file.h + +typedef struct FtTransHdr +{ +uint16_t cmd; +uint16_t id; +uint32_t seq; +uint32_t payload_len; +} FtTransHdr; + +typedef struct QEMUFileFtTrans +{ +FtTransPutBufferFunc *put_buffer; +FtTransGetBufferFunc *get_buffer; +FtTransPutReadyFunc *put_ready; +FtTransGetReadyFunc *get_ready; +FtTransWaitForUnfreezeFunc *wait_for_unfreeze; +FtTransCloseFunc *close; +void *opaque; +QEMUFile *file; + +enum QEMU_VM_TRANSACTION_STATE state; +uint32_t seq; +uint16_t id; + +int has_error; + +bool freeze_output; +bool freeze_input; +bool rate_limit; +bool is_sender; +bool is_payload; + +uint8_t *buf; +size_t buf_max_size; +size_t put_offset; +size_t get_offset; + +FtTransHdr header; +size_t header_offset; +} QEMUFileFtTrans; + +#define IO_BUF_SIZE 32768 + +static void ft_trans_append(QEMUFileFtTrans *s, +const uint8_t *buf, size_t size) +{ +if (size (s-buf_max_size - s-put_offset)) { +trace_ft_trans_realloc(s-buf_max_size, size + 1024); +s-buf_max_size += size + 1024; +s-buf = qemu_realloc(s-buf, s-buf_max_size); +} + +trace_ft_trans_append(size); +memcpy(s-buf + s-put_offset, buf, size); +s-put_offset += size; +} + +static void ft_trans_flush(QEMUFileFtTrans *s) +{ +size_t offset = 0; + +if (s-has_error) { +error_report(flush when error %d, bailing, s-has_error); +return; +} + +while (offset s-put_offset) { +ssize_t ret; + +ret = s-put_buffer(s-opaque, s-buf + offset, s-put_offset - offset); +if (ret == -EAGAIN) { +break; +} + +if (ret = 0) { +error_report(error flushing data, %s, strerror(errno)); +s-has_error = FT_TRANS_ERR_FLUSH; +break; +} else { +offset += ret; +} +} + +trace_ft_trans_flush(offset, s-put_offset); +memmove(s-buf, s-buf + offset, s-put_offset - offset); +s-put_offset -= offset; +s-freeze_output = !!s-put_offset; +} + +static 
ssize_t ft_trans_put(void *opaque, void *buf, int size) +{ +QEMUFileFtTrans *s = opaque; +size_t offset = 0; +ssize_t len; + +/* flush buffered data before putting next */ +if (s-put_offset) { +ft_trans_flush(s); +} + +while (!s-freeze_output offset size) { +len = s-put_buffer(s-opaque, (uint8_t *)buf + offset, size - offset); + +if (len == -EAGAIN) { +trace_ft_trans_freeze_output(); +s-freeze_output = 1; +break; +} + +if (len = 0) { +error_report(putting data failed, %s, strerror(errno)); +s-has_error = 1; +offset = -EINVAL; +break; +} + +offset += len; +} + +if (s-freeze_output) { +ft_trans_append(s, buf + offset, size - offset); +offset = size; +} + +return offset; +} + +static int ft_trans_send_header(QEMUFileFtTrans *s, +enum QEMU_VM_TRANSACTION_STATE state, +uint32_t payload_len) +{ +int ret; +FtTransHdr
[PATCH 12/19] Insert event_tap_mmio() to cpu_physical_memory_rw() in exec.c.
Record mmio write event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- exec.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/exec.c b/exec.c index e950df2..c81fd09 100644 --- a/exec.c +++ b/exec.c @@ -33,6 +33,7 @@ #include osdep.h #include kvm.h #include qemu-timer.h +#include event-tap.h #if defined(CONFIG_USER_ONLY) #include qemu.h #include signal.h @@ -3632,6 +3633,9 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, io_index = (pd IO_MEM_SHIFT) (IO_MEM_NB_ENTRIES - 1); if (p) addr1 = (addr ~TARGET_PAGE_MASK) + p-region_offset; + +event_tap_mmio(addr, buf, len); + /* XXX: could force cpu_single_env to NULL to avoid potential bugs */ if (l = 4 ((addr1 3) == 0)) { -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/19] Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer().
Currently buf size is fixed at 32KB. It would be useful if it could be flexible. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |2 ++ savevm.c | 20 +++- 2 files changed, 21 insertions(+), 1 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index dd993de..7f05830 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -58,6 +58,8 @@ void qemu_fflush(QEMUFile *f); int qemu_fclose(QEMUFile *f); void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); +void *qemu_realloc_buffer(QEMUFile *f, int size); +void qemu_clear_buffer(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { diff --git a/savevm.c b/savevm.c index fcd8db4..d1efdd3 100644 --- a/savevm.c +++ b/savevm.c @@ -172,7 +172,8 @@ struct QEMUFile { when reading */ int buf_index; int buf_size; /* 0 when writing */ -uint8_t buf[IO_BUF_SIZE]; +int buf_max_size; +uint8_t *buf; int has_error; }; @@ -423,6 +424,9 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, f-get_rate_limit = get_rate_limit; f-is_write = 0; +f-buf_max_size = IO_BUF_SIZE; +f-buf = qemu_malloc(sizeof(uint8_t) * f-buf_max_size); + return f; } @@ -453,6 +457,19 @@ void qemu_fflush(QEMUFile *f) } } +void *qemu_realloc_buffer(QEMUFile *f, int size) +{ +f-buf_max_size = size; +f-buf = qemu_realloc(f-buf, f-buf_max_size); + +return f-buf; +} + +void qemu_clear_buffer(QEMUFile *f) +{ +f-buf_size = f-buf_index = f-buf_offset = 0; +} + static void qemu_fill_buffer(QEMUFile *f) { int len; @@ -478,6 +495,7 @@ int qemu_fclose(QEMUFile *f) qemu_fflush(f); if (f-close) ret = f-close(f-opaque); +qemu_free(f-buf); qemu_free(f); return ret; } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
The option looks like, -incoming protocol:address:port,ft_mode Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/migration.c b/migration.c index 1752cf4..29d4fb1 100644 --- a/migration.c +++ b/migration.c @@ -45,6 +45,12 @@ int qemu_start_incoming_migration(const char *uri) const char *p; int ret; +/* check ft_mode option */ +p = strstr(uri, ft_mode); +if (p !strcmp(p, ft_mode)) { +ft_mode = FT_INIT; +} + if (strstart(uri, tcp:, p)) ret = tcp_start_incoming_migration(p); #if !defined(WIN32) -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/19] vl.c: add deleted flag for deleting the handler.
Make deleting handlers robust against deletion of any elements in a handler by using a deleted flag like in file descriptors. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c | 13 + 1 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vl.c b/vl.c index 0292184..8bbb785 100644 --- a/vl.c +++ b/vl.c @@ -1140,6 +1140,7 @@ static void nographic_update(void *opaque) struct vm_change_state_entry { VMChangeStateHandler *cb; void *opaque; +int deleted; QLIST_ENTRY (vm_change_state_entry) entries; }; @@ -1160,8 +1161,7 @@ VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, void qemu_del_vm_change_state_handler(VMChangeStateEntry *e) { -QLIST_REMOVE (e, entries); -qemu_free (e); +e-deleted = 1; } void vm_state_notify(int running, int reason) @@ -1170,8 +1170,13 @@ void vm_state_notify(int running, int reason) trace_vm_state_notify(running, reason); -for (e = vm_change_state_head.lh_first; e; e = e-entries.le_next) { -e-cb(e-opaque, running, reason); +QLIST_FOREACH(e, vm_change_state_head, entries) { +if (e-deleted) { +QLIST_REMOVE(e, entries); +qemu_free(e); +} else { +e-cb(e-opaque, running, reason); +} } } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 17/19] migration-tcp: modify tcp_accept_incoming_migration() to handle ft_mode, and add a hack not to close fd when ft_mode is enabled.
When ft_mode is set in the header, tcp_accept_incoming_migration() sets ft_trans_incoming() as a callback, and calls qemu_file_get_notify() to receive FT transactions iteratively. We also need a hack not to close fd before moving to ft_transaction mode, so that we can reuse the fd for it. vm_change_state_handler is added to turn off ft_mode when cont is pressed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 67 ++- 1 files changed, 66 insertions(+), 1 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index 55777c8..84076d6 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -18,6 +18,8 @@ #include sysemu.h #include buffered_file.h #include block.h +#include ft_trans_file.h +#include event-tap.h //#define DEBUG_MIGRATION_TCP @@ -29,6 +31,8 @@ do { } while (0) #endif +static VMChangeStateEntry *vmstate; + static int socket_errno(FdMigrationState *s) { return socket_error(); @@ -56,7 +60,8 @@ static int socket_read(FdMigrationState *s, const void * buf, size_t size) static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); -if (s-fd != -1) { +/* FIX ME: accessing ft_mode here isn't clean */ +if (s-fd != -1 ft_mode != FT_INIT) { close(s-fd); s-fd = -1; } @@ -150,6 +155,36 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, return s-mig_state; } +static void ft_trans_incoming(void *opaque) +{ +QEMUFile *f = opaque; + +qemu_file_get_notify(f); +if (qemu_file_has_error(f)) { +ft_mode = FT_ERROR; +qemu_fclose(f); +} +} + +static void ft_trans_reset(void *opaque, int running, int reason) +{ +QEMUFile *f = opaque; + +if (running) { +if (ft_mode != FT_ERROR) { +qemu_fclose(f); +} +ft_mode = FT_OFF; +qemu_del_vm_change_state_handler(vmstate); +} +} + +static void ft_trans_schedule_replay(QEMUFile *f) +{ +event_tap_schedule_replay(); +vmstate = qemu_add_vm_change_state_handler(ft_trans_reset, f); +} + static void tcp_accept_incoming_migration(void *opaque) { struct sockaddr_in addr; @@ -175,8 +210,38 @@ static void 
tcp_accept_incoming_migration(void *opaque) goto out; } +if (ft_mode == FT_INIT) { +autostart = 0; +} + process_incoming_migration(f); + +if (ft_mode == FT_INIT) { +int ret; + +socket_set_nodelay(c); + +f = qemu_fopen_ft_trans(s, c); +if (f == NULL) { +fprintf(stderr, could not qemu_fopen_ft_trans\n); +goto out; +} + +/* need to wait sender to setup */ +ret = qemu_ft_trans_begin(f); +if (ret 0) { +goto out; +} + +qemu_set_fd_handler2(c, NULL, ft_trans_incoming, NULL, f); +ft_trans_schedule_replay(f); +ft_mode = FT_TRANSACTION_RECV; + +return; +} + qemu_fclose(f); + out: close(c); out2: -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/19] virtio: decrement last_avail_idx with inuse before saving.
For regular migration inuse == 0 always as requests are flushed before save. However, event-tap log when enabled introduces an extra queue for requests which is not being flushed, thus the last inuse requests are left in the event-tap queue. Move the last_avail_idx value sent to the remote back to make it repeat the last inuse requests. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/virtio.c | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 31bd9e3..f05d1b6 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -673,12 +673,20 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f) qemu_put_be32(f, i); for (i = 0; i VIRTIO_PCI_QUEUE_MAX; i++) { +/* For regular migration inuse == 0 always as + * requests are flushed before save. However, + * event-tap log when enabled introduces an extra + * queue for requests which is not being flushed, + * thus the last inuse requests are left in the event-tap queue. + * Move the last_avail_idx value sent to the remote back + * to make it repeat the last inuse requests. */ +uint16_t last_avail = vdev-vq[i].last_avail_idx - vdev-vq[i].inuse; if (vdev-vq[i].vring.num == 0) break; qemu_put_be32(f, vdev-vq[i].vring.num); qemu_put_be64(f, vdev-vq[i].pa); -qemu_put_be16s(f, vdev-vq[i].last_avail_idx); +qemu_put_be16s(f, last_avail); if (vdev-binding-save_queue) vdev-binding-save_queue(vdev-binding_opaque, i, f); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/19] ioport: insert event_tap_ioport() to ioport_write().
Record ioport event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- ioport.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/ioport.c b/ioport.c index aa4188a..74aebf5 100644 --- a/ioport.c +++ b/ioport.c @@ -27,6 +27,7 @@ #include ioport.h #include trace.h +#include event-tap.h /***/ /* IO Port */ @@ -76,6 +77,7 @@ static void ioport_write(int index, uint32_t address, uint32_t data) default_ioport_writel }; IOPortWriteFunc *func = ioport_write_table[index][address]; +event_tap_ioport(index, address, data); if (!func) func = default_func[index]; func(ioport_opaque[address], address, data); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/19] net: insert event-tap to qemu_send_packet() and qemu_sendv_packet_async().
event-tap function is called only when it is on. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- net.c |9 + 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/net.c b/net.c index 9ba5be2..1176124 100644 --- a/net.c +++ b/net.c @@ -36,6 +36,7 @@ #include qemu-common.h #include qemu_socket.h #include hw/qdev.h +#include event-tap.h static QTAILQ_HEAD(, VLANState) vlans; static QTAILQ_HEAD(, VLANClientState) non_vlan_clients; @@ -559,6 +560,10 @@ ssize_t qemu_send_packet_async(VLANClientState *sender, void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size) { +if (event_tap_is_on()) { +return event_tap_send_packet(vc, buf, size); +} + qemu_send_packet_async(vc, buf, size, NULL); } @@ -657,6 +662,10 @@ ssize_t qemu_sendv_packet_async(VLANClientState *sender, { NetQueue *queue; +if (event_tap_is_on()) { +return event_tap_sendv_packet_async(sender, iov, iovcnt, sent_cb); +} + if (sender-link_down || (!sender-peer !sender-vlan)) { return calc_iov_length(iov, iovcnt); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/19] Introduce skip_header parameter to qemu_loadvm_state().
Introduce skip_header parameter to qemu_loadvm_state() so that it can be called iteratively without reading the header. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |2 +- savevm.c| 24 +--- sysemu.h|2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/migration.c b/migration.c index bee20f0..11eff51 100644 --- a/migration.c +++ b/migration.c @@ -60,7 +60,7 @@ int qemu_start_incoming_migration(const char *uri) void process_incoming_migration(QEMUFile *f) { -if (qemu_loadvm_state(f) 0) { +if (qemu_loadvm_state(f, 0) 0) { fprintf(stderr, load of migration failed\n); exit(0); } diff --git a/savevm.c b/savevm.c index d1efdd3..dc15c03 100644 --- a/savevm.c +++ b/savevm.c @@ -1709,7 +1709,7 @@ typedef struct LoadStateEntry { int version_id; } LoadStateEntry; -int qemu_loadvm_state(QEMUFile *f) +int qemu_loadvm_state(QEMUFile *f, int skip_header) { QLIST_HEAD(, LoadStateEntry) loadvm_handlers = QLIST_HEAD_INITIALIZER(loadvm_handlers); @@ -1722,17 +1722,19 @@ int qemu_loadvm_state(QEMUFile *f) return -EINVAL; } -v = qemu_get_be32(f); -if (v != QEMU_VM_FILE_MAGIC) -return -EINVAL; +if (!skip_header) { +v = qemu_get_be32(f); +if (v != QEMU_VM_FILE_MAGIC) +return -EINVAL; -v = qemu_get_be32(f); -if (v == QEMU_VM_FILE_VERSION_COMPAT) { -fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); -return -ENOTSUP; +v = qemu_get_be32(f); +if (v == QEMU_VM_FILE_VERSION_COMPAT) { +fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); +return -ENOTSUP; +} +if (v != QEMU_VM_FILE_VERSION) +return -ENOTSUP; } -if (v != QEMU_VM_FILE_VERSION) -return -ENOTSUP; while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; @@ -2055,7 +2057,7 @@ int load_vmstate(const char *name) return -EINVAL; } -ret = qemu_loadvm_state(f); +ret = qemu_loadvm_state(f, 0); qemu_fclose(f); if (ret 0) { diff --git a/sysemu.h b/sysemu.h index 0c969f2..329415f 100644 --- a/sysemu.h +++ 
b/sysemu.h @@ -81,7 +81,7 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); -int qemu_loadvm_state(QEMUFile *f); +int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ void do_info_slirp(Monitor *mon); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/19] Call init handler of event-tap at main() in vl.c.
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/vl.c b/vl.c index 8bbb785..9faeb27 100644 --- a/vl.c +++ b/vl.c @@ -162,6 +162,7 @@ int main(int argc, char **argv) #include qemu-queue.h #include cpus.h #include arch_init.h +#include event-tap.h #include ui/qemu-spice.h @@ -2895,6 +2896,8 @@ int main(int argc, char **argv, char **envp) blk_mig_init(); +event_tap_init(); + if (default_cdrom) { /* we always create the cdrom drive, even if no disk is there */ drive_add(NULL, CDROM_ALIAS); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 18/19] Introduce -k option to enable FT migration mode (Kemari).
When -k option is set to migrate command, it will turn on ft_mode to start FT migration mode (Kemari). Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hmp-commands.hx |7 --- migration.c |4 qmp-commands.hx |7 --- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index 1cea572..b7f8f2f 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -735,13 +735,14 @@ ETEXI { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, diff --git a/migration.c b/migration.c index aa30ecd..1752cf4 100644 --- a/migration.c +++ b/migration.c @@ -96,6 +96,10 @@ int do_migrate(Monitor *mon, const QDict *qdict, QObject **ret_data) return -1; } +if (qdict_get_try_bool(qdict, ft, 0)) { +ft_mode = FT_INIT; +} + if (strstart(uri, tcp:, p)) { s = tcp_start_outgoing_migration(mon, p, max_throttle, detach, blk, inc); diff --git a/qmp-commands.hx b/qmp-commands.hx index 56c4d8b..1521931 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -431,13 +431,14 @@ EQMP { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src 
and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/19] Insert event_tap_mmio() to cpu_physical_memory_rw() in exec.c.
Record mmio write event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- exec.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/exec.c b/exec.c index e950df2..c81fd09 100644 --- a/exec.c +++ b/exec.c @@ -33,6 +33,7 @@ #include osdep.h #include kvm.h #include qemu-timer.h +#include event-tap.h #if defined(CONFIG_USER_ONLY) #include qemu.h #include signal.h @@ -3632,6 +3633,9 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, io_index = (pd IO_MEM_SHIFT) (IO_MEM_NB_ENTRIES - 1); if (p) addr1 = (addr ~TARGET_PAGE_MASK) + p-region_offset; + +event_tap_mmio(addr, buf, len); + /* XXX: could force cpu_single_env to NULL to avoid potential bugs */ if (l = 4 ((addr1 3) == 0)) { -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/19] net: insert event-tap to qemu_send_packet() and qemu_sendv_packet_async().
event-tap function is called only when it is on. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- net.c |9 + 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/net.c b/net.c index 9ba5be2..1176124 100644 --- a/net.c +++ b/net.c @@ -36,6 +36,7 @@ #include qemu-common.h #include qemu_socket.h #include hw/qdev.h +#include event-tap.h static QTAILQ_HEAD(, VLANState) vlans; static QTAILQ_HEAD(, VLANClientState) non_vlan_clients; @@ -559,6 +560,10 @@ ssize_t qemu_send_packet_async(VLANClientState *sender, void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size) { +if (event_tap_is_on()) { +return event_tap_send_packet(vc, buf, size); +} + qemu_send_packet_async(vc, buf, size, NULL); } @@ -657,6 +662,10 @@ ssize_t qemu_sendv_packet_async(VLANClientState *sender, { NetQueue *queue; +if (event_tap_is_on()) { +return event_tap_sendv_packet_async(sender, iov, iovcnt, sent_cb); +} + if (sender-link_down || (!sender-peer !sender-vlan)) { return calc_iov_length(iov, iovcnt); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/19] Kemari for KVM v0.2.7
Hi, This patch series is a revised version of Kemari for KVM, which addresses the comments on the previous post. The current code is based on qemu.git 0bfe006c5380c5f8a485a55ded3329fbbc224396. The changes from v0.2.6 -> v0.2.7 are: - add AIOCB, AIOPool and cancel functions (Kevin) - insert event-tap for bdrv_flush (Kevin) - add error handling when calling bdrv functions (Kevin) - fix usage of qemu_aio_flush and bdrv_flush (Kevin) - use bs in AIOCB on the primary (Kevin) - reorder event-tap functions to gather with block/net (Kevin) - fix checking bs->device_name (Kevin) The changes from v0.2.5 -> v0.2.6 are: - use qemu_{put,get}_be32() to save/load niov in event-tap The changes from v0.2.4 -> v0.2.5 are: - fixed braces and trailing spaces by using Blue's checkpatch.pl (Blue) - event-tap: don't try to send blk_req if it's a bdrv_aio_flush event The changes from v0.2.3 -> v0.2.4 are: - call vm_start() before event_tap_flush_one() to avoid failure in virtio-net assertion - add vm_change_state_handler to turn off ft_mode - use qemu_iovec functions in event-tap - remove duplicated code in migration - remove unnecessary new line for error_report in ft_trans_file The changes from v0.2.2 -> v0.2.3 are: - queue async net requests without copying (MST) -- if not async, contents of the packets are sent to the secondary - better description for option -k (MST) - fix memory transfer failure - fix ft transaction initiation failure The changes from v0.2.1 -> v0.2.2 are: - decrement last_avail_idx with inuse before saving (MST) - remove qemu_aio_flush() and bdrv_flush_all() in migrate_ft_trans_commit() The changes from v0.2 -> v0.2.1 are: - Move event-tap to net/block layer and use stubs (Blue, Paul, MST, Kevin) - Tap bdrv_aio_flush (Marcelo) - Remove multiwrite interface in event-tap (Stefan) - Fix event-tap to use pio/mmio to replay both net/block (Stefan) - Improve error handling in event-tap (Stefan) - Fix leak in event-tap (Stefan) - Revise virtio last_avail_idx manipulation (MST) - Clean 
up migration.c hook (Marcelo) - Make deleting change state handler robust (Isaku, Anthony) The changes from v0.1.1 -> v0.2 are: - Introduce a queue in event-tap to make VM sync live. - Change transaction receiver to a state machine for async receiving. - Replace net/block layer functions with event-tap proxy functions. - Remove dirty bitmap optimization for now. - convert DPRINTF() in ft_trans_file to trace functions. - convert fprintf() in ft_trans_file to error_report(). - improved error handling in ft_trans_file. - add a tmp pointer to qemu_del_vm_change_state_handler. The changes from v0.1 -> v0.1.1 are: - events are tapped in net/block layer instead of device emulation layer. - Introduce a new option for -incoming to accept FT transaction. - Removed writev() support to QEMUFile and FdMigrationState for now. I would post this work in a different series. - Modified virtio-blk save/load handler to send inuse variable to correctly replay. - Removed configure --enable-ft-mode. - Removed unnecessary check for qemu_realloc(). The first 6 patches modify several functions of qemu to prepare for introducing Kemari-specific components. The next 6 patches are the components of Kemari. They introduce event-tap and the FT transaction protocol file based on buffered file. The design document of the FT transaction protocol can be found at http://wiki.qemu.org/images/b/b1/Kemari_sender_receiver_0.5a.pdf Then the following 2 patches modify net/block layer functions with event-tap functions. Please note that if Kemari is off, event-tap will just pass through, and there is almost no intrusion into existing functions, including normal live migration. Finally, the migration layer is modified to support Kemari in the last 5 patches. Again, there shouldn't be any effect if a user doesn't specify Kemari-specific options. The transaction is now async on both sender and receiver side. The sender side respects the max_downtime to decide when to switch from async to sync mode. 
The repository contains all patches I'm sending with this message. For those who want to try, please pull the following repository. It also includes the dirty bitmap optimization, which isn't ready for posting yet. To remove the dirty bitmap optimization, please look at HEAD~5 of the tree. git://kemari.git.sourceforge.net/gitroot/kemari/kemari next Thanks, Yoshi Yoshiaki Tamura (19): Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer(). Introduce read() to FdMigrationState. Introduce skip_header parameter to qemu_loadvm_state(). qemu-char: export socket_set_nodelay(). vl.c: add deleted flag for deleting the handler. virtio: decrement last_avail_idx with inuse before saving. Introduce fault tolerant VM transaction QEMUFile and ft_mode. savevm: introduce util functions to control ft_trans_file from savevm layer. Introduce event-tap. Call init handler of event-tap at main() in vl.c. ioport: insert
[PATCH 08/19] savevm: introduce util functions to control ft_trans_file from savevm layer.
To utilize ft_trans_file function, savevm needs interfaces to be exported. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |5 ++ savevm.c | 149 ++ 2 files changed, 154 insertions(+), 0 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index 7f05830..52e807c 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -51,6 +51,7 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, QEMUFile *qemu_fopen(const char *filename, const char *mode); QEMUFile *qemu_fdopen(int fd, const char *mode); QEMUFile *qemu_fopen_socket(int fd); +QEMUFile *qemu_fopen_ft_trans(int s_fd, int c_fd); QEMUFile *qemu_popen(FILE *popen_file, const char *mode); QEMUFile *qemu_popen_cmd(const char *command, const char *mode); int qemu_stdio_fd(QEMUFile *f); @@ -60,6 +61,9 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); void *qemu_realloc_buffer(QEMUFile *f, int size); void qemu_clear_buffer(QEMUFile *f); +int qemu_ft_trans_begin(QEMUFile *f); +int qemu_ft_trans_commit(QEMUFile *f); +int qemu_ft_trans_cancel(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { @@ -94,6 +98,7 @@ void qemu_file_set_error(QEMUFile *f); * halted due to rate limiting or EAGAIN errors occur as it can be used to * resume output. 
*/ void qemu_file_put_notify(QEMUFile *f); +void qemu_file_get_notify(void *opaque); static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { diff --git a/savevm.c b/savevm.c index dc15c03..5418280 100644 --- a/savevm.c +++ b/savevm.c @@ -83,6 +83,7 @@ #include migration.h #include qemu_socket.h #include qemu-queue.h +#include ft_trans_file.h #define SELF_ANNOUNCE_ROUNDS 5 @@ -190,6 +191,13 @@ typedef struct QEMUFileSocket QEMUFile *file; } QEMUFileSocket; +typedef struct QEMUFileSocketTrans +{ +int fd; +QEMUFileSocket *s; +VMChangeStateEntry *e; +} QEMUFileSocketTrans; + static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) { QEMUFileSocket *s = opaque; @@ -205,6 +213,22 @@ static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) return len; } +static ssize_t socket_put_buffer(void *opaque, const void *buf, size_t size) +{ +QEMUFileSocket *s = opaque; +ssize_t len; + +do { +len = send(s-fd, (void *)buf, size, 0); +} while (len == -1 socket_error() == EINTR); + +if (len == -1) { +len = -socket_error(); +} + +return len; +} + static int socket_close(void *opaque) { QEMUFileSocket *s = opaque; @@ -212,6 +236,70 @@ static int socket_close(void *opaque) return 0; } +static int socket_trans_get_buffer(void *opaque, uint8_t *buf, int64_t pos, size_t size) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; +ssize_t len; + +len = socket_get_buffer(s, buf, pos, size); + +return len; +} + +static ssize_t socket_trans_put_buffer(void *opaque, const void *buf, size_t size) +{ +QEMUFileSocketTrans *t = opaque; + +return socket_put_buffer(t-s, buf, size); +} + + +static int socket_trans_get_ready(void *opaque) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; +QEMUFile *f = s-file; +int ret = 0; + +ret = qemu_loadvm_state(f, 1); +if (ret 0) { +fprintf(stderr, +socket_trans_get_ready: error while loading vmstate\n); +} + +return ret; +} + +static int socket_trans_close(void *opaque) +{ 
+QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; + +qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL); +qemu_set_fd_handler2(t-fd, NULL, NULL, NULL, NULL); +qemu_del_vm_change_state_handler(t-e); +close(s-fd); +close(t-fd); +qemu_free(s); +qemu_free(t); + +return 0; +} + +static void socket_trans_resume(void *opaque, int running, int reason) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; + +if (!running) { +return; +} + +qemu_announce_self(); +qemu_fclose(s-file); +} + static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size) { QEMUFileStdio *s = opaque; @@ -334,6 +422,26 @@ QEMUFile *qemu_fopen_socket(int fd) return s-file; } +QEMUFile *qemu_fopen_ft_trans(int s_fd, int c_fd) +{ +QEMUFileSocketTrans *t = qemu_mallocz(sizeof(QEMUFileSocketTrans)); +QEMUFileSocket *s = qemu_mallocz(sizeof(QEMUFileSocket)); + +t-s = s; +t-fd = s_fd; +t-e = qemu_add_vm_change_state_handler(socket_trans_resume, t); + +s-fd = c_fd; +s-file = qemu_fopen_ops_ft_trans(t, socket_trans_put_buffer, + socket_trans_get_buffer, NULL, + socket_trans_get_ready, + migrate_fd_wait_for_unfreeze, + socket_trans_close, 0
[PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
The option looks like, -incoming protocol:address:port,ft_mode Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/migration.c b/migration.c index 1752cf4..29d4fb1 100644 --- a/migration.c +++ b/migration.c @@ -45,6 +45,12 @@ int qemu_start_incoming_migration(const char *uri) const char *p; int ret; +/* check ft_mode option */ +p = strstr(uri, ft_mode); +if (p !strcmp(p, ft_mode)) { +ft_mode = FT_INIT; +} + if (strstart(uri, tcp:, p)) ret = tcp_start_incoming_migration(p); #if !defined(WIN32) -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/19] ioport: insert event_tap_ioport() to ioport_write().
Record ioport event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- ioport.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/ioport.c b/ioport.c index aa4188a..74aebf5 100644 --- a/ioport.c +++ b/ioport.c @@ -27,6 +27,7 @@ #include ioport.h #include trace.h +#include event-tap.h /***/ /* IO Port */ @@ -76,6 +77,7 @@ static void ioport_write(int index, uint32_t address, uint32_t data) default_ioport_writel }; IOPortWriteFunc *func = ioport_write_table[index][address]; +event_tap_ioport(index, address, data); if (!func) func = default_func[index]; func(ioport_opaque[address], address, data); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/19] Call init handler of event-tap at main() in vl.c.
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/vl.c b/vl.c index 8bbb785..9faeb27 100644 --- a/vl.c +++ b/vl.c @@ -162,6 +162,7 @@ int main(int argc, char **argv) #include qemu-queue.h #include cpus.h #include arch_init.h +#include event-tap.h #include ui/qemu-spice.h @@ -2895,6 +2896,8 @@ int main(int argc, char **argv, char **envp) blk_mig_init(); +event_tap_init(); + if (default_cdrom) { /* we always create the cdrom drive, even if no disk is there */ drive_add(NULL, CDROM_ALIAS); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/19] block: insert event-tap to bdrv_aio_writev() and bdrv_aio_flush().
The event-tap function is called only when event-tap is on and the request was sent from a device emulator. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- block.c | 15 +++ 1 files changed, 15 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index ff2795b..e4df9b6 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include block_int.h #include module.h #include qemu-objects.h +#include event-tap.h #ifdef CONFIG_BSD #include sys/types.h @@ -1476,6 +1477,10 @@ int bdrv_flush(BlockDriverState *bs) } if (bs-drv bs-drv-bdrv_flush) { +if (*bs-device_name event_tap_is_on()) { +event_tap_bdrv_flush(); +} + return bs-drv-bdrv_flush(bs); } @@ -2111,6 +2116,11 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; +if (*bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); +} + if (bs-dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, opaque); @@ -2374,6 +2384,11 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, if (!drv) return NULL; + +if (*bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_flush(bs, cb, opaque); +} + return drv-bdrv_aio_flush(bs, cb, opaque); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/19] Introduce read() to FdMigrationState.
Currently FdMigrationState doesn't support read(), and this patch introduces it to get response from the other side. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 15 +++ migration.c | 13 + migration.h |3 +++ 3 files changed, 31 insertions(+), 0 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index b55f419..55777c8 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -39,6 +39,20 @@ static int socket_write(FdMigrationState *s, const void * buf, size_t size) return send(s-fd, buf, size, 0); } +static int socket_read(FdMigrationState *s, const void * buf, size_t size) +{ +ssize_t len; + +do { +len = recv(s-fd, (void *)buf, size, 0); +} while (len == -1 socket_error() == EINTR); +if (len == -1) { +len = -socket_error(); +} + +return len; +} + static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); @@ -94,6 +108,7 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, s-get_error = socket_errno; s-write = socket_write; +s-read = socket_read; s-close = tcp_close; s-mig_state.cancel = migrate_fd_cancel; s-mig_state.get_status = migrate_fd_get_status; diff --git a/migration.c b/migration.c index d593b1d..bee20f0 100644 --- a/migration.c +++ b/migration.c @@ -334,6 +334,19 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) return ret; } +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) +{ +FdMigrationState *s = opaque; +int ret; + +ret = s-read(s, data, size); +if (ret == -1) { +ret = -(s-get_error(s)); +} + +return ret; +} + void migrate_fd_connect(FdMigrationState *s) { int ret; diff --git a/migration.h b/migration.h index d13ed4f..7bf6747 100644 --- a/migration.h +++ b/migration.h @@ -47,6 +47,7 @@ struct FdMigrationState int (*get_error)(struct FdMigrationState*); int (*close)(struct FdMigrationState*); int (*write)(struct FdMigrationState*, const void *, size_t); +int (*read)(struct FdMigrationState *, const void *, size_t); void *opaque; }; @@ 
-115,6 +116,8 @@ void migrate_fd_put_notify(void *opaque); ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size); +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size); + void migrate_fd_connect(FdMigrationState *s); void migrate_fd_put_ready(void *opaque); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/19] savevm: introduce qemu_savevm_trans_{begin,commit}.
Introduce qemu_savevm_state_{begin,commit} to send the memory and device info together, while avoiding cancelling memory state tracking. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- savevm.c | 93 ++ sysemu.h |2 + 2 files changed, 95 insertions(+), 0 deletions(-) diff --git a/savevm.c b/savevm.c index 5418280..73465ed 100644 --- a/savevm.c +++ b/savevm.c @@ -1726,6 +1726,99 @@ int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f) return 0; } +int qemu_savevm_trans_begin(Monitor *mon, QEMUFile *f, int init) +{ +SaveStateEntry *se; +int skipped = 0; + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int len, stage, ret; + +if (se-save_live_state == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_START); +qemu_put_be32(f, se-section_id); + +/* ID string */ +len = strlen(se-idstr); +qemu_put_byte(f, len); +qemu_put_buffer(f, (uint8_t *)se-idstr, len); + +qemu_put_be32(f, se-instance_id); +qemu_put_be32(f, se-version_id); + +stage = init ? QEMU_VM_SECTION_START : QEMU_VM_SECTION_PART; +ret = se-save_live_state(mon, f, stage, se-opaque); +if (!ret) { +skipped++; +} +} + +if (qemu_file_has_error(f)) { +return -EIO; +} + +return skipped; +} + +int qemu_savevm_trans_complete(Monitor *mon, QEMUFile *f) +{ +SaveStateEntry *se; + +cpu_synchronize_all_states(); + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int ret; + +if (se-save_live_state == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_PART); +qemu_put_be32(f, se-section_id); + +ret = se-save_live_state(mon, f, QEMU_VM_SECTION_PART, se-opaque); +if (!ret) { +/* do not proceed to the next vmstate. 
*/ +return 1; +} +} + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int len; + +if (se-save_state == NULL se-vmsd == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_FULL); +qemu_put_be32(f, se-section_id); + +/* ID string */ +len = strlen(se-idstr); +qemu_put_byte(f, len); +qemu_put_buffer(f, (uint8_t *)se-idstr, len); + +qemu_put_be32(f, se-instance_id); +qemu_put_be32(f, se-version_id); + +vmstate_save(f, se); +} + +qemu_put_byte(f, QEMU_VM_EOF); + +if (qemu_file_has_error(f)) { +return -EIO; +} + +return 0; +} + void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f) { SaveStateEntry *se; diff --git a/sysemu.h b/sysemu.h index 329415f..ee2c382 100644 --- a/sysemu.h +++ b/sysemu.h @@ -81,6 +81,8 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); +int qemu_savevm_trans_begin(Monitor *mon, QEMUFile *f, int init); +int qemu_savevm_trans_complete(Monitor *mon, QEMUFile *f); int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 17/19] migration-tcp: modify tcp_accept_incoming_migration() to handle ft_mode, and add a hack not to close fd when ft_mode is enabled.
When ft_mode is set in the header, tcp_accept_incoming_migration() sets ft_trans_incoming() as a callback, and calls qemu_file_get_notify() to receive FT transactions iteratively. We also need a hack not to close the fd before moving to ft_transaction mode, so that we can reuse the fd for it. vm_change_state_handler is added to turn off ft_mode when cont is pressed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 67 ++- 1 files changed, 66 insertions(+), 1 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index 55777c8..84076d6 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -18,6 +18,8 @@ #include sysemu.h #include buffered_file.h #include block.h +#include ft_trans_file.h +#include event-tap.h //#define DEBUG_MIGRATION_TCP @@ -29,6 +31,8 @@ do { } while (0) #endif +static VMChangeStateEntry *vmstate; + static int socket_errno(FdMigrationState *s) { return socket_error(); @@ -56,7 +60,8 @@ static int socket_read(FdMigrationState *s, const void * buf, size_t size) static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); -if (s-fd != -1) { +/* FIX ME: accessing ft_mode here isn't clean */ +if (s-fd != -1 ft_mode != FT_INIT) { close(s-fd); s-fd = -1; } @@ -150,6 +155,36 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, return s-mig_state; } +static void ft_trans_incoming(void *opaque) +{ +QEMUFile *f = opaque; + +qemu_file_get_notify(f); +if (qemu_file_has_error(f)) { +ft_mode = FT_ERROR; +qemu_fclose(f); +} +} + +static void ft_trans_reset(void *opaque, int running, int reason) +{ +QEMUFile *f = opaque; + +if (running) { +if (ft_mode != FT_ERROR) { +qemu_fclose(f); +} +ft_mode = FT_OFF; +qemu_del_vm_change_state_handler(vmstate); +} +} + +static void ft_trans_schedule_replay(QEMUFile *f) +{ +event_tap_schedule_replay(); +vmstate = qemu_add_vm_change_state_handler(ft_trans_reset, f); +} + static void tcp_accept_incoming_migration(void *opaque) { struct sockaddr_in addr; @@ -175,8 +210,38 @@ static void
tcp_accept_incoming_migration(void *opaque) goto out; } +if (ft_mode == FT_INIT) { +autostart = 0; +} + process_incoming_migration(f); + +if (ft_mode == FT_INIT) { +int ret; + +socket_set_nodelay(c); + +f = qemu_fopen_ft_trans(s, c); +if (f == NULL) { +fprintf(stderr, could not qemu_fopen_ft_trans\n); +goto out; +} + +/* need to wait sender to setup */ +ret = qemu_ft_trans_begin(f); +if (ret 0) { +goto out; +} + +qemu_set_fd_handler2(c, NULL, ft_trans_incoming, NULL, f); +ft_trans_schedule_replay(f); +ft_mode = FT_TRANSACTION_RECV; + +return; +} + qemu_fclose(f); + out: close(c); out2: -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/19] Introduce event-tap.
event-tap controls when to start FT transaction, and provides proxy functions to called from net/block devices. While FT transaction, it queues up net/block requests, and flush them when the transaction gets completed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.target |1 + event-tap.c | 925 +++ event-tap.h | 44 +++ qemu-tool.c | 28 ++ trace-events|9 + 5 files changed, 1007 insertions(+), 0 deletions(-) create mode 100644 event-tap.c create mode 100644 event-tap.h diff --git a/Makefile.target b/Makefile.target index cd2abde..20f02d5 100644 --- a/Makefile.target +++ b/Makefile.target @@ -199,6 +199,7 @@ obj-y += rwhandler.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o LIBS+=-lz +obj-y += event-tap.o QEMU_CFLAGS += $(VNC_TLS_CFLAGS) QEMU_CFLAGS += $(VNC_SASL_CFLAGS) diff --git a/event-tap.c b/event-tap.c new file mode 100644 index 000..ca2a204 --- /dev/null +++ b/event-tap.c @@ -0,0 +1,925 @@ +/* + * Event Tap functions for QEMU + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include qemu-common.h +#include qemu-error.h +#include block.h +#include block_int.h +#include ioport.h +#include osdep.h +#include sysemu.h +#include hw/hw.h +#include net.h +#include event-tap.h +#include trace.h + +enum EVENT_TAP_STATE { +EVENT_TAP_OFF, +EVENT_TAP_ON, +EVENT_TAP_SUSPEND, +EVENT_TAP_FLUSH, +EVENT_TAP_LOAD, +EVENT_TAP_REPLAY, +}; + +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF; + +typedef struct EventTapIOport { +uint32_t address; +uint32_t data; +int index; +} EventTapIOport; + +#define MMIO_BUF_SIZE 8 + +typedef struct EventTapMMIO { +uint64_t address; +uint8_t buf[MMIO_BUF_SIZE]; +int len; +} EventTapMMIO; + +typedef struct EventTapNetReq { +char *device_name; +int iovcnt; +int vlan_id; +bool vlan_needed; +bool async; +struct iovec *iov; +NetPacketSent *sent_cb; +} EventTapNetReq; + +#define MAX_BLOCK_REQUEST 32 + +typedef struct EventTapAIOCB EventTapAIOCB; + +typedef struct EventTapBlkReq { +char *device_name; +int num_reqs; +int num_cbs; +bool is_flush; +BlockRequest reqs[MAX_BLOCK_REQUEST]; +EventTapAIOCB *acb[MAX_BLOCK_REQUEST]; +} EventTapBlkReq; + +#define EVENT_TAP_IOPORT (1 0) +#define EVENT_TAP_MMIO (1 1) +#define EVENT_TAP_NET(1 2) +#define EVENT_TAP_BLK(1 3) + +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1) + +typedef struct EventTapLog { +int mode; +union { +EventTapIOport ioport; +EventTapMMIO mmio; +}; +union { +EventTapNetReq net_req; +EventTapBlkReq blk_req; +}; +QTAILQ_ENTRY(EventTapLog) node; +} EventTapLog; + +struct EventTapAIOCB { +BlockDriverAIOCB common; +BlockDriverAIOCB *acb; +bool is_canceled; +}; + +static EventTapLog *last_event_tap; + +static QTAILQ_HEAD(, EventTapLog) event_list; +static QTAILQ_HEAD(, EventTapLog) event_pool; + +static int (*event_tap_cb)(void); +static QEMUBH *event_tap_bh; +static VMChangeStateEntry *vmstate; + +static void event_tap_bh_cb(void *p) +{ +if (event_tap_cb) { +event_tap_cb(); +} + +qemu_bh_delete(event_tap_bh); +event_tap_bh = NULL; +} + +static void 
event_tap_schedule_bh(void) +{ +trace_event_tap_ignore_bh(!!event_tap_bh); + +/* if bh is already set, we ignore it for now */ +if (event_tap_bh) { +return; +} + +event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL); +qemu_bh_schedule(event_tap_bh); + +return ; +} + +static void *event_tap_alloc_log(void) +{ +EventTapLog *log; + +if (QTAILQ_EMPTY(event_pool)) { +log = qemu_mallocz(sizeof(EventTapLog)); +} else { +log = QTAILQ_FIRST(event_pool); +QTAILQ_REMOVE(event_pool, log, node); +} + +return log; +} + +static void event_tap_free_log(EventTapLog *log) +{ +int i, mode = log-mode ~EVENT_TAP_TYPE_MASK; + +if (mode == EVENT_TAP_NET) { +EventTapNetReq *net_req = log-net_req; + +if (!net_req-async) { +for (i = 0; i net_req-iovcnt; i++) { +qemu_free(net_req-iov[i].iov_base); +} +qemu_free(net_req-iov); +} else if (event_tap_state = EVENT_TAP_LOAD) { +qemu_free(net_req-iov); +} + +qemu_free(net_req-device_name); +} else if (mode == EVENT_TAP_BLK) { +EventTapBlkReq *blk_req = log-blk_req; + +if (event_tap_state = EVENT_TAP_LOAD !blk_req-is_flush) { +for (i = 0; i blk_req-num_reqs; i++) { +qemu_iovec_destroy(blk_req-reqs[i].qiov); +qemu_free(blk_req-reqs[i].qiov
[PATCH 16/19] migration: introduce migrate_ft_trans_{put,get}_ready(), and modify migrate_fd_put_ready() when ft_mode is on.
Introduce migrate_ft_trans_put_ready() which kicks the FT transaction cycle. When ft_mode is on, migrate_fd_put_ready() would open ft_trans_file and turn on event_tap. To end or cancel FT transaction, ft_mode and event_tap is turned off. migrate_ft_trans_get_ready() is called to receive ack from the receiver. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c | 267 ++- 1 files changed, 266 insertions(+), 1 deletions(-) diff --git a/migration.c b/migration.c index cd02b7e..aa30ecd 100644 --- a/migration.c +++ b/migration.c @@ -21,6 +21,7 @@ #include qemu_socket.h #include block-migration.h #include qemu-objects.h +#include event-tap.h //#define DEBUG_MIGRATION @@ -278,6 +279,14 @@ void migrate_fd_error(FdMigrationState *s) migrate_fd_cleanup(s); } +static void migrate_ft_trans_error(FdMigrationState *s) +{ +ft_mode = FT_ERROR; +qemu_savevm_state_cancel(s-mon, s-file); +migrate_fd_error(s); +event_tap_unregister(); +} + int migrate_fd_cleanup(FdMigrationState *s) { int ret = 0; @@ -313,6 +322,17 @@ void migrate_fd_put_notify(void *opaque) qemu_file_put_notify(s-file); } +static void migrate_fd_get_notify(void *opaque) +{ +FdMigrationState *s = opaque; + +qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL); +qemu_file_get_notify(s-file); +if (qemu_file_has_error(s-file)) { +migrate_ft_trans_error(s); +} +} + ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) { FdMigrationState *s = opaque; @@ -347,6 +367,10 @@ int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) ret = -(s-get_error(s)); } +if (ret == -EAGAIN) { +qemu_set_fd_handler2(s-fd, NULL, migrate_fd_get_notify, NULL, s); +} + return ret; } @@ -373,6 +397,236 @@ void migrate_fd_connect(FdMigrationState *s) migrate_fd_put_ready(s); } +static int migrate_ft_trans_commit(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_COMMIT ft_mode != FT_TRANSACTION_ATOMIC) { +fprintf(stderr, 
+migrate_ft_trans_commit: invalid ft_mode %d\n, ft_mode); +goto out; +} + +do { +if (ft_mode == FT_TRANSACTION_ATOMIC) { +if (qemu_ft_trans_begin(s-file) 0) { +fprintf(stderr, qemu_ft_trans_begin failed\n); +goto out; +} + +ret = qemu_savevm_trans_begin(s-mon, s-file, 0); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_begin failed\n); +goto out; +} + +ft_mode = FT_TRANSACTION_COMMIT; +if (ret) { +/* don't proceed until if fd isn't ready */ +goto out; +} +} + +/* make the VM state consistent by flushing outstanding events */ +vm_stop(0); + +/* send at full speed */ +qemu_file_set_rate_limit(s-file, 0); + +ret = qemu_savevm_trans_complete(s-mon, s-file); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_complete failed\n); +goto out; +} + +if (ret) { +/* don't proceed until if fd isn't ready */ +ret = 1; +goto out; +} + +ret = qemu_ft_trans_commit(s-file); +if (ret 0) { +fprintf(stderr, qemu_ft_trans_commit failed\n); +goto out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_RECV; +ret = 1; +goto out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto out; +} + +ft_mode = ret ? 
FT_TRANSACTION_BEGIN : FT_TRANSACTION_ATOMIC; +} while (ft_mode != FT_TRANSACTION_BEGIN); + +vm_start(); +ret = 0; + +out: +return ret; +} + +static int migrate_ft_trans_get_ready(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_RECV) { +fprintf(stderr, +migrate_ft_trans_get_ready: invalid ft_mode %d\n, ft_mode); +goto error_out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto error_out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_BEGIN; +} else { +ft_mode = FT_TRANSACTION_ATOMIC; + +ret = migrate_ft_trans_commit(s); +if (ret 0) { +goto error_out; +} +if (ret) { +goto out; +} +} + +vm_start(); +ret = 0; +goto out; + +error_out: +migrate_ft_trans_error(s); + +out
[PATCH 03/19] Introduce skip_header parameter to qemu_loadvm_state().
Introduce skip_header parameter to qemu_loadvm_state() so that it can be called iteratively without reading the header. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |2 +- savevm.c| 24 +--- sysemu.h|2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/migration.c b/migration.c index bee20f0..11eff51 100644 --- a/migration.c +++ b/migration.c @@ -60,7 +60,7 @@ int qemu_start_incoming_migration(const char *uri) void process_incoming_migration(QEMUFile *f) { -if (qemu_loadvm_state(f) 0) { +if (qemu_loadvm_state(f, 0) 0) { fprintf(stderr, load of migration failed\n); exit(0); } diff --git a/savevm.c b/savevm.c index d1efdd3..dc15c03 100644 --- a/savevm.c +++ b/savevm.c @@ -1709,7 +1709,7 @@ typedef struct LoadStateEntry { int version_id; } LoadStateEntry; -int qemu_loadvm_state(QEMUFile *f) +int qemu_loadvm_state(QEMUFile *f, int skip_header) { QLIST_HEAD(, LoadStateEntry) loadvm_handlers = QLIST_HEAD_INITIALIZER(loadvm_handlers); @@ -1722,17 +1722,19 @@ int qemu_loadvm_state(QEMUFile *f) return -EINVAL; } -v = qemu_get_be32(f); -if (v != QEMU_VM_FILE_MAGIC) -return -EINVAL; +if (!skip_header) { +v = qemu_get_be32(f); +if (v != QEMU_VM_FILE_MAGIC) +return -EINVAL; -v = qemu_get_be32(f); -if (v == QEMU_VM_FILE_VERSION_COMPAT) { -fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); -return -ENOTSUP; +v = qemu_get_be32(f); +if (v == QEMU_VM_FILE_VERSION_COMPAT) { +fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); +return -ENOTSUP; +} +if (v != QEMU_VM_FILE_VERSION) +return -ENOTSUP; } -if (v != QEMU_VM_FILE_VERSION) -return -ENOTSUP; while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; @@ -2055,7 +2057,7 @@ int load_vmstate(const char *name) return -EINVAL; } -ret = qemu_loadvm_state(f); +ret = qemu_loadvm_state(f, 0); qemu_fclose(f); if (ret 0) { diff --git a/sysemu.h b/sysemu.h index 0c969f2..329415f 100644 --- a/sysemu.h +++ 
b/sysemu.h @@ -81,7 +81,7 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); -int qemu_loadvm_state(QEMUFile *f); +int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ void do_info_slirp(Monitor *mon); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/19] Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer().
Currently buf size is fixed at 32KB. It would be useful if it could be flexible. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |2 ++ savevm.c | 20 +++- 2 files changed, 21 insertions(+), 1 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index dd993de..7f05830 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -58,6 +58,8 @@ void qemu_fflush(QEMUFile *f); int qemu_fclose(QEMUFile *f); void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); +void *qemu_realloc_buffer(QEMUFile *f, int size); +void qemu_clear_buffer(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { diff --git a/savevm.c b/savevm.c index fcd8db4..d1efdd3 100644 --- a/savevm.c +++ b/savevm.c @@ -172,7 +172,8 @@ struct QEMUFile { when reading */ int buf_index; int buf_size; /* 0 when writing */ -uint8_t buf[IO_BUF_SIZE]; +int buf_max_size; +uint8_t *buf; int has_error; }; @@ -423,6 +424,9 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, f-get_rate_limit = get_rate_limit; f-is_write = 0; +f-buf_max_size = IO_BUF_SIZE; +f-buf = qemu_malloc(sizeof(uint8_t) * f-buf_max_size); + return f; } @@ -453,6 +457,19 @@ void qemu_fflush(QEMUFile *f) } } +void *qemu_realloc_buffer(QEMUFile *f, int size) +{ +f-buf_max_size = size; +f-buf = qemu_realloc(f-buf, f-buf_max_size); + +return f-buf; +} + +void qemu_clear_buffer(QEMUFile *f) +{ +f-buf_size = f-buf_index = f-buf_offset = 0; +} + static void qemu_fill_buffer(QEMUFile *f) { int len; @@ -478,6 +495,7 @@ int qemu_fclose(QEMUFile *f) qemu_fflush(f); if (f-close) ret = f-close(f-opaque); +qemu_free(f-buf); qemu_free(f); return ret; } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/19] virtio: decrement last_avail_idx with inuse before saving.
For regular migration inuse == 0 always as requests are flushed before save. However, event-tap log when enabled introduces an extra queue for requests which is not being flushed, thus the last inuse requests are left in the event-tap queue. Move the last_avail_idx value sent to the remote back to make it repeat the last inuse requests. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/virtio.c | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 31bd9e3..f05d1b6 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -673,12 +673,20 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f) qemu_put_be32(f, i); for (i = 0; i VIRTIO_PCI_QUEUE_MAX; i++) { +/* For regular migration inuse == 0 always as + * requests are flushed before save. However, + * event-tap log when enabled introduces an extra + * queue for requests which is not being flushed, + * thus the last inuse requests are left in the event-tap queue. + * Move the last_avail_idx value sent to the remote back + * to make it repeat the last inuse requests. */ +uint16_t last_avail = vdev-vq[i].last_avail_idx - vdev-vq[i].inuse; if (vdev-vq[i].vring.num == 0) break; qemu_put_be32(f, vdev-vq[i].vring.num); qemu_put_be64(f, vdev-vq[i].pa); -qemu_put_be16s(f, vdev-vq[i].last_avail_idx); +qemu_put_be16s(f, last_avail); if (vdev-binding-save_queue) vdev-binding-save_queue(vdev-binding_opaque, i, f); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/19] vl.c: add deleted flag for deleting the handler.
Make deleting handlers robust against deletion of any elements in a handler by using a deleted flag like in file descriptors. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c | 13 + 1 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vl.c b/vl.c index 0292184..8bbb785 100644 --- a/vl.c +++ b/vl.c @@ -1140,6 +1140,7 @@ static void nographic_update(void *opaque) struct vm_change_state_entry { VMChangeStateHandler *cb; void *opaque; +int deleted; QLIST_ENTRY (vm_change_state_entry) entries; }; @@ -1160,8 +1161,7 @@ VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, void qemu_del_vm_change_state_handler(VMChangeStateEntry *e) { -QLIST_REMOVE (e, entries); -qemu_free (e); +e-deleted = 1; } void vm_state_notify(int running, int reason) @@ -1170,8 +1170,13 @@ void vm_state_notify(int running, int reason) trace_vm_state_notify(running, reason); -for (e = vm_change_state_head.lh_first; e; e = e-entries.le_next) { -e-cb(e-opaque, running, reason); +QLIST_FOREACH(e, vm_change_state_head, entries) { +if (e-deleted) { +QLIST_REMOVE(e, entries); +qemu_free(e); +} else { +e-cb(e-opaque, running, reason); +} } } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/19] qemu-char: export socket_set_nodelay().
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- qemu-char.c |2 +- qemu_socket.h |1 + 2 files changed, 2 insertions(+), 1 deletions(-) diff --git a/qemu-char.c b/qemu-char.c index edc9ad6..737d347 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -2116,7 +2116,7 @@ static void tcp_chr_telnet_init(int fd) send(fd, (char *)buf, 3, 0); } -static void socket_set_nodelay(int fd) +void socket_set_nodelay(int fd) { int val = 1; setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)&val, sizeof(val)); diff --git a/qemu_socket.h b/qemu_socket.h index 897a8ae..b7f8465 100644 --- a/qemu_socket.h +++ b/qemu_socket.h @@ -36,6 +36,7 @@ int inet_aton(const char *cp, struct in_addr *ia); int qemu_socket(int domain, int type, int protocol); int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen); void socket_set_nonblock(int fd); +void socket_set_nodelay(int fd); int send_all(int fd, const void *buf, int len1); /* New, ipv6-ready socket helper functions, see qemu-sockets.c */ -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/19] Introduce fault tolerant VM transaction QEMUFile and ft_mode.
This code implements VM transaction protocol. Like buffered_file, it sits between savevm and migration layer. With this architecture, VM transaction protocol is implemented mostly independent from other existing code. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.objs |1 + ft_trans_file.c | 624 +++ ft_trans_file.h | 72 +++ migration.c |3 + trace-events| 16 ++ 5 files changed, 716 insertions(+), 0 deletions(-) create mode 100644 ft_trans_file.c create mode 100644 ft_trans_file.h diff --git a/Makefile.objs b/Makefile.objs index fda366d..1f10fbc 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -100,6 +100,7 @@ common-obj-y += msmouse.o ps2.o common-obj-y += qdev.o qdev-properties.o common-obj-y += block-migration.o common-obj-y += pflib.o +common-obj-y += ft_trans_file.o common-obj-$(CONFIG_BRLAPI) += baum.o common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o diff --git a/ft_trans_file.c b/ft_trans_file.c new file mode 100644 index 000..2b42b95 --- /dev/null +++ b/ft_trans_file.c @@ -0,0 +1,624 @@ +/* + * Fault tolerant VM transaction QEMUFile + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This source code is based on buffered_file.c. + * Copyright IBM, Corp. 
2008 + * Authors: + * Anthony Liguorialigu...@us.ibm.com + */ + +#include qemu-common.h +#include qemu-error.h +#include hw/hw.h +#include qemu-timer.h +#include sysemu.h +#include qemu-char.h +#include trace.h +#include ft_trans_file.h + +typedef struct FtTransHdr +{ +uint16_t cmd; +uint16_t id; +uint32_t seq; +uint32_t payload_len; +} FtTransHdr; + +typedef struct QEMUFileFtTrans +{ +FtTransPutBufferFunc *put_buffer; +FtTransGetBufferFunc *get_buffer; +FtTransPutReadyFunc *put_ready; +FtTransGetReadyFunc *get_ready; +FtTransWaitForUnfreezeFunc *wait_for_unfreeze; +FtTransCloseFunc *close; +void *opaque; +QEMUFile *file; + +enum QEMU_VM_TRANSACTION_STATE state; +uint32_t seq; +uint16_t id; + +int has_error; + +bool freeze_output; +bool freeze_input; +bool rate_limit; +bool is_sender; +bool is_payload; + +uint8_t *buf; +size_t buf_max_size; +size_t put_offset; +size_t get_offset; + +FtTransHdr header; +size_t header_offset; +} QEMUFileFtTrans; + +#define IO_BUF_SIZE 32768 + +static void ft_trans_append(QEMUFileFtTrans *s, +const uint8_t *buf, size_t size) +{ +if (size (s-buf_max_size - s-put_offset)) { +trace_ft_trans_realloc(s-buf_max_size, size + 1024); +s-buf_max_size += size + 1024; +s-buf = qemu_realloc(s-buf, s-buf_max_size); +} + +trace_ft_trans_append(size); +memcpy(s-buf + s-put_offset, buf, size); +s-put_offset += size; +} + +static void ft_trans_flush(QEMUFileFtTrans *s) +{ +size_t offset = 0; + +if (s-has_error) { +error_report(flush when error %d, bailing, s-has_error); +return; +} + +while (offset s-put_offset) { +ssize_t ret; + +ret = s-put_buffer(s-opaque, s-buf + offset, s-put_offset - offset); +if (ret == -EAGAIN) { +break; +} + +if (ret = 0) { +error_report(error flushing data, %s, strerror(errno)); +s-has_error = FT_TRANS_ERR_FLUSH; +break; +} else { +offset += ret; +} +} + +trace_ft_trans_flush(offset, s-put_offset); +memmove(s-buf, s-buf + offset, s-put_offset - offset); +s-put_offset -= offset; +s-freeze_output = !!s-put_offset; +} + +static 
ssize_t ft_trans_put(void *opaque, void *buf, int size) +{ +QEMUFileFtTrans *s = opaque; +size_t offset = 0; +ssize_t len; + +/* flush buffered data before putting next */ +if (s-put_offset) { +ft_trans_flush(s); +} + +while (!s-freeze_output offset size) { +len = s-put_buffer(s-opaque, (uint8_t *)buf + offset, size - offset); + +if (len == -EAGAIN) { +trace_ft_trans_freeze_output(); +s-freeze_output = 1; +break; +} + +if (len = 0) { +error_report(putting data failed, %s, strerror(errno)); +s-has_error = 1; +offset = -EINVAL; +break; +} + +offset += len; +} + +if (s-freeze_output) { +ft_trans_append(s, buf + offset, size - offset); +offset = size; +} + +return offset; +} + +static int ft_trans_send_header(QEMUFileFtTrans *s, +enum QEMU_VM_TRANSACTION_STATE state, +uint32_t payload_len) +{ +int ret; +FtTransHdr
Re: [Qemu-devel] [PATCH 09/19] Introduce event-tap.
2011/1/20 Kevin Wolf kw...@redhat.com: Am 20.01.2011 06:19, schrieb Yoshiaki Tamura: + return; + } + + bdrv_aio_writev(bs, blk_req-reqs[0].sector, blk_req-reqs[0].qiov, + blk_req-reqs[0].nb_sectors, blk_req-reqs[0].cb, + blk_req-reqs[0].opaque); Same here. + bdrv_flush(bs); This looks really strange. What is this supposed to do? One point is that you write it immediately after bdrv_aio_write, so you get an fsync for which you don't know if it includes the current write request or if it doesn't. Which data do you want to get flushed to the disk? I was expecting to flush the aio request that was just initiated. Am I misunderstanding the function? Seems so. The function names don't use really clear terminology either, so you're not the first one to fall in this trap. Basically we have: * qemu_aio_flush() waits for all AIO requests to complete. I think you wanted to have exactly this, but only for a single block device. Such a function doesn't exist yet. * bdrv_flush() makes sure that all successfully completed requests are written to disk (by calling fsync) * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run the fsync in the thread pool Then what I wanted to do is, call qemu_aio_flush first, then bdrv_flush. It should be like live migration. Okay, that makes sense. :-) The other thing is that you introduce a bdrv_flush for each request, basically forcing everyone to something very similar to writethrough mode. I'm sure this will have a big impact on performance. The reason is to avoid inversion of queued requests. Although processing one-by-one is heavy, wouldn't having requests flushed to disk out of order break the disk image? No, that's fine. If a guest issues two requests at the same time, they may complete in any order. You just need to make sure that you don't call the completion callback before the request really has completed. 
We need to flush requests, meaning aio and fsync, before sending the final state of the guests, to make sure we can switch to the secondary safely. In theory I think you could just re-submit the requests on the secondary if they had not completed yet. But you're right, let's keep things simple for the start. I'm just starting to wonder if the guest won't timeout the requests if they are queued for too long. Even more, with IDE, it can only handle one request at a time, so not completing requests doesn't sound like a good idea at all. In what intervals is the event-tap queue flushed? The requests are flushed once each transaction completes. So it's not with specific intervals. Right. So when is a transaction completed? This is the time that a single request will take. The transaction is completed when the vm state is sent to the secondary, and the primary receives the ack to it. Please let me know if the answer is too vague. What I can tell is that it can't be super fast. On the other hand, if you complete before actually writing out, you don't get timeouts, but you signal success to the guest when the request could still fail. What would you do in this case? With a writeback cache mode we're fine, we can just fail the next flush (until then nothing is guaranteed to be on disk and order doesn't matter either), but with cache=writethrough we're in serious trouble. Have you thought about this problem? Maybe we end up having to flush the event-tap queue for each single write in writethrough mode. Yes, and that's what I'm trying to do at this point. Oh, I must have missed that code. Which patch/function should I look at? Maybe I miss-answered to your question. The device may receive timeouts. If timeouts didn't happen, the requests are flushed one-by-one in writethrough because we're calling qemu_aio_flush and bdrv_flush together. Yoshi I know that performance matters a lot, but sacrificing reliability over performance now isn't a good idea. 
I first want to lay the ground, and then focus on optimization. Note that without dirty bitmap optimization, Kemari suffers a lot in sending rams. Anthony and I discussed to take this approach at KVM Forum. I agree, starting simple makes sense. Kevin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 09/19] Introduce event-tap.
2011/1/20 Kevin Wolf kw...@redhat.com: Am 20.01.2011 11:39, schrieb Yoshiaki Tamura: 2011/1/20 Kevin Wolf kw...@redhat.com: Am 20.01.2011 06:19, schrieb Yoshiaki Tamura: + return; + } + + bdrv_aio_writev(bs, blk_req-reqs[0].sector, blk_req-reqs[0].qiov, + blk_req-reqs[0].nb_sectors, blk_req-reqs[0].cb, + blk_req-reqs[0].opaque); Same here. + bdrv_flush(bs); This looks really strange. What is this supposed to do? One point is that you write it immediately after bdrv_aio_write, so you get an fsync for which you don't know if it includes the current write request or if it doesn't. Which data do you want to get flushed to the disk? I was expecting to flush the aio request that was just initiated. Am I misunderstanding the function? Seems so. The function names don't use really clear terminology either, so you're not the first one to fall in this trap. Basically we have: * qemu_aio_flush() waits for all AIO requests to complete. I think you wanted to have exactly this, but only for a single block device. Such a function doesn't exist yet. * bdrv_flush() makes sure that all successfully completed requests are written to disk (by calling fsync) * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run the fsync in the thread pool Then what I wanted to do is, call qemu_aio_flush first, then bdrv_flush. It should be like live migration. Okay, that makes sense. :-) The other thing is that you introduce a bdrv_flush for each request, basically forcing everyone to something very similar to writethrough mode. I'm sure this will have a big impact on performance. The reason is to avoid inversion of queued requests. Although processing one-by-one is heavy, wouldn't having requests flushed to disk out of order break the disk image? No, that's fine. If a guest issues two requests at the same time, they may complete in any order. You just need to make sure that you don't call the completion callback before the request really has completed. 
We need to flush requests, meaning aio and fsync, before sending the final state of the guests, to make sure we can switch to the secondary safely. In theory I think you could just re-submit the requests on the secondary if they had not completed yet. But you're right, let's keep things simple for the start. I'm just starting to wonder if the guest won't timeout the requests if they are queued for too long. Even more, with IDE, it can only handle one request at a time, so not completing requests doesn't sound like a good idea at all. In what intervals is the event-tap queue flushed? The requests are flushed once each transaction completes. So it's not with specific intervals. Right. So when is a transaction completed? This is the time that a single request will take. The transaction is completed when the vm state is sent to the secondary, and the primary receives the ack to it. Please let me know if the answer is too vague. What I can tell is that it can't be super fast. On the other hand, if you complete before actually writing out, you don't get timeouts, but you signal success to the guest when the request could still fail. What would you do in this case? With a writeback cache mode we're fine, we can just fail the next flush (until then nothing is guaranteed to be on disk and order doesn't matter either), but with cache=writethrough we're in serious trouble. Have you thought about this problem? Maybe we end up having to flush the event-tap queue for each single write in writethrough mode. Yes, and that's what I'm trying to do at this point. Oh, I must have missed that code. Which patch/function should I look at? Maybe I miss-answered to your question. The device may receive timeouts. We should pay attention that the guest does not see timeouts. I'm not expecting that I/O will be super fast, and as long as it is only a performance problem we can live with it. 
However, as soon as the guest gets timeouts it reports I/O errors and eventually offlines the block device. At this point it's not a performance problem any more, but also a correctness problem. This is why I suggested that we flush the event-tap queue (i.e. complete the transaction) immediately after an I/O request has been issued instead of waiting for other events that would complete the transaction. Right. event-tap doesn't queue at specific interval. It'll schedule the transaction as bh once events are tapped . The purpose of the queue is store requests initiated while the transaction. So I believe current implementation should be doing what you're expecting. However, if the guest dirtied huge amount of ram and initiated block requests, we may get timeouts even we started transaction right away. Yoshi If timeouts didn't happen, the requests are flushed one-by-one in writethrough because we're calling qemu_aio_flush and bdrv_flush
Re: [Qemu-devel] [PATCH 09/19] Introduce event-tap.
2011/1/20 Kevin Wolf kw...@redhat.com: Am 20.01.2011 14:50, schrieb Yoshiaki Tamura: 2011/1/20 Kevin Wolf kw...@redhat.com: Am 20.01.2011 11:39, schrieb Yoshiaki Tamura: 2011/1/20 Kevin Wolf kw...@redhat.com: Am 20.01.2011 06:19, schrieb Yoshiaki Tamura: + return; + } + + bdrv_aio_writev(bs, blk_req-reqs[0].sector, blk_req-reqs[0].qiov, + blk_req-reqs[0].nb_sectors, blk_req-reqs[0].cb, + blk_req-reqs[0].opaque); Same here. + bdrv_flush(bs); This looks really strange. What is this supposed to do? One point is that you write it immediately after bdrv_aio_write, so you get an fsync for which you don't know if it includes the current write request or if it doesn't. Which data do you want to get flushed to the disk? I was expecting to flush the aio request that was just initiated. Am I misunderstanding the function? Seems so. The function names don't use really clear terminology either, so you're not the first one to fall in this trap. Basically we have: * qemu_aio_flush() waits for all AIO requests to complete. I think you wanted to have exactly this, but only for a single block device. Such a function doesn't exist yet. * bdrv_flush() makes sure that all successfully completed requests are written to disk (by calling fsync) * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run the fsync in the thread pool Then what I wanted to do is, call qemu_aio_flush first, then bdrv_flush. It should be like live migration. Okay, that makes sense. :-) The other thing is that you introduce a bdrv_flush for each request, basically forcing everyone to something very similar to writethrough mode. I'm sure this will have a big impact on performance. The reason is to avoid inversion of queued requests. Although processing one-by-one is heavy, wouldn't having requests flushed to disk out of order break the disk image? No, that's fine. If a guest issues two requests at the same time, they may complete in any order. 
You just need to make sure that you don't call the completion callback before the request really has completed. We need to flush requests, meaning aio and fsync, before sending the final state of the guests, to make sure we can switch to the secondary safely. In theory I think you could just re-submit the requests on the secondary if they had not completed yet. But you're right, let's keep things simple for the start. I'm just starting to wonder if the guest won't timeout the requests if they are queued for too long. Even more, with IDE, it can only handle one request at a time, so not completing requests doesn't sound like a good idea at all. In what intervals is the event-tap queue flushed? The requests are flushed once each transaction completes. So it's not with specific intervals. Right. So when is a transaction completed? This is the time that a single request will take. The transaction is completed when the vm state is sent to the secondary, and the primary receives the ack to it. Please let me know if the answer is too vague. What I can tell is that it can't be super fast. On the other hand, if you complete before actually writing out, you don't get timeouts, but you signal success to the guest when the request could still fail. What would you do in this case? With a writeback cache mode we're fine, we can just fail the next flush (until then nothing is guaranteed to be on disk and order doesn't matter either), but with cache=writethrough we're in serious trouble. Have you thought about this problem? Maybe we end up having to flush the event-tap queue for each single write in writethrough mode. Yes, and that's what I'm trying to do at this point. Oh, I must have missed that code. Which patch/function should I look at? Maybe I miss-answered to your question. The device may receive timeouts. We should pay attention that the guest does not see timeouts. 
I'm not expecting that I/O will be super fast, and as long as it is only a performance problem we can live with it. However, as soon as the guest gets timeouts it reports I/O errors and eventually offlines the block device. At this point it's not a performance problem any more, but also a correctness problem. This is why I suggested that we flush the event-tap queue (i.e. complete the transaction) immediately after an I/O request has been issued instead of waiting for other events that would complete the transaction. Right. event-tap doesn't queue at specific interval. It'll schedule the transaction as bh once events are tapped . The purpose of the queue is store requests initiated while the transaction. Ok, now I got it. :-) So the patches are already doing the best we can do. So I believe current implementation should be doing what you're expecting. However, if the guest dirtied huge amount of ram and initiated block requests, we may get timeouts even we
Re: [Qemu-devel] [PATCH 14/19] block: insert event-tap to bdrv_aio_writev() and bdrv_aio_flush().
2011/1/19 Kevin Wolf kw...@redhat.com: Am 19.01.2011 06:44, schrieb Yoshiaki Tamura: event-tap function is called only when it is on, and requests sent from device emulators. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- block.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index ff2795b..85bd8b8 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include "block_int.h" #include "module.h" #include "qemu-objects.h" +#include "event-tap.h" #ifdef CONFIG_BSD #include <sys/types.h> @@ -2111,6 +2112,11 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; + if (bs->device_name && event_tap_is_on()) { bs->device_name is a pointer to a char array contained in bs, so it's never NULL. You probably mean *bs->device_name? Yes, thanks for pointing that out. The bug wasn't exposed because event_tap_is_on() was false upon flushing after synchronization. Yoshi Kevin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 09/19] Introduce event-tap.
2011/1/19 Kevin Wolf kw...@redhat.com: Am 19.01.2011 06:44, schrieb Yoshiaki Tamura: event-tap controls when to start FT transaction, and provides proxy functions to called from net/block devices. While FT transaction, it queues up net/block requests, and flush them when the transaction gets completed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp One general comment: On the first glance this seems to mix block and net (and some other things) arbitrarily instead of having a section for handling all block stuff, then network, etc. Is there a specific reason for the order in which you put the functions? If not, maybe reordering them might improve readability. Thanks. I'll rework on that. --- Makefile.target | 1 + event-tap.c | 847 +++ event-tap.h | 42 +++ qemu-tool.c | 24 ++ trace-events | 9 + 5 files changed, 923 insertions(+), 0 deletions(-) create mode 100644 event-tap.c create mode 100644 event-tap.h diff --git a/Makefile.target b/Makefile.target index e15b1c4..f36cd75 100644 --- a/Makefile.target +++ b/Makefile.target @@ -199,6 +199,7 @@ obj-y += rwhandler.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o LIBS+=-lz +obj-y += event-tap.o QEMU_CFLAGS += $(VNC_TLS_CFLAGS) QEMU_CFLAGS += $(VNC_SASL_CFLAGS) diff --git a/event-tap.c b/event-tap.c new file mode 100644 index 000..f492708 --- /dev/null +++ b/event-tap.c @@ -0,0 +1,847 @@ +/* + * Event Tap functions for QEMU + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include qemu-common.h +#include qemu-error.h +#include block.h +#include block_int.h +#include ioport.h +#include osdep.h +#include sysemu.h +#include hw/hw.h +#include net.h +#include event-tap.h +#include trace.h + +enum EVENT_TAP_STATE { + EVENT_TAP_OFF, + EVENT_TAP_ON, + EVENT_TAP_FLUSH, + EVENT_TAP_LOAD, + EVENT_TAP_REPLAY, +}; + +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF; +static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */ Indeed, bdrv_aio_cancel will segfault this way. If you use dummies instead of real ACBs the only way to correctly implement bdrv_aio_cancel is waiting for all in-flight AIOs (qemu_aio_flush). So I need to insert a new event_tap function to bdrv_aio_cancel to do that. +typedef struct EventTapIOport { + uint32_t address; + uint32_t data; + int index; +} EventTapIOport; + +#define MMIO_BUF_SIZE 8 + +typedef struct EventTapMMIO { + uint64_t address; + uint8_t buf[MMIO_BUF_SIZE]; + int len; +} EventTapMMIO; + +typedef struct EventTapNetReq { + char *device_name; + int iovcnt; + struct iovec *iov; + int vlan_id; + bool vlan_needed; + bool async; + NetPacketSent *sent_cb; +} EventTapNetReq; + +#define MAX_BLOCK_REQUEST 32 + +typedef struct EventTapBlkReq { + char *device_name; + int num_reqs; + int num_cbs; + bool is_flush; + BlockRequest reqs[MAX_BLOCK_REQUEST]; + BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST]; + void *opaque[MAX_BLOCK_REQUEST]; +} EventTapBlkReq; + +#define EVENT_TAP_IOPORT (1 0) +#define EVENT_TAP_MMIO (1 1) +#define EVENT_TAP_NET (1 2) +#define EVENT_TAP_BLK (1 3) + +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1) + +typedef struct EventTapLog { + int mode; + union { + EventTapIOport ioport; + EventTapMMIO mmio; + }; + union { + EventTapNetReq net_req; + EventTapBlkReq blk_req; + }; + QTAILQ_ENTRY(EventTapLog) node; +} EventTapLog; + +static EventTapLog *last_event_tap; + +static QTAILQ_HEAD(, EventTapLog) event_list; +static QTAILQ_HEAD(, EventTapLog) event_pool; + +static int 
(*event_tap_cb)(void); +static QEMUBH *event_tap_bh; +static VMChangeStateEntry *vmstate; + +static void event_tap_bh_cb(void *p) +{ + if (event_tap_cb) { + event_tap_cb(); + } + + qemu_bh_delete(event_tap_bh); + event_tap_bh = NULL; +} + +static void event_tap_schedule_bh(void) +{ + trace_event_tap_ignore_bh(!!event_tap_bh); + + /* if bh is already set, we ignore it for now */ + if (event_tap_bh) { + return; + } + + event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL); + qemu_bh_schedule(event_tap_bh); + + return ; +} + +static void event_tap_alloc_net_req(EventTapNetReq *net_req, + VLANClientState *vc, + const struct iovec *iov, int iovcnt, + NetPacketSent *sent_cb, bool async) +{ + int i; + + net_req-iovcnt = iovcnt
Re: [Qemu-devel] [PATCH 14/19] block: insert event-tap to bdrv_aio_writev() and bdrv_aio_flush().
2011/1/19 Kevin Wolf kw...@redhat.com: Am 19.01.2011 06:44, schrieb Yoshiaki Tamura: event-tap function is called only when it is on, and requests sent from device emulators. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- block.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index ff2795b..85bd8b8 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include block_int.h #include module.h #include qemu-objects.h +#include event-tap.h #ifdef CONFIG_BSD #include sys/types.h @@ -2111,6 +2112,11 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; + if (bs-device_name event_tap_is_on()) { + return event_tap_bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); + } + if (bs-dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, opaque); Just noticed the context here... Does this patch break block migration when event-tap is on? I don't think so. event-tap will call bdrv_aio_writev() upon flushing requests and it shouldn't affect block-migration. The block written after the synchronization should be marked as dirty and should be sent in the next round. Am I missing the point? Another question that came to my mind is if we really hook everything we need. I think we'll need to have a hook in bdrv_flush as well. I don't know if you do hook qemu_aio_flush and friends - does a call cause event-tap to flush its queue? If not, a call to qemu_aio_flush might hang qemu because it's waiting for requests to complete which are actually stuck in the event-tap queue. No it doesn't queue at event-tap. Marcelo pointed that we should hook bdrv_aio_flush to avoid requests inversion, that made sense to me. Do we need to hook bdrv_flush for that same reason? 
If we hook bdrv_flush and qemu_aio_flush, we're going to loop forever because the synchronization code calls vm_stop, which calls bdrv_flush_all and qemu_aio_flush. Yoshi Kevin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH 09/19] Introduce event-tap.
2011/1/19 Kevin Wolf kw...@redhat.com: Am 19.01.2011 14:04, schrieb Yoshiaki Tamura: +static void event_tap_blk_flush(EventTapBlkReq *blk_req) +{ + BlockDriverState *bs; + + bs = bdrv_find(blk_req-device_name); Please store the BlockDriverState in blk_req. This code loops over all block devices and does a string comparison - and that for each request. You can also save the qemu_strdup() when creating the request. In the few places where you really need the device name (might be the case for load/save, I'm not sure), you can still get it from the BlockDriverState. I would do so for the primary side. Although we haven't implemented yet, we want to replay block requests from block layer on the secondary side, and need device name to restore BlockDriverState. Hm, I see. I'm not happy about it, but I don't have a suggestion right away how to avoid it. + + if (blk_req-is_flush) { + bdrv_aio_flush(bs, blk_req-reqs[0].cb, blk_req-reqs[0].opaque); You need to handle errors. If bdrv_aio_flush returns NULL, call the callback with -EIO. I'll do so. + return; + } + + bdrv_aio_writev(bs, blk_req-reqs[0].sector, blk_req-reqs[0].qiov, + blk_req-reqs[0].nb_sectors, blk_req-reqs[0].cb, + blk_req-reqs[0].opaque); Same here. + bdrv_flush(bs); This looks really strange. What is this supposed to do? One point is that you write it immediately after bdrv_aio_write, so you get an fsync for which you don't know if it includes the current write request or if it doesn't. Which data do you want to get flushed to the disk? I was expecting to flush the aio request that was just initiated. Am I misunderstanding the function? Seems so. The function names don't use really clear terminology either, so you're not the first one to fall in this trap. Basically we have: * qemu_aio_flush() waits for all AIO requests to complete. I think you wanted to have exactly this, but only for a single block device. Such a function doesn't exist yet. 
* bdrv_flush() makes sure that all successfully completed requests are written to disk (by calling fsync) * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run the fsync in the thread pool Then what I wanted to do is, call qemu_aio_flush first, then bdrv_flush. It should be like live migration. The other thing is that you introduce a bdrv_flush for each request, basically forcing everyone to something very similar to writethrough mode. I'm sure this will have a big impact on performance. The reason is to avoid inversion of queued requests. Although processing one-by-one is heavy, wouldn't having requests flushed to disk out of order break the disk image? No, that's fine. If a guest issues two requests at the same time, they may complete in any order. You just need to make sure that you don't call the completion callback before the request really has completed. We need to flush requests, meaning aio and fsync, before sending the final state of the guests, to make sure we can switch to the secondary safely. I'm just starting to wonder if the guest won't timeout the requests if they are queued for too long. Even more, with IDE, it can only handle one request at a time, so not completing requests doesn't sound like a good idea at all. In what intervals is the event-tap queue flushed? The requests are flushed once each transaction completes. So it's not with specific intervals. On the other hand, if you complete before actually writing out, you don't get timeouts, but you signal success to the guest when the request could still fail. What would you do in this case? With a writeback cache mode we're fine, we can just fail the next flush (until then nothing is guaranteed to be on disk and order doesn't matter either), but with cache=writethrough we're in serious trouble. Have you thought about this problem? Maybe we end up having to flush the event-tap queue for each single write in writethrough mode. Yes, and that's what I'm trying to do at this point. 
I know that performance matters a lot, but sacrificing reliability over performance now isn't a good idea. I first want to lay the ground, and then focus on optimization. Note that without dirty bitmap optimization, Kemari suffers a lot in sending rams. Anthony and I discussed to take this approach at KVM Forum. Additionally, error handling is missing. I looked at the codes using bdrv_flush and realized some of them doesn't handle errors, but scsi-disk.c does. Should everyone handle errors or depends on the usage? I added the return code only recently, it was a void function previously. Probably some error handling should be added to all of them. Ah:) Glad to hear that. Yoshi Kevin -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo
[PATCH 00/19] Kemari for KVM v0.2.6
Hi, This patch series is a revised version of Kemari for KVM, which incorporates the comments on the previous post. The current code is based on qemu.git d03d11260ee2d55579e8b76116e35ccdf5031833. The changes from v0.2.5 -> v0.2.6 are: - use qemu_{put,get}_be32() to save/load niov in event-tap The changes from v0.2.4 -> v0.2.5 are: - fixed braces and trailing spaces by using Blue's checkpatch.pl (Blue) - event-tap: don't try to send blk_req if it's a bdrv_aio_flush event The changes from v0.2.3 -> v0.2.4 are: - call vm_start() before event_tap_flush_one() to avoid failure in virtio-net assertion - add vm_change_state_handler to turn off ft_mode - use qemu_iovec functions in event-tap - remove duplicated code in migration - remove unnecessary new line for error_report in ft_trans_file The changes from v0.2.2 -> v0.2.3 are: - queue async net requests without copying (MST) -- if not async, contents of the packets are sent to the secondary - better description for option -k (MST) - fix memory transfer failure - fix ft transaction initiation failure The changes from v0.2.1 -> v0.2.2 are: - decrement last_avail_idx with inuse before saving (MST) - remove qemu_aio_flush() and bdrv_flush_all() in migrate_ft_trans_commit() The changes from v0.2 -> v0.2.1 are: - Move event-tap to net/block layer and use stubs (Blue, Paul, MST, Kevin) - Tap bdrv_aio_flush (Marcelo) - Remove multiwrite interface in event-tap (Stefan) - Fix event-tap to use pio/mmio to replay both net/block (Stefan) - Improve error handling in event-tap (Stefan) - Fix leak in event-tap (Stefan) - Revise virtio last_avail_idx manipulation (MST) - Clean up migration.c hook (Marcelo) - Make deleting change state handler robust (Isaku, Anthony) The changes from v0.1.1 -> v0.2 are: - Introduce a queue in event-tap to make VM sync live. - Change transaction receiver to a state machine for async receiving. - Replace net/block layer functions with event-tap proxy functions. - Remove dirty bitmap optimization for now.
- convert DPRINTF() in ft_trans_file to trace functions. - convert fprintf() in ft_trans_file to error_report(). - improved error handling in ft_trans_file. - add a tmp pointer to qemu_del_vm_change_state_handler. The changes from v0.1 -> v0.1.1 are: - events are tapped in net/block layer instead of device emulation layer. - Introduce a new option for -incoming to accept FT transaction. - Removed writev() support to QEMUFile and FdMigrationState for now. I would post this work in a different series. - Modified virtio-blk save/load handler to send inuse variable to correctly replay. - Removed configure --enable-ft-mode. - Removed unnecessary check for qemu_realloc(). The first 6 patches modify several functions of qemu to prepare for introducing Kemari specific components. The next 6 patches are the components of Kemari. They introduce event-tap and the FT transaction protocol file based on buffered file. The design document of FT transaction protocol can be found at, http://wiki.qemu.org/images/b/b1/Kemari_sender_receiver_0.5a.pdf Then the following 2 patches modify net/block layer functions with event-tap functions. Please note that if Kemari is off, event-tap will just pass through, and there is almost no intrusion into existing functions including normal live migration. Finally, the migration layer is modified to support Kemari in the last 5 patches. Again, there shouldn't be any effect if a user doesn't specify Kemari specific options. The transaction is now async on both sender and receiver side. The sender side respects the max_downtime to decide when to switch from async to sync mode. The repository contains all patches I'm sending with this message. For those who want to try, please pull the following repository. It also includes dirty bitmap optimization which isn't ready for posting yet. To remove the dirty bitmap optimization, please look at HEAD~5 of the tree.
git://kemari.git.sourceforge.net/gitroot/kemari/kemari next Thanks, Yoshi Yoshiaki Tamura (19): Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer(). Introduce read() to FdMigrationState. Introduce skip_header parameter to qemu_loadvm_state(). qemu-char: export socket_set_nodelay(). vl.c: add deleted flag for deleting the handler. virtio: decrement last_avail_idx with inuse before saving. Introduce fault tolerant VM transaction QEMUFile and ft_mode. savevm: introduce util functions to control ft_trans_file from savevm layer. Introduce event-tap. Call init handler of event-tap at main() in vl.c. ioport: insert event_tap_ioport() to ioport_write(). Insert event_tap_mmio() to cpu_physical_memory_rw() in exec.c. net: insert event-tap to qemu_send_packet() and qemu_sendv_packet_async(). block: insert event-tap to bdrv_aio_writev() and bdrv_aio_flush(). savevm: introduce qemu_savevm_trans_{begin,commit}. migration: introduce migrate_ft_trans_{put,get}_ready(), and modify
[PATCH 02/19] Introduce read() to FdMigrationState.
Currently FdMigrationState doesn't support read(), and this patch introduces it to get response from the other side. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 15 +++ migration.c | 13 + migration.h |3 +++ 3 files changed, 31 insertions(+), 0 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index b55f419..55777c8 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -39,6 +39,20 @@ static int socket_write(FdMigrationState *s, const void * buf, size_t size) return send(s-fd, buf, size, 0); } +static int socket_read(FdMigrationState *s, const void * buf, size_t size) +{ +ssize_t len; + +do { +len = recv(s-fd, (void *)buf, size, 0); +} while (len == -1 socket_error() == EINTR); +if (len == -1) { +len = -socket_error(); +} + +return len; +} + static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); @@ -94,6 +108,7 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, s-get_error = socket_errno; s-write = socket_write; +s-read = socket_read; s-close = tcp_close; s-mig_state.cancel = migrate_fd_cancel; s-mig_state.get_status = migrate_fd_get_status; diff --git a/migration.c b/migration.c index e5ba51c..4a749bb 100644 --- a/migration.c +++ b/migration.c @@ -330,6 +330,19 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) return ret; } +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) +{ +FdMigrationState *s = opaque; +int ret; + +ret = s-read(s, data, size); +if (ret == -1) { +ret = -(s-get_error(s)); +} + +return ret; +} + void migrate_fd_connect(FdMigrationState *s) { int ret; diff --git a/migration.h b/migration.h index d13ed4f..7bf6747 100644 --- a/migration.h +++ b/migration.h @@ -47,6 +47,7 @@ struct FdMigrationState int (*get_error)(struct FdMigrationState*); int (*close)(struct FdMigrationState*); int (*write)(struct FdMigrationState*, const void *, size_t); +int (*read)(struct FdMigrationState *, const void *, size_t); void *opaque; }; @@ 
-115,6 +116,8 @@ void migrate_fd_put_notify(void *opaque); ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size); +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size); + void migrate_fd_connect(FdMigrationState *s); void migrate_fd_put_ready(void *opaque); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/19] Insert event_tap_mmio() to cpu_physical_memory_rw() in exec.c.
Record mmio write event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- exec.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/exec.c b/exec.c index 49c28b1..4a171cc 100644 --- a/exec.c +++ b/exec.c @@ -33,6 +33,7 @@ #include osdep.h #include kvm.h #include qemu-timer.h +#include event-tap.h #if defined(CONFIG_USER_ONLY) #include qemu.h #include signal.h @@ -3625,6 +3626,9 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, io_index = (pd IO_MEM_SHIFT) (IO_MEM_NB_ENTRIES - 1); if (p) addr1 = (addr ~TARGET_PAGE_MASK) + p-region_offset; + +event_tap_mmio(addr, buf, len); + /* XXX: could force cpu_single_env to NULL to avoid potential bugs */ if (l = 4 ((addr1 3) == 0)) { -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/19] Introduce fault tolerant VM transaction QEMUFile and ft_mode.
This code implements VM transaction protocol. Like buffered_file, it sits between savevm and migration layer. With this architecture, VM transaction protocol is implemented mostly independent from other existing code. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.objs |1 + ft_trans_file.c | 624 +++ ft_trans_file.h | 72 +++ migration.c |3 + trace-events| 16 ++ 5 files changed, 716 insertions(+), 0 deletions(-) create mode 100644 ft_trans_file.c create mode 100644 ft_trans_file.h diff --git a/Makefile.objs b/Makefile.objs index c3e52c5..de38579 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -100,6 +100,7 @@ common-obj-y += msmouse.o ps2.o common-obj-y += qdev.o qdev-properties.o common-obj-y += block-migration.o common-obj-y += pflib.o +common-obj-y += ft_trans_file.o common-obj-$(CONFIG_BRLAPI) += baum.o common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o diff --git a/ft_trans_file.c b/ft_trans_file.c new file mode 100644 index 000..2b42b95 --- /dev/null +++ b/ft_trans_file.c @@ -0,0 +1,624 @@ +/* + * Fault tolerant VM transaction QEMUFile + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This source code is based on buffered_file.c. + * Copyright IBM, Corp. 
2008 + * Authors: + * Anthony Liguorialigu...@us.ibm.com + */ + +#include qemu-common.h +#include qemu-error.h +#include hw/hw.h +#include qemu-timer.h +#include sysemu.h +#include qemu-char.h +#include trace.h +#include ft_trans_file.h + +typedef struct FtTransHdr +{ +uint16_t cmd; +uint16_t id; +uint32_t seq; +uint32_t payload_len; +} FtTransHdr; + +typedef struct QEMUFileFtTrans +{ +FtTransPutBufferFunc *put_buffer; +FtTransGetBufferFunc *get_buffer; +FtTransPutReadyFunc *put_ready; +FtTransGetReadyFunc *get_ready; +FtTransWaitForUnfreezeFunc *wait_for_unfreeze; +FtTransCloseFunc *close; +void *opaque; +QEMUFile *file; + +enum QEMU_VM_TRANSACTION_STATE state; +uint32_t seq; +uint16_t id; + +int has_error; + +bool freeze_output; +bool freeze_input; +bool rate_limit; +bool is_sender; +bool is_payload; + +uint8_t *buf; +size_t buf_max_size; +size_t put_offset; +size_t get_offset; + +FtTransHdr header; +size_t header_offset; +} QEMUFileFtTrans; + +#define IO_BUF_SIZE 32768 + +static void ft_trans_append(QEMUFileFtTrans *s, +const uint8_t *buf, size_t size) +{ +if (size (s-buf_max_size - s-put_offset)) { +trace_ft_trans_realloc(s-buf_max_size, size + 1024); +s-buf_max_size += size + 1024; +s-buf = qemu_realloc(s-buf, s-buf_max_size); +} + +trace_ft_trans_append(size); +memcpy(s-buf + s-put_offset, buf, size); +s-put_offset += size; +} + +static void ft_trans_flush(QEMUFileFtTrans *s) +{ +size_t offset = 0; + +if (s-has_error) { +error_report(flush when error %d, bailing, s-has_error); +return; +} + +while (offset s-put_offset) { +ssize_t ret; + +ret = s-put_buffer(s-opaque, s-buf + offset, s-put_offset - offset); +if (ret == -EAGAIN) { +break; +} + +if (ret = 0) { +error_report(error flushing data, %s, strerror(errno)); +s-has_error = FT_TRANS_ERR_FLUSH; +break; +} else { +offset += ret; +} +} + +trace_ft_trans_flush(offset, s-put_offset); +memmove(s-buf, s-buf + offset, s-put_offset - offset); +s-put_offset -= offset; +s-freeze_output = !!s-put_offset; +} + +static 
ssize_t ft_trans_put(void *opaque, void *buf, int size) +{ +QEMUFileFtTrans *s = opaque; +size_t offset = 0; +ssize_t len; + +/* flush buffered data before putting next */ +if (s-put_offset) { +ft_trans_flush(s); +} + +while (!s-freeze_output offset size) { +len = s-put_buffer(s-opaque, (uint8_t *)buf + offset, size - offset); + +if (len == -EAGAIN) { +trace_ft_trans_freeze_output(); +s-freeze_output = 1; +break; +} + +if (len = 0) { +error_report(putting data failed, %s, strerror(errno)); +s-has_error = 1; +offset = -EINVAL; +break; +} + +offset += len; +} + +if (s-freeze_output) { +ft_trans_append(s, buf + offset, size - offset); +offset = size; +} + +return offset; +} + +static int ft_trans_send_header(QEMUFileFtTrans *s, +enum QEMU_VM_TRANSACTION_STATE state, +uint32_t payload_len) +{ +int ret; +FtTransHdr
[PATCH 18/19] Introduce -k option to enable FT migration mode (Kemari).
When -k option is set to migrate command, it will turn on ft_mode to start FT migration mode (Kemari). Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hmp-commands.hx |7 --- migration.c |4 qmp-commands.hx |7 --- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index 1cea572..b7f8f2f 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -735,13 +735,14 @@ ETEXI { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, diff --git a/migration.c b/migration.c index fb73b2d..11bbdf8 100644 --- a/migration.c +++ b/migration.c @@ -92,6 +92,10 @@ int do_migrate(Monitor *mon, const QDict *qdict, QObject **ret_data) return -1; } +if (qdict_get_try_bool(qdict, ft, 0)) { +ft_mode = FT_INIT; +} + if (strstart(uri, tcp:, p)) { s = tcp_start_outgoing_migration(mon, p, max_throttle, detach, blk, inc); diff --git a/qmp-commands.hx b/qmp-commands.hx index 56c4d8b..1521931 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -431,13 +431,14 @@ EQMP { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src 
and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 19/19] migration: add a parser to accept FT migration incoming mode.
The option looks like, -incoming protocol:address:port,ft_mode Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/migration.c b/migration.c index 11bbdf8..7275f02 100644 --- a/migration.c +++ b/migration.c @@ -45,6 +45,12 @@ int qemu_start_incoming_migration(const char *uri) const char *p; int ret; +/* check ft_mode option */ +p = strstr(uri, ft_mode); +if (p !strcmp(p, ft_mode)) { +ft_mode = FT_INIT; +} + if (strstart(uri, tcp:, p)) ret = tcp_start_incoming_migration(p); #if !defined(WIN32) -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/19] Introduce event-tap.
event-tap controls when to start FT transaction, and provides proxy functions to called from net/block devices. While FT transaction, it queues up net/block requests, and flush them when the transaction gets completed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.target |1 + event-tap.c | 847 +++ event-tap.h | 42 +++ qemu-tool.c | 24 ++ trace-events|9 + 5 files changed, 923 insertions(+), 0 deletions(-) create mode 100644 event-tap.c create mode 100644 event-tap.h diff --git a/Makefile.target b/Makefile.target index e15b1c4..f36cd75 100644 --- a/Makefile.target +++ b/Makefile.target @@ -199,6 +199,7 @@ obj-y += rwhandler.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o LIBS+=-lz +obj-y += event-tap.o QEMU_CFLAGS += $(VNC_TLS_CFLAGS) QEMU_CFLAGS += $(VNC_SASL_CFLAGS) diff --git a/event-tap.c b/event-tap.c new file mode 100644 index 000..f492708 --- /dev/null +++ b/event-tap.c @@ -0,0 +1,847 @@ +/* + * Event Tap functions for QEMU + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include qemu-common.h +#include qemu-error.h +#include block.h +#include block_int.h +#include ioport.h +#include osdep.h +#include sysemu.h +#include hw/hw.h +#include net.h +#include event-tap.h +#include trace.h + +enum EVENT_TAP_STATE { +EVENT_TAP_OFF, +EVENT_TAP_ON, +EVENT_TAP_FLUSH, +EVENT_TAP_LOAD, +EVENT_TAP_REPLAY, +}; + +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF; +static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */ + +typedef struct EventTapIOport { +uint32_t address; +uint32_t data; +int index; +} EventTapIOport; + +#define MMIO_BUF_SIZE 8 + +typedef struct EventTapMMIO { +uint64_t address; +uint8_t buf[MMIO_BUF_SIZE]; +int len; +} EventTapMMIO; + +typedef struct EventTapNetReq { +char *device_name; +int iovcnt; +struct iovec *iov; +int vlan_id; +bool vlan_needed; +bool async; +NetPacketSent *sent_cb; +} EventTapNetReq; + +#define MAX_BLOCK_REQUEST 32 + +typedef struct EventTapBlkReq { +char *device_name; +int num_reqs; +int num_cbs; +bool is_flush; +BlockRequest reqs[MAX_BLOCK_REQUEST]; +BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST]; +void *opaque[MAX_BLOCK_REQUEST]; +} EventTapBlkReq; + +#define EVENT_TAP_IOPORT (1 0) +#define EVENT_TAP_MMIO (1 1) +#define EVENT_TAP_NET(1 2) +#define EVENT_TAP_BLK(1 3) + +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1) + +typedef struct EventTapLog { +int mode; +union { +EventTapIOport ioport; +EventTapMMIO mmio; +}; +union { +EventTapNetReq net_req; +EventTapBlkReq blk_req; +}; +QTAILQ_ENTRY(EventTapLog) node; +} EventTapLog; + +static EventTapLog *last_event_tap; + +static QTAILQ_HEAD(, EventTapLog) event_list; +static QTAILQ_HEAD(, EventTapLog) event_pool; + +static int (*event_tap_cb)(void); +static QEMUBH *event_tap_bh; +static VMChangeStateEntry *vmstate; + +static void event_tap_bh_cb(void *p) +{ +if (event_tap_cb) { +event_tap_cb(); +} + +qemu_bh_delete(event_tap_bh); +event_tap_bh = NULL; +} + +static void event_tap_schedule_bh(void) +{ 
+trace_event_tap_ignore_bh(!!event_tap_bh); + +/* if bh is already set, we ignore it for now */ +if (event_tap_bh) { +return; +} + +event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL); +qemu_bh_schedule(event_tap_bh); + +return ; +} + +static void event_tap_alloc_net_req(EventTapNetReq *net_req, + VLANClientState *vc, + const struct iovec *iov, int iovcnt, + NetPacketSent *sent_cb, bool async) +{ +int i; + +net_req-iovcnt = iovcnt; +net_req-async = async; +net_req-device_name = qemu_strdup(vc-name); +net_req-sent_cb = sent_cb; + +if (vc-vlan) { +net_req-vlan_needed = 1; +net_req-vlan_id = vc-vlan-id; +} else { +net_req-vlan_needed = 0; +} + +if (async) { +net_req-iov = (struct iovec *)iov; +} else { +net_req-iov = qemu_malloc(sizeof(struct iovec) * iovcnt); +for (i = 0; i iovcnt; i++) { +net_req-iov[i].iov_base = qemu_malloc(iov[i].iov_len); +memcpy(net_req-iov[i].iov_base, iov[i].iov_base, iov[i].iov_len); +net_req-iov[i].iov_len = iov[i].iov_len; +} +} +} + +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req, +BlockDriverState *bs, BlockRequest *reqs, +int num_reqs, BlockDriverCompletionFunc *cb
[PATCH 10/19] Call init handler of event-tap at main() in vl.c.
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/vl.c b/vl.c index 8bbb785..9faeb27 100644 --- a/vl.c +++ b/vl.c @@ -162,6 +162,7 @@ int main(int argc, char **argv) #include qemu-queue.h #include cpus.h #include arch_init.h +#include event-tap.h #include ui/qemu-spice.h @@ -2895,6 +2896,8 @@ int main(int argc, char **argv, char **envp) blk_mig_init(); +event_tap_init(); + if (default_cdrom) { /* we always create the cdrom drive, even if no disk is there */ drive_add(NULL, CDROM_ALIAS); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/19] net: insert event-tap to qemu_send_packet() and qemu_sendv_packet_async().
event-tap function is called only when it is on. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- net.c |9 + 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/net.c b/net.c index 9ba5be2..1176124 100644 --- a/net.c +++ b/net.c @@ -36,6 +36,7 @@ #include qemu-common.h #include qemu_socket.h #include hw/qdev.h +#include event-tap.h static QTAILQ_HEAD(, VLANState) vlans; static QTAILQ_HEAD(, VLANClientState) non_vlan_clients; @@ -559,6 +560,10 @@ ssize_t qemu_send_packet_async(VLANClientState *sender, void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size) { +if (event_tap_is_on()) { +return event_tap_send_packet(vc, buf, size); +} + qemu_send_packet_async(vc, buf, size, NULL); } @@ -657,6 +662,10 @@ ssize_t qemu_sendv_packet_async(VLANClientState *sender, { NetQueue *queue; +if (event_tap_is_on()) { +return event_tap_sendv_packet_async(sender, iov, iovcnt, sent_cb); +} + if (sender-link_down || (!sender-peer !sender-vlan)) { return calc_iov_length(iov, iovcnt); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 16/19] migration: introduce migrate_ft_trans_{put,get}_ready(), and modify migrate_fd_put_ready() when ft_mode is on.
Introduce migrate_ft_trans_put_ready() which kicks the FT transaction cycle. When ft_mode is on, migrate_fd_put_ready() would open ft_trans_file and turn on event_tap. To end or cancel FT transaction, ft_mode and event_tap is turned off. migrate_ft_trans_get_ready() is called to receive ack from the receiver. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c | 265 ++- 1 files changed, 264 insertions(+), 1 deletions(-) diff --git a/migration.c b/migration.c index 9740cb6..fb73b2d 100644 --- a/migration.c +++ b/migration.c @@ -21,6 +21,7 @@ #include qemu_socket.h #include block-migration.h #include qemu-objects.h +#include event-tap.h //#define DEBUG_MIGRATION @@ -274,6 +275,14 @@ void migrate_fd_error(FdMigrationState *s) migrate_fd_cleanup(s); } +static void migrate_ft_trans_error(FdMigrationState *s) +{ +ft_mode = FT_ERROR; +qemu_savevm_state_cancel(s-mon, s-file); +migrate_fd_error(s); +event_tap_unregister(); +} + int migrate_fd_cleanup(FdMigrationState *s) { int ret = 0; @@ -309,6 +318,17 @@ void migrate_fd_put_notify(void *opaque) qemu_file_put_notify(s-file); } +static void migrate_fd_get_notify(void *opaque) +{ +FdMigrationState *s = opaque; + +qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL); +qemu_file_get_notify(s-file); +if (qemu_file_has_error(s-file)) { +migrate_ft_trans_error(s); +} +} + ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) { FdMigrationState *s = opaque; @@ -343,6 +363,10 @@ int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) ret = -(s-get_error(s)); } +if (ret == -EAGAIN) { +qemu_set_fd_handler2(s-fd, NULL, migrate_fd_get_notify, NULL, s); +} + return ret; } @@ -369,6 +393,234 @@ void migrate_fd_connect(FdMigrationState *s) migrate_fd_put_ready(s); } +static int migrate_ft_trans_commit(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_COMMIT ft_mode != FT_TRANSACTION_ATOMIC) { +fprintf(stderr, 
+migrate_ft_trans_commit: invalid ft_mode %d\n, ft_mode); +goto out; +} + +do { +if (ft_mode == FT_TRANSACTION_ATOMIC) { +if (qemu_ft_trans_begin(s-file) 0) { +fprintf(stderr, qemu_ft_trans_begin failed\n); +goto out; +} + +ret = qemu_savevm_trans_begin(s-mon, s-file, 0); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_begin failed\n); +goto out; +} + +ft_mode = FT_TRANSACTION_COMMIT; +if (ret) { +/* don't proceed until if fd isn't ready */ +goto out; +} +} + +/* make the VM state consistent by flushing outstanding events */ +vm_stop(0); + +/* send at full speed */ +qemu_file_set_rate_limit(s-file, 0); + +ret = qemu_savevm_trans_complete(s-mon, s-file); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_complete failed\n); +goto out; +} + +if (ret) { +/* don't proceed until if fd isn't ready */ +ret = 1; +goto out; +} + +ret = qemu_ft_trans_commit(s-file); +if (ret 0) { +fprintf(stderr, qemu_ft_trans_commit failed\n); +goto out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_RECV; +ret = 1; +goto out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto out; +} + +ft_mode = ret ? 
FT_TRANSACTION_BEGIN : FT_TRANSACTION_ATOMIC; +} while (ft_mode != FT_TRANSACTION_BEGIN); + +vm_start(); +ret = 0; + +out: +return ret; +} + +static int migrate_ft_trans_get_ready(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_RECV) { +fprintf(stderr, +migrate_ft_trans_get_ready: invalid ft_mode %d\n, ft_mode); +goto error_out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto error_out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_BEGIN; +} else { +ft_mode = FT_TRANSACTION_ATOMIC; + +ret = migrate_ft_trans_commit(s); +if (ret 0) { +goto error_out; +} +if (ret) { +goto out; +} +} + +vm_start(); +ret = 0; +goto out; + +error_out: +migrate_ft_trans_error(s); + +out
[PATCH 05/19] vl.c: add deleted flag for deleting the handler.
Make deleting handlers robust against deletion of any elements in a handler by using a deleted flag like in file descriptors. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c | 13 + 1 files changed, 9 insertions(+), 4 deletions(-) diff --git a/vl.c b/vl.c index 0292184..8bbb785 100644 --- a/vl.c +++ b/vl.c @@ -1140,6 +1140,7 @@ static void nographic_update(void *opaque) struct vm_change_state_entry { VMChangeStateHandler *cb; void *opaque; +int deleted; QLIST_ENTRY (vm_change_state_entry) entries; }; @@ -1160,8 +1161,7 @@ VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb, void qemu_del_vm_change_state_handler(VMChangeStateEntry *e) { -QLIST_REMOVE (e, entries); -qemu_free (e); +e-deleted = 1; } void vm_state_notify(int running, int reason) @@ -1170,8 +1170,13 @@ void vm_state_notify(int running, int reason) trace_vm_state_notify(running, reason); -for (e = vm_change_state_head.lh_first; e; e = e-entries.le_next) { -e-cb(e-opaque, running, reason); +QLIST_FOREACH(e, vm_change_state_head, entries) { +if (e-deleted) { +QLIST_REMOVE(e, entries); +qemu_free(e); +} else { +e-cb(e-opaque, running, reason); +} } } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/19] virtio: decrement last_avail_idx with inuse before saving.
For regular migration inuse == 0 always as requests are flushed before save. However, event-tap log when enabled introduces an extra queue for requests which is not being flushed, thus the last inuse requests are left in the event-tap queue. Move the last_avail_idx value sent to the remote back to make it repeat the last inuse requests. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/virtio.c | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 07dbf86..b6cf4e5 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -665,12 +665,20 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f) qemu_put_be32(f, i); for (i = 0; i VIRTIO_PCI_QUEUE_MAX; i++) { +/* For regular migration inuse == 0 always as + * requests are flushed before save. However, + * event-tap log when enabled introduces an extra + * queue for requests which is not being flushed, + * thus the last inuse requests are left in the event-tap queue. + * Move the last_avail_idx value sent to the remote back + * to make it repeat the last inuse requests. */ +uint16_t last_avail = vdev-vq[i].last_avail_idx - vdev-vq[i].inuse; if (vdev-vq[i].vring.num == 0) break; qemu_put_be32(f, vdev-vq[i].vring.num); qemu_put_be64(f, vdev-vq[i].pa); -qemu_put_be16s(f, vdev-vq[i].last_avail_idx); +qemu_put_be16s(f, last_avail); if (vdev-binding-save_queue) vdev-binding-save_queue(vdev-binding_opaque, i, f); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/19] Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer().
Currently buf size is fixed at 32KB. It would be useful if it could be flexible. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |2 ++ savevm.c | 20 +++- 2 files changed, 21 insertions(+), 1 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index 163a683..a506688 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -58,6 +58,8 @@ void qemu_fflush(QEMUFile *f); int qemu_fclose(QEMUFile *f); void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); +void *qemu_realloc_buffer(QEMUFile *f, int size); +void qemu_clear_buffer(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { diff --git a/savevm.c b/savevm.c index 90aa237..8c64c63 100644 --- a/savevm.c +++ b/savevm.c @@ -172,7 +172,8 @@ struct QEMUFile { when reading */ int buf_index; int buf_size; /* 0 when writing */ -uint8_t buf[IO_BUF_SIZE]; +int buf_max_size; +uint8_t *buf; int has_error; }; @@ -423,6 +424,9 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, f-get_rate_limit = get_rate_limit; f-is_write = 0; +f-buf_max_size = IO_BUF_SIZE; +f-buf = qemu_malloc(sizeof(uint8_t) * f-buf_max_size); + return f; } @@ -453,6 +457,19 @@ void qemu_fflush(QEMUFile *f) } } +void *qemu_realloc_buffer(QEMUFile *f, int size) +{ +f-buf_max_size = size; +f-buf = qemu_realloc(f-buf, f-buf_max_size); + +return f-buf; +} + +void qemu_clear_buffer(QEMUFile *f) +{ +f-buf_size = f-buf_index = f-buf_offset = 0; +} + static void qemu_fill_buffer(QEMUFile *f) { int len; @@ -478,6 +495,7 @@ int qemu_fclose(QEMUFile *f) qemu_fflush(f); if (f-close) ret = f-close(f-opaque); +qemu_free(f-buf); qemu_free(f); return ret; } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 08/19] savevm: introduce util functions to control ft_trans_file from savevm layer.
To utilize ft_trans_file function, savevm needs interfaces to be exported. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |5 ++ savevm.c | 149 ++ 2 files changed, 154 insertions(+), 0 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index a506688..ace1744 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -51,6 +51,7 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, QEMUFile *qemu_fopen(const char *filename, const char *mode); QEMUFile *qemu_fdopen(int fd, const char *mode); QEMUFile *qemu_fopen_socket(int fd); +QEMUFile *qemu_fopen_ft_trans(int s_fd, int c_fd); QEMUFile *qemu_popen(FILE *popen_file, const char *mode); QEMUFile *qemu_popen_cmd(const char *command, const char *mode); int qemu_stdio_fd(QEMUFile *f); @@ -60,6 +61,9 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); void *qemu_realloc_buffer(QEMUFile *f, int size); void qemu_clear_buffer(QEMUFile *f); +int qemu_ft_trans_begin(QEMUFile *f); +int qemu_ft_trans_commit(QEMUFile *f); +int qemu_ft_trans_cancel(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { @@ -94,6 +98,7 @@ void qemu_file_set_error(QEMUFile *f); * halted due to rate limiting or EAGAIN errors occur as it can be used to * resume output. 
*/ void qemu_file_put_notify(QEMUFile *f); +void qemu_file_get_notify(void *opaque); static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { diff --git a/savevm.c b/savevm.c index 7bc3699..4821808 100644 --- a/savevm.c +++ b/savevm.c @@ -83,6 +83,7 @@ #include migration.h #include qemu_socket.h #include qemu-queue.h +#include ft_trans_file.h #define SELF_ANNOUNCE_ROUNDS 5 @@ -190,6 +191,13 @@ typedef struct QEMUFileSocket QEMUFile *file; } QEMUFileSocket; +typedef struct QEMUFileSocketTrans +{ +int fd; +QEMUFileSocket *s; +VMChangeStateEntry *e; +} QEMUFileSocketTrans; + static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) { QEMUFileSocket *s = opaque; @@ -205,6 +213,22 @@ static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) return len; } +static ssize_t socket_put_buffer(void *opaque, const void *buf, size_t size) +{ +QEMUFileSocket *s = opaque; +ssize_t len; + +do { +len = send(s-fd, (void *)buf, size, 0); +} while (len == -1 socket_error() == EINTR); + +if (len == -1) { +len = -socket_error(); +} + +return len; +} + static int socket_close(void *opaque) { QEMUFileSocket *s = opaque; @@ -212,6 +236,70 @@ static int socket_close(void *opaque) return 0; } +static int socket_trans_get_buffer(void *opaque, uint8_t *buf, int64_t pos, size_t size) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; +ssize_t len; + +len = socket_get_buffer(s, buf, pos, size); + +return len; +} + +static ssize_t socket_trans_put_buffer(void *opaque, const void *buf, size_t size) +{ +QEMUFileSocketTrans *t = opaque; + +return socket_put_buffer(t-s, buf, size); +} + + +static int socket_trans_get_ready(void *opaque) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; +QEMUFile *f = s-file; +int ret = 0; + +ret = qemu_loadvm_state(f, 1); +if (ret 0) { +fprintf(stderr, +socket_trans_get_ready: error while loading vmstate\n); +} + +return ret; +} + +static int socket_trans_close(void *opaque) +{ 
+QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; + +qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL); +qemu_set_fd_handler2(t-fd, NULL, NULL, NULL, NULL); +qemu_del_vm_change_state_handler(t-e); +close(s-fd); +close(t-fd); +qemu_free(s); +qemu_free(t); + +return 0; +} + +static void socket_trans_resume(void *opaque, int running, int reason) +{ +QEMUFileSocketTrans *t = opaque; +QEMUFileSocket *s = t-s; + +if (!running) { +return; +} + +qemu_announce_self(); +qemu_fclose(s-file); +} + static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size) { QEMUFileStdio *s = opaque; @@ -334,6 +422,26 @@ QEMUFile *qemu_fopen_socket(int fd) return s-file; } +QEMUFile *qemu_fopen_ft_trans(int s_fd, int c_fd) +{ +QEMUFileSocketTrans *t = qemu_mallocz(sizeof(QEMUFileSocketTrans)); +QEMUFileSocket *s = qemu_mallocz(sizeof(QEMUFileSocket)); + +t-s = s; +t-fd = s_fd; +t-e = qemu_add_vm_change_state_handler(socket_trans_resume, t); + +s-fd = c_fd; +s-file = qemu_fopen_ops_ft_trans(t, socket_trans_put_buffer, + socket_trans_get_buffer, NULL, + socket_trans_get_ready, + migrate_fd_wait_for_unfreeze, + socket_trans_close, 0
[PATCH 17/19] migration-tcp: modify tcp_accept_incoming_migration() to handle ft_mode, and add a hack not to close fd when ft_mode is enabled.
When ft_mode is set in the header, tcp_accept_incoming_migration() sets ft_trans_incoming() as a callback, and calls qemu_file_get_notify() to receive FT transaction iteratively. We also need a hack not to close fd before moving to ft_transaction mode, so that we can reuse the fd for it. vm_change_state_handler is added to turn off ft_mode when cont is pressed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 67 ++- 1 files changed, 66 insertions(+), 1 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index 55777c8..84076d6 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -18,6 +18,8 @@ #include sysemu.h #include buffered_file.h #include block.h +#include ft_trans_file.h +#include event-tap.h //#define DEBUG_MIGRATION_TCP @@ -29,6 +31,8 @@ do { } while (0) #endif +static VMChangeStateEntry *vmstate; + static int socket_errno(FdMigrationState *s) { return socket_error(); @@ -56,7 +60,8 @@ static int socket_read(FdMigrationState *s, const void * buf, size_t size) static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); -if (s-fd != -1) { +/* FIX ME: accessing ft_mode here isn't clean */ +if (s-fd != -1 ft_mode != FT_INIT) { close(s-fd); s-fd = -1; } @@ -150,6 +155,36 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, return s-mig_state; } +static void ft_trans_incoming(void *opaque) +{ +QEMUFile *f = opaque; + +qemu_file_get_notify(f); +if (qemu_file_has_error(f)) { +ft_mode = FT_ERROR; +qemu_fclose(f); +} +} + +static void ft_trans_reset(void *opaque, int running, int reason) +{ +QEMUFile *f = opaque; + +if (running) { +if (ft_mode != FT_ERROR) { +qemu_fclose(f); +} +ft_mode = FT_OFF; +qemu_del_vm_change_state_handler(vmstate); +} +} + +static void ft_trans_schedule_replay(QEMUFile *f) +{ +event_tap_schedule_replay(); +vmstate = qemu_add_vm_change_state_handler(ft_trans_reset, f); +} + static void tcp_accept_incoming_migration(void *opaque) { struct sockaddr_in addr; @@ -175,8 +210,38 @@ static void 
tcp_accept_incoming_migration(void *opaque) goto out; } +if (ft_mode == FT_INIT) { +autostart = 0; +} + process_incoming_migration(f); + +if (ft_mode == FT_INIT) { +int ret; + +socket_set_nodelay(c); + +f = qemu_fopen_ft_trans(s, c); +if (f == NULL) { +fprintf(stderr, could not qemu_fopen_ft_trans\n); +goto out; +} + +/* need to wait sender to setup */ +ret = qemu_ft_trans_begin(f); +if (ret 0) { +goto out; +} + +qemu_set_fd_handler2(c, NULL, ft_trans_incoming, NULL, f); +ft_trans_schedule_replay(f); +ft_mode = FT_TRANSACTION_RECV; + +return; +} + qemu_fclose(f); + out: close(c); out2: -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/19] Introduce skip_header parameter to qemu_loadvm_state().
Introduce skip_header parameter to qemu_loadvm_state() so that it can be called iteratively without reading the header. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |2 +- savevm.c| 24 +--- sysemu.h|2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/migration.c b/migration.c index 4a749bb..9639659 100644 --- a/migration.c +++ b/migration.c @@ -60,7 +60,7 @@ int qemu_start_incoming_migration(const char *uri) void process_incoming_migration(QEMUFile *f) { -if (qemu_loadvm_state(f) 0) { +if (qemu_loadvm_state(f, 0) 0) { fprintf(stderr, load of migration failed\n); exit(0); } diff --git a/savevm.c b/savevm.c index 8c64c63..7bc3699 100644 --- a/savevm.c +++ b/savevm.c @@ -1701,7 +1701,7 @@ typedef struct LoadStateEntry { int version_id; } LoadStateEntry; -int qemu_loadvm_state(QEMUFile *f) +int qemu_loadvm_state(QEMUFile *f, int skip_header) { QLIST_HEAD(, LoadStateEntry) loadvm_handlers = QLIST_HEAD_INITIALIZER(loadvm_handlers); @@ -1710,17 +1710,19 @@ int qemu_loadvm_state(QEMUFile *f) unsigned int v; int ret; -v = qemu_get_be32(f); -if (v != QEMU_VM_FILE_MAGIC) -return -EINVAL; +if (!skip_header) { +v = qemu_get_be32(f); +if (v != QEMU_VM_FILE_MAGIC) +return -EINVAL; -v = qemu_get_be32(f); -if (v == QEMU_VM_FILE_VERSION_COMPAT) { -fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); -return -ENOTSUP; +v = qemu_get_be32(f); +if (v == QEMU_VM_FILE_VERSION_COMPAT) { +fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); +return -ENOTSUP; +} +if (v != QEMU_VM_FILE_VERSION) +return -ENOTSUP; } -if (v != QEMU_VM_FILE_VERSION) -return -ENOTSUP; while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; @@ -2043,7 +2045,7 @@ int load_vmstate(const char *name) return -EINVAL; } -ret = qemu_loadvm_state(f); +ret = qemu_loadvm_state(f, 0); qemu_fclose(f); if (ret 0) { diff --git a/sysemu.h b/sysemu.h index d8fceec..81bcf00 100644 --- a/sysemu.h +++ 
b/sysemu.h @@ -80,7 +80,7 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); -int qemu_loadvm_state(QEMUFile *f); +int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ void do_info_slirp(Monitor *mon); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/19] block: insert event-tap to bdrv_aio_writev() and bdrv_aio_flush().
The event-tap function is called only when event-tap is on and the request is sent from a device emulator. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- block.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index ff2795b..85bd8b8 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include block_int.h #include module.h #include qemu-objects.h +#include event-tap.h #ifdef CONFIG_BSD #include sys/types.h @@ -2111,6 +2112,11 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; +if (bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); +} + if (bs-dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, opaque); @@ -2374,6 +2380,11 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, if (!drv) return NULL; + +if (bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_flush(bs, cb, opaque); +} + return drv-bdrv_aio_flush(bs, cb, opaque); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/19] savevm: introduce qemu_savevm_trans_{begin,commit}.
Introduce qemu_savevm_trans_begin() and qemu_savevm_trans_complete() to send the memory and device info together, while avoiding cancelling memory state tracking. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- savevm.c | 93 ++ sysemu.h |2 + 2 files changed, 95 insertions(+), 0 deletions(-) diff --git a/savevm.c b/savevm.c index 4821808..629936b 100644 --- a/savevm.c +++ b/savevm.c @@ -1723,6 +1723,99 @@ int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f) return 0; } +int qemu_savevm_trans_begin(Monitor *mon, QEMUFile *f, int init) +{ +SaveStateEntry *se; +int skipped = 0; + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int len, stage, ret; + +if (se-save_live_state == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_START); +qemu_put_be32(f, se-section_id); + +/* ID string */ +len = strlen(se-idstr); +qemu_put_byte(f, len); +qemu_put_buffer(f, (uint8_t *)se-idstr, len); + +qemu_put_be32(f, se-instance_id); +qemu_put_be32(f, se-version_id); + +stage = init ? QEMU_VM_SECTION_START : QEMU_VM_SECTION_PART; +ret = se-save_live_state(mon, f, stage, se-opaque); +if (!ret) { +skipped++; +} +} + +if (qemu_file_has_error(f)) { +return -EIO; +} + +return skipped; +} + +int qemu_savevm_trans_complete(Monitor *mon, QEMUFile *f) +{ +SaveStateEntry *se; + +cpu_synchronize_all_states(); + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int ret; + +if (se-save_live_state == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_PART); +qemu_put_be32(f, se-section_id); + +ret = se-save_live_state(mon, f, QEMU_VM_SECTION_PART, se-opaque); +if (!ret) { +/* do not proceed to the next vmstate. 
*/ +return 1; +} +} + +QTAILQ_FOREACH(se, savevm_handlers, entry) { +int len; + +if (se-save_state == NULL se-vmsd == NULL) { +continue; +} + +/* Section type */ +qemu_put_byte(f, QEMU_VM_SECTION_FULL); +qemu_put_be32(f, se-section_id); + +/* ID string */ +len = strlen(se-idstr); +qemu_put_byte(f, len); +qemu_put_buffer(f, (uint8_t *)se-idstr, len); + +qemu_put_be32(f, se-instance_id); +qemu_put_be32(f, se-version_id); + +vmstate_save(f, se); +} + +qemu_put_byte(f, QEMU_VM_EOF); + +if (qemu_file_has_error(f)) { +return -EIO; +} + +return 0; +} + void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f) { SaveStateEntry *se; diff --git a/sysemu.h b/sysemu.h index 81bcf00..9c2c45e 100644 --- a/sysemu.h +++ b/sysemu.h @@ -80,6 +80,8 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); +int qemu_savevm_trans_begin(Monitor *mon, QEMUFile *f, int init); +int qemu_savevm_trans_complete(Monitor *mon, QEMUFile *f); int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/19] ioport: insert event_tap_ioport() to ioport_write().
Record ioport event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- ioport.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/ioport.c b/ioport.c index aa4188a..74aebf5 100644 --- a/ioport.c +++ b/ioport.c @@ -27,6 +27,7 @@ #include ioport.h #include trace.h +#include event-tap.h /***/ /* IO Port */ @@ -76,6 +77,7 @@ static void ioport_write(int index, uint32_t address, uint32_t data) default_ioport_writel }; IOPortWriteFunc *func = ioport_write_table[index][address]; +event_tap_ioport(index, address, data); if (!func) func = default_func[index]; func(ioport_opaque[address], address, data); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/19] qemu-char: export socket_set_nodelay().
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- qemu-char.c |2 +- qemu_socket.h |1 + 2 files changed, 2 insertions(+), 1 deletions(-) diff --git a/qemu-char.c b/qemu-char.c index edc9ad6..737d347 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -2116,7 +2116,7 @@ static void tcp_chr_telnet_init(int fd) send(fd, (char *)buf, 3, 0); } -static void socket_set_nodelay(int fd) +void socket_set_nodelay(int fd) { int val = 1; setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *)val, sizeof(val)); diff --git a/qemu_socket.h b/qemu_socket.h index 897a8ae..b7f8465 100644 --- a/qemu_socket.h +++ b/qemu_socket.h @@ -36,6 +36,7 @@ int inet_aton(const char *cp, struct in_addr *ia); int qemu_socket(int domain, int type, int protocol); int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen); void socket_set_nonblock(int fd); +void socket_set_nodelay(int fd); int send_all(int fd, const void *buf, int len1); /* New, ipv6-ready socket helper functions, see qemu-sockets.c */ -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/19] block: insert event-tap to bdrv_aio_writev() and bdrv_aio_flush().
The event-tap function is called only when event-tap is on and the request is sent from a device emulator. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- block.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) diff --git a/block.c b/block.c index ff2795b..85bd8b8 100644 --- a/block.c +++ b/block.c @@ -28,6 +28,7 @@ #include block_int.h #include module.h #include qemu-objects.h +#include event-tap.h #ifdef CONFIG_BSD #include sys/types.h @@ -2111,6 +2112,11 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; +if (bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); +} + if (bs-dirty_bitmap) { blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb, opaque); @@ -2374,6 +2380,11 @@ BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs, if (!drv) return NULL; + +if (bs-device_name event_tap_is_on()) { +return event_tap_bdrv_aio_flush(bs, cb, opaque); +} + return drv-bdrv_aio_flush(bs, cb, opaque); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/19] Introduce skip_header parameter to qemu_loadvm_state().
Introduce skip_header parameter to qemu_loadvm_state() so that it can be called iteratively without reading the header. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c |2 +- savevm.c| 24 +--- sysemu.h|2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/migration.c b/migration.c index 4a749bb..9639659 100644 --- a/migration.c +++ b/migration.c @@ -60,7 +60,7 @@ int qemu_start_incoming_migration(const char *uri) void process_incoming_migration(QEMUFile *f) { -if (qemu_loadvm_state(f) 0) { +if (qemu_loadvm_state(f, 0) 0) { fprintf(stderr, load of migration failed\n); exit(0); } diff --git a/savevm.c b/savevm.c index 8c64c63..7bc3699 100644 --- a/savevm.c +++ b/savevm.c @@ -1701,7 +1701,7 @@ typedef struct LoadStateEntry { int version_id; } LoadStateEntry; -int qemu_loadvm_state(QEMUFile *f) +int qemu_loadvm_state(QEMUFile *f, int skip_header) { QLIST_HEAD(, LoadStateEntry) loadvm_handlers = QLIST_HEAD_INITIALIZER(loadvm_handlers); @@ -1710,17 +1710,19 @@ int qemu_loadvm_state(QEMUFile *f) unsigned int v; int ret; -v = qemu_get_be32(f); -if (v != QEMU_VM_FILE_MAGIC) -return -EINVAL; +if (!skip_header) { +v = qemu_get_be32(f); +if (v != QEMU_VM_FILE_MAGIC) +return -EINVAL; -v = qemu_get_be32(f); -if (v == QEMU_VM_FILE_VERSION_COMPAT) { -fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); -return -ENOTSUP; +v = qemu_get_be32(f); +if (v == QEMU_VM_FILE_VERSION_COMPAT) { +fprintf(stderr, SaveVM v2 format is obsolete and don't work anymore\n); +return -ENOTSUP; +} +if (v != QEMU_VM_FILE_VERSION) +return -ENOTSUP; } -if (v != QEMU_VM_FILE_VERSION) -return -ENOTSUP; while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; @@ -2043,7 +2045,7 @@ int load_vmstate(const char *name) return -EINVAL; } -ret = qemu_loadvm_state(f); +ret = qemu_loadvm_state(f, 0); qemu_fclose(f); if (ret 0) { diff --git a/sysemu.h b/sysemu.h index d8fceec..81bcf00 100644 --- a/sysemu.h +++ 
b/sysemu.h @@ -80,7 +80,7 @@ int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable, int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f); int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f); void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f); -int qemu_loadvm_state(QEMUFile *f); +int qemu_loadvm_state(QEMUFile *f, int skip_header); /* SLIRP */ void do_info_slirp(Monitor *mon); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/19] virtio: decrement last_avail_idx with inuse before saving.
For regular migration inuse == 0 always as requests are flushed before save. However, event-tap log when enabled introduces an extra queue for requests which is not being flushed, thus the last inuse requests are left in the event-tap queue. Move the last_avail_idx value sent to the remote back to make it repeat the last inuse requests. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/virtio.c | 10 +- 1 files changed, 9 insertions(+), 1 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index 07dbf86..b6cf4e5 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -665,12 +665,20 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f) qemu_put_be32(f, i); for (i = 0; i VIRTIO_PCI_QUEUE_MAX; i++) { +/* For regular migration inuse == 0 always as + * requests are flushed before save. However, + * event-tap log when enabled introduces an extra + * queue for requests which is not being flushed, + * thus the last inuse requests are left in the event-tap queue. + * Move the last_avail_idx value sent to the remote back + * to make it repeat the last inuse requests. */ +uint16_t last_avail = vdev-vq[i].last_avail_idx - vdev-vq[i].inuse; if (vdev-vq[i].vring.num == 0) break; qemu_put_be32(f, vdev-vq[i].vring.num); qemu_put_be64(f, vdev-vq[i].pa); -qemu_put_be16s(f, vdev-vq[i].last_avail_idx); +qemu_put_be16s(f, last_avail); if (vdev-binding-save_queue) vdev-binding-save_queue(vdev-binding_opaque, i, f); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/19] net: insert event-tap to qemu_send_packet() and qemu_sendv_packet_async().
event-tap function is called only when it is on. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- net.c |9 + 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/net.c b/net.c index 9ba5be2..1176124 100644 --- a/net.c +++ b/net.c @@ -36,6 +36,7 @@ #include qemu-common.h #include qemu_socket.h #include hw/qdev.h +#include event-tap.h static QTAILQ_HEAD(, VLANState) vlans; static QTAILQ_HEAD(, VLANClientState) non_vlan_clients; @@ -559,6 +560,10 @@ ssize_t qemu_send_packet_async(VLANClientState *sender, void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size) { +if (event_tap_is_on()) { +return event_tap_send_packet(vc, buf, size); +} + qemu_send_packet_async(vc, buf, size, NULL); } @@ -657,6 +662,10 @@ ssize_t qemu_sendv_packet_async(VLANClientState *sender, { NetQueue *queue; +if (event_tap_is_on()) { +return event_tap_sendv_packet_async(sender, iov, iovcnt, sent_cb); +} + if (sender-link_down || (!sender-peer !sender-vlan)) { return calc_iov_length(iov, iovcnt); } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 16/19] migration: introduce migrate_ft_trans_{put,get}_ready(), and modify migrate_fd_put_ready() when ft_mode is on.
Introduce migrate_ft_trans_put_ready() which kicks the FT transaction cycle. When ft_mode is on, migrate_fd_put_ready() would open ft_trans_file and turn on event_tap. To end or cancel FT transaction, ft_mode and event_tap is turned off. migrate_ft_trans_get_ready() is called to receive ack from the receiver. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration.c | 265 ++- 1 files changed, 264 insertions(+), 1 deletions(-) diff --git a/migration.c b/migration.c index 9740cb6..fb73b2d 100644 --- a/migration.c +++ b/migration.c @@ -21,6 +21,7 @@ #include qemu_socket.h #include block-migration.h #include qemu-objects.h +#include event-tap.h //#define DEBUG_MIGRATION @@ -274,6 +275,14 @@ void migrate_fd_error(FdMigrationState *s) migrate_fd_cleanup(s); } +static void migrate_ft_trans_error(FdMigrationState *s) +{ +ft_mode = FT_ERROR; +qemu_savevm_state_cancel(s-mon, s-file); +migrate_fd_error(s); +event_tap_unregister(); +} + int migrate_fd_cleanup(FdMigrationState *s) { int ret = 0; @@ -309,6 +318,17 @@ void migrate_fd_put_notify(void *opaque) qemu_file_put_notify(s-file); } +static void migrate_fd_get_notify(void *opaque) +{ +FdMigrationState *s = opaque; + +qemu_set_fd_handler2(s-fd, NULL, NULL, NULL, NULL); +qemu_file_get_notify(s-file); +if (qemu_file_has_error(s-file)) { +migrate_ft_trans_error(s); +} +} + ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) { FdMigrationState *s = opaque; @@ -343,6 +363,10 @@ int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) ret = -(s-get_error(s)); } +if (ret == -EAGAIN) { +qemu_set_fd_handler2(s-fd, NULL, migrate_fd_get_notify, NULL, s); +} + return ret; } @@ -369,6 +393,234 @@ void migrate_fd_connect(FdMigrationState *s) migrate_fd_put_ready(s); } +static int migrate_ft_trans_commit(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_COMMIT ft_mode != FT_TRANSACTION_ATOMIC) { +fprintf(stderr, 
+migrate_ft_trans_commit: invalid ft_mode %d\n, ft_mode); +goto out; +} + +do { +if (ft_mode == FT_TRANSACTION_ATOMIC) { +if (qemu_ft_trans_begin(s-file) 0) { +fprintf(stderr, qemu_ft_trans_begin failed\n); +goto out; +} + +ret = qemu_savevm_trans_begin(s-mon, s-file, 0); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_begin failed\n); +goto out; +} + +ft_mode = FT_TRANSACTION_COMMIT; +if (ret) { +/* don't proceed until if fd isn't ready */ +goto out; +} +} + +/* make the VM state consistent by flushing outstanding events */ +vm_stop(0); + +/* send at full speed */ +qemu_file_set_rate_limit(s-file, 0); + +ret = qemu_savevm_trans_complete(s-mon, s-file); +if (ret 0) { +fprintf(stderr, qemu_savevm_trans_complete failed\n); +goto out; +} + +if (ret) { +/* don't proceed until if fd isn't ready */ +ret = 1; +goto out; +} + +ret = qemu_ft_trans_commit(s-file); +if (ret 0) { +fprintf(stderr, qemu_ft_trans_commit failed\n); +goto out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_RECV; +ret = 1; +goto out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto out; +} + +ft_mode = ret ? 
FT_TRANSACTION_BEGIN : FT_TRANSACTION_ATOMIC; +} while (ft_mode != FT_TRANSACTION_BEGIN); + +vm_start(); +ret = 0; + +out: +return ret; +} + +static int migrate_ft_trans_get_ready(void *opaque) +{ +FdMigrationState *s = opaque; +int ret = -1; + +if (ft_mode != FT_TRANSACTION_RECV) { +fprintf(stderr, +migrate_ft_trans_get_ready: invalid ft_mode %d\n, ft_mode); +goto error_out; +} + +/* flush and check if events are remaining */ +vm_start(); +ret = event_tap_flush_one(); +if (ret 0) { +fprintf(stderr, event_tap_flush_one failed\n); +goto error_out; +} + +if (ret) { +ft_mode = FT_TRANSACTION_BEGIN; +} else { +ft_mode = FT_TRANSACTION_ATOMIC; + +ret = migrate_ft_trans_commit(s); +if (ret 0) { +goto error_out; +} +if (ret) { +goto out; +} +} + +vm_start(); +ret = 0; +goto out; + +error_out: +migrate_ft_trans_error(s); + +out
[PATCH 00/19] Kemari for KVM v0.2.5
Hi, This patch series is a revised version of Kemari for KVM, which incorporates the comments on the previous post. The current code is based on qemu.git d03d11260ee2d55579e8b76116e35ccdf5031833. The changes from v0.2.4 -> v0.2.5 are: - fixed braces and trailing spaces by using Blue's checkpatch.pl (Blue) - event-tap: don't try to send blk_req if it's a bdrv_aio_flush event The changes from v0.2.3 -> v0.2.4 are: - call vm_start() before event_tap_flush_one() to avoid failure in virtio-net assertion - add vm_change_state_handler to turn off ft_mode - use qemu_iovec functions in event-tap - remove duplicated code in migration - remove unnecessary new line for error_report in ft_trans_file The changes from v0.2.2 -> v0.2.3 are: - queue async net requests without copying (MST) -- if not async, contents of the packets are sent to the secondary - better description for option -k (MST) - fix memory transfer failure - fix ft transaction initiation failure The changes from v0.2.1 -> v0.2.2 are: - decrement last_avail_idx with inuse before saving (MST) - remove qemu_aio_flush() and bdrv_flush_all() in migrate_ft_trans_commit() The changes from v0.2 -> v0.2.1 are: - Move event-tap to net/block layer and use stubs (Blue, Paul, MST, Kevin) - Tap bdrv_aio_flush (Marcelo) - Remove multiwrite interface in event-tap (Stefan) - Fix event-tap to use pio/mmio to replay both net/block (Stefan) - Improve error handling in event-tap (Stefan) - Fix leak in event-tap (Stefan) - Revise virtio last_avail_idx manipulation (MST) - Clean up migration.c hook (Marcelo) - Make deleting change state handler robust (Isaku, Anthony) The changes from v0.1.1 -> v0.2 are: - Introduce a queue in event-tap to make VM sync live. - Change transaction receiver to a state machine for async receiving. - Replace net/block layer functions with event-tap proxy functions. - Remove dirty bitmap optimization for now. - convert DPRINTF() in ft_trans_file to trace functions. - convert fprintf() in ft_trans_file to error_report(). 
- improved error handling in ft_trans_file. - add a tmp pointer to qemu_del_vm_change_state_handler. The changes from v0.1 -> v0.1.1 are: - events are tapped in net/block layer instead of device emulation layer. - Introduce a new option for -incoming to accept FT transaction. - Removed writev() support to QEMUFile and FdMigrationState for now. I would post this work in a different series. - Modified virtio-blk save/load handler to send inuse variable to correctly replay. - Removed configure --enable-ft-mode. - Removed unnecessary check for qemu_realloc(). The first 6 patches modify several functions of qemu to prepare for introducing Kemari specific components. The next 6 patches are the components of Kemari. They introduce event-tap and the FT transaction protocol file based on buffered file. The design document of FT transaction protocol can be found at http://wiki.qemu.org/images/b/b1/Kemari_sender_receiver_0.5a.pdf Then the following 2 patches modify net/block layer functions with event-tap functions. Please note that if Kemari is off, event-tap will just pass through, and there is almost no intrusion into existing functions including normal live migration. Finally, the migration layer is modified to support Kemari in the last 5 patches. Again, there shouldn't be any effect if a user doesn't specify Kemari specific options. The transaction is now async on both sender and receiver side. The sender side respects the max_downtime to decide when to switch from async to sync mode. The repository contains all patches I'm sending with this message. For those who want to try, please pull the following repository. It also includes dirty bitmap optimization which isn't ready for posting yet. To remove the dirty bitmap optimization, please look at HEAD~5 of the tree. git://kemari.git.sourceforge.net/gitroot/kemari/kemari next Thanks, Yoshi Yoshiaki Tamura (19): Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer(). 
Introduce read() to FdMigrationState. Introduce skip_header parameter to qemu_loadvm_state(). qemu-char: export socket_set_nodelay(). vl.c: add deleted flag for deleting the handler. virtio: decrement last_avail_idx with inuse before saving. Introduce fault tolerant VM transaction QEMUFile and ft_mode. savevm: introduce util functions to control ft_trans_file from savevm layer. Introduce event-tap. Call init handler of event-tap at main() in vl.c. ioport: insert event_tap_ioport() to ioport_write(). Insert event_tap_mmio() to cpu_physical_memory_rw() in exec.c. net: insert event-tap to qemu_send_packet() and qemu_sendv_packet_async(). block: insert event-tap to bdrv_aio_writev() and bdrv_aio_flush(). savevm: introduce qemu_savevm_trans_{begin,commit}. migration: introduce migrate_ft_trans_{put,get}_ready(), and modify migrate_fd_put_ready() when ft_mode is on. migration-tcp: modify tcp_accept_incoming_migration
[PATCH 09/19] Introduce event-tap.
event-tap controls when to start an FT transaction, and provides proxy functions to be called from net/block devices. During an FT transaction, it queues up net/block requests, and flushes them when the transaction gets completed. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.target |1 + event-tap.c | 846 +++ event-tap.h | 42 +++ qemu-tool.c | 24 ++ trace-events|9 + 5 files changed, 922 insertions(+), 0 deletions(-) create mode 100644 event-tap.c create mode 100644 event-tap.h diff --git a/Makefile.target b/Makefile.target index e15b1c4..f36cd75 100644 --- a/Makefile.target +++ b/Makefile.target @@ -199,6 +199,7 @@ obj-y += rwhandler.o obj-$(CONFIG_KVM) += kvm.o kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o LIBS+=-lz +obj-y += event-tap.o QEMU_CFLAGS += $(VNC_TLS_CFLAGS) QEMU_CFLAGS += $(VNC_SASL_CFLAGS) diff --git a/event-tap.c b/event-tap.c new file mode 100644 index 000..66a501b --- /dev/null +++ b/event-tap.c @@ -0,0 +1,846 @@ +/* + * Event Tap functions for QEMU + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include qemu-common.h +#include qemu-error.h +#include block.h +#include block_int.h +#include ioport.h +#include osdep.h +#include sysemu.h +#include hw/hw.h +#include net.h +#include event-tap.h +#include trace.h + +enum EVENT_TAP_STATE { +EVENT_TAP_OFF, +EVENT_TAP_ON, +EVENT_TAP_FLUSH, +EVENT_TAP_LOAD, +EVENT_TAP_REPLAY, +}; + +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF; +static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */ + +typedef struct EventTapIOport { +uint32_t address; +uint32_t data; +int index; +} EventTapIOport; + +#define MMIO_BUF_SIZE 8 + +typedef struct EventTapMMIO { +uint64_t address; +uint8_t buf[MMIO_BUF_SIZE]; +int len; +} EventTapMMIO; + +typedef struct EventTapNetReq { +char *device_name; +int iovcnt; +struct iovec *iov; +int vlan_id; +bool vlan_needed; +bool async; +NetPacketSent *sent_cb; +} EventTapNetReq; + +#define MAX_BLOCK_REQUEST 32 + +typedef struct EventTapBlkReq { +char *device_name; +int num_reqs; +int num_cbs; +bool is_flush; +BlockRequest reqs[MAX_BLOCK_REQUEST]; +BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST]; +void *opaque[MAX_BLOCK_REQUEST]; +} EventTapBlkReq; + +#define EVENT_TAP_IOPORT (1 0) +#define EVENT_TAP_MMIO (1 1) +#define EVENT_TAP_NET(1 2) +#define EVENT_TAP_BLK(1 3) + +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1) + +typedef struct EventTapLog { +int mode; +union { +EventTapIOport ioport; +EventTapMMIO mmio; +}; +union { +EventTapNetReq net_req; +EventTapBlkReq blk_req; +}; +QTAILQ_ENTRY(EventTapLog) node; +} EventTapLog; + +static EventTapLog *last_event_tap; + +static QTAILQ_HEAD(, EventTapLog) event_list; +static QTAILQ_HEAD(, EventTapLog) event_pool; + +static int (*event_tap_cb)(void); +static QEMUBH *event_tap_bh; +static VMChangeStateEntry *vmstate; + +static void event_tap_bh_cb(void *p) +{ +if (event_tap_cb) { +event_tap_cb(); +} + +qemu_bh_delete(event_tap_bh); +event_tap_bh = NULL; +} + +static void event_tap_schedule_bh(void) +{ 
+trace_event_tap_ignore_bh(!!event_tap_bh); + +/* if bh is already set, we ignore it for now */ +if (event_tap_bh) { +return; +} + +event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL); +qemu_bh_schedule(event_tap_bh); + +return ; +} + +static void event_tap_alloc_net_req(EventTapNetReq *net_req, + VLANClientState *vc, + const struct iovec *iov, int iovcnt, + NetPacketSent *sent_cb, bool async) +{ +int i; + +net_req-iovcnt = iovcnt; +net_req-async = async; +net_req-device_name = qemu_strdup(vc-name); +net_req-sent_cb = sent_cb; + +if (vc-vlan) { +net_req-vlan_needed = 1; +net_req-vlan_id = vc-vlan-id; +} else { +net_req-vlan_needed = 0; +} + +if (async) { +net_req-iov = (struct iovec *)iov; +} else { +net_req-iov = qemu_malloc(sizeof(struct iovec) * iovcnt); +for (i = 0; i iovcnt; i++) { +net_req-iov[i].iov_base = qemu_malloc(iov[i].iov_len); +memcpy(net_req-iov[i].iov_base, iov[i].iov_base, iov[i].iov_len); +net_req-iov[i].iov_len = iov[i].iov_len; +} +} +} + +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req, +BlockDriverState *bs, BlockRequest *reqs, +int num_reqs, BlockDriverCompletionFunc *cb
[PATCH 11/19] ioport: insert event_tap_ioport() to ioport_write().
Record ioport event to replay it upon failover. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- ioport.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/ioport.c b/ioport.c index aa4188a..74aebf5 100644 --- a/ioport.c +++ b/ioport.c @@ -27,6 +27,7 @@ #include ioport.h #include trace.h +#include event-tap.h /***/ /* IO Port */ @@ -76,6 +77,7 @@ static void ioport_write(int index, uint32_t address, uint32_t data) default_ioport_writel }; IOPortWriteFunc *func = ioport_write_table[index][address]; +event_tap_ioport(index, address, data); if (!func) func = default_func[index]; func(ioport_opaque[address], address, data); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/19] Introduce fault tolerant VM transaction QEMUFile and ft_mode.
This code implements VM transaction protocol. Like buffered_file, it sits between savevm and migration layer. With this architecture, VM transaction protocol is implemented mostly independent from other existing code. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp Signed-off-by: OHMURA Kei ohmura@lab.ntt.co.jp --- Makefile.objs |1 + ft_trans_file.c | 624 +++ ft_trans_file.h | 72 +++ migration.c |3 + trace-events| 16 ++ 5 files changed, 716 insertions(+), 0 deletions(-) create mode 100644 ft_trans_file.c create mode 100644 ft_trans_file.h diff --git a/Makefile.objs b/Makefile.objs index c3e52c5..de38579 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -100,6 +100,7 @@ common-obj-y += msmouse.o ps2.o common-obj-y += qdev.o qdev-properties.o common-obj-y += block-migration.o common-obj-y += pflib.o +common-obj-y += ft_trans_file.o common-obj-$(CONFIG_BRLAPI) += baum.o common-obj-$(CONFIG_POSIX) += migration-exec.o migration-unix.o migration-fd.o diff --git a/ft_trans_file.c b/ft_trans_file.c new file mode 100644 index 000..2b42b95 --- /dev/null +++ b/ft_trans_file.c @@ -0,0 +1,624 @@ +/* + * Fault tolerant VM transaction QEMUFile + * + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This source code is based on buffered_file.c. + * Copyright IBM, Corp. 
2008 + * Authors: + * Anthony Liguorialigu...@us.ibm.com + */ + +#include qemu-common.h +#include qemu-error.h +#include hw/hw.h +#include qemu-timer.h +#include sysemu.h +#include qemu-char.h +#include trace.h +#include ft_trans_file.h + +typedef struct FtTransHdr +{ +uint16_t cmd; +uint16_t id; +uint32_t seq; +uint32_t payload_len; +} FtTransHdr; + +typedef struct QEMUFileFtTrans +{ +FtTransPutBufferFunc *put_buffer; +FtTransGetBufferFunc *get_buffer; +FtTransPutReadyFunc *put_ready; +FtTransGetReadyFunc *get_ready; +FtTransWaitForUnfreezeFunc *wait_for_unfreeze; +FtTransCloseFunc *close; +void *opaque; +QEMUFile *file; + +enum QEMU_VM_TRANSACTION_STATE state; +uint32_t seq; +uint16_t id; + +int has_error; + +bool freeze_output; +bool freeze_input; +bool rate_limit; +bool is_sender; +bool is_payload; + +uint8_t *buf; +size_t buf_max_size; +size_t put_offset; +size_t get_offset; + +FtTransHdr header; +size_t header_offset; +} QEMUFileFtTrans; + +#define IO_BUF_SIZE 32768 + +static void ft_trans_append(QEMUFileFtTrans *s, +const uint8_t *buf, size_t size) +{ +if (size (s-buf_max_size - s-put_offset)) { +trace_ft_trans_realloc(s-buf_max_size, size + 1024); +s-buf_max_size += size + 1024; +s-buf = qemu_realloc(s-buf, s-buf_max_size); +} + +trace_ft_trans_append(size); +memcpy(s-buf + s-put_offset, buf, size); +s-put_offset += size; +} + +static void ft_trans_flush(QEMUFileFtTrans *s) +{ +size_t offset = 0; + +if (s-has_error) { +error_report(flush when error %d, bailing, s-has_error); +return; +} + +while (offset s-put_offset) { +ssize_t ret; + +ret = s-put_buffer(s-opaque, s-buf + offset, s-put_offset - offset); +if (ret == -EAGAIN) { +break; +} + +if (ret = 0) { +error_report(error flushing data, %s, strerror(errno)); +s-has_error = FT_TRANS_ERR_FLUSH; +break; +} else { +offset += ret; +} +} + +trace_ft_trans_flush(offset, s-put_offset); +memmove(s-buf, s-buf + offset, s-put_offset - offset); +s-put_offset -= offset; +s-freeze_output = !!s-put_offset; +} + +static 
ssize_t ft_trans_put(void *opaque, void *buf, int size) +{ +QEMUFileFtTrans *s = opaque; +size_t offset = 0; +ssize_t len; + +/* flush buffered data before putting next */ +if (s-put_offset) { +ft_trans_flush(s); +} + +while (!s-freeze_output offset size) { +len = s-put_buffer(s-opaque, (uint8_t *)buf + offset, size - offset); + +if (len == -EAGAIN) { +trace_ft_trans_freeze_output(); +s-freeze_output = 1; +break; +} + +if (len = 0) { +error_report(putting data failed, %s, strerror(errno)); +s-has_error = 1; +offset = -EINVAL; +break; +} + +offset += len; +} + +if (s-freeze_output) { +ft_trans_append(s, buf + offset, size - offset); +offset = size; +} + +return offset; +} + +static int ft_trans_send_header(QEMUFileFtTrans *s, +enum QEMU_VM_TRANSACTION_STATE state, +uint32_t payload_len) +{ +int ret; +FtTransHdr
[PATCH 18/19] Introduce -k option to enable FT migration mode (Kemari).
When -k option is set to migrate command, it will turn on ft_mode to start FT migration mode (Kemari). Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hmp-commands.hx |7 --- migration.c |4 qmp-commands.hx |7 --- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/hmp-commands.hx b/hmp-commands.hx index 1cea572..b7f8f2f 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -735,13 +735,14 @@ ETEXI { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, diff --git a/migration.c b/migration.c index fb73b2d..11bbdf8 100644 --- a/migration.c +++ b/migration.c @@ -92,6 +92,10 @@ int do_migrate(Monitor *mon, const QDict *qdict, QObject **ret_data) return -1; } +if (qdict_get_try_bool(qdict, ft, 0)) { +ft_mode = FT_INIT; +} + if (strstart(uri, tcp:, p)) { s = tcp_start_outgoing_migration(mon, p, max_throttle, detach, blk, inc); diff --git a/qmp-commands.hx b/qmp-commands.hx index 56c4d8b..1521931 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -431,13 +431,14 @@ EQMP { .name = migrate, -.args_type = detach:-d,blk:-b,inc:-i,uri:s, -.params = [-d] [-b] [-i] uri, +.args_type = detach:-d,blk:-b,inc:-i,ft:-k,uri:s, +.params = [-d] [-b] [-i] [-k] uri, .help = migrate to URI (using -d to not wait for completion) \n\t\t\t -b for migration without shared storage with full copy of disk\n\t\t\t -i for migration without shared storage with incremental copy of disk - (base image shared between src 
and destination), + (base image shared between src and destination) + \n\t\t\t -k for Fault Tolerance mode (Kemari protocol), .user_print = monitor_user_noop, .mhandler.cmd_new = do_migrate, }, -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/19] Call init handler of event-tap at main() in vl.c.
Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- vl.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/vl.c b/vl.c index 8bbb785..9faeb27 100644 --- a/vl.c +++ b/vl.c @@ -162,6 +162,7 @@ int main(int argc, char **argv) #include qemu-queue.h #include cpus.h #include arch_init.h +#include event-tap.h #include ui/qemu-spice.h @@ -2895,6 +2896,8 @@ int main(int argc, char **argv, char **envp) blk_mig_init(); +event_tap_init(); + if (default_cdrom) { /* we always create the cdrom drive, even if no disk is there */ drive_add(NULL, CDROM_ALIAS); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/19] Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer().
Currently buf size is fixed at 32KB. It would be useful if it could be flexible. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- hw/hw.h |2 ++ savevm.c | 20 +++- 2 files changed, 21 insertions(+), 1 deletions(-) diff --git a/hw/hw.h b/hw/hw.h index 163a683..a506688 100644 --- a/hw/hw.h +++ b/hw/hw.h @@ -58,6 +58,8 @@ void qemu_fflush(QEMUFile *f); int qemu_fclose(QEMUFile *f); void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size); void qemu_put_byte(QEMUFile *f, int v); +void *qemu_realloc_buffer(QEMUFile *f, int size); +void qemu_clear_buffer(QEMUFile *f); static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v) { diff --git a/savevm.c b/savevm.c index 90aa237..8c64c63 100644 --- a/savevm.c +++ b/savevm.c @@ -172,7 +172,8 @@ struct QEMUFile { when reading */ int buf_index; int buf_size; /* 0 when writing */ -uint8_t buf[IO_BUF_SIZE]; +int buf_max_size; +uint8_t *buf; int has_error; }; @@ -423,6 +424,9 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer, f-get_rate_limit = get_rate_limit; f-is_write = 0; +f-buf_max_size = IO_BUF_SIZE; +f-buf = qemu_malloc(sizeof(uint8_t) * f-buf_max_size); + return f; } @@ -453,6 +457,19 @@ void qemu_fflush(QEMUFile *f) } } +void *qemu_realloc_buffer(QEMUFile *f, int size) +{ +f-buf_max_size = size; +f-buf = qemu_realloc(f-buf, f-buf_max_size); + +return f-buf; +} + +void qemu_clear_buffer(QEMUFile *f) +{ +f-buf_size = f-buf_index = f-buf_offset = 0; +} + static void qemu_fill_buffer(QEMUFile *f) { int len; @@ -478,6 +495,7 @@ int qemu_fclose(QEMUFile *f) qemu_fflush(f); if (f-close) ret = f-close(f-opaque); +qemu_free(f-buf); qemu_free(f); return ret; } -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/19] Introduce read() to FdMigrationState.
Currently FdMigrationState doesn't support read(), and this patch introduces it to get response from the other side. Signed-off-by: Yoshiaki Tamura tamura.yoshi...@lab.ntt.co.jp --- migration-tcp.c | 15 +++ migration.c | 13 + migration.h |3 +++ 3 files changed, 31 insertions(+), 0 deletions(-) diff --git a/migration-tcp.c b/migration-tcp.c index b55f419..55777c8 100644 --- a/migration-tcp.c +++ b/migration-tcp.c @@ -39,6 +39,20 @@ static int socket_write(FdMigrationState *s, const void * buf, size_t size) return send(s-fd, buf, size, 0); } +static int socket_read(FdMigrationState *s, const void * buf, size_t size) +{ +ssize_t len; + +do { +len = recv(s-fd, (void *)buf, size, 0); +} while (len == -1 socket_error() == EINTR); +if (len == -1) { +len = -socket_error(); +} + +return len; +} + static int tcp_close(FdMigrationState *s) { DPRINTF(tcp_close\n); @@ -94,6 +108,7 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon, s-get_error = socket_errno; s-write = socket_write; +s-read = socket_read; s-close = tcp_close; s-mig_state.cancel = migrate_fd_cancel; s-mig_state.get_status = migrate_fd_get_status; diff --git a/migration.c b/migration.c index e5ba51c..4a749bb 100644 --- a/migration.c +++ b/migration.c @@ -330,6 +330,19 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size) return ret; } +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size) +{ +FdMigrationState *s = opaque; +int ret; + +ret = s-read(s, data, size); +if (ret == -1) { +ret = -(s-get_error(s)); +} + +return ret; +} + void migrate_fd_connect(FdMigrationState *s) { int ret; diff --git a/migration.h b/migration.h index d13ed4f..7bf6747 100644 --- a/migration.h +++ b/migration.h @@ -47,6 +47,7 @@ struct FdMigrationState int (*get_error)(struct FdMigrationState*); int (*close)(struct FdMigrationState*); int (*write)(struct FdMigrationState*, const void *, size_t); +int (*read)(struct FdMigrationState *, const void *, size_t); void *opaque; }; @@ 
-115,6 +116,8 @@ void migrate_fd_put_notify(void *opaque); ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size); +int migrate_fd_get_buffer(void *opaque, uint8_t *data, int64_t pos, size_t size); + void migrate_fd_connect(FdMigrationState *s); void migrate_fd_put_ready(void *opaque); -- 1.7.1.2 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html