Hi Michael, I may be missing something here, but why do you need a RAM_SAVE_FLAG_RDMA flag? You don't do any decoding on the destination.
I would suggest creating a QEMUFileRDMA and moving the write/read code into it. You can either add a new rdma_buffer QEMUFileOps or add the address to put_buffer. You also have some whitespace damage at the beginning of savevm.c. Regards, Orit On 01/29/2013 12:01 AM, mrhi...@linux.vnet.ibm.com wrote: > From: "Michael R. Hines" <mrhi...@us.ibm.com> > > > Signed-off-by: Michael R. Hines <mrhi...@us.ibm.com> > --- > arch_init.c | 116 > +++++++++++++++++++++++++++++++++++++++-- > include/migration/qemu-file.h | 1 + > savevm.c | 90 +++++++++++++++++++++++++++----- > 3 files changed, 189 insertions(+), 18 deletions(-) > > diff --git a/arch_init.c b/arch_init.c > index dada6de..7633fa6 100644 > --- a/arch_init.c > +++ b/arch_init.c > @@ -42,6 +42,7 @@ > #include "migration/migration.h" > #include "exec/gdbstub.h" > #include "hw/smbios.h" > +#include "qemu/rdma.h" > #include "exec/address-spaces.h" > #include "hw/pcspk.h" > #include "migration/page_cache.h" > @@ -113,6 +114,7 @@ const uint32_t arch_type = QEMU_ARCH; > #define RAM_SAVE_FLAG_EOS 0x10 > #define RAM_SAVE_FLAG_CONTINUE 0x20 > #define RAM_SAVE_FLAG_XBZRLE 0x40 > +#define RAM_SAVE_FLAG_RDMA 0x80 > > #ifdef __ALTIVEC__ > #include <altivec.h> > @@ -434,6 +436,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) > int bytes_sent = 0; > MemoryRegion *mr; > ram_addr_t current_addr; > + static int not_sent = 1; > > if (!block) > block = QTAILQ_FIRST(&ram_list.blocks); > @@ -457,23 +460,75 @@ static int ram_save_block(QEMUFile *f, bool last_stage) > int cont = (block == last_sent_block) ? > RAM_SAVE_FLAG_CONTINUE : 0; > > + current_addr = block->offset + offset; > p = memory_region_get_ram_ptr(mr) + offset; > > /* In doubt sent page as normal */ > bytes_sent = -1; > - if (is_dup_page(p)) { > + > + /* > + * RFC RDMA: The empirical cost of searching for zero pages here > + * plus the cost of communicating with the other side > + * seems to take significantly more time than simply > + * dumping the page into remote memory.
> + */ > + if (!qemu_rdma_migration_enabled() && is_dup_page(p)) { > acct_info.dup_pages++; > bytes_sent = save_block_hdr(f, block, offset, cont, > RAM_SAVE_FLAG_COMPRESS); > qemu_put_byte(f, *p); > bytes_sent += 1; > + /* > + * RFC RDMA: Same comment as above. time(run-length encoding) > + * + time(communication) is too big. RDMA throughput > tanks > + * when this feature is enabled. But there's no need > + * to change the code since the feature is optional. > + */ > } else if (migrate_use_xbzrle()) { > - current_addr = block->offset + offset; > bytes_sent = save_xbzrle_page(f, p, current_addr, block, > offset, cont, last_stage); > if (!last_stage) { > p = get_cached_data(XBZRLE.cache, current_addr); > } > + } else if (qemu_rdma_migration_enabled()) { > + int ret; > + > + /* > + * RFC RDMA: This bad hack was to cause the loop on the > + * receiving side to break. Comments are welcome > + * on how to get rid of it. > + */ > + if (not_sent == 1) { > + not_sent = 0; > + bytes_sent = save_block_hdr(f, block, offset, > + cont, RAM_SAVE_FLAG_RDMA); > + } > + acct_info.norm_pages++; > + /* > + * use RDMA to send page > + */ > + if (qemu_rdma_migration_write(&rdma_mdata, current_addr, > + TARGET_PAGE_SIZE)) { > + fprintf(stderr, "rdma migration: write error!\n"); > + qemu_file_set_error(f, -EIO); > + return 0; > + } > + > + /* > + * do some polling > + */ > + while (1) { > + ret = qemu_rdma_migration_poll(&rdma_mdata); > + if (ret == QEMU_RDMA_MIGRATION_WRID_NONE) { > + break; > + } > + if (ret < 0) { > + fprintf(stderr, "rdma migration: polling error!\n"); > + qemu_file_set_error(f, -EIO); > + return 0; > + } > + } > + bytes_sent += TARGET_PAGE_SIZE; > } > > /* XBZRLE overflow or normal page */ > @@ -601,12 +656,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) > return 0; > } > > + > +int tprate = 1000; > + > static int ram_save_iterate(QEMUFile *f, void *opaque) > { > int ret; > int i; > - int64_t t0; > - int total_sent = 0; > + int64_t t0, tp0; > + int 
total_sent = 0, last_total_sent = 0; > > qemu_mutex_lock_ramlist(); > > @@ -625,23 +683,55 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) > break; > } > total_sent += bytes_sent; > + last_total_sent += bytes_sent; > acct_info.iterations++; > /* we want to check in the 1st loop, just in case it was the 1st time > and we had to sync the dirty bitmap. > qemu_get_clock_ns() is a bit expensive, so we only check each some > iterations > */ > + > + /* > + * RFC RDMA: Can we have something like this to periodically print > + * out throughput. > + * This is just a rough-sketch that partially worked for me. > + * I assume there a better way that everyone would prefer. > + * Perhaps we could set a QMP command that toggled a "periodic > printing" > + * option that allowed more details to be printed on stdout.....? > + */ > if ((i & 63) == 0) { > - uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000; > + uint64_t curr = qemu_get_clock_ns(rt_clock); > + uint64_t t1 = (curr - t0) / 1000000; > + double tp; > if (t1 > MAX_WAIT) { > DPRINTF("big wait: %" PRIu64 " milliseconds, %d > iterations\n", > t1, i); > break; > } > + > + if ((i % tprate) == 0) { > + uint64_t tp1 = (curr - tp0) / 1000000; > + tp = ((double) last_total_sent * 8.0 / > + ((double) tp1 / 1000.0)) / 1000.0 / 1000.0; > + printf("throughput: %f mbps\n", tp); > + last_total_sent = 0; > + tp0 = curr; > + } > } > i++; > } > > + /* flush buffer write */ > + if (qemu_rdma_migration_enabled()) { > + int resp; > + resp = qemu_rdma_migration_write_flush(&rdma_mdata); > + if (resp < 0) { > + fprintf(stderr, "rdma migration: write flush error!\n"); > + qemu_file_set_error(f, -EIO); > + return 0; > + } > + } > + > qemu_mutex_unlock_ramlist(); > > if (ret < 0) { > @@ -863,6 +953,22 @@ static int ram_load(QEMUFile *f, void *opaque, int > version_id) > ret = -EINVAL; > goto done; > } > + } else if (flags & RAM_SAVE_FLAG_RDMA) { > + /* > + * RFC RDMA: This bad hack was to cause the loop break. 
> + * Comments are welcome on how to get rid of it. > + * Communicating here is unnecessary because the > + * RDMA page has already arrived. > + * Comments are welcome on how to get rif of this. > + */ > + if (!qemu_rdma_migration_enabled()) { > + return -EINVAL; > + } > + void *host = host_from_stream_offset(f, addr, flags); > + if (!host) { > + return -EINVAL; > + } > + /* rdma page is already here, nothing to do */ > } > error = qemu_file_get_error(f); > if (error) { > diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h > index 68deefb..7c9968e 100644 > --- a/include/migration/qemu-file.h > +++ b/include/migration/qemu-file.h > @@ -112,6 +112,7 @@ int qemu_file_rate_limit(QEMUFile *f); > int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate); > int64_t qemu_file_get_rate_limit(QEMUFile *f); > int qemu_file_get_error(QEMUFile *f); > +void qemu_file_set_error(QEMUFile *f, int ret); > > static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) > { > diff --git a/savevm.c b/savevm.c > index 304d1ef..071196e 100644 > --- a/savevm.c > +++ b/savevm.c > @@ -24,6 +24,7 @@ > > #include "config-host.h" > #include "qemu-common.h" > +#include "qemu/rdma.h" > #include "hw/hw.h" > #include "hw/qdev.h" > #include "net/net.h" > @@ -50,7 +51,7 @@ > #define ARP_OP_REQUEST_REV 0x3 > > static int announce_self_create(uint8_t *buf, > - uint8_t *mac_addr) > + uint8_t *mac_addr) > { > /* Ethernet header. 
*/ > memset(buf, 0xff, 6); /* destination MAC addr */ > @@ -97,16 +98,16 @@ static void qemu_announce_self_once(void *opaque) > qemu_mod_timer(timer, qemu_get_clock_ms(rt_clock) + > 50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100); > } else { > - qemu_del_timer(timer); > - qemu_free_timer(timer); > + qemu_del_timer(timer); > + qemu_free_timer(timer); > } > } > > void qemu_announce_self(void) > { > - static QEMUTimer *timer; > - timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer); > - qemu_announce_self_once(&timer); > + static QEMUTimer *timer; > + timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer); > + qemu_announce_self_once(&timer); > } > > /***********************************************************/ > @@ -299,8 +300,8 @@ QEMUFile *qemu_fdopen(int fd, const char *mode) > QEMUFileStdio *s; > > if (mode == NULL || > - (mode[0] != 'r' && mode[0] != 'w') || > - mode[1] != 'b' || mode[2] != 0) { > + (mode[0] != 'r' && mode[0] != 'w') || > + mode[1] != 'b' || mode[2] != 0) { > fprintf(stderr, "qemu_fdopen: Argument validity check failed\n"); > return NULL; > } > @@ -342,8 +343,8 @@ QEMUFile *qemu_fopen(const char *filename, const char > *mode) > QEMUFileStdio *s; > > if (mode == NULL || > - (mode[0] != 'r' && mode[0] != 'w') || > - mode[1] != 'b' || mode[2] != 0) { > + (mode[0] != 'r' && mode[0] != 'w') || > + mode[1] != 'b' || mode[2] != 0) { > fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); > return NULL; > } > @@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f) > return f->last_error; > } > > -static void qemu_file_set_error(QEMUFile *f, int ret) > +void qemu_file_set_error(QEMUFile *f, int ret) > { > if (f->last_error == 0) { > f->last_error = ret; > @@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f) > { > SaveStateEntry *se; > int ret = 1; > + static int first_time = 1; > > QTAILQ_FOREACH(se, &savevm_handlers, entry) { > if (!se->ops || !se->ops->save_live_iterate) { > @@ -1643,8 +1645,36 @@ 
int qemu_savevm_state_iterate(QEMUFile *f) > } > } > if (ret != 0) { > +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC > + /* > + * We use two "sync" infiniband messages happen during migration. > + * One at the beginning and one at the end, just to be thorough. > + * This is the first one. > + */ > + if (first_time && qemu_rdma_migration_enabled()) { > + int r; > + first_time = 0; > + if (qemu_rdma_migration_post_send_sync(&rdma_mdata, > + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC)) { > + fprintf(stderr, > + "rdma migration: error posting extra send sync!\n"); > + return -EIO; > + } > + > + r = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, > + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC); > + if (r < 0) { > + fprintf(stderr, > + "rdma migration: qemu_savevm_state_iterate" > + " sync polling error!\n"); > + return -EIO; > + } > + } > +#endif > + > return ret; > } > + > ret = qemu_file_get_error(f); > if (ret != 0) { > qemu_savevm_state_cancel(); > @@ -1684,7 +1714,7 @@ int qemu_savevm_state_complete(QEMUFile *f) > int len; > > if ((!se->ops || !se->ops->save_state) && !se->vmsd) { > - continue; > + continue; > } > trace_savevm_section_start(); > /* Section type */ > @@ -1703,8 +1733,32 @@ int qemu_savevm_state_complete(QEMUFile *f) > trace_savevm_section_end(se->section_id); > } > > + /* > + * We use two "sync" infiniband messages happen during migration. > + * One at the beginning and one at the end, just to be thorough. > + * This is the second one. 
> + */ > + if (qemu_rdma_migration_enabled()) { > + if (qemu_rdma_migration_post_send_sync(&rdma_mdata, > + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC)) { > + fprintf(stderr, "rdma migration: error posting send sync!\n"); > + return -EIO; > + } > + } > + > qemu_put_byte(f, QEMU_VM_EOF); > > + /* wait for RDMA sync message to complete */ > + if (qemu_rdma_migration_enabled()) { > + int ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, > + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC); > + if (ret < 0) { > + fprintf(stderr, "rdma migration: qemu_savevm_state_full" > + " sync polling error!\n"); > + return -EIO; > + } > + } > + > return qemu_file_get_error(f); > } > > @@ -2014,8 +2068,18 @@ int qemu_loadvm_state(QEMUFile *f) > > cpu_synchronize_all_post_init(); > > - ret = 0; > + /* wait for RDMA sync message */ > + if (qemu_rdma_migration_enabled()) { > + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, > + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC); > + if (ret < 0) { > + fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header" > + " sync polling error!\n"); > + goto out; > + } > + } > > + ret = 0; > out: > QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) { > QLIST_REMOVE(le, entry); >