On 06/24/2013 12:47 PM, Chegu Vinod wrote:
> If a user chooses to turn on the auto-converge migration capability
> these changes detect the lack of convergence and throttle down the
> guest, i.e. force the VCPUs out of the guest for some duration
> and let the migration thread catch up and help converge.
>
> Verified the convergence using the following:
>  - Java Warehouse workload running on a 20VCPU/256G guest (~80% busy)
>  - OLTP-like workload running on an 80VCPU/512G guest (~80% busy)
>
> Sample results with the Java warehouse workload (migrate speed set to
> 20Gb and migrate downtime set to 4 seconds):
>
> (qemu) info migrate
> capabilities: xbzrle: off auto-converge: off  <----
> Migration status: active
> total time: 1487503 milliseconds
> expected downtime: 519 milliseconds
> transferred ram: 383749347 kbytes
> remaining ram: 2753372 kbytes
> total ram: 268444224 kbytes
> duplicate: 65461532 pages
> skipped: 64901568 pages
> normal: 95750218 pages
> normal bytes: 383000872 kbytes
> dirty pages rate: 67551 pages
>
> ---
>
> (qemu) info migrate
> capabilities: xbzrle: off auto-converge: on  <----
> Migration status: completed
> total time: 241161 milliseconds
> downtime: 6373 milliseconds
> transferred ram: 28235307 kbytes
> remaining ram: 0 kbytes
> total ram: 268444224 kbytes
> duplicate: 64946416 pages
> skipped: 64903523 pages
> normal: 7044971 pages
> normal bytes: 28179884 kbytes
>
> Signed-off-by: Chegu Vinod <chegu_vi...@hp.com>
> ---
>  arch_init.c |   79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 79 insertions(+), 0 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index a8b91ee..e7ca3b1 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -104,6 +104,9 @@ int graphic_depth = 15;
>  #endif
>
>  const uint32_t arch_type = QEMU_ARCH;
> +static bool mig_throttle_on;
> +static int dirty_rate_high_cnt;
> +static void check_guest_throttling(void);
>
>  /***********************************************************/
>  /* ram save/restore */
> @@ -378,8 +381,14 @@ static void migration_bitmap_sync(void)
>      uint64_t num_dirty_pages_init = migration_dirty_pages;
>      MigrationState *s = migrate_get_current();
>      static int64_t start_time;
> +    static int64_t bytes_xfer_prev;
>      static int64_t num_dirty_pages_period;
>      int64_t end_time;
> +    int64_t bytes_xfer_now;
> +
> +    if (!bytes_xfer_prev) {
> +        bytes_xfer_prev = ram_bytes_transferred();
> +    }
>
>      if (!start_time) {
>          start_time = qemu_get_clock_ms(rt_clock);
> @@ -404,6 +413,23 @@ static void migration_bitmap_sync(void)
>
>      /* more than 1 second = 1000 milliseconds */
>      if (end_time > start_time + 1000) {
> +        if (migrate_auto_converge()) {
> +            /* The following detection logic can be refined later. For now:
> +               check to see if the bytes dirtied in this period exceed 50% of
> +               the bytes that got transferred since the last time we were in
> +               this routine. If that happens >N times (for now N==4) we turn
> +               on the throttle-down logic */
> +            bytes_xfer_now = ram_bytes_transferred();
> +            if (s->dirty_pages_rate &&
> +                (num_dirty_pages_period * TARGET_PAGE_SIZE >
> +                 (bytes_xfer_now - bytes_xfer_prev)/2) &&
> +                (dirty_rate_high_cnt++ > 4)) {
> +                trace_migration_throttle();
> +                mig_throttle_on = true;
> +                dirty_rate_high_cnt = 0;
> +            }
> +            bytes_xfer_prev = bytes_xfer_now;
> +        } else {
> +            mig_throttle_on = false;
> +        }
>          s->dirty_pages_rate = num_dirty_pages_period * 1000
>              / (end_time - start_time);
>          s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
> @@ -566,6 +592,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      migration_bitmap = bitmap_new(ram_pages);
>      bitmap_set(migration_bitmap, 0, ram_pages);
>      migration_dirty_pages = ram_pages;
> +    mig_throttle_on = false;
> +    dirty_rate_high_cnt = 0;
>
>      if (migrate_use_xbzrle()) {
>          XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
> @@ -628,6 +656,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>          }
>          total_sent += bytes_sent;
>          acct_info.iterations++;
> +        check_guest_throttling();
>          /* we want to check in the 1st loop, just in case it was the 1st time
>             and we had to sync the dirty bitmap.
>             qemu_get_clock_ns() is a bit expensive, so we only check each some
> @@ -1097,3 +1126,53 @@ TargetInfo *qmp_query_target(Error **errp)
>
>      return info;
>  }
> +
> +/* Stub function that gets run on the vcpu when it's brought out of the
> +   VM to run inside qemu via async_run_on_cpu() */
> +static void mig_sleep_cpu(void *opq)
> +{
> +    qemu_mutex_unlock_iothread();
> +    g_usleep(30*1000);
> +    qemu_mutex_lock_iothread();
> +}
> +
> +/* To reduce the dirty rate, explicitly disallow the VCPUs from spending
> +   much time in the VM. The migration thread will try to catch up.
> +   The workload will experience a performance drop.
> +*/
> +static void mig_throttle_cpu_down(CPUState *cpu, void *data)
> +{
> +    async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
> +}
> +
> +static void mig_throttle_guest_down(void)
> +{
> +    qemu_mutex_lock_iothread();
> +    qemu_for_each_cpu(mig_throttle_cpu_down, NULL);
> +    qemu_mutex_unlock_iothread();
> +}
> +
> +static void check_guest_throttling(void)
> +{
> +    static int64_t t0;
> +    int64_t t1;
> +
> +    if (!mig_throttle_on) {
> +        return;
> +    }
> +
> +    if (!t0) {
> +        t0 = qemu_get_clock_ns(rt_clock);
> +        return;
> +    }
> +
> +    t1 = qemu_get_clock_ns(rt_clock);
> +
> +    /* If it has been more than 40 ms since the last time the guest
> +     * was throttled, then do it again.
> +     */
> +    if (40 < (t1-t0)/1000000) {
> +        mig_throttle_guest_down();
> +        t0 = t1;
> +    }
> +}
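The trigger heuristic is easy to lose inside the hunk, so here it is once more
in isolation. This is only an illustrative sketch, not code from the patch:
detect_non_convergence() and its two byte-count parameters are hypothetical
stand-ins for what migration_bitmap_sync() computes from
num_dirty_pages_period, TARGET_PAGE_SIZE and ram_bytes_transferred().

    #include <stdbool.h>
    #include <stdint.h>

    static int dirty_rate_high_cnt;   /* sync periods flagged "too dirty" */
    static bool mig_throttle_on;      /* read by check_guest_throttling() */

    /* Called once per bitmap-sync period (roughly once a second).
     * If the guest dirtied more than half as many bytes as the
     * migration managed to transfer in the same period, count the
     * period against it; once more than 4 periods have been flagged,
     * start throttling the vcpus. */
    static void detect_non_convergence(uint64_t dirty_bytes_period,
                                       uint64_t xfer_bytes_period)
    {
        if (dirty_bytes_period > xfer_bytes_period / 2 &&
            dirty_rate_high_cnt++ > 4) {
            dirty_rate_high_cnt = 0;
            mig_throttle_on = true;
        }
    }

One design point worth noting: the counter is only reset when the throttle
actually engages, so the flagged periods do not have to be consecutive for
throttling to kick in.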
Reviewed-by: Orit Wasserman <owass...@redhat.com>
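For anyone wanting to try this out: the capability is off by default and has
to be enabled before the migration is started. A minimal HMP session matching
the test setup described above might look like this (the destination URI is a
placeholder; speed and downtime values mirror the cover letter):

    (qemu) migrate_set_capability auto-converge on
    (qemu) migrate_set_speed 20G
    (qemu) migrate_set_downtime 4
    (qemu) migrate -d tcp:dest-host:4444
    (qemu) info migrate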