This version improves support for multiple monitors and has been ported up to HEAD as of 01/14.
Changes since V6:
 - Integrated with virtio qdev feature bit changes (specifically: use
   VirtIODevice 'guest_features' to check whether memory stats is a
   negotiated feature)
 - Track which monitor initiated the most recent stats request. Now it does
   the Right Thing(tm) with multiple monitors making parallel requests.

Changes since V5:
 - Asynchronous query-balloon mode for QMP
 - Add timeout to prevent hanging the user monitor in synchronous mode

Changes since V4:
 - Virtio spec updated:
   http://ozlabs.org/~rusty/virtio-spec/virtio-spec-0.8.2.pdf
 - Guest-side Linux implementation applied by Rusty
 - Start using the QObject infrastructure
 - All endian conversions done in the host
 - Report stats that reference a quantity of memory in bytes

Changes since V3:
 - Increase stat field size to 64 bits
 - Report all sizes in kB (not pages)
 - Drop anon_pages stat

Changes since V2:
 - Use a virtqueue for communication instead of the device config space

Changes since V1:
 - In the monitor, print all stats on one line with less abbreviated names
 - Coding style changes

When using ballooning to manage overcommitted memory on a host, a system for
guests to communicate their memory usage to the host can provide information
that will minimize the impact of ballooning on the guests. The current method
employs a daemon running in each guest that communicates memory statistics to
a host daemon at a specified time interval. The host daemon aggregates this
information and inflates and/or deflates balloons according to the level of
host memory pressure. This approach is effective but overly complex since a
daemon must be installed inside each guest and coordinated to communicate with
the host. A simpler approach is to collect memory statistics in the virtio
balloon driver and communicate them directly to the hypervisor.
Signed-off-by: Adam Litke <a...@us.ibm.com> To: Anthony Liguori <aligu...@us.ibm.com> Cc: Avi Kivity <a...@redhat.com> Cc: Luiz Capitulino <lcapitul...@redhat.com> Cc: qemu-devel@nongnu.org diff --git a/balloon.h b/balloon.h index 60b4a5d..684ea9e 100644 --- a/balloon.h +++ b/balloon.h @@ -16,12 +16,15 @@ #include "cpu-defs.h" -typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target); +/* Timeout for synchronous stats requests (in seconds) */ +#define QEMU_BALLOON_SYNC_TIMEOUT 5 + +typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target, Monitor *mon); void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque); -void qemu_balloon(ram_addr_t target); +int qemu_balloon(ram_addr_t target); -ram_addr_t qemu_balloon_status(void); +int qemu_balloon_status(Monitor *mon); #endif diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c index e17880f..b68bad7 100644 --- a/hw/virtio-balloon.c +++ b/hw/virtio-balloon.c @@ -19,6 +19,10 @@ #include "balloon.h" #include "virtio-balloon.h" #include "kvm.h" +#include "monitor.h" +#include "qlist.h" +#include "qint.h" +#include "qstring.h" #if defined(__linux__) #include <sys/mman.h> @@ -27,9 +31,16 @@ typedef struct VirtIOBalloon { VirtIODevice vdev; - VirtQueue *ivq, *dvq; + VirtQueue *ivq, *dvq, *svq; uint32_t num_pages; uint32_t actual; + uint64_t stats[VIRTIO_BALLOON_S_NR]; + VirtQueueElement stats_vq_elem; + size_t stats_vq_offset; + QEMUTimer *stats_timer; + uint64_t stats_updated; + Monitor *stats_mon; + bool stats_requested; } VirtIOBalloon; static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev) @@ -46,6 +57,50 @@ static void balloon_page(void *addr, int deflate) #endif } +/* + * reset_stats - Mark all items in the stats array as unset + * + * This function needs to be called at device intialization and before + * before updating to a set of newly-generated stats. This will ensure that no + * stale values stick around in case the guest reports a subset of the supported + * statistics. 
+ */ +static inline void reset_stats(VirtIOBalloon *dev) +{ + int i; + for (i = 0; i < VIRTIO_BALLOON_S_NR; dev->stats[i++] = -1); + dev->stats_updated = qemu_get_clock(host_clock); +} + +static void stat_put(QDict *dict, const char *label, uint64_t val) +{ + if (val != -1) + qdict_put(dict, label, qint_from_int(val)); +} + +static QObject *get_stats_qobject(VirtIOBalloon *dev) +{ + QDict *dict = qdict_new(); + uint32_t actual = ram_size - (dev->actual << VIRTIO_BALLOON_PFN_SHIFT); + uint64_t age; + + stat_put(dict, "actual", actual); + stat_put(dict, "mem_swapped_in", dev->stats[VIRTIO_BALLOON_S_SWAP_IN]); + stat_put(dict, "mem_swapped_out", dev->stats[VIRTIO_BALLOON_S_SWAP_OUT]); + stat_put(dict, "major_page_faults", dev->stats[VIRTIO_BALLOON_S_MAJFLT]); + stat_put(dict, "minor_page_faults", dev->stats[VIRTIO_BALLOON_S_MINFLT]); + stat_put(dict, "free_mem", dev->stats[VIRTIO_BALLOON_S_MEMFREE]); + stat_put(dict, "total_mem", dev->stats[VIRTIO_BALLOON_S_MEMTOT]); + + /* If age is over the timeout threshold, report it */ + age = (qemu_get_clock(host_clock) - dev->stats_updated) / + (get_ticks_per_sec() / 1000); + if (age >= QEMU_BALLOON_SYNC_TIMEOUT * 1000) + stat_put(dict, "age", age); + + return QOBJECT(dict); +} + /* FIXME: once we do a virtio refactoring, this will get subsumed into common * code */ static size_t memcpy_from_iovector(void *data, size_t offset, size_t size, @@ -104,6 +159,73 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) } } +/* + * complete_stats_request - Clean up and report statistics. 
+ */ +static void complete_stats_request(VirtIOBalloon *vb) +{ + QObject *stats; + + /* Only respond to incoming stats if we requested them */ + if (!vb->stats_requested) { + return; + } + + stats = get_stats_qobject(vb); + if (!monitor_ctrl_mode(vb->stats_mon)) { + qemu_del_timer(vb->stats_timer); + monitor_print_balloon(vb->stats_mon, stats); + monitor_resume(vb->stats_mon); + } else { + monitor_protocol_event(QEVENT_BALLOON, stats); + } + + vb->stats_mon = NULL; + vb->stats_requested = false; +} + +/* + * stats_request_timeout - Timer callback for sychronous request timeout + * + * In the case of a synchronous timeout, just report the old statistics. + */ +static void stats_request_timeout(void *opaque) +{ + VirtIOBalloon *vb = (VirtIOBalloon *)opaque; + complete_stats_request(vb); +} + +static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIOBalloon *s = DO_UPCAST(VirtIOBalloon, vdev, vdev); + VirtQueueElement *elem = &s->stats_vq_elem; + VirtIOBalloonStat stat; + size_t offset = 0; + + if (!virtqueue_pop(vq, elem)) { + return; + } + + /* Initialize the stats to get rid of any stale values. This is only + * needed to handle the case where a guest supports fewer stats than it + * used to (ie. it has booted into an old kernel). 
+ */ + reset_stats(s); + + while (memcpy_from_iovector(&stat, offset, sizeof(stat), elem->out_sg, + elem->out_num) == sizeof(stat)) { + uint16_t tag = tswap16(stat.tag); + uint64_t val = tswap64(stat.val); + + offset += sizeof(stat); + if (tag < VIRTIO_BALLOON_S_NR) + s->stats[tag] = val; + } + s->stats_vq_offset = offset; + + complete_stats_request(s); +} + static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) { VirtIOBalloon *dev = to_virtio_balloon(vdev); @@ -126,10 +248,56 @@ static void virtio_balloon_set_config(VirtIODevice *vdev, static uint32_t virtio_balloon_get_features(VirtIODevice *vdev, uint32_t f) { + f |= (1 << VIRTIO_BALLOON_F_STATS_VQ); return f; } -static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t target) +/* + * request_stats - Initiate a request for memory statistics + * + * If stats are not supported, just report 'actual', otherwise, two types of + * requests are supported: + * + * Synchronous requests are intended to be used with the user monitor. The + * monitor is suspended until the stats are received or the timer expires. + * Either the newly-update stats or the current stats (in the case of a timeout) + * are printed to the monitor and the monitor is resumed. + * + * Asynchronous requests are intended for QMP. A QMP event will be generated + * only if the stats are updated. + */ +static void request_stats(VirtIOBalloon *vb, Monitor *mon) +{ + /* If a user-monitor is already waiting, resume that one first */ + if (vb->stats_requested && !monitor_ctrl_mode(vb->stats_mon)) { + qemu_del_timer(vb->stats_timer); + monitor_resume(vb->stats_mon); + } + vb->stats_requested = true; + vb->stats_mon = mon; + + /* Set up a synchronous request for a user-monitor. 
*/ + if (!monitor_ctrl_mode(mon)) { + uint64_t later = qemu_get_clock(vm_clock) + + QEMU_BALLOON_SYNC_TIMEOUT * get_ticks_per_sec(); + monitor_suspend(mon); + qemu_mod_timer(vb->stats_timer, later); + } + + if (vb->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ)) { + virtqueue_push(vb->svq, &vb->stats_vq_elem, vb->stats_vq_offset); + virtio_notify(&vb->vdev, vb->svq); + } else { + /* Stats are not supported. Clear out any stale values that might have + * been set by a more featureful guest kernel. + */ + reset_stats(vb); + complete_stats_request(vb); + } +} + +static void virtio_balloon_to_target(void *opaque, ram_addr_t target, + Monitor *mon) { VirtIOBalloon *dev = opaque; @@ -139,9 +307,9 @@ static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t target) if (target) { dev->num_pages = (ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT; virtio_notify_config(&dev->vdev); + } else { + request_stats(dev, mon); } - - return ram_size - (dev->actual << VIRTIO_BALLOON_PFN_SHIFT); } static void virtio_balloon_save(QEMUFile *f, void *opaque) @@ -152,6 +320,11 @@ static void virtio_balloon_save(QEMUFile *f, void *opaque) qemu_put_be32(f, s->num_pages); qemu_put_be32(f, s->actual); + qemu_put_buffer(f, (uint8_t *)&s->stats_vq_elem, sizeof(VirtQueueElement)); + qemu_put_buffer(f, (uint8_t *)&s->stats_vq_offset, sizeof(size_t)); + qemu_put_timer(f, s->stats_timer); + qemu_put_be32(f, s->stats_updated); +/* XXX save monitor and stats_requested */ } static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id) @@ -165,6 +338,11 @@ static int virtio_balloon_load(QEMUFile *f, void *opaque, int version_id) s->num_pages = qemu_get_be32(f); s->actual = qemu_get_be32(f); + qemu_get_buffer(f, (uint8_t *)&s->stats_vq_elem, sizeof(VirtQueueElement)); + qemu_get_buffer(f, (uint8_t *)&s->stats_vq_offset, sizeof(size_t)); + qemu_get_timer(f, s->stats_timer); + s->stats_updated = qemu_get_be32(f); +/* XXX Load monitor and stats_requested */ return 0; } @@ -183,7 
+361,10 @@ VirtIODevice *virtio_balloon_init(DeviceState *dev) s->ivq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output); s->dvq = virtio_add_queue(&s->vdev, 128, virtio_balloon_handle_output); + s->svq = virtio_add_queue(&s->vdev, 128, virtio_balloon_receive_stats); + s->stats_timer = qemu_new_timer(vm_clock, stats_request_timeout, s); + reset_stats(s); qemu_add_balloon_handler(virtio_balloon_to_target, s); register_savevm("virtio-balloon", -1, 1, virtio_balloon_save, virtio_balloon_load, s); diff --git a/hw/virtio-balloon.h b/hw/virtio-balloon.h index 9a0d119..e20cf6b 100644 --- a/hw/virtio-balloon.h +++ b/hw/virtio-balloon.h @@ -25,6 +25,7 @@ /* The feature bitmap for virtio balloon */ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ +#define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory stats virtqueue */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 @@ -37,4 +38,18 @@ struct virtio_balloon_config uint32_t actual; }; +/* Memory Statistics */ +#define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ +#define VIRTIO_BALLOON_S_SWAP_OUT 1 /* Amount of memory swapped out */ +#define VIRTIO_BALLOON_S_MAJFLT 2 /* Number of major faults */ +#define VIRTIO_BALLOON_S_MINFLT 3 /* Number of minor faults */ +#define VIRTIO_BALLOON_S_MEMFREE 4 /* Total amount of free memory */ +#define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ +#define VIRTIO_BALLOON_S_NR 6 + +typedef struct VirtIOBalloonStat { + uint16_t tag; + uint64_t val; +} __attribute__((packed)) VirtIOBalloonStat; + #endif diff --git a/monitor.c b/monitor.c index b824e7c..134ed15 100644 --- a/monitor.c +++ b/monitor.c @@ -133,7 +133,7 @@ static void monitor_command_cb(Monitor *mon, const char *cmdline, void *opaque); /* Return true if in control mode, false otherwise */ -static inline int monitor_ctrl_mode(const Monitor *mon) +int monitor_ctrl_mode(const Monitor *mon) { return (mon->flags & MONITOR_USE_CONTROL); } @@ -357,6 
+357,9 @@ void monitor_protocol_event(MonitorEvent event, QObject *data) case QEVENT_STOP: event_name = "STOP"; break; + case QEVENT_BALLOON: + event_name = "BALLOON"; + break; default: abort(); break; @@ -2058,43 +2061,27 @@ static void do_info_status(Monitor *mon, QObject **ret_data) vm_running, singlestep); } -static ram_addr_t balloon_get_value(void) +static void print_balloon_stat(const char *key, QObject *obj, void *opaque) { - ram_addr_t actual; - - if (kvm_enabled() && !kvm_has_sync_mmu()) { - qemu_error_new(QERR_KVM_MISSING_CAP, "synchronous MMU", "balloon"); - return 0; - } - - actual = qemu_balloon_status(); - if (actual == 0) { - qemu_error_new(QERR_DEVICE_NOT_ACTIVE, "balloon"); - return 0; - } - - return actual; -} + Monitor *mon = opaque; -/** - * do_balloon(): Request VM to change its memory allocation - */ -static void do_balloon(Monitor *mon, const QDict *qdict, QObject **ret_data) -{ - if (balloon_get_value()) { - /* ballooning is active */ - qemu_balloon(qdict_get_int(qdict, "value")); - } + if (strcmp(key, "actual")) + monitor_printf(mon, ",%s=%" PRId64, key, + qint_get_int(qobject_to_qint(obj))); } -static void monitor_print_balloon(Monitor *mon, const QObject *data) +void monitor_print_balloon(Monitor *mon, const QObject *data) { QDict *qdict; qdict = qobject_to_qdict(data); + if (!qdict_haskey(qdict, "actual")) + return; - monitor_printf(mon, "balloon: actual=%" PRId64 "\n", - qdict_get_int(qdict, "balloon") >> 20); + monitor_printf(mon, "balloon: actual=%" PRId64, + qdict_get_int(qdict, "actual") >> 20); + qdict_iter(qdict, print_balloon_stat, mon); + monitor_printf(mon, "\n"); } /** @@ -2102,21 +2089,52 @@ static void monitor_print_balloon(Monitor *mon, const QObject *data) * * Return a QDict with the following information: * - * - "balloon": current balloon value in bytes + * - "actual": current balloon value in bytes + * The following fields may or may not be present: + * - "mem_swapped_in": Amount of memory swapped in (bytes) + * - 
"mem_swapped_out": Amount of memory swapped out (bytes) + * - "major_page_faults": Number of major faults + * - "minor_page_faults": Number of minor faults + * - "free_mem": Total amount of free and unused memory (bytes) + * - "total_mem": Total amount of available memory (bytes) * * Example: * - * { "balloon": 1073741824 } + * { "actual": 1073741824, "mem_swapped_in": 0, "mem_swapped_out": 0, + * "major_page_faults": 142, "minor_page_faults": 239245, + * "free_mem": 1014185984, "total_mem": 1044668416 } */ static void do_info_balloon(Monitor *mon, QObject **ret_data) { - ram_addr_t actual; + int ret; + + if (kvm_enabled() && !kvm_has_sync_mmu()) { + qemu_error_new(QERR_KVM_MISSING_CAP, "synchronous MMU", "balloon"); + return; + } + + ret = qemu_balloon_status(mon); + if (!ret) { + qemu_error_new(QERR_DEVICE_NOT_ACTIVE, "balloon"); + return; + } +} + +/** + * do_balloon(): Request VM to change its memory allocation + */ +static void do_balloon(Monitor *mon, const QDict *qdict, QObject **ret_data) +{ + int ret; - actual = balloon_get_value(); - if (actual != 0) { - *ret_data = qobject_from_jsonf("{ 'balloon': %" PRId64 "}", - (int64_t) actual); + if (kvm_enabled() && !kvm_has_sync_mmu()) { + qemu_error_new(QERR_KVM_MISSING_CAP, "synchronous MMU", "balloon"); + return; } + + ret = qemu_balloon(qdict_get_int(qdict, "value")); + if (ret == 0) + qemu_error_new(QERR_DEVICE_NOT_ACTIVE, "balloon"); } static qemu_acl *find_acl(Monitor *mon, const char *name) diff --git a/monitor.h b/monitor.h index 6ed117a..556239a 100644 --- a/monitor.h +++ b/monitor.h @@ -20,6 +20,7 @@ typedef enum MonitorEvent { QEVENT_RESET, QEVENT_POWERDOWN, QEVENT_STOP, + QEVENT_BALLOON, QEVENT_MAX, } MonitorEvent; @@ -32,6 +33,7 @@ void monitor_resume(Monitor *mon); void monitor_read_bdrv_key_start(Monitor *mon, BlockDriverState *bs, BlockDriverCompletionFunc *completion_cb, void *opaque); +void monitor_print_balloon(Monitor *mon, const QObject *data); int monitor_get_fd(Monitor *mon, const char 
*fdname); @@ -41,4 +43,6 @@ void monitor_printf(Monitor *mon, const char *fmt, ...) void monitor_print_filename(Monitor *mon, const char *filename); void monitor_flush(Monitor *mon); +extern int monitor_ctrl_mode(const Monitor *mon); + #endif /* !MONITOR_H */ diff --git a/vl.c b/vl.c index b048e89..e4bf434 100644 --- a/vl.c +++ b/vl.c @@ -362,17 +362,24 @@ void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque) qemu_balloon_event_opaque = opaque; } -void qemu_balloon(ram_addr_t target) +int qemu_balloon(ram_addr_t target) { - if (qemu_balloon_event) - qemu_balloon_event(qemu_balloon_event_opaque, target); + if (qemu_balloon_event) { + qemu_balloon_event(qemu_balloon_event_opaque, target, NULL); + return 1; + } else { + return 0; + } } -ram_addr_t qemu_balloon_status(void) +int qemu_balloon_status(Monitor *mon) { - if (qemu_balloon_event) - return qemu_balloon_event(qemu_balloon_event_opaque, 0); - return 0; + if (qemu_balloon_event) { + qemu_balloon_event(qemu_balloon_event_opaque, 0, mon); + return 1; + } else { + return 0; + } } /***********************************************************/ -- Thanks, Adam