Provide the cpr-save restart mode, which preserves the guest VM across a restart of the qemu process. After cpr-save, the caller passes qemu command-line arguments to cpr-exec, which directly exec's the new qemu binary. The arguments must include -S so new qemu starts in a paused state. The caller resumes the guest by calling cpr-load.
To use the restart mode, guest RAM must be backed by a memory-backend-file with share=on. The '-cpr-enable restart' option causes secondary guest ram blocks (those not specified on the command line) to be allocated by mmap'ing a memfd. The memfds are kept open across exec, and their descriptor values are saved in special cpr state; after exec, new qemu retrieves them from that state and re-mmap's them. Hence guest RAM is preserved in place, albeit with new virtual addresses in the qemu process. Support for vfio devices and memory-backend-memfd in restart mode is added in subsequent patches. cpr-exec syntax: { 'command': 'cpr-exec', 'data': { 'argv': [ 'str' ] } } Add the restart mode: { 'enum': 'CprMode', 'data': [ 'none', 'reboot', 'restart' ] } Signed-off-by: Steve Sistare <steven.sist...@oracle.com> --- migration/cpr.c | 35 +++++++++++++++++++++++++++++++++++ qapi/cpr.json | 26 +++++++++++++++++++++++++- qemu-options.hx | 2 +- softmmu/physmem.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- trace-events | 1 + 5 files changed, 107 insertions(+), 3 deletions(-) diff --git a/migration/cpr.c b/migration/cpr.c index 1cc8738..8b3fffd 100644 --- a/migration/cpr.c +++ b/migration/cpr.c @@ -22,6 +22,7 @@ static int cpr_enabled_modes; void cpr_init(int modes) { cpr_enabled_modes = modes; + cpr_state_load(&error_fatal); } bool cpr_enabled(CprMode mode) @@ -153,6 +154,37 @@ err: cpr_set_mode(CPR_MODE_NONE); } +static int preserve_fd(const char *name, int id, int fd, void *opaque) +{ + qemu_clear_cloexec(fd); + return 0; +} + +static int unpreserve_fd(const char *name, int id, int fd, void *opaque) +{ + qemu_set_cloexec(fd); + return 0; +} + +void qmp_cpr_exec(strList *args, Error **errp) +{ + if (!runstate_check(RUN_STATE_SAVE_VM)) { + error_setg(errp, "runstate is not save-vm"); + return; + } + if (cpr_get_mode() != CPR_MODE_RESTART) { + error_setg(errp, "cpr-exec requires cpr-save with restart mode"); + return; + } + + cpr_walk_fd(preserve_fd, 0); + if (cpr_state_save(errp)) { + return; + } + + 
assert(qemu_system_exec_request(args, errp) == 0); +} + void qmp_cpr_load(const char *filename, CprMode mode, Error **errp) { QEMUFile *f; @@ -189,6 +221,9 @@ void qmp_cpr_load(const char *filename, CprMode mode, Error **errp) goto out; } + /* Set cloexec to prevent fd leaks until the next cpr-save */ + cpr_walk_fd(unpreserve_fd, 0); + state = global_state_get_runstate(); if (state == RUN_STATE_RUNNING) { vm_start(); diff --git a/qapi/cpr.json b/qapi/cpr.json index 11c6f88..47ee4ff 100644 --- a/qapi/cpr.json +++ b/qapi/cpr.json @@ -15,11 +15,12 @@ # @CprMode: # # @reboot: checkpoint can be cpr-load'ed after a host reboot. +# @restart: checkpoint can be cpr-load'ed after restarting qemu. # # Since: 7.1 ## { 'enum': 'CprMode', - 'data': [ 'none', 'reboot' ] } + 'data': [ 'none', 'reboot', 'restart' ] } ## # @cpr-save: @@ -38,6 +39,11 @@ # issue the quit command, reboot the system, start qemu using the same # arguments plus -S, and issue the cpr-load command. # +# If @mode is 'restart', the checkpoint remains valid after restarting +# qemu using a subsequent cpr-exec. Guest RAM must be backed by a +# memory-backend-file with share=on. +# To resume from the checkpoint, issue the cpr-load command. +# # @filename: name of checkpoint file # @mode: @CprMode mode # @@ -48,6 +54,24 @@ 'mode': 'CprMode' } } ## +# @cpr-exec: +# +# Restart qemu by directly exec'ing @argv[0], replacing the qemu process. +# The PID remains the same. Must be called after cpr-save restart. +# +# @argv[0] should be the path of a new qemu binary, or a prefix command that +# in turn exec's the new qemu binary. The arguments must match those used +# to initially start qemu, plus the -S option so new qemu starts in a paused +# state. +# +# @argv: arguments to be passed to exec(). 
+# +# Since: 7.1 +## +{ 'command': 'cpr-exec', + 'data': { 'argv': [ 'str' ] } } + +## # @cpr-load: # # Load a virtual machine from the checkpoint file @filename that was created diff --git a/qemu-options.hx b/qemu-options.hx index 6e51c33..1b49360 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4484,7 +4484,7 @@ SRST ERST DEF("cpr-enable", HAS_ARG, QEMU_OPTION_cpr_enable, \ - "-cpr-enable reboot enable the cpr mode\n", + "-cpr-enable reboot|restart enable the cpr mode\n", QEMU_ARCH_ALL) SRST ``-cpr-enable reboot`` diff --git a/softmmu/physmem.c b/softmmu/physmem.c index 822c424..412cc80 100644 --- a/softmmu/physmem.c +++ b/softmmu/physmem.c @@ -44,6 +44,7 @@ #include "qemu/qemu-print.h" #include "qemu/log.h" #include "qemu/memalign.h" +#include "qemu/memfd.h" #include "exec/memory.h" #include "exec/ioport.h" #include "sysemu/dma.h" @@ -1962,6 +1963,40 @@ static void dirty_memory_extend(ram_addr_t old_ram_size, } } +static bool memory_region_is_backend(MemoryRegion *mr) +{ + return !!object_dynamic_cast(mr->parent_obj.parent, TYPE_MEMORY_BACKEND); +} + +static void *qemu_anon_memfd_alloc(RAMBlock *rb, size_t maxlen, Error **errp) +{ + size_t len, align; + void *addr; + struct MemoryRegion *mr = rb->mr; + const char *name = memory_region_name(mr); + int mfd = cpr_find_memfd(name, &len, &maxlen, &align); + + if (mfd >= 0) { + rb->used_length = len; + rb->max_length = maxlen; + mr->align = align; + } else { + len = rb->used_length; + maxlen = rb->max_length; + mr->align = QEMU_VMALLOC_ALIGN; + mfd = qemu_memfd_create(name, maxlen + mr->align, 0, 0, 0, errp); + if (mfd < 0) { + return NULL; + } + cpr_save_memfd(name, mfd, len, maxlen, mr->align); + } + rb->flags |= RAM_SHARED; + qemu_set_cloexec(mfd); + addr = file_ram_alloc(rb, maxlen, mfd, false, false, 0, errp); + trace_anon_memfd_alloc(name, maxlen, addr, mfd); + return addr; +} + static void ram_block_add(RAMBlock *new_block, Error **errp) { const bool noreserve = qemu_ram_is_noreserve(new_block); @@ 
-1986,6 +2021,14 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_mutex_unlock_ramlist(); return; } + } else if (cpr_enabled(CPR_MODE_RESTART) && + !memory_region_is_backend(new_block->mr)) { + new_block->host = qemu_anon_memfd_alloc(new_block, + new_block->max_length, + errp); + if (!new_block->host) { + return; + } } else { new_block->host = qemu_anon_ram_alloc(new_block->max_length, &new_block->mr->align, @@ -1997,8 +2040,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) qemu_mutex_unlock_ramlist(); return; } - memory_try_enable_merging(new_block->host, new_block->max_length); } + memory_try_enable_merging(new_block->host, new_block->max_length); } new_ram_size = MAX(old_ram_size, @@ -2231,6 +2274,7 @@ void qemu_ram_free(RAMBlock *block) } qemu_mutex_lock_ramlist(); + cpr_delete_memfd(memory_region_name(block->mr)); QLIST_REMOVE_RCU(block, next); ram_list.mru_block = NULL; /* Write list before version */ diff --git a/trace-events b/trace-events index bc71006..07369bb 100644 --- a/trace-events +++ b/trace-events @@ -45,6 +45,7 @@ ram_block_discard_range(const char *rbname, void *hva, size_t length, bool need_ # accel/tcg/cputlb.c memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) "0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u" memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64 +anon_memfd_alloc(const char *name, size_t size, void *ptr, int fd) "%s size %zu ptr %p fd %d" # gdbstub.c gdbstub_op_start(const char *device) "Starting gdbstub using device %s" -- 1.8.3.1