Peter Feiner <pe...@gridcentric.ca> writes: > This patch makes the -mem-path filenames deterministic and allows some control > over how QEMU mmaps the files. Given this control, QEMU can be used to > implement > exogenous memory management techniques quite simply. Two examples follow: > > 1. Post-copy migration (mmap=shared for origin, save device state, path in > NFS available on both hosts, mmap=shared for destination). > 2. memory sharing via VM cloning (mmap=shared for master, save device > state, > then mmap=private for clones). > > These new features are exposed via arguments to -mem-path: > > -mem-path \ > path[,mode=temp|open|create][,mmap=private|shared][,nofallback][,anyfs] > > The backing file names are made deterministic by including their RAMBlocks' > offsets, which are unique given a qemu command line. Note that Xen's live > migration relies on RAMBlocks having the same offsets between QEMU processes > (see xen_read_physmap). The new file name format is > > qemu_back_mem.OFFSET+SIZE.NAME[.RANDOM] > > where SIZE and NAME are added for extra sanity checking when mode="open". > > By default, the old -mem-path behavior is preserved. I.e., mode="temp" is > used, > which adds a random suffix to the deterministic name and unlinks the files. > > Signed-off-by: Peter Feiner <pe...@gridcentric.ca>
This is not reasonable IMHO. I was okay with sticking a name on a ramblock, but encoding a guest PA offset turns this into a supported ABI which I'm not willing to do. A one line change is one thing, but not a complex new option that introduces an ABI only for a proprietary product that's jumping through hoops to keep from contributing useful logic to QEMU. Regards, Anthony Liguori > --- > cpu-all.h | 5 ++++ > exec.c | 60 ++++++++++++++++++++++++++++++++++-------------------- > qemu-config.c | 26 +++++++++++++++++++++++ > qemu-options.hx | 24 +++++++++++++++++++-- > vl.c | 43 +++++++++++++++++++++++++++++++++++++- > 5 files changed, 131 insertions(+), 27 deletions(-) > > diff --git a/cpu-all.h b/cpu-all.h > index c9c51b8..a83f71e 100644 > --- a/cpu-all.h > +++ b/cpu-all.h > @@ -500,6 +500,11 @@ typedef struct RAMList { > extern RAMList ram_list; > > extern const char *mem_path; > +extern int mem_temp; > +extern int mem_create; > +extern int mem_fallback; > +extern int mem_shared; > +extern int mem_hugetlbfs; > extern int mem_prealloc; > > /* Flags stored in the low bits of the TLB virtual address. 
These are > diff --git a/exec.c b/exec.c > index 8435de0..75b146c 100644 > --- a/exec.c > +++ b/exec.c > @@ -2346,7 +2346,7 @@ void qemu_flush_coalesced_mmio_buffer(void) > > #define HUGETLBFS_MAGIC 0x958458f6 > > -static long gethugepagesize(const char *path) > +static long getfspagesize(const char *path, int want_hugetlbfs) > { > struct statfs fs; > int ret; > @@ -2360,7 +2360,7 @@ static long gethugepagesize(const char *path) > return 0; > } > > - if (fs.f_type != HUGETLBFS_MAGIC) > + if (want_hugetlbfs && fs.f_type != HUGETLBFS_MAGIC) > fprintf(stderr, "Warning: path not on HugeTLBFS: %s\n", path); > > return fs.f_bsize; > @@ -2373,17 +2373,15 @@ static void *file_ram_alloc(RAMBlock *block, > char *filename; > void *area; > int fd; > -#ifdef MAP_POPULATE > int flags; > -#endif > - unsigned long hpagesize; > + unsigned long pagesize; > > - hpagesize = gethugepagesize(path); > - if (!hpagesize) { > + pagesize = getfspagesize(path, mem_hugetlbfs); > + if (!pagesize) { > return NULL; > } > > - if (memory < hpagesize) { > + if (memory < pagesize) { > return NULL; > } > > @@ -2392,20 +2390,35 @@ static void *file_ram_alloc(RAMBlock *block, > return NULL; > } > > - if (asprintf(&filename, "%s/qemu_back_mem.XXXXXX", path) == -1) { > + if (asprintf(&filename, "%s/qemu_back_mem.%"PRIx64"+%"PRIx64".%s", > + path, block->offset, memory, block->mr->name) == -1) { > return NULL; > } > > - fd = mkstemp(filename); > + if (mem_temp) { > + if (asprintf(&filename, "%s.XXXXXX", filename) == -1) { > + return NULL; > + } > + fd = mkstemp(filename); > + } else { > + flags = O_RDWR | O_CLOEXEC | (mem_create ? 
O_CREAT : 0); > + fd = open(filename, flags, 0600); > + } > + > if (fd < 0) { > - perror("unable to create backing store for hugepages"); > + fprintf(stderr, "unable to create backing store %s " > + "for guest memory: %s\n", filename, strerror(errno)); > free(filename); > return NULL; > } > - unlink(filename); > + > + if (mem_temp) { > + unlink(filename); > + } > + > free(filename); > > - memory = (memory+hpagesize-1) & ~(hpagesize-1); > + memory = (memory+pagesize-1) & ~(pagesize-1); > > /* > * ftruncate is not supported by hugetlbfs in older > @@ -2416,16 +2429,16 @@ static void *file_ram_alloc(RAMBlock *block, > if (ftruncate(fd, memory)) > perror("ftruncate"); > > + flags = mem_shared ? MAP_SHARED : MAP_PRIVATE; > #ifdef MAP_POPULATE > /* NB: MAP_POPULATE won't exhaustively alloc all phys pages in the case > - * MAP_PRIVATE is requested. For mem_prealloc we mmap as MAP_SHARED > + * MAP_PRIVATE is requested. For mem_prealloc we always mmap as > MAP_SHARED > * to sidestep this quirk. > */ > - flags = mem_prealloc ? 
MAP_POPULATE | MAP_SHARED : MAP_PRIVATE; > - area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0); > -#else > - area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); > + if (mem_prealloc) > + flags = MAP_POPULATE | MAP_SHARED; > #endif > + area = mmap(0, memory, PROT_READ | PROT_WRITE, flags, fd, 0); > if (area == MAP_FAILED) { > perror("file_ram_alloc: can't mmap RAM pages"); > close(fd); > @@ -2561,6 +2574,10 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, > void *host, > #if defined (__linux__) && !defined(TARGET_S390X) > new_block->host = file_ram_alloc(new_block, size, mem_path); > if (!new_block->host) { > + if (!mem_fallback) { > + fprintf(stderr, "failed to allocate ram file for %s\n", > mr->name); > + exit(1); > + } > new_block->host = qemu_vmalloc(size); > memory_try_enable_merging(new_block->host, size); > } > @@ -2675,11 +2692,10 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t > length) > if (mem_path) { > #if defined(__linux__) && !defined(TARGET_S390X) > if (block->fd) { > + flags = mem_shared ? MAP_SHARED : MAP_PRIVATE; > #ifdef MAP_POPULATE > - flags |= mem_prealloc ? 
MAP_POPULATE | MAP_SHARED : > - MAP_PRIVATE; > -#else > - flags |= MAP_PRIVATE; > + if (mem_prealloc) > + flags = MAP_POPULATE | MAP_SHARED; > #endif > area = mmap(vaddr, length, PROT_READ | PROT_WRITE, > flags, block->fd, offset); > diff --git a/qemu-config.c b/qemu-config.c > index 10d1ba4..45b24d2 100644 > --- a/qemu-config.c > +++ b/qemu-config.c > @@ -691,6 +691,31 @@ static QemuOptsList qemu_object_opts = { > }, > }; > > +static QemuOptsList qemu_mempath_opts = { > + .name = "mempath", > + .implied_opt_name = "path", > + .head = QTAILQ_HEAD_INITIALIZER(qemu_mempath_opts.head), > + .desc = { > + { > + .name = "path", > + .type = QEMU_OPT_STRING, > + },{ > + .name = "mmap", > + .type = QEMU_OPT_STRING, > + },{ > + .name = "mode", > + .type = QEMU_OPT_STRING, > + },{ > + .name = "anyfs", > + .type = QEMU_OPT_BOOL, > + },{ > + .name = "fallback", > + .type = QEMU_OPT_BOOL, > + }, > + { /* end of list */ } > + }, > +}; > + > static QemuOptsList *vm_config_groups[32] = { > &qemu_drive_opts, > &qemu_chardev_opts, > @@ -709,6 +734,7 @@ static QemuOptsList *vm_config_groups[32] = { > &qemu_sandbox_opts, > &qemu_add_fd_opts, > &qemu_object_opts, > + &qemu_mempath_opts, > NULL, > }; > > diff --git a/qemu-options.hx b/qemu-options.hx > index 9bb29d3..8972397 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -443,10 +443,28 @@ gigabytes respectively. > ETEXI > > DEF("mem-path", HAS_ARG, QEMU_OPTION_mempath, > - "-mem-path FILE provide backing storage for guest RAM\n", QEMU_ARCH_ALL) > + "-mem-path > path[,mmap=private|shared][,mode=temp|open|create][,nofallback][,anyfs]\n" > + " provide backing storage for guest RAM\n", > + QEMU_ARCH_ALL) > STEXI > -@item -mem-path @var{path} > -Allocate guest RAM from a temporarily created file in @var{path}. > +@item -mem-path > @var{path}[,mmap=private|shared][,mode=temp|open|create][,nofallback][,anyfs] > +Allocate guest RAM from files in @var{path}. > +@item mmap=private|shared > +Specifies how backing files are mmap'd. 
"private" indicates a COW mapping, > thus > +leaving the underlying file unchanged. "shared" indicates a write-through > +mapping, thus reflecting the guest's memory in the underlying file. Default > is > +private. > +@item mode=temp|open|create > +Determines how backing files are created and opened. "temp" indicates that > +unique temporary files are created in @var{path} and unlinked after opening. > +Both "create" and "open" indicate that deterministic names are used and the > +files aren't unlinked. The difference between "create" and "open" is that > +"open" expects the files to already exist. Default is temp. > +@item nofallback > +Fail if backing files cannot be used for guest RAM (e.g., permission error, > bad > +path). By default, RAM allocation falls back to normal method. > +@item anyfs > +Suppress warnings if @var{path} isn't on a hugetlbfs filesystem. > ETEXI > > #ifdef MAP_POPULATE > diff --git a/vl.c b/vl.c > index c8e9c78..3b7cea7 100644 > --- a/vl.c > +++ b/vl.c > @@ -185,6 +185,11 @@ static int display_remote; > const char* keyboard_layout = NULL; > ram_addr_t ram_size; > const char *mem_path = NULL; > +int mem_shared = 0; > +int mem_hugetlbfs = 1; > +int mem_temp = 1; > +int mem_create = 1; > +int mem_fallback = 1; > #ifdef MAP_POPULATE > int mem_prealloc = 0; /* force preallocation of physical target memory */ > #endif > @@ -2938,9 +2943,43 @@ int main(int argc, char **argv, char **envp) > } > break; > } > - case QEMU_OPTION_mempath: > - mem_path = optarg; > + case QEMU_OPTION_mempath: { > + const char *value; > + opts = qemu_opts_parse(qemu_find_opts("mempath"), optarg, 1); > + if (!opts) { > + exit(1); > + } > + > + mem_path = qemu_opt_get(opts, "path"); > + mem_hugetlbfs = !qemu_opt_get_bool(opts, "anyfs", 0); > + mem_fallback = qemu_opt_get_bool(opts, "fallback", 1); > + > + value = qemu_opt_get(opts, "mode"); > + if (value == NULL || !strcmp(value, "temp")) { > + mem_temp = 1; > + } else { > + mem_temp = 0; > + if (!strcmp(value, 
"create")) { > + mem_create = 1; > + } else if (!strcmp(value, "open")) { > + mem_create = 0; > + } else { > + fprintf(stderr, "Invalid -mem-path mode value.\n"); > + exit(1); > + } > + } > + > + value = qemu_opt_get(opts, "mmap"); > + if (value == NULL || !strcmp(value, "private")) { > + mem_shared = 0; > + } else if (!strcmp(value, "shared")) { > + mem_shared = 1; > + } else { > + fprintf(stderr, "Invalid -mem-path mmap value.\n"); > + exit(1); > + } > break; > + } > #ifdef MAP_POPULATE > case QEMU_OPTION_mem_prealloc: > mem_prealloc = 1; > -- > 1.7.5.4