On Mon, 1 Oct 2018 11:24:01 -0700, Ori Bernstein <o...@eigenstate.org> wrote:
> On Mon, 1 Oct 2018 12:55:12 +0200 > Reyk Floeter <r...@openbsd.org> wrote: > > > Hi Ori, > > > > On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote: > > > I've added support to vmd for external snapshots. That is, > > > snapshots that are derived from a base image. Data lookups > > > start in the derived image, and if the derived image does not > > > contain some data, the search proceeds ot the base image. > > > Multiple derived images may exist off of a single base image. > > > > > > > Nice work! This will be quite useful, thanks. > > > > I think I broke your diff as my last commit to derive the raw/qcow2 > > format introduced some conflicts. I had posted it on hackers@ and > > forgot that your aren't on the internal list yet - sorry for that. Updated version. Changes from the last diff: - Merge in syntax changes. - Don't over-read when getting the base images. - Fix relative paths in base images. - Allow multiple derived images to use a single base image, and allow a user with only read permisssions to base their images on top of it. - Probe the base image size, use/validate it when craeting disk images. - Fix style a bit (long lines, changing from sizeof foo to sizeof(foo). - Move a define out of vmmvar.h - And update the manpage with these changes. - Improve error checking around creating/resolving base disk paths. diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile index c2a5f42d5f6..1f8673e0e26 100644 --- regress/usr.sbin/vmd/diskfmt/Makefile +++ regress/usr.sbin/vmd/diskfmt/Makefile @@ -11,7 +11,7 @@ VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/ PROG=vioscribble -SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c +SRCS=vioscribble.c vioqcow2.c vioraw.c CFLAGS+=-I$(VMD_DIR) -pthread LDFLAGS+=-pthread @@ -26,3 +26,6 @@ scribble-images: .PHONY: ${REGRESS_TARGETS} scribble-images .include <bsd.regress.mk> + +vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c + cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c . diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c index 14d720db652..1da8efedac7 100644 --- regress/usr.sbin/vmd/diskfmt/vioscribble.c +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c @@ -122,16 +122,18 @@ main(int argc, char **argv) verbose = !!getenv("VERBOSE"); qcfd = open("scribble.qc2", O_RDWR); rawfd = open("scribble.raw", O_RDWR); - if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1) + if (qcfd == -1) err(1, "unable to open qcow"); - if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1) + if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1) + err(1, "unable to init qcow"); + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1) err(1, "unable to open raw"); srandom_deterministic(123); /* scribble to both disks */ printf("scribbling...\n"); - for (i = 0; i < 16; i++) { + for (i = 0; i < 1024*16; i++) { off = (random() % DISKSZ); len = random() % sizeof buf + 1; fill(off, buf, sizeof buf); diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c index 8748ecfdedc..4637256452b 100644 --- usr.sbin/vmctl/main.c +++ usr.sbin/vmctl/main.c @@ -67,7 +67,8 @@ int ctl_receive(struct parse_result *, int, char *[]); struct ctl_command ctl_commands[] = { { "console", CMD_CONSOLE, ctl_console, "id" }, - { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 }, + { "create", CMD_CREATE, ctl_create, + "\"path\" [-s size] [-b base]", 1 }, { "load", CMD_LOAD, ctl_load, "\"path\"" }, { "log", CMD_LOG, ctl_log, "[verbose|brief]" }, { "reload", CMD_RELOAD, ctl_reload, "" }, @@ -538,47 +539,54 @@ int ctl_create(struct parse_result *res, int argc, char *argv[]) { int ch, ret, type; - const char *paths[2], *disk, *format; + const char *disk, *format, *base; if (argc < 2) ctl_usage(res->ctl); + base = NULL; type = parse_disktype(argv[1], &disk); - paths[0] = disk; - paths[1] = NULL; - - if (unveil(paths[0], "rwc") == -1) + if (unveil(disk, "rwc") == -1) err(1, "unveil"); - if (pledge("stdio rpath wpath cpath", NULL) == -1) - err(1, "pledge"); argc--; argv++; - while ((ch = getopt(argc, argv, "s:")) != -1) { + while ((ch = getopt(argc, argv, "s:b:")) != -1) { switch (ch) { case 's': if (parse_size(res, optarg, 0) != 0) errx(1, "invalid size: %s", optarg); break; + case 'b': + base = optarg; + if (unveil(base, "r") == -1) + err(1, "unveil"); + break; default: ctl_usage(res->ctl); /* NOTREACHED */ } } - if (res->size == 0) { - fprintf(stderr, "missing size argument\n"); + if (pledge("stdio rpath wpath cpath", NULL) == -1) + err(1, "pledge"); + + if (base && type != VMDF_QCOW2) + errx(1, "base images require qcow2 disk format"); + if (res->size == 0 && !base) { + fprintf(stderr, "could not create %s: missing size argument\n", + disk); ctl_usage(res->ctl); } if (type == VMDF_QCOW2) { format = "qcow2"; - ret = create_qc2_imagefile(paths[0], res->size); + ret = create_qc2_imagefile(disk, base, res->size); } else { format = "raw"; - ret = create_raw_imagefile(paths[0], res->size); + ret = create_raw_imagefile(disk, res->size); } if (ret != 0) { diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8 index f7890ac99f8..7a02452789c 100644 --- usr.sbin/vmctl/vmctl.8 +++ usr.sbin/vmctl/vmctl.8 @@ -50,7 +50,7 @@ Using .Xr cu 1 connect to the console of the VM with the specified .Ar id . -.It Cm create Ar path Fl s Ar size +.It Cm create Ar path Fl s Op Ar size Op Fl b Ar base Creates a VM disk image file with the specified .Ar path and @@ -65,7 +65,14 @@ or in order to specify the disk format. If left unspecified, the format defaults to .Pa raw -if it cannot be derived automatically. +if it cannot be derived automatically. For qcow2, a +.Ar base +image may be specified. The base image is not modified. The derived image +contains only the changes written by the VM. When creating a derived image, +the +.Ar size +may be omitted, and probed from the base image. If it is provided, it must +match the base image size. .It Cm load Ar filename Load additional configuration from the specified file. .It Cm log brief diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c index b09e1115ff7..81fbdba9a28 100644 --- usr.sbin/vmctl/vmctl.c +++ usr.sbin/vmctl/vmctl.c @@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize) #define ALIGN(sz, align) \ ((sz + align - 1) & ~(align - 1)) int -create_qc2_imagefile(const char *imgfile_path, long imgsize) +create_qc2_imagefile(const char *imgfile_path, + const char *base_path, long imgsize) { struct qcheader { char magic[4]; @@ -869,15 +870,30 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize) uint64_t autoclearfeatures; uint32_t reforder; uint32_t headersz; - } __packed hdr; + } __packed hdr, basehdr; int fd, ret; + ssize_t base_len; uint64_t l1sz, refsz, disksz, initsz, clustersz; uint64_t l1off, refoff, v, i, l1entrysz, refentrysz; uint16_t refs; disksz = 1024*1024*imgsize; + + if (base_path) { + fd = open(base_path, O_RDONLY); + if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr)) + err(1, "failure to read base image header"); + close(fd); + if (!disksz) + disksz = betoh64(basehdr.disksz); + else if (disksz != betoh64(basehdr.disksz)) + errx(1, "base size does not match requested size"); + } + if (!base_path && !disksz) + errx(1, "missing disk size"); + clustersz = (1<<16); - l1off = ALIGN(sizeof hdr, clustersz); + l1off = ALIGN(sizeof(hdr), clustersz); l1entrysz = clustersz * clustersz / 8; l1sz = (disksz + l1entrysz - 1) / l1entrysz; @@ -887,11 +903,12 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize) refsz = (disksz + refentrysz - 1) / refentrysz; initsz = ALIGN(refoff + refsz*clustersz, clustersz); + base_len = base_path ? strlen(base_path) : 0; - memcpy(hdr.magic, "QFI\xfb", 4); + memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)); hdr.version = htobe32(3); - hdr.backingoff = htobe64(0); - hdr.backingsz = htobe32(0); + hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0); + hdr.backingsz = htobe32(base_len); hdr.clustershift = htobe32(16); hdr.disksz = htobe64(disksz); hdr.cryptmethod = htobe32(0); @@ -905,7 +922,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize) hdr.compatfeatures = htobe64(0); hdr.autoclearfeatures = htobe64(0); hdr.reforder = htobe32(4); - hdr.headersz = htobe32(sizeof hdr); + hdr.headersz = htobe32(sizeof(hdr)); /* Refuse to overwrite an existing image */ fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL, @@ -914,7 +931,11 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize) return (errno); /* Write out the header */ - if (write(fd, &hdr, sizeof hdr) != sizeof hdr) + if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) + goto error; + + /* Add the base image */ + if (base_path && write(fd, base_path, base_len) != base_len) goto error; /* Extend to desired size, and add one refcount cluster */ diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h index 006411d9785..cdf50cad755 100644 --- usr.sbin/vmctl/vmctl.h +++ usr.sbin/vmctl/vmctl.h @@ -87,7 +87,7 @@ __dead void /* vmctl.c */ int create_raw_imagefile(const char *, long); -int create_qc2_imagefile(const char *, long); +int create_qc2_imagefile(const char *, const char*, long); int vm_start(uint32_t, const char *, int, int, char **, int, char **, int *, char *, char *, char *); int vm_start_complete(struct imsg *, int *, int); diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c index 550b73c1a39..8350d0fc4fc 100644 --- usr.sbin/vmd/config.c +++ usr.sbin/vmd/config.c @@ -35,6 +35,7 @@ #include <util.h> #include <errno.h> #include <imsg.h> +#include <libgen.h> #include "proc.h" #include "vmd.h" @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg) int config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid) { + int diskfds[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK]; struct vmd_if *vif; struct vmop_create_params *vmc = &vm->vm_params; struct vm_create_params *vcp = &vmc->vmc_params; - unsigned int i; + unsigned int i, j; int fd = -1, vmboot = 0; - int kernfd = -1, *diskfds = NULL, *tapfds = NULL; + int kernfd = -1; + int *tapfds; int cdromfd = -1; int saved_errno = 0; + int n = 0, aflags, oflags; char ifname[IF_NAMESIZE], *s; char path[PATH_MAX]; + char base[PATH_MAX]; + char expanded[PATH_MAX]; unsigned int unit; errno = 0; @@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid) } } - diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds)); - if (diskfds == NULL) { - log_warn("%s: can't allocate disk fds", __func__); - goto fail; - } - for (i = 0; i < vcp->vcp_ndisks; i++) - diskfds[i] = -1; + for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) + diskfds[i][j] = -1; tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds)); if (tapfds == NULL) { @@ -289,22 +291,71 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid) /* Open disk images for child */ for (i = 0 ; i < vcp->vcp_ndisks; i++) { - /* Stat disk[i] to ensure it is a regular file */ - if ((diskfds[i] = open(vcp->vcp_disks[i], - O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) { - log_warn("%s: can't open disk %s", __func__, - vcp->vcp_disks[i]); - errno = VMD_DISK_MISSING; - goto fail; - } + if (strlcpy(path, vcp->vcp_disks[i], sizeof path) > PATH_MAX) + log_warnx("%s, disk path too long", __func__); + memset(vmc->vmc_diskbases, 0, sizeof vmc->vmc_diskbases); + oflags = O_RDWR|O_EXLOCK|O_NONBLOCK; + aflags = R_OK|W_OK; + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) { + /* Stat disk[i] to ensure it is a regular file */ + if ((diskfds[i][j] = open(path, oflags)) == -1) { + log_warn("%s: can't open disk %s", __func__, + vcp->vcp_disks[i]); + errno = VMD_DISK_MISSING; + goto fail; + } - if (vm_checkaccess(diskfds[i], - vmc->vmc_checkaccess & VMOP_CREATE_DISK, - uid, R_OK|W_OK) == -1) { - log_warnx("vm \"%s\" no read/write access to disk %s", - vcp->vcp_name, vcp->vcp_disks[i]); - errno = EPERM; - goto fail; + if (vm_checkaccess(diskfds[i][j], + vmc->vmc_checkaccess & VMOP_CREATE_DISK, + uid, R_OK|W_OK) == -1) { + log_warnx("vm \"%s\" unable to access " + "disk %s", vcp->vcp_name, + vcp->vcp_disks[i]); + errno = EPERM; + goto fail; + } + + /* + * Clear the write and exclusive flags for base images. + * All writes should go to the top image, allowing them + * to be shared. + */ + oflags = O_RDONLY|O_NONBLOCK; + aflags = R_OK; + n = virtio_get_base(diskfds[i][j], base, sizeof base, + vmc->vmc_disktypes[i]); + if (n == 0) + break; + if (n == -1) { + log_warnx("vm \"%s\" unable to read " + "base for disk %s", vcp->vcp_name, + vcp->vcp_disks[i]); + goto fail; + } + /* + * Relative paths should be interpreted relative + * to the disk image, rather than relative to the + * directory vmd happens to be running in, since + * this is the only userful interpretation. + */ + if (base[0] == '/') { + if (realpath(base, path) == NULL) { + log_warn("unable to resolve %s", base); + goto fail; + } + } else { + s = dirname(path); + if (snprintf(expanded, sizeof(expanded), + "%s/%s", s, base) >= (int)sizeof(expanded)) { + log_warn("path too long: %s/%s", + s, base); + goto fail; + } + if (realpath(expanded, path) == NULL) { + log_warn("unable to resolve %s", base); + goto fail; + } + } } } @@ -402,9 +453,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid) NULL, 0); for (i = 0; i < vcp->vcp_ndisks; i++) { - proc_compose_imsg(ps, PROC_VMM, -1, - IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i], - &i, sizeof(i)); + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) { + if (diskfds[i][j] == -1) + break; + proc_compose_imsg(ps, PROC_VMM, -1, + IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, + diskfds[i][j], &i, sizeof(i)); + } } for (i = 0; i < vcp->vcp_nnics; i++) { proc_compose_imsg(ps, PROC_VMM, -1, @@ -416,7 +471,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid) proc_compose_imsg(ps, PROC_VMM, -1, IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd, NULL, 0); - free(diskfds); free(tapfds); vm->vm_running = 1; @@ -430,11 +484,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid) close(kernfd); if (cdromfd != -1) close(cdromfd); - if (diskfds != NULL) { - for (i = 0; i < vcp->vcp_ndisks; i++) - close(diskfds[i]); - free(diskfds); - } + for (i = 0; i < vcp->vcp_ndisks; i++) + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) + if (diskfds[i][j] != -1) + close(diskfds[i][j]); if (tapfds != NULL) { for (i = 0; i < vcp->vcp_nnics; i++) close(tapfds[i]); @@ -489,7 +542,7 @@ int config_getdisk(struct privsep *ps, struct imsg *imsg) { struct vmd_vm *vm; - unsigned int n; + unsigned int n, idx; errno = 0; if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) { @@ -500,14 +553,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg) IMSG_SIZE_CHECK(imsg, &n); memcpy(&n, imsg->data, sizeof(n)); - if (n >= vm->vm_params.vmc_params.vcp_ndisks || - vm->vm_disks[n] != -1 || imsg->fd == -1) { + if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) { log_warnx("invalid disk id"); errno = EINVAL; return (-1); } - vm->vm_disks[n] = imsg->fd; - + idx = vm->vm_params.vmc_diskbases[n]++; + if (idx >= VMM_MAX_BASE_PER_DISK) { + log_warnx("too many bases for disk"); + errno = EINVAL; + return (-1); + } + vm->vm_disks[n][idx] = imsg->fd; return (0); } diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c index c3211d186fa..81dca129896 100644 --- usr.sbin/vmd/vioqcow2.c +++ usr.sbin/vmd/vioqcow2.c @@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *); static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t); static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t); static int inc_refs(struct qcdisk *, off_t, int); -static int qc2_openpath(struct qcdisk *, char *, int); -static int qc2_open(struct qcdisk *, int); +static int qc2_open(struct qcdisk *, int *, size_t); static ssize_t qc2_pread(void *, char *, size_t, off_t); static ssize_t qc2_pwrite(void *, char *, size_t, off_t); static void qc2_close(void *, int); @@ -118,14 +117,14 @@ static void qc2_close(void *, int); * May open snapshot base images. */ int -virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd) +virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd) { struct qcdisk *diskp; diskp = malloc(sizeof(struct qcdisk)); if (diskp == NULL) return -1; - if (qc2_open(diskp, fd) == -1) { + if (qc2_open(diskp, fd, nfd) == -1) { log_warnx("%s: could not open qcow2 disk", __func__); return -1; } @@ -137,19 +136,41 @@ virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd) return 0; } -static int -qc2_openpath(struct qcdisk *disk, char *path, int flags) +ssize_t +virtio_qcow2_get_base(int fd, char *path, size_t npath) { - int fd; + struct qcheader header; + uint64_t backingoff; + uint32_t backingsz; - fd = open(path, flags); - if (fd < 0) + if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) { + log_warnx("%s: short read on header", __func__); + return -1; + } + if (strncmp(header.magic, "QFI\xfb", 4) != 0) { + log_warn("%s: invalid magic numbers", __func__); return -1; - return qc2_open(disk, fd); + } + backingoff = be64toh(header.backingoff); + backingsz = be32toh(header.backingsz); + if (backingsz != 0) { + if (backingsz >= npath - 1) { + log_warn("%s: snapshot path too long", __func__); + return -1; + } + if (pread(fd, path, backingsz, backingoff) != backingsz) { + log_warnx("%s: could not read snapshot base name", + __func__); + return -1; + } + path[backingsz] = '\0'; + log_warnx("path: %s\n", path); + } + return backingsz; } static int -qc2_open(struct qcdisk *disk, int fd) +qc2_open(struct qcdisk *disk, int *fds, size_t nfd) { char basepath[PATH_MAX]; struct stat st; @@ -157,14 +178,15 @@ qc2_open(struct qcdisk *disk, int fd) uint64_t backingoff; uint32_t backingsz; size_t i; - int version; + int version, fd; pthread_rwlock_init(&disk->lock, NULL); + fd = fds[0]; disk->fd = fd; disk->base = NULL; disk->l1 = NULL; - if (pread(fd, &header, sizeof header, 0) != sizeof header) { + if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) { log_warn("%s: short read on header", __func__); goto error; } @@ -203,7 +225,7 @@ qc2_open(struct qcdisk *disk, int fd) goto error; } - disk->l1 = calloc(disk->l1sz, sizeof *disk->l1); + disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1)); if (!disk->l1) goto error; if (pread(disk->fd, disk->l1, 8*disk->l1sz, disk->l1off) @@ -222,14 +244,7 @@ qc2_open(struct qcdisk *disk, int fd) backingoff = be64toh(header.backingoff); backingsz = be32toh(header.backingsz); if (backingsz != 0) { - /* - * FIXME: we need to figure out a way of opening these things, - * otherwise we just crash with a pledge violation. - */ - log_warn("%s: unsupported external snapshot images", __func__); - goto error; - - if (backingsz >= sizeof basepath - 1) { + if (backingsz >= sizeof(basepath) - 1) { log_warn("%s: snapshot path too long", __func__); goto error; } @@ -239,11 +254,17 @@ qc2_open(struct qcdisk *disk, int fd) goto error; } basepath[backingsz] = 0; + if (nfd <= 1) { + log_warnx("%s: missing base image %s", __func__, + basepath); + goto error; + } + disk->base = calloc(1, sizeof(struct qcdisk)); if (!disk->base) goto error; - if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) { + if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) { log_warn("%s: could not open %s", basepath, __func__); goto error; } @@ -521,12 +542,12 @@ mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) cluster = disk->end; disk->end += disk->clustersz; buf = htobe64(cluster | QCOW2_INPLACE); - if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf)) + if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off*8) != sizeof(buf)) goto fail; /* TODO: lazily sync: currently VMD doesn't close things */ buf = htobe64(disk->l1[l1off]); - if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8) + if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8*l1off) != 8) goto fail; if (inc_refs(disk, cluster, 1) == -1) goto fail; @@ -571,7 +592,7 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster) l1idx = (off / disk->clustersz) / nper; l2idx = (off / disk->clustersz) % nper; l1off = disk->refoff + 8*l1idx; - if (pread(disk->fd, &buf, sizeof buf, l1off) != 8) + if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8) return -1; l2cluster = be64toh(buf); @@ -583,19 +604,20 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster) return -1; } buf = htobe64(l2cluster); - if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) { + if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) { return -1; } } refs = 1; if (!newcluster) { - if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2) + if (pread(disk->fd, &refs, sizeof(refs), + l2cluster+2*l2idx) != 2) return -1; refs = be16toh(refs) + 1; } refs = htobe16(refs); - if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) { + if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2*l2idx) != 2) { log_warn("%s: could not write ref block", __func__); return -1; } diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c index e02ab67c5dc..ff4bbb3095e 100644 --- usr.sbin/vmd/vioraw.c +++ usr.sbin/vmd/vioraw.c @@ -53,19 +53,21 @@ raw_close(void *file, int stayopen) * returning -1 for error, 0 for success. */ int -virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd) +virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd) { off_t sz; int *fdp; - sz = lseek(fd, 0, SEEK_END); + if (nfd != 1) + return -1; + sz = lseek(fd[0], 0, SEEK_END); if (sz == -1) return -1; fdp = malloc(sizeof(int)); if (!fdp) return -1; - *fdp = fd; + *fdp = fd[0]; file->p = fdp; file->pread = raw_pread; file->pwrite = raw_pwrite; diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c index 93490344560..1c66f33216a 100644 --- usr.sbin/vmd/virtio.c +++ usr.sbin/vmd/virtio.c @@ -1745,24 +1745,39 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, return (0); } +int +virtio_get_base(int fd, char *path, size_t npath ,int type) +{ + switch (type) { + case VMDF_RAW: return 0; + case VMDF_QCOW2: return virtio_qcow2_get_base(fd, path, npath); + } + log_warnx("%s: invalid disk format", __func__); + return -1; +} + +/* + * Initializes a struct virtio_backing using the list of fds. + */ static int -virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type) +virtio_init_disk(struct virtio_backing *file, off_t *sz, + int *fd, size_t nfd, int type) { /* * probe disk types in order of preference, first one to work wins. * TODO: provide a way of specifying the type and options. */ switch (type) { - case VMDF_RAW: return virtio_init_raw(file, sz, fd); - case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd); + case VMDF_RAW: return virtio_init_raw(file, sz, fd, nfd); + case VMDF_QCOW2: return virtio_init_qcow2(file, sz, fd, nfd); } log_warnx("%s: invalid disk format", __func__); return -1; } void -virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks, - int *child_taps) +virtio_init(struct vmd_vm *vm, int child_cdrom, + int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps) { struct vmop_create_params *vmc = &vm->vm_params; struct vm_create_params *vcp = &vmc->vmc_params; @@ -1838,7 +1853,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks, vioblk[i].vm_id = vcp->vcp_id; vioblk[i].irq = pci_get_dev_irq(id); if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz, - child_disks[i], vmc->vmc_disktypes[i]) == -1) { + child_disks[i], vmc->vmc_diskbases[i], + vmc->vmc_disktypes[i]) == -1) { log_warnx("%s: unable to determine disk format", __func__); return; @@ -1967,7 +1983,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks, vioscsi->vq[i].last_avail = 0; } if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, - child_cdrom, VMDF_RAW) == -1) { + &child_cdrom, 1, VMDF_RAW) == -1) { log_warnx("%s: unable to determine iso format", __func__); return; @@ -2123,7 +2139,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps) } int -vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks) +vioblk_restore(int fd, struct vmop_create_params *vmc, + int child_disks[][VMM_MAX_BASE_PER_DISK]) { struct vm_create_params *vcp = &vmc->vmc_params; uint8_t i; @@ -2149,7 +2166,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks) return (-1); } if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz, - child_disks[i], vmc->vmc_disktypes[i]) == -1) { + child_disks[i], vmc->vmc_diskbases[i], + vmc->vmc_disktypes[i]) == -1) { log_warnx("%s: unable to determine disk format", __func__); return (-1); @@ -2186,7 +2204,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom) return (-1); } - if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom, + if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1, VMDF_RAW) == -1) { log_warnx("%s: unable to determine iso format", __func__); return (-1); @@ -2198,8 +2216,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int child_cdrom) } int -virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks, - int *child_taps) +virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, + int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps) { struct vmop_create_params *vmc = &vm->vm_params; struct vm_create_params *vcp = &vmc->vmc_params; diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h index 46006916b6a..91f0e323204 100644 --- usr.sbin/vmd/virtio.h +++ usr.sbin/vmd/virtio.h @@ -257,10 +257,11 @@ struct ioinfo { }; /* virtio.c */ -void virtio_init(struct vmd_vm *, int, int *, int *); +void virtio_init(struct vmd_vm *, int, int[][VMM_MAX_BASE_PER_DISK], int *); void virtio_shutdown(struct vmd_vm *); int virtio_dump(int); -int virtio_restore(int, struct vmd_vm *, int, int *, int *); +int virtio_restore(int, struct vmd_vm *, int, + int[][VMM_MAX_BASE_PER_DISK], int *); uint32_t vring_size(uint32_t); int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); @@ -270,12 +271,14 @@ void viornd_update_qs(void); void viornd_update_qa(void); int viornd_notifyq(void); -int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd); -int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd); +ssize_t virtio_qcow2_get_base(int, char *, size_t); +int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t); +int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t); int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int vioblk_dump(int); -int vioblk_restore(int, struct vmop_create_params *, int *); +int vioblk_restore(int, struct vmop_create_params *, + int[][VMM_MAX_BASE_PER_DISK]); void vioblk_update_qs(struct vioblk_dev *); void vioblk_update_qa(struct vioblk_dev *); int vioblk_notifyq(struct vioblk_dev *); diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c index ef4494d918b..7c9aaf27012 100644 --- usr.sbin/vmd/vm.c +++ usr.sbin/vmd/vm.c @@ -65,8 +65,8 @@ io_fn_t ioports_map[MAX_PORTS]; -int run_vm(int, int *, int *, struct vmop_create_params *, - struct vcpu_reg_state *); +int run_vm(int, int[][VMM_MAX_BASE_PER_DISK], int *, + struct vmop_create_params *, struct vcpu_reg_state *); void vm_dispatch_vmm(int, short, void *); void *event_thread(void *); void *vcpu_run_loop(void *); @@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); void create_memory_map(struct vm_create_params *); int alloc_guest_mem(struct vm_create_params *); int vmm_create_vm(struct vm_create_params *); -void init_emulated_hw(struct vmop_create_params *, int, int *, int *); -void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int); +void init_emulated_hw(struct vmop_create_params *, int, + int[][VMM_MAX_BASE_PER_DISK], int *); +void restore_emulated_hw(struct vm_create_params *, int, int *, + int[][VMM_MAX_BASE_PER_DISK],int); void vcpu_exit_inout(struct vm_run_params *); uint8_t vcpu_exit_pci(struct vm_run_params *); int vcpu_pic_intr(uint32_t, uint32_t, uint8_t); @@ -327,7 +329,7 @@ start_vm(struct vmd_vm *vm, int fd) /* Find and open kernel image */ if ((fp = vmboot_open(vm->vm_kernel, - vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL) + vm->vm_disks[0][0], vmc->vmc_disktypes[0], &vmboot)) == NULL) fatalx("failed to open kernel - exiting"); /* Load kernel image */ @@ -903,7 +905,7 @@ vmm_create_vm(struct vm_create_params *vcp) */ void init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, - int *child_disks, int *child_taps) + int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps) { struct vm_create_params *vcp = &vmc->vmc_params; int i; @@ -968,7 +970,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom, */ void restore_emulated_hw(struct vm_create_params *vcp, int fd, - int *child_taps, int *child_disks, int child_cdrom) + int *child_taps, int child_disks[][VMM_MAX_BASE_PER_DISK], int child_cdrom) { /* struct vm_create_params *vcp = &vmc->vmc_params; */ int i; @@ -1029,8 +1031,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd, * !0 : the VM exited abnormally or failed to start */ int -run_vm(int child_cdrom, int *child_disks, int *child_taps, - struct vmop_create_params *vmc, struct vcpu_reg_state *vrs) +run_vm(int child_cdrom, int child_disks[][VMM_MAX_BASE_PER_DISK], + int *child_taps, struct vmop_create_params *vmc, + struct vcpu_reg_state *vrs) { struct vm_create_params *vcp = &vmc->vmc_params; struct vm_rwregs_params vregsp; diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c index 44ceeb64a7e..f23d8d6b25a 100644 --- usr.sbin/vmd/vmboot.c +++ usr.sbin/vmd/vmboot.c @@ -414,13 +414,13 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int disk_type, switch (vmboot->vbp_type) { case VMDF_RAW: - if (virtio_init_raw(vfp, &sz, disk_fd) == -1) { + if (virtio_init_raw(vfp, &sz, &disk_fd, 1) == -1) { log_debug("%s: could not open raw disk", __func__); goto fail; } break; case VMDF_QCOW2: - if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) { + if (virtio_init_qcow2(vfp, &sz, &disk_fd, 1) == -1) { log_debug("%s: could not open qcow2 disk", __func__); goto fail; } diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c index 1571be21bc5..8377812c316 100644 --- usr.sbin/vmd/vmd.c +++ usr.sbin/vmd/vmd.c @@ -1097,7 +1097,7 @@ void vm_stop(struct vmd_vm *vm, int keeptty, const char *caller) { struct privsep *ps = &env->vmd_ps; - unsigned int i; + unsigned int i, j; if (vm == NULL) return; @@ -1117,9 +1117,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char *caller) close(vm->vm_iev.ibuf.fd); } for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) { - if (vm->vm_disks[i] != -1) { - close(vm->vm_disks[i]); - vm->vm_disks[i] = -1; + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) { + if (vm->vm_disks[i][j] != -1) { + close(vm->vm_disks[i][j]); + vm->vm_disks[i][j] = -1; + } } } for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) { @@ -1176,7 +1178,7 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc, struct vmop_owner *vmo = NULL; struct vmd_user *usr = NULL; uint32_t rng; - unsigned int i; + unsigned int i, j; struct vmd_switch *sw; char *s; @@ -1267,7 +1269,8 @@ vm_register(struct privsep *ps, struct vmop_create_params *vmc, vm->vm_user = usr; for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) - vm->vm_disks[i] = -1; + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) + vm->vm_disks[i][j] = -1; for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) vm->vm_ifs[i].vif_fd = -1; for (i = 0; i < vcp->vcp_nnics; i++) { diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h index b348d12c757..0683629b3b1 100644 --- usr.sbin/vmd/vmd.h +++ usr.sbin/vmd/vmd.h @@ -48,6 +48,7 @@ #define VM_DEFAULT_DEVICE "hd0a" #define VM_BOOT_CONF "/etc/boot.conf" #define VM_NAME_MAX 64 +#define VM_MAX_BASE_PER_DISK 4 #define VM_TTYNAME_MAX 16 #define MAX_TAP 256 #define NR_BACKLOG 5 @@ -169,6 +170,7 @@ struct vmop_create_params { #define VMIFF_OPTMASK (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN) unsigned int vmc_disktypes[VMM_MAX_DISKS_PER_VM]; + unsigned int vmc_diskbases[VMM_MAX_DISKS_PER_VM]; #define VMDF_RAW 0x01 #define VMDF_QCOW2 0x02 @@ -241,7 +243,7 @@ struct vmd_vm { uint32_t vm_vmid; int vm_kernel; int vm_cdrom; - int vm_disks[VMM_MAX_DISKS_PER_VM]; + int vm_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK]; struct vmd_if vm_ifs[VMM_MAX_NICS_PER_VM]; char *vm_ttyname; int vm_tty; @@ -415,4 +417,7 @@ int parse_config(const char *); int cmdline_symset(char *); int host(const char *, struct address *); +/* virtio.c */ +int virtio_get_base(int, char *, size_t, int); + #endif /* VMD_H */ diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c index 7757856323f..3176fd85713 100644 --- usr.sbin/vmd/vmm.c +++ usr.sbin/vmd/vmm.c @@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) struct vmd_vm *vm; int ret = EINVAL; int fds[2]; - size_t i; + size_t i, j; if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) { log_warnx("%s: can't find vm", __func__); @@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid) close(fds[1]); for (i = 0 ; i < vcp->vcp_ndisks; i++) { - close(vm->vm_disks[i]); - vm->vm_disks[i] = -1; + for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) { + if (vm->vm_disks[i][j] != -1) + close(vm->vm_disks[i][j]); + vm->vm_disks[i][j] = -1; + } } for (i = 0 ; i < vcp->vcp_nnics; i++) { close(vm->vm_ifs[i].vif_fd); -- Ori Bernstein