On Mon, 1 Oct 2018 11:24:01 -0700, Ori Bernstein <o...@eigenstate.org> wrote:

> On Mon, 1 Oct 2018 12:55:12 +0200
> Reyk Floeter <r...@openbsd.org> wrote:
> 
> > Hi Ori,
> > 
> > On Sun, Sep 30, 2018 at 12:27:00PM -0700, Ori Bernstein wrote:
> > > I've added support to vmd for external snapshots. That is,
> > > snapshots that are derived from a base image. Data lookups
> > > start in the derived image, and if the derived image does not
> > > contain some data, the search proceeds ot the base image.
> > > Multiple derived images may exist off of a single base image.
> > > 
> > 
> > Nice work!  This will be quite useful, thanks.
> > 
> > I think I broke your diff as my last commit to derive the raw/qcow2
> > format introduced some conflicts.  I had posted it on hackers@ and
> > forgot that your aren't on the internal list yet - sorry for that.

Updated version. Changes from the last diff:

- Merge in syntax changes. 
- Don't over-read when getting the base images.
- Fix relative paths in base images.
- Allow multiple derived images to use a single base image, and allow a user
  with only read permisssions to base their images on top of it.
- Probe the base image size, use/validate it when craeting disk images.
- Fix style a bit (long lines, changing from sizeof foo to sizeof(foo).
- Move a define out of vmmvar.h
- And update the manpage with these changes.
- Improve error checking around creating/resolving base disk paths.


diff --git regress/usr.sbin/vmd/diskfmt/Makefile 
regress/usr.sbin/vmd/diskfmt/Makefile
index c2a5f42d5f6..1f8673e0e26 100644
--- regress/usr.sbin/vmd/diskfmt/Makefile
+++ regress/usr.sbin/vmd/diskfmt/Makefile
@@ -11,7 +11,7 @@
 VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
 
 PROG=vioscribble
-SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+SRCS=vioscribble.c vioqcow2.c vioraw.c
 CFLAGS+=-I$(VMD_DIR) -pthread
 LDFLAGS+=-pthread
 
@@ -26,3 +26,6 @@ scribble-images:
 .PHONY: ${REGRESS_TARGETS} scribble-images
 
 .include <bsd.regress.mk>
+
+vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
+       cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c 
regress/usr.sbin/vmd/diskfmt/vioscribble.c
index 14d720db652..1da8efedac7 100644
--- regress/usr.sbin/vmd/diskfmt/vioscribble.c
+++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
@@ -122,16 +122,18 @@ main(int argc, char **argv)
        verbose = !!getenv("VERBOSE");
        qcfd = open("scribble.qc2", O_RDWR);
        rawfd = open("scribble.raw", O_RDWR);
-       if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
+       if (qcfd == -1)
                err(1, "unable to open qcow");
-       if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
+       if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
+               err(1, "unable to init qcow");
+       if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
                err(1, "unable to open raw");
 
        srandom_deterministic(123);
 
        /* scribble to both disks */
        printf("scribbling...\n");
-       for (i = 0; i < 16; i++) {
+       for (i = 0; i < 1024*16; i++) {
                off = (random() % DISKSZ);
                len = random() % sizeof buf + 1;
                fill(off, buf, sizeof buf);
diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
index 8748ecfdedc..4637256452b 100644
--- usr.sbin/vmctl/main.c
+++ usr.sbin/vmctl/main.c
@@ -67,7 +67,8 @@ int            ctl_receive(struct parse_result *, int, char 
*[]);
 
 struct ctl_command ctl_commands[] = {
        { "console",    CMD_CONSOLE,    ctl_console,    "id" },
-       { "create",     CMD_CREATE,     ctl_create,     "\"path\" -s size", 1 },
+       { "create",     CMD_CREATE,     ctl_create,     
+               "\"path\" [-s size] [-b base]", 1 },
        { "load",       CMD_LOAD,       ctl_load,       "\"path\"" },
        { "log",        CMD_LOG,        ctl_log,        "[verbose|brief]" },
        { "reload",     CMD_RELOAD,     ctl_reload,     "" },
@@ -538,47 +539,54 @@ int
 ctl_create(struct parse_result *res, int argc, char *argv[])
 {
        int              ch, ret, type;
-       const char      *paths[2], *disk, *format;
+       const char      *disk, *format, *base;
 
        if (argc < 2)
                ctl_usage(res->ctl);
 
+       base = NULL;
        type = parse_disktype(argv[1], &disk);
 
-       paths[0] = disk;
-       paths[1] = NULL;
-
-       if (unveil(paths[0], "rwc") == -1)
+       if (unveil(disk, "rwc") == -1)
                err(1, "unveil");
 
-       if (pledge("stdio rpath wpath cpath", NULL) == -1)
-               err(1, "pledge");
        argc--;
        argv++;
 
-       while ((ch = getopt(argc, argv, "s:")) != -1) {
+       while ((ch = getopt(argc, argv, "s:b:")) != -1) {
                switch (ch) {
                case 's':
                        if (parse_size(res, optarg, 0) != 0)
                                errx(1, "invalid size: %s", optarg);
                        break;
+               case 'b':
+                       base = optarg;
+                       if (unveil(base, "r") == -1)
+                               err(1, "unveil");
+                       break;
                default:
                        ctl_usage(res->ctl);
                        /* NOTREACHED */
                }
        }
 
-       if (res->size == 0) {
-               fprintf(stderr, "missing size argument\n");
+       if (pledge("stdio rpath wpath cpath", NULL) == -1)
+               err(1, "pledge");
+
+       if (base && type != VMDF_QCOW2)
+               errx(1, "base images require qcow2 disk format");
+       if (res->size == 0 && !base) {
+               fprintf(stderr, "could not create %s: missing size argument\n",
+                   disk);
                ctl_usage(res->ctl);
        }
 
        if (type == VMDF_QCOW2) {
                format = "qcow2";
-               ret = create_qc2_imagefile(paths[0], res->size);
+               ret = create_qc2_imagefile(disk, base, res->size);
        } else {
                format = "raw";
-               ret = create_raw_imagefile(paths[0], res->size);
+               ret = create_raw_imagefile(disk, res->size);
        }
 
        if (ret != 0) {
diff --git usr.sbin/vmctl/vmctl.8 usr.sbin/vmctl/vmctl.8
index f7890ac99f8..7a02452789c 100644
--- usr.sbin/vmctl/vmctl.8
+++ usr.sbin/vmctl/vmctl.8
@@ -50,7 +50,7 @@ Using
 .Xr cu 1
 connect to the console of the VM with the specified
 .Ar id .
-.It Cm create Ar path Fl s Ar size
+.It Cm create Ar path Fl s Op Ar size Op  Fl b Ar base
 Creates a VM disk image file with the specified
 .Ar path
 and
@@ -65,7 +65,14 @@ or
 in order to specify the disk format.
 If left unspecified, the format defaults to
 .Pa raw
-if it cannot be derived automatically.
+if it cannot be derived automatically.  For qcow2, a
+.Ar base
+image may be specified. The base image is not modified. The derived image
+contains only the changes written by the VM. When creating a derived image,
+the
+.Ar size
+may be omitted, and probed from the base image. If it is provided, it must
+match the base image size.
 .It Cm load Ar filename
 Load additional configuration from the specified file.
 .It Cm log brief
diff --git usr.sbin/vmctl/vmctl.c usr.sbin/vmctl/vmctl.c
index b09e1115ff7..81fbdba9a28 100644
--- usr.sbin/vmctl/vmctl.c
+++ usr.sbin/vmctl/vmctl.c
@@ -847,7 +847,8 @@ create_raw_imagefile(const char *imgfile_path, long imgsize)
 #define ALIGN(sz, align) \
        ((sz + align - 1) & ~(align - 1))
 int
-create_qc2_imagefile(const char *imgfile_path, long imgsize)
+create_qc2_imagefile(const char *imgfile_path,
+    const char *base_path, long imgsize)
 {
        struct qcheader {
                char magic[4];
@@ -869,15 +870,30 @@ create_qc2_imagefile(const char *imgfile_path, long 
imgsize)
                uint64_t autoclearfeatures;
                uint32_t reforder;
                uint32_t headersz;
-       } __packed hdr;
+       } __packed hdr, basehdr;
        int fd, ret;
+       ssize_t base_len;
        uint64_t l1sz, refsz, disksz, initsz, clustersz;
        uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
        uint16_t refs;
 
        disksz = 1024*1024*imgsize;
+
+       if (base_path) {
+               fd = open(base_path, O_RDONLY);
+               if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
+                       err(1, "failure to read base image header");
+               close(fd);
+               if (!disksz)
+                       disksz = betoh64(basehdr.disksz);
+               else if (disksz != betoh64(basehdr.disksz))
+                       errx(1, "base size does not match requested size");
+       }
+       if (!base_path && !disksz)
+               errx(1, "missing disk size");
+
        clustersz = (1<<16);
-       l1off = ALIGN(sizeof hdr, clustersz);
+       l1off = ALIGN(sizeof(hdr), clustersz);
 
        l1entrysz = clustersz * clustersz / 8;
        l1sz = (disksz + l1entrysz - 1) / l1entrysz;
@@ -887,11 +903,12 @@ create_qc2_imagefile(const char *imgfile_path, long 
imgsize)
        refsz = (disksz + refentrysz - 1) / refentrysz;
 
        initsz = ALIGN(refoff + refsz*clustersz, clustersz);
+       base_len = base_path ? strlen(base_path) : 0;
 
-       memcpy(hdr.magic, "QFI\xfb", 4);
+       memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
        hdr.version             = htobe32(3);
-       hdr.backingoff          = htobe64(0);
-       hdr.backingsz           = htobe32(0);
+       hdr.backingoff          = htobe64(base_path ? sizeof(hdr) : 0);
+       hdr.backingsz           = htobe32(base_len);
        hdr.clustershift        = htobe32(16);
        hdr.disksz              = htobe64(disksz);
        hdr.cryptmethod         = htobe32(0);
@@ -905,7 +922,7 @@ create_qc2_imagefile(const char *imgfile_path, long imgsize)
        hdr.compatfeatures      = htobe64(0);
        hdr.autoclearfeatures   = htobe64(0);
        hdr.reforder            = htobe32(4);
-       hdr.headersz            = htobe32(sizeof hdr);
+       hdr.headersz            = htobe32(sizeof(hdr));
 
        /* Refuse to overwrite an existing image */
        fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
@@ -914,7 +931,11 @@ create_qc2_imagefile(const char *imgfile_path, long 
imgsize)
                return (errno);
 
        /* Write out the header */
-       if (write(fd, &hdr, sizeof hdr) != sizeof hdr)
+       if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+               goto error;
+
+       /* Add the base image */
+       if (base_path && write(fd, base_path, base_len) != base_len)
                goto error;
 
        /* Extend to desired size, and add one refcount cluster */
diff --git usr.sbin/vmctl/vmctl.h usr.sbin/vmctl/vmctl.h
index 006411d9785..cdf50cad755 100644
--- usr.sbin/vmctl/vmctl.h
+++ usr.sbin/vmctl/vmctl.h
@@ -87,7 +87,7 @@ __dead void
 
 /* vmctl.c */
 int     create_raw_imagefile(const char *, long);
-int     create_qc2_imagefile(const char *, long);
+int     create_qc2_imagefile(const char *, const char*, long);
 int     vm_start(uint32_t, const char *, int, int, char **, int,
            char **, int *, char *, char *, char *);
 int     vm_start_complete(struct imsg *, int *, int);
diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
index 550b73c1a39..8350d0fc4fc 100644
--- usr.sbin/vmd/config.c
+++ usr.sbin/vmd/config.c
@@ -35,6 +35,7 @@
 #include <util.h>
 #include <errno.h>
 #include <imsg.h>
+#include <libgen.h>
 
 #include "proc.h"
 #include "vmd.h"
@@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
 int
 config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t uid)
 {
+       int diskfds[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
        struct vmd_if           *vif;
        struct vmop_create_params *vmc = &vm->vm_params;
        struct vm_create_params *vcp = &vmc->vmc_params;
-       unsigned int             i;
+       unsigned int             i, j;
        int                      fd = -1, vmboot = 0;
-       int                      kernfd = -1, *diskfds = NULL, *tapfds = NULL;
+       int                      kernfd = -1;
+       int                     *tapfds;
        int                      cdromfd = -1;
        int                      saved_errno = 0;
+       int                      n = 0, aflags, oflags;
        char                     ifname[IF_NAMESIZE], *s;
        char                     path[PATH_MAX];
+       char                     base[PATH_MAX];
+       char                     expanded[PATH_MAX];
        unsigned int             unit;
 
        errno = 0;
@@ -205,13 +211,9 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, 
uint32_t peerid, uid_t uid)
                }
        }
 
-       diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
-       if (diskfds == NULL) {
-               log_warn("%s: can't allocate disk fds", __func__);
-               goto fail;
-       }
-       for (i = 0; i < vcp->vcp_ndisks; i++)
-               diskfds[i] = -1;
+       for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
+               for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+                       diskfds[i][j] = -1;
 
        tapfds = reallocarray(NULL, vcp->vcp_nnics, sizeof(*tapfds));
        if (tapfds == NULL) {
@@ -289,22 +291,71 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, 
uint32_t peerid, uid_t uid)
 
        /* Open disk images for child */
        for (i = 0 ; i < vcp->vcp_ndisks; i++) {
-                /* Stat disk[i] to ensure it is a regular file */
-               if ((diskfds[i] = open(vcp->vcp_disks[i],
-                   O_RDWR|O_EXLOCK|O_NONBLOCK)) == -1) {
-                       log_warn("%s: can't open disk %s", __func__,
-                           vcp->vcp_disks[i]);
-                       errno = VMD_DISK_MISSING;
-                       goto fail;
-               }
+               if (strlcpy(path, vcp->vcp_disks[i], sizeof path) > PATH_MAX)
+                       log_warnx("%s, disk path too long", __func__);
+               memset(vmc->vmc_diskbases, 0, sizeof vmc->vmc_diskbases);
+               oflags = O_RDWR|O_EXLOCK|O_NONBLOCK;
+               aflags = R_OK|W_OK;
+               for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+                       /* Stat disk[i] to ensure it is a regular file */
+                       if ((diskfds[i][j] = open(path, oflags)) == -1) {
+                               log_warn("%s: can't open disk %s", __func__,
+                                   vcp->vcp_disks[i]);
+                               errno = VMD_DISK_MISSING;
+                               goto fail;
+                       }
 
-               if (vm_checkaccess(diskfds[i],
-                   vmc->vmc_checkaccess & VMOP_CREATE_DISK,
-                   uid, R_OK|W_OK) == -1) {
-                       log_warnx("vm \"%s\" no read/write access to disk %s",
-                           vcp->vcp_name, vcp->vcp_disks[i]);
-                       errno = EPERM;
-                       goto fail;
+                       if (vm_checkaccess(diskfds[i][j],
+                           vmc->vmc_checkaccess & VMOP_CREATE_DISK,
+                           uid, R_OK|W_OK) == -1) {
+                               log_warnx("vm \"%s\" unable to access "
+                                   "disk %s", vcp->vcp_name,
+                                   vcp->vcp_disks[i]);
+                               errno = EPERM;
+                               goto fail;
+                       }
+
+                       /*
+                        * Clear the write and exclusive flags for base images.
+                        * All writes should go to the top image, allowing them
+                        * to be shared.
+                        */
+                       oflags = O_RDONLY|O_NONBLOCK;
+                       aflags = R_OK;
+                       n = virtio_get_base(diskfds[i][j], base, sizeof base,
+                           vmc->vmc_disktypes[i]);
+                       if (n == 0)
+                               break;
+                       if (n == -1) {
+                               log_warnx("vm \"%s\" unable to read "
+                                   "base for disk %s", vcp->vcp_name,
+                                   vcp->vcp_disks[i]);
+                               goto fail;
+                       }
+                       /*
+                        * Relative paths should be interpreted relative
+                        * to the disk image, rather than relative to the
+                        * directory vmd happens to be running in, since
+                        * this is the only userful interpretation.
+                        */
+                       if (base[0] == '/') {
+                               if (realpath(base, path) == NULL) {
+                                       log_warn("unable to resolve %s", base);
+                                       goto fail;
+                               }
+                       } else {
+                               s = dirname(path);
+                               if (snprintf(expanded, sizeof(expanded),
+                                   "%s/%s", s, base) >= (int)sizeof(expanded)) 
{
+                                       log_warn("path too long: %s/%s",
+                                           s, base);
+                                       goto fail;
+                               }
+                               if (realpath(expanded, path) == NULL) {
+                                       log_warn("unable to resolve %s", base);
+                                       goto fail;
+                               }
+                       }
                }
        }
 
@@ -402,9 +453,13 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, 
uint32_t peerid, uid_t uid)
                    NULL, 0);
 
        for (i = 0; i < vcp->vcp_ndisks; i++) {
-               proc_compose_imsg(ps, PROC_VMM, -1,
-                   IMSG_VMDOP_START_VM_DISK, vm->vm_vmid, diskfds[i],
-                   &i, sizeof(i));
+               for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+                       if (diskfds[i][j] == -1)
+                               break;
+                       proc_compose_imsg(ps, PROC_VMM, -1,
+                           IMSG_VMDOP_START_VM_DISK, vm->vm_vmid,
+                           diskfds[i][j], &i, sizeof(i));
+               }
        }
        for (i = 0; i < vcp->vcp_nnics; i++) {
                proc_compose_imsg(ps, PROC_VMM, -1,
@@ -416,7 +471,6 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, 
uint32_t peerid, uid_t uid)
                proc_compose_imsg(ps, PROC_VMM, -1,
                    IMSG_VMDOP_START_VM_END, vm->vm_vmid, fd,  NULL, 0);
 
-       free(diskfds);
        free(tapfds);
 
        vm->vm_running = 1;
@@ -430,11 +484,10 @@ config_setvm(struct privsep *ps, struct vmd_vm *vm, 
uint32_t peerid, uid_t uid)
                close(kernfd);
        if (cdromfd != -1)
                close(cdromfd);
-       if (diskfds != NULL) {
-               for (i = 0; i < vcp->vcp_ndisks; i++)
-                       close(diskfds[i]);
-               free(diskfds);
-       }
+       for (i = 0; i < vcp->vcp_ndisks; i++)
+               for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+                       if (diskfds[i][j] != -1)
+                               close(diskfds[i][j]);
        if (tapfds != NULL) {
                for (i = 0; i < vcp->vcp_nnics; i++)
                        close(tapfds[i]);
@@ -489,7 +542,7 @@ int
 config_getdisk(struct privsep *ps, struct imsg *imsg)
 {
        struct vmd_vm   *vm;
-       unsigned int     n;
+       unsigned int     n, idx;
 
        errno = 0;
        if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
@@ -500,14 +553,18 @@ config_getdisk(struct privsep *ps, struct imsg *imsg)
        IMSG_SIZE_CHECK(imsg, &n);
        memcpy(&n, imsg->data, sizeof(n));
 
-       if (n >= vm->vm_params.vmc_params.vcp_ndisks ||
-           vm->vm_disks[n] != -1 || imsg->fd == -1) {
+       if (n >= vm->vm_params.vmc_params.vcp_ndisks || imsg->fd == -1) {
                log_warnx("invalid disk id");
                errno = EINVAL;
                return (-1);
        }
-       vm->vm_disks[n] = imsg->fd;
-
+       idx = vm->vm_params.vmc_diskbases[n]++;
+       if (idx >= VMM_MAX_BASE_PER_DISK) {
+               log_warnx("too many bases for disk");
+               errno = EINVAL;
+               return (-1);
+       }
+       vm->vm_disks[n][idx] = imsg->fd;
        return (0);
 }
 
diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
index c3211d186fa..81dca129896 100644
--- usr.sbin/vmd/vioqcow2.c
+++ usr.sbin/vmd/vioqcow2.c
@@ -104,8 +104,7 @@ static off_t xlate(struct qcdisk *, off_t, int *);
 static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
 static int inc_refs(struct qcdisk *, off_t, int);
-static int qc2_openpath(struct qcdisk *, char *, int);
-static int qc2_open(struct qcdisk *, int);
+static int qc2_open(struct qcdisk *, int *, size_t);
 static ssize_t qc2_pread(void *, char *, size_t, off_t);
 static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
 static void qc2_close(void *, int);
@@ -118,14 +117,14 @@ static void qc2_close(void *, int);
  * May open snapshot base images.
  */
 int
-virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
        struct qcdisk *diskp;
 
        diskp = malloc(sizeof(struct qcdisk));
        if (diskp == NULL)
                return -1;
-       if (qc2_open(diskp, fd) == -1) {
+       if (qc2_open(diskp, fd, nfd) == -1) {
                log_warnx("%s: could not open qcow2 disk", __func__);
                return -1;
        }
@@ -137,19 +136,41 @@ virtio_init_qcow2(struct virtio_backing *file, off_t 
*szp, int fd)
        return 0;
 }
 
-static int
-qc2_openpath(struct qcdisk *disk, char *path, int flags)
+ssize_t
+virtio_qcow2_get_base(int fd, char *path, size_t npath)
 {
-       int fd;
+       struct qcheader header;
+       uint64_t backingoff;
+       uint32_t backingsz;
 
-       fd = open(path, flags);
-       if (fd < 0)
+       if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
+               log_warnx("%s: short read on header", __func__);
+               return -1;
+       }
+       if (strncmp(header.magic, "QFI\xfb", 4) != 0) {
+               log_warn("%s: invalid magic numbers", __func__);
                return -1;
-       return qc2_open(disk, fd);
+       }
+       backingoff = be64toh(header.backingoff);
+       backingsz = be32toh(header.backingsz);
+       if (backingsz != 0) {
+               if (backingsz >= npath - 1) {
+                       log_warn("%s: snapshot path too long", __func__);
+                       return -1;
+               }
+               if (pread(fd, path, backingsz, backingoff) != backingsz) {
+                       log_warnx("%s: could not read snapshot base name",
+                           __func__);
+                       return -1;
+               }
+               path[backingsz] = '\0';
+               log_warnx("path: %s\n", path);
+       }
+       return backingsz;
 }
 
 static int
-qc2_open(struct qcdisk *disk, int fd)
+qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
 {
        char basepath[PATH_MAX];
        struct stat st;
@@ -157,14 +178,15 @@ qc2_open(struct qcdisk *disk, int fd)
        uint64_t backingoff;
        uint32_t backingsz;
        size_t i;
-       int version;
+       int version, fd;
 
        pthread_rwlock_init(&disk->lock, NULL);
+       fd = fds[0];
        disk->fd = fd;
        disk->base = NULL;
        disk->l1 = NULL;
 
-       if (pread(fd, &header, sizeof header, 0) != sizeof header) {
+       if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
                log_warn("%s: short read on header", __func__);
                goto error;
        }
@@ -203,7 +225,7 @@ qc2_open(struct qcdisk *disk, int fd)
                goto error;
        }
 
-       disk->l1 = calloc(disk->l1sz, sizeof *disk->l1);
+       disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
        if (!disk->l1)
                goto error;
        if (pread(disk->fd, disk->l1, 8*disk->l1sz, disk->l1off)
@@ -222,14 +244,7 @@ qc2_open(struct qcdisk *disk, int fd)
        backingoff = be64toh(header.backingoff);
        backingsz = be32toh(header.backingsz);
        if (backingsz != 0) {
-               /*
-                * FIXME: we need to figure out a way of opening these things,
-                * otherwise we just crash with a pledge violation.
-                */
-               log_warn("%s: unsupported external snapshot images", __func__);
-               goto error;
-
-               if (backingsz >= sizeof basepath - 1) {
+               if (backingsz >= sizeof(basepath) - 1) {
                        log_warn("%s: snapshot path too long", __func__);
                        goto error;
                }
@@ -239,11 +254,17 @@ qc2_open(struct qcdisk *disk, int fd)
                        goto error;
                }
                basepath[backingsz] = 0;
+               if (nfd <= 1) {
+                       log_warnx("%s: missing base image %s", __func__, 
+                           basepath);
+                       goto error;
+               }
+
 
                disk->base = calloc(1, sizeof(struct qcdisk));
                if (!disk->base)
                        goto error;
-               if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
+               if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) {
                        log_warn("%s: could not open %s", basepath, __func__);
                        goto error;
                }
@@ -521,12 +542,12 @@ mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t 
off, off_t src_phys)
        cluster = disk->end;
        disk->end += disk->clustersz;
        buf = htobe64(cluster | QCOW2_INPLACE);
-       if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf))
+       if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off*8) != sizeof(buf))
                goto fail;
 
        /* TODO: lazily sync: currently VMD doesn't close things */
        buf = htobe64(disk->l1[l1off]);
-       if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8)
+       if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8*l1off) != 8)
                goto fail;
        if (inc_refs(disk, cluster, 1) == -1)
                goto fail;
@@ -571,7 +592,7 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
        l1idx = (off / disk->clustersz) / nper;
        l2idx = (off / disk->clustersz) % nper;
        l1off = disk->refoff + 8*l1idx;
-       if (pread(disk->fd, &buf, sizeof buf, l1off) != 8)
+       if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
                return -1;
 
        l2cluster = be64toh(buf);
@@ -583,19 +604,20 @@ inc_refs(struct qcdisk *disk, off_t off, int newcluster)
                        return -1;
                }
                buf = htobe64(l2cluster);
-               if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) {
+               if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) {
                        return -1;
                }
        }
 
        refs = 1;
        if (!newcluster) {
-               if (pread(disk->fd, &refs, sizeof refs, l2cluster+2*l2idx) != 2)
+               if (pread(disk->fd, &refs, sizeof(refs), 
+                   l2cluster+2*l2idx) != 2)
                        return -1;
                refs = be16toh(refs) + 1;
        }
        refs = htobe16(refs);
-       if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) {
+       if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2*l2idx) != 2) {
                log_warn("%s: could not write ref block", __func__);
                return -1;
        }
diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
index e02ab67c5dc..ff4bbb3095e 100644
--- usr.sbin/vmd/vioraw.c
+++ usr.sbin/vmd/vioraw.c
@@ -53,19 +53,21 @@ raw_close(void *file, int stayopen)
  * returning -1 for error, 0 for success.
  */
 int
-virtio_init_raw(struct virtio_backing *file, off_t *szp, int fd)
+virtio_init_raw(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
 {
        off_t sz;
        int *fdp;
 
-       sz = lseek(fd, 0, SEEK_END);
+       if (nfd != 1)
+               return -1;
+       sz = lseek(fd[0], 0, SEEK_END);
        if (sz == -1)
                return -1;
 
        fdp = malloc(sizeof(int));
        if (!fdp)
                return -1;
-       *fdp = fd;
+       *fdp = fd[0];
        file->p = fdp;
        file->pread = raw_pread;
        file->pwrite = raw_pwrite;
diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
index 93490344560..1c66f33216a 100644
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -1745,24 +1745,39 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t 
*intr,
        return (0);
 }
 
+int
+virtio_get_base(int fd, char *path, size_t npath ,int type)
+{
+       switch (type) {
+       case VMDF_RAW:          return 0;
+       case VMDF_QCOW2:        return virtio_qcow2_get_base(fd, path, npath);
+       }
+       log_warnx("%s: invalid disk format", __func__);
+       return -1;
+}
+
+/*
+ * Initializes a struct virtio_backing using the list of fds.
+ */
 static int
-virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd, int type)
+virtio_init_disk(struct virtio_backing *file, off_t *sz,
+    int *fd, size_t nfd, int type)
 {
        /* 
         * probe disk types in order of preference, first one to work wins.
         * TODO: provide a way of specifying the type and options.
         */
        switch (type) {
-       case VMDF_RAW:          return virtio_init_raw(file, sz, fd);
-       case VMDF_QCOW2:        return virtio_init_qcow2(file, sz, fd);
+       case VMDF_RAW:          return virtio_init_raw(file, sz, fd, nfd);
+       case VMDF_QCOW2:        return virtio_init_qcow2(file, sz, fd, nfd);
        }
        log_warnx("%s: invalid disk format", __func__);
        return -1;
 }
 
 void
-virtio_init(struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_init(struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
        struct vmop_create_params *vmc = &vm->vm_params;
        struct vm_create_params *vcp = &vmc->vmc_params;
@@ -1838,7 +1853,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int 
*child_disks,
                        vioblk[i].vm_id = vcp->vcp_id;
                        vioblk[i].irq = pci_get_dev_irq(id);
                        if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-                           child_disks[i], vmc->vmc_disktypes[i]) == -1) {
+                           child_disks[i], vmc->vmc_diskbases[i], 
+                           vmc->vmc_disktypes[i]) == -1) {
                                log_warnx("%s: unable to determine disk format",
                                    __func__);
                                return;
@@ -1967,7 +1983,7 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, int 
*child_disks,
                        vioscsi->vq[i].last_avail = 0;
                }
                if (virtio_init_disk(&vioscsi->file, &vioscsi->sz,
-                   child_cdrom, VMDF_RAW) == -1) {
+                   &child_cdrom, 1, VMDF_RAW) == -1) {
                        log_warnx("%s: unable to determine iso format",
                            __func__);
                        return;
@@ -2123,7 +2139,8 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_taps)
 }
 
 int
-vioblk_restore(int fd, struct vmop_create_params *vmc, int *child_disks)
+vioblk_restore(int fd, struct vmop_create_params *vmc,
+    int child_disks[][VMM_MAX_BASE_PER_DISK])
 {
        struct vm_create_params *vcp = &vmc->vmc_params;
        uint8_t i;
@@ -2149,7 +2166,8 @@ vioblk_restore(int fd, struct vmop_create_params *vmc, 
int *child_disks)
                        return (-1);
                }
                if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz,
-                   child_disks[i], vmc->vmc_disktypes[i]) == -1)  {
+                   child_disks[i], vmc->vmc_diskbases[i],
+                   vmc->vmc_disktypes[i]) == -1)  {
                        log_warnx("%s: unable to determine disk format",
                            __func__);
                        return (-1);
@@ -2186,7 +2204,7 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int 
child_cdrom)
                return (-1);
        }
 
-       if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, child_cdrom,
+       if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1,
            VMDF_RAW) == -1) {
                log_warnx("%s: unable to determine iso format", __func__);
                return (-1);
@@ -2198,8 +2216,8 @@ vioscsi_restore(int fd, struct vm_create_params *vcp, int 
child_cdrom)
 }
 
 int
-virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int *child_disks,
-    int *child_taps)
+virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom,
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
        struct vmop_create_params *vmc = &vm->vm_params;
        struct vm_create_params *vcp = &vmc->vmc_params;
diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h
index 46006916b6a..91f0e323204 100644
--- usr.sbin/vmd/virtio.h
+++ usr.sbin/vmd/virtio.h
@@ -257,10 +257,11 @@ struct ioinfo {
 };
 
 /* virtio.c */
-void virtio_init(struct vmd_vm *, int, int *, int *);
+void virtio_init(struct vmd_vm *, int, int[][VMM_MAX_BASE_PER_DISK], int *);
 void virtio_shutdown(struct vmd_vm *);
 int virtio_dump(int);
-int virtio_restore(int, struct vmd_vm *, int, int *, int *);
+int virtio_restore(int, struct vmd_vm *, int,
+    int[][VMM_MAX_BASE_PER_DISK], int *);
 uint32_t vring_size(uint32_t);
 
 int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
@@ -270,12 +271,14 @@ void viornd_update_qs(void);
 void viornd_update_qa(void);
 int viornd_notifyq(void);
 
-int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd);
-int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd);
+ssize_t virtio_qcow2_get_base(int, char *, size_t);
+int virtio_init_raw(struct virtio_backing *, off_t *, int*, size_t);
+int virtio_init_qcow2(struct virtio_backing *, off_t *, int*, size_t);
 
 int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
 int vioblk_dump(int);
-int vioblk_restore(int, struct vmop_create_params *, int *);
+int vioblk_restore(int, struct vmop_create_params *,
+    int[][VMM_MAX_BASE_PER_DISK]);
 void vioblk_update_qs(struct vioblk_dev *);
 void vioblk_update_qa(struct vioblk_dev *);
 int vioblk_notifyq(struct vioblk_dev *);
diff --git usr.sbin/vmd/vm.c usr.sbin/vmd/vm.c
index ef4494d918b..7c9aaf27012 100644
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -65,8 +65,8 @@
 
 io_fn_t ioports_map[MAX_PORTS];
 
-int run_vm(int, int *, int *, struct vmop_create_params *,
-    struct vcpu_reg_state *);
+int run_vm(int, int[][VMM_MAX_BASE_PER_DISK], int *,
+    struct vmop_create_params *, struct vcpu_reg_state *);
 void vm_dispatch_vmm(int, short, void *);
 void *event_thread(void *);
 void *vcpu_run_loop(void *);
@@ -75,8 +75,10 @@ int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
 void create_memory_map(struct vm_create_params *);
 int alloc_guest_mem(struct vm_create_params *);
 int vmm_create_vm(struct vm_create_params *);
-void init_emulated_hw(struct vmop_create_params *, int, int *, int *);
-void restore_emulated_hw(struct vm_create_params *, int, int *, int *,int);
+void init_emulated_hw(struct vmop_create_params *, int,
+    int[][VMM_MAX_BASE_PER_DISK], int *);
+void restore_emulated_hw(struct vm_create_params *, int, int *,
+    int[][VMM_MAX_BASE_PER_DISK],int);
 void vcpu_exit_inout(struct vm_run_params *);
 uint8_t vcpu_exit_pci(struct vm_run_params *);
 int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
@@ -327,7 +329,7 @@ start_vm(struct vmd_vm *vm, int fd)
 
                /* Find and open kernel image */
                if ((fp = vmboot_open(vm->vm_kernel,
-                   vm->vm_disks[0], vmc->vmc_disktypes[0], &vmboot)) == NULL)
+                   vm->vm_disks[0][0], vmc->vmc_disktypes[0], &vmboot)) == 
NULL)
                        fatalx("failed to open kernel - exiting");
 
                /* Load kernel image */
@@ -903,7 +905,7 @@ vmm_create_vm(struct vm_create_params *vcp)
  */
 void
 init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
-    int *child_disks, int *child_taps)
+    int child_disks[][VMM_MAX_BASE_PER_DISK], int *child_taps)
 {
        struct vm_create_params *vcp = &vmc->vmc_params;
        int i;
@@ -968,7 +970,7 @@ init_emulated_hw(struct vmop_create_params *vmc, int 
child_cdrom,
  */
 void
 restore_emulated_hw(struct vm_create_params *vcp, int fd,
-    int *child_taps, int *child_disks, int child_cdrom)
+    int *child_taps, int child_disks[][VMM_MAX_BASE_PER_DISK], int child_cdrom)
 {
        /* struct vm_create_params *vcp = &vmc->vmc_params; */
        int i;
@@ -1029,8 +1031,9 @@ restore_emulated_hw(struct vm_create_params *vcp, int fd,
  *  !0 : the VM exited abnormally or failed to start
  */
 int
-run_vm(int child_cdrom, int *child_disks, int *child_taps,
-    struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
+run_vm(int child_cdrom, int child_disks[][VMM_MAX_BASE_PER_DISK],
+    int *child_taps, struct vmop_create_params *vmc,
+    struct vcpu_reg_state *vrs)
 {
        struct vm_create_params *vcp = &vmc->vmc_params;
        struct vm_rwregs_params vregsp;
diff --git usr.sbin/vmd/vmboot.c usr.sbin/vmd/vmboot.c
index 44ceeb64a7e..f23d8d6b25a 100644
--- usr.sbin/vmd/vmboot.c
+++ usr.sbin/vmd/vmboot.c
@@ -414,13 +414,13 @@ vmboot_open(int kernel_fd, int disk_fd, unsigned int 
disk_type,
 
        switch (vmboot->vbp_type) {
        case VMDF_RAW:
-               if (virtio_init_raw(vfp, &sz, disk_fd) == -1) {
+               if (virtio_init_raw(vfp, &sz, &disk_fd, 1) == -1) {
                        log_debug("%s: could not open raw disk", __func__);
                        goto fail;
                }
                break;
        case VMDF_QCOW2:
-               if (virtio_init_qcow2(vfp, &sz, disk_fd) == -1) {
+               if (virtio_init_qcow2(vfp, &sz, &disk_fd, 1) == -1) {
                        log_debug("%s: could not open qcow2 disk", __func__);
                        goto fail;
                }
diff --git usr.sbin/vmd/vmd.c usr.sbin/vmd/vmd.c
index 1571be21bc5..8377812c316 100644
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -1097,7 +1097,7 @@ void
 vm_stop(struct vmd_vm *vm, int keeptty, const char *caller)
 {
        struct privsep  *ps = &env->vmd_ps;
-       unsigned int     i;
+       unsigned int     i, j;
 
        if (vm == NULL)
                return;
@@ -1117,9 +1117,11 @@ vm_stop(struct vmd_vm *vm, int keeptty, const char 
*caller)
                close(vm->vm_iev.ibuf.fd);
        }
        for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++) {
-               if (vm->vm_disks[i] != -1) {
-                       close(vm->vm_disks[i]);
-                       vm->vm_disks[i] = -1;
+               for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+                       if (vm->vm_disks[i][j] != -1) {
+                               close(vm->vm_disks[i][j]);
+                               vm->vm_disks[i][j] = -1;
+                       }
                }
        }
        for (i = 0; i < VMM_MAX_NICS_PER_VM; i++) {
@@ -1176,7 +1178,7 @@ vm_register(struct privsep *ps, struct vmop_create_params 
*vmc,
        struct vmop_owner       *vmo = NULL;
        struct vmd_user         *usr = NULL;
        uint32_t                 rng;
-       unsigned int             i;
+       unsigned int             i, j;
        struct vmd_switch       *sw;
        char                    *s;
 
@@ -1267,7 +1269,8 @@ vm_register(struct privsep *ps, struct vmop_create_params 
*vmc,
        vm->vm_user = usr;
 
        for (i = 0; i < VMM_MAX_DISKS_PER_VM; i++)
-               vm->vm_disks[i] = -1;
+               for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++)
+                       vm->vm_disks[i][j] = -1;
        for (i = 0; i < VMM_MAX_NICS_PER_VM; i++)
                vm->vm_ifs[i].vif_fd = -1;
        for (i = 0; i < vcp->vcp_nnics; i++) {
diff --git usr.sbin/vmd/vmd.h usr.sbin/vmd/vmd.h
index b348d12c757..0683629b3b1 100644
--- usr.sbin/vmd/vmd.h
+++ usr.sbin/vmd/vmd.h
@@ -48,6 +48,7 @@
 #define VM_DEFAULT_DEVICE      "hd0a"
 #define VM_BOOT_CONF           "/etc/boot.conf"
 #define VM_NAME_MAX            64
+#define VM_MAX_BASE_PER_DISK   4
 #define VM_TTYNAME_MAX         16
 #define MAX_TAP                        256
 #define NR_BACKLOG             5
@@ -169,6 +170,7 @@ struct vmop_create_params {
 #define VMIFF_OPTMASK          (VMIFF_LOCKED|VMIFF_LOCAL|VMIFF_RDOMAIN)
 
        unsigned int             vmc_disktypes[VMM_MAX_DISKS_PER_VM];
+       unsigned int             vmc_diskbases[VMM_MAX_DISKS_PER_VM];
 #define VMDF_RAW               0x01
 #define VMDF_QCOW2             0x02
 
@@ -241,7 +243,7 @@ struct vmd_vm {
        uint32_t                 vm_vmid;
        int                      vm_kernel;
        int                      vm_cdrom;
-       int                      vm_disks[VMM_MAX_DISKS_PER_VM];
+       int                      
vm_disks[VMM_MAX_DISKS_PER_VM][VMM_MAX_BASE_PER_DISK];
        struct vmd_if            vm_ifs[VMM_MAX_NICS_PER_VM];
        char                    *vm_ttyname;
        int                      vm_tty;
@@ -415,4 +417,7 @@ int  parse_config(const char *);
 int     cmdline_symset(char *);
 int     host(const char *, struct address *);
 
+/* virtio.c */
+int     virtio_get_base(int, char *, size_t, int);
+
 #endif /* VMD_H */
diff --git usr.sbin/vmd/vmm.c usr.sbin/vmd/vmm.c
index 7757856323f..3176fd85713 100644
--- usr.sbin/vmd/vmm.c
+++ usr.sbin/vmd/vmm.c
@@ -608,7 +608,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
        struct vmd_vm           *vm;
        int                      ret = EINVAL;
        int                      fds[2];
-       size_t                   i;
+       size_t                   i, j;
 
        if ((vm = vm_getbyvmid(imsg->hdr.peerid)) == NULL) {
                log_warnx("%s: can't find vm", __func__);
@@ -643,8 +643,11 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *pid)
                close(fds[1]);
 
                for (i = 0 ; i < vcp->vcp_ndisks; i++) {
-                       close(vm->vm_disks[i]);
-                       vm->vm_disks[i] = -1;
+                       for (j = 0; j < VMM_MAX_BASE_PER_DISK; j++) {
+                               if (vm->vm_disks[i][j] != -1)
+                                       close(vm->vm_disks[i][j]);
+                               vm->vm_disks[i][j] = -1;
+                       }
                }
                for (i = 0 ; i < vcp->vcp_nnics; i++) {
                        close(vm->vm_ifs[i].vif_fd);

-- 
    Ori Bernstein

Reply via email to