On Sun, 12 Aug 2018 22:51:24 -0700, Ori Bernstein <[email protected]> wrote:
> On Sun, 5 Aug 2018 00:52:32 -0700, Ori Bernstein <[email protected]> wrote:
>
> And, now something that actually appears to work. You can create a
> disk on OpenBSD using qemu:
>
>     qemu-img create foo.qc2 16G
>
> add it to your vm.conf:
>
>     disk "/path/to/foo.qc2"
>
> boot and install OpenBSD on it as normal, and if you decide you don't like
> hardware virtualization, you can point qemu at it and run using that:
>
>     qemu-system-x86_64 -m 1024 -hda foo.qc2
>
> Snapshots haven't been tested yet, and tools need to be added, incompatible
> extensions are silently ignored, and there could stand to be a bit more sanity
> checking.
>
> vioscribble.c should also probably be extracted into a regress test, rather
> than just something that sits beside the I/O code.
>
> Patch below:

One more update, with some significant differences:

- External snapshots will work if you comment out the chroot and add rpath
  to the pledges. This is a bad idea, so external snapshots will return a
  clean error until I can figure out a good way to plumb the fds, shuffle
  around the pledges, or do something else to make it possible to open the
  backing files from the vm process.

- Internal snapshots seem to work, but you will need qemu to manage them.

      qemu-img snapshot -c snapname disk.qc2 # create
      qemu-img snapshot -a snapname disk.qc2 # revert

  These have only been tested lightly, so I wouldn't trust them fully yet.

- vioscribble has been turned into a regress test, and grew license
  information.

- A somewhat embarrassing bug, where I malloced the wrong type, was fixed.

--- regress/usr.sbin/vmd/diskfmt/Makefile | 28 ++ regress/usr.sbin/vmd/diskfmt/vioscribble.c | 165 +++++++++ usr.sbin/vmd/Makefile | 2 +- usr.sbin/vmd/vioqcow2.c | 574 +++++++++++++++++++++++++++++ usr.sbin/vmd/virtio.c | 10 +- usr.sbin/vmd/virtio.h | 1 + 6 files changed, 776 insertions(+), 4 deletions(-) create mode 100644 regress/usr.sbin/vmd/diskfmt/Makefile create mode 100644 regress/usr.sbin/vmd/diskfmt/vioscribble.c create mode 100644 usr.sbin/vmd/vioqcow2.c diff --git regress/usr.sbin/vmd/diskfmt/Makefile regress/usr.sbin/vmd/diskfmt/Makefile new file mode 100644 index 00000000000..71bb2b8ce52 --- /dev/null +++ regress/usr.sbin/vmd/diskfmt/Makefile @@ -0,0 +1,28 @@ +# $OpenBSD: Makefile,v 1.5 2018/07/20 22:18:49 bluhm Exp $ + +# This regression test creates a raw disk image and a +# qcow disk image, and scribbles the same data to both +# of them. It verifies that they both have the same +# result. +# +# In order for this test to work, qemu must be installed +# in order to create the disk images. + +VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/ + +PROG=vioscribble +SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c +CFLAGS+=-I$(VMD_DIR) -pthread +LDFLAGS+=-pthread + +run-regress-vioscribble: scribble-images + +scribble-images: + rm -f scribble.raw scribble.qc2 + vmctl create scribble.raw -s 4G + qemu-img create -f qcow2 scribble.qc2 4G + + +.PHONY: ${REGRESS_TARGETS} scribble-images + +.include <bsd.regress.mk> diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c regress/usr.sbin/vmd/diskfmt/vioscribble.c new file mode 100644 index 00000000000..3821c3b277b --- /dev/null +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c @@ -0,0 +1,165 @@ +/* $OpenBSD: $ */ + +/* + * Copyright (c) 2018 Ori Bernstein <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Quick hack of a program to try to test vioqcow2.c against + * vioraw.c. + * + * Compile with: + * + * cc -pthread -o scribble vioscribble.c vioqcow2.c vioraw.c + */ +#include <sys/param.h> /* PAGE_SIZE */ +#include <sys/socket.h> +#include <sys/stat.h> + +#include <machine/vmmvar.h> +#include <dev/pci/pcireg.h> +#include <dev/pci/pcidevs.h> +#include <dev/pv/virtioreg.h> +#include <dev/pv/vioblkreg.h> +#include <dev/pv/vioscsireg.h> + +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/if_ether.h> + +#include <errno.h> +#include <event.h> +#include <poll.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <fcntl.h> +#include <unistd.h> +#include <assert.h> +#include <err.h> + +#include "pci.h" +#include "vmd.h" +#include "vmm.h" +#include "virtio.h" + +#define CLUSTERSZ 65536 + +int verbose; +struct virtio_backing qcowfile; +struct virtio_backing rawfile; + +/* We expect the scribble disks to be 4g in size */ +#define DISKSZ (4ull*1024ull*1024ull*1024ull) + +/* functions that io code depends on */ + +void +log_debug(const char *emsg, ...) +{ + if (verbose) { + va_list ap; + + va_start(ap, emsg); + vfprintf(stdout, emsg, ap); + fprintf(stdout, "\n"); + va_end(ap); + } +} + +void +log_warnx(const char *emsg, ...) +{ + va_list ap; + + va_start(ap, emsg); + vfprintf(stdout, emsg, ap); + fprintf(stdout, "\n"); + va_end(ap); +} + +void +log_warn(const char *emsg, ...) 
+{ + va_list ap; + + va_start(ap, emsg); + vfprintf(stdout, emsg, ap); + fprintf(stdout, "\n"); + va_end(ap); +} + +static void +fill(size_t off, char *buf, size_t len) +{ + size_t i; + + /* use the top bits of off, since we can guess at where we went wrong. */ + for (i = 0; i < len; i++) + buf[i] = (off >> 8); +} + +int +main(int argc, char **argv) +{ + int qcfd, rawfd, i; + char buf[64*1024], cmp[64*1024]; + off_t len, off, qcsz, rawsz; + + verbose = !!getenv("VERBOSE"); + qcfd = open("scribble.qc2", O_RDWR); + rawfd = open("scribble.raw", O_RDWR); + if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1) + err(1, "unable to open qcow"); + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1) + err(1, "unable to open raw"); + + srandom_deterministic(123); + + /* scribble to both disks */ + printf("scribbling...\n"); + for (i = 0; i < 16; i++) { + off = (random() % DISKSZ); + len = random() % sizeof buf + 1; + fill(off, buf, sizeof buf); + if (qcowfile.pwrite(qcowfile.p, buf, len, off) == -1) + printf("iter %d: unable to write at %llx\n", i, off); + rawfile.pwrite(rawfile.p, buf, len, off); + + if (qcowfile.pread(qcowfile.p, buf, len, off) == -1) + printf("unable to read at %llx\n", off); + rawfile.pread(rawfile.p, cmp, len, off); + if (memcmp(buf, cmp, len) != 0) { + printf("iter %d: mismatch at 0x%llx (espected val: %d)\n", + i, off, (char)(off >> 8)); + break; + } + } + + /* validate that both disks match */ + printf("validating...\n"); + for (off = 0; off < DISKSZ; off += sizeof buf) { + if (qcowfile.pread(qcowfile.p, buf, sizeof buf, off) == -1) + printf("unable to read at %llx\n", off); + rawfile.pread(rawfile.p, cmp, sizeof buf, off); + if (memcmp(buf, cmp, sizeof buf) != 0) { + printf("mismatch at 0x%llx (espected val: %d)\n", + off, (char)(off >> 8)); + break; + } + } + return 0; +} diff --git usr.sbin/vmd/Makefile usr.sbin/vmd/Makefile index 24c1d1b1d4a..b6db6c782d6 100644 --- usr.sbin/vmd/Makefile +++ usr.sbin/vmd/Makefile @@ 
-6,7 +6,7 @@ PROG= vmd SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c packet.c -SRCS+= parse.y atomicio.c vioscsi.c vioraw.c +SRCS+= parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c CFLAGS+= -Wall -I${.CURDIR} CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c new file mode 100644 index 00000000000..8c040aa39aa --- /dev/null +++ usr.sbin/vmd/vioqcow2.c @@ -0,0 +1,574 @@ +/* $OpenBSD: $ */ + +/* + * Copyright (c) 2018 Ori Bernstein <[email protected]> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include <sys/types.h> +#include <sys/stat.h> + +#include <machine/vmmvar.h> +#include <dev/pci/pcireg.h> + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <assert.h> +#include <err.h> + +#include "vmd.h" +#include "vmm.h" +#include "virtio.h" + +#define QCOW2_COMPRESSED 0x4000000000000000ull +#define QCOW2_INPLACE 0x8000000000000000ull + +#define QCOW2_DIRTY (1 << 0) +#define QCOW2_CORRUPT (1 << 1) + +enum { + ICFEATURE_DIRTY = 1 << 0, + ICFEATURE_CORRUPT = 1 << 1, +}; + +enum { + ACFEATURE_BITEXT = 1 << 0, +}; + +struct qcheader { + char magic[4]; + uint32_t version; + uint64_t backingoff; + uint32_t backingsz; + uint32_t clustershift; + uint64_t disksz; + uint32_t cryptmethod; + uint32_t l1sz; + uint64_t l1off; + uint64_t refoff; + uint32_t refsz; + uint32_t snapcount; + uint64_t snapsz; + /* v3 additions */ + uint64_t incompatfeatures; + uint64_t compatfeatures; + uint64_t autoclearfeatures; + uint32_t reforder; /* bits = 1 << reforder */ + uint32_t headersz; +} __packed; + +struct qcdisk { + pthread_rwlock_t lock; + struct qcdisk *base; + struct qcheader header; + + int fd; + uint64_t *l1; + char *scratch; + off_t end; + uint32_t clustersz; + off_t disksz; /* in bytes */ + uint32_t cryptmethod; + + uint32_t l1sz; + off_t l1off; + + off_t refoff; + uint32_t refsz; + + uint32_t nsnap; + off_t snapoff; + + /* v3 features */ + uint64_t incompatfeatures; + uint64_t autoclearfeatures; + uint32_t refssz; + uint32_t headersz; +}; + +extern char *__progname; + +static off_t xlate(struct qcdisk *, off_t, int *); +static int copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t); +static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t); +static int inc_refs(struct qcdisk *, off_t, int); +static int qc2_openpath(struct qcdisk *, char *, int); +static int qc2_open(struct qcdisk *, int); +static ssize_t qc2_pread(void *, char *, size_t, off_t); +static ssize_t qc2_pwrite(void *, char *, size_t, 
off_t); +static void qc2_close(void *); + +/* + * Initializes a qcow2 disk image backing file from an fd. + * Stores the number of 512 byte sectors in *szp, + * returning -1 for error, 0 for success. + * + * May open snapshot base images. + */ +int +virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd) +{ + struct qcdisk *diskp; + + diskp = malloc(sizeof(struct qcdisk)); + if (diskp == NULL) + return -1; + if (qc2_open(diskp, fd) == -1) { + free(diskp); + return -1; + } + file->p = diskp; + file->pread = qc2_pread; + file->pwrite = qc2_pwrite; + file->close = qc2_close; + *szp = diskp->disksz / 512; + return 0; +} + +static int +qc2_openpath(struct qcdisk *disk, char *path, int flags) +{ + int fd; + + fd = open(path, flags); + if (fd < 0) + return -1; + return qc2_open(disk, fd); +} + +static int +qc2_open(struct qcdisk *disk, int fd) +{ + char basepath[PATH_MAX]; + struct stat st; + struct qcheader header; + uint64_t backingoff; + uint32_t backingsz; + size_t i; + int version; + + if (pread(fd, &header, sizeof header, 0) != sizeof header) + return -1; + if (strncmp(header.magic, "QFI\xfb", 4) != 0) + return -1; + pthread_rwlock_init(&disk->lock, NULL); + disk->fd = fd; + disk->base = NULL; + + disk->clustersz = (1ull << be32toh(header.clustershift)); + disk->disksz = be64toh(header.disksz); + disk->cryptmethod = be32toh(header.cryptmethod); + disk->l1sz = be32toh(header.l1sz); + disk->l1off = be64toh(header.l1off); + disk->refsz = be32toh(header.refsz); + disk->refoff = be64toh(header.refoff); + disk->nsnap = be32toh(header.snapcount); + disk->snapoff = be64toh(header.snapsz); + /* + * The additional features here are defined as 0 in the v2 format, + * so as long as we clear the buffer before parsing, we don't need + * to check versions here. 
+ */ + disk->incompatfeatures = be64toh(header.incompatfeatures); + disk->autoclearfeatures = be64toh(header.autoclearfeatures); + disk->refssz = be32toh(header.refsz); + disk->headersz = be32toh(header.headersz); + + /* + * We only know about the dirty or corrupt bits here. + */ + if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) { + log_warn("%s: unsupported features %llx", __progname, + disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)); + return -1; + } + + disk->l1 = calloc(disk->l1sz, sizeof *disk->l1); + if (pread(disk->fd, (char*)disk->l1, 8*disk->l1sz, disk->l1off) + != 8*disk->l1sz) { + free(disk->l1); + return -1; + } + for (i = 0; i < disk->l1sz; i++) + disk->l1[i] = be64toh(disk->l1[i]); + version = be32toh(header.version); + if (version != 2 && version != 3) { + log_warn("%s: unknown qcow2 version %d", __progname, version); + return -1; + } + + backingoff = be64toh(header.backingoff); + backingsz = be32toh(header.backingsz); + if (backingsz != 0) { + /* + * FIXME: we need to figure out a way of opening these things, otherwise + * we just crash with a pledge violation. 
+ */ + log_warn("unsupported external snapshot images"); + return -1; + + if (backingsz >= sizeof basepath - 1) { + log_warn("%s: snapshot path too long", __progname); + return -1; + } + if (pread(fd, basepath, backingsz, backingoff) != backingsz) { + log_warn("%s: could not read snapshot base name", __progname); + return -1; + } + basepath[backingsz] = 0; + + disk->base = calloc(1, sizeof(struct qcdisk)); + if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) { + free(disk->base); + return -1; + } + if (disk->base->clustersz != disk->clustersz) { + log_warn("%s: all disks must share clustersize", __progname); + free(disk->base); + return -1; + } + } + fstat(fd, &st); + disk->end = st.st_size; + return 0; +} + +static ssize_t +qc2_pread(void *p, char *buf, size_t len, off_t off) +{ + struct qcdisk *disk, *d; + off_t phys_off, end, cluster_off; + ssize_t sz, rem; + + disk = p; + end = off + len; + if (off < 0 || end > disk->disksz) + return -1; + + /* handle head chunk separately */ + rem = len; + while (off != end) { + for (d = disk; d; d = d->base) + if ((phys_off = xlate(d, off, NULL)) > 0) + break; + /* Break out into chunks. This handles + * three cases: + * + * |----+====|========|====+ | + * + * Either we are at the start of the read, + * and the cluster has some leading bytes. + * This means that we are reading the tail + * of the cluster, and our size is: + * + * clustersz - (off % clustersz). + * + * Otherwise, we're reading the middle section. + * We're already aligned here, so we can just + * read the whole cluster size. Or we're at the + * tail, at which point we just want to read the + * remaining bytes. + */ + cluster_off = off % disk->clustersz; + sz = disk->clustersz - cluster_off; + if (sz > rem) + sz = rem; + /* + * If we're within the disk, but don't have backing bytes, + * just read back zeros. 
+ */ + if (!d) + bzero(buf, sz); + else if (pread(d->fd, buf, sz, phys_off) != sz) + return -1; + off += sz; + buf += sz; + rem -= sz; + } + return len; +} + +ssize_t +qc2_pwrite(void *p, char *buf, size_t len, off_t off) +{ + struct qcdisk *disk, *d; + off_t phys_off, cluster_off, end; + ssize_t sz, rem; + int inplace; + + d = p; + disk = p; + inplace = 1; + end = off + len; + if (off < 0 || end > disk->disksz) + return -1; + rem = len; + while (off != end) { + /* See the read code for a summary of the computation */ + cluster_off = off % disk->clustersz; + sz = disk->clustersz - cluster_off; + if (sz > rem) + sz = rem; + + phys_off = xlate(disk, off, &inplace); + if (phys_off == -1) + return -1; + /* + * If we couldn't find the cluster in the writable disk, + * see if it exists in the base image. If it does, we + * need to copy it before the write. The copy happens + * in the '!inplace' if clause below te search. + */ + if (phys_off == 0) + for (d = disk->base; d; d = d->base) + if ((phys_off = xlate(d, off, NULL)) > 0) + break; + if (!inplace) + phys_off = mkcluster(disk, d, off, phys_off); + if (phys_off == -1) + return -1; + log_debug("writing at %llx", phys_off); + if (pwrite(disk->fd, buf, sz, phys_off) != sz) + return -1; + off += sz; + buf += sz; + rem -= sz; + } + return len; +} + +static void +qc2_close(void *p) +{ + struct qcdisk *disk; + + disk = p; + pwrite(disk->fd, disk->l1, disk->l1sz, disk->l1off); + close(disk->fd); + free(disk); +} + +/* + * Translates a virtual offset into an on-disk offset. 
+ * Returns: + * -1 on error + * 0 on 'not found' + * >0 on found + */ +static off_t +xlate(struct qcdisk *disk, off_t off, int *inplace) +{ + off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff; + uint64_t buf; + + + pthread_rwlock_rdlock(&disk->lock); + log_debug("xlating offset %llx", off); + if (off < 0) + goto err; + + l2sz = disk->clustersz / 8; + l1off = (off / disk->clustersz) / l2sz; + if (l1off >= disk->l1sz) + goto err; + + log_debug("read l1off: %llx", l1off); + l2tab = disk->l1[l1off]; + l2tab &= ~QCOW2_INPLACE; + if (l2tab == 0) { + pthread_rwlock_unlock(&disk->lock); + return 0; + } + l2off = (off / disk->clustersz) % l2sz; + log_debug("read l2off: %llx", l2off); + pread(disk->fd, &buf, sizeof(buf), l2tab + l2off*8); + cluster = be64toh(buf); + /* + * cluster may be 0, but all future operations don't affect + * the return value. + */ + if (inplace) + *inplace = !!(cluster & QCOW2_INPLACE); + if (cluster & QCOW2_COMPRESSED) { + log_warn("%s: compressed clusters unsupported", __progname); + goto err; + } + pthread_rwlock_unlock(&disk->lock); + clusteroff = 0; + cluster &= ~QCOW2_INPLACE; + if (cluster) + clusteroff = off % disk->clustersz; + log_debug("cluster: %llx, clusteroff: %llx", cluster, clusteroff); + return cluster + clusteroff; +err: + pthread_rwlock_unlock(&disk->lock); + return -1; +} + +/* + * Allocates a new cluster on disk, creating a new L2 table + * if needed. The cluster starts off with a refs of one, + * and the writable bit set. + * + * Returns -1 on error, and the physical address within the + * cluster of the write offset if it exists. 
+ */ +static off_t +mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) +{ + off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig; + uint64_t buf; + int fd; + + pthread_rwlock_wrlock(&disk->lock); + + cluster = -1; + fd = disk->fd; + /* L1 entries always exist */ + l2sz = disk->clustersz / 8; + l1off = off / (disk->clustersz * l2sz); + if (l1off >= disk->l1sz) + goto fail; + + /* + * Align disk to cluster size, for ftruncate: Not strictly + * required, but it easier to eyeball buggy write offsets, + * and helps performance a bit. + */ + disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1); + + l2tab = disk->l1[l1off]; + l2off = (off / disk->clustersz) % l2sz; + log_debug("l2tab: %llx, l2off: %llx, inplace: %d", + l2tab, l2off & ~QCOW2_INPLACE, (l2tab & QCOW2_INPLACE) != 0); + /* We may need to create or clone an L2 entry to map the block */ + if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) { + log_debug("creating l2 table"); + orig = l2tab & ~QCOW2_INPLACE; + l2tab = disk->end; + disk->end += disk->clustersz; + if (ftruncate(disk->fd, disk->end) == -1) { + perror("ftruncate"); + goto fail; + } + + /* + * If we translated, found a L2 entry, but it needed to + * be copied, copy it. 
+ */ + if (orig != 0 && copy_cluster(disk, disk, l2tab, orig) == -1) { + perror("move cluster"); + goto fail; + } + /* Update l1 -- we flush it later */ + disk->l1[l1off] = l2tab | QCOW2_INPLACE; + if (inc_refs(disk, l2tab, 1) == -1) { + perror("refs"); + goto fail; + } + } + l2tab &= ~QCOW2_INPLACE; + + /* Grow the disk */ + if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0) + goto fail; + if (src_phys > 0 && copy_cluster(disk, base, disk->end, src_phys) == -1) + return -1; + cluster = disk->end; + disk->end += disk->clustersz; + buf = htobe64(cluster | QCOW2_INPLACE); + if (pwrite(disk->fd, &buf, sizeof buf, l2tab + l2off*8) != sizeof(buf)) + goto fail; + + /* TODO: lazily sync: currently VMD doesn't close things */ + buf = htobe64(disk->l1[l1off]); + if (pwrite(disk->fd, &buf, sizeof buf, disk->l1off + 8*l1off) != 8) + goto fail; + if (inc_refs(disk, cluster, 1) == -1) + goto fail; + + pthread_rwlock_unlock(&disk->lock); + clusteroff = off % disk->clustersz; + return cluster + clusteroff; + +fail: + pthread_rwlock_unlock(&disk->lock); + return -1; +} + +static int +copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src) +{ + char *scratch; + + scratch = alloca(disk->clustersz); + if (!scratch) + err(1, "out of memory"); + src &= ~(disk->clustersz - 1); + dst &= ~(disk->clustersz - 1); + if (pread(base->fd, scratch, disk->clustersz, src) == -1) + return -1; + if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1) + return -1; + return 0; +} + +static int +inc_refs(struct qcdisk *disk, off_t off, int newcluster) +{ + off_t l1off, l1idx, l2idx, l2cluster; + size_t nper; + uint16_t refs; + uint64_t buf; + + off &= ~QCOW2_INPLACE; + nper = disk->clustersz / 2; + log_debug("incrementing refs for %llx (cluster %llx, nper=%zd)", off, off / disk->clustersz, nper); + l1idx = (off / disk->clustersz) / nper; + l2idx = (off / disk->clustersz) % nper; + l1off = disk->refoff + 8*l1idx; + if (pread(disk->fd, &buf, sizeof buf, l1off) != 8) + 
return -1; + + l2cluster = be64toh(buf); + if (l2cluster == 0) { + l2cluster = disk->end; + disk->end += disk->clustersz; + if (ftruncate(disk->fd, disk->end) < 0) { + log_debug("refs block grow fail "); + return -1; + } + buf = htobe64(l2cluster); + if (pwrite(disk->fd, &buf, sizeof buf, l1off) != 8) { + return -1; + } + } + + refs = 1; + if (!newcluster) { + if (pread(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) + return -1; + refs = be16toh(refs) + 1; + } + refs = htobe16(refs); + log_debug("writing refs %d for offset %llx (l1: %llx, l2: %llx) to %llx + %llx", + refs, off, l1idx, l2idx, l2cluster, 2*l2idx); + if (pwrite(disk->fd, &refs, sizeof refs, l2cluster + 2*l2idx) != 2) { + log_debug("could not write ref block"); + } + return 0; +} + diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c index 099f1df6a7e..2177d8434f6 100644 --- usr.sbin/vmd/virtio.c +++ usr.sbin/vmd/virtio.c @@ -1749,10 +1749,14 @@ static int virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd) { /* - * This is where we slot in disk type selection. - * Right now, there's only raw. + * probe disk types in order of preference, first one to work wins. + * TODO: provide a way of specifying the type and options. */ - return virtio_init_raw(file, sz, fd); + if (virtio_init_qcow2(file, sz, fd) == 0) + return 0; + if (virtio_init_raw(file, sz, fd) == 0) + return 0; + return -1; } void diff --git usr.sbin/vmd/virtio.h usr.sbin/vmd/virtio.h index 1f0c91d2577..a513d541f9b 100644 --- usr.sbin/vmd/virtio.h +++ usr.sbin/vmd/virtio.h @@ -271,6 +271,7 @@ void viornd_update_qa(void); int viornd_notifyq(void); int virtio_init_raw(struct virtio_backing *dev, off_t *sz, int fd); +int virtio_init_qcow2(struct virtio_backing *dev, off_t *sz, int fd); int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int vioblk_dump(int); -- 2.16.4 -- Ori Bernstein
