On Sun, 5 Aug 2018 00:52:32 -0700, Ori Bernstein <[email protected]> wrote:

> This change introduces a 'struct virtio_backing' which makes the
> disk i/o pluggable, providing 'backing->{pread,pwrite}' calls that
> can be replaced by different disk i/o drivers.
> 
> This is necessary preparation for adding qcow2 support, which will
> come as a follow up patch. I'll be posting a preview of it in a follow
> up email.

And now, something that actually appears to work. You can create a
disk on OpenBSD using qemu:

        qemu-img create -f qcow2 foo.qc2 16G

add it to your vm.conf:

        disk "/path/to/foo.qc2"

boot and install OpenBSD on it as normal, and if you decide you don't like
hardware virtualization, you can point qemu at it and run using that:

        qemu-system-x86_64 -m 1024 -hda foo.qc2 

Snapshots haven't been tested yet, tools still need to be added, incompatible
extensions are silently ignored, and the code could stand a bit more sanity
checking.

vioscribble.c should also probably be extracted into a regress test, rather
than just something that sits beside the I/O code.

Patch below:

---
 usr.sbin/vmd/Makefile      |   2 +-
 usr.sbin/vmd/vioqcow2.c    | 624 +++++++++++++++++++++++++++++++++++++
 usr.sbin/vmd/vioqcow2.h    |   6 +
 usr.sbin/vmd/vioraw.c      |  17 +
 usr.sbin/vmd/vioscribble.c | 143 +++++++++
 usr.sbin/vmd/virtio.c      |  10 +-
 6 files changed, 798 insertions(+), 4 deletions(-)
 create mode 100644 usr.sbin/vmd/vioqcow2.c
 create mode 100644 usr.sbin/vmd/vioqcow2.h
 create mode 100644 usr.sbin/vmd/vioscribble.c

diff --git usr.sbin/vmd/Makefile usr.sbin/vmd/Makefile
index 24c1d1b1d4a..b6db6c782d6 100644
--- usr.sbin/vmd/Makefile
+++ usr.sbin/vmd/Makefile
@@ -6,7 +6,7 @@ PROG=           vmd
 SRCS=          vmd.c control.c log.c priv.c proc.c config.c vmm.c
 SRCS+=         vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
 SRCS+=         ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c packet.c
-SRCS+=         parse.y atomicio.c vioscsi.c vioraw.c
+SRCS+=         parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c
 
 CFLAGS+=       -Wall -I${.CURDIR}
 CFLAGS+=       -Wstrict-prototypes -Wmissing-prototypes
diff --git usr.sbin/vmd/vioqcow2.c usr.sbin/vmd/vioqcow2.c
new file mode 100644
index 00000000000..fbc2e245495
--- /dev/null
+++ usr.sbin/vmd/vioqcow2.c
@@ -0,0 +1,624 @@
+/*     $OpenBSD: $     */
+
+/*
+ * Copyright (c) 2018 Ori Bernstein <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h> /* PAGE_SIZE */
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include <machine/vmmvar.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcidevs.h>
+#include <dev/pv/virtioreg.h>
+#include <dev/pv/vioblkreg.h>
+#include <dev/pv/vioscsireg.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+
+#include <errno.h>
+#include <event.h>
+#include <poll.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <err.h>
+
+#include "pci.h"
+#include "vmd.h"
+#include "vmm.h"
+#include "virtio.h"
+
+/*
+ * Flag bits stored in the top of L1/L2 table entries.
+ *
+ * xlate() strips QCOW2_INPLACE from an entry before using it as a file
+ * offset and refuses L2 entries that carry QCOW2_COMPRESSED. An entry
+ * without QCOW2_INPLACE is never written through: qc2_pwrite() and
+ * mkcluster() allocate a fresh cluster (copying the old one) instead.
+ */
+#define QCOW2_COMPRESSED       0x4000000000000000ull
+#define QCOW2_INPLACE          0x8000000000000000ull
+
+/*
+ * Bits of the header's incompatible-features word (presumably following
+ * the qcow2 specification -- TODO confirm). Not referenced by the code in
+ * this file yet.
+ */
+enum {
+       ICFEATURE_DIRTY         = 1 << 0,
+       ICFEATURE_CORRUPT       = 1 << 1,
+};
+
+/*
+ * Bits of the header's autoclear-features word (presumably following the
+ * qcow2 specification -- TODO confirm). Not referenced by the code in this
+ * file yet.
+ */
+enum {
+       ACFEATURE_BITEXT        = 1 << 0,
+};
+
+/*
+ * In-memory state of one open qcow2 image. Apart from the bookkeeping
+ * members at the top, the fields mirror the big-endian on-disk header in
+ * the order qc2_open() parses it.
+ */
+struct qcdisk {
+       /* Taken for reading by xlate() and for writing by mkcluster(). */
+       pthread_rwlock_t lock;
+       /* Backing image named by the header, or NULL if there is none. */
+       struct qcdisk *base;
+       /* Descriptor of the image file itself. */
+       int fd;
+
+       /* L1 table, kept as raw big-endian 8-byte entries. */
+       char *l1;
+       /* One-cluster bounce buffer used by move_cluster(). */
+       char *scratch;
+       /* Current end of the image file; new clusters are appended here. */
+       off_t end;
+
+       /* Format version; qc2_open() accepts 2 and 3. */
+       uint32_t version;
+
+       /* File offset and byte length of the backing file path (0 = none). */
+       uint64_t backingoff;
+       uint32_t backingsz;
+
+       /* log2 of the cluster size, and the cluster size derived from it. */
+       uint32_t clustershift;
+       uint32_t clustersz;
+       off_t    sz; /* in bytes */
+       /* Read from the header; not otherwise used in this file. */
+       uint32_t cryptmethod;
+
+       /* Number of 8-byte L1 entries and file offset of the L1 table. */
+       uint32_t l1size;
+       off_t    l1off;
+
+       /* File offset of the refcount table indexed by inc_refs(). */
+       off_t    refoff;
+       /* Read from the header; not otherwise used in this file. */
+       uint32_t refsize;
+
+       /* Snapshot count and table offset; read but not otherwise used. */
+       uint32_t nsnap;
+       off_t    snapoff;
+
+       /* v3 features */
+       uint64_t incompatfeatures;
+       uint64_t autoclearfeatures;
+       uint32_t refssz;
+       uint32_t headersz;
+};
+
+extern char *__progname;
+
+static int move_cluster(struct qcdisk *, off_t, off_t);
+static off_t xlate(struct qcdisk *, off_t, int *);
+static off_t mkcluster(struct qcdisk *, off_t, off_t);
+static int inc_refs(struct qcdisk *, off_t, int);
+static uint32_t getbe32(char **, char *);
+static uint64_t getbe64(char **, char *);
+static uint16_t unpackbe16(char *);
+static uint32_t unpackbe32(char *);
+static uint64_t unpackbe64(char *);
+static void packbe16(char *p, uint16_t v);
+//static void packbe32(char *p, uint32_t v);
+static void packbe64(char *, uint64_t);
+static int qc2_openpath(struct qcdisk *, char *, int);
+static int qc2_open(struct qcdisk *, int);
+static ssize_t qc2_pread(void *, char *, size_t, off_t);
+static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
+static void qc2_close(void *);
+
+/*
+ * Sets up 'file' as a qcow2-backed disk on top of the open descriptor 'fd'.
+ *
+ * On success the pread, pwrite and close hooks of 'file' point at the qcow2
+ * implementation, '*szp' holds the virtual disk size in units of 512 bytes,
+ * and 0 is returned. Returns -1 if memory cannot be allocated or qc2_open()
+ * rejects the image; 'fd' is not closed in that case, so the caller may
+ * still probe it as some other format.
+ */
+int
+virtio_init_qcow2(struct virtio_backing *file, off_t *szp, int fd)
+{
+       struct qcdisk *diskp;
+
+       /*
+        * Size the allocation from the object, not from a pointer type:
+        * qc2_open() writes every member of the structure.
+        */
+       diskp = calloc(1, sizeof(*diskp));
+       if (diskp == NULL)
+               return -1;
+       if (qc2_open(diskp, fd) == -1) {
+               free(diskp);
+               return -1;
+       }
+       file->p = diskp;
+       file->pread = qc2_pread;
+       file->pwrite = qc2_pwrite;
+       file->close = qc2_close;
+       *szp = diskp->sz / 512;
+       return 0;
+}
+
+/*
+ * Opens the image at 'path' with the open(2) flags 'flags' and loads it
+ * into 'disk' via qc2_open().
+ *
+ * Returns 0 on success and -1 if the file cannot be opened or is not a
+ * usable qcow2 image.
+ */
+static int
+qc2_openpath(struct qcdisk *disk, char *path, int flags)
+{
+       int fd;
+
+       fd = open(path, flags);
+       if (fd < 0)
+               return -1;
+       if (qc2_open(disk, fd) == -1) {
+               /* Nobody else knows about this descriptor; don't leak it. */
+               close(fd);
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Parses the qcow2 header of the image open on 'fd' and fills in 'disk':
+ * the header fields, the in-memory L1 table, the one-cluster scratch
+ * buffer, the current end of file and, if the header names a backing
+ * file, a recursively opened read-only base image.
+ *
+ * Returns 0 on success. Returns -1 if the file is not a supported qcow2
+ * image or a resource cannot be set up; everything acquired here is then
+ * released again, but 'fd' itself is left open for the caller.
+ */
+static int
+qc2_open(struct qcdisk *disk, int fd)
+{
+       char *p, *end, basepath[PATH_MAX];
+       struct stat st;
+       size_t l1bytes;
+       char buf[
+               4 + /* magic */
+               4 + /* version */
+               8 + /* backing offset */
+               4 + /* backing size */
+               4 + /* cluster size */
+               8 + /* size */
+               4 + /* crypt method */
+               4 + /* l1 size */
+               8 + /* l1 offset */
+               8 + /* refs offset */
+               4 + /* refs size */
+               4 + /* snapshot count */
+               8 + /* snapshot offset */
+               /* v3 additions */
+               8 + /* incompatfeatures */
+               8 + /* compatfeatures */
+               8 + /* autoclearfeatures */
+               4 + /* refssz */
+               4   /* headersz */
+       ];
+
+       /* A version 2 header, the shortest we accept, is 72 bytes long. */
+       memset(buf, 0, sizeof buf);
+       if (pread(fd, buf, sizeof buf, 0) < 72)
+               return -1;
+       if (memcmp(buf, "QFI\xfb", 4) != 0)
+               return -1;
+
+       /* Nothing is owned yet; lets the fail path free unconditionally. */
+       disk->fd = fd;
+       disk->base = NULL;
+       disk->l1 = NULL;
+       disk->scratch = NULL;
+
+       p = buf + 4;
+       end = buf + sizeof buf;
+       disk->version = getbe32(&p, end);
+       disk->backingoff = getbe64(&p, end);
+       disk->backingsz = getbe32(&p, end);
+       disk->clustershift = getbe32(&p, end);
+       disk->sz = getbe64(&p, end);
+       disk->cryptmethod = getbe32(&p, end);
+       disk->l1size = getbe32(&p, end);
+       disk->l1off = getbe64(&p, end);
+       disk->refoff = getbe64(&p, end);
+       disk->refsize = getbe32(&p, end);
+       disk->nsnap = getbe32(&p, end);
+       disk->snapoff = getbe64(&p, end);
+
+       /* Reject unknown versions before trusting any other field. */
+       if (disk->version != 2 && disk->version != 3) {
+               log_warn("%s: unknown qcow2 version %u",
+                   __progname, disk->version);
+               goto fail;
+       }
+
+       /*
+        * The feature words, refcount order and header length only exist
+        * in version 3 headers. A version 2 header ends after the snapshot
+        * offset, so whatever follows it in the file is not ours to parse
+        * and the fields are simply left at 0.
+        */
+       disk->incompatfeatures = 0;
+       disk->autoclearfeatures = 0;
+       disk->refssz = 0;
+       disk->headersz = 0;
+       if (disk->version == 3) {
+               disk->incompatfeatures = getbe64(&p, end);
+               /* Compatible features sit in between; safe to ignore. */
+               (void)getbe64(&p, end);
+               disk->autoclearfeatures = getbe64(&p, end);
+               disk->refssz = getbe32(&p, end);
+               disk->headersz = getbe32(&p, end);
+       }
+
+       /*
+        * clustersz is a 32-bit quantity and the format requires at least
+        * 512-byte clusters; anything else would make the shift undefined
+        * or the table arithmetic meaningless.
+        */
+       if (disk->clustershift < 9 || disk->clustershift > 30) {
+               log_warn("%s: unsupported cluster shift %u",
+                   __progname, disk->clustershift);
+               goto fail;
+       }
+       disk->clustersz = (1ull << disk->clustershift);
+       if (disk->sz < 0) {
+               log_warn("%s: invalid disk size", __progname);
+               goto fail;
+       }
+
+       /* Each L1 entry is 8 bytes; calloc() checks the multiplication. */
+       disk->l1 = calloc(disk->l1size, 8);
+       if (disk->l1 == NULL)
+               goto fail;
+       l1bytes = (size_t)disk->l1size * 8;
+       if (pread(disk->fd, disk->l1, l1bytes, disk->l1off) !=
+           (ssize_t)l1bytes)
+               goto fail;
+
+       if (disk->backingsz != 0) {
+               if (disk->backingsz >= sizeof basepath - 1) {
+                       log_warn("%s: snapshot path too long", __progname);
+                       goto fail;
+               }
+               if (pread(fd, basepath, disk->backingsz, disk->backingoff) !=
+                   (ssize_t)disk->backingsz) {
+                       log_warn("%s: could not read snapshot base name",
+                           __progname);
+                       goto fail;
+               }
+               basepath[disk->backingsz] = 0;
+
+               disk->base = calloc(1, sizeof(*disk->base));
+               if (disk->base == NULL)
+                       goto fail;
+               if (qc2_openpath(disk->base, basepath, O_RDONLY) == -1) {
+                       /* Never opened, so there is nothing to close. */
+                       free(disk->base);
+                       disk->base = NULL;
+                       goto fail;
+               }
+               if (disk->base->clustersz != disk->clustersz) {
+                       log_warn("%s: all disks must share clustersize",
+                           __progname);
+                       goto fail;
+               }
+       }
+
+       disk->scratch = malloc(disk->clustersz);
+       if (disk->scratch == NULL)
+               goto fail;
+       if (fstat(fd, &st) == -1)
+               goto fail;
+       disk->end = st.st_size;
+
+       /* Last, so that no failure path has a lock to tear down. */
+       if (pthread_rwlock_init(&disk->lock, NULL) != 0)
+               goto fail;
+       return 0;
+
+fail:
+       /* A fully opened base owns a descriptor of its own. */
+       if (disk->base != NULL)
+               qc2_close(disk->base);
+       free(disk->scratch);
+       free(disk->l1);
+       disk->base = NULL;
+       disk->scratch = NULL;
+       disk->l1 = NULL;
+       return -1;
+}
+
+/*
+ * Reads 'len' bytes starting at virtual disk offset 'off' into 'buf'.
+ * 'p' is the struct qcdisk installed by virtio_init_qcow2().
+ *
+ * Each cluster is looked up in the image itself first and then in every
+ * backing image in turn; a cluster that no layer has allocated reads back
+ * as zeros.
+ *
+ * Returns 'len' on success, or -1 if the range lies outside the disk or a
+ * read from an image file fails or comes up short.
+ */
+static ssize_t
+qc2_pread(void *p, char *buf, size_t len, off_t off)
+{
+       struct qcdisk *disk, *d;
+       off_t phys_off, end, cluster_off;
+       ssize_t sz, rem;
+
+       disk = p;
+       end = off + len;
+       if (off < 0 || end > disk->sz)
+               return -1;
+
+       rem = len;
+       while (off != end) {
+               /*
+                * Walk down the backing chain and stop at the first layer
+                * that maps this offset. A layer that reports an error is
+                * treated like one with nothing mapped, as before.
+                */
+               phys_off = 0;
+               for (d = disk; d != NULL; d = d->base)
+                       if ((phys_off = xlate(d, off, NULL)) > 0)
+                               break;
+               /* Break out into chunks. This handles
+                * three cases:
+                *
+                *    |----+====|========|====+    |
+                *
+                * Either we are at the start of the read,
+                * and the cluster has some leading bytes.
+                * This means that we are reading the tail
+                * of the cluster, and our size is:
+                *
+                *      clustersz - (off % clustersz).
+                *
+                * Otherwise, we're reading the middle section.
+                * We're already aligned here, so we can just
+                * read the whole cluster size. Or we're at the
+                * tail, at which point we just want to read the
+                * remaining bytes.
+                */
+               cluster_off = off % disk->clustersz;
+               sz = disk->clustersz - cluster_off;
+               if (sz > rem)
+                       sz = rem;
+               /*
+                * If we're within the disk, but don't have backing bytes,
+                * just read back zeros.
+                */
+               if (!d)
+                       bzero(buf, sz);
+               else if (pread(d->fd, buf, sz, phys_off) != sz)
+                       return -1;
+               off += sz;
+               buf += sz;
+               rem -= sz;
+       }
+       return len;
+}
+
+/*
+ * Writes 'len' bytes from 'buf' at virtual disk offset 'off'.
+ * 'p' is the struct qcdisk installed by virtio_init_qcow2().
+ *
+ * The write is split at cluster boundaries. A cluster that is not yet
+ * allocated, or that is not marked QCOW2_INPLACE, gets a freshly
+ * allocated cluster from mkcluster() before the data is written.
+ *
+ * Returns 'len' on success, or -1 if the range lies outside the disk, a
+ * translation or allocation fails, or a write comes up short.
+ *
+ * NOTE(review): only this image's own tables are consulted, so a partial
+ * write to a cluster that exists only in a backing image does not appear
+ * to copy the backing data up first -- confirm before relying on
+ * snapshots.
+ */
+static ssize_t
+qc2_pwrite(void *p, char *buf, size_t len, off_t off)
+{
+       struct qcdisk *disk;
+       off_t phys_off, cluster_off, end;
+       ssize_t sz, rem;
+       int inplace;
+
+       disk = p;
+       inplace = 1;
+       end = off + len;
+       if (off < 0 || end > disk->sz)
+               return -1;
+       rem = len;
+       while (off != end) {
+               /* Same chunking as in qc2_pread(): never cross a cluster. */
+               cluster_off = off % disk->clustersz;
+               sz = disk->clustersz - cluster_off;
+               if (sz > rem)
+                       sz = rem;
+
+               phys_off = xlate(disk, off, &inplace);
+               if (phys_off == -1)
+                       return -1;
+               if (phys_off == 0 || !inplace)
+                       phys_off = mkcluster(disk, off, phys_off);
+               if (phys_off == -1)
+                       return -1;
+               log_debug("writing at %llx", phys_off);
+               if (pwrite(disk->fd, buf, sz, phys_off) != sz)
+                       return -1;
+               off += sz;
+               buf += sz;
+               rem -= sz;
+       }
+       return len;
+}
+
+/*
+ * Flushes the in-memory L1 table of the image 'p' and releases the image
+ * together with every backing image chained below it: descriptors, L1
+ * tables, scratch buffers, locks and the structures themselves.
+ */
+static void
+qc2_close(void *p)
+{
+       struct qcdisk *disk, *next;
+       size_t l1bytes;
+
+       disk = p;
+       /*
+        * The table holds l1size entries of 8 bytes each. mkcluster()
+        * already writes every changed entry through, so a failure here
+        * (for instance on a read-only descriptor) loses nothing.
+        */
+       l1bytes = (size_t)disk->l1size * 8;
+       if (pwrite(disk->fd, disk->l1, l1bytes, disk->l1off) !=
+           (ssize_t)l1bytes)
+               log_debug("%s: could not flush l1 table", __func__);
+
+       for (; disk != NULL; disk = next) {
+               next = disk->base;
+               close(disk->fd);
+               free(disk->l1);
+               free(disk->scratch);
+               pthread_rwlock_destroy(&disk->lock);
+               free(disk);
+       }
+}
+
+/*
+ * Translates a virtual offset into an on-disk offset, holding the disk's
+ * lock for reading while the tables are consulted.
+ *
+ * If 'inplace' is not NULL and an L2 entry was read, '*inplace' is set to
+ * whether that entry carries QCOW2_INPLACE; it is left untouched when the
+ * lookup stops earlier.
+ *
+ * Returns:
+ *     -1 on error (negative or out-of-range offset, unreadable L2 entry,
+ *        compressed cluster)
+ *      0 on 'not found'
+ *     >0 on found
+ */
+static off_t
+xlate(struct qcdisk *disk, off_t off, int *inplace)
+{
+       off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, ret;
+       char buf[8];
+
+       ret = -1;
+       pthread_rwlock_rdlock(&disk->lock);
+       log_debug("xlating offset %llx", off);
+       if (off < 0)
+               goto done;
+
+       /* An L2 table fills one cluster with 8-byte entries. */
+       l2sz = disk->clustersz / 8;
+       l1off = (off >> disk->clustershift) / l2sz;
+       if (l1off >= disk->l1size)
+               goto done;
+
+       log_debug("read l1off: %llx", l1off);
+       l2tab = unpackbe64(disk->l1 + 8*l1off);
+       l2tab &= ~QCOW2_INPLACE;
+       if (l2tab == 0) {
+               ret = 0;
+               goto done;
+       }
+       l2off = (off >> disk->clustershift) % l2sz;
+       log_debug("read l2off: %llx", l2off);
+       /* Don't decode stack garbage if the entry can't be read. */
+       if (pread(disk->fd, buf, sizeof(buf), l2tab + l2off*8) !=
+           (ssize_t)sizeof(buf))
+               goto done;
+       cluster = unpackbe64(buf);
+       /*
+        * cluster may be 0, but all future operations don't affect
+        * the return value.
+        */
+       if (inplace)
+               *inplace = !!(cluster & QCOW2_INPLACE);
+       if (cluster & QCOW2_COMPRESSED) {
+               log_warn("%s: compressed clusters unsupported", __progname);
+               goto done;
+       }
+       clusteroff = 0;
+       cluster &= ~QCOW2_INPLACE;
+       if (cluster)
+               clusteroff = off % disk->clustersz;
+       log_debug("cluster: %llx, clusteroff: %llx", cluster, clusteroff);
+       ret = cluster + clusteroff;
+done:
+       pthread_rwlock_unlock(&disk->lock);
+       return ret;
+}
+
+/*
+ * Allocates a new cluster at the end of the image file for the cluster
+ * containing virtual offset 'off', creating a new L2 table (or copying
+ * the existing one) first if the L1 entry is empty or not marked
+ * QCOW2_INPLACE. The cluster starts off with a refs of one, and the
+ * writable bit set.
+ *
+ * 'src_phys', when non-zero, is an address inside an existing cluster
+ * whose contents are copied into the new one before it is mapped.
+ *
+ * The disk's lock is held for writing for the duration of the call.
+ *
+ * Returns -1 on error, and otherwise the physical address of 'off'
+ * within the new cluster.
+ */
+static off_t
+mkcluster(struct qcdisk *disk, off_t off, off_t src_phys)
+{
+       off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig, mask;
+       char buf[8];
+
+       pthread_rwlock_wrlock(&disk->lock);
+
+       cluster = -1;
+       /* L1 entries always exist */
+       l2sz = (1ull << disk->clustershift) / 8;
+       l1off = (off >> disk->clustershift) / l2sz;
+       if (l1off >= disk->l1size)
+               goto fail;
+
+       /*
+        * Align disk to cluster size, for ftruncate: Not strictly
+        * required, but it easier to eyeball buggy write offsets,
+        * and helps performance a bit.
+        *
+        * The mask is built in off_t: complementing the 32-bit clustersz
+        * directly would also clear every bit of 'end' above 4G.
+        */
+       mask = (off_t)disk->clustersz - 1;
+       disk->end = (disk->end + mask) & ~mask;
+
+       l2tab = unpackbe64(disk->l1 + 8*l1off);
+       l2off = (off >> disk->clustershift) % l2sz;
+       log_debug("l2tab: %llx, l2off: %llx, inplace: %d",
+           l2tab & ~QCOW2_INPLACE, l2off, (l2tab & QCOW2_INPLACE) != 0);
+       /* We may need to create or clone an L2 entry to map the block */
+       if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
+               log_debug("creating l2 table");
+               orig = l2tab & ~QCOW2_INPLACE;
+               l2tab = disk->end;
+               disk->end += disk->clustersz;
+               if (ftruncate(disk->fd, disk->end) == -1) {
+                       log_warn("%s: ftruncate", __progname);
+                       goto fail;
+               }
+
+               /*
+                * If we translated, found a L2 entry, but it needed to
+                * be copied, copy it.
+                */
+               if (orig != 0 && move_cluster(disk, l2tab, orig) == -1) {
+                       log_warn("%s: move cluster", __progname);
+                       goto fail;
+               }
+               /* Update l1 -- we flush it later */
+               packbe64(disk->l1 + 8*l1off, l2tab | QCOW2_INPLACE);
+               if (inc_refs(disk, l2tab, 1) == -1) {
+                       log_warn("%s: refs", __progname);
+                       goto fail;
+               }
+       }
+       l2tab &= ~QCOW2_INPLACE;
+
+       /* Grow the disk */
+       if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
+               goto fail;
+       /*
+        * The caller hands us the translated address of 'off', which
+        * points into the middle of the source cluster; copy from the
+        * start of that cluster so the data keeps its position.
+        */
+       if (src_phys > 0 &&
+           move_cluster(disk, disk->end, src_phys & ~mask) == -1)
+               goto fail;
+       cluster = disk->end;
+       disk->end += disk->clustersz;
+       packbe64(buf, cluster | QCOW2_INPLACE);
+       log_debug("update l2: %zd@%lld", sizeof(buf), l2tab + l2off*8);
+       if (pwrite(disk->fd, buf, sizeof(buf), l2tab + l2off*8) != sizeof(buf))
+               goto fail;
+
+       /* TODO: lazily sync: currently VMD doesn't close things */
+       if (pwrite(disk->fd, disk->l1 + 8*l1off, 8, disk->l1off + 8*l1off) != 8)
+               goto fail;
+       if (inc_refs(disk, cluster, 1) == -1)
+               goto fail;
+
+       pthread_rwlock_unlock(&disk->lock);
+       clusteroff = off % disk->clustersz;
+       return cluster + clusteroff;
+
+fail:
+       pthread_rwlock_unlock(&disk->lock);
+       return -1;
+}
+
+/*
+ * Copies one cluster's worth of bytes from file offset 'src' to file
+ * offset 'dst' through the disk's scratch buffer. Must be called with
+ * write lock held.
+ *
+ * Returns 0 on success and -1 if either transfer fails or moves fewer
+ * bytes than a whole cluster.
+ */
+static int
+move_cluster(struct qcdisk *disk, off_t dst, off_t src)
+{
+       ssize_t want;
+
+       want = disk->clustersz;
+       /* A short transfer would leave a partly stale cluster behind. */
+       if (pread(disk->fd, disk->scratch, want, src) != want)
+               return -1;
+       if (pwrite(disk->fd, disk->scratch, want, dst) != want)
+               return -1;
+       return 0;
+}
+
+/*
+ * Records a reference to the cluster at file offset 'off' in the
+ * refcount structures, which this code treats as 2-byte big-endian
+ * counters.
+ *
+ * If the refcount table has no block for the cluster yet, one is
+ * appended to the file and entered into the table. With 'newcluster'
+ * set the counter is written as 1 without being read first; otherwise
+ * the stored counter is incremented.
+ *
+ * Returns 0 on success and -1 if any read, write or file extension
+ * fails.
+ */
+static int
+inc_refs(struct qcdisk *disk, off_t off, int newcluster)
+{
+       off_t l1off, l1idx, l2idx, l2cluster;
+       size_t nper;
+       uint16_t refs;
+       char buf[8];
+
+       off &= ~QCOW2_INPLACE;
+       /* Number of 2-byte counters that fit into one refcount block. */
+       nper = disk->clustersz / 2;
+       log_debug("incrementing refs for %llx (cluster %llx, nper=%zd)",
+           off, off / disk->clustersz, nper);
+       l1idx = (off / disk->clustersz) / nper;
+       l2idx = (off / disk->clustersz) % nper;
+       l1off = disk->refoff + 8*l1idx;
+       if (pread(disk->fd, buf, 8, l1off) != 8)
+               return -1;
+
+       l2cluster = unpackbe64(buf);
+       if (l2cluster == 0) {
+               l2cluster = disk->end;
+               disk->end += disk->clustersz;
+               if (ftruncate(disk->fd, disk->end) < 0) {
+                       log_debug("refs block grow fail ");
+                       return -1;
+               }
+               packbe64(buf, l2cluster);
+               if (pwrite(disk->fd, buf, 8, l1off) != 8) {
+                       return -1;
+               }
+       }
+
+       refs = 1;
+       if (!newcluster) {
+               if (pread(disk->fd, buf, 2, l2cluster + 2*l2idx) != 2)
+                       return -1;
+               refs = unpackbe16(buf) + 1;
+       }
+       packbe16(buf, refs);
+       log_debug("writing refs %d for offset %llx (l1: %llx, l2: %llx) "
+           "to %llx + %llx",
+           refs, off, l1idx, l2idx, l2cluster, 2*l2idx);
+       if (pwrite(disk->fd, buf, 2, l2cluster + 2*l2idx) != 2) {
+               /* The count never reached the disk; tell the caller. */
+               log_debug("could not write ref block");
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Decodes the big-endian 32-bit value at '*buf' and advances '*buf' past
+ * it. Asserts that at least four bytes remain before 'end'.
+ */
+static uint32_t
+getbe32(char **buf, char *end)
+{
+       uint32_t val;
+
+       assert(end - *buf >= 4);
+       val = unpackbe32(*buf);
+       *buf += 4;
+       return val;
+}
+
+/*
+ * Decodes the big-endian 64-bit value at '*buf' and advances '*buf' past
+ * it. Asserts that at least eight bytes remain before 'end'.
+ */
+static uint64_t
+getbe64(char **buf, char *end)
+{
+       uint64_t val;
+
+       assert(end - *buf >= 8);
+       val = unpackbe64(*buf);
+       *buf += 8;
+       return val;
+}
+
+
+/* Assembles the eight bytes at 'p', most significant first, into a value. */
+static uint64_t
+unpackbe64(char *p)
+{
+       uint64_t val;
+       int i;
+
+       val = 0;
+       for (i = 0; i < 8; i++)
+               val = (val << 8) | ((uint64_t)p[i] & 0xff);
+       return val;
+}
+
+/* Assembles the four bytes at 'p', most significant first, into a value. */
+static uint32_t
+unpackbe32(char *p)
+{
+       uint32_t val;
+       int i;
+
+       val = 0;
+       for (i = 0; i < 4; i++)
+               val = (val << 8) | ((uint32_t)p[i] & 0xff);
+       return val;
+}
+
+/*
+ * Assembles the two bytes at 'p', most significant first, into a value.
+ * Only p[0] and p[1] are read, mirroring what packbe16() writes.
+ */
+static uint16_t
+unpackbe16(char *p)
+{
+       return (((uint16_t)p[0] & 0xff) << 8) | ((uint16_t)p[1] & 0xff);
+}
+
+/* Stores 'v' into the eight bytes at 'p', most significant byte first. */
+static void
+packbe64(char *p, uint64_t v)
+{
+       int i;
+
+       for (i = 0; i < 8; i++)
+               p[i] = (v >> (56 - 8*i));
+}
+
+//static void
+//packbe32(char *p, uint32_t v)
+//{
+//     p[0] = (v >> 24ul); p[1] = (v >> 16ul);
+//     p[2] = (v >>  8ul); p[3] = (v >>  0ul);
+//}
+
+/* Stores 'v' into the two bytes at 'p', most significant byte first. */
+static void
+packbe16(char *p, uint16_t v)
+{
+       p[1] = v;
+       p[0] = (v >> 8);
+}
+
diff --git usr.sbin/vmd/vioqcow2.h usr.sbin/vmd/vioqcow2.h
new file mode 100644
index 00000000000..29e7b9a6510
--- /dev/null
+++ usr.sbin/vmd/vioqcow2.h
@@ -0,0 +1,6 @@
+#ifndef _QCOW2_H_
+#define _QCOW2_H_
+
+#include <stdint.h>
+
+#endif
diff --git usr.sbin/vmd/vioraw.c usr.sbin/vmd/vioraw.c
index ffd352d0e37..9eaf597e9e0 100644
--- usr.sbin/vmd/vioraw.c
+++ usr.sbin/vmd/vioraw.c
@@ -1,3 +1,20 @@
+/*     $OpenBSD: $     */
+/*
+ * Copyright (c) 2018 Ori Bernstein <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
 #include <sys/param.h> /* PAGE_SIZE */
 #include <sys/socket.h>
 
diff --git usr.sbin/vmd/vioscribble.c usr.sbin/vmd/vioscribble.c
new file mode 100644
index 00000000000..61e0515b97c
--- /dev/null
+++ usr.sbin/vmd/vioscribble.c
@@ -0,0 +1,143 @@
+/* 
+ * Quick hack of a program to try to test vioqcow2.c against
+ * vioraw.c.
+ *
+ * Compile with:
+ *
+ *     cc -pthread -o scribble vioscribble.c vioqcow2.c vioraw.c
+ */
+#include <sys/param.h> /* PAGE_SIZE */
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include <machine/vmmvar.h>
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcidevs.h>
+#include <dev/pv/virtioreg.h>
+#include <dev/pv/vioblkreg.h>
+#include <dev/pv/vioscsireg.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/if_ether.h>
+
+#include <errno.h>
+#include <event.h>
+#include <poll.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <err.h>
+
+#include "pci.h"
+#include "vmd.h"
+#include "vmm.h"
+#include "virtio.h"
+
+#define CLUSTERSZ 65536
+
+struct virtio_backing qcowfile;
+struct virtio_backing rawfile;
+
+/* We expect the scribble disks to be 4g in size */
+#define DISKSZ (4ull*1024ull*1024ull*1024ull)
+
+/* functions that io code depends on */
+
+/*
+ * Stand-in for the log_debug() that the I/O code calls, so that it links
+ * into this test program. Debug output is discarded; re-enable the
+ * commented-out body to see it on stdout.
+ */
+void
+log_debug(const char *emsg, ...)
+{
+       //va_list ap;
+
+       //va_start(ap, emsg);
+       //vfprintf(stdout, emsg, ap);
+       //fprintf(stdout, "\n");
+       //va_end(ap);
+}
+
+/*
+ * Stand-in for the log_warnx() that the I/O code calls: formats the
+ * message to stdout and terminates it with a newline.
+ */
+void
+log_warnx(const char *emsg, ...)
+{
+       va_list args;
+
+       va_start(args, emsg);
+       vprintf(emsg, args);
+       va_end(args);
+       fputs("\n", stdout);
+}
+
+/*
+ * Stand-in for the log_warn() that the I/O code calls: formats the
+ * message to stdout and terminates it with a newline.
+ */
+void
+log_warn(const char *emsg, ...)
+{
+       va_list args;
+
+       va_start(args, emsg);
+       vprintf(emsg, args);
+       va_end(args);
+       fputs("\n", stdout);
+}
+
+/*
+ * Fills all 'len' bytes of 'buf' with one value derived from 'off'
+ * (off >> 8, truncated to a char), so that a mismatching byte hints at
+ * the offset it was written for.
+ */
+static void
+fill(size_t off, char *buf, size_t len)
+{
+       char pattern;
+
+       pattern = (off >> 8);
+       memset(buf, pattern, len);
+}
+
+/*
+ * Opens scribble.qc2 and scribble.raw from the current directory, writes
+ * the same pseudo-random pattern to both through their virtio_backing
+ * hooks, and then compares the two disks over DISKSZ bytes.
+ *
+ * Exits via err(3) if either disk cannot be opened. Otherwise returns 0
+ * if every operation succeeded and the disks match, and 1 after any I/O
+ * failure or mismatch.
+ */
+int
+main(int argc, char **argv)
+{
+       int qcfd, rawfd, i, rv;
+       char buf[64*1024], cmp[64*1024];
+       off_t len, off, qcsz, rawsz;
+
+       qcfd = open("scribble.qc2", O_RDWR);
+       rawfd = open("scribble.raw", O_RDWR);
+       if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
+               err(1, "unable to open qcow");
+       if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
+               err(1, "unable to open raw");
+
+       rv = 0;
+       /* Fixed seed: every run scribbles over the same offsets. */
+       srandom_deterministic(123);
+
+       /* scribble to both disks */
+       printf("scribbling...\n");
+       for (i = 0; i < 16; i++) {
+               off = (random() % DISKSZ);
+               len = random() % sizeof buf + 1;
+               fill(off, buf, sizeof buf);
+               if (qcowfile.pwrite(qcowfile.p, buf, len, off) == -1) {
+                       printf("iter %d: unable to write at %llx\n", i, off);
+                       rv = 1;
+               }
+               if (rawfile.pwrite(rawfile.p, buf, len, off) == -1) {
+                       printf("iter %d: unable to write raw at %llx\n",
+                           i, off);
+                       rv = 1;
+               }
+
+               /* Read back what was just written and compare the disks. */
+               if (qcowfile.pread(qcowfile.p, buf, len, off) == -1) {
+                       printf("unable to read at %llx\n", off);
+                       rv = 1;
+               }
+               if (rawfile.pread(rawfile.p, cmp, len, off) == -1) {
+                       printf("unable to read raw at %llx\n", off);
+                       rv = 1;
+               }
+               if (memcmp(buf, cmp, len) != 0) {
+                       printf("iter %d: mismatch at 0x%llx "
+                           "(expected val: %d)\n",
+                           i, off, (char)(off  >> 8));
+                       rv = 1;
+                       break;
+               }
+       }
+
+       /* validate that both disks match */
+       printf("validating...\n");
+       for (off = 0; off < DISKSZ; off += sizeof buf) {
+               if (qcowfile.pread(qcowfile.p, buf, sizeof buf, off) == -1) {
+                       printf("unable to read at %llx\n", off);
+                       rv = 1;
+               }
+               if (rawfile.pread(rawfile.p, cmp, sizeof buf, off) == -1) {
+                       printf("unable to read raw at %llx\n", off);
+                       rv = 1;
+               }
+               if (memcmp(buf, cmp, sizeof buf) != 0) {
+                       printf("mismatch at 0x%llx (expected val: %d)\n",
+                           off, (char)(off  >> 8));
+                       rv = 1;
+                       break;
+               }
+       }
+       return rv;
+}
diff --git usr.sbin/vmd/virtio.c usr.sbin/vmd/virtio.c
index d019bccb945..222ca2a5fd4 100644
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -1749,10 +1749,14 @@ static int
 virtio_init_disk(struct virtio_backing *file, off_t *sz, int fd)
 {
        /* 
-        * This is where we slot in disk type selection.
-        *  Right now, there's only raw.
+        * probe disk types in order of preference, first one to work wins.
+        * TODO: provide a way of specifying the type and options.
         */
-       return virtio_init_raw(file, sz, fd);
+       if (virtio_init_qcow2(file, sz, fd) == 0)
+               return 0;
+       if (virtio_init_raw(file, sz, fd) == 0)
+               return 0;
+       return -1;
 }
 
 void

-- 
    Ori Bernstein

Reply via email to