Author: mav
Date: Sat Apr 18 20:10:19 2015
New Revision: 281700
URL: https://svnweb.freebsd.org/changeset/base/281700

Log:
  Work around problems with bhyve virtual disks operating on top of GEOM providers.
  
  GEOM does not support scatter/gather lists in its I/Os.  Such requests
  are cut into pieces by physio(), which may be problematic if those pieces
  are not multiples of the provider's sector size.  If such a case is
  detected, move the data through a temporary sequential buffer.
  
  MFC after:    2 weeks

Modified:
  head/usr.sbin/bhyve/block_if.c
  head/usr.sbin/bhyve/block_if.h
  head/usr.sbin/bhyve/pci_ahci.c
  head/usr.sbin/bhyve/pci_virtio_block.c

Modified: head/usr.sbin/bhyve/block_if.c
==============================================================================
--- head/usr.sbin/bhyve/block_if.c      Sat Apr 18 19:52:39 2015        (r281699)
+++ head/usr.sbin/bhyve/block_if.c      Sat Apr 18 20:10:19 2015        (r281700)
@@ -85,6 +85,7 @@ struct blockif_ctxt {
        int                     bc_magic;
        int                     bc_fd;
        int                     bc_ischr;
+       int                     bc_isgeom;
        int                     bc_candelete;
        int                     bc_rdonly;
        off_t                   bc_size;
@@ -198,27 +199,93 @@ blockif_complete(struct blockif_ctxt *bc
 }
 
 static void
-blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
+blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
 {
        struct blockif_req *br;
        off_t arg[2];
-       int err;
+       ssize_t clen, len, off, boff, voff;
+       int i, err;
 
        br = be->be_req;
+       if (br->br_iovcnt <= 1)
+               buf = NULL;
        err = 0;
-
        switch (be->be_op) {
        case BOP_READ:
-               if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
-                          br->br_offset) < 0)
-                       err = errno;
+               if (buf == NULL) {
+                       if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
+                                  br->br_offset)) < 0)
+                               err = errno;
+                       else
+                               br->br_resid -= len;
+                       break;
+               }
+               i = 0;
+               off = voff = 0;
+               while (br->br_resid > 0) {
+                       len = MIN(br->br_resid, MAXPHYS);
+                       if (pread(bc->bc_fd, buf, len, br->br_offset +
+                           off) < 0) {
+                               err = errno;
+                               break;
+                       }
+                       boff = 0;
+                       do {
+                               clen = MIN(len - boff, br->br_iov[i].iov_len -
+                                   voff);
+                               memcpy(br->br_iov[i].iov_base + voff,
+                                   buf + boff, clen);
+                               if (clen < br->br_iov[i].iov_len - voff)
+                                       voff += clen;
+                               else {
+                                       i++;
+                                       voff = 0;
+                               }
+                               boff += clen;
+                       } while (boff < len);
+                       off += len;
+                       br->br_resid -= len;
+               }
                break;
        case BOP_WRITE:
-               if (bc->bc_rdonly)
+               if (bc->bc_rdonly) {
                        err = EROFS;
-               else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
-                            br->br_offset) < 0)
-                       err = errno;
+                       break;
+               }
+               if (buf == NULL) {
+                       if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
+                                   br->br_offset)) < 0)
+                               err = errno;
+                       else
+                               br->br_resid -= len;
+                       break;
+               }
+               i = 0;
+               off = voff = 0;
+               while (br->br_resid > 0) {
+                       len = MIN(br->br_resid, MAXPHYS);
+                       boff = 0;
+                       do {
+                               clen = MIN(len - boff, br->br_iov[i].iov_len -
+                                   voff);
+                               memcpy(buf + boff,
+                                   br->br_iov[i].iov_base + voff, clen);
+                               if (clen < br->br_iov[i].iov_len - voff)
+                                       voff += clen;
+                               else {
+                                       i++;
+                                       voff = 0;
+                               }
+                               boff += clen;
+                       } while (boff < len);
+                       if (pwrite(bc->bc_fd, buf, len, br->br_offset +
+                           off) < 0) {
+                               err = errno;
+                               break;
+                       }
+                       off += len;
+                       br->br_resid -= len;
+               }
                break;
        case BOP_FLUSH:
                if (bc->bc_ischr) {
@@ -234,9 +301,11 @@ blockif_proc(struct blockif_ctxt *bc, st
                        err = EROFS;
                else if (bc->bc_ischr) {
                        arg[0] = br->br_offset;
-                       arg[1] = br->br_iov[0].iov_len;
+                       arg[1] = br->br_resid;
                        if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
                                err = errno;
+                       else
+                               br->br_resid = 0;
                } else
                        err = EOPNOTSUPP;
                break;
@@ -256,15 +325,20 @@ blockif_thr(void *arg)
        struct blockif_ctxt *bc;
        struct blockif_elem *be;
        pthread_t t;
+       uint8_t *buf;
 
        bc = arg;
+       if (bc->bc_isgeom)
+               buf = malloc(MAXPHYS);
+       else
+               buf = NULL;
        t = pthread_self();
 
        pthread_mutex_lock(&bc->bc_mtx);
        for (;;) {
                while (blockif_dequeue(bc, t, &be)) {
                        pthread_mutex_unlock(&bc->bc_mtx);
-                       blockif_proc(bc, be);
+                       blockif_proc(bc, be, buf);
                        pthread_mutex_lock(&bc->bc_mtx);
                        blockif_complete(bc, be);
                }
@@ -275,6 +349,8 @@ blockif_thr(void *arg)
        }
        pthread_mutex_unlock(&bc->bc_mtx);
 
+       if (buf)
+               free(buf);
        pthread_exit(NULL);
        return (NULL);
 }
@@ -315,13 +391,14 @@ struct blockif_ctxt *
 blockif_open(const char *optstr, const char *ident)
 {
        char tname[MAXCOMLEN + 1];
+       char name[MAXPATHLEN];
        char *nopt, *xopts;
        struct blockif_ctxt *bc;
        struct stat sbuf;
        struct diocgattr_arg arg;
        off_t size, psectsz, psectoff;
        int extra, fd, i, sectsz;
-       int nocache, sync, ro, candelete;
+       int nocache, sync, ro, candelete, geom;
 
        pthread_once(&blockif_once, blockif_init);
 
@@ -375,7 +452,7 @@ blockif_open(const char *optstr, const c
         size = sbuf.st_size;
        sectsz = DEV_BSIZE;
        psectsz = psectoff = 0;
-       candelete = 0;
+       candelete = geom = 0;
        if (S_ISCHR(sbuf.st_mode)) {
                if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
                    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
@@ -391,6 +468,8 @@ blockif_open(const char *optstr, const c
                arg.len = sizeof(arg.value.i);
                if (ioctl(fd, DIOCGATTR, &arg) == 0)
                        candelete = arg.value.i;
+               if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
+                       geom = 1;
        } else
                psectsz = sbuf.st_blksize;
 
@@ -403,6 +482,7 @@ blockif_open(const char *optstr, const c
        bc->bc_magic = BLOCKIF_SIG;
        bc->bc_fd = fd;
        bc->bc_ischr = S_ISCHR(sbuf.st_mode);
+       bc->bc_isgeom = geom;
        bc->bc_candelete = candelete;
        bc->bc_rdonly = ro;
        bc->bc_size = size;

Modified: head/usr.sbin/bhyve/block_if.h
==============================================================================
--- head/usr.sbin/bhyve/block_if.h      Sat Apr 18 19:52:39 2015        (r281699)
+++ head/usr.sbin/bhyve/block_if.h      Sat Apr 18 20:10:19 2015        (r281700)
@@ -45,6 +45,7 @@ struct blockif_req {
        struct iovec    br_iov[BLOCKIF_IOV_MAX];
        int             br_iovcnt;
        off_t           br_offset;
+       ssize_t         br_resid;
        void            (*br_callback)(struct blockif_req *req, int err);
        void            *br_param;
 };

Modified: head/usr.sbin/bhyve/pci_ahci.c
==============================================================================
--- head/usr.sbin/bhyve/pci_ahci.c      Sat Apr 18 19:52:39 2015        (r281699)
+++ head/usr.sbin/bhyve/pci_ahci.c      Sat Apr 18 20:10:19 2015        (r281700)
@@ -571,6 +571,7 @@ ahci_build_iov(struct ahci_port *p, stru
        }
 
        breq->br_iovcnt = j;
+       breq->br_resid = todo;
        aior->done += todo;
        aior->more = (aior->done < aior->len && i < prdtl);
 }
@@ -776,8 +777,7 @@ next:
 
        breq = &aior->io_req;
        breq->br_offset = elba * blockif_sectsz(p->bctx);
-       breq->br_iovcnt = 1;
-       breq->br_iov[0].iov_len = elen * blockif_sectsz(p->bctx);
+       breq->br_resid = elen * blockif_sectsz(p->bctx);
 
        /*
         * Mark this command in-flight.

Modified: head/usr.sbin/bhyve/pci_virtio_block.c
==============================================================================
--- head/usr.sbin/bhyve/pci_virtio_block.c      Sat Apr 18 19:52:39 2015        (r281699)
+++ head/usr.sbin/bhyve/pci_virtio_block.c      Sat Apr 18 20:10:19 2015        (r281700)
@@ -198,7 +198,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *s
        struct pci_vtblk_ioreq *io;
        int i, n;
        int err;
-       int iolen;
+       ssize_t iolen;
        int writeop, type;
        off_t offset;
        struct iovec iov[BLOCKIF_IOV_MAX + 2];
@@ -246,8 +246,9 @@ pci_vtblk_proc(struct pci_vtblk_softc *s
                assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
                iolen += iov[i].iov_len;
        }
+       io->io_req.br_resid = iolen;
 
-       DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", 
+       DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", 
                 writeop ? "write" : "read/ident", iolen, i - 1, offset));
 
        switch (type) {
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to