Author: kib
Date: Tue Mar 19 14:53:23 2013
New Revision: 248518
URL: http://svnweb.freebsd.org/changeset/base/248518

Log:
  Support unmapped I/O for md(4).
  
  The vnode-backed md(4) has to map unmapped bios because the VOP_READ()
  and VOP_WRITE() interfaces do not allow passing unmapped requests to
  the filesystem. Vnode-backed md(4) uses pbufs instead of relying on
  bio_transient_map, to avoid the usual md deadlock.
  
  Sponsored by: The FreeBSD Foundation
  Tested by:    pho, scottl

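The heart of the vnode-backed change is the pbuf mapping done in mdstart_vnode():
an unmapped bio's pages are temporarily entered into the KVA window of a pbuf
drawn from the new md_vnode_pbuf_freecnt pool, the uio is pointed at that
mapping for the VOP call, and the mapping and pbuf are released afterwards.
The condensed sketch below is for illustration only; the helper names
md_map_unmapped_bio()/md_unmap_unmapped_bio() do not exist in the commit,
where the same steps are done inline in mdstart_vnode() (see the diff below).

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_pager.h>

/*
 * Illustrative helpers only; the commit performs these steps inline in
 * mdstart_vnode().  md_vnode_pbuf_freecnt is the static counter added
 * by the commit and initialized to nswbuf / 10 in g_md_init().
 */
static int md_vnode_pbuf_freecnt;

static struct buf *
md_map_unmapped_bio(struct bio *bp, struct iovec *aiov)
{
	struct buf *pb;

	KASSERT(bp->bio_length <= MAXPHYS, ("bio_length %jd",
	    (uintmax_t)bp->bio_length));
	if ((bp->bio_flags & BIO_UNMAPPED) == 0) {
		/* Mapped bio: the data is already addressable. */
		aiov->iov_base = bp->bio_data;
		return (NULL);
	}
	/* Borrow a pbuf and enter the bio's pages into its KVA window. */
	pb = getpbuf(&md_vnode_pbuf_freecnt);
	pmap_qenter((vm_offset_t)pb->b_data, bp->bio_ma, bp->bio_ma_n);
	aiov->iov_base = (void *)((vm_offset_t)pb->b_data +
	    bp->bio_ma_offset);
	return (pb);
}

static void
md_unmap_unmapped_bio(struct bio *bp, struct buf *pb)
{

	if (pb == NULL)
		return;
	/* Tear down the temporary mapping and give the pbuf back. */
	pmap_qremove((vm_offset_t)pb->b_data, bp->bio_ma_n);
	relpbuf(pb, &md_vnode_pbuf_freecnt);
}

In the committed code the pmap_qenter() happens before the uio is built and
the pmap_qremove()/relpbuf() happen after VOP_READ()/VOP_WRITE() return, so
the pages stay mapped only for the duration of the filesystem call.
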
Modified:
  head/sys/dev/md/md.c

Modified: head/sys/dev/md/md.c
==============================================================================
--- head/sys/dev/md/md.c        Tue Mar 19 14:50:24 2013        (r248517)
+++ head/sys/dev/md/md.c        Tue Mar 19 14:53:23 2013        (r248518)
@@ -18,11 +18,16 @@
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1990, 1993
  *     The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2013 The FreeBSD Foundation
+ * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -59,6 +64,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
+#include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/devicestat.h>
 #include <sys/fcntl.h>
@@ -169,6 +175,8 @@ static LIST_HEAD(, md_s) md_softc_list =
 #define NMASK  (NINDIR-1)
 static int nshift;
 
+static int md_vnode_pbuf_freecnt;
+
 struct indir {
        uintptr_t       *array;
        u_int           total;
@@ -415,11 +423,103 @@ g_md_start(struct bio *bp)
        wakeup(sc);
 }
 
+#define        MD_MALLOC_MOVE_ZERO     1
+#define        MD_MALLOC_MOVE_FILL     2
+#define        MD_MALLOC_MOVE_READ     3
+#define        MD_MALLOC_MOVE_WRITE    4
+#define        MD_MALLOC_MOVE_CMP      5
+
+static int
+md_malloc_move(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
+    void *ptr, u_char fill, int op)
+{
+       struct sf_buf *sf;
+       vm_page_t m, *mp1;
+       char *p, first;
+       off_t *uc;
+       unsigned n;
+       int error, i, ma_offs1, sz, first_read;
+
+       m = NULL;
+       error = 0;
+       sf = NULL;
+       /* if (op == MD_MALLOC_MOVE_CMP) { gcc */
+               first = 0;
+               first_read = 0;
+               uc = ptr;
+               mp1 = *mp;
+               ma_offs1 = *ma_offs;
+       /* } */
+       sched_pin();
+       for (n = sectorsize; n != 0; n -= sz) {
+               sz = imin(PAGE_SIZE - *ma_offs, n);
+               if (m != **mp) {
+                       if (sf != NULL)
+                               sf_buf_free(sf);
+                       m = **mp;
+                       sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
+                           (md_malloc_wait ? 0 : SFB_NOWAIT));
+                       if (sf == NULL) {
+                               error = ENOMEM;
+                               break;
+                       }
+               }
+               p = (char *)sf_buf_kva(sf) + *ma_offs;
+               switch (op) {
+               case MD_MALLOC_MOVE_ZERO:
+                       bzero(p, sz);
+                       break;
+               case MD_MALLOC_MOVE_FILL:
+                       memset(p, fill, sz);
+                       break;
+               case MD_MALLOC_MOVE_READ:
+                       bcopy(ptr, p, sz);
+                       cpu_flush_dcache(p, sz);
+                       break;
+               case MD_MALLOC_MOVE_WRITE:
+                       bcopy(p, ptr, sz);
+                       break;
+               case MD_MALLOC_MOVE_CMP:
+                       for (i = 0; i < sz; i++, p++) {
+                               if (!first_read) {
+                                       *uc = (u_char)*p;
+                                       first = *p;
+                                       first_read = 1;
+                               } else if (*p != first) {
+                                       error = EDOOFUS;
+                                       break;
+                               }
+                       }
+                       break;
+               default:
+                       KASSERT(0, ("md_malloc_move unknown op %d\n", op));
+                       break;
+               }
+               if (error != 0)
+                       break;
+               *ma_offs += sz;
+               *ma_offs %= PAGE_SIZE;
+               if (*ma_offs == 0)
+                       (*mp)++;
+               ptr = (char *)ptr + sz;
+       }
+
+       if (sf != NULL)
+               sf_buf_free(sf);
+       sched_unpin();
+       if (op == MD_MALLOC_MOVE_CMP && error != 0) {
+               *mp = mp1;
+               *ma_offs = ma_offs1;
+       }
+       return (error);
+}
+
 static int
 mdstart_malloc(struct md_s *sc, struct bio *bp)
 {
-       int i, error;
        u_char *dst;
+       vm_page_t *m;
+       int i, error, error1, ma_offs, notmapped;
        off_t secno, nsec, uc;
        uintptr_t sp, osp;
 
@@ -432,9 +532,17 @@ mdstart_malloc(struct md_s *sc, struct b
                return (EOPNOTSUPP);
        }
 
+       notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
+       if (notmapped) {
+               m = bp->bio_ma;
+               ma_offs = bp->bio_ma_offset;
+               dst = NULL;
+       } else {
+               dst = bp->bio_data;
+       }
+
        nsec = bp->bio_length / sc->sectorsize;
        secno = bp->bio_offset / sc->sectorsize;
-       dst = bp->bio_data;
        error = 0;
        while (nsec--) {
                osp = s_read(sc->indir, secno);
@@ -442,21 +550,45 @@ mdstart_malloc(struct md_s *sc, struct b
                        if (osp != 0)
                                error = s_write(sc->indir, secno, 0);
                } else if (bp->bio_cmd == BIO_READ) {
-                       if (osp == 0)
-                               bzero(dst, sc->sectorsize);
-                       else if (osp <= 255)
-                               memset(dst, osp, sc->sectorsize);
-                       else {
-                               bcopy((void *)osp, dst, sc->sectorsize);
-                               cpu_flush_dcache(dst, sc->sectorsize);
+                       if (osp == 0) {
+                               if (notmapped) {
+                                       error = md_malloc_move(&m, &ma_offs,
+                                           sc->sectorsize, NULL, 0,
+                                           MD_MALLOC_MOVE_ZERO);
+                               } else
+                                       bzero(dst, sc->sectorsize);
+                       } else if (osp <= 255) {
+                               if (notmapped) {
+                                       error = md_malloc_move(&m, &ma_offs,
+                                           sc->sectorsize, NULL, osp,
+                                           MD_MALLOC_MOVE_FILL);
+                               } else
+                                       memset(dst, osp, sc->sectorsize);
+                       } else {
+                               if (notmapped) {
+                                       error = md_malloc_move(&m, &ma_offs,
+                                           sc->sectorsize, (void *)osp, 0,
+                                           MD_MALLOC_MOVE_READ);
+                               } else {
+                                       bcopy((void *)osp, dst, sc->sectorsize);
+                                       cpu_flush_dcache(dst, sc->sectorsize);
+                               }
                        }
                        osp = 0;
                } else if (bp->bio_cmd == BIO_WRITE) {
                        if (sc->flags & MD_COMPRESS) {
-                               uc = dst[0];
-                               for (i = 1; i < sc->sectorsize; i++)
-                                       if (dst[i] != uc)
-                                               break;
+                               if (notmapped) {
+                                       error1 = md_malloc_move(&m, &ma_offs,
+                                           sc->sectorsize, &uc, 0,
+                                           MD_MALLOC_MOVE_CMP);
+                                       i = error1 == 0 ? sc->sectorsize : 0;
+                               } else {
+                                       uc = dst[0];
+                                       for (i = 1; i < sc->sectorsize; i++) {
+                                               if (dst[i] != uc)
+                                                       break;
+                                       }
+                               }
                        } else {
                                i = 0;
                                uc = 0;
@@ -473,10 +605,26 @@ mdstart_malloc(struct md_s *sc, struct b
                                                error = ENOSPC;
                                                break;
                                        }
-                                       bcopy(dst, (void *)sp, sc->sectorsize);
+                                       if (notmapped) {
+                                               error = md_malloc_move(&m,
+                                                   &ma_offs, sc->sectorsize,
+                                                   (void *)sp, 0,
+                                                   MD_MALLOC_MOVE_WRITE);
+                                       } else {
+                                               bcopy(dst, (void *)sp,
+                                                   sc->sectorsize);
+                                       }
                                        error = s_write(sc->indir, secno, sp);
                                } else {
-                                       bcopy(dst, (void *)osp, sc->sectorsize);
+                                       if (notmapped) {
+                                               error = md_malloc_move(&m,
+                                                   &ma_offs, sc->sectorsize,
+                                                   (void *)osp, 0,
+                                                   MD_MALLOC_MOVE_WRITE);
+                                       } else {
+                                               bcopy(dst, (void *)osp,
+                                                   sc->sectorsize);
+                                       }
                                        osp = 0;
                                }
                        }
@@ -488,7 +636,8 @@ mdstart_malloc(struct md_s *sc, struct b
                if (error != 0)
                        break;
                secno++;
-               dst += sc->sectorsize;
+               if (!notmapped)
+                       dst += sc->sectorsize;
        }
        bp->bio_resid = 0;
        return (error);
@@ -521,6 +670,7 @@ mdstart_vnode(struct md_s *sc, struct bi
        struct iovec aiov;
        struct mount *mp;
        struct vnode *vp;
+       struct buf *pb;
        struct thread *td;
        off_t end, zerosize;
 
@@ -592,7 +742,17 @@ mdstart_vnode(struct md_s *sc, struct bi
                return (error);
        }
 
-       aiov.iov_base = bp->bio_data;
+       KASSERT(bp->bio_length <= MAXPHYS, ("bio_length %jd",
+           (uintmax_t)bp->bio_length));
+       if ((bp->bio_flags & BIO_UNMAPPED) == 0) {
+               pb = NULL;
+               aiov.iov_base = bp->bio_data;
+       } else {
+               pb = getpbuf(&md_vnode_pbuf_freecnt);
+               pmap_qenter((vm_offset_t)pb->b_data, bp->bio_ma, bp->bio_ma_n);
+               aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
+                   bp->bio_ma_offset);
+       }
        aiov.iov_len = bp->bio_length;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
@@ -622,6 +782,10 @@ mdstart_vnode(struct md_s *sc, struct bi
                VOP_UNLOCK(vp, 0);
                vn_finished_write(mp);
        }
+       if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
+               pmap_qremove((vm_offset_t)pb->b_data, bp->bio_ma_n);
+               relpbuf(pb, &md_vnode_pbuf_freecnt);
+       }
        bp->bio_resid = auio.uio_resid;
        return (error);
 }
@@ -629,11 +793,10 @@ mdstart_vnode(struct md_s *sc, struct bi
 static int
 mdstart_swap(struct md_s *sc, struct bio *bp)
 {
-       struct sf_buf *sf;
-       int rv, offs, len, lastend;
-       vm_pindex_t i, lastp;
        vm_page_t m;
        u_char *p;
+       vm_pindex_t i, lastp;
+       int rv, ma_offs, offs, len, lastend;
 
        switch (bp->bio_cmd) {
        case BIO_READ:
@@ -645,6 +808,7 @@ mdstart_swap(struct md_s *sc, struct bio
        }
 
        p = bp->bio_data;
+       ma_offs = (bp->bio_flags & BIO_UNMAPPED) == 0 ? 0 : bp->bio_ma_offset;
 
        /*
         * offs is the offset at which to start operating on the
@@ -662,19 +826,12 @@ mdstart_swap(struct md_s *sc, struct bio
        vm_object_pip_add(sc->object, 1);
        for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
                len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
-
-               m = vm_page_grab(sc->object, i,
-                   VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
-               VM_OBJECT_WUNLOCK(sc->object);
-               sched_pin();
-               sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
-               VM_OBJECT_WLOCK(sc->object);
+               m = vm_page_grab(sc->object, i, VM_ALLOC_NORMAL |
+                   VM_ALLOC_RETRY);
                if (bp->bio_cmd == BIO_READ) {
                        if (m->valid != VM_PAGE_BITS_ALL)
                                rv = vm_pager_get_pages(sc->object, &m, 1, 0);
                        if (rv == VM_PAGER_ERROR) {
-                               sf_buf_free(sf);
-                               sched_unpin();
                                vm_page_wakeup(m);
                                break;
                        } else if (rv == VM_PAGER_FAIL) {
@@ -684,40 +841,44 @@ mdstart_swap(struct md_s *sc, struct bio
                                 * valid. Do not set dirty, the page
                                 * can be recreated if thrown out.
                                 */
-                               bzero((void *)sf_buf_kva(sf), PAGE_SIZE);
+                               pmap_zero_page(m);
                                m->valid = VM_PAGE_BITS_ALL;
                        }
-                       bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
-                       cpu_flush_dcache(p, len);
+                       if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
+                               pmap_copy_pages(&m, offs, bp->bio_ma,
+                                   ma_offs, len);
+                       } else {
+                               physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
+                               cpu_flush_dcache(p, len);
+                       }
                } else if (bp->bio_cmd == BIO_WRITE) {
                        if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
                                rv = vm_pager_get_pages(sc->object, &m, 1, 0);
                        if (rv == VM_PAGER_ERROR) {
-                               sf_buf_free(sf);
-                               sched_unpin();
                                vm_page_wakeup(m);
                                break;
                        }
-                       bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
+                       if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
+                               pmap_copy_pages(bp->bio_ma, ma_offs, &m,
+                                   offs, len);
+                       } else {
+                               physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
+                       }
                        m->valid = VM_PAGE_BITS_ALL;
                } else if (bp->bio_cmd == BIO_DELETE) {
                        if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
                                rv = vm_pager_get_pages(sc->object, &m, 1, 0);
                        if (rv == VM_PAGER_ERROR) {
-                               sf_buf_free(sf);
-                               sched_unpin();
                                vm_page_wakeup(m);
                                break;
                        }
                        if (len != PAGE_SIZE) {
-                               bzero((void *)(sf_buf_kva(sf) + offs), len);
+                               pmap_zero_page_area(m, offs, len);
                                vm_page_clear_dirty(m, offs, len);
                                m->valid = VM_PAGE_BITS_ALL;
                        } else
                                vm_pager_page_unswapped(m);
                }
-               sf_buf_free(sf);
-               sched_unpin();
                vm_page_wakeup(m);
                vm_page_lock(m);
                if (bp->bio_cmd == BIO_DELETE && len == PAGE_SIZE)
@@ -731,6 +892,7 @@ mdstart_swap(struct md_s *sc, struct bio
                /* Actions on further pages start at offset 0 */
                p += PAGE_SIZE - offs;
                offs = 0;
+               ma_offs += len;
        }
        vm_object_pip_subtract(sc->object, 1);
        VM_OBJECT_WUNLOCK(sc->object);
@@ -846,6 +1008,7 @@ mdinit(struct md_s *sc)
        pp = g_new_providerf(gp, "md%d", sc->unit);
        pp->mediasize = sc->mediasize;
        pp->sectorsize = sc->sectorsize;
+       pp->flags |= G_PF_ACCEPT_UNMAPPED;
        sc->gp = gp;
        sc->pp = pp;
        g_error_provider(pp, 0);
@@ -1372,6 +1535,7 @@ g_md_init(struct g_class *mp __unused)
                        sx_xunlock(&md_sx);
                }
        }
+       md_vnode_pbuf_freecnt = nswbuf / 10;
        status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
            0600, MDCTL_NAME);
        g_topology_lock();