Please test this patch.  It survived a couple of buildworld runs
    on my test box with the chunk size forced to 4K, but it does mess
    with low-level disk I/O, so make sure anything important is backed up
    first.

                                                -Matt

Index: kern/subr_diskgpt.c
===================================================================
RCS file: /cvs/src/sys/kern/subr_diskgpt.c,v
retrieving revision 1.3
diff -u -p -r1.3 subr_diskgpt.c
--- kern/subr_diskgpt.c 19 Jun 2007 06:07:57 -0000      1.3
+++ kern/subr_diskgpt.c 19 Jul 2007 02:42:50 -0000
@@ -136,6 +136,9 @@             error = EINVAL;
                goto done;
        }
 
+       /*
+        * XXX subject to device dma size limitations
+        */
        bp2 = geteblk((int)(table_blocks * info->d_media_blksize));
        bp2->b_bio1.bio_offset = (off_t)table_lba * info->d_media_blksize;
        bp2->b_bcount = table_blocks * info->d_media_blksize;
Index: kern/subr_disklabel64.c
===================================================================
RCS file: /cvs/src/sys/kern/subr_disklabel64.c,v
retrieving revision 1.4
diff -u -p -r1.4 subr_disklabel64.c
--- kern/subr_disklabel64.c     19 Jun 2007 06:39:06 -0000      1.4
+++ kern/subr_disklabel64.c     19 Jul 2007 02:43:56 -0000
@@ -118,6 +118,9 @@     size_t dlpcrcsize;
        size_t bpsize;
        int secsize;
 
+       /*
+        * XXX I/O size is subject to device DMA limitations
+        */
        secsize = info->d_media_blksize;
        bpsize = (sizeof(*dlp) + secsize - 1) & ~(secsize - 1);
 
@@ -289,6 +292,9 @@     int secsize;
 
        lp = lpx.lab64;
 
+       /*
+        * XXX I/O size is subject to device DMA limitations
+        */
        secsize = ssp->dss_secsize;
        bpsize = (sizeof(*lp) + secsize - 1) & ~(secsize - 1);
 
Index: kern/vfs_aio.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_aio.c,v
retrieving revision 1.41
diff -u -p -r1.41 vfs_aio.c
--- kern/vfs_aio.c      29 Jun 2007 21:54:08 -0000      1.41
+++ kern/vfs_aio.c      19 Jul 2007 02:49:35 -0000
@@ -944,8 +944,12 @@    bp->b_error = 0;
 
        crit_exit();
        
-       /* Perform transfer. */
-       dev_dstrategy(vp->v_rdev, &bp->b_bio1);
+       /*
+        * Perform the transfer.  vn_strategy must be used even though we
+        * know we have a device in order to deal with requests which exceed
+        * device DMA limitations.
+        */
+       vn_strategy(vp, &bp->b_bio1);
 
        notify = 0;
        crit_enter();
Index: vfs/specfs/spec_vnops.c
===================================================================
RCS file: /cvs/src/sys/vfs/specfs/spec_vnops.c,v
retrieving revision 1.51
diff -u -p -r1.51 spec_vnops.c
--- vfs/specfs/spec_vnops.c     9 May 2007 00:53:36 -0000       1.51
+++ vfs/specfs/spec_vnops.c     19 Jul 2007 02:39:25 -0000
@@ -60,6 +60,15 @@ #include <sys/buf2.h>
 
 #include <sys/thread2.h>
 
+/*
+ * Specfs chained debugging (bitmask)
+ *
+ * 0 - disable debugging
+ * 1 - report chained I/Os
+ * 2 - force 4K chained I/Os
+ */
+#define SPEC_CHAIN_DEBUG       0
+
 static int     spec_advlock (struct vop_advlock_args *);  
 static int     spec_bmap (struct vop_bmap_args *);
 static int     spec_close (struct vop_close_args *);
@@ -75,6 +84,7 @@ 
 static int     spec_read (struct vop_read_args *);  
 static int     spec_strategy (struct vop_strategy_args *);
 static int     spec_write (struct vop_write_args *);
+static void    spec_strategy_done(struct bio *nbio);
 
 struct vop_ops spec_vnode_vops = {
        .vop_default =          vop_defaultop,
@@ -438,7 +448,9 @@     return (0);
 }
 
 /*
- * Just call the device strategy routine
+ * Convert a vnode strategy call into a device strategy call.  Vnode strategy
+ * calls are not limited by device DMA limits, so oversized requests must
+ * be broken up into chunks here.
  *
  * spec_strategy(struct vnode *a_vp, struct bio *a_bio)
  */
@@ -447,8 +459,11 @@  */
 {
        struct bio *bio = ap->a_bio;
        struct buf *bp = bio->bio_buf;
+       struct buf *nbp;
        struct vnode *vp;
        struct mount *mp;
+       int chunksize;
+       int maxiosize;
 
        if (bp->b_cmd != BUF_CMD_READ &&
            (LIST_FIRST(&bp->b_dep)) != NULL && bioops.io_start) {
@@ -474,11 +489,151 @@                  else
                                mp->mnt_stat.f_syncwrites++;
                }
        }
-       dev_dstrategy_chain(vp->v_rdev, bio);
+
+        /*
+         * Device iosize limitations only apply to read and write.  Shortcut
+         * the I/O if it fits.
+         */
+       maxiosize = vp->v_rdev->si_iosize_max;
+#if SPEC_CHAIN_DEBUG & 2
+       maxiosize = 4096;
+#endif
+        if (bp->b_bcount <= maxiosize ||
+            (bp->b_cmd != BUF_CMD_READ && bp->b_cmd != BUF_CMD_WRITE)) {
+                dev_dstrategy_chain(vp->v_rdev, bio);
+                return (0);
+        }
+
+       /*
+        * Clone the buffer and set up an I/O chain to chunk up the I/O.
+        */
+       nbp = kmalloc(sizeof(*bp), M_DEVBUF, M_INTWAIT|M_ZERO);
+       initbufbio(nbp);
+       LIST_INIT(&nbp->b_dep);
+       BUF_LOCKINIT(nbp);
+       BUF_LOCK(nbp, LK_EXCLUSIVE);
+       BUF_KERNPROC(nbp);
+       nbp->b_vp = vp;
+       nbp->b_flags = B_PAGING | (bp->b_flags & B_BNOCLIP);
+       nbp->b_data = bp->b_data;
+       nbp->b_bio1.bio_done = spec_strategy_done;
+       nbp->b_bio1.bio_offset = bio->bio_offset;
+       nbp->b_bio1.bio_caller_info1.ptr = bio;
+
+       /*
+        * Start the first transfer
+        */
+       if (vn_isdisk(vp, NULL))
+               chunksize = vp->v_rdev->si_bsize_phys;
+       else
+               chunksize = DEV_BSIZE;
+       chunksize = maxiosize / chunksize * chunksize;
+#if SPEC_CHAIN_DEBUG & 1
+       kprintf("spec_strategy chained I/O chunksize=%d\n", chunksize);
+#endif
+       nbp->b_cmd = bp->b_cmd;
+       nbp->b_bcount = chunksize;
+       nbp->b_bufsize = chunksize;     /* used to detect a short I/O */
+       nbp->b_bio1.bio_caller_info2.index = chunksize;
+
+#if SPEC_CHAIN_DEBUG & 1
+       kprintf("spec_strategy: chain %p offset %d/%d bcount %d\n",
+               bp, 0, bp->b_bcount, nbp->b_bcount);
+#endif
+
+       dev_dstrategy(vp->v_rdev, &nbp->b_bio1);
        return (0);
 }
 
 /*
+ * Chunked up transfer completion routine - chain transfers until done
+ */
+static
+void
+spec_strategy_done(struct bio *nbio)
+{
+       struct buf *nbp = nbio->bio_buf;
+       struct bio *bio = nbio->bio_caller_info1.ptr;   /* original bio */
+       struct buf *bp = bio->bio_buf;                  /* original bp */
+       int chunksize = nbio->bio_caller_info2.index;   /* chunking */
+       int boffset = nbp->b_data - bp->b_data;
+
+       if (nbp->b_flags & B_ERROR) {
+               /*
+                * An error terminates the chain; propagate the error back
+                * to the original bp.
+                */
+               bp->b_flags |= B_ERROR;
+               bp->b_error = nbp->b_error;
+               bp->b_resid = bp->b_bcount - boffset +
+                             (nbp->b_bcount - nbp->b_resid);
+#if SPEC_CHAIN_DEBUG & 1
+               kprintf("spec_strategy: chain %p error %d bcount %d/%d\n",
+                       bp, bp->b_error, bp->b_bcount,
+                       bp->b_bcount - bp->b_resid);
+#endif
+               kfree(nbp, M_DEVBUF);
+               biodone(bio);
+       } else if (nbp->b_resid) {
+               /*
+                * A short read or write terminates the chain
+                */
+               bp->b_error = nbp->b_error;
+               bp->b_resid = bp->b_bcount - boffset +
+                             (nbp->b_bcount - nbp->b_resid);
+#if SPEC_CHAIN_DEBUG & 1
+               kprintf("spec_strategy: chain %p short read(1) bcount %d/%d\n",
+                       bp, bp->b_bcount - bp->b_resid, bp->b_bcount);
+#endif
+               kfree(nbp, M_DEVBUF);
+               biodone(bio);
+       } else if (nbp->b_bcount != nbp->b_bufsize) {
+               /*
+                * A short read or write can also occur by truncating b_bcount
+                */
+#if SPEC_CHAIN_DEBUG & 1
+               kprintf("spec_strategy: chain %p short read(2) bcount %d/%d\n",
+                       bp, nbp->b_bcount + boffset, bp->b_bcount);
+#endif
+               bp->b_error = 0;
+               bp->b_bcount = nbp->b_bcount + boffset; 
+               bp->b_resid = nbp->b_resid;
+               kfree(nbp, M_DEVBUF);
+               biodone(bio);
+       } else if (nbp->b_bcount + boffset == bp->b_bcount) {
+               /*
+                * No more data terminates the chain
+                */
+#if SPEC_CHAIN_DEBUG & 1
+               kprintf("spec_strategy: chain %p finished bcount %d\n",
+                       bp, bp->b_bcount);
+#endif
+               bp->b_error = 0;
+               bp->b_resid = 0;
+               kfree(nbp, M_DEVBUF);
+               biodone(bio);
+       } else {
+               /*
+                * Continue the chain
+                */
+               boffset += nbp->b_bcount;
+               nbp->b_data = bp->b_data + boffset;
+               nbp->b_bcount = bp->b_bcount - boffset;
+               if (nbp->b_bcount > chunksize)
+                       nbp->b_bcount = chunksize;
+               nbp->b_bio1.bio_done = spec_strategy_done;
+               nbp->b_bio1.bio_offset = bio->bio_offset + boffset;
+
+#if SPEC_CHAIN_DEBUG & 1
+               kprintf("spec_strategy: chain %p offset %d/%d bcount %d\n",
+                       bp, boffset, bp->b_bcount, nbp->b_bcount);
+#endif
+
+               dev_dstrategy(nbp->b_vp->v_rdev, &nbp->b_bio1);
+       }
+}
+
+/*
  * spec_freeblks(struct vnode *a_vp, daddr_t a_addr, daddr_t a_length)
  */
 static int
Index: vm/vm_swap.c
===================================================================
RCS file: /cvs/src/sys/vm/vm_swap.c,v
retrieving revision 1.35
diff -u -p -r1.35 vm_swap.c
--- vm/vm_swap.c        15 May 2007 22:44:21 -0000      1.35
+++ vm/vm_swap.c        19 Jul 2007 02:47:43 -0000
@@ -142,8 +142,9 @@      * Issue a strategy call on the appropr
         * bp->b_vp is not modified.  Strategy code is always supposed to
         * use the passed vp.
         *
-        * XXX do a dev_dstrategy() call on sp->sw_device instead of on
-        * sp->sw_vp ?
+        * We have to use vn_strategy() here even if we know we have a
+        * device in order to properly break up requests which exceed the
+        * device's DMA limits.
         */
        vn_strategy(sp->sw_vp, nbio);
        return 0;

Reply via email to