All the cool kids have new TRIM supporting drives.  FFS should support
trim too, so that FFS can be cool again.

In order to do so, we need to insert a few more operations in the block
freeing code path, which this diff does.  This diff doesn't actually add
trim support, nor is the interface with the to-be-added trim support
finalized, but the basic concept of splitting ffs_blkfree into pretrim
and posttrim halves is not going to change.  So I'd like a few people to
try testing this.

You can read the diff to see what it does, but the basic idea is that we
collect lists of blocks that the filesystem would like to free.  Then we
tell the drive, hey, these blocks are free.  When the drive is done with
that, then we actually mark them free in the bitmap.

FreeBSD appears to be doing something similar, but with different layers
responsible for different things.  At the moment, it's convenient to put
all this code into FFS regardless of the "best" place for it.

In theory, this diff is a no-op, except to maybe add a little latency to
deleting/truncating files.  You shouldn't notice anything.



Index: ufs/ffs/ffs_alloc.c
===================================================================
RCS file: /home/tedu/cvs/src/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.91
diff -u -p -r1.91 ffs_alloc.c
--- ufs/ffs/ffs_alloc.c 4 Jul 2011 04:30:41 -0000       1.91
+++ ufs/ffs/ffs_alloc.c 7 Jul 2011 17:26:31 -0000
@@ -51,6 +51,7 @@
 #include <sys/syslog.h>
 #include <sys/stdint.h>
 #include <sys/time.h>
+#include <sys/pool.h>
 
 #include <uvm/uvm_extern.h>
 
@@ -70,7 +71,7 @@
 } while (0)
 
 daddr64_t      ffs_alloccg(struct inode *, int, daddr64_t, int);
-struct buf *   ffs_cgread(struct fs *, struct inode *, int);
+struct buf *   ffs_cgread(struct fs *, struct vnode *, int);
 daddr64_t      ffs_alloccgblk(struct inode *, struct buf *, daddr64_t);
 daddr64_t      ffs_clusteralloc(struct inode *, int, daddr64_t, int);
 ino_t          ffs_dirpref(struct inode *);
@@ -1184,11 +1185,11 @@ ffs_hashalloc(struct inode *ip, int cg, 
 }
 
 struct buf *
-ffs_cgread(struct fs *fs, struct inode *ip, int cg)
+ffs_cgread(struct fs *fs, struct vnode *devvp, int cg)
 {
        struct buf *bp;
 
-       if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+       if (bread(devvp, fsbtodb(fs, cgtod(fs, cg)),
            (int)fs->fs_cgsize, &bp)) {
                brelse(bp);
                return (NULL);
@@ -1227,7 +1228,7 @@ ffs_fragextend(struct inode *ip, int cg,
                return (0);
        }
 
-       if (!(bp = ffs_cgread(fs, ip, cg)))
+       if (!(bp = ffs_cgread(fs, ip->i_devvp, cg)))
                return (0);
 
        cgp = (struct cg *)bp->b_data;
@@ -1284,7 +1285,7 @@ ffs_alloccg(struct inode *ip, int cg, da
        if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
                return (0);
 
-       if (!(bp = ffs_cgread(fs, ip, cg)))
+       if (!(bp = ffs_cgread(fs, ip->i_devvp, cg)))
                return (0);
 
        cgp = (struct cg *)bp->b_data;
@@ -1438,7 +1439,7 @@ ffs_clusteralloc(struct inode *ip, int c
        if (fs->fs_maxcluster[cg] < len)
                return (0);
 
-       if (!(bp = ffs_cgread(fs, ip, cg)))
+       if (!(bp = ffs_cgread(fs, ip->i_devvp, cg)))
                return (0);
 
        cgp = (struct cg *)bp->b_data;
@@ -1551,7 +1552,7 @@ ffs_nodealloccg(struct inode *ip, int cg
        if (fs->fs_cs(fs, cg).cs_nifree == 0)
                return (0);
 
-       if (!(bp = ffs_cgread(fs, ip, cg)))
+       if (!(bp = ffs_cgread(fs, ip->i_devvp, cg)))
                return (0);
 
        cgp = (struct cg *)bp->b_data;
@@ -1693,29 +1694,244 @@ gotit:
  * free map. If a fragment is deallocated, a possible
  * block reassembly is checked.
  */
+/*
+ * To support intelligent drives that want to know about free space,
+ * we split this operation into two parts.  First, we notify the drive
+ * that the free space is available.  Only when that's done do we
+ * actually mark the space free in the filesystem.
+ *
+ * We want to coalesce the trim operation into long extents, but
+ * nothing crazy.  Small trim operations hurt performance.  The current
+ * model only supports trimming one extent per request, but most drives
+ * can support lists of extents.
+ *
+ * For simplicity, we maintain the original file system arguments and
+ * chain them off the trim operation's buf.
+ */
+struct blkfree_trimbuf {
+       struct buf buf;
+       struct workq_task wqt;
+       struct blkfree_args *args;
+};
+struct blkfree_args {
+       struct blkfree_args *next;
+       struct fs *fs;
+       ino_t ino;
+       uid_t uid;
+       dev_t idev;
+       struct vnode *devvp;
+       daddr64_t bno;
+       long size;
+};
+struct workq *blkfree_workq;
+struct pool blkfreetrim_pool;
+struct pool blkfreeargs_pool;
+
+void ffs_blkfree_starttrim(void *, void *);
+void ffs_blkfree_biodone(struct buf *);
+void ffs_blkfree_posttrim(void *, void *);
+void ffs_blkfree_dofree(struct fs *, ino_t, uid_t, dev_t,
+    struct vnode *, daddr64_t, long);
+#define TRIMBUF_LEN 512
+
+/* the "queue" for coalescing */
+daddr64_t ffs_trimrunblk;
+daddr64_t ffs_trimrunlen; /* in bytes */
+dev_t ffs_trimrundev;
+struct blkfree_args *ffs_trimrunargs;
+struct timeout ffs_trimtimeout;
+void ffs_trimtimeout_fn(void *);
+
+void ffs_blkfree_init(void)
+{
+       blkfree_workq = workq_create("blkfree", 1, IPL_SOFTCLOCK);
+       pool_init(&blkfreetrim_pool, sizeof(struct blkfree_trimbuf), 0, 0, 0,
+           "blktrimpl", &pool_allocator_nointr);
+       pool_init(&blkfreeargs_pool, sizeof(struct blkfree_args), 0, 0, 0,
+           "blkfreepl", &pool_allocator_nointr);
+       timeout_set(&ffs_trimtimeout, ffs_trimtimeout_fn, NULL);
+}
+
+void
+ffs_trimtimeout_fn(void *v)
+{
+       /* if this fails, args will hang around a while longer. that's ok. */
+       workq_add_task(blkfree_workq, 0, ffs_blkfree_starttrim, NULL, NULL);
+}
+
+/*
+ * The FFS code calls this function.  We queue the request, and perform
+ * it asynchronously.  This can sleep.
+ */
 void
 ffs_blkfree(struct inode *ip, daddr64_t bno, long size)
 {
+       struct blkfree_args *args;
+       struct mount *mp;
+       struct vnode *devvp;
        struct fs *fs;
+       daddr64_t diskblk;
+       daddr64_t blksize;
+
+       fs = ip->i_fs;
+       if (size < fs->fs_bsize) {
+               ffs_blkfree_dofree(ip->i_fs, ip->i_number, DIP(ip, uid),
+                   ip->i_dev, ip->i_devvp, bno, size);
+               return;
+       }
+
+       blksize = fs->fs_bsize << fs->fs_fsbtodb;
+       mp = ITOV(ip)->v_mount;
+       devvp = ip->i_devvp;
+       diskblk = fsbtodb(fs, bno);
+
+       args = pool_get(&blkfreeargs_pool, PR_WAITOK | PR_ZERO);
+       args->fs = ip->i_fs;
+       args->ino = ip->i_number;
+       args->uid = DIP(ip, uid);
+       args->idev = ip->i_dev;
+       args->devvp = ip->i_devvp;
+       args->bno = bno;
+       args->size = size;
+
+       if (ffs_trimrunlen >= 512 * 30000) {
+               ffs_blkfree_starttrim(NULL, NULL);
+       } else if (args->idev == ffs_trimrundev &&
+           diskblk + size / blksize == ffs_trimrunblk) {
+               ffs_trimrunblk = diskblk;
+               ffs_trimrunlen += size;
+       } else if (args->idev == ffs_trimrundev &&
+           ffs_trimrunblk + ffs_trimrunlen / blksize == diskblk) {
+               ffs_trimrunlen += size;
+       } else {
+               if (ffs_trimrunargs)
+                       ffs_blkfree_starttrim(NULL, NULL);
+               ffs_trimrunblk = diskblk;
+               ffs_trimrunlen = size;
+               ffs_trimrundev = args->idev;
+       }
+       args->next = ffs_trimrunargs;
+       ffs_trimrunargs = args;
+       timeout_add_msec(&ffs_trimtimeout, 100);
+}
+
+/*
+ * Set up the trim operation.  This function can sleep.
+ */
+void
+ffs_blkfree_starttrim(void *unused1, void *unused2)
+{
+       struct blkfree_args *args;
+       struct blkfree_trimbuf *buf;
+       daddr64_t blk;
+       daddr64_t len;
+
+       args = ffs_trimrunargs;
+       len = ffs_trimrunlen;
+       blk = ffs_trimrunblk;
+
+       ffs_trimrunargs = NULL;
+       ffs_trimrunlen = 0;
+
+       if (!len)
+               return;
+
+       if (len < 1024 * 96) {
+               /* not worth getting out of bed for this */
+               ffs_blkfree_posttrim(NULL, args);
+               return;
+       }
+
+       buf = pool_get(&blkfreetrim_pool, PR_WAITOK | PR_ZERO);
+       buf->args = args;
+       buf->buf.b_data = dma_alloc(TRIMBUF_LEN, PR_WAITOK | PR_ZERO);
+       buf->buf.b_flags = /* B_TRIM | */ B_CALL | B_PHYS;
+       buf->buf.b_iodone = ffs_blkfree_biodone;
+       buf->buf.b_blkno = blk;
+       buf->buf.b_resid = len;
+       buf->buf.b_dev = args->idev;
+       buf->buf.b_vp = args->devvp;
+#if XTRIMX
+       args->devvp->v_numoutput++;
+
+       VOP_STRATEGY(&buf->buf);
+#else
+       ffs_blkfree_biodone(&buf->buf);
+#endif
+}
+
+/*
+ * biodone callback.  This may not sleep so we queue the rest of the work.
+ */
+void
+ffs_blkfree_biodone(struct buf *v)
+{
+       struct blkfree_trimbuf *buf = (struct blkfree_trimbuf *)v;
+
+       workq_queue_task(blkfree_workq, &buf->wqt, 0, ffs_blkfree_posttrim,
+           buf, NULL);
+}
+
+/*
+ * This function may be called two ways.  First, with a buf, as a result
+ * of a trim operation completing.  Or without a buf when we've decided
+ * to skip the trim.  We iterate over all the pending blkfree ops.
+ * This function can sleep.
+ */
+void
+ffs_blkfree_posttrim(void *vbuf, void *vargs)
+{
+       struct blkfree_trimbuf *buf = vbuf;
+       struct blkfree_args *args = vargs;
+
+       if (buf) {
+               args = buf->args;
+               if (buf->buf.b_data)
+                       dma_free(buf->buf.b_data, TRIMBUF_LEN);
+               pool_put(&blkfreetrim_pool, buf);
+       }
+
+       while (args) {
+               struct blkfree_args *next = args->next;
+               struct fs *fs = args->fs;
+               ino_t ino = args->ino;
+               uid_t uid = args->uid;
+               dev_t idev = args->idev;
+               struct vnode *devvp = args->devvp;
+               daddr64_t bno = args->bno;
+               long size = args->size;
+
+               ffs_blkfree_dofree(fs, ino, uid, idev, devvp, bno, size);
+
+               pool_put(&blkfreeargs_pool, args);
+               args = next;
+       }
+}
+
+/*
+ * The original FFS blkfree code.  This function may sleep.
+ */
+void
+ffs_blkfree_dofree(struct fs *fs, ino_t ino, uid_t uid, dev_t idev,
+    struct vnode *devvp, daddr64_t bno, long size)
+{
        struct cg *cgp;
        struct buf *bp;
        daddr64_t blkno;
        int i, cg, blk, frags, bbase;
 
-       fs = ip->i_fs;
        if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
            fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
                printf("dev = 0x%x, bsize = %d, size = %ld, fs = %s\n",
-                   ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
+                   idev, fs->fs_bsize, size, fs->fs_fsmnt);
                panic("ffs_blkfree: bad size");
        }
        cg = dtog(fs, bno);
        if ((u_int)bno >= fs->fs_size) {
-               printf("bad block %lld, ino %u\n", bno, ip->i_number);
-               ffs_fserr(fs, DIP(ip, uid), "bad block");
+               printf("bad block %lld, ino %u\n", bno, ino);
+               ffs_fserr(fs, uid, "bad block");
                return;
        }
-       if (!(bp = ffs_cgread(fs, ip, cg)))
+       if (!(bp = ffs_cgread(fs, devvp, cg)))
                return;
 
        cgp = (struct cg *)bp->b_data;
@@ -1726,7 +1942,7 @@ ffs_blkfree(struct inode *ip, daddr64_t 
                blkno = fragstoblks(fs, bno);
                if (!ffs_isfreeblock(fs, cg_blksfree(cgp), blkno)) {
                        printf("dev = 0x%x, block = %lld, fs = %s\n",
-                           ip->i_dev, bno, fs->fs_fsmnt);
+                           idev, bno, fs->fs_fsmnt);
                        panic("ffs_blkfree: freeing free block");
                }
                ffs_setblock(fs, cg_blksfree(cgp), blkno);
@@ -1755,7 +1971,7 @@ ffs_blkfree(struct inode *ip, daddr64_t 
                for (i = 0; i < frags; i++) {
                        if (isset(cg_blksfree(cgp), bno + i)) {
                                printf("dev = 0x%x, block = %lld, fs = %s\n",
-                                   ip->i_dev, bno + i, fs->fs_fsmnt);
+                                   idev, bno + i, fs->fs_fsmnt);
                                panic("ffs_blkfree: freeing free frag");
                        }
                        setbit(cg_blksfree(cgp), bno + i);
@@ -1823,7 +2039,7 @@ ffs_freefile(struct inode *pip, ino_t in
                    pip->i_dev, ino, fs->fs_fsmnt);
 
        cg = ino_to_cg(fs, ino);
-       if (!(bp = ffs_cgread(fs, pip, cg)))
+       if (!(bp = ffs_cgread(fs, pip->i_devvp, cg)))
                return (0);
 
        cgp = (struct cg *)bp->b_data;
@@ -1874,7 +2090,7 @@ ffs_checkblk(struct inode *ip, daddr64_t
        if ((u_int)bno >= fs->fs_size)
                panic("ffs_checkblk: bad block %lld", bno);
 
-       if (!(bp = ffs_cgread(fs, ip, dtog(fs, bno))))
+       if (!(bp = ffs_cgread(fs, ip->i_devvp, dtog(fs, bno))))
                return (0);
 
        cgp = (struct cg *)bp->b_data;
Index: ufs/ffs/ffs_extern.h
===================================================================
RCS file: /home/tedu/cvs/src/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.37
diff -u -p -r1.37 ffs_extern.h
--- ufs/ffs/ffs_extern.h        21 Dec 2010 20:14:44 -0000      1.37
+++ ufs/ffs/ffs_extern.h        7 Jul 2011 16:37:00 -0000
@@ -114,6 +114,7 @@ int64_t ffs2_blkpref(struct inode *, dad
 #endif
 void ffs_blkfree(struct inode *, daddr64_t, long);
 void ffs_clusteracct(struct fs *, struct cg *, daddr64_t, int);
+void ffs_blkfree_init(void);
 
 /* ffs_balloc.c */
 int ffs_balloc(struct inode *, off_t, int, struct ucred *, int, struct buf **);
Index: ufs/ffs/ffs_vfsops.c
===================================================================
RCS file: /home/tedu/cvs/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.133
diff -u -p -r1.133 ffs_vfsops.c
--- ufs/ffs/ffs_vfsops.c        4 Jul 2011 20:35:35 -0000       1.133
+++ ufs/ffs/ffs_vfsops.c        7 Jul 2011 16:37:00 -0000
@@ -1488,6 +1488,7 @@ ffs_init(struct vfsconf *vfsp)
        pool_init(&ffs_dinode2_pool, sizeof(struct ufs2_dinode), 0, 0, 0,
            "dino2pl", &pool_allocator_nointr);
 #endif
+       ffs_blkfree_init();
 
        softdep_initialize();

Reply via email to