The branch stable/13 has been updated by mckusick:

URL: https://cgit.FreeBSD.org/src/commit/?id=3c0ca938ed50385bc3d4b00638e28b8881753060

commit 3c0ca938ed50385bc3d4b00638e28b8881753060
Author:     Kirk McKusick <[email protected]>
AuthorDate: 2022-11-09 18:44:03 +0000
Commit:     Kirk McKusick <[email protected]>
CommitDate: 2022-12-11 00:37:17 +0000

    Add support for managing UFS/FFS snapshots to fsck_ffs(8).
    
    (cherry picked from commit 460ed6106cf0854caff62e4eeba8ffcd00ab0690)
    (cherry picked from commit 689a9368eb60f061e10e1924f388b99e8f6c2cb2)
    (cherry picked from commit 5f7acd18583116c3036e0f6e35a6f385e7e85741)
    
    Sponsored by: The FreeBSD Foundation
---
 sbin/fsck_ffs/dir.c    |  21 ++-
 sbin/fsck_ffs/fsck.h   |  29 +++-
 sbin/fsck_ffs/fsutil.c | 234 +++++++++++++++++---------
 sbin/fsck_ffs/inode.c  | 433 ++++++++++++++++++++++++++++++++++++++++++++++++-
 sbin/fsck_ffs/main.c   |   1 +
 sbin/fsck_ffs/setup.c  | 182 ++++++++++++++++++++-
 sbin/fsck_ffs/suj.c    |  96 +++++++++--
 7 files changed, 891 insertions(+), 105 deletions(-)

diff --git a/sbin/fsck_ffs/dir.c b/sbin/fsck_ffs/dir.c
index ba286a965513..d09e6940f812 100644
--- a/sbin/fsck_ffs/dir.c
+++ b/sbin/fsck_ffs/dir.c
@@ -679,14 +679,17 @@ expanddir(struct inode *ip, char *name)
        struct bufarea *bp, *nbp;
        struct inodesc idesc;
        union dinode *dp;
-       int indiralloced;
+       long cg, indiralloced;
        char *cp;
 
        nbp = NULL;
        indiralloced = newblk = indirblk = 0;
+       memset(&idesc, 0, sizeof(struct inodesc));
+       idesc.id_type = ADDR;
        pwarn("NO SPACE LEFT IN %s", name);
        if (!preen && reply("EXPAND") == 0)
                return (0);
+       cg = ino_to_cg(&sblock, ip->i_number);
        dp = ip->i_dp;
        filesize = DIP(dp, di_size);
        lastlbn = lblkno(&sblock, filesize);
@@ -705,7 +708,8 @@ expanddir(struct inode *ip, char *name)
                bp = getdirblk(oldblk, lastlbnsize);
                if (bp->b_errs)
                        goto bad;
-               if ((newblk = allocblk(sblock.fs_frag)) == 0)
+               newblk = allocblk(cg, sblock.fs_frag, std_checkblkavail);
+               if (newblk == 0)
                        goto bad;
                nbp = getdatablk(newblk, sblock.fs_bsize, BT_DIRDATA);
                if (nbp->b_errs)
@@ -724,6 +728,7 @@ expanddir(struct inode *ip, char *name)
                        memmove(cp, &emptydir, sizeof emptydir);
                dirty(nbp);
                brelse(nbp);
+               binval(bp);
                idesc.id_blkno = oldblk;
                idesc.id_numfrags = numfrags(&sblock, lastlbnsize);
                (void)freeblock(&idesc);
@@ -731,7 +736,7 @@ expanddir(struct inode *ip, char *name)
                        printf(" (EXPANDED)\n");
                return (1);
        }
-       if ((newblk = allocblk(sblock.fs_frag)) == 0)
+       if ((newblk = allocblk(cg, sblock.fs_frag, std_checkblkavail)) == 0)
                goto bad;
        bp = getdirblk(newblk, sblock.fs_bsize);
        if (bp->b_errs)
@@ -749,8 +754,12 @@ expanddir(struct inode *ip, char *name)
                 * Allocate indirect block if needed.
                 */
                if ((indirblk = DIP(dp, di_ib[0])) == 0) {
-                       if ((indirblk = allocblk(sblock.fs_frag)) == 0)
+                       indirblk = allocblk(cg, sblock.fs_frag,
+                           std_checkblkavail);
+                       if (indirblk == 0) {
+                               binval(bp);
                                goto bad;
+                       }
                        indiralloced = 1;
                }
                nbp = getdatablk(indirblk, sblock.fs_bsize, BT_LEVEL1);
@@ -774,8 +783,10 @@ expanddir(struct inode *ip, char *name)
        return (1);
 bad:
        pfatal(" (EXPANSION FAILED)\n");
-       if (nbp != NULL)
+       if (nbp != NULL) {
+               binval(bp);
                brelse(nbp);
+       }
        if (newblk != 0) {
                idesc.id_blkno = newblk;
                idesc.id_numfrags = sblock.fs_frag;
diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h
index 1d3f9b7943ec..c70febdd4e80 100644
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@@ -200,8 +200,7 @@ struct bufarea {
 #define        BT_INODES        7      /* Buffer holds inodes */
 #define        BT_DIRDATA       8      /* Buffer holds directory data */
 #define        BT_DATA          9      /* Buffer holds user data */
-#define        BT_EMPTY        10      /* Buffer allocated but not filled */
-#define BT_NUMBUFTYPES 11
+#define BT_NUMBUFTYPES 10
 #define BT_NAMES {                     \
        "unknown",                      \
        "Superblock",                   \
@@ -212,8 +211,7 @@ struct bufarea {
        "External Attribute",           \
        "Inode Block",                  \
        "Directory Contents",           \
-       "User Data",                    \
-       "Allocated but not filled" }
+       "User Data" }
 extern char *buftype[];
 #define BT_BUFTYPE(type) \
        type < BT_NUMBUFTYPES ? buftype[type] : buftype[BT_UNKNOWN]
@@ -234,7 +232,7 @@ extern struct bufarea *pdirbp;              /* current directory contents */
                (bp)->b_flags |= B_DIRTY; \
 } while (0)
 #define        initbarea(bp, type) do { \
-       (bp)->b_bno = (ufs2_daddr_t)-1; \
+       (bp)->b_bno = (ufs2_daddr_t)-4; \
        (bp)->b_size = 0; \
        (bp)->b_errs = 0; \
        (bp)->b_flags = 0; \
@@ -347,6 +345,7 @@ extern char *blockmap;              /* ptr to primary blk allocation map */
 extern char *cdevname;         /* name of device being checked */
 extern char ckclean;           /* only do work if not cleanly unmounted */
 extern int ckhashadd;          /* check hashes to be added */
+extern char *copybuf;          /* buffer to copy snapshot blocks */
 extern int cvtlevel;           /* convert to newer file system format */
 extern long dev_bsize;         /* computed value of DEV_BSIZE */
 extern u_int real_dev_bsize;   /* actual disk sector size, not overridden */
@@ -372,6 +371,8 @@ extern int returntosingle;  /* 1 => return to single user mode on exit */
 extern int sbhashfailed;       /* when reading superblock check hash failed */
 extern long secsize;           /* actual disk sector size */
 extern char skipclean;         /* skip clean file systems if preening */
+extern int snapcnt;            /* number of active snapshots */
+extern struct inode snaplist[FSMAXSNAP + 1]; /* list of active snapshots */
 extern char snapname[BUFSIZ];  /* when doing snapshots, the name of the file */
 extern int sujrecovery;                /* 1 => doing check using the journal */
 extern int surrender;          /* Give up if reads fail */
@@ -442,9 +443,11 @@ struct fstab;
 
 void           adjust(struct inodesc *, int lcnt);
 void           alarmhandler(int sig);
-ufs2_daddr_t   allocblk(long frags);
+ufs2_daddr_t   allocblk(long cg, long frags, ufs2_daddr_t (*checkblkavail)
+                   (ufs2_daddr_t blkno, long frags));
 ino_t          allocdir(ino_t parent, ino_t request, int mode);
 ino_t          allocino(ino_t request, int type);
+void           binval(struct bufarea *);
 void           blkerror(ino_t ino, const char *type, ufs2_daddr_t blk);
 char          *blockcheck(char *name);
 int            blread(int fd, char *buf, ufs2_daddr_t blk, long size);
@@ -459,12 +462,15 @@ void              catchquit(int);
 void           cgdirty(struct bufarea *);
 struct bufarea *cglookup(int cg);
 int            changeino(ino_t dir, const char *name, ino_t newnum);
+void           check_blkcnt(struct inode *ip);
 int            check_cgmagic(int cg, struct bufarea *cgbp, int requestrebuild);
 int            chkrange(ufs2_daddr_t blk, int cnt);
 void           ckfini(int markclean);
 int            ckinode(union dinode *dp, struct inodesc *);
 void           clri(struct inodesc *, const char *type, int flag);
 int            clearentry(struct inodesc *);
+void           copyonwrite(struct fs *, struct bufarea *,
+                   ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long));
 void           direrror(ino_t ino, const char *errmesg);
 int            dirscan(struct inodesc *);
 int            dofix(struct inodesc *, const char *msg);
@@ -477,6 +483,7 @@ void                flush(int fd, struct bufarea *bp);
 int            freeblock(struct inodesc *);
 void           freeino(ino_t ino);
 void           freeinodebuf(void);
+void           fsckinit(void);
 void           fsutilinit(void);
 int            ftypeok(union dinode *dp);
 void           getblk(struct bufarea *bp, ufs2_daddr_t blk, long size);
@@ -485,6 +492,7 @@ struct inoinfo *getinoinfo(ino_t inumber);
 union dinode   *getnextinode(ino_t inumber, int rebuildcg);
 void           getpathname(char *namebuf, ino_t curdir, ino_t ino);
 void           ginode(ino_t, struct inode *);
+void           gjournal_check(const char *filesys);
 void           infohandler(int sig);
 void           irelse(struct inode *);
 ufs2_daddr_t   ino_blkatoff(union dinode *, ino_t, ufs_lbn_t, int *,
@@ -506,6 +514,7 @@ void                pass4(void);
 void           pass5(void);
 void           pfatal(const char *fmt, ...) __printflike(1, 2);
 void           propagate(void);
+void           prtbuf(struct bufarea *, const char *, ...) __printflike(2, 3);
 void           prtinode(struct inode *);
 void           pwarn(const char *fmt, ...) __printflike(1, 2);
 int            readsb(int listerr);
@@ -514,9 +523,13 @@ void               rwerror(const char *mesg, ufs2_daddr_t blk);
 void           sblock_init(void);
 void           setinodebuf(int, ino_t);
 int            setup(char *dev);
-void           gjournal_check(const char *filesys);
+int            snapblkfree(struct fs *, ufs2_daddr_t, long, ino_t,
+                   ufs2_daddr_t (*)(ufs2_daddr_t, long));
+void           snapremove(ino_t);
+void           snapflush(ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long));
+ufs2_daddr_t   std_checkblkavail(ufs2_daddr_t blkno, long frags);
+ufs2_daddr_t   suj_checkblkavail(ufs2_daddr_t, long);
 int            suj_check(const char *filesys);
 void           update_maps(struct cg *, struct cg*, int);
-void           fsckinit(void);
 
 #endif /* !_FSCK_H_ */
diff --git a/sbin/fsck_ffs/fsutil.c b/sbin/fsck_ffs/fsutil.c
index 4a7a6b7a2574..277d50f87fb7 100644
--- a/sbin/fsck_ffs/fsutil.c
+++ b/sbin/fsck_ffs/fsutil.c
@@ -71,7 +71,6 @@ static void cg_write(struct bufarea *);
 static void slowio_start(void);
 static void slowio_end(void);
 static void printIOstats(void);
-static void prtbuf(const char *, struct bufarea *);
 
 static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */
 static struct timespec startpass, finishpass;
@@ -79,8 +78,10 @@ struct timeval slowio_starttime;
 int slowio_delay_usec = 10000; /* Initial IO delay for background fsck */
 int slowio_pollcnt;
 static struct bufarea cgblk;   /* backup buffer for cylinder group blocks */
+static struct bufarea failedbuf; /* returned by failed getdatablk() */
 static TAILQ_HEAD(bufqueue, bufarea) bufqueuehd; /* head of buffer cache LRU */
 static LIST_HEAD(bufhash, bufarea) bufhashhd[HASHSIZE]; /* buffer hash list */
+static struct bufhash freebufs;        /* unused buffers */
 static int numbufs;            /* size of buffer cache */
 static int cachelookups;       /* number of cache lookups */
 static int cachereads;         /* number of cache reads */
@@ -186,11 +187,15 @@ bufinit(void)
 {
        int i;
 
+       initbarea(&failedbuf, BT_UNKNOWN);
+       failedbuf.b_errs = -1;
+       failedbuf.b_un.b_buf = NULL;
        if ((cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize)) == NULL)
                errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize);
        initbarea(&cgblk, BT_CYLGRP);
        numbufs = cachelookups = cachereads = 0;
        TAILQ_INIT(&bufqueuehd);
+       LIST_INIT(&freebufs);
        for (i = 0; i < HASHSIZE; i++)
                LIST_INIT(&bufhashhd[i]);
        for (i = 0; i < BT_NUMBUFTYPES; i++) {
@@ -299,7 +304,7 @@ flushentry(void)
 }
 
 /*
- * Manage a cache of directory blocks.
+ * Manage a cache of filesystem disk blocks.
  */
 struct bufarea *
 getdatablk(ufs2_daddr_t blkno, long size, int type)
@@ -308,19 +313,23 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
        struct bufhash *bhdp;
 
        cachelookups++;
-       /* If out of range, return empty buffer with b_err == -1 */
-       if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize)) {
-               blkno = -1;
-               type = BT_EMPTY;
-       }
+       /*
+        * If out of range, return empty buffer with b_err == -1
+        *
+        * Skip check for inodes because chkrange() considers
+        * metadata areas invalid to write data.
+        */
+       if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize))
+               return (&failedbuf);
        bhdp = &bufhashhd[HASH(blkno)];
        LIST_FOREACH(bp, bhdp, b_hash)
                if (bp->b_bno == fsbtodb(&sblock, blkno)) {
                        if (debug && bp->b_size != size) {
-                               prtbuf("getdatablk: size mismatch", bp);
+                               prtbuf(bp, "getdatablk: size mismatch");
                                pfatal("getdatablk: b_size %d != size %ld\n",
                                    bp->b_size, size);
                        }
+                       TAILQ_REMOVE(&bufqueuehd, bp, b_list);
                        goto foundit;
                }
        /*
@@ -339,7 +348,9 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
        if (size > sblock.fs_bsize)
                errx(EEXIT, "Excessive buffer size %ld > %d\n", size,
                    sblock.fs_bsize);
-       if (numbufs < MINBUFS) {
+       if ((bp = LIST_FIRST(&freebufs)) != NULL) {
+               LIST_REMOVE(bp, b_hash);
+       } else if (numbufs < MINBUFS) {
                bp = allocbuf("cannot create minimal buffer pool");
        } else if (sujrecovery) {
                /*
@@ -367,6 +378,7 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
                else
                        LIST_REMOVE(bp, b_hash);
        }
+       TAILQ_REMOVE(&bufqueuehd, bp, b_list);
        flush(fswritefd, bp);
        bp->b_type = type;
        LIST_INSERT_HEAD(bhdp, bp, b_hash);
@@ -374,13 +386,12 @@ getdatablk(ufs2_daddr_t blkno, long size, int type)
        cachereads++;
        /* fall through */
 foundit:
+       TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
        if (debug && bp->b_type != type) {
                printf("getdatablk: buffer type changed to %s",
                    BT_BUFTYPE(type));
-               prtbuf("", bp);
+               prtbuf(bp, "");
        }
-       TAILQ_REMOVE(&bufqueuehd, bp, b_list);
-       TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
        if (bp->b_errs == 0)
                bp->b_refcnt++;
        return (bp);
@@ -400,11 +411,7 @@ getblk(struct bufarea *bp, ufs2_daddr_t blk, long size)
                        readcnt[bp->b_type]++;
                        clock_gettime(CLOCK_REALTIME_PRECISE, &start);
                }
-               if (bp->b_type != BT_EMPTY)
-                       bp->b_errs =
-                           blread(fsreadfd, bp->b_un.b_buf, dblk, size);
-               else
-                       bp->b_errs = -1;
+               bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size);
                if (debug) {
                        clock_gettime(CLOCK_REALTIME_PRECISE, &finish);
                        timespecsub(&finish, &start, &finish);
@@ -421,10 +428,19 @@ brelse(struct bufarea *bp)
 {
 
        if (bp->b_refcnt <= 0)
-               prtbuf("brelse: buffer with negative reference count", bp);
+               prtbuf(bp, "brelse: buffer with negative reference count");
        bp->b_refcnt--;
 }
 
+void
+binval(struct bufarea *bp)
+{
+
+       bp->b_flags &= ~B_DIRTY;
+       LIST_REMOVE(bp, b_hash);
+       LIST_INSERT_HEAD(&freebufs, bp, b_hash);
+}
+
 void
 flush(int fd, struct bufarea *bp)
 {
@@ -450,10 +466,18 @@ flush(int fd, struct bufarea *bp)
                if (bp != &sblk)
                        pfatal("BUFFER %p DOES NOT MATCH SBLK %p\n",
                            bp, &sblk);
+               /*
+                * Superblocks are always pre-copied so we do not need
+                * to check them for copy-on-write.
+                */
                if (sbput(fd, bp->b_un.b_fs, 0) == 0)
                        fsmodified = 1;
                break;
        case BT_CYLGRP:
+               /*
+                * Cylinder groups are always pre-copied so we do not
+                * need to check them for copy-on-write.
+                */
                if (sujrecovery)
                        cg_write(bp);
                if (cgput(fswritefd, &sblock, bp->b_un.b_cg) == 0)
@@ -482,11 +506,38 @@ flush(int fd, struct bufarea *bp)
                }
                /* FALLTHROUGH */
        default:
+               copyonwrite(&sblock, bp, std_checkblkavail);
                blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size);
                break;
        }
 }
 
+/*
+ * If there are any snapshots, ensure that all the blocks that they
+ * care about have been copied, then release the snapshot inodes.
+ * These operations need to be done before we rebuild the cylinder
+ * groups so that any block allocations are properly recorded.
+ * Since all the cylinder group maps have already been copied in
+ * the snapshots, no further snapshot copies will need to be done.
+ */
+void
+snapflush(ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long))
+{
+       struct bufarea *bp;
+       int cnt;
+
+       if (snapcnt > 0) {
+               if (debug)
+                       printf("Check for snapshot copies\n");
+               TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list)
+                       if ((bp->b_flags & B_DIRTY) != 0)
+                               copyonwrite(&sblock, bp, checkblkavail);
+               for (cnt = 0; cnt < snapcnt; cnt++)
+                       irelse(&snaplist[cnt]);
+               snapcnt = 0;
+       }
+}
+
 /*
  * Journaled soft updates does not maintain cylinder group summary
  * information during cleanup, so this routine recalculates the summary
@@ -498,6 +549,7 @@ cg_write(struct bufarea *bp)
 {
        ufs1_daddr_t fragno, cgbno, maxbno;
        u_int8_t *blksfree;
+       struct csum *csp;
        struct cg *cgp;
        int blk;
        int i;
@@ -535,6 +587,11 @@ cg_write(struct bufarea *bp)
         * Update the superblock cg summary from our now correct values
         * before writing the block.
         */
+       csp = &sblock.fs_cs(&sblock, cgp->cg_cgx);
+       sblock.fs_cstotal.cs_ndir += cgp->cg_cs.cs_ndir - csp->cs_ndir;
+       sblock.fs_cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree - csp->cs_nbfree;
+       sblock.fs_cstotal.cs_nifree += cgp->cg_cs.cs_nifree - csp->cs_nifree;
+       sblock.fs_cstotal.cs_nffree += cgp->cg_cs.cs_nffree - csp->cs_nffree;
        sblock.fs_cs(&sblock, cgp->cg_cgx) = cgp->cg_cs;
 }
 
@@ -586,6 +643,7 @@ ckfini(int markclean)
                (void)close(fsreadfd);
                return;
        }
+
        /*
         * To remain idempotent with partial truncations the buffers
         * must be flushed in this order:
@@ -628,14 +686,9 @@ ckfini(int markclean)
                case BT_SUPERBLK:
                case BT_CYLGRP:
                default:
-                       prtbuf("ckfini: improper buffer type on cache list",bp);
+                       prtbuf(bp,"ckfini: improper buffer type on cache list");
                        continue;
                /* These are the ones to flush in this step */
-               case BT_EMPTY:
-                       if (bp->b_bno >= 0)
-                               pfatal("Unused BT_EMPTY buffer for block %jd\n",
-                                   (intmax_t)bp->b_bno);
-                       /* FALLTHROUGH */
                case BT_LEVEL1:
                case BT_LEVEL2:
                case BT_LEVEL3:
@@ -647,11 +700,10 @@ ckfini(int markclean)
                case BT_INODES:
                        continue;
                }
-               if (debug && bp->b_refcnt != 0) {
-                       prtbuf("ckfini: clearing in-use buffer", bp);
-                       pfatal("ckfini: clearing in-use buffer\n");
-               }
+               if (debug && bp->b_refcnt != 0)
+                       prtbuf(bp, "ckfini: clearing in-use buffer");
                TAILQ_REMOVE(&bufqueuehd, bp, b_list);
+               LIST_REMOVE(bp, b_hash);
                cnt++;
                flush(fswritefd, bp);
                free(bp->b_un.b_buf);
@@ -665,11 +717,10 @@ ckfini(int markclean)
                icachebp = NULL;
        }
        TAILQ_FOREACH_REVERSE_SAFE(bp, &bufqueuehd, bufqueue, b_list, nbp) {
-               if (debug && bp->b_refcnt != 0) {
-                       prtbuf("ckfini: clearing in-use buffer", bp);
-                       pfatal("ckfini: clearing in-use buffer\n");
-               }
+               if (debug && bp->b_refcnt != 0)
+                       prtbuf(bp, "ckfini: clearing in-use buffer");
                TAILQ_REMOVE(&bufqueuehd, bp, b_list);
+               LIST_REMOVE(bp, b_hash);
                cnt++;
                flush(fswritefd, bp);
                free(bp->b_un.b_buf);
@@ -1049,45 +1100,77 @@ check_cgmagic(int cg, struct bufarea *cgbp, int request_rebuild)
  * allocate a data block with the specified number of fragments
  */
 ufs2_daddr_t
-allocblk(long frags)
+allocblk(long startcg, long frags,
+    ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags))
 {
-       int i, j, k, cg, baseblk;
-       struct bufarea *cgbp;
-       struct cg *cgp;
+       ufs2_daddr_t blkno, newblk;
 
+       if (sujrecovery && checkblkavail == std_checkblkavail) {
+               pfatal("allocblk: std_checkblkavail used for SUJ recovery\n");
+               return (0);
+       }
        if (frags <= 0 || frags > sblock.fs_frag)
                return (0);
-       for (i = 0; i < maxfsblock - sblock.fs_frag; i += sblock.fs_frag) {
-               for (j = 0; j <= sblock.fs_frag - frags; j++) {
-                       if (testbmap(i + j))
-                               continue;
-                       for (k = 1; k < frags; k++)
-                               if (testbmap(i + j + k))
-                                       break;
-                       if (k < frags) {
-                               j += k;
-                               continue;
-                       }
-                       cg = dtog(&sblock, i + j);
-                       cgbp = cglookup(cg);
-                       cgp = cgbp->b_un.b_cg;
-                       if (!check_cgmagic(cg, cgbp, 0)) {
-                               i = (cg + 1) * sblock.fs_fpg - sblock.fs_frag;
-                               continue;
-                       }
-                       baseblk = dtogd(&sblock, i + j);
-                       for (k = 0; k < frags; k++) {
-                               setbmap(i + j + k);
-                               clrbit(cg_blksfree(cgp), baseblk + k);
-                       }
-                       n_blks += frags;
-                       if (frags == sblock.fs_frag)
-                               cgp->cg_cs.cs_nbfree--;
-                       else
-                               cgp->cg_cs.cs_nffree -= frags;
-                       cgdirty(cgbp);
-                       return (i + j);
+       for (blkno = cgdata(&sblock, startcg);
+            blkno < maxfsblock - sblock.fs_frag;
+            blkno += sblock.fs_frag) {
+               if ((newblk = (*checkblkavail)(blkno, frags)) == 0)
+                       continue;
+               if (newblk > 0)
+                       return (newblk);
+               if (newblk < 0)
+                       blkno = -newblk;
+       }
+       for (blkno = cgdata(&sblock, 0);
+            blkno < cgbase(&sblock, startcg) - sblock.fs_frag;
+            blkno += sblock.fs_frag) {
+               if ((newblk = (*checkblkavail)(blkno, frags)) == 0)
+                       continue;
+               if (newblk > 0)
+                       return (newblk);
+               if (newblk < 0)
+                       blkno = -newblk;
+       }
+       return (0);
+}
+
+ufs2_daddr_t
+std_checkblkavail(blkno, frags)
+       ufs2_daddr_t blkno;
+       long frags;
+{
+       struct bufarea *cgbp;
+       struct cg *cgp;
+       ufs2_daddr_t j, k, baseblk;
+       long cg;
+
+       for (j = 0; j <= sblock.fs_frag - frags; j++) {
+               if (testbmap(blkno + j))
+                       continue;
+               for (k = 1; k < frags; k++)
+                       if (testbmap(blkno + j + k))
+                               break;
+               if (k < frags) {
+                       j += k;
+                       continue;
                }
+               cg = dtog(&sblock, blkno + j);
+               cgbp = cglookup(cg);
+               cgp = cgbp->b_un.b_cg;
+               if (!check_cgmagic(cg, cgbp, 0))
+                       return (-((cg + 1) * sblock.fs_fpg - sblock.fs_frag));
+               baseblk = dtogd(&sblock, blkno + j);
+               for (k = 0; k < frags; k++) {
+                       setbmap(blkno + j + k);
+                       clrbit(cg_blksfree(cgp), baseblk + k);
+               }
+               n_blks += frags;
+               if (frags == sblock.fs_frag)
+                       cgp->cg_cs.cs_nbfree--;
+               else
+                       cgp->cg_cs.cs_nffree -= frags;
+               cgdirty(cgbp);
+               return (blkno + j);
        }
        return (0);
 }
@@ -1260,14 +1343,19 @@ dofix(struct inodesc *idesc, const char *msg)
 /*
  * Print details about a buffer.
  */
-static void
-prtbuf(const char *msg, struct bufarea *bp)
+void
+prtbuf(struct bufarea *bp, const char *fmt, ...)
 {
-       
-       printf("%s: bp %p, type %s, bno %jd, size %d, refcnt %d, flags %s, "
-           "index %jd\n", msg, bp, BT_BUFTYPE(bp->b_type),
-           (intmax_t) bp->b_bno, bp->b_size, bp->b_refcnt,
-           bp->b_flags & B_DIRTY ? "dirty" : "clean", (intmax_t) bp->b_index);
+       va_list ap;
+       va_start(ap, fmt);
+       if (preen)
+               (void)fprintf(stdout, "%s: ", cdevname);
+       (void)vfprintf(stdout, fmt, ap);
+       va_end(ap);
+       printf(": bp %p, type %s, bno %jd, size %d, refcnt %d, flags %s, "
+           "index %jd\n", bp, BT_BUFTYPE(bp->b_type), (intmax_t) bp->b_bno,
+           bp->b_size, bp->b_refcnt, bp->b_flags & B_DIRTY ? "dirty" : "clean",
+           (intmax_t) bp->b_index);
 }
 
 /*
diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c
index c9b4a80b50fb..12c267fa528a 100644
--- a/sbin/fsck_ffs/inode.c
+++ b/sbin/fsck_ffs/inode.c
@@ -38,6 +38,7 @@ static const char sccsid[] = "@(#)inode.c     8.8 (Berkeley) 4/28/95";
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/stat.h>
 #include <sys/stdint.h>
 #include <sys/sysctl.h>
 
@@ -58,6 +59,9 @@ struct bufarea *icachebp;     /* inode cache buffer */
 static int iblock(struct inodesc *, off_t isize, int type);
 static ufs2_daddr_t indir_blkatoff(ufs2_daddr_t, ino_t, ufs_lbn_t, ufs_lbn_t,
     struct bufarea **);
+static int snapclean(struct inodesc *idesc);
+static void chkcopyonwrite(struct fs *, ufs2_daddr_t,
+    ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long));
 
 int
 ckinode(union dinode *dp, struct inodesc *idesc)
@@ -378,8 +382,12 @@ chkrange(ufs2_daddr_t blk, int cnt)
        int c;
 
        if (cnt <= 0 || blk <= 0 || blk > maxfsblock ||
-           cnt - 1 > maxfsblock - blk)
+           cnt - 1 > maxfsblock - blk) {
+               if (debug)
+                       printf("out of range: blk %ld, offset %i, size %d\n",
+                           (long)blk, (int)fragnum(&sblock, blk), cnt);
                return (1);
+       }
        if (cnt > sblock.fs_frag ||
            fragnum(&sblock, blk) + cnt > sblock.fs_frag) {
                if (debug)
@@ -650,11 +658,21 @@ int
 freeblock(struct inodesc *idesc)
 {
        struct dups *dlp;
+       struct bufarea *cgbp;
+       struct cg *cgp;
        ufs2_daddr_t blkno;
-       long nfrags, res;
+       long size, nfrags, res;
 
        res = KEEPON;
        blkno = idesc->id_blkno;
+       if (idesc->id_type == SNAP) {
+               pfatal("clearing a snapshot dinode\n");
+               return (STOP);
+       }
+       size = lfragtosize(&sblock, idesc->id_numfrags);
+       if (snapblkfree(&sblock, blkno, size, idesc->id_number,
+           std_checkblkavail))
+               return (res);
        for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) {
                if (chkrange(blkno, 1)) {
                        res = SKIP;
@@ -674,12 +692,407 @@ freeblock(struct inodesc *idesc)
                        }
                }
        }
+       /*
+        * If all successfully returned, account for them.
+        */
+       if (nfrags == 0) {
+               cgbp = cglookup(dtog(&sblock, idesc->id_blkno));
+               cgp = cgbp->b_un.b_cg;
+               if (idesc->id_numfrags == sblock.fs_frag)
+                       cgp->cg_cs.cs_nbfree++;
+               else
+                       cgp->cg_cs.cs_nffree += idesc->id_numfrags;
+               cgdirty(cgbp);
+       }
        return (res);
 }
 
+/*
+ * Prepare a snapshot file for being removed.
+ *
+ * The inode is looked up on the active snapshot list (snaplist); when it
+ * is not listed it is fetched with ginode() instead. If the inode does
+ * not carry SF_SNAPSHOT a message is printed and nothing is changed.
+ * Otherwise the inode is dropped from snaplist and sblock.fs_snapinum
+ * (when listed), its block pointers are walked with snapclean(), and
+ * SF_SNAPSHOT is cleared before the inode is marked dirty and released.
+ */
+void
+snapremove(ino_t inum)
+{
+       struct inodesc idesc;
+       struct inode ip;
+       int i;
+
+       for (i = 0; i < snapcnt; i++)
+               if (snaplist[i].i_number == inum)
+                       break;
+       /* i == snapcnt means the inode is not on the active list. */
+       if (i == snapcnt)
+               ginode(inum, &ip);
+       else
+               ip = snaplist[i];
+       if ((DIP(ip.i_dp, di_flags) & SF_SNAPSHOT) == 0) {
+               printf("snapremove: inode %jd is not a snapshot\n",
+                   (intmax_t)inum);
+               /* Only an inode obtained from ginode() is released here. */
+               if (i == snapcnt)
+                       irelse(&ip);
+               return;
+       }
+       if (debug)
+               printf("snapremove: remove %sactive snapshot %jd\n",
+                   i == snapcnt ? "in" : "", (intmax_t)inum);
+       /*
+        * If on active snapshot list, remove it: slide the entries that
+        * follow down one slot, stopping at the first unused (zero)
+        * fs_snapinum entry, then clear the vacated last slot.
+        */
+       if (i < snapcnt) {
+               for (i++; i < FSMAXSNAP; i++) {
+                       if (sblock.fs_snapinum[i] == 0)
+                               break;
+                       snaplist[i - 1] = snaplist[i];
+                       sblock.fs_snapinum[i - 1] = sblock.fs_snapinum[i];
+               }
+               sblock.fs_snapinum[i - 1] = 0;
+               bzero(&snaplist[i - 1], sizeof(struct inode));
+               snapcnt--;
+       }
+       /*
+        * ckinode() is handed the whole descriptor, so the fields that
+        * are not assigned below must not hold stack garbage.
+        */
+       bzero(&idesc, sizeof(struct inodesc));
+       idesc.id_type = SNAP;
+       idesc.id_func = snapclean;
+       idesc.id_number = inum;
+       (void)ckinode(ip.i_dp, &idesc);
+       DIP_SET(ip.i_dp, di_flags, DIP(ip.i_dp, di_flags) & ~SF_SNAPSHOT);
+       inodirty(&ip);
+       irelse(&ip);
+}
+
+/*
+ * Block-pointer callback installed by snapremove() as id_func for
+ * ckinode(). Replaces the BLK_NOCOPY and BLK_SNAP markers found in the
+ * snapshot's block pointers with 0; all other block numbers are left
+ * untouched. Always returns KEEPON.
+ */
+static int
+snapclean(struct inodesc *idesc)
+{
+       ufs2_daddr_t blkno;
+       struct bufarea *bp;
+       union dinode *dp;
+
+       blkno = idesc->id_blkno;
+       if (blkno == 0)
+               return (KEEPON);
+
+       if (blkno == BLK_NOCOPY || blkno == BLK_SNAP) {
+               if (idesc->id_lbn < UFS_NDADDR) {
+                       /*
+                        * Direct pointer: only the dinode changes, and
+                        * snapremove() marks the inode dirty once the
+                        * walk is done, so no buffer is touched here.
+                        */
+                       dp = idesc->id_dp;
+                       DIP_SET(dp, di_db[idesc->id_lbn], 0);
+               } else {
+                       /*
+                        * Indirect pointer: the entry lives in the
+                        * indirect block buffer, which must be written
+                        * back.
+                        */
+                       bp = idesc->id_bp;
+                       IBLK_SET(bp, bp->b_index, 0);
+                       dirty(bp);
+               }
+       }
+       return (KEEPON);
+}
+
+/*
+ * Notification that a block is being freed. Return zero if the free
+ * should be allowed to proceed. Return non-zero if the snapshot file
+ * wants to claim the block. The block will be claimed if it is an
+ * uncopied part of one of the snapshots. It will be freed if it is
+ * either a BLK_NOCOPY or has already been copied in all of the snapshots.
+ * If a fragment is being freed, then all snapshots that care about
+ * it must make a copy since a snapshot file can only claim full sized
+ * blocks. Note that if more than one snapshot file maps the block,
+ * we can pick one at random to claim it. Since none of the snapshots
+ * can change, we are assured that they will all see the same unmodified
+ * image. When deleting a snapshot file (see ino_trunc above), we
+ * must push any of these claimed blocks to one of the other snapshots
+ * that maps it. These claimed blocks are easily identified as they will
+ * have a block number equal to their logical block number within the
+ * snapshot. A copied block can never have this property because they
+ * must always have been allocated from a BLK_NOCOPY location.
+ *
+ * Parameters:
+ *   fs            - filesystem superblock used for the block arithmetic.
+ *   bno           - filesystem block number being freed.
+ *   size          - number of bytes being freed; a value equal to
+ *                   fs->fs_bsize lets a snapshot claim the block outright.
+ *   inum          - inode number passed to ino_blkatoff(); when zero the
+ *                   snapshot's own inode number is passed instead.
+ *   checkblkavail - availability callback handed to allocblk() when a
+ *                   copy has to be allocated.
+ *
+ * Returns 1 when a snapshot claimed the (full sized) block, 0 otherwise.
+ */
+int
+snapblkfree(struct fs *fs, ufs2_daddr_t bno, long size, ino_t inum,
+    ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags))
+{
+       union dinode *dp;
+       struct inode ip;
+       struct bufarea *snapbp;
+       ufs_lbn_t lbn;
+       ufs2_daddr_t blkno, relblkno;
+       int i, frags, claimedblk, copydone;
+
+       /* If no snapshots, nothing to do */
+       if (snapcnt == 0)
+               return (0);
+       if (debug)
+               printf("snapblkfree: in ino %jd free blkno %jd, size %jd\n",
+                   (intmax_t)inum, (intmax_t)bno, (intmax_t)size);
+       relblkno = blknum(fs, bno);
+       lbn = fragstoblks(fs, relblkno);
+       /* Direct blocks are always pre-copied */
+       if (lbn < UFS_NDADDR)
+               return (0);
+       copydone = 0;
+       claimedblk = 0;
+       for (i = 0; i < snapcnt; i++) {
+               /*
+                * Lookup block being freed.
+                */
+               ip = snaplist[i];
+               dp = ip.i_dp;
+               blkno = ino_blkatoff(dp, inum != 0 ? inum : ip.i_number,
+                   lbn, &frags, &snapbp);
+               /*
+                * Check to see if block needs to be copied.
+                */
+               if (blkno == 0) {
+                       /*
+                        * A block that we map is being freed. If it has not
+                        * been claimed yet, we will claim or copy it (below).
+                        */
+                       claimedblk = 1;
+               } else if (blkno == BLK_SNAP) {
+                       /*
+                        * No previous snapshot claimed the block,
+                        * so it will be freed and become a BLK_NOCOPY
+                        * (don't care) for us.
+                        */
+                       if (claimedblk)
+                               pfatal("snapblkfree: inconsistent block type");
+                       IBLK_SET(snapbp, snapbp->b_index, BLK_NOCOPY);
+                       dirty(snapbp);
+                       brelse(snapbp);
+                       continue;
+               } else /* BLK_NOCOPY or default */ {
+                       /*
+                        * If the snapshot has already copied the block
+                        * (default), or does not care about the block,
+                        * it is not needed.
+                        */
+                       brelse(snapbp);
+                       continue;
+               }
+               /*
+                * If this is a full size block, we will just grab it
+                * and assign it to the snapshot inode. Otherwise we
+                * will proceed to copy it. See explanation for this
+                * routine as to why only a single snapshot needs to
+                * claim this block.
+                */
+               if (size == fs->fs_bsize) {
+                       if (debug)
+                               printf("Grabonremove snapshot %ju lbn %jd "
+                                   "from inum %ju\n", (uintmax_t)ip.i_number,
+                                   (intmax_t)lbn, (uintmax_t)inum);
+                       IBLK_SET(snapbp, snapbp->b_index, relblkno);
+                       dirty(snapbp);
+                       brelse(snapbp);
+                       DIP_SET(dp, di_blocks,
+                           DIP(dp, di_blocks) + btodb(size));
+                       inodirty(&ip);
+                       return (1);
+               }
+
+               /*
+                * First time through, read the contents of the old block.
+                * The copy is only marked as done once the read succeeds,
+                * so that a failed read is never written into a later
+                * snapshot.
+                */
+               if (copydone == 0) {
+                       if (blread(fsreadfd, copybuf, fsbtodb(fs, relblkno),
+                           fs->fs_bsize) != 0) {
+                               pfatal("Could not read snapshot %ju block "
+                                   "%jd\n", (uintmax_t)ip.i_number,
+                                   (intmax_t)relblkno);
+                               /* Drop the buffer like every other exit. */
+                               brelse(snapbp);
+                               continue;
+                       }
+                       copydone = 1;
+               }
+               /*
+                * This allocation will never require any additional
+                * allocations for the snapshot inode.
+                */
+               blkno = allocblk(dtog(fs, relblkno), fs->fs_frag,
+                   checkblkavail);
+               if (blkno == 0) {
+                       pfatal("Could not allocate block for snapshot %ju\n",
+                           (uintmax_t)ip.i_number);
+                       /* Drop the buffer like every other exit. */
+                       brelse(snapbp);
+                       continue;
+               }
+               if (debug)
+                       printf("Copyonremove: snapino %jd lbn %jd for inum %ju "
+                           "size %ld new blkno %jd\n", (intmax_t)ip.i_number,
+                           (intmax_t)lbn, (uintmax_t)inum, size,
+                           (intmax_t)blkno);
+               blwrite(fswritefd, copybuf, fsbtodb(fs, blkno), fs->fs_bsize);
+               IBLK_SET(snapbp, snapbp->b_index, blkno);
+               dirty(snapbp);
+               brelse(snapbp);
+               DIP_SET(dp, di_blocks,
+                   DIP(dp, di_blocks) + btodb(fs->fs_bsize));
+               inodirty(&ip);
+       }
+       return (0);
+}
+
+/*
+ * Notification that a block is being written. Return if the block
+ * is part of a snapshot as snapshots never track other snapshots.
+ * The block will be copied in all of the snapshots that are tracking
+ * it and have not yet copied it. Some buffers may hold more than one
+ * block. Here we need to check each block in the buffer.
+ *
+ * Parameters:
+ *   fs            - filesystem superblock used for the block arithmetic.
+ *   bp            - buffer about to be written; its b_bno and b_size
+ *                   select the filesystem blocks that are checked.
+ *   checkblkavail - availability callback passed through unchanged to
+ *                   chkcopyonwrite() for each block.
+ */
+void
+copyonwrite(struct fs *fs, struct bufarea *bp,
+    ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags))
+{
+       ufs2_daddr_t copyblkno;
+       long i, numblks;
+
+       /* If no snapshots, nothing to do. */
+       if (snapcnt == 0)
+               return;
+       numblks = blkroundup(fs, bp->b_size) / fs->fs_bsize;
+       if (debug)
+               prtbuf(bp, "copyonwrite: checking %jd block%s in buffer",
+                   (intmax_t)numblks, numblks > 1 ? "s" : "");
+       /* Check each block in turn, stepping by fs_frag fragments. */
+       copyblkno = blknum(fs, dbtofsb(fs, bp->b_bno));
+       for (i = 0; i < numblks; i++) {
+               chkcopyonwrite(fs, copyblkno, checkblkavail);
+               copyblkno += fs->fs_frag;
+       }
+}
+
+static void
+chkcopyonwrite(fs, copyblkno, checkblkavail)
+       struct fs *fs;
+       ufs2_daddr_t copyblkno;
+       ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags);
+{
+       struct inode ip;
+       union dinode *dp;
+       struct bufarea *snapbp;
+       ufs2_daddr_t blkno;
+       int i, frags, copydone;
+       ufs_lbn_t lbn;
+
+       lbn = fragstoblks(fs, copyblkno);
+       /* Direct blocks are always pre-copied */
+       if (lbn < UFS_NDADDR)
+               return;
+       copydone = 0;
+       for (i = 0; i < snapcnt; i++) {
+               /*
+                * Lookup block being written.
+                */
+               ip = snaplist[i];
*** 600 LINES SKIPPED ***

Reply via email to