svn commit: r222958 - in head: sbin/fsck_ffs sys/sys sys/ufs/ffs sys/ufs/ufs

2011-06-10 Thread Jeff Roberson
Author: jeff
Date: Fri Jun 10 22:48:35 2011
New Revision: 222958
URL: http://svn.freebsd.org/changeset/base/222958

Log:
  Implement fully asynchronous partial truncation with softupdates journaling
  to resolve errors which can cause corruption on recovery with the old
  synchronous mechanism.
  
   - Append partial truncation freework structures to indirdeps while
 truncation is proceeding.  These prevent new block pointers from
 becoming valid until truncation completes and serialize truncations.
   - On completion of a partial truncate journal work waits for zeroed
 pointers to hit indirects.
   - softdep_journal_freeblocks() handles last frag allocation and last
 block zeroing.
   - vtruncbuf/ffs_pages_remove moved into softdep_*_freeblocks() so it
 is only implemented in one place.
   - Block allocation failure handling moved up one level so it does not
 proceed with buf locks held.  This permits us to do more extensive
 reclaims when filesystem space is exhausted.
   - softdep_sync_metadata() is broken into two parts, the first executes
 once at the start of ffs_syncvnode() and flushes truncations and
 inode dependencies.  The second is called on each locked buf.  This
 eliminates excessive looping and rollbacks.
   - Improve the mechanism in process_worklist_item() that handles
 acquiring vnode locks for handle_workitem_remove() so that it works
 more generally and does not loop excessively over the same worklist
 items on each call.
   - Don't corrupt directories by zeroing the tail in fsck.  This is only
 done for regular files.
   - Push a fsync complete record for files that need it so the checker
 knows a truncation in the journal is no longer valid.
  
  Discussed with:   mckusick, kib (ffs_pages_remove and ffs_truncate parts)
  Tested by:        pho

Modified:
  head/sbin/fsck_ffs/suj.c
  head/sys/sys/vnode.h
  head/sys/ufs/ffs/ffs_alloc.c
  head/sys/ufs/ffs/ffs_balloc.c
  head/sys/ufs/ffs/ffs_extern.h
  head/sys/ufs/ffs/ffs_inode.c
  head/sys/ufs/ffs/ffs_softdep.c
  head/sys/ufs/ffs/ffs_vfsops.c
  head/sys/ufs/ffs/ffs_vnops.c
  head/sys/ufs/ffs/fs.h
  head/sys/ufs/ffs/softdep.h
  head/sys/ufs/ufs/inode.h
  head/sys/ufs/ufs/ufsmount.h

Modified: head/sbin/fsck_ffs/suj.c
==
--- head/sbin/fsck_ffs/suj.c	Fri Jun 10 22:42:00 2011	(r222957)
+++ head/sbin/fsck_ffs/suj.c	Fri Jun 10 22:48:35 2011	(r222958)
@@ -1604,7 +1604,7 @@ ino_trunc(ino_t ino, off_t size)
 * uninitialized space later.
 */
off = blkoff(fs, size);
-   if (off) {
+   if (off && DIP(ip, di_mode) != IFDIR) {
uint8_t *buf;
long clrsize;
 
@@ -1775,13 +1775,18 @@ cg_trunc(struct suj_cg *sc)
struct suj_ino *sino;
int i;
 
-   for (i = 0; i < SUJ_HASHSIZE; i++)
-   LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
+   for (i = 0; i < SUJ_HASHSIZE; i++) {
+   LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) {
if (sino->si_trunc) {
ino_trunc(sino->si_ino,
sino->si_trunc->jt_size);
+   sino->si_blkadj = 0;
sino->si_trunc = NULL;
}
+   if (sino->si_blkadj)
+   ino_adjblks(sino);
+   }
+   }
 }
 
 /*
@@ -1791,7 +1796,6 @@ cg_trunc(struct suj_cg *sc)
 static void
 cg_check_blk(struct suj_cg *sc)
 {
-   struct suj_ino *sino;
struct suj_blk *sblk;
int i;
 
@@ -1799,15 +1803,6 @@ cg_check_blk(struct suj_cg *sc)
for (i = 0; i < SUJ_HASHSIZE; i++)
LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next)
blk_check(sblk);
-   /*
-* Now that we've freed blocks which are not referenced we
-* make a second pass over all inodes to adjust their block
-* counts.
-*/
-   for (i = 0; i < SUJ_HASHSIZE; i++)
-   LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
-   if (sino->si_blkadj)
-   ino_adjblks(sino);
 }
 
 /*
@@ -1961,14 +1956,7 @@ ino_append(union jrec *rec)
"parent %d, diroff %jd\n",
refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
refrec->jr_parent, refrec->jr_diroff);
-   /*
-* Lookup the ino and clear truncate if one is found.  Partial
-* truncates are always done synchronously so if we discover
-* an operation that requires a lock the truncation has completed
-* and can be discarded.
-*/
sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1);
-   sino->si_trunc = NULL;
sino->si_hasrecs = 1;
srec = errmalloc(sizeof(*srec));
srec->sr_rec = rec;
@@ -2174,9 +2162,7 @@ blk_build(struct jblkrec 

Re: svn commit: r222958 - in head: sbin/fsck_ffs sys/sys sys/ufs/ffs sys/ufs/ufs

2011-06-10 Thread Jeff Roberson

On Fri, 10 Jun 2011, Jeff Roberson wrote:


Author: jeff
Date: Fri Jun 10 22:48:35 2011
New Revision: 222958
URL: http://svn.freebsd.org/changeset/base/222958

Log:
 Implement fully asynchronous partial truncation with softupdates journaling
 to resolve errors which can cause corruption on recovery with the old
 synchronous mechanism.



This diff is enormous and took months of work.  I'm sorry to get it in so 
close to 9.0, I had no idea it would take so long.  pho has tested 
multiple versions of the patch with and without journaling for days of 
test time and it has probably racked up a week of machine time for me but 
there may be problems given that it is so huge.


There is still a snapshot problem with SUJ that mckusick and I are working 
on.  Expect to see some checkins for that soon.


Thanks,
Jeff



  - Append partial truncation freework structures to indirdeps while
truncation is proceeding.  These prevent new block pointers from
becoming valid until truncation completes and serialize truncations.
  - On completion of a partial truncate journal work waits for zeroed
pointers to hit indirects.
  - softdep_journal_freeblocks() handles last frag allocation and last
block zeroing.
  - vtruncbuf/ffs_pages_remove moved into softdep_*_freeblocks() so it
is only implemented in one place.
  - Block allocation failure handling moved up one level so it does not
proceed with buf locks held.  This permits us to do more extensive
reclaims when filesystem space is exhausted.
  - softdep_sync_metadata() is broken into two parts, the first executes
once at the start of ffs_syncvnode() and flushes truncations and
inode dependencies.  The second is called on each locked buf.  This
eliminates excessive looping and rollbacks.
  - Improve the mechanism in process_worklist_item() that handles
acquiring vnode locks for handle_workitem_remove() so that it works
more generally and does not loop excessively over the same worklist
items on each call.
  - Don't corrupt directories by zeroing the tail in fsck.  This is only
done for regular files.
  - Push a fsync complete record for files that need it so the checker
knows a truncation in the journal is no longer valid.

 Discussed with: mckusick, kib (ffs_pages_remove and ffs_truncate parts)
 Tested by:      pho

Modified:
 head/sbin/fsck_ffs/suj.c
 head/sys/sys/vnode.h
 head/sys/ufs/ffs/ffs_alloc.c
 head/sys/ufs/ffs/ffs_balloc.c
 head/sys/ufs/ffs/ffs_extern.h
 head/sys/ufs/ffs/ffs_inode.c
 head/sys/ufs/ffs/ffs_softdep.c
 head/sys/ufs/ffs/ffs_vfsops.c
 head/sys/ufs/ffs/ffs_vnops.c
 head/sys/ufs/ffs/fs.h
 head/sys/ufs/ffs/softdep.h
 head/sys/ufs/ufs/inode.h
 head/sys/ufs/ufs/ufsmount.h

Modified: head/sbin/fsck_ffs/suj.c
==
--- head/sbin/fsck_ffs/suj.c	Fri Jun 10 22:42:00 2011	(r222957)
+++ head/sbin/fsck_ffs/suj.c	Fri Jun 10 22:48:35 2011	(r222958)
@@ -1604,7 +1604,7 @@ ino_trunc(ino_t ino, off_t size)
 * uninitialized space later.
 */
off = blkoff(fs, size);
-   if (off) {
+   if (off && DIP(ip, di_mode) != IFDIR) {
uint8_t *buf;
long clrsize;

@@ -1775,13 +1775,18 @@ cg_trunc(struct suj_cg *sc)
struct suj_ino *sino;
int i;

-   for (i = 0; i < SUJ_HASHSIZE; i++)
-   LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
+   for (i = 0; i < SUJ_HASHSIZE; i++) {
+   LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) {
if (sino->si_trunc) {
ino_trunc(sino->si_ino,
sino->si_trunc->jt_size);
+   sino->si_blkadj = 0;
sino->si_trunc = NULL;
}
+   if (sino->si_blkadj)
+   ino_adjblks(sino);
+   }
+   }
}

/*
@@ -1791,7 +1796,6 @@ cg_trunc(struct suj_cg *sc)
static void
cg_check_blk(struct suj_cg *sc)
{
-   struct suj_ino *sino;
struct suj_blk *sblk;
int i;

@@ -1799,15 +1803,6 @@ cg_check_blk(struct suj_cg *sc)
for (i = 0; i < SUJ_HASHSIZE; i++)
LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next)
blk_check(sblk);
-   /*
-* Now that we've freed blocks which are not referenced we
-* make a second pass over all inodes to adjust their block
-* counts.
-*/
-   for (i = 0; i < SUJ_HASHSIZE; i++)
-   LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
-   if (sino->si_blkadj)
-   ino_adjblks(sino);
}

/*
@@ -1961,14 +1956,7 @@ ino_append(union jrec *rec)
"parent %d, diroff %jd\n",
refrec->jr_op, refrec->jr_ino, refrec->jr_nlink,
refrec->jr_parent, refrec->jr_diroff);
-   /*
-* Lookup the ino