Hi,

I've noticed it's possible to panic system in
wapbl_register_deallocation() on small (200MB) filesystem just by
creating directory with many entries and then removing it, since
system runs out of dealloc array space - wl_dealloclim ended up being
63.

I've also noticed kern/49175, which also is related to the
deallocation path, patch there is trying to force flush on
wapbl_end(), even though there is similar flush being actually done by
wapbl_start().

ufs_truncate() takes a lot of care to not create big transactions
(actually too much), truncating by indirect block boundaries, and
doing the wapbl_end()/wapbl_start() to allow flush. However, the call
to wapbl_flush() in wapbl_start() is not effective, because it doesn't
do anything if !waitfor and wl_bufcount == 0, as is the case during
the truncate. Thus, for any bigger truncate run it would just continue
filling wl_deallocblks past 50% and eventually run out.

I suggest change as outlined in attached diff, does two things:
1. commit transaction in wapbl_flush() even in !waitfor case when
wapbl_transaction_len() > 0
2. allow many more revocation records, using up to 25% of journal size

I'm leaving for three weeks on vac, so sending this for review now to
gather feedback. I'm very new to WAPBL and I don't understand all
consequences of such changes.

When I return, I'll also look on the actual main problem kern/49175,
i.e. the excessive amount of VOP_TRUNCATE() calls triggered by
ufs_wapbl_truncate(). Joerg wrote me there was some floating patch to
refactor ffs_truncate() to be allow partial success, I'll check the
archives for that when I get back.

Jaromir
Index: vfs_wapbl.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_wapbl.c,v
retrieving revision 1.78
diff -u -p -r1.78 vfs_wapbl.c
--- vfs_wapbl.c 19 May 2016 18:32:29 -0000      1.78
+++ vfs_wapbl.c 22 Aug 2016 21:27:28 -0000
@@ -202,6 +202,10 @@ struct wapbl {
        size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
 #endif
 
+#if _KERNEL
+       size_t wl_brperblock;   /* r Block records per journal block */
+#endif
+
        daddr_t *wl_deallocblks;/* lm:  address of block */
        int *wl_dealloclens;    /* lm:  size of block */
        int wl_dealloccnt;      /* lm:  total count */
@@ -498,9 +502,12 @@ wapbl_start(struct wapbl ** wlp, struct 
        /* XXX fix actual number of buffers reserved per filesystem. */
        wl->wl_bufcount_max = (nbuf / 2) * 1024;
 
-       /* XXX tie this into resource estimation */
-       wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
-       
+       /* allow up to 25% of journal to be taken by revocations */
+       wl->wl_brperblock = ((1<<wl->wl_log_dev_bshift)
+               - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
+           sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
+
+       wl->wl_dealloclim  = wl->wl_bufbytes_max / 4 / wl->wl_brperblock;
        wl->wl_deallocblks = wapbl_alloc(sizeof(*wl->wl_deallocblks) *
            wl->wl_dealloclim);
        wl->wl_dealloclens = wapbl_alloc(sizeof(*wl->wl_dealloclens) *
@@ -1560,13 +1567,9 @@ wapbl_flush(struct wapbl *wl, int waitfo
         * issued any deferred block writes for this transaction, check
         * whether there are any blocks to write to the log.  If not,
         * skip waiting for space or writing any log entries.
-        *
-        * XXX Shouldn't this also check wl_dealloccnt and
-        * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
-        * file system didn't produce any blocks as a consequence of
-        * it, but the same does not seem to be so of wl_inohashcnt.
         */
-       if (wl->wl_bufcount == 0) {
+       flushsize = wapbl_transaction_len(wl);
+       if (flushsize == 0) {
                goto wait_out;
        }
 
@@ -1578,8 +1581,6 @@ wapbl_flush(struct wapbl *wl, int waitfo
                      wl->wl_bufbytes));
 #endif
 
-       /* Calculate amount of space needed to flush */
-       flushsize = wapbl_transaction_len(wl);
        if (wapbl_verbose_commit) {
                struct timespec ts;
                getnanotime(&ts);
@@ -1940,6 +1941,7 @@ wapbl_register_deallocation(struct wapbl
        wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
        wl->wl_dealloclens[wl->wl_dealloccnt] = len;
        wl->wl_dealloccnt++;
+
        WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
            ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
        mutex_exit(&wl->wl_mtx);
@@ -2228,7 +2230,6 @@ wapbl_write_blocks(struct wapbl *wl, off
        struct wapbl_wc_blocklist *wc =
            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
        int blocklen = 1<<wl->wl_log_dev_bshift;
-       int bph;
        struct buf *bp;
        off_t off = *offp;
        int error;
@@ -2236,9 +2237,6 @@ wapbl_write_blocks(struct wapbl *wl, off
 
        KASSERT(rw_write_held(&wl->wl_rwlock));
 
-       bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
-           sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
-
        bp = LIST_FIRST(&wl->wl_bufs);
 
        while (bp) {
@@ -2250,7 +2248,7 @@ wapbl_write_blocks(struct wapbl *wl, off
                wc->wc_type = WAPBL_WC_BLOCKS;
                wc->wc_len = blocklen;
                wc->wc_blkcount = 0;
-               while (bp && (wc->wc_blkcount < bph)) {
+               while (bp && (wc->wc_blkcount < wl->wl_brperblock)) {
                        /*
                         * Make sure all the physical block numbers are up to
                         * date.  If this is not always true on a given
@@ -2289,7 +2287,7 @@ wapbl_write_blocks(struct wapbl *wl, off
                        return error;
                bp = obp;
                cnt = 0;
-               while (bp && (cnt++ < bph)) {
+               while (bp && (cnt++ < wl->wl_brperblock)) {
                        error = wapbl_circ_write(wl, bp->b_data,
                            bp->b_bcount, &off);
                        if (error)
@@ -2326,22 +2324,18 @@ wapbl_write_revocations(struct wapbl *wl
            (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
        int i;
        int blocklen = 1<<wl->wl_log_dev_bshift;
-       int bph;
        off_t off = *offp;
        int error;
 
        if (wl->wl_dealloccnt == 0)
                return 0;
 
-       bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
-           sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
-
        i = 0;
        while (i < wl->wl_dealloccnt) {
                wc->wc_type = WAPBL_WC_REVOCATIONS;
                wc->wc_len = blocklen;
                wc->wc_blkcount = 0;
-               while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
+               while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < 
wl->wl_brperblock)) {
                        wc->wc_blocks[wc->wc_blkcount].wc_daddr =
                            wl->wl_deallocblks[i];
                        wc->wc_blocks[wc->wc_blkcount].wc_dlen =

Reply via email to