Module Name:    src
Committed By:   jdolecek
Date:           Mon Apr 10 21:34:37 UTC 2017

Modified Files:
        src/sys/kern: vfs_wapbl.c

Log Message:
improve performance of journal writes by parallelizing the I/O - use 4 bufs
by default, and add a sysctl vfs.wapbl.journal_iobufs to control it

this also removes the need to allocate an iobuf during commit, so it
might help avoid deadlocks during memory shortages like PR kern/47030
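
For reference, a minimal usage sketch of the new knob via sysctl(8); the
default of 4 comes from wapbl_journal_iobufs, and the value 8 below is
just an example:

	# sysctl vfs.wapbl.journal_iobufs
	vfs.wapbl.journal_iobufs = 4
	# sysctl -w vfs.wapbl.journal_iobufs=8

Note that the value appears to be consulted only when the journal is set
up in wapbl_start(), so a changed setting takes effect for file systems
whose log is (re)started afterwards.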


To generate a diff of this commit:
cvs rdiff -u -r1.94 -r1.95 src/sys/kern/vfs_wapbl.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/vfs_wapbl.c
diff -u src/sys/kern/vfs_wapbl.c:1.94 src/sys/kern/vfs_wapbl.c:1.95
--- src/sys/kern/vfs_wapbl.c:1.94	Mon Apr 10 19:52:38 2017
+++ src/sys/kern/vfs_wapbl.c	Mon Apr 10 21:34:37 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $	*/
+/*	$NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $	*/
 
 /*-
  * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
@@ -36,7 +36,7 @@
 #define WAPBL_INTERNAL
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.94 2017/04/10 19:52:38 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.95 2017/04/10 21:34:37 jdolecek Exp $");
 
 #include <sys/param.h>
 #include <sys/bitops.h>
@@ -72,6 +72,7 @@ static struct sysctllog *wapbl_sysctl;
 static int wapbl_flush_disk_cache = 1;
 static int wapbl_verbose_commit = 0;
 static int wapbl_allow_fuadpo = 0; 	/* switched off by default for now */
+static int wapbl_journal_iobufs = 4;
 
 static inline size_t wapbl_space_free(size_t, off_t, off_t);
 
@@ -191,6 +192,8 @@ struct wapbl {
 	char wl_ev_group[EVCNT_STRING_MAX];	/* r	*/
 	struct evcnt wl_ev_commit;		/* l	*/
 	struct evcnt wl_ev_journalwrite;	/* l	*/
+	struct evcnt wl_ev_jbufs_bio_nowait;	/* l	*/
+	struct evcnt wl_ev_jbufs_bio_wait;	/* l	*/
 	struct evcnt wl_ev_metawrite;		/* lm	*/
 	struct evcnt wl_ev_cacheflush;		/* l	*/
 #endif
@@ -228,9 +231,9 @@ struct wapbl {
 	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
 						   accounting */
 
-	u_char *wl_buffer;	/* l:   buffer for wapbl_buffered_write() */
-	daddr_t wl_buffer_dblk;	/* l:   buffer disk block address */
-	size_t wl_buffer_used;	/* l:   buffer current use */
+	/* buffers for wapbl_buffered_write() */
+	TAILQ_HEAD(, buf) wl_iobufs;		/* l: Free or filling bufs */
+	TAILQ_HEAD(, buf) wl_iobufs_busy;	/* l: In-transit bufs */
 
 	int wl_dkcache;		/* r: 	disk cache flags */
 #define WAPBL_USE_FUA(wl)	\
@@ -360,6 +363,15 @@ wapbl_sysctl_init(void)
 	if (rv)
 		return rv;
 
+	rv = sysctl_createv(&wapbl_sysctl, 0, &rnode, &cnode,
+		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+		       CTLTYPE_INT, "journal_iobufs",
+		       SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
+		       NULL, 0, &wapbl_journal_iobufs, 0,
+		       CTL_CREATE, CTL_EOL);
+	if (rv)
+		return rv;
+
 	return rv;
 }
 
@@ -401,6 +413,10 @@ wapbl_evcnt_init(struct wapbl *wl)
 	    NULL, wl->wl_ev_group, "commit");
 	evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
 	    NULL, wl->wl_ev_group, "journal sync block write");
+	evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
+	    NULL, wl->wl_ev_group, "journal I/O bufs no wait");
+	evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_wait, EVCNT_TYPE_MISC,
+	    NULL, wl->wl_ev_group, "journal I/O bufs biowait");
 	evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
 	    NULL, wl->wl_ev_group, "metadata finished block write");
 	evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
@@ -412,6 +428,8 @@ wapbl_evcnt_free(struct wapbl *wl)
 {
 	evcnt_detach(&wl->wl_ev_commit);
 	evcnt_detach(&wl->wl_ev_journalwrite);
+	evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
+	evcnt_detach(&wl->wl_ev_jbufs_bio_wait);
 	evcnt_detach(&wl->wl_ev_metawrite);
 	evcnt_detach(&wl->wl_ev_cacheflush);
 }
@@ -605,9 +623,6 @@ wapbl_start(struct wapbl ** wlp, struct 
 	wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
 	TAILQ_INIT(&wl->wl_dealloclist);
 
-	wl->wl_buffer = wapbl_alloc(MAXPHYS);
-	wl->wl_buffer_used = 0;
-
 	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
 
 	wapbl_evcnt_init(wl);
@@ -630,6 +645,25 @@ wapbl_start(struct wapbl ** wlp, struct 
 		wl->wl_wc_scratch = wapbl_alloc(len);
 	}
 
+	TAILQ_INIT(&wl->wl_iobufs);
+	TAILQ_INIT(&wl->wl_iobufs_busy);
+	for (int i = 0; i < wapbl_journal_iobufs; i++) {
+		struct buf *bp;
+
+		if ((bp = geteblk(MAXPHYS)) == NULL)
+			goto errout;
+
+		mutex_enter(&bufcache_lock);
+		mutex_enter(devvp->v_interlock);
+		bgetvp(devvp, bp);
+		mutex_exit(devvp->v_interlock);
+		mutex_exit(&bufcache_lock);
+
+		bp->b_dev = devvp->v_rdev;
+
+		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+	}
+
 	/*
 	 * if there was an existing set of unlinked but
 	 * allocated inodes, preserve it in the new
@@ -656,7 +690,13 @@ wapbl_start(struct wapbl ** wlp, struct 
 	wapbl_discard(wl);
 	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
 	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
-	wapbl_free(wl->wl_buffer, MAXPHYS);
+	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+		struct buf *bp;
+
+		bp = TAILQ_FIRST(&wl->wl_iobufs);
+		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+		brelse(bp, BC_INVAL);
+	}
 	wapbl_inodetrk_free(wl);
 	wapbl_free(wl, sizeof(*wl));
 
@@ -832,10 +872,17 @@ wapbl_stop(struct wapbl *wl, int force)
 	KASSERT(wl->wl_inohashcnt == 0);
 	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
 	KASSERT(wl->wl_dealloccnt == 0);
+	KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));
 
 	wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
 	wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
-	wapbl_free(wl->wl_buffer, MAXPHYS);
+	while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
+		struct buf *bp;
+
+		bp = TAILQ_FIRST(&wl->wl_iobufs);
+		TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+		brelse(bp, BC_INVAL);
+	}
 	wapbl_inodetrk_free(wl);
 
 	wapbl_evcnt_free(wl);
@@ -853,14 +900,10 @@ wapbl_stop(struct wapbl *wl, int force)
  * Unbuffered disk I/O
  */
 
-static int
-wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+static void
+wapbl_doio_accounting(struct vnode *devvp, int flags)
 {
 	struct pstats *pstats = curlwp->l_proc->p_stats;
-	struct buf *bp;
-	int error;
-
-	KASSERT(devvp->v_type == VBLK);
 
 	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
 		mutex_enter(devvp->v_interlock);
@@ -871,6 +914,18 @@ wapbl_doio(void *data, size_t len, struc
 		pstats->p_ru.ru_inblock++;
 	}
 
+}
+
+static int
+wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+{
+	struct buf *bp;
+	int error;
+
+	KASSERT(devvp->v_type == VBLK);
+
+	wapbl_doio_accounting(devvp, flags);
+
 	bp = getiobuf(devvp, true);
 	bp->b_flags = flags;
 	bp->b_cflags = BC_BUSY;	/* mandatory, asserted by biowait() */
@@ -935,24 +990,77 @@ wapbl_read(void *data, size_t len, struc
  */
 
 /*
+ * wapbl_buffered_write_async(wl, bp)
+ *
+ *	Send buffer for asynchronous write.
+ */
+static void
+wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
+{
+	wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);
+
+	KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
+	TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
+
+	bp->b_flags = B_WRITE | WAPBL_JFLAGS(wl);
+	bp->b_cflags = BC_BUSY;	/* mandatory, asserted by biowait() */
+	bp->b_oflags = 0;
+	bp->b_bcount = bp->b_resid;
+	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
+
+	VOP_STRATEGY(wl->wl_devvp, bp);
+
+	wl->wl_ev_journalwrite.ev_count++;
+
+	TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
+}
+
+/*
  * wapbl_buffered_flush(wl)
  *
  *	Flush any buffered writes from wapbl_buffered_write.
  */
 static int
-wapbl_buffered_flush(struct wapbl *wl)
+wapbl_buffered_flush(struct wapbl *wl, bool full)
 {
-	int error;
+	int error = 0;
+	struct buf *bp, *bnext;
+	bool only_done = true, found = false;
 
-	if (wl->wl_buffer_used == 0)
-		return 0;
+	/* if there is outstanding buffered write, send it now */
+	if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
+		wapbl_buffered_write_async(wl, bp);
+
+	/* wait for I/O to complete */
+again:
+	TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
+		if (!full && only_done) {
+			/* skip unfinished */
+			if (!ISSET(bp->b_oflags, BO_DONE))
+				continue;
+		}
+			
+		if (ISSET(bp->b_oflags, BO_DONE))
+			wl->wl_ev_jbufs_bio_nowait.ev_count++;
+		else
+			wl->wl_ev_jbufs_bio_wait.ev_count++;
 
-	error = wapbl_doio(wl->wl_buffer, wl->wl_buffer_used,
-	    wl->wl_devvp, wl->wl_buffer_dblk,
-	    B_WRITE | WAPBL_JFLAGS(wl));
-	wl->wl_buffer_used = 0;
+		TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
+		error = biowait(bp);
 
-	wl->wl_ev_journalwrite.ev_count++;
+		/* reset for reuse */
+		bp->b_blkno = bp->b_resid = 0;
+		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
+		found = true;
+
+		if (!full)
+			break;
+	}
+
+	if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
+		only_done = false;
+		goto again;
+	}
 
 	return error;
 }
@@ -967,49 +1075,63 @@ wapbl_buffered_flush(struct wapbl *wl)
 static int
 wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn)
 {
-	int error;
 	size_t resid;
+	struct buf *bp;
+
+again:
+	bp = TAILQ_FIRST(&wl->wl_iobufs);
+
+	if (bp == NULL) {
+		/* No more buffers, wait for any previous I/O to finish. */
+		wapbl_buffered_flush(wl, false);
+
+		bp = TAILQ_FIRST(&wl->wl_iobufs);
+		KASSERT(bp != NULL);
+	}
 
 	/*
 	 * If not adjacent to buffered data flush first.  Disk block
 	 * address is always valid for non-empty buffer.
 	 */
-	if (wl->wl_buffer_used > 0 &&
-	    pbn != wl->wl_buffer_dblk + btodb(wl->wl_buffer_used)) {
-		error = wapbl_buffered_flush(wl);
-		if (error)
-			return error;
+	if ((bp->b_resid > 0 && pbn != bp->b_blkno + btodb(bp->b_resid))) {
+		wapbl_buffered_write_async(wl, bp);
+		goto again;
 	}
+
 	/*
 	 * If this write goes to an empty buffer we have to
 	 * save the disk block address first.
 	 */
-	if (wl->wl_buffer_used == 0)
-		wl->wl_buffer_dblk = pbn;
+	if (bp->b_blkno == 0)
+		bp->b_blkno = pbn;
+
 	/*
-	 * Remaining space so this buffer ends on a MAXPHYS boundary.
+	 * Remaining space so this buffer ends on a buffer size boundary.
 	 *
 	 * Cannot become less or equal zero as the buffer would have been
 	 * flushed on the last call then.
 	 */
-	resid = MAXPHYS - dbtob(wl->wl_buffer_dblk % btodb(MAXPHYS)) -
-	    wl->wl_buffer_used;
+	resid = bp->b_bufsize - dbtob(bp->b_blkno % btodb(bp->b_bufsize)) -
+	    bp->b_resid;
 	KASSERT(resid > 0);
 	KASSERT(dbtob(btodb(resid)) == resid);
+
+	if (len < resid)
+		resid = len;
+
+	memcpy((uint8_t *)bp->b_data + bp->b_resid, data, resid);
+	bp->b_resid += resid;
+
 	if (len >= resid) {
-		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, resid);
-		wl->wl_buffer_used += resid;
-		error = wapbl_buffered_flush(wl);
+		/* Just filled the buf, or data did not fit */
+		wapbl_buffered_write_async(wl, bp);
+
 		data = (uint8_t *)data + resid;
 		len -= resid;
-		wl->wl_buffer_dblk = pbn + btodb(resid);
-		if (error)
-			return error;
-	}
-	KASSERT(len < MAXPHYS);
-	if (len > 0) {
-		memcpy(wl->wl_buffer + wl->wl_buffer_used, data, len);
-		wl->wl_buffer_used += len;
+		pbn += btodb(resid);
+
+		if (len > 0)
+			goto again;
 	}
 
 	return 0;
@@ -2014,6 +2136,30 @@ wapbl_print(struct wapbl *wl,
 			}
 			(*pr)("\n");
 		}
+
+		(*pr)("iobufs free =");
+		TAILQ_FOREACH(bp, &wl->wl_iobufs, b_wapbllist) {
+			if (!TAILQ_NEXT(bp, b_wapbllist)) {
+				(*pr)(" %p", bp);
+			} else if ((++cnt % 6) == 0) {
+				(*pr)(" %p,\n\t", bp);
+			} else {
+				(*pr)(" %p,", bp);
+			}
+		}
+		(*pr)("\n");
+
+		(*pr)("iobufs busy =");
+		TAILQ_FOREACH(bp, &wl->wl_iobufs_busy, b_wapbllist) {
+			if (!TAILQ_NEXT(bp, b_wapbllist)) {
+				(*pr)(" %p", bp);
+			} else if ((++cnt % 6) == 0) {
+				(*pr)(" %p,\n\t", bp);
+			} else {
+				(*pr)(" %p,", bp);
+			}
+		}
+		(*pr)("\n");
 	}
 }
 
@@ -2315,7 +2461,7 @@ wapbl_write_commit(struct wapbl *wl, off
 	int error;
 	daddr_t pbn;
 
-	error = wapbl_buffered_flush(wl);
+	error = wapbl_buffered_flush(wl, true);
 	if (error)
 		return error;
 	/*
@@ -2352,7 +2498,7 @@ wapbl_write_commit(struct wapbl *wl, off
 	error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn);
 	if (error)
 		return error;
-	error = wapbl_buffered_flush(wl);
+	error = wapbl_buffered_flush(wl, true);
 	if (error)
 		return error;
 
