Module Name:    src
Committed By:   dholland
Date:           Wed Aug 12 18:23:16 UTC 2015

Modified Files:
        src/libexec/lfs_cleanerd: cleaner.h coalesce.c lfs_cleanerd.c
        src/sys/ufs/lfs: lfs.h lfs_extern.h lfs_syscalls.c

Log Message:
Fix assorted 64->32 truncations related to BLOCK_INFO.

Also make note of a cleaner limitation: it seems that when it goes to
coalesce discontiguous files, it mallocs an array with one BLOCK_INFO
for every block in the file. Therefore, with 64-bit LFS, on a 32-bit
platform it will be possible to have files large enough to overflow
the cleaner's address space. Currently these will be skipped and cause
warnings via syslog.

At some point someone should rewrite the logic to coalesce files to
use chunks of some reasonable size, as discontinuity between such
chunks is immaterial and mallocing this much space is silly and
fragile. Also, the kernel only accepts up to 65536 blocks at a time
for bmapv and markv, so processing more than this at once probably
isn't useful and may not even work currently. I don't want to change
this around just now as it's not entirely trivial.


To generate a diff of this commit:
cvs rdiff -u -r1.11 -r1.12 src/libexec/lfs_cleanerd/cleaner.h
cvs rdiff -u -r1.28 -r1.29 src/libexec/lfs_cleanerd/coalesce.c
cvs rdiff -u -r1.44 -r1.45 src/libexec/lfs_cleanerd/lfs_cleanerd.c
cvs rdiff -u -r1.172 -r1.173 src/sys/ufs/lfs/lfs.h
cvs rdiff -u -r1.107 -r1.108 src/sys/ufs/lfs/lfs_extern.h
cvs rdiff -u -r1.164 -r1.165 src/sys/ufs/lfs/lfs_syscalls.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/libexec/lfs_cleanerd/cleaner.h
diff -u src/libexec/lfs_cleanerd/cleaner.h:1.11 src/libexec/lfs_cleanerd/cleaner.h:1.12
--- src/libexec/lfs_cleanerd/cleaner.h:1.11	Sun Aug  2 18:18:09 2015
+++ src/libexec/lfs_cleanerd/cleaner.h	Wed Aug 12 18:23:16 2015
@@ -69,14 +69,13 @@ int load_segment(struct clfs *, int, BLO
 int needs_cleaning(struct clfs *, CLEANERINFO *);
 int reinit_fs(struct clfs *);
 void reload_ifile(struct clfs *);
-void toss_old_blocks(struct clfs *, BLOCK_INFO **, int *, int *);
+void toss_old_blocks(struct clfs *, BLOCK_INFO **, blkcnt_t *, int *);
 
 /* cleansrv.c */
 void check_control_socket(void);
 void try_to_become_master(int, char **);
 
 /* coalesce.c */
-int log2int(int);
 int clean_all_inodes(struct clfs *);
 int fork_coalesce(struct clfs *);
 

Index: src/libexec/lfs_cleanerd/coalesce.c
diff -u src/libexec/lfs_cleanerd/coalesce.c:1.28 src/libexec/lfs_cleanerd/coalesce.c:1.29
--- src/libexec/lfs_cleanerd/coalesce.c:1.28	Tue Jul 28 05:14:23 2015
+++ src/libexec/lfs_cleanerd/coalesce.c	Wed Aug 12 18:23:16 2015
@@ -1,4 +1,4 @@
-/*      $NetBSD: coalesce.c,v 1.28 2015/07/28 05:14:23 dholland Exp $  */
+/*      $NetBSD: coalesce.c,v 1.29 2015/08/12 18:23:16 dholland Exp $  */
 
 /*-
  * Copyright (c) 2002, 2005 The NetBSD Foundation, Inc.
@@ -60,7 +60,12 @@
 
 extern int debug, do_mmap;
 
-int log2int(int n)
+/*
+ * XXX return the arg to just int when/if we don't need it for
+ * potentially huge block counts any more.
+ */
+static int
+log2int(intmax_t n)
 {
 	int log;
 
@@ -151,8 +156,8 @@ clean_inode(struct clfs *fs, ino_t ino)
 		int blkcnt;
 	} */ lim;
 	daddr_t toff;
-	int i;
-	int nb, onb, noff;
+	int noff;
+	blkcnt_t i, nb, onb;
 	int retval;
 	int bps;
 
@@ -178,16 +183,36 @@ clean_inode(struct clfs *fs, ino_t ino)
 	}
 #endif
 	if (nb > dip->di_blocks) {
-		dlog("ino %d, computed blocks %d > held blocks %d", ino, nb,
-		     dip->di_blocks);
+		dlog("ino %ju, computed blocks %jd > held blocks %ju",
+		     (uintmax_t)ino, (intmax_t)nb,
+		     (uintmax_t)dip->di_blocks);
 		free(dip);
 		return COALESCE_BADBLOCKSIZE;
 	}
 
+	/*
+	 * XXX: We should really coalesce really large files in
+	 * chunks, as there's substantial diminishing returns and
+	 * mallocing huge amounts of memory just to get those returns
+	 * is pretty silly. But that requires a big rework of this
+	 * code. (On the plus side though then we can stop worrying
+	 * about block counts > 2^31.)
+	 */
+
+	/* ugh, no DADDR_T_MAX */
+	__CTASSERT(sizeof(daddr_t) == sizeof(int64_t));
+	if (nb > INT64_MAX / sizeof(BLOCK_INFO)) {
+		syslog(LOG_WARNING, "ino %ju, %jd blocks: array too large\n",
+		       (uintmax_t)ino, (uintmax_t)nb);
+		free(dip);
+		return COALESCE_NOMEM;
+	}
+
 	bip = (BLOCK_INFO *)malloc(sizeof(BLOCK_INFO) * nb);
 	if (bip == NULL) {
-		syslog(LOG_WARNING, "ino %llu, %d blocks: %m",
-		    (unsigned long long)ino, nb);
+		syslog(LOG_WARNING, "ino %llu, %jd blocks: %s\n",
+		    (unsigned long long)ino, (intmax_t)nb,
+		    strerror(errno));
 		free(dip);
 		return COALESCE_NOMEM;
 	}
@@ -198,6 +223,18 @@ clean_inode(struct clfs *fs, ino_t ino)
 		bip[i].bi_version = dip->di_gen;
 		/* Don't set the size, but let lfs_bmap fill it in */
 	}
+	/*
+	 * The kernel also contains this check; but as lim.blkcnt is
+	 * only 32 bits wide, we need to check ourselves too in case
+	 * we'd otherwise truncate a value > 2^31, as that might
+	 * succeed and create bizarre results.
+	 */
+	if (nb > LFS_MARKV_MAXBLKCNT) {
+		syslog(LOG_WARNING, "%s: coalesce: LFCNBMAPV: Too large\n",
+		       lfs_sb_getfsmnt(fs));
+		retval = COALESCE_BADBMAPV;
+		goto out;
+	}
 	lim.blkiov = bip;
 	lim.blkcnt = nb;
 	if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNBMAPV, &lim) < 0) { 
@@ -208,13 +245,15 @@ clean_inode(struct clfs *fs, ino_t ino)
 	}
 #if 0
 	for (i = 0; i < nb; i++) {
-		printf("bi_size = %d, bi_ino = %d, "
-		    "bi_lbn = %d, bi_daddr = %d\n",
-		    bip[i].bi_size, bip[i].bi_inode, bip[i].bi_lbn,
-		    bip[i].bi_daddr);
+		printf("bi_size = %d, bi_ino = %ju, "
+		    "bi_lbn = %jd, bi_daddr = %jd\n",
+		    bip[i].bi_size, (uintmax_t)bip[i].bi_inode,
+		    (intmax_t)bip[i].bi_lbn,
+		    (intmax_t)bip[i].bi_daddr);
 	}
 #endif
-	noff = toff = 0;
+	noff = 0;
+	toff = 0;
 	for (i = 1; i < nb; i++) {
 		if (bip[i].bi_daddr != bip[i - 1].bi_daddr + lfs_sb_getfrag(fs))
 			++noff;
@@ -235,8 +274,8 @@ clean_inode(struct clfs *fs, ino_t ino)
 		goto out;
 	} else if (debug)
 		syslog(LOG_DEBUG, "ino %llu total discontinuity "
-		    "%d (%lld) for %d blocks", (unsigned long long)ino,
-		    noff, (long long)toff, nb);
+		    "%d (%jd) for %jd blocks", (unsigned long long)ino,
+		    noff, (intmax_t)toff, (intmax_t)nb);
 
 	/* Search for blocks in active segments; don't move them. */
 	for (i = 0; i < nb; i++) {
@@ -274,8 +313,8 @@ clean_inode(struct clfs *fs, ino_t ino)
 	for (i = 0; i < nb; i++) {
 		bip[i].bi_bp = malloc(bip[i].bi_size);
 		if (bip[i].bi_bp == NULL) {
-			syslog(LOG_WARNING, "allocate block buffer size=%d: %m",
-			    bip[i].bi_size);
+			syslog(LOG_WARNING, "allocate block buffer size=%d: %s\n",
+			    bip[i].bi_size, strerror(errno));
 			retval = COALESCE_NOMEM;
 			goto out;
 		}
@@ -287,13 +326,16 @@ clean_inode(struct clfs *fs, ino_t ino)
 		}
 	}
 	if (debug)
-		syslog(LOG_DEBUG, "ino %llu markv %d blocks",
-		    (unsigned long long)ino, nb);
+		syslog(LOG_DEBUG, "ino %ju markv %jd blocks",
+		    (uintmax_t)ino, (intmax_t)nb);
 
 	/*
 	 * Write in segment-sized chunks.  If at any point we'd write more
 	 * than half of the available segments, sleep until that's not
 	 * true any more.
+	 *
+	 * XXX the pointer arithmetic in this loop is illegal; replace
+	 * TBIP with an integer (blkcnt_t) offset.
 	 */
 	bps = lfs_segtod(fs, 1);
 	for (tbip = bip; tbip < bip + nb; tbip += bps) {
@@ -307,6 +349,11 @@ clean_inode(struct clfs *fs, ino_t ino)
 				    LFCNSEGWAIT, NULL);
 		} while(cip.clean < 4);
 
+		/*
+		 * Note that although lim.blkcnt is 32 bits wide, bps
+		 * (which is blocks-per-segment) is < 2^32 so the
+		 * value assigned here is always in range.
+		 */
 		lim.blkiov = tbip;
 		lim.blkcnt = (tbip + bps < bip + nb ? bps : nb % bps);
 		if (kops.ko_fcntl(fs->clfs_ifilefd, LFCNMARKV, &lim) < 0) {

Index: src/libexec/lfs_cleanerd/lfs_cleanerd.c
diff -u src/libexec/lfs_cleanerd/lfs_cleanerd.c:1.44 src/libexec/lfs_cleanerd/lfs_cleanerd.c:1.45
--- src/libexec/lfs_cleanerd/lfs_cleanerd.c:1.44	Sun Aug  2 18:18:09 2015
+++ src/libexec/lfs_cleanerd/lfs_cleanerd.c	Wed Aug 12 18:23:16 2015
@@ -1,4 +1,4 @@
-/* $NetBSD: lfs_cleanerd.c,v 1.44 2015/08/02 18:18:09 dholland Exp $	 */
+/* $NetBSD: lfs_cleanerd.c,v 1.45 2015/08/12 18:23:16 dholland Exp $	 */
 
 /*-
  * Copyright (c) 2005 The NetBSD Foundation, Inc.
@@ -791,7 +791,7 @@ bi_comparator(const void *va, const void
 		return -1;
 	if (b->bi_lbn == LFS_UNUSED_LBN)
 		return 1;
-	if ((u_int32_t)a->bi_lbn > (u_int32_t)b->bi_lbn)
+	if ((u_int64_t)a->bi_lbn > (u_int64_t)b->bi_lbn)
 		return 1;
 	else
 		return -1;
@@ -813,9 +813,10 @@ cb_comparator(const void *va, const void
 }
 
 void
-toss_old_blocks(struct clfs *fs, BLOCK_INFO **bipp, int *bic, int *sizep)
+toss_old_blocks(struct clfs *fs, BLOCK_INFO **bipp, blkcnt_t *bic, int *sizep)
 {
-	int i, r;
+	blkcnt_t i;
+	int r;
 	BLOCK_INFO *bip = *bipp;
 	struct lfs_fcntl_markv /* {
 		BLOCK_INFO *blkiov;
@@ -832,6 +833,13 @@ toss_old_blocks(struct clfs *fs, BLOCK_I
 	for (i = 0; i < *bic; i++)
 		bip[i].bi_segcreate = bip[i].bi_daddr;
 
+	/*
+	 * XXX: blkcnt_t is 64 bits, so *bic might overflow size_t
+	 * (the argument type of heapsort's number argument) on a
+	 * 32-bit platform. However, if so we won't have got this far
+	 * because we'll have failed trying to allocate the array. So
+	 * while *bic here might cause a 64->32 truncation, it's safe.
+	 */
 	/* Sort the blocks */
 	heapsort(bip, *bic, sizeof(BLOCK_INFO), bi_comparator);
 
@@ -870,6 +878,7 @@ invalidate_segment(struct clfs *fs, int 
 {
 	BLOCK_INFO *bip;
 	int i, r, bic;
+	blkcnt_t widebic;
 	off_t nb;
 	double util;
 	struct lfs_fcntl_markv /* {
@@ -884,7 +893,9 @@ invalidate_segment(struct clfs *fs, int 
 	fs->clfs_nactive = 0;
 	if (load_segment(fs, sn, &bip, &bic) <= 0)
 		return -1;
-	toss_old_blocks(fs, &bip, &bic, NULL);
+	widebic = bic;
+	toss_old_blocks(fs, &bip, &widebic, NULL);
+	bic = widebic;
 
 	/* Record statistics */
 	for (i = nb = 0; i < bic; i++)
@@ -925,7 +936,7 @@ invalidate_segment(struct clfs *fs, int 
  * if the block needs to be added, 0 if it is already represented.
  */
 static int
-check_or_add(ino_t ino, int32_t lbn, BLOCK_INFO *bip, int bic, BLOCK_INFO **ebipp, int *ebicp)
+check_or_add(ino_t ino, daddr_t lbn, BLOCK_INFO *bip, int bic, BLOCK_INFO **ebipp, int *ebicp)
 {
 	BLOCK_INFO *t, *ebip = *ebipp;
 	int ebic = *ebicp;
@@ -975,7 +986,7 @@ check_hidden_cost(struct clfs *fs, BLOCK
 	int num;
 	int i, j, ebic;
 	BLOCK_INFO *ebip;
-	int32_t lbn;
+	daddr_t lbn;
 
 	start = 0;
 	ebip = NULL;
@@ -998,6 +1009,7 @@ check_hidden_cost(struct clfs *fs, BLOCK
 		if (bip[i].bi_lbn < ULFS_NDADDR)
 			continue;
 
+		/* XXX the struct lfs cast is completely wrong/unsafe */
 		ulfs_getlbns((struct lfs *)fs, NULL, (daddr_t)bip[i].bi_lbn, in, &num);
 		for (j = 0; j < num; j++) {
 			check_or_add(bip[i].bi_inode, in[j].in_lbn,
@@ -1016,6 +1028,7 @@ int
 clean_fs(struct clfs *fs, CLEANERINFO *cip)
 {
 	int i, j, ngood, sn, bic, r, npos;
+	blkcnt_t widebic;
 	int bytes, totbytes;
 	struct ubuf *bp;
 	SEGUSE *sup;
@@ -1096,7 +1109,9 @@ clean_fs(struct clfs *fs, CLEANERINFO *c
 			     fs->clfs_segtabp[i]->nbytes);
 			if ((r = load_segment(fs, sn, &bip, &bic)) > 0) {
 				++ngood;
-				toss_old_blocks(fs, &bip, &bic, &bytes);
+				widebic = bic;
+				toss_old_blocks(fs, &bip, &widebic, &bytes);
+				bic = widebic;
 				totbytes += bytes;
 			} else if (r == 0)
 				fd_release(fs->clfs_devvp);
@@ -1124,7 +1139,9 @@ clean_fs(struct clfs *fs, CLEANERINFO *c
 			else
 				break;
 		}
-		toss_old_blocks(fs, &bip, &bic, NULL);
+		widebic = bic;
+		toss_old_blocks(fs, &bip, &widebic, NULL);
+		bic = widebic;
 	}
 
 	/* If there is nothing to do, try again later. */

Index: src/sys/ufs/lfs/lfs.h
diff -u src/sys/ufs/lfs/lfs.h:1.172 src/sys/ufs/lfs/lfs.h:1.173
--- src/sys/ufs/lfs/lfs.h:1.172	Sun Aug  2 18:18:46 2015
+++ src/sys/ufs/lfs/lfs.h	Wed Aug 12 18:23:16 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs.h,v 1.172 2015/08/02 18:18:46 dholland Exp $	*/
+/*	$NetBSD: lfs.h,v 1.173 2015/08/12 18:23:16 dholland Exp $	*/
 
 /*  from NetBSD: dinode.h,v 1.22 2013/01/22 09:39:18 dholland Exp  */
 /*  from NetBSD: dir.h,v 1.21 2009/07/22 04:49:19 dholland Exp  */
@@ -941,7 +941,7 @@ struct lfs_stats {	/* Must match sysctl 
 /* Fcntls to take the place of the lfs syscalls */
 struct lfs_fcntl_markv {
 	BLOCK_INFO *blkiov;	/* blocks to relocate */
-	int blkcnt;		/* number of blocks */
+	int blkcnt;		/* number of blocks (limited to 65536) */
 };
 
 #define LFCNSEGWAITALL	_FCNR_FSPRIV('L', 14, struct timeval)

Index: src/sys/ufs/lfs/lfs_extern.h
diff -u src/sys/ufs/lfs/lfs_extern.h:1.107 src/sys/ufs/lfs/lfs_extern.h:1.108
--- src/sys/ufs/lfs/lfs_extern.h:1.107	Sun Aug  2 18:18:10 2015
+++ src/sys/ufs/lfs/lfs_extern.h	Wed Aug 12 18:23:16 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_extern.h,v 1.107 2015/08/02 18:18:10 dholland Exp $	*/
+/*	$NetBSD: lfs_extern.h,v 1.108 2015/08/12 18:23:16 dholland Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@@ -211,7 +211,6 @@ void lfs_writer_leave(struct lfs *);
 void lfs_wakeup_cleaner(struct lfs *);
 
 /* lfs_syscalls.c */
-struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *);
 int lfs_do_segclean(struct lfs *, unsigned long);
 int lfs_segwait(fsid_t *, struct timeval *);
 int lfs_bmapv(struct proc *, fsid_t *, struct block_info *, int);

Index: src/sys/ufs/lfs/lfs_syscalls.c
diff -u src/sys/ufs/lfs/lfs_syscalls.c:1.164 src/sys/ufs/lfs/lfs_syscalls.c:1.165
--- src/sys/ufs/lfs/lfs_syscalls.c:1.164	Sun Aug  2 18:14:16 2015
+++ src/sys/ufs/lfs/lfs_syscalls.c	Wed Aug 12 18:23:16 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_syscalls.c,v 1.164 2015/08/02 18:14:16 dholland Exp $	*/
+/*	$NetBSD: lfs_syscalls.c,v 1.165 2015/08/12 18:23:16 dholland Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.164 2015/08/02 18:14:16 dholland Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.165 2015/08/12 18:23:16 dholland Exp $");
 
 #ifndef LFS
 # define LFS		/* for prototypes in syscallargs.h */
@@ -88,7 +88,8 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_syscalls
 
 static int lfs_fastvget(struct mount *, ino_t, BLOCK_INFO *, int,
     struct vnode **);
-struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, void *);
+static struct buf *lfs_fakebuf(struct lfs *, struct vnode *, daddr_t,
+    size_t, void *);
 
 /*
  * sys_lfs_markv:
@@ -376,8 +377,8 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 			if (lfs_dtosn(fs, LFS_DBTOFSB(fs, b_daddr)) ==
 			    lfs_dtosn(fs, blkp->bi_daddr))
 			{
-				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %llx vs %llx\n",
-				      (long long)blkp->bi_daddr, (long long)LFS_DBTOFSB(fs, b_daddr)));
+				DLOG((DLOG_CLEAN, "lfs_markv: wrong da same seg: %jx vs %jx\n",
+				      (intmax_t)blkp->bi_daddr, (intmax_t)LFS_DBTOFSB(fs, b_daddr)));
 			}
 			do_again++;
 			continue;
@@ -397,9 +398,9 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 			obsize = ip->i_lfs_fragsize[blkp->bi_lbn];
 		}
 		if (obsize != blkp->bi_size) {
-			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %lld wrong"
+			DLOG((DLOG_CLEAN, "lfs_markv: ino %d lbn %jd wrong"
 			      " size (%ld != %d), try again\n",
-			      blkp->bi_inode, (long long)blkp->bi_lbn,
+			      blkp->bi_inode, (intmax_t)blkp->bi_lbn,
 			      (long) obsize, blkp->bi_size));
 			do_again++;
 			continue;
@@ -751,7 +752,6 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 		} else {
 			daddr_t bi_daddr;
 
-			/* XXX ondisk32 */
 			error = VOP_BMAP(vp, blkp->bi_lbn, NULL,
 					 &bi_daddr, NULL);
 			if (error)
@@ -1007,8 +1007,8 @@ lfs_fastvget(struct mount *mp, ino_t ino
 /*
  * Make up a "fake" cleaner buffer, copy the data from userland into it.
  */
-struct buf *
-lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, void *uaddr)
+static struct buf *
+lfs_fakebuf(struct lfs *fs, struct vnode *vp, daddr_t lbn, size_t size, void *uaddr)
 {
 	struct buf *bp;
 	int error;

Reply via email to