Re: [PATCH 10/15] block, dax: fix lifetime of in-kernel dax mappings
On Wed, Oct 7, 2015 at 3:56 PM, Logan Gunthorpe wrote: > Hi Dan, > > We've uncovered another issue during testing with these patches. We get a > kernel panic sometimes just while using a DAX filesystem. I've traced the > issue back to this patch. (There's a stack trace at the end of this email.) > > On 22/09/15 10:42 PM, Dan Williams wrote: >> >> +static void dax_unmap_bh(const struct buffer_head *bh, void __pmem *addr) >> +{ >> + struct block_device *bdev = bh->b_bdev; >> + struct request_queue *q = bdev->bd_queue; >> + >> + if (IS_ERR(addr)) >> + return; >> + blk_dax_put(q); >> } >> >> @@ -127,9 +159,8 @@ static ssize_t dax_io(struct inode *inode, struct >> iov_iter *iter, >> if (pos == bh_max) { >> bh->b_size = PAGE_ALIGN(end - pos); >> bh->b_state = 0; >> - retval = get_block(inode, block, bh, >> - iov_iter_rw(iter) == >> WRITE); >> - if (retval) >> + rc = get_block(inode, block, bh, rw == >> WRITE); >> + if (rc) >> break; >> if (!buffer_size_valid(bh)) >> bh->b_size = 1 << blkbits; >> @@ -178,8 +213,9 @@ static ssize_t dax_io(struct inode *inode, struct >> iov_iter *iter, >> >> if (need_wmb) >> wmb_pmem(); >> + dax_unmap_bh(bh, kmap); >> >> - return (pos == start) ? retval : pos - start; >> + return (pos == start) ? rc : pos - start; >> } > > > The problem is if get_block fails and returns an error code, it will still > call dax_unmap_bh which tries to dereference bh->b_bdev. However, seeing > get_block failed, that pointer is NULL. Maybe a null check in dax_unmap_bh > would be sufficient? Thanks for the report, I have this fixed up in v2. Will post shortly. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/15] block, dax: fix lifetime of in-kernel dax mappings
On Wed, Oct 7, 2015 at 3:56 PM, Logan Gunthorpewrote: > Hi Dan, > > We've uncovered another issue during testing with these patches. We get a > kernel panic sometimes just while using a DAX filesystem. I've traced the > issue back to this patch. (There's a stack trace at the end of this email.) > > On 22/09/15 10:42 PM, Dan Williams wrote: >> >> +static void dax_unmap_bh(const struct buffer_head *bh, void __pmem *addr) >> +{ >> + struct block_device *bdev = bh->b_bdev; >> + struct request_queue *q = bdev->bd_queue; >> + >> + if (IS_ERR(addr)) >> + return; >> + blk_dax_put(q); >> } >> >> @@ -127,9 +159,8 @@ static ssize_t dax_io(struct inode *inode, struct >> iov_iter *iter, >> if (pos == bh_max) { >> bh->b_size = PAGE_ALIGN(end - pos); >> bh->b_state = 0; >> - retval = get_block(inode, block, bh, >> - iov_iter_rw(iter) == >> WRITE); >> - if (retval) >> + rc = get_block(inode, block, bh, rw == >> WRITE); >> + if (rc) >> break; >> if (!buffer_size_valid(bh)) >> bh->b_size = 1 << blkbits; >> @@ -178,8 +213,9 @@ static ssize_t dax_io(struct inode *inode, struct >> iov_iter *iter, >> >> if (need_wmb) >> wmb_pmem(); >> + dax_unmap_bh(bh, kmap); >> >> - return (pos == start) ? retval : pos - start; >> + return (pos == start) ? rc : pos - start; >> } > > > The problem is if get_block fails and returns an error code, it will still > call dax_unmap_bh which tries to dereference bh->b_bdev. However, seeing > get_block failed, that pointer is NULL. Maybe a null check in dax_unmap_bh > would be sufficient? Thanks for the report, I have this fixed up in v2. Will post shortly. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/15] block, dax: fix lifetime of in-kernel dax mappings
Hi Dan, We've uncovered another issue during testing with these patches. We get a kernel panic sometimes just while using a DAX filesystem. I've traced the issue back to this patch. (There's a stack trace at the end of this email.) On 22/09/15 10:42 PM, Dan Williams wrote: +static void dax_unmap_bh(const struct buffer_head *bh, void __pmem *addr) +{ + struct block_device *bdev = bh->b_bdev; + struct request_queue *q = bdev->bd_queue; + + if (IS_ERR(addr)) + return; + blk_dax_put(q); } @@ -127,9 +159,8 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (pos == bh_max) { bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; - retval = get_block(inode, block, bh, - iov_iter_rw(iter) == WRITE); - if (retval) + rc = get_block(inode, block, bh, rw == WRITE); + if (rc) break; if (!buffer_size_valid(bh)) bh->b_size = 1 << blkbits; @@ -178,8 +213,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (need_wmb) wmb_pmem(); + dax_unmap_bh(bh, kmap); - return (pos == start) ? retval : pos - start; + return (pos == start) ? rc : pos - start; } The problem is if get_block fails and returns an error code, it will still call dax_unmap_bh which tries to dereference bh->b_bdev. However, seeing get_block failed, that pointer is NULL. Maybe a null check in dax_unmap_bh would be sufficient? Thanks, Logan -- [ 35.391790] BUG: unable to handle kernel NULL pointer dereference at 00a0 [ 35.393306] IP: [] dax_unmap_bh+0x41/0x70 [ 35.394253] PGD 7c7ed067 PUD 7c01f067 PMD 0 [ 35.395020] Oops: [#1] SMP [ 35.395597] Modules linked in: mtramonb(O) [ 35.396320] CPU: 0 PID: 1501 Comm: dd Tainted: G O4.3.0-rc2+ #51 [ 35.397500] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 35.399728] task: 88007bc6d600 ti: 88007925 task.ti: 88007925 [ 35.401006] RIP: 0010:[] [] dax_unmap_bh+0x41/0x70 [ 35.402402] RSP: 0018:880079253ab8 EFLAGS: 00010246 [ 35.403321] RAX: RBX: RCX: 0012 [ 35.404550] RDX: 0007 RSI: 0246 RDI: 8181f630 [ 35.405783] RBP: 880079253ac8 R08: 000a R09: fffe [ 35.407011] R10: 88007fc1ad90 R11: 6abc R12: fffb [ 35.408237] R13: 0038e000 R14: 0038e000 R15: 0038e000 [ 35.409466] FS: 7f0e81476700() GS:88007fc0() knlGS: [ 35.410830] CS: 0010 DS: ES: CR0: 80050033 [ 35.411796] CR2: 00a0 CR3: 7926f000 CR4: 06f0 [ 35.412990] Stack: [ 35.413340] ffe4 880079253cf0 880079253be0 811d2259 [ 35.414670] 0246 81203120 880079253e68 [ 35.415981] 00017c394800 0038e000 0001 fffb [ 35.417298] Call Trace: [ 35.417727] [] dax_do_io+0x199/0x700 [ 35.418615] [] ? _ext4_get_block+0x200/0x200 [ 35.419819] [] ? jbd2_journal_stop+0x60/0x390 [ 35.420886] [] ext4_ind_direct_IO+0x8d/0x410 [ 35.421908] [] ext4_direct_IO+0x2da/0x540 [ 35.422859] [] ? ext4_dirty_inode+0x5c/0x70 [ 35.423841] [] generic_file_direct_write+0xaa/0x170 [ 35.424931] [] __generic_file_write_iter+0xc2/0x1f0 [ 35.426027] [] ext4_file_write_iter+0x13b/0x420 [ 35.427066] [] ? pick_next_entity+0xb2/0x190 [ 35.428061] [] __vfs_write+0xa7/0xf0 [ 35.428940] [] vfs_write+0xa9/0x190 [ 35.429810] [] ? vfs_read+0x114/0x130 [ 35.430706] [] SyS_write+0x46/0xa0 [ 35.431561] [] entry_SYSCALL_64_fastpath+0x12/0x71 [ 35.432637] Code: fe 48 c7 c7 4c 63 7e 81 e8 11 ec f5 ff 48 8b 5b 30 48 c7 c7 52 63 7e 81 31 c0 48 89 de e8 fc eb f5 ff 31 c0 48 c7 c7 30 f6 81 81 <48> 8b 9b a0 00 00 00 e8 e7 eb f5 ff 49 81 fc 00 f0 ff ff 77 08 [ 35.437029] RIP [] dax_unmap_bh+0x41/0x70 [ 35.438005] RSP [ 35.438608] CR2: 00a0 [ 35.439194] ---[ end trace 323695b29b46dd96 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 10/15] block, dax: fix lifetime of in-kernel dax mappings
Hi Dan, We've uncovered another issue during testing with these patches. We get a kernel panic sometimes just while using a DAX filesystem. I've traced the issue back to this patch. (There's a stack trace at the end of this email.) On 22/09/15 10:42 PM, Dan Williams wrote: +static void dax_unmap_bh(const struct buffer_head *bh, void __pmem *addr) +{ + struct block_device *bdev = bh->b_bdev; + struct request_queue *q = bdev->bd_queue; + + if (IS_ERR(addr)) + return; + blk_dax_put(q); } @@ -127,9 +159,8 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (pos == bh_max) { bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; - retval = get_block(inode, block, bh, - iov_iter_rw(iter) == WRITE); - if (retval) + rc = get_block(inode, block, bh, rw == WRITE); + if (rc) break; if (!buffer_size_valid(bh)) bh->b_size = 1 << blkbits; @@ -178,8 +213,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (need_wmb) wmb_pmem(); + dax_unmap_bh(bh, kmap); - return (pos == start) ? retval : pos - start; + return (pos == start) ? rc : pos - start; } The problem is if get_block fails and returns an error code, it will still call dax_unmap_bh which tries to dereference bh->b_bdev. However, seeing get_block failed, that pointer is NULL. Maybe a null check in dax_unmap_bh would be sufficient? Thanks, Logan -- [ 35.391790] BUG: unable to handle kernel NULL pointer dereference at 00a0 [ 35.393306] IP: [] dax_unmap_bh+0x41/0x70 [ 35.394253] PGD 7c7ed067 PUD 7c01f067 PMD 0 [ 35.395020] Oops: [#1] SMP [ 35.395597] Modules linked in: mtramonb(O) [ 35.396320] CPU: 0 PID: 1501 Comm: dd Tainted: G O4.3.0-rc2+ #51 [ 35.397500] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014 [ 35.399728] task: 88007bc6d600 ti: 88007925 task.ti: 88007925 [ 35.401006] RIP: 0010:[] [] dax_unmap_bh+0x41/0x70 [ 35.402402] RSP: 0018:880079253ab8 EFLAGS: 00010246 [ 35.403321] RAX: RBX: RCX: 0012 [ 35.404550] RDX: 0007 RSI: 0246 RDI: 8181f630 [ 35.405783] RBP: 880079253ac8 R08: 000a R09: fffe [ 35.407011] R10: 88007fc1ad90 R11: 6abc R12: fffb [ 35.408237] R13: 0038e000 R14: 0038e000 R15: 0038e000 [ 35.409466] FS: 7f0e81476700() GS:88007fc0() knlGS: [ 35.410830] CS: 0010 DS: ES: CR0: 80050033 [ 35.411796] CR2: 00a0 CR3: 7926f000 CR4: 06f0 [ 35.412990] Stack: [ 35.413340] ffe4 880079253cf0 880079253be0 811d2259 [ 35.414670] 0246 81203120 880079253e68 [ 35.415981] 00017c394800 0038e000 0001 fffb [ 35.417298] Call Trace: [ 35.417727] [] dax_do_io+0x199/0x700 [ 35.418615] [] ? _ext4_get_block+0x200/0x200 [ 35.419819] [] ? jbd2_journal_stop+0x60/0x390 [ 35.420886] [] ext4_ind_direct_IO+0x8d/0x410 [ 35.421908] [] ext4_direct_IO+0x2da/0x540 [ 35.422859] [] ? ext4_dirty_inode+0x5c/0x70 [ 35.423841] [] generic_file_direct_write+0xaa/0x170 [ 35.424931] [] __generic_file_write_iter+0xc2/0x1f0 [ 35.426027] [] ext4_file_write_iter+0x13b/0x420 [ 35.427066] [] ? pick_next_entity+0xb2/0x190 [ 35.428061] [] __vfs_write+0xa7/0xf0 [ 35.428940] [] vfs_write+0xa9/0x190 [ 35.429810] [] ? vfs_read+0x114/0x130 [ 35.430706] [] SyS_write+0x46/0xa0 [ 35.431561] [] entry_SYSCALL_64_fastpath+0x12/0x71 [ 35.432637] Code: fe 48 c7 c7 4c 63 7e 81 e8 11 ec f5 ff 48 8b 5b 30 48 c7 c7 52 63 7e 81 31 c0 48 89 de e8 fc eb f5 ff 31 c0 48 c7 c7 30 f6 81 81 <48> 8b 9b a0 00 00 00 e8 e7 eb f5 ff 49 81 fc 00 f0 ff ff 77 08 [ 35.437029] RIP [] dax_unmap_bh+0x41/0x70 [ 35.438005] RSP [ 35.438608] CR2: 00a0 [ 35.439194] ---[ end trace 323695b29b46dd96 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 10/15] block, dax: fix lifetime of in-kernel dax mappings
The DAX implementation needs to protect new calls to ->direct_access() and usage of its return value against unbind of the underlying block device. Use blk_dax_{get|put}() to either prevent blk_cleanup_queue() from proceeding, or fail the dax_map_bh() if the request_queue is being torn down. Cc: Jens Axboe Cc: Christoph Hellwig Cc: Boaz Harrosh Cc: Ross Zwisler Signed-off-by: Dan Williams --- fs/dax.c | 131 -- 1 file changed, 84 insertions(+), 47 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index bcfb14bfc1e4..358eea39e982 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -63,12 +63,43 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, - unsigned blkbits) +static void __pmem *__dax_map_bh(const struct buffer_head *bh, unsigned blkbits, + unsigned long *pfn, long *len) { - unsigned long pfn; + long rc; + void __pmem *addr; + struct block_device *bdev = bh->b_bdev; + struct request_queue *q = bdev->bd_queue; sector_t sector = bh->b_blocknr << (blkbits - 9); - return bdev_direct_access(bh->b_bdev, sector, addr, , bh->b_size); + + rc = blk_dax_get(q); + if (rc < 0) + return (void __pmem *) ERR_PTR(rc); + rc = bdev_direct_access(bdev, sector, , pfn, bh->b_size); + if (len) + *len = rc; + if (rc < 0) { + blk_dax_put(q); + return (void __pmem *) ERR_PTR(rc); + } + return addr; +} + +static void __pmem *dax_map_bh(const struct buffer_head *bh, unsigned blkbits) +{ + unsigned long pfn; + + return __dax_map_bh(bh, blkbits, , NULL); +} + +static void dax_unmap_bh(const struct buffer_head *bh, void __pmem *addr) +{ + struct block_device *bdev = bh->b_bdev; + struct request_queue *q = bdev->bd_queue; + + if (IS_ERR(addr)) + return; + blk_dax_put(q); } /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ @@ -104,15 +135,16 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t start, loff_t end, get_block_t get_block, struct buffer_head *bh) { - ssize_t retval = 0; - loff_t pos = start; - loff_t max = start; - loff_t bh_max = start; - void __pmem *addr; + loff_t pos = start, max = start, bh_max = start; + int rw = iov_iter_rw(iter), rc; + long map_len = 0; + unsigned long pfn; + void __pmem *addr = NULL; + void __pmem *kmap = (void __pmem *) ERR_PTR(-EIO); bool hole = false; bool need_wmb = false; - if (iov_iter_rw(iter) != WRITE) + if (rw == READ) end = min(end, i_size_read(inode)); while (pos < end) { @@ -127,9 +159,8 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (pos == bh_max) { bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; - retval = get_block(inode, block, bh, - iov_iter_rw(iter) == WRITE); - if (retval) + rc = get_block(inode, block, bh, rw == WRITE); + if (rc) break; if (!buffer_size_valid(bh)) bh->b_size = 1 << blkbits; @@ -141,21 +172,25 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size -= done; } - hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); + hole = rw == READ && !buffer_written(bh); if (hole) { addr = NULL; size = bh->b_size - first; } else { - retval = dax_get_addr(bh, , blkbits); - if (retval < 0) + dax_unmap_bh(bh, kmap); + kmap = __dax_map_bh(bh, blkbits, , _len); + if (IS_ERR(kmap)) { + rc = PTR_ERR(kmap); break; + } + addr = kmap; if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(addr, retval, first, pos, - end); + dax_new_buf(addr, map_len, first, pos, +
[PATCH 10/15] block, dax: fix lifetime of in-kernel dax mappings
The DAX implementation needs to protect new calls to ->direct_access() and usage of its return value against unbind of the underlying block device. Use blk_dax_{get|put}() to either prevent blk_cleanup_queue() from proceeding, or fail the dax_map_bh() if the request_queue is being torn down. Cc: Jens AxboeCc: Christoph Hellwig Cc: Boaz Harrosh Cc: Ross Zwisler Signed-off-by: Dan Williams --- fs/dax.c | 131 -- 1 file changed, 84 insertions(+), 47 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index bcfb14bfc1e4..358eea39e982 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -63,12 +63,43 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size) } EXPORT_SYMBOL_GPL(dax_clear_blocks); -static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, - unsigned blkbits) +static void __pmem *__dax_map_bh(const struct buffer_head *bh, unsigned blkbits, + unsigned long *pfn, long *len) { - unsigned long pfn; + long rc; + void __pmem *addr; + struct block_device *bdev = bh->b_bdev; + struct request_queue *q = bdev->bd_queue; sector_t sector = bh->b_blocknr << (blkbits - 9); - return bdev_direct_access(bh->b_bdev, sector, addr, , bh->b_size); + + rc = blk_dax_get(q); + if (rc < 0) + return (void __pmem *) ERR_PTR(rc); + rc = bdev_direct_access(bdev, sector, , pfn, bh->b_size); + if (len) + *len = rc; + if (rc < 0) { + blk_dax_put(q); + return (void __pmem *) ERR_PTR(rc); + } + return addr; +} + +static void __pmem *dax_map_bh(const struct buffer_head *bh, unsigned blkbits) +{ + unsigned long pfn; + + return __dax_map_bh(bh, blkbits, , NULL); +} + +static void dax_unmap_bh(const struct buffer_head *bh, void __pmem *addr) +{ + struct block_device *bdev = bh->b_bdev; + struct request_queue *q = bdev->bd_queue; + + if (IS_ERR(addr)) + return; + blk_dax_put(q); } /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ @@ -104,15 +135,16 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t start, loff_t end, get_block_t get_block, struct buffer_head *bh) { - ssize_t retval = 0; - loff_t pos = start; - loff_t max = start; - loff_t bh_max = start; - void __pmem *addr; + loff_t pos = start, max = start, bh_max = start; + int rw = iov_iter_rw(iter), rc; + long map_len = 0; + unsigned long pfn; + void __pmem *addr = NULL; + void __pmem *kmap = (void __pmem *) ERR_PTR(-EIO); bool hole = false; bool need_wmb = false; - if (iov_iter_rw(iter) != WRITE) + if (rw == READ) end = min(end, i_size_read(inode)); while (pos < end) { @@ -127,9 +159,8 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (pos == bh_max) { bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; - retval = get_block(inode, block, bh, - iov_iter_rw(iter) == WRITE); - if (retval) + rc = get_block(inode, block, bh, rw == WRITE); + if (rc) break; if (!buffer_size_valid(bh)) bh->b_size = 1 << blkbits; @@ -141,21 +172,25 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size -= done; } - hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); + hole = rw == READ && !buffer_written(bh); if (hole) { addr = NULL; size = bh->b_size - first; } else { - retval = dax_get_addr(bh, , blkbits); - if (retval < 0) + dax_unmap_bh(bh, kmap); + kmap = __dax_map_bh(bh, blkbits, , _len); + if (IS_ERR(kmap)) { + rc = PTR_ERR(kmap); break; + } + addr = kmap; if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(addr, retval, first, pos, -