@svarog reported a crash on kaio over ext4. The investigation shows that, on the fastmap path, we take i_es_lock in ext4_es_lookup_extent() while plo->lock is already held, i.e. in reverse order:
#4 [ffff9c3e6ca83928] __read_lock_failed at ffffffffac1c2f9e #5 [ffff9c3e6ca83930] _raw_read_lock at ffffffffac5bf044 #6 [ffff9c3e6ca83940] ext4_es_lookup_extent at ffffffffc046c96a [ext4] #7 [ffff9c3e6ca83970] ext4_map_blocks at ffffffffc0425a6d [ext4] #8 [ffff9c3e6ca839f8] __ext4_overwrite_io at ffffffffc041ee37 [ext4] #9 [ffff9c3e6ca83a20] ext4_fastmap at ffffffffc041eece [ext4] #10 [ffff9c3e6ca83a60] kaio_fastmap at ffffffffc01497a1 [pio_kaio] #11 [ffff9c3e6ca83ad0] ploop_make_request at ffffffffc034cb7c [ploop] Thus, normal order fails to work: #13 [ffff9c54bfa03b08] native_queued_spin_lock_slowpath at ffffffffabf29b40 #14 [ffff9c54bfa03b10] queued_spin_lock_slowpath at ffffffffac5b0734 #15 [ffff9c54bfa03b20] _raw_spin_lock_irqsave at ffffffffac5bf387 #16 [ffff9c54bfa03b38] ploop_complete_io_state at ffffffffc0347543 [ploop] #17 [ffff9c54bfa03b68] kaio_complete_io_request at ffffffffc01496c8 [pio_kaio] #18 [ffff9c54bfa03b98] kaio_rw_kreq_complete at ffffffffc0149faa [pio_kaio] #19 [ffff9c54bfa03be0] aio_complete at ffffffffac0c87be #20 [ffff9c54bfa03c48] dio_complete at ffffffffac0b4a50 #21 [ffff9c54bfa03c88] dio_bio_end_aio at ffffffffac0b4c87 #22 [ffff9c54bfa03cb8] bio_endio at ffffffffac0b1acc #23 [ffff9c54bfa03ce8] dec_pending at ffffffffc003d148 [dm_mod] #24 [ffff9c54bfa03d40] clone_endio at ffffffffc003e361 [dm_mod] #25 [ffff9c54bfa03d78] bio_endio at ffffffffac0b1acc #26 [ffff9c54bfa03da8] blk_update_request at ffffffffac17d0e0 #27 [ffff9c54bfa03de8] blk_mq_end_request at ffffffffac18749a #28 [ffff9c54bfa03e08] nvme_complete_rq at ffffffffc01a9cfc [nvme_core] #29 [ffff9c54bfa03e18] nvme_pci_complete_rq at ffffffffc01b9e70 [nvme] #30 [ffff9c54bfa03e40] __blk_mq_complete_request at ffffffffac187666 #31 [ffff9c54bfa03e68] blk_mq_complete_request at ffffffffac187717 #32 [ffff9c54bfa03e78] nvme_irq at ffffffffc01ba0b2 [nvme] #33 [ffff9c54bfa03eb0] __handle_irq_event_percpu at ffffffffabf63ee4 #34 [ffff9c54bfa03ef8] handle_irq_event_percpu at ffffffffabf64092 
#35 [ffff9c54bfa03f28] handle_irq_event at ffffffffabf6411c #36 [ffff9c54bfa03f50] handle_edge_irq at ffffffffabf66f3f #37 [ffff9c54bfa03f70] handle_irq at ffffffffabe2f524 #38 [ffff9c54bfa03fb8] do_IRQ at ffffffffac5ce96d --- <IRQ stack> --- #40 [ffff9c3d9e58b530] __es_insert_extent at ffffffffc046b994 [ext4] #41 [ffff9c3d9e58b580] ext4_es_insert_extent at ffffffffc046c75b [ext4] #42 [ffff9c3d9e58b5f0] ext4_map_blocks at ffffffffc0425afb [ext4] #43 [ffff9c3d9e58b678] _ext4_get_block at ffffffffc04262ff [ext4] #44 [ffff9c3d9e58b6e0] ext4_get_block at ffffffffc0426356 [ext4] To fix that, this patch replaces read_lock() with read_trylock() in ext4_es_lookup_extent(). Also rename the define to EXT4_GET_BLOCKS_EXTENT_TREE_ONLY_NONBLOCK to underline that it's non-blocking. Signed-off-by: Kirill Tkhai <[email protected]> --- fs/ext4/ext4.h | 4 ++-- fs/ext4/extents_status.c | 7 ++++++- fs/ext4/file.c | 2 +- fs/ext4/inode.c | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 193ef8903a36..3cb410c5925c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -578,8 +578,8 @@ enum { * allows jbd2 to avoid submitting data before commit. 
*/ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 - /* Search in extent tree only */ -#define EXT4_GET_BLOCKS_EXTENT_TREE_ONLY 0x8000 + /* Search in extent tree only and do not block */ +#define EXT4_GET_BLOCKS_EXTENT_TREE_ONLY_NONBLOCK 0x8000 /* * The bit position of these flags must not overlap with any of the diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 472103bdd988..543e1eb69eb8 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -798,7 +798,12 @@ int __ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, es_debug("lookup extent in block %u\n", lblk); tree = &EXT4_I(inode)->i_es_tree; - read_lock(&EXT4_I(inode)->i_es_lock); + + if (flags & EXT4_GET_BLOCKS_EXTENT_TREE_ONLY_NONBLOCK) { + if (!read_trylock(&EXT4_I(inode)->i_es_lock)) + return 0; + } else + read_lock(&EXT4_I(inode)->i_es_lock); /* find extent in cache firstly */ es->es_lblk = es->es_len = es->es_pblk = 0; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2ee774a54ad4..edaf966c9a2f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -137,7 +137,7 @@ static int ext4_fastmap(struct inode *inode, sector_t lblk_sec, return -ENOENT; found = __ext4_overwrite_io(inode, lblk_sec << 9, len, &map, - EXT4_GET_BLOCKS_EXTENT_TREE_ONLY); + EXT4_GET_BLOCKS_EXTENT_TREE_ONLY_NONBLOCK); if (!found) return -ENOENT; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a4c1ab7c9a7c..a0f1e91d65d7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -497,7 +497,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EIO; /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (__ext4_es_lookup_extent(inode, map->m_lblk, &es, flags)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -524,7 +524,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, goto found; } - if (flags & EXT4_GET_BLOCKS_EXTENT_TREE_ONLY) + if (flags & 
EXT4_GET_BLOCKS_EXTENT_TREE_ONLY_NONBLOCK) return -ENOENT; /* _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
