If btrfs/125 is run with the nospace_cache or space_cache=v2 mount option,
btrfs will block with the following backtrace:
Call Trace:
__schedule+0x2d4/0xae0
schedule+0x3d/0x90
btrfs_start_ordered_extent+0x160/0x200 [btrfs]
? wake_atomic_t_function+0x60/0x60
btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
The direct cause is that the error handler in run_delalloc_nocow() doesn't
handle errors from btrfs_reloc_clone_csums() properly.
The relevant call path is:
__extent_writepage
|- writepage_delalloc()
| |- run_delalloc_range()
| |- run_delalloc_nocow()
| |- btrfs_add_ordered_extent()
| | Now one ordered extent for file range, e.g [0, 1M) is inserted
| |
| |- btrfs_reloc_clone_csums()
| | Fails with -EIO, as RAID5/6 doesn't repair some csum tree
| | blocks
| |
| |- extent_clear_unlock_delalloc()
| Error routine, unlock and clear page DIRTY, end page writeback
| So the remaining 255 pages will not go through writeback
|
|- __extent_writepage_io()
|- writepage_end_io_hook()
|- btrfs_dec_test_ordered_pending()
Reduce ordered_extent->bytes_left by 4K.
Still have (1M - 4K) to finish.
Since the remaining 255 pages will neither go through IO nor trigger
writepage_end_io_hook(), the ordered extent for [0, 1M) will
never finish, blocking the current transaction forever.
Although the root cause is still in RAID5/6, it won't hurt to fix the
error routine first.
This patch cleans up the ordered extent in the error routine, so at least
we won't cause a deadlock.
Signed-off-by: Qu Wenruo <[email protected]>
---
fs/btrfs/extent_io.c | 1 -
fs/btrfs/inode.c | 10 ++++++++--
fs/btrfs/ordered-data.c | 25 +++++++++++++++++++++++++
fs/btrfs/ordered-data.h | 10 ++++++++++
4 files changed, 43 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ac383a3a649..a14d1b0840c5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3258,7 +3258,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
delalloc_end,
&page_started,
nr_written);
- /* File system has been set read-only */
if (ret) {
SetPageError(page);
/* fill_delalloc should be return < 0 for error
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e861a063721..3c3ade58afd7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1052,8 +1052,11 @@ static noinline int cow_file_range(struct inode *inode,
BTRFS_DATA_RELOC_TREE_OBJECTID) {
ret = btrfs_reloc_clone_csums(inode, start,
cur_alloc_size);
- if (ret)
+ if (ret) {
+ btrfs_clean_ordered_extent(inode, start,
+ ram_size);
goto out_drop_extent_cache;
+ }
}
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
@@ -1538,7 +1541,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
if (!ret)
ret = err;
- if (ret && cur_offset < end)
+ if (ret && cur_offset < end) {
extent_clear_unlock_delalloc(inode, cur_offset, end, end,
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DEFRAG |
@@ -1546,6 +1549,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK |
PAGE_END_WRITEBACK);
+ btrfs_clean_ordered_extent(inode, cur_offset,
+ end - cur_offset + 1);
+ }
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 041c3326d109..dba1cf3464a7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -650,6 +650,31 @@ void btrfs_remove_ordered_extent(struct inode *inode,
wake_up(&entry->wait);
}
+void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 ram_len)
+{
+ struct btrfs_ordered_extent *entry;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ entry = btrfs_lookup_ordered_range(inode, file_offset, ram_len);
+ if (!entry || entry->file_offset != file_offset ||
+ entry->len != ram_len)
+ goto not_found;
+
+ /* Same as btrfs_finish_ordered_io() */
+ btrfs_remove_ordered_extent(inode, entry);
+ btrfs_put_ordered_extent(entry);
+ btrfs_put_ordered_extent(entry);
+ return;
+
+not_found:
+ WARN_ON(1);
+ btrfs_err(root->fs_info,
+ "failed to find and clean ordered extent: root %llu ino %llu
file_offset %llu len %llu",
+ root->objectid, btrfs_ino(inode), file_offset, ram_len);
+ return;
+}
+
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
{
struct btrfs_ordered_extent *ordered;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 5f2b0ca28705..7a989778aa89 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -163,6 +163,16 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry);
+
+/*
+ * Function to cleanup an allocated ordered extent in error routine.
+ *
+ * As error handler in run_delalloc_range() will clear all related pages
+ * and skip their IO, we have no method to finish inserted ordered extent.
+ * So we must use this function to clean it up.
+ */
+void btrfs_clean_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 ram_len);
int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
--
2.11.1
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html