Re: BTRFS critical: corrupt leaf, slot offset bad; then read-only

2017-02-21 Thread Lukas Tribus
Upgrading to 4.8, the FS no longer causes a kernel calltrace and does 
not go read-only. It only shows the "corrupt leaf, slot offset bad" message.


A scrub completed without errors on 3 devices, while it was aborted on 2 
devices. Not sure why it was aborted, since there is no error message in 
dmesg?



Any suggestions why the scrub was aborted?



# uname -a
Linux srv1-dom0 4.8.0-36-generic #36~16.04.1-Ubuntu SMP Sun Feb 5 
09:39:57 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux

# btrfs scrub status /storage/users/
scrub status for f50f980e-7640-49c7-bf8d-20d55cfe6005
scrub started at Wed Feb 22 00:07:33 2017 and was aborted after 
06:35:42

total bytes scrubbed: 10.60TiB with 0 errors
/# btrfs scrub status /storage/users/ -d
scrub status for f50f980e-7640-49c7-bf8d-20d55cfe6005
scrub device /dev/dm-5 (id 1) history
scrub started at Wed Feb 22 00:07:33 2017 and finished after 
06:35:36

total bytes scrubbed: 2.30TiB with 0 errors
scrub device /dev/dm-6 (id 2) history
scrub started at Wed Feb 22 00:07:33 2017 and finished after 
06:35:30

total bytes scrubbed: 2.30TiB with 0 errors
scrub device /dev/dm-7 (id 3) history
scrub started at Wed Feb 22 00:07:33 2017 and finished after 
06:35:42

total bytes scrubbed: 2.30TiB with 0 errors
scrub device /dev/dm-8 (id 4) history
scrub started at Wed Feb 22 00:07:33 2017 and was aborted after 
05:01:37

total bytes scrubbed: 1.85TiB with 0 errors
scrub device /dev/mapper/sde3_crypt (id 5) history
scrub started at Wed Feb 22 00:07:33 2017 and was aborted after 
05:01:37

total bytes scrubbed: 1.85TiB with 0 errors
#dmesg | grep BTRFS
[  929.737119] BTRFS critical (device dm-9): corrupt leaf, slot offset 
bad: block=5242107641856,root=1, slot=39
[19772.594129] BTRFS critical (device dm-9): corrupt leaf, slot offset 
bad: block=5242107641856,root=1, slot=39
[19777.127704] BTRFS critical (device dm-9): corrupt leaf, slot offset 
bad: block=5242107641856,root=1, slot=39
[19777.552191] BTRFS critical (device dm-9): corrupt leaf, slot offset 
bad: block=5242107641856,root=1, slot=39

#

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] btrfs: let writepage_end_io_hook return void

2017-02-21 Thread Liu Bo
On Mon, Feb 20, 2017 at 07:31:24PM +0100, David Sterba wrote:
> There's no error path in any of the instances, always return 0.

Reviewed-by: Liu Bo 

Thanks,

-liubo
> 
> Signed-off-by: David Sterba 
> ---
>  fs/btrfs/extent_io.c | 9 +++--
>  fs/btrfs/extent_io.h | 2 +-
>  fs/btrfs/inode.c | 6 ++
>  3 files changed, 6 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d15b5ddb6732..8de29aa4d1a2 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2435,12 +2435,9 @@ void end_extent_writepage(struct page *page, int err, 
> u64 start, u64 end)
>  
>   tree = _I(page->mapping->host)->io_tree;
>  
> - if (tree->ops && tree->ops->writepage_end_io_hook) {
> - ret = tree->ops->writepage_end_io_hook(page, start,
> -end, NULL, uptodate);
> - if (ret)
> - uptodate = 0;
> - }
> + if (tree->ops && tree->ops->writepage_end_io_hook)
> + tree->ops->writepage_end_io_hook(page, start, end, NULL,
> + uptodate);
>  
>   if (!uptodate) {
>   ClearPageUptodate(page);
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 270d03be290e..fbc92315b503 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -103,7 +103,7 @@ struct extent_io_ops {
>   int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
>   struct page *page, u64 start, u64 end,
>   int mirror);
> - int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
> + void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
> struct extent_state *state, int uptodate);
>   void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
>unsigned *bits);
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index dae2734a725b..eafadf0851d1 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2977,7 +2977,7 @@ static void finish_ordered_fn(struct btrfs_work *work)
>   btrfs_finish_ordered_io(ordered_extent);
>  }
>  
> -static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
> +static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 
> end,
>   struct extent_state *state, int uptodate)
>  {
>   struct inode *inode = page->mapping->host;
> @@ -2991,7 +2991,7 @@ static int btrfs_writepage_end_io_hook(struct page 
> *page, u64 start, u64 end,
>   ClearPagePrivate2(page);
>   if (!btrfs_dec_test_ordered_pending(inode, _extent, start,
>   end - start + 1, uptodate))
> - return 0;
> + return;
>  
>   if (btrfs_is_free_space_inode(inode)) {
>   wq = fs_info->endio_freespace_worker;
> @@ -3004,8 +3004,6 @@ static int btrfs_writepage_end_io_hook(struct page 
> *page, u64 start, u64 end,
>   btrfs_init_work(_extent->work, func, finish_ordered_fn, NULL,
>   NULL);
>   btrfs_queue_work(wq, _extent->work);
> -
> - return 0;
>  }
>  
>  static int __readpage_endio_check(struct inode *inode,
> -- 
> 2.10.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] btrfs: do proper error handling in btrfs_insert_xattr_item

2017-02-21 Thread Liu Bo
On Mon, Feb 20, 2017 at 07:25:06PM +0100, David Sterba wrote:
> The space check in btrfs_insert_xattr_item is duplicated in its caller
> (do_setxattr) so we won't hit the BUG_ON. Continuing without any check
> could be disastrous so turn it into proper error handling.
> 
> Signed-off-by: David Sterba 
> ---
>  fs/btrfs/dir-item.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
> index 724504a2d7ac..640801082533 100644
> --- a/fs/btrfs/dir-item.c
> +++ b/fs/btrfs/dir-item.c
> @@ -80,7 +80,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle 
> *trans,
>   struct extent_buffer *leaf;
>   u32 data_size;
>  
> - BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info));
> + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info))
> + return -ENOSPC;
>

Besides making it silent, how about adding an ASSERT to cry out?
(Although currently we'd never come into this case.)

Reviewed-by: Liu Bo 

Thanks,

-liubo
>   key.objectid = objectid;
>   key.type = BTRFS_XATTR_ITEM_KEY;
> -- 
> 2.10.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] btrfs: handle allocation error in update_dev_stat_item

2017-02-21 Thread Liu Bo
On Mon, Feb 20, 2017 at 07:25:04PM +0100, David Sterba wrote:
> Signed-off-by: David Sterba 
> ---
>  fs/btrfs/volumes.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 1fac98728814..64d6665f6eda 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -6954,7 +6954,8 @@ static int update_dev_stat_item(struct 
> btrfs_trans_handle *trans,
>   key.offset = device->devid;
>  
>   path = btrfs_alloc_path();
> - BUG_ON(!path);
> + if (!path)
> + return -ENOMEM;
>   ret = btrfs_search_slot(trans, dev_root, , path, -1, 1);
>   if (ret < 0) {
>   btrfs_warn_in_rcu(fs_info,

Reviewed-by: Liu Bo 

Thanks,

-liubo
> -- 
> 2.10.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] btrfs: remove BUG_ON from __tree_mod_log_insert

2017-02-21 Thread Liu Bo
On Mon, Feb 20, 2017 at 07:25:01PM +0100, David Sterba wrote:
> All callers dereference the 'tm' parameter before it gets to this
> function, the NULL check does not make much sense here.

Reviewed-by: Liu Bo 

Thanks,

-liubo
> 
> Signed-off-by: David Sterba 
> ---
>  fs/btrfs/ctree.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 1192bc7d2ee7..2c3c943bfcdc 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -453,8 +453,6 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, 
> struct tree_mod_elem *tm)
>   struct rb_node *parent = NULL;
>   struct tree_mod_elem *cur;
>  
> - BUG_ON(!tm);
> -
>   tm->seq = btrfs_inc_tree_mod_seq(fs_info);
>  
>   tm_root = _info->tree_mod_log;
> -- 
> 2.10.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 2/2] btrfs: Handle delalloc error correctly to avoid ordered extent deadlock

2017-02-21 Thread Qu Wenruo
If btrfs/125 is run with nospace_cache or space_cache=v2 mount option,
btrfs will block with the following backtrace:

Call Trace:
 __schedule+0x2d4/0xae0
 schedule+0x3d/0x90
 btrfs_start_ordered_extent+0x160/0x200 [btrfs]
 ? wake_atomic_t_function+0x60/0x60
 btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
 btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
 btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
 process_one_work+0x2af/0x720
 ? process_one_work+0x22b/0x720
 worker_thread+0x4b/0x4f0
 kthread+0x10f/0x150
 ? process_one_work+0x720/0x720
 ? kthread_create_on_node+0x40/0x40
 ret_from_fork+0x2e/0x40

The direct cause is the error handler in run_delalloc_nocow() doesn't
handle error from btrfs_reloc_clone_csums() well.

The error handler of run_delalloc_nocow() will clear dirty and finish IO
for the pages in that extent.
However we have already inserted one ordered extent.
And that ordered extent is relying on endio hooks to wait all its pages
to finish, while only the first page will finish.

This makes that ordered extent never finish, so blocking the file
system.

Although the root cause is still in RAID5/6, it won't hurt to fix the
error routine first.

This patch will slightly modify one existing function,
btrfs_endio_direct_write_update_ordered() to handle free space inode,
and skip releasing metadata, which will be handled by
extent_clear_unlock_delalloc().

And use it as base to implement one inline function,
btrfs_cleanup_ordered_extents() to handle the error in
run_delalloc_nocow() and cow_file_range().

Also, extent_clear_unlock_delalloc() will handle all the metadata
release, so btrfs_cleanup_ordered_extents() doesn't need to do it.

For compression, it's calling writepage_end_io_hook() itself to handle
its error, and any submitted ordered extent will have its bio submitted,
so no need to worry about compression part.

Suggested-by: Filipe Manana 
Signed-off-by: Qu Wenruo 
---
v2:
  Add BTRFS_ORDERED_SKIP_METADATA flag to avoid double reducing
  outstanding extents, which is already done by
  extent_clear_unlock_delalloc() with EXTENT_DO_ACCOUNTING control bit
v3:
  Skip first page to avoid underflow ordered->bytes_left.
  Fix range passed in cow_file_range() which doesn't cover the whole
  extent.
  Expand extent_clear_unlock_delalloc() range to allow them to handle
  metadata release.
---
 fs/btrfs/extent_io.c |  1 -
 fs/btrfs/inode.c | 68 +---
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ac383a3a649..a14d1b0840c5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3258,7 +3258,6 @@ static noinline_for_stack int writepage_delalloc(struct 
inode *inode,
   delalloc_end,
   _started,
   nr_written);
-   /* File system has been set read-only */
if (ret) {
SetPageError(page);
/* fill_delalloc should be return < 0 for error
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 92a7c3051b94..d4bac8f5caeb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -116,6 +116,33 @@ static struct extent_map *create_pinned_em(struct inode 
*inode, u64 start,
 
 static int btrfs_dirty_inode(struct inode *inode);
 
+
+static void __endio_write_update_ordered(struct inode *inode,
+const u64 offset, const u64 bytes,
+bool uptodate, bool cleanup);
+static inline void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+  const u64 offset,
+  const u64 bytes,
+  const int uptodate)
+{
+   return __endio_write_update_ordered(inode, offset, bytes, uptodate, 
false);
+}
+
+/*
+ * Cleanup all submitted ordered extent in specified range to handle error
+ * in cow_file_range() and run_delalloc_nocow().
+ * Compression handles error and ordered extent submission all by themselves,
+ * so no need to call this function.
+ *
+ * NOTE: caller must ensure they have already released their metadata by
+ * extent_clear_unlock_delalloc() with EXTENT_DO_ACCOUNTING control bit.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+u64 offset, u64 bytes)
+{
+   return __endio_write_update_ordered(inode, offset, bytes, false, true);
+}
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_inode_set_ops(struct inode *inode)
 {
@@ -950,6 +977,7 @@ static noinline int cow_file_range(struct inode *inode,
u64 disk_num_bytes;
u64 cur_alloc_size;
u64 blocksize = fs_info->sectorsize;
+   u64 orig_start = 

[PATCH v3 1/2] btrfs: ordered-extent: Introduce new bit to skip releasing metadata

2017-02-21 Thread Qu Wenruo
Introduce a new bit, BTRFS_ORDERED_SKIP_META for ordered extent to
allow btrfs_finish_ordered_io() to skip releasing metadata.

There are two sources for fill_delalloc() to release metadata:
1) extent_clear_unlock_delalloc()
   When EXTENT_DO_ACCOUNTING control bit is going to be cleared, we will
   free metadata.
2) btrfs_finish_ordered_io()
   When one ordered extent is going to finish, we always free its
   metadata.

This behavior is OK if and only if all ordered extents can finish
without problem.
When we need to manually finish ordered extent, such behavior can lead
to double releasing metadata, causing outstanding extents assert.

So this patch introduces BTRFS_ORDERED_SKIP_META bit to allow us to skip
releasing metadata and allow extent_clear_unlock_delalloc() to handle
them all.

This provides the basis for later ordered extent deadlock fix.

Signed-off-by: Qu Wenruo 
---
v3:
  Newly introduced, split from v2 patch.
---
 fs/btrfs/inode.c| 5 +++--
 fs/btrfs/ordered-data.c | 8 +++-
 fs/btrfs/ordered-data.h | 8 +++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e861a063721..92a7c3051b94 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3008,7 +3008,8 @@ static int btrfs_finish_ordered_io(struct 
btrfs_ordered_extent *ordered_extent)
 ordered_extent->file_offset +
 ordered_extent->len - 1, _state, GFP_NOFS);
 out:
-   if (root != fs_info->tree_root)
+   if (root != fs_info->tree_root &&
+   !test_bit(BTRFS_ORDERED_SKIP_META, _extent->flags))
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
if (trans)
btrfs_end_transaction(trans);
@@ -8200,7 +8201,7 @@ static void 
btrfs_endio_direct_write_update_ordered(struct inode *inode,
ret = btrfs_dec_test_first_ordered_pending(inode, ,
   _offset,
   ordered_bytes,
-  uptodate);
+  uptodate, false);
if (!ret)
goto out_test;
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 041c3326d109..69372b0eb37a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -298,10 +298,14 @@ void btrfs_add_ordered_sum(struct inode *inode,
  *
  * file_offset is updated to one byte past the range that is recorded as
  * complete.  This allows you to walk forward in the file.
+ *
+ * If @skip_meta is true, we are in error handle routine to cleanup all
+ * ordered extents submitted in fill_delalloc().
  */
 int btrfs_dec_test_first_ordered_pending(struct inode *inode,
   struct btrfs_ordered_extent **cached,
-  u64 *file_offset, u64 io_size, int uptodate)
+  u64 *file_offset, u64 io_size, bool uptodate,
+  bool skip_meta)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
@@ -344,6 +348,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode 
*inode,
entry->bytes_left -= to_dec;
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, >flags);
+   if (skip_meta)
+   set_bit(BTRFS_ORDERED_SKIP_META, >flags);
 
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, >flags);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 5f2b0ca28705..b6efcfcd1da6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -75,6 +75,12 @@ struct btrfs_ordered_sum {
 * in the logging code. */
 #define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
  * complete in the current transaction. */
+/*
+ * This ordered extent is going to be cleaned up in error handle routine,
+ * no need to free metadata, as it's handled by extent_clear_unlock_delalloc()
+ */
+#define BTRFS_ORDERED_SKIP_META 12
+
 struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -169,7 +175,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 int btrfs_dec_test_first_ordered_pending(struct inode *inode,
   struct btrfs_ordered_extent **cached,
   u64 *file_offset, u64 io_size,
-  int uptodate);
+  bool uptodate, bool skip_meta);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
-- 
2.11.1



--
To unsubscribe from this list: send the 

Re: [PATCH v2] btrfs: Handle delalloc error correctly to avoid deadlock

2017-02-21 Thread Qu Wenruo



At 02/22/2017 09:14 AM, Liu Bo wrote:

On Tue, Feb 21, 2017 at 04:06:59PM +0800, Qu Wenruo wrote:

If btrfs/125 is run with nospace_cache or space_cache=v2 mount option,
btrfs will block with the following backtrace:

Call Trace:
 __schedule+0x2d4/0xae0
 schedule+0x3d/0x90
 btrfs_start_ordered_extent+0x160/0x200 [btrfs]
 ? wake_atomic_t_function+0x60/0x60
 btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
 btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
 btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
 process_one_work+0x2af/0x720
 ? process_one_work+0x22b/0x720
 worker_thread+0x4b/0x4f0
 kthread+0x10f/0x150
 ? process_one_work+0x720/0x720
 ? kthread_create_on_node+0x40/0x40
 ret_from_fork+0x2e/0x40

The direct cause is the error handler in run_delalloc_nocow() doesn't
handle error from btrfs_reloc_clone_csums() well.

The error handler of run_delalloc_nocow() will clear dirty and finish IO
for the pages in that extent.
However we have already inserted one ordered extent.
And that ordered extent is relying on endio hooks to wait all its pages
to finish, while only the first page will finish.

This makes that ordered extent never finish, so blocking the file
system.

Although the root cause is still in RAID5/6, it won't hurt to fix the
error routine first.

This patch will slightly modify one existing function,
btrfs_endio_direct_write_update_ordered() to handle free space inode,
just like what btrfs_writepage_end_io_hook() does.

And use it as base to implement one inline function,
btrfs_cleanup_ordered_extents() to handle the error in
run_delalloc_nocow() and cow_file_range().

For compression, it's calling writepage_end_io_hook() itself to handle
its error, and any submitted ordered extent will have its bio submitted,
so no need to worry about compression part.

Suggested-by: Filipe Manana 
Signed-off-by: Qu Wenruo 
---
v2:
  Add BTRFS_ORDERED_SKIP_METADATA flag to avoid double reducing
  outstanding extents, which is already done by
  extent_clear_unlock_delalloc()
---
 fs/btrfs/inode.c| 75 +
 fs/btrfs/ordered-data.h |  2 ++
 2 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e861a063721..a0b09ff73eae 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -116,6 +116,41 @@ static struct extent_map *create_pinned_em(struct inode 
*inode, u64 start,

 static int btrfs_dirty_inode(struct inode *inode);

+static void __endio_write_update_ordered(struct inode *inode,
+const u64 offset,
+const u64 bytes,
+const int uptodate,
+const int skip_meta);
+static inline void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+  const u64 offset,
+  const u64 bytes,
+  const int uptodate)
+{
+   return __endio_write_update_ordered(inode, offset, bytes, uptodate, 0);
+}
+
+/*
+ * Set error bit and cleanup all ordered extents in specified range of @inode.
+ *
+ * This is for error case where ordered extent(s) is submitted but
+ * corresponding bio is not submitted.
+ * This can make waiter on such ordered extent never finish, as there is no
+ * endio hook called to finish such ordered extent.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+const u64 offset,
+const u64 bytes)
+{
+   /*
+* In error handler, we have extent_clear_unlock_delalloc() called
+* to reduce our metadata space reservation and outstanding extents.
+*
+* So here, we don't need finish_ordered_io() to free metadata space
+* for us, or we will underflow outstanding extents.
+*/
+   return __endio_write_update_ordered(inode, offset, bytes, 0, 1);
+}
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_inode_set_ops(struct inode *inode)
 {
@@ -237,7 +272,6 @@ static int insert_inline_extent(struct btrfs_trans_handle 
*trans,
return err;
 }

-
 /*
  * conditionally insert an inline extent into the file.  This
  * does the checks required to make sure the data is small enough
@@ -1096,6 +1130,7 @@ static noinline int cow_file_range(struct inode *inode,
 EXTENT_DELALLOC | EXTENT_DEFRAG,
 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+   btrfs_cleanup_ordered_extents(inode, start, end - start + 1);


Note that @start is rolling forward in the 'while' loop, using start here won't
cleanup previous added ordered extent.


Thanks for pointing this 

Re: [PATCH v2] btrfs: Handle delalloc error correctly to avoid deadlock

2017-02-21 Thread Liu Bo
On Tue, Feb 21, 2017 at 04:06:59PM +0800, Qu Wenruo wrote:
> If btrfs/125 is run with nospace_cache or space_cache=v2 mount option,
> btrfs will block with the following backtrace:
> 
> Call Trace:
>  __schedule+0x2d4/0xae0
>  schedule+0x3d/0x90
>  btrfs_start_ordered_extent+0x160/0x200 [btrfs]
>  ? wake_atomic_t_function+0x60/0x60
>  btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
>  btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
>  btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
>  process_one_work+0x2af/0x720
>  ? process_one_work+0x22b/0x720
>  worker_thread+0x4b/0x4f0
>  kthread+0x10f/0x150
>  ? process_one_work+0x720/0x720
>  ? kthread_create_on_node+0x40/0x40
>  ret_from_fork+0x2e/0x40
> 
> The direct cause is the error handler in run_delalloc_nocow() doesn't
> handle error from btrfs_reloc_clone_csums() well.
> 
> The error handler of run_delalloc_nocow() will clear dirty and finish IO
> for the pages in that extent.
> However we have already inserted one ordered extent.
> And that ordered extent is relying on endio hooks to wait all its pages
> to finish, while only the first page will finish.
> 
> This makes that ordered extent never finish, so blocking the file
> system.
> 
> Although the root cause is still in RAID5/6, it won't hurt to fix the
> error routine first.
> 
> This patch will slightly modify one existing function,
> btrfs_endio_direct_write_update_ordered() to handle free space inode,
> just like what btrfs_writepage_end_io_hook() does.
> 
> And use it as base to implement one inline function,
> btrfs_cleanup_ordered_extents() to handle the error in
> run_delalloc_nocow() and cow_file_range().
> 
> For compression, it's calling writepage_end_io_hook() itself to handle
> its error, and any submitted ordered extent will have its bio submitted,
> so no need to worry about compression part.
>
> Suggested-by: Filipe Manana 
> Signed-off-by: Qu Wenruo 
> ---
> v2:
>   Add BTRFS_ORDERED_SKIP_METADATA flag to avoid double reducing
>   outstanding extents, which is already done by
>   extent_clear_unlock_delalloc()
> ---
>  fs/btrfs/inode.c| 75 
> +
>  fs/btrfs/ordered-data.h |  2 ++
>  2 files changed, 66 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 1e861a063721..a0b09ff73eae 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -116,6 +116,41 @@ static struct extent_map *create_pinned_em(struct inode 
> *inode, u64 start,
>  
>  static int btrfs_dirty_inode(struct inode *inode);
>  
> +static void __endio_write_update_ordered(struct inode *inode,
> +  const u64 offset,
> +  const u64 bytes,
> +  const int uptodate,
> +  const int skip_meta);
> +static inline void btrfs_endio_direct_write_update_ordered(struct inode 
> *inode,
> +const u64 offset,
> +const u64 bytes,
> +const int uptodate)
> +{
> + return __endio_write_update_ordered(inode, offset, bytes, uptodate, 0);
> +}
> +
> +/*
> + * Set error bit and cleanup all ordered extents in specified range of 
> @inode.
> + *
> + * This is for error case where ordered extent(s) is submitted but
> + * corresponding bio is not submitted.
> + * This can make waiter on such ordered extent never finish, as there is no
> + * endio hook called to finish such ordered extent.
> + */
> +static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
> +  const u64 offset,
> +  const u64 bytes)
> +{
> + /*
> +  * In error handler, we have extent_clear_unlock_delalloc() called
> +  * to reduce our metadata space reservation and outstanding extents.
> +  *
> +  * So here, we don't need finish_ordered_io() to free metadata space
> +  * for us, or we will underflow outstanding extents.
> +  */
> + return __endio_write_update_ordered(inode, offset, bytes, 0, 1);
> +}
> +
>  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>  void btrfs_test_inode_set_ops(struct inode *inode)
>  {
> @@ -237,7 +272,6 @@ static int insert_inline_extent(struct btrfs_trans_handle 
> *trans,
>   return err;
>  }
>  
> -
>  /*
>   * conditionally insert an inline extent into the file.  This
>   * does the checks required to make sure the data is small enough
> @@ -1096,6 +1130,7 @@ static noinline int cow_file_range(struct inode *inode,
>EXTENT_DELALLOC | EXTENT_DEFRAG,
>PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
>PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
> + btrfs_cleanup_ordered_extents(inode, start, end - 

Re: [PATCH] Btrfs: try harder to migrate items to left sibling before splitting a leaf

2017-02-21 Thread Liu Bo
On Sun, Feb 19, 2017 at 08:56:39PM +, fdman...@kernel.org wrote:
> From: Filipe Manana 
> 
> Before attempting to split a leaf we try to migrate items from the leaf to
> its right and left siblings. We start by trying to move items into the
> right sibling and, if the new item is meant to be inserted at the end of
> our leaf, we try to free from our leaf an amount of bytes equal to the
> number of bytes used by the new item, by setting the variable space_needed
> to the byte size of that new item. However if we fail to move enough items
> to the right sibling due to lack of space in that sibling, we then try
> to move items into the left sibling, and in that case we try to free
> an amount equal to the size of the new item from our leaf, when we need
> only to free an amount corresponding to the size of the new item minus
> the current free space of our leaf. So make sure that before we try to
> move items to the left sibling we do set the variable space_needed with
> a value corresponding to the new item's size minus the leaf's current
> free space.
> 
> Signed-off-by: Filipe Manana 
> ---
>  fs/btrfs/ctree.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index a426dc8..1d66761 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -4160,6 +4160,9 @@ static noinline int push_for_double_split(struct 
> btrfs_trans_handle *trans,
>  
>   /* try to push all the items before our slot into the next leaf */
>   slot = path->slots[0];
> + space_needed = data_size;
> + if (slot > 0)
> + space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);

Good point.

>   ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
>   if (ret < 0)
>   return ret;
> @@ -4215,6 +4218,10 @@ static noinline int split_leaf(struct 
> btrfs_trans_handle *trans,
>   if (wret < 0)
>   return wret;
>   if (wret) {
> + space_needed = data_size;
> + if (slot > 0)
> + space_needed -= btrfs_leaf_free_space(fs_info,
> +   l);

Not sure if we need this, the above push_leaf_right() was called with
@min_data_size == space_needed, thus if @wret == 1, no items have been moved in
push_leaf_right() so that leaf 'l' remains unchanged.

Thanks,

-liubo
>   wret = push_leaf_left(trans, root, path, space_needed,
> space_needed, 0, (u32)-1);
>   if (wret < 0)
> -- 
> 2.7.0.rc3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/1] btrfs-progs: docs: fix many typos, plus three edits for clarity

2017-02-21 Thread Nicholas D Steeves
Signed-off-by: Nicholas D Steeves 
---
 Documentation/btrfs-balance.asciidoc  |  2 +-
 Documentation/btrfs-check.asciidoc|  8 
 Documentation/btrfs-device.asciidoc   |  6 +++---
 Documentation/btrfs-filesystem.asciidoc   |  6 +++---
 Documentation/btrfs-inspect-internal.asciidoc |  6 +++---
 Documentation/btrfs-man5.asciidoc | 15 ---
 Documentation/btrfs-quota.asciidoc|  8 
 Documentation/btrfs-receive.asciidoc  |  4 ++--
 Documentation/btrfs-restore.asciidoc  |  2 +-
 Documentation/btrfs-scrub.asciidoc|  9 -
 Documentation/btrfs-send.asciidoc |  6 +++---
 11 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/Documentation/btrfs-balance.asciidoc 
b/Documentation/btrfs-balance.asciidoc
index c456898e..0b687eaf 100644
--- a/Documentation/btrfs-balance.asciidoc
+++ b/Documentation/btrfs-balance.asciidoc
@@ -89,7 +89,7 @@ warned and has a few seconds to cancel the operation before 
it starts. The
 warning and delay can be skipped with '--full-balance' option.
 +
 Please note that the filters must be written together with the '-d', '-m' and
-'-s' options, because they're optional and bare '-d' etc alwo work and mean no
+'-s' options, because they're optional and bare '-d' etc also work and mean no
 filters.
 +
 `Options`
diff --git a/Documentation/btrfs-check.asciidoc 
b/Documentation/btrfs-check.asciidoc
index 633cbbf6..28ed9dd7 100644
--- a/Documentation/btrfs-check.asciidoc
+++ b/Documentation/btrfs-check.asciidoc
@@ -30,11 +30,11 @@ data structures satisfy the constraints, point to the right 
objects or are
 correctly connected together.
 
 There are several cross checks that can detect wrong reference counts of shared
-extents, backrefrences, missing extents of inodes, directory and inode
+extents, backreferences, missing extents of inodes, directory and inode
 connectivity etc.
 
 The amount of memory required can be high, depending on the size of the
-filesystem, smililarly the run time.
+filesystem, similarly the run time.
 
 SAFE OR ADVISORY OPTIONS
 
@@ -49,7 +49,7 @@ verify checksums of data blocks
 +
 This expects that the filesystem is otherwise
 OK, so this is basically and offline 'scrub' but does not repair data from
-spare coipes.
+spare copies.
 
 --chunk-root ::
 use the given offset 'bytenr' for the chunk tree root
@@ -111,7 +111,7 @@ NOTE: Do not use unless you know what you're doing.
 select mode of operation regarding memory and IO
 +
 The 'MODE' can be one of 'original' and 'lowmem'. The original mode is mostly
-unoptimized regarding memory consumpption and can lead to out-of-memory
+unoptimized regarding memory consumption and can lead to out-of-memory
 conditions on large filesystems. The possible workaround is to export the block
 device over network to a machine with enough memory. The low memory mode is
 supposed to address the memory consumption, at the cost of increased IO when it
diff --git a/Documentation/btrfs-device.asciidoc 
b/Documentation/btrfs-device.asciidoc
index eedcac85..b7f27c44 100644
--- a/Documentation/btrfs-device.asciidoc
+++ b/Documentation/btrfs-device.asciidoc
@@ -24,14 +24,14 @@ similarity, the RAID terminology is widely used in the 
documentation.  See
 constraints.
 
 The device management works on a mounted filesystem. Devices can be added,
-removed or replaced, by commands profided by *btrfs device* and *btrfs 
replace*.
+removed or replaced, by commands provided by *btrfs device* and *btrfs 
replace*.
 
 The profiles can be also changed, provided there's enough workspace to do the
 conversion, using the *btrfs balance* command and namely the filter 'convert'.
 
 Profile::
 A profile describes an allocation policy based on the redundancy/replication
-constrants in connection with the number of devices. The profile applies to
+constraints in connection with the number of devices. The profile applies to
 data and metadata block groups separately.
 
 RAID level::
@@ -182,7 +182,7 @@ blocks, the disk seeking is the key factor affecting 
performance.
 
 You'll note that the system block group has been also converted to RAID1, this
 normally happens as the system block group also holds metadata (the physical to
-logial mappings).
+logical mappings).
 
 What changed:
 
diff --git a/Documentation/btrfs-filesystem.asciidoc 
b/Documentation/btrfs-filesystem.asciidoc
index 0f7ea495..d57f28fb 100644
--- a/Documentation/btrfs-filesystem.asciidoc
+++ b/Documentation/btrfs-filesystem.asciidoc
@@ -14,7 +14,7 @@ DESCRIPTION
 *btrfs filesystem* is used to perform several whole filesystem level tasks,
 including all the regular filesystem operations like resizing, space stats,
 label setting/getting, and defragmentation. There are other whole filesystem
-taks like scrub or balance that are grouped in separate commands.
+tasks like scrub or balance that are grouped in separate commands.
 
 

[PATCH] btrfs-progs: docs: fix many typos, plus three edits for clarity

2017-02-21 Thread Nicholas D Steeves
Hi David,

Please see attached a reasonably thorough patch for all the typos I
could find in btrfs-progs documentation.  The three edits are very
minor and look larger than they are because I had to reflow the
paragraphs.  I'm confident in the quality of the work, with the
exception of the following, where I'm not sure what to do:

Documentation/btrfs-receive.asciidoc
@@ -66,7 +66,7 @@ tell us where this filesystem is mounted.
 --dump::
 dump the stream metadata, one line per operation
 +
-Does not require the 'path' parameter. The filesystem chanded.
+Does not require the 'path' parameter. The filesystem changed.


Sincerely,
Nicholas

Nicholas D Steeves (1):
  btrfs-progs: docs: fix many typos, plus three edits for clarity

 Documentation/btrfs-balance.asciidoc  |  2 +-
 Documentation/btrfs-check.asciidoc|  8 
 Documentation/btrfs-device.asciidoc   |  6 +++---
 Documentation/btrfs-filesystem.asciidoc   |  6 +++---
 Documentation/btrfs-inspect-internal.asciidoc |  6 +++---
 Documentation/btrfs-man5.asciidoc | 15 ---
 Documentation/btrfs-quota.asciidoc|  8 
 Documentation/btrfs-receive.asciidoc  |  4 ++--
 Documentation/btrfs-restore.asciidoc  |  2 +-
 Documentation/btrfs-scrub.asciidoc|  9 -
 Documentation/btrfs-send.asciidoc |  6 +++---
 11 files changed, 36 insertions(+), 36 deletions(-)

-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: FS gives kernel UPS on attempt to create snapshot and after running balance it's unmountable.

2017-02-21 Thread Tomasz Kusmierz
Anyone ?

On 18 Feb 2017, at 16:44, Tomasz Kusmierz  wrote:

So Qu,

currently my situation is that:
I've tried to go btrfs scan --repair, and it did repair some stuff in
qgroups ... then tried to mount it and, surprise surprise, system
locked out in 20 seconds.

Reboot, again scan --repair = a lot of missing back pointers were
repaired and system is supposedly "OK"  attempted to mount it and
within 20 seconds system locked out so hard it would not even reboot
from acpi.

installed "elrepo kernel-ml" and installed 4.9.10

another scan --repair = same problem with lots of back pointers
missing, fixed  system again seems "OK" ... another attempt to
mount /dev/sdc /mnt2/main_pool and again after 20 seconds system locks
up hard.

There is nothing in messages, nothing in dmesg ... I think that the system
locks up so hard that the master btrfs filesystem does not get time to push
those logs to disk.






On 16 February 2017 at 23:46, Tomasz Kusmierz  wrote:

Thanks Qu,

Just before I’ll go and accidentally mess up this FS more - I’ve
mentioned originally that this problem started with FS not being able
to create a snapshot ( it would get remounted RO automatically ) for
about a month, and when I’ve realised that there is a problem like
that I’ve attempted a full FS balance that caused this FS to be
unmountable. Is there any other debug you would require before I
proceed (I’ve got a lot i

On 16 Feb 2017, at 01:26, Qu Wenruo  wrote:



At 02/15/2017 10:11 PM, Tomasz Kusmierz wrote:

So guys, any help here ? I’m kinda stuck now with system just idling
and doing nothing while I wait for some feedback ...


Sorry for the late reply.

Busy debugging a kernel bug.

On 14 Feb 2017, at 19:38, Tomasz Kusmierz  wrote:

[root@server ~]#  btrfs-show-super -af /dev/sdc
superblock: bytenr=65536, device=/dev/sdc
-
csum_type   0 (crc32c)
csum_size   4
csum0x17d56ce0 [match]


This superblock is good.

bytenr  65536
flags   0x1
 ( WRITTEN )
magic   _BHRfS_M [match]
fsid0576d577-8954-4a60-a02b-9492b3c29318
label   main_pool
generation  150682
root5223857717248
sys_array_size  321
chunk_root_generation   150678
root_level  1
chunk_root  8669488005120
chunk_root_level1
log_root0
log_root_transid0
log_root_level  0
total_bytes 16003191472128
bytes_used  6411278503936
sectorsize  4096
nodesize16384
leafsize16384
stripesize  4096
root_dir6
num_devices 8
compat_flags0x0
compat_ro_flags 0x0
incompat_flags  0x161
 ( MIXED_BACKREF |
   BIG_METADATA |
   EXTENDED_IREF |
   SKINNY_METADATA )
cache_generation150682
uuid_tree_generation150679
dev_item.uuid   46abffa8-7afe-451f-93c6-abb8e589c4e8
dev_item.fsid   0576d577-8954-4a60-a02b-9492b3c29318 [match]
dev_item.type   0
dev_item.total_bytes2000398934016
dev_item.bytes_used 1647136735232
dev_item.io_align   4096
dev_item.io_width   4096
dev_item.sector_size4096
dev_item.devid  1
dev_item.dev_group  0
dev_item.seek_speed 0
dev_item.bandwidth  0
dev_item.generation 0
sys_chunk_array[2048]:
 item 0 key (FIRST_CHUNK_TREE CHUNK_ITEM 8669487824896)
 length 67108864 owner 2 stripe_len 65536 type SYSTEM|RAID10
 io_align 65536 io_width 65536 sector_size 4096
 num_stripes 8 sub_stripes 2
 stripe 0 devid 7 offset 1083674984448
 dev_uuid 566fb8a3-d6de-4230-8b70-a5fda0a120f6
 stripe 1 devid 8 offset 1083674984448
 dev_uuid 845aefb2-e0a6-479a-957b-a82fb7207d6c
 stripe 2 devid 1 offset 1365901312
 dev_uuid 46abffa8-7afe-451f-93c6-abb8e589c4e8
 stripe 3 devid 3 offset 1345978368
 dev_uuid 95921633-2fc1-479f-a3ba-e6e5a1989755
 stripe 4 devid 4 offset 1345978368
 dev_uuid 20828f0e-4661-4987-ac11-72814c1e423a
 stripe 5 devid 5 offset 1345978368
 dev_uuid 2c3cd71f-5178-48e7-8032-6b6eec023197
 stripe 6 devid 6 offset 1345978368
 dev_uuid 806a47e5-cac4-41c9-abb9-5c49506459e1
 stripe 7 devid 2 offset 1345978368
 dev_uuid e1358e0e-edaf-4505-9c71-ed0862c45841


And I didn't see anything wrong in sys_chunk_array.


Would you please try to mount the fs with latest kernel?

[PATCH] ioctl_getfsmap.2: document the GETFSMAP ioctl

2017-02-21 Thread Darrick J. Wong
Document the new GETFSMAP ioctl that returns the physical layout of a
(disk-based) filesystem.  This time around the fs-specific parts have
been moved to a separate section; I'll move them into separate
xfsprogs/e2fsprogs manpages when we get closer to landing the ioctl.

Signed-off-by: Darrick J. Wong 
---
 man2/ioctl_getfsmap.2 |  359 +
 1 file changed, 359 insertions(+)
 create mode 100644 man2/ioctl_getfsmap.2

diff --git a/man2/ioctl_getfsmap.2 b/man2/ioctl_getfsmap.2
new file mode 100644
index 000..7121d61
--- /dev/null
+++ b/man2/ioctl_getfsmap.2
@@ -0,0 +1,359 @@
+.\" Copyright (c) 2017, Oracle.  All rights reserved.
+.\"
+.\" %%%LICENSE_START(GPLv2+_DOC_FULL)
+.\" This is free documentation; you can redistribute it and/or
+.\" modify it under the terms of the GNU General Public License as
+.\" published by the Free Software Foundation; either version 2 of
+.\" the License, or (at your option) any later version.
+.\"
+.\" The GNU General Public License's references to "object code"
+.\" and "executables" are to be interpreted as the output of any
+.\" document formatting or typesetting system, including
+.\" intermediate and printed output.
+.\"
+.\" This manual is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+.\" GNU General Public License for more details.
+.\"
+.\" You should have received a copy of the GNU General Public
+.\" License along with this manual; if not, see
+.\" .
+.\" %%%LICENSE_END
+.TH IOCTL-GETFSMAP 2 2017-02-10 "Linux" "Linux Programmer's Manual"
+.SH NAME
+ioctl_getfsmap \- retrieve the physical layout of the filesystem
+.SH SYNOPSIS
+.br
+.B #include 
+.br
+.B #include 
+.br
+.B #include 
+.sp
+.BI "int ioctl(int " fd ", GETFSMAP, struct fsmap_head * " arg );
+.SH DESCRIPTION
+This
+.BR ioctl (2)
+retrieves physical extent mappings for a filesystem.
+This information can be used to discover which files are mapped to a physical
+block, examine free space, or find known bad blocks, among other things.
+
+The sole argument to this ioctl should be a pointer to a single
+.BR "struct fsmap_head" ":"
+.in +4n
+.nf
+
+struct fsmap {
+   __u32   fmr_device; /* device id */
+   __u32   fmr_flags;  /* mapping flags */
+   __u64   fmr_physical;   /* device offset of segment */
+   __u64   fmr_owner;  /* owner id */
+   __u64   fmr_offset; /* file offset of segment */
+   __u64   fmr_length; /* length of segment */
+   __u64   fmr_reserved[3];/* must be zero */
+};
+
+struct fsmap_head {
+   __u32   fmh_iflags; /* control flags */
+   __u32   fmh_oflags; /* output flags */
+   __u32   fmh_count;  /* # of entries in array incl. input */
+   __u32   fmh_entries;/* # of entries filled in (output). */
+   __u64   fmh_reserved[6];/* must be zero */
+
+   struct fsmapfmh_keys[2];/* low and high keys for the mapping 
search */
+   struct fsmapfmh_recs[]; /* returned records */
+};
+
+.fi
+.in
+The two
+.I fmh_keys
+array elements specify the lowest and highest reverse-mapping
+keys, respectively, for which userspace would like physical mapping
+information.
+A reverse mapping key consists of the tuple (device, block, owner, offset).
+The owner and offset fields are part of the key because some filesystems
+support sharing physical blocks between multiple files and
+therefore may return multiple mappings for a given physical block.
+.PP
+Filesystem mappings are copied into the
+.I fmh_recs
+array, which immediately follows the header data.
+.SS Fields of struct fsmap_head
+.PP
+The
+.I fmh_iflags
+field is a bitmask passed to the kernel to alter the output.
+There are no flags defined, so this value must be zero.
+
+.PP
+The
+.I fmh_oflags
+field is a bitmask of flags that concern all output mappings.
+If
+.B FMH_OF_DEV_T
+is set, then the
+.I fmr_device
+field represents a
+.B dev_t
+structure containing the major and minor numbers of the block device.
+
+.PP
+The
+.I fmh_count
+field contains the number of elements in the array being passed to the
+kernel.
+If this value is 0,
+.I fmh_entries
+will be set to the number of records that would have been returned had
+the array been large enough;
+no mapping information will be returned.
+
+.PP
+The
+.I fmh_entries
+field contains the number of elements in the
+.I fmh_recs
+array that contain useful information.
+
+.PP
+The
+.I fmh_reserved
+fields must be set to zero.
+
+.SS Keys
+.PP
+The two key records in
+.B fsmap_head.fmh_keys
+specify the lowest and highest extent records in the keyspace that the caller
+wants returned.
+A filesystem that can share blocks between files likely 

[PATCH] Btrfs: use the correct type when creating cow dio extent

2017-02-21 Thread Liu Bo
'BTRFS_ORDERED_REGULAR' was introduced for the cow case in patch
'Btrfs: specify a new ordered extent type for create_io_em',
but it missed the directIO cow case.

Signed-off-by: Liu Bo 
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d90b5b3..3060d5c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7185,7 +7185,7 @@ static struct extent_map *btrfs_new_extent_direct(struct 
inode *inode,
 
em = btrfs_create_dio_extent(inode, start, ins.offset, start,
 ins.objectid, ins.offset, ins.offset,
-ins.offset, 0);
+ins.offset, BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
btrfs_free_reserved_extent(fs_info, ins.objectid,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: fix deadlock between dedup on same file and starting writeback

2017-02-21 Thread fdmanana
From: Filipe Manana 

If we are deduping two ranges of the same file we need to make sure that
we lock all pages in ascending order, that is, lock first the pages from
the range with lower offset and then the pages from the other range, as
otherwise we can deadlock with a concurrent task that is starting delalloc
(writeback). Example trace:

[74073.052218] INFO: task kworker/u32:10:17997 blocked for more than 120 
seconds.
[74073.053889]   Tainted: GW   4.9.0-rc7-btrfs-next-36+ #1
[74073.055071] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[74073.056696] kworker/u32:10  D0 17997  2 0x
[74073.058606] Workqueue: writeback wb_workfn (flush-btrfs-53176)
[74073.061370]  880031e79858 8802159d2580 880237004580 
880031e79240
[74073.064784]  88023f4978c0 c9000817b638 814c15e1 

[74073.068386]  88023f4978d8 88023f4978c0 0017b620 
880031e79240
[74073.071712] Call Trace:
[74073.072884]  [] ? __schedule+0x48f/0x6f4
[74073.075395]  [] ? bit_wait+0x2f/0x2f
[74073.077511]  [] schedule+0x8c/0xa0
[74073.079440]  [] schedule_timeout+0x43/0xff
[74073.081637]  [] ? time_hardirqs_on+0x9/0x14
[74073.083809]  [] ? trace_hardirqs_on_caller+0x16/0x197
[74073.086314]  [] ? timekeeping_get_ns+0x1e/0x32
[74073.100654]  [] ? ktime_get+0x41/0x52
[74073.102619]  [] io_schedule_timeout+0xa0/0x102
[74073.104771]  [] ? io_schedule_timeout+0xa0/0x102
[74073.106969]  [] bit_wait_io+0x1b/0x39
[74073.108954]  [] __wait_on_bit_lock+0x4f/0x99
[74073.110981]  [] __lock_page+0x6b/0x6d
[74073.112833]  [] ? autoremove_wake_function+0x3a/0x3a
[74073.115010]  [] lock_page+0x2f/0x32 [btrfs]
[74073.116999]  [] lock_delalloc_pages+0xc7/0x1a0 [btrfs]
[74073.119243]  [] find_lock_delalloc_range+0xc3/0x1a4 [btrfs]
[74073.121636]  [] writepage_delalloc.isra.31+0x8b/0x134 
[btrfs]
[74073.124229]  [] __extent_writepage+0x1c1/0x2bf [btrfs]
[74073.126372]  [] 
extent_write_cache_pages.isra.30.constprop.49+0x28b/0x36c [btrfs]
[74073.129371]  [] extent_writepages+0x4b/0x5c [btrfs]
[74073.131440]  [] ? 
insert_reserved_file_extent.constprop.42+0x261/0x261 [btrfs]
[74073.134303]  [] ? writeback_sb_inodes+0xe0/0x4a1
[74073.136298]  [] btrfs_writepages+0x28/0x2a [btrfs]
[74073.138248]  [] do_writepages+0x23/0x2c
[74073.139910]  [] __writeback_single_inode+0x105/0x6d2
[74073.142003]  [] writeback_sb_inodes+0x292/0x4a1
[74073.136298]  [] btrfs_writepages+0x28/0x2a [btrfs]
[74073.138248]  [] do_writepages+0x23/0x2c
[74073.139910]  [] __writeback_single_inode+0x105/0x6d2
[74073.142003]  [] writeback_sb_inodes+0x292/0x4a1
[74073.143911]  [] __writeback_inodes_wb+0x76/0xae
[74073.145787]  [] wb_writeback+0x1cc/0x4d7
[74073.147452]  [] wb_workfn+0x194/0x37d
[74073.149084]  [] ? wb_workfn+0x194/0x37d
[74073.150726]  [] ? process_one_work+0x154/0x4e4
[74073.152694]  [] process_one_work+0x273/0x4e4
[74073.154452]  [] worker_thread+0x1eb/0x2ca
[74073.156138]  [] ? rescuer_thread+0x2b6/0x2b6
[74073.157837]  [] kthread+0xd5/0xdd
[74073.159339]  [] ? __kthread_unpark+0x5a/0x5a
[74073.161088]  [] ret_from_fork+0x27/0x40
[74073.162680] INFO: lockdep is turned off.
[74073.163855] INFO: task do-dedup:30264 blocked for more than 120 seconds.
[74073.181180]   Tainted: GW   4.9.0-rc7-btrfs-next-36+ #1
[74073.181180] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[74073.185296] fdm-stress  D0 30264  29974 0x
[74073.186810]  880089595118 880211b8eac0 880237030380 
880089594b00
[74073.188998]  88023f2978c0 c900063abb68 814c15e1 

[74073.191070]  88023f2978d8 88023f2978c0 003abb50 
880089594b00
[74073.193286] Call Trace:
[74073.193990]  [] ? __schedule+0x48f/0x6f4
[74073.195418]  [] ? bit_wait+0x2f/0x2f
[74073.196796]  [] schedule+0x8c/0xa0
[74073.198163]  [] schedule_timeout+0x43/0xff
[74073.199621]  [] ? trace_hardirqs_on+0xd/0xf
[74073.201100]  [] ? timekeeping_get_ns+0x1e/0x32
[74073.202686]  [] ? ktime_get+0x41/0x52
[74073.204051]  [] io_schedule_timeout+0xa0/0x102
[74073.205585]  [] ? io_schedule_timeout+0xa0/0x102
[74073.207123]  [] bit_wait_io+0x1b/0x39
[74073.208238]  [] __wait_on_bit_lock+0x4f/0x99
[74073.208871]  [] __lock_page+0x6b/0x6d
[74073.209430]  [] ? autoremove_wake_function+0x3a/0x3a
[74073.210101]  [] lock_page+0x2f/0x32
[74073.210636]  [] pagecache_get_page+0x5e/0x153
[74073.211270]  [] gather_extent_pages+0x4e/0x109 [btrfs]
[74073.212166]  [] btrfs_dedupe_file_range+0x1e1/0x4dd [btrfs]
[74073.213257]  [] vfs_dedupe_file_range+0x1c1/0x221
[74073.214086]  [] do_vfs_ioctl+0x442/0x600
[74073.214767]  [] ? rcu_read_unlock+0x5b/0x5d
[74073.215619]  [] ? __fget+0x6b/0x77
[74073.216338]  [] SyS_ioctl+0x57/0x79
[74073.217149]  [] entry_SYSCALL_64_fastpath+0x18/0xad
[74073.218102]  [] ? time_hardirqs_off+0x9/0x14
[74073.218968]  [] ? trace_hardirqs_off_caller+0x1f/0xaa
[74073.219938] INFO: lockdep is turned 

BTRFS critical: corrupt leaf, slot offset bad; then read-only

2017-02-21 Thread Lukas Tribus

Hi list!


I have a btrfs pool consisting of 5x 2,72 TiB LUKS (dm-crypt) partitions 
in RAID1, mounted on Linux 4.4 with btrfs-progs 4.4. I never had any 
crashes or power loss here, but recently about every 60 - 120 minutes 
(while in use) btrfs detects corruptions, aborts the transaction and 
drops to read-only mode.
btrfs still mounts normally without any special options (it does take 
about 60 seconds, which I guess is normal for this kind of size). All 
LUKS partitions have at least 400GiB of free space.


I don't see any HW problems here; I doubt there is a corruption coming 
from the LUKS partition. I did test the RAM but it seems fine in 
multiple memtest86+ and memtest86 runs.



Are there any known bugs in 4.4? Any suggestions would be greatly 
appreciated!



I have to admit I did not regularly scrub.


Thanks,
Lukas


---
~# uname -a
Linux srv1-dom0 4.4.0-63-generic #84-Ubuntu SMP Wed Feb 1 17:20:32 UTC 
2017 x86_64 x86_64 x86_64 GNU/Linux

~# btrfs --version
btrfs-progs v4.4
~# btrfs fi show
Label: 'dom0-os'  uuid: e475636c-21e0-4563-87d6-91f03c519a62
Total devices 5 FS bytes used 3.52GiB
devid1 size 10.00GiB used 3.53GiB path /dev/sda2
devid2 size 10.00GiB used 4.25GiB path /dev/sdb2
devid3 size 10.00GiB used 3.28GiB path /dev/sdc2
devid4 size 10.00GiB used 4.00GiB path /dev/sdd2
devid5 size 10.00GiB used 4.00GiB path /dev/sde2

Label: 'storage_pool'  uuid: f50f980e-7640-49c7-bf8d-20d55cfe6005
Total devices 5 FS bytes used 5.77TiB
devid1 size 2.72TiB used 2.31TiB path /dev/mapper/sda3_crypt
devid2 size 2.72TiB used 2.31TiB path /dev/mapper/sdb3_crypt
devid3 size 2.72TiB used 2.31TiB path /dev/mapper/sdc3_crypt
devid4 size 2.72TiB used 2.31TiB path /dev/mapper/sdd3_crypt
devid5 size 2.72TiB used 2.31TiB path /dev/mapper/sde3_crypt
~# btrfs fi df /storage/users/
Data, RAID1: total=5.77TiB, used=5.76TiB
System, RAID1: total=32.00MiB, used=832.00KiB
Metadata, RAID1: total=8.00GiB, used=6.96GiB
GlobalReserve, single: total=512.00MiB, used=0.00B
~#

~#

partial dmesg:
[ 1509.033492] BTRFS: device label storage_pool devid 1 transid 238135 
/dev/dm-5
[ 1510.498804] BTRFS: device label storage_pool devid 2 transid 238135 
/dev/dm-6
[ 1511.980968] BTRFS: device label storage_pool devid 3 transid 238135 
/dev/dm-7
[ 1513.461799] BTRFS: device label storage_pool devid 4 transid 238135 
/dev/dm-8
[ 1514.838757] BTRFS: device label storage_pool devid 5 transid 238135 
/dev/dm-9

[ 1517.726471] BTRFS info (device dm-9): btrfs: use no compression
[ 1517.726477] BTRFS info (device dm-9): disk space caching is enabled
[ 1517.726479] BTRFS: has skinny extents
[ 1569.598633] BTRFS: checking UUID tree
[ 3540.825747] BTRFS critical (device dm-9): corrupt leaf, slot offset 
bad: block=5242107641856,root=1, slot=39
[ 3540.836168] BTRFS critical (device dm-9): corrupt leaf, slot offset 
bad: block=5242107641856,root=1, slot=39

[ 3540.846413] [ cut here ]
[ 3540.846432] WARNING: CPU: 2 PID: 2757 at 
/build/linux-mPTI9s/linux-4.4.0/fs/btrfs/extent-tree.c:2930 
btrfs_run_delayed_refs+0x26b/0x2a0 [btrfs]()

[ 3540.846433] BTRFS: Transaction aborted (error -5)
[ 3540.846434] Modules linked in: algif_skcipher af_alg xen_gntdev 
xen_evtchn xenfs xen_privcmd drbg ansi_cprng dm_crypt nls_iso8859_1 
bridge stp llc intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp 
crct10dif_pclmul crc32_pclmul ghash_clmulni_intel serio_raw joydev 
input_leds nuvoton_cir 8250_fintek ie31200_edac mac_hid rc_core lpc_ich 
edac_core shpchp mei_me mei ib_iser rdma_cm iw_cm ib_cm ib_sa ib_mad 
ib_core ib_addr iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi 
autofs4 btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq 
async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear 
hid_generic usbhid hid mxm_wmi i915 i2c_algo_bit drm_kms_helper 
aesni_intel aes_x86_64 glue_helper syscopyarea sysfillrect firewire_ohci 
sysimgblt firewire_core fb_sys_fops lrw psmouse
[ 3540.846466]  tg3 gf128mul ablk_helper cryptd crc_itu_t ptp ahci drm 
pps_core libahci fjes wmi video
[ 3540.846473] CPU: 2 PID: 2757 Comm: btrfs-transacti Not tainted 
4.4.0-63-generic #84-Ubuntu
[ 3540.846475] Hardware name: To Be Filled By O.E.M. To Be Filled By 
O.E.M./Z77 Extreme6, BIOS P2.80 07/01/2013
[ 3540.846476]  0200 02709bc3 88007615fc90 
813f8083
[ 3540.846478]  88007615fcd8 c048d498 88007615fcc8 
810812d2
[ 3540.846479]  8802adf562f8 8802a9c71800 8800056caef0 


[ 3540.846481] Call Trace:
[ 3540.846486]  [] dump_stack+0x63/0x90
[ 3540.846489]  [] warn_slowpath_common+0x82/0xc0
[ 3540.846491]  [] warn_slowpath_fmt+0x5c/0x80
[ 3540.846500]  [] ? 
__btrfs_run_delayed_refs+0xcdd/0x1220 [btrfs]
[ 3540.846509]  [] btrfs_run_delayed_refs+0x26b/0x2a0 
[btrfs]
[ 3540.846520]  [] commit_cowonly_roots+0x22b/0x2c2 

Re: Large no.of extents for a single small file with a fresh RAID5 setup

2017-02-21 Thread Lakshmipathi.G
> 
> >I'm using Ubuntu(16.04) desktop version (not server) running Xorg and 
> >others. May be its possible its
> >flushing data to disk at constant time. If you like to give it a try again 
> >with servers, below is the
> >exact script along with its timing & multiextent output.
> 
> Archlinux here, with my btrfs vm and browser and music player, still one
> extent.
> 
looks like I have better setup to produce single-extent files than yours :-)
> Are you under heavy memory usage?
> 
no, not as per top and free. 'top' shows mem.usage as:
KiB Mem :  7092516 total,  5961796 free,   794836 used,   335884 buff/cache

where 'compiz' process uses around 2.5% of memory and 'free' command reports
almost 5gb as free. May be some 'sysctl' or others needs to be adjusted on my 
setup?
> >
> >one more curious thing, is it fine to have extents with size 4096 on RAID5 
> >setup?
> >
> 
> Sure, why not?
> 
> Any size aligned to sector size is valid, no matter the profile.
> 
> For 4K extent, RAID5/6 will do read-modify-write, which is very common.
> 
> Thanks,
> Qu
> 
While looking into mapping from logical->physical chunks on RAID5. I was 
wondering 
how these 4KB extents will fit into 64KB data-stripe?

I didn't have idea about read-modify-write, just found few wiki links on it, 
will 
check. thanks

Cheers.
Lakshmipathi.G

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: RAID5:Inject data stripe corruption and verify scrub fixes it.

2017-02-21 Thread Lakshmipathi.G
> >
> >Looked into patch description:
> >
> >After scrubbing dev3 only:
> >0xcdcd (Good)  |  0xcdcd  | 0xcdcd (Bad)
> >(D1)  (D2)(P)
> >
> >So the Parity stripe (P) always get replaced by exact content of D1/D2 
> >(data-stripe)
> >or by random  data?
> 
> Neither. it's just XOR result of D2(never changed, 0xcdcd) and old D1(wrong,
> 0x).
> 0xcdcd XOR 0x = 0xcdcd
> 
> So you got 0xcdcd, bad result.
> 
> If you corrupt D1 with random data, then parity will be random then.
> 
> >If it always  get replaced by exact value from either
> >D1 or D2.  I think current script can be modified to detect that bug. If 
> >parity gets
> >replaced by random value, then it will the make task more difficult.
> 
> Not hard to detect.
> As the content is completely under your control, you know the correct parity
> value, and you can verify it very easy.
> 

version-3 of this script calculates exact data/parity location, instead of 
dumping data 
and searching locations. Tested with upto 8MB file, from the output all 128 
data-stripes 
and 64 parity stripe location seems fine. It constantly hit the parity bug with 
the script.


If the script gets accepted, will add slightly other corruption variants likes:
- corrupt all even stripe (D2,D4..)
- corrupt all odd stripe  (D1,D3..)
- corrupt all parity stripes
- corrupt all both data stripe (D0 & D1) and expect error message
(Cover above cases for RAID6)

thanks.

Cheers.
Lakshmipathi.G

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Large no.of extents for a single small file with a fresh RAID5 setup

2017-02-21 Thread Lakshmipathi.G
> >>How did you create the 2m file?
> >
> >Yes,this is the problem. script invokes 'dd' with 'notrunc' for each
> >byte. If the file is 1kb, 'dd' invoked 1024 times with notrunc.
> >This seems to be creating the issue.
> 
> IIRC that's not the direct cause though.
> 
> Since kernel is using delayed allocation, each time we trigger buffered
> write, kernel just info fs to do accounting and copy the data into page
> cache, no real write is triggered.
> 
> Only sync/fsync and memory pressure will make us to write pages into disc,
> and until then we allocate space for them.
> 
> So unless you're trying such operation on a busy server which triggered
> several sync/fsync/memory pressure during the several seconds of dd, it
> won't cause so many fragments.
> 
> I also tried your dd bs=1 method, no problem and still one single extent.
> 
> Thanks,
> Qu
I'm using Ubuntu(16.04) desktop version (not server) running Xorg and others. 
May be its possible its
flushing data to disk at constant time. If you like to give it a try again with 
servers, below is the
exact script along with its timing & multiextent output.

one more curious thing, is it fine to have extents with size 4096 on RAID5 
setup?

Cheers.
Lakshmipathi.G

#time /root/check_extents.sh f2MB 32
real1m30.284s
user1m22.796s
sys 0m39.324s

laks/btrfs-progs# ./btrfs-debugfs -f tests/mnt/f2MB
(263 0): ram 110592 disk 151015424 disk_size 114688
(263 110592): ram 4096 disk 147103744 disk_size 4096
(263 114688): ram 110592 disk 151130112 disk_size 114688
(263 225280): ram 696320 disk 151257088 disk_size 700416
(263 921600): ram 4096 disk 147107840 disk_size 4096
(263 925696): ram 700416 disk 152043520 disk_size 704512
(263 1626112): ram 4096 disk 147111936 disk_size 4096
(263 1630208): ram 344064 disk 153223168 disk_size 348160
(263 1974272): ram 4096 disk 147116032 disk_size 4096
(263 1978368): ram 118784 disk 152748032 disk_size 118784
file: tests/mnt/f2MB extents 10 disk size 2117632 logical size 2097152 ratio 
0.99

--
$cat /root/check_extents.sh 
#!/bin/bash
#$1 Filename
#$2 Expected no.of data stripes for the file.

create_layout(){
fname=$1
size=$(( $2 * 65536 ))
n=0
bs_value=1
stripe=0
while (( $n < $size ))
do
if [ $(( $n % 65536 )) -eq 0 ]; then
val='D'$stripe
echo -n $val
stripe=$(( $stripe+1 ))
# ensure proper value   
bs_value=`echo "${#val}"` 
else
echo -n 'x'
bs_value=1
fi
n=$(( $n+$bs_value ))
done | dd of="/home/laks/btrfs-progs/tests/mnt/$fname" bs=$bs_value 
conv=notrunc &> /dev/null
##EDIT above hard-coded path
}
create_layout $1 $2
--
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 01/12] btrfs-progs: lowmem check: Fix several bugs related to afterward search

2017-02-21 Thread Qu Wenruo
Since btrfs_search_slot() can point to the slot which is beyond the
leaves' capacity, we should pay extra attention when doing afterward
search.

While for lowmem check, several places use afterward search:
1) Block group item used space check
2) Device item used space check
3) Data extent backref check.

In the following case for block group item check, btrfs lowmem mode
check will skip the block group and report false alert:

leaf 29405184 items 37 free space 1273 generation 11 owner 2
...
item 36 key (77594624 EXTENT_ITEM 2097152)
extent refs 1 gen 8 flags DATA
extent data backref root 5 objectid 265 offset 0 count 1
leaf 29409280 items 43 free space 670 generation 11 owner 2
item 0 key (96468992 EXTENT_ITEM 2097152)
extent refs 1 gen 8 flags DATA
extent data backref root 5 objectid 274 offset 0 count 1
item 1 key (96468992 BLOCK_GROUP_ITEM 33554432)
block group used 2265088 chunk_objectid 256 flags DATA

When checking block group item, we will search key(96468992 0 0) to
start from the first item in the block group.

While search_slot() will point to leaf 29405184, slot 37 which is beyond
leaf capacity.

And when reading key from slot 37, uninitialized data can be read out
and cause us to exit block group item check, leading to false alert.

Fix it by checking path.slot[0] before reading out the key.

Reported-by: Christoph Anton Mitterer 
Reported-by: Chris Murphy 
Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index 37e5ff18..699753fb 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -10591,6 +10591,8 @@ static int check_extent_data_backref(struct 
btrfs_fs_info *fs_info,
leaf = path.nodes[0];
slot = path.slots[0];
 
+   if (slot >= btrfs_header_nritems(leaf))
+   goto next;
btrfs_item_key_to_cpu(leaf, , slot);
if (key.objectid != objectid || key.type != 
BTRFS_EXTENT_DATA_KEY)
break;
@@ -10606,6 +10608,7 @@ static int check_extent_data_backref(struct 
btrfs_fs_info *fs_info,
offset)
found_count++;
 
+next:
ret = btrfs_next_item(root, );
if (ret)
break;
@@ -10878,8 +10881,10 @@ static int check_dev_item(struct btrfs_fs_info 
*fs_info,
 
/* Iterate dev_extents to calculate the used space of a device */
while (1) {
-   btrfs_item_key_to_cpu(path.nodes[0], , path.slots[0]);
+   if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+   goto next;
 
+   btrfs_item_key_to_cpu(path.nodes[0], , path.slots[0]);
if (key.objectid > dev_id)
break;
if (key.type != BTRFS_DEV_EXTENT_KEY || key.objectid != dev_id)
@@ -10976,6 +10981,11 @@ static int check_block_group_item(struct btrfs_fs_info 
*fs_info,
/* Iterate extent tree to account used space */
while (1) {
leaf = path.nodes[0];
+
+   /* Search slot can point to the last item beyond leaf nritems */
+   if (path.slots[0] >= btrfs_header_nritems(leaf))
+   goto next;
+
btrfs_item_key_to_cpu(leaf, _key, path.slots[0]);
if (extent_key.objectid >= bg_key.objectid + bg_key.offset)
break;
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 11/12] btrfs-progs: fsck: Fix lowmem mode override to allow it skip repair work

2017-02-21 Thread Qu Wenruo
From: Lu Fengqi 

Current common.local doesn't handle lowmem mode well.
It passes "--mode=lowmem" along with "--repair", making it unable to
check lowmem mode.

It's caused by the following bugs:

1) Wrong variable in test/common.local
   We should check TEST_ARGS_CHECK, not TEST_CHECK, which is not defined
   so we never return 1.

2) Wrong parameter passed to _cmd_spec() in test/common
   This prevents us from grepping the correct parameters.

Fix it.

Signed-off-by: Lu Fengqi 
---
 tests/common   | 8 
 tests/common.local | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/common b/tests/common
index 51c2e267..7ad436e3 100644
--- a/tests/common
+++ b/tests/common
@@ -106,7 +106,7 @@ run_check()
ins=$(_get_spec_ins "$@")
spec=$(($ins-1))
cmd=$(eval echo "\${$spec}")
-   spec=$(_cmd_spec "$cmd")
+   spec=$(_cmd_spec "${@:$spec}")
set -- "${@:1:$(($ins-1))}" $spec "${@: $ins}"
echo "### $@" >> "$RESULTS" 2>&1
if [[ $TEST_LOG =~ tty ]]; then echo "CMD: $@" > /dev/tty; fi
@@ -128,7 +128,7 @@ run_check_stdout()
ins=$(_get_spec_ins "$@")
spec=$(($ins-1))
cmd=$(eval echo "\${$spec}")
-   spec=$(_cmd_spec "$cmd")
+   spec=$(_cmd_spec "${@:$spec}")
set -- "${@:1:$(($ins-1))}" $spec "${@: $ins}"
echo "### $@" >> "$RESULTS" 2>&1
if [[ $TEST_LOG =~ tty ]]; then echo "CMD(stdout): $@" > /dev/tty; fi
@@ -152,7 +152,7 @@ run_mayfail()
ins=$(_get_spec_ins "$@")
spec=$(($ins-1))
cmd=$(eval echo "\${$spec}")
-   spec=$(_cmd_spec "$cmd")
+   spec=$(_cmd_spec "${@:$spec}")
set -- "${@:1:$(($ins-1))}" $spec "${@: $ins}"
echo "### $@" >> "$RESULTS" 2>&1
if [[ $TEST_LOG =~ tty ]]; then echo "CMD(mayfail): $@" > /dev/tty; fi
@@ -188,7 +188,7 @@ run_mustfail()
ins=$(_get_spec_ins "$@")
spec=$(($ins-1))
cmd=$(eval echo "\${$spec}")
-   spec=$(_cmd_spec "$cmd")
+   spec=$(_cmd_spec "${@:$spec}")
set -- "${@:1:$(($ins-1))}" $spec "${@: $ins}"
echo "### $@" >> "$RESULTS" 2>&1
if [[ $TEST_LOG =~ tty ]]; then echo "CMD(mustfail): $@" > /dev/tty; fi
diff --git a/tests/common.local b/tests/common.local
index 9f567c27..4f56bb08 100644
--- a/tests/common.local
+++ b/tests/common.local
@@ -17,7 +17,7 @@ TEST_ARGS_CHECK=--mode=lowmem
 # break tests
 _skip_spec()
 {
-   if echo "$TEST_CHECK" | grep -q 'mode=lowmem' &&
+   if echo "$TEST_ARGS_CHECK" | grep -q 'mode=lowmem' &&
   echo "$@" | grep -q -- '--repair'; then
return 0
fi
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 12/12] btrfs-progs: cmds-check.c: walk_down_tree_v2 break cause of leaf process

2017-02-21 Thread Qu Wenruo
From: Su Yue 

In lowmem mode, 'walk_down_tree_v2' returns negative values whether
the error is fatal or not. It causes the loop where 'walk_down_tree_v2'
is called to break even if the error is tolerable, and then processing of
subsequent nodes will be skipped.

Fix it by redefining meanings of values 'walk_down_tree_v2' returns.
Do a similar fix for 'process_one_leaf_v2'.

Signed-off-by: Su Yue 
---
 cmds-check.c | 40 +---
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 7d273623..9cc1932c 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -1868,6 +1868,11 @@ static int update_nodes_refs(struct btrfs_root *root, 
u64 bytenr,
 static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path,
unsigned int ext_ref);
 
+/*
+ * Returns >0  Found error, not fatal, should continue process
+ * Returns <0  Fata error, must exit the whole check
+ * returns 0   No error is found
+ */
 static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path 
*path,
   struct node_refs *nrefs, int *level, int ext_ref)
 {
@@ -1937,13 +1942,8 @@ again:
}
 out:
err &= ~LAST_ITEM;
-   /*
-* Convert any error bitmap to -EIO, as we should avoid
-* mixing positive and negative return value to represent
-* error
-*/
if (err && !ret)
-   ret = -EIO;
+   ret = err;
return ret;
 }
 
@@ -2213,6 +2213,11 @@ out:
 static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path,
unsigned int ext_ref);
 
+/*
+ * Returns >0  Found error, should continue
+ * Returns 0   No error is found
+ * Returns <0  Fatal error, must exit the whole check
+ */
 static int walk_down_tree_v2(struct btrfs_root *root, struct btrfs_path *path,
 int *level, struct node_refs *nrefs, int ext_ref)
 {
@@ -5028,8 +5033,9 @@ static int check_fs_root_v2(struct btrfs_root *root, 
unsigned int ext_ref)
struct btrfs_path path;
struct node_refs nrefs;
struct btrfs_root_item *root_item = >root_item;
-   int ret, wret;
+   int ret;
int level;
+   int err = 0;
 
/*
 * We need to manually check the first inode item(256)
@@ -5063,17 +5069,21 @@ static int check_fs_root_v2(struct btrfs_root *root, 
unsigned int ext_ref)
}
 
while (1) {
-   wret = walk_down_tree_v2(root, , , , ext_ref);
-   if (wret < 0)
-   ret = wret;
-   if (wret != 0)
+   ret = walk_down_tree_v2(root, , , , ext_ref);
+   err |= !!ret;
+
+   /* if ret is negative, walk shall stop */
+   if (ret < 0) {
+   ret = err;
break;
+   }
 
-   wret = walk_up_tree_v2(root, , );
-   if (wret < 0)
-   ret = wret;
-   if (wret != 0)
+   ret = walk_up_tree_v2(root, , );
+   if (ret != 0) {
+   /* Normal exit, reset ret to err */
+   ret = err;
break;
+   }
}
 
 out:
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 05/12] btrfs-progs: lowmem check: Fix false alert on inline compressed extent

2017-02-21 Thread Qu Wenruo
Old lowmem check doesn't check if the inline extent is compressed and
always check extent numbytes against inline item size.

And when it finds the extent numbytes mismatch with inline item size it
doesn't output any error message, just return error silently, making it
quite hard to debug.

Fix it by only checking extent numbytes against inline item size when
the extent is not compressed, and output error message.

Reported-by: Christoph Anton Mitterer 
Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index eb146432..cf5a08ce 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -4715,17 +4715,28 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
 
fi = btrfs_item_ptr(node, slot, struct btrfs_file_extent_item);
 
+   /* Check inline extent */
extent_type = btrfs_file_extent_type(node, fi);
-   /* Skip if file extent is inline */
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
struct btrfs_item *e = btrfs_item_nr(slot);
u32 item_inline_len;
 
item_inline_len = btrfs_file_extent_inline_item_len(node, e);
extent_num_bytes = btrfs_file_extent_inline_len(node, slot, fi);
-   if (extent_num_bytes == 0 ||
-   extent_num_bytes != item_inline_len)
+   compressed = btrfs_file_extent_compression(node, fi);
+   if (extent_num_bytes == 0) {
+   error(
+   "root %llu EXTENT_DATA[%llu %llu] has empty inline extent",
+   root->objectid, fkey->objectid, fkey->offset);
err |= FILE_EXTENT_ERROR;
+   }
+   if (!compressed && extent_num_bytes != item_inline_len) {
+   error(
+   "root %llu EXTENT_DATA[%llu %llu] wrong inline size, have: 
%llu, expect: %u",
+   root->objectid, fkey->objectid, fkey->offset,
+   extent_num_bytes, item_inline_len);
+   err |= FILE_EXTENT_ERROR;
+   }
*size += extent_num_bytes;
return err;
}
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 00/12] lowmem mode fixes

2017-02-21 Thread Qu Wenruo
Patches can be fetch from github:
https://github.com/adam900710/btrfs-progs/tree/lowmem_fixes

Thanks for reports from Chris Murphy and Christoph Anton Mitterer,
several new bugs are exposed for lowmem mode fsck.

Special thanks to Christoph, who did rounds of testing during the patch
development.

The following bugs are fixed in lowmem mode:

1) Block group used space false alert
   If a BLOCK_GROUP_ITEM or its first EXTENT/METADATA_ITEM is located at
   the first slot of a leaf, search_slot() used by lowmem mode can
   point to previous leaf, with path->slots[0] beyond valid leaf slots.

   This makes us read out uninitialized data, and can abort the block
   group used space check loop, causing a false alert.

   Fix it with a test case image inside fsck-tests/020/extent-ref-cases
   Also fix all possible backward search.
   Reported by Christoph.

2) Partly written prealloc extent false alert
   If a prealloc extent gets partially written, lowmem mode will report
   prealloc extent shouldn't have csum.

   Lowmem mode passed the wrong variable to the csum checking code, causing
   it to check the whole range of the prealloc extent, which triggers the
   bug.

   Fix it with a test case inside fsck-tests/020/extent-ref-cases.
   Reported by Chris Murphy and Christoph.

3) Extent item size false alert.
   Under certain case, btrfs lowmem mode check reports data backref
   lost.
   It's because newly introduced extent item size check aborts normal
   check routine.

   It can happen if a data/metadata extent item has no inline ref.

   Fix it. The test case was already submitted and merged before, but due
   to fsck-tests framework bugs, it never got called for lowmem mode.

4) Compressed inline data extent
   The extra check on inline data extent length doesn't take compression
   into consideration and will cause false alert without outputting any
   error message.

   Fix it and add correct error message output for it.
   Also fix all possible silent error.
   Reported by Christoph.

5) fsck-tests Lowmem mode override fixes
   Allow lowmem mode override to get called for all tests, and allow
   them all to pass lowmem mode except fsck-tests/006, which is an
   original repair mode bug.


changelog:
  v2:
More generic forward search bug fix, not restricted to block group
item.
Compressed inline extent false alert fix.
Lowmem fsck-test enhance, to allow it really work.
Fix walk_down_tree_v2() to continue after non-fatal errors detected
  v3:
Update the last patch to make it works better for incoming lowmem
mode repair patchset.

Lu Fengqi (1):
  btrfs-progs: fsck: Fix lowmem mode override to allow it skip repair
work

Qu Wenruo (10):
  btrfs-progs: lowmem check: Fix several bugs related to afterward
search
  btrfs-progs: check: Output verbose error when fsck found a bug in any
tree
  btrfs-progs: lowmem check: Fix false alert in checking data extent
csums
  btrfs-progs: lowmem check: Fix extent item size false alert
  btrfs-progs: lowmem check: Fix false alert on inline compressed extent
  btrfs-progs: lowmem check: Fix silent error if first inode item
missing
  btrfs-progs: tests: Move fsck-tests/015 to fuzz tests
  btrfs-progs: fsck-test: Add test image for lowmem mode block group
false alert
  btrfs-progs: fsck-test: Make 013 compatible with lowmem mode
  btrfs-progs: fsck-test: Add new test case for file extent false alerts

Su Yue (1):
  btrfs-progs: cmds-check.c: walk_down_tree_v2 break cause of leaf
process

 cmds-check.c   | 175 +++--
 tests/common   |   8 +-
 tests/common.local |   2 +-
 tests/fsck-tests/013-extent-tree-rebuild/test.sh   |   2 +-
 .../block_group_item_false_alert.raw.xz| Bin 0 -> 47792 bytes
 tests/fsck-tests/020-extent-ref-cases/test.sh  |  15 +-
 tests/fsck-tests/025-file-extents/test.sh  |  42 +
 .../images}/bko-97171-btrfs-image.raw.txt  |   0
 .../images}/bko-97171-btrfs-image.raw.xz   | Bin
 9 files changed, 190 insertions(+), 54 deletions(-)
 create mode 100644 
tests/fsck-tests/020-extent-ref-cases/block_group_item_false_alert.raw.xz
 create mode 100755 tests/fsck-tests/025-file-extents/test.sh
 rename tests/{fsck-tests/015-check-bad-memory-access => 
fuzz-tests/images}/bko-97171-btrfs-image.raw.txt (100%)
 rename tests/{fsck-tests/015-check-bad-memory-access => 
fuzz-tests/images}/bko-97171-btrfs-image.raw.xz (100%)

-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 03/12] btrfs-progs: lowmem check: Fix false alert in checking data extent csums

2017-02-21 Thread Qu Wenruo
Btrfs lowmem check can report false csum error like:
ERROR: root 5 EXTENT_DATA[257 0] datasum missing
ERROR: root 5 EXTENT_DATA[257 4096] prealloc shouldn't have datasum

This is because lowmem check code always compares the found csum size
with the whole extent which the data extent points to.

Normally it's OK, but when a prealloc extent is written, or reflink is
done, a data extent can point to part of a larger extent, making the csum
check wrong.

To fix it, the csum check part is modified to handle plain and
compressed extents in different ways:

1) Plain extent
   Only search csums for the range it refers to.
   So the search range is from (disk_bytenr + extent_offset) and search
   length is (extent_num_bytes)

2) Compressed extent
   Search the whole extent.
   Search range is from (disk_bytenr) and search length is
   (disk_num_bytes)

Reported-by: Chris Murphy 
Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 47 ---
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 107359f8..e99e3c36 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -4703,9 +4703,13 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
u64 disk_bytenr;
u64 disk_num_bytes;
u64 extent_num_bytes;
-   u64 found;
+   u64 extent_offset;
+   u64 csum_found; /* In byte size, sectorsize aligned */
+   u64 search_start;   /* Logical range start we search for csum */
+   u64 search_len; /* Logical range len we search for csum */
unsigned int extent_type;
unsigned int is_hole;
+   int compressed = 0;
int ret;
int err = 0;
 
@@ -4739,24 +4743,45 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
disk_bytenr = btrfs_file_extent_disk_bytenr(node, fi);
disk_num_bytes = btrfs_file_extent_disk_num_bytes(node, fi);
extent_num_bytes = btrfs_file_extent_num_bytes(node, fi);
+   extent_offset = btrfs_file_extent_offset(node, fi);
+   compressed = btrfs_file_extent_compression(node, fi);
is_hole = (disk_bytenr == 0) && (disk_num_bytes == 0);
 
-   /* Check EXTENT_DATA datasum */
-   ret = count_csum_range(root, disk_bytenr, disk_num_bytes, );
-   if (found > 0 && nodatasum) {
+   /*
+* Check EXTENT_DATA datasum
+*
+* For plain(uncompressed) extent, we should only check the range
+* we're referring to, as it's possible that part of prealloc extent
+* has been written, and has csum:
+*
+* |<-Original large preallocate extent A >|
+* |<- Prealloc File Extent ->|<- Regular Extent ->|
+*  No csum Has csum
+*
+* For compressed extent, we should check the whole range.
+*/
+   if (!compressed) {
+   search_start = disk_bytenr + extent_offset;
+   search_len = extent_num_bytes;
+   } else {
+   search_start = disk_bytenr;
+   search_len = disk_num_bytes;
+   }
+   ret = count_csum_range(root, search_start, search_len, _found);
+   if (csum_found > 0 && nodatasum) {
err |= ODD_CSUM_ITEM;
error("root %llu EXTENT_DATA[%llu %llu] nodatasum shouldn't 
have datasum",
  root->objectid, fkey->objectid, fkey->offset);
} else if (extent_type == BTRFS_FILE_EXTENT_REG && !nodatasum &&
-  !is_hole &&
-  (ret < 0 || found == 0 || found < disk_num_bytes)) {
+  !is_hole && (ret < 0 || csum_found < search_len)) {
err |= CSUM_ITEM_MISSING;
-   error("root %llu EXTENT_DATA[%llu %llu] datasum missing",
- root->objectid, fkey->objectid, fkey->offset);
-   } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC && found > 0) {
+   error("root %llu EXTENT_DATA[%llu %llu] datasum missing, have: 
%llu, expect: %llu",
+ root->objectid, fkey->objectid, fkey->offset,
+ csum_found, search_len);
+   } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC && csum_found > 0) 
{
err |= ODD_CSUM_ITEM;
-   error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have 
datasum",
- root->objectid, fkey->objectid, fkey->offset);
+   error("root %llu EXTENT_DATA[%llu %llu] prealloc shouldn't have 
datasum, but have: %llu",
+ root->objectid, fkey->objectid, fkey->offset, csum_found);
}
 
/* Check EXTENT_DATA hole */
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 08/12] btrfs-progs: fsck-test: Add test image for lowmem mode block group false alert

2017-02-21 Thread Qu Wenruo
Add a minimal image which can reproduce the block group used space
false alert for lowmem mode fsck.

Reported-by: Christoph Anton Mitterer 
Signed-off-by: Qu Wenruo 
---
 .../block_group_item_false_alert.raw.xz | Bin 0 -> 47792 bytes
 tests/fsck-tests/020-extent-ref-cases/test.sh   |  15 +++
 2 files changed, 11 insertions(+), 4 deletions(-)
 create mode 100644 
tests/fsck-tests/020-extent-ref-cases/block_group_item_false_alert.raw.xz

diff --git 
a/tests/fsck-tests/020-extent-ref-cases/block_group_item_false_alert.raw.xz 
b/tests/fsck-tests/020-extent-ref-cases/block_group_item_false_alert.raw.xz
new file mode 100644
index 
..559c3fa9e8491f3ce1f424d1baef29853e8fb889
GIT binary patch
literal 47792
zcmeHwWmukDmL={If(LhZcL)S`cXxMpcPD`W!5xAV+%>qnyL)h$?&&{0(>=H9R^2;Q
z-Ti*Qd7ksJzrEL4dz~#W9G>c$ARrKXb9JI%AVi?JARr*`#??pO-s~W@bwEJuUf$mF
ze!PkCi=#Vo87MR+Qtsu2LO-1C;0I1K^l|foN>%#~MMlandk9peSuN#)cCI-~7;EQFhOF(7$wFo%;?(r9$c-B&8)gSv5^r*S24&1YlJ6HO@I5-X
zMm=+|4QW$f92}?pM0A{MA>fqJm@%6us;Ff@N+Zs65^bwee?I@KoP9G|&?dIU)1?`f
z4M(i5PN{Y3DY4oU7paaNp*_2dZ+Ar<+4GVa;iOf(R@$4vz3=WTw4%Jkc
z!0_XpyKSs!ByoH#K|!dqiZ7C;7K1l`aP_3bf$)5#L+LuAGa|IOgTifxCf#-T
z(LGUl?exf#!FLTA!OAMf+%4N%IaCqW+ls=Oz>+P64u$MzZO@a6fKyEAtKyH|ZzTJ1
z*|gv(!ggvf5O+#vp3dCF*euT_%Nqvn5fg>bjl!loHUf={<3BOM^fj`sHr(}n!w6Sx
z<4%=oG1p|C<=pua55w8;Mmk}y5>!*%yJm=d@BFM~bu>Z;Bl^OemlRq0vRWT*
z;f-!nqvd8#=lnh`q=6x=?6R^$o3BK>LJg(DBH#fXFhC{@k)T|K4-?>G0QV
zxf(2#%(dgu^iowEz
z)L6@2g|d8oc7Fl-k)%ZPE<3kpw;LQS*YWK$lN_o*K77Tb$paUxyWq22XaOtZ18eSQ
zMqILf0b$pH{Sp!{@No?e{5*~6gfJsqRO;@NV!iQb%Sfj<__U|Z=2
z4*U1Y;MNqn1oJ=IJV$_z3Lo1o9%;X6r?5EGr(Y>aT5{)bHdH?dp_c?*-yAO=J4$sO
z4u{Xr5V_oaV@Jg)vR3?ET?4?@fb!e2q_l*kFBY~
zwb_=-uWI5eely{$8k7NHJJPmz5Lxks?v4KVD_-+hH3gQoM(ClbqI`eGt`;QB
z!@yF-^OGWt^C-R+;m3znCBAzqV^|*lVQTC3dY+5kHhR5MWPT~rkMXETZ|{=s9o#kU
zyq#mj*g7HQPgqD}tJ@T>`_h6?C(FxC;xFQkx3qTiL-e=V9jyAeiXkK+*BPQ%%|m>5
zAz11uSfU_pm4sK*lMy3Ew
zIw5h-Xv@8+{3Jt^I{2ZLo(QynUBs`Dw@U?rp}-Jq4nqjUeMF9Su_z_K&3J;JIKR3h
zJnHIF?)oPi-@hgtNkxu`nmG|Rz64RdgxRQpIj9MESq-C|p2PY16{)6&!xYXr2DZw7
zfZ?sINZXU|!sX3GlmeYoA8^HwSh!NquhCp;j+(d^-Na5g!7|gGY9{S#coB|-V;9)d
zs&+3Hy=|YI`%Jzow$F^;F=I(EcMG$SaPF!q)$Mij6bC+oKHNRb+G5FE?du@w07C+5
zgQ~_Vho9B{-J9t{){ht_Tl;TCBzh;5!y%I?sgb_a30D*@ci5d!sWPcA3zf6+$h~Ar
z_qPe`OppatXLeooGGYjm2`Hr%`oH;0GN~(^X1%=wr2#o{QYw1+lTZ6yesd2$`j$c|
zid^#Sz4r8#kzTbkF625Wpwr}U`bYn$qZN<^YO>yfz=}ZlFJbpc6RGCYOt%lV%mo*s
zC4_j{paX6t?|9|*zL?oVd*UF_c3!Q9drg?vVEF2l@-$W_Y662zU=p$d>UDN~E6I
zYrw-e=Unf94wBWIE(;6O`H~559gnEDaexalAs>$sDKU<_laNa%f6Vu>?w2u*m?CG>`vojw-wps>>5Yiz$-4Tt~dqC+|C*u(W|;)pDn5VK@h26sZbjj~ufR-RwPba=ALKKpZxs8GDUAZUV~jLq>q
zL@Ml#ZV*$rg*Bf~)xY_r|tgr+QO)qHz=Fr4~zx0*(A3wm2%I^%Dp*iF(q
zlh7tgvq>7fh}FEL(D}As-EPcsI346#EGxseSh@dg9{OdhGKZezepni26VK0#$kZlSpmj2Hq!yrq798ZkQlFH*X_A*aB48vIwR6RhZvH88_D8wj?+Wqz
z^~{wDAiDqW3+Xq2<=>(kfW!au1_uoGJ1GxfFu-8Hn~MEWR|J4z|5+9SgybI`lE|P<
zARVs(pQwi++UeY2$9zDD5SH59g7}Mn3=kmm6EKY6JU+=

[PATCH v3 10/12] btrfs-progs: fsck-test: Add new test case for file extent false alerts

2017-02-21 Thread Qu Wenruo
Lowmem mode exposed several false alerts, all related to file extents
check.

1) Partly written prealloc extent
   Cause lowmem mode to report missing csum or prealloc extent should
   not have csum

2) Compressed inline extent
   Cause lowmem mode to find mismatch on inline len and item len.
   While no error message is output but exit silently.

Reported-by: Chris Murphy 
Reported-by: Christoph Anton Mitterer 
Signed-off-by: Qu Wenruo 
---
 tests/fsck-tests/025-file-extents/test.sh | 42 +++
 1 file changed, 42 insertions(+)
 create mode 100755 tests/fsck-tests/025-file-extents/test.sh

diff --git a/tests/fsck-tests/025-file-extents/test.sh 
b/tests/fsck-tests/025-file-extents/test.sh
new file mode 100755
index ..cb64c500
--- /dev/null
+++ b/tests/fsck-tests/025-file-extents/test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Confirm btrfs check can check file extents without causing false alert
+
+source $TOP/tests/common
+
+check_prereq btrfs
+check_prereq mkfs.btrfs
+check_global_prereq xfs_io
+check_global_prereq fallocate
+
+setup_root_helper
+prepare_test_dev 128M
+
+# Do some write into a large prealloc range
+# Lowmem mode can report missing csum due to wrong csum range
+test_paritical_write_into_prealloc()
+{
+   run_check $SUDO_HELPER "$TOP/mkfs.btrfs" -f "$TEST_DEV"
+   run_check_mount_test_dev
+
+   run_check fallocate -l 128K "$TEST_MNT/file"
+   sync
+   run_check xfs_io -c "pwrite 0 64K" "$TEST_MNT/file"
+   run_check_umount_test_dev
+   run_check "$TOP/btrfs" check "$TEST_DEV"
+}
+
+# Inline compressed file extent
+# Lowmem mode can cause silent error without any error message
+# due to too restrict check on inline extent size
+test_compressed_inline_extent()
+{
+   run_check $SUDO_HELPER "$TOP/mkfs.btrfs" -f "$TEST_DEV"
+   run_check_mount_test_dev -o compress=lzo,max_inline=2048
+
+   run_check xfs_io -f -c "pwrite 0 1K" "$TEST_MNT/file"
+   run_check_umount_test_dev
+   run_check "$TOP/btrfs" check "$TEST_DEV"
+}
+
+test_paritical_write_into_prealloc
+test_compressed_inline_extent
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 04/12] btrfs-progs: lowmem check: Fix extent item size false alert

2017-02-21 Thread Qu Wenruo
If one extent item has no inline ref, btrfs lowmem mode check can give
false alert without outputting any error message.

The problem is that lowmem mode always assumes that an extent item has
inline refs, and when it encounters such a case it flags the extent item
as having the wrong size, but doesn't output the error message.

Although we already have such image submitted, at the commit time due to
another bug in cmds-check return value, it doesn't detect it until that
bug is fixed.

Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index e99e3c36..eb146432 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -10761,13 +10761,20 @@ static int check_extent_item(struct btrfs_fs_info 
*fs_info,
}
end = (unsigned long)ei + item_size;
 
-   if (ptr >= end) {
+next:
+   /* Reached extent item end normally */
+   if (ptr == end)
+   goto out;
+
+   /* Beyond extent item end, wrong item size */
+   if (ptr > end) {
err |= ITEM_SIZE_MISMATCH;
+   error("extent item at bytenr %llu slot %d has wrong size",
+   eb->start, slot);
goto out;
}
 
/* Now check every backref in this extent item */
-next:
iref = (struct btrfs_extent_inline_ref *)ptr;
type = btrfs_extent_inline_ref_type(eb, iref);
offset = btrfs_extent_inline_ref_offset(eb, iref);
@@ -10804,8 +10811,7 @@ next:
}
 
ptr += btrfs_extent_inline_ref_size(type);
-   if (ptr < end)
-   goto next;
+   goto next;
 
 out:
return err;
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 06/12] btrfs-progs: lowmem check: Fix silent error if first inode item missing

2017-02-21 Thread Qu Wenruo
If first inode item is missing, lowmem check will detect it but not
output any error message.

Add error message for it.

Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmds-check.c b/cmds-check.c
index cf5a08ce..7d273623 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -5001,6 +5001,8 @@ static int check_fs_first_inode(struct btrfs_root *root, 
unsigned int ext_ref)
if (ret > 0) {
ret = 0;
err |= INODE_ITEM_MISSING;
+   error("first inode item of root %llu is missing",
+ root->objectid);
}
 
err |= check_inode_item(root, , ext_ref);
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 07/12] btrfs-progs: tests: Move fsck-tests/015 to fuzz tests

2017-02-21 Thread Qu Wenruo
The test case fsck-tests/015-check-bad-memory-access can't be repaired by
btrfs check, and it's only a fortunate bug that makes original mode forget
the error code from extent tree, making original mode pass it.

So fuzz-tests is more suitable for it.

Signed-off-by: Qu Wenruo 
---
 .../images}/bko-97171-btrfs-image.raw.txt   |   0
 .../images}/bko-97171-btrfs-image.raw.xz| Bin
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/{fsck-tests/015-check-bad-memory-access => 
fuzz-tests/images}/bko-97171-btrfs-image.raw.txt (100%)
 rename tests/{fsck-tests/015-check-bad-memory-access => 
fuzz-tests/images}/bko-97171-btrfs-image.raw.xz (100%)

diff --git 
a/tests/fsck-tests/015-check-bad-memory-access/bko-97171-btrfs-image.raw.txt 
b/tests/fuzz-tests/images/bko-97171-btrfs-image.raw.txt
similarity index 100%
rename from 
tests/fsck-tests/015-check-bad-memory-access/bko-97171-btrfs-image.raw.txt
rename to tests/fuzz-tests/images/bko-97171-btrfs-image.raw.txt
diff --git 
a/tests/fsck-tests/015-check-bad-memory-access/bko-97171-btrfs-image.raw.xz 
b/tests/fuzz-tests/images/bko-97171-btrfs-image.raw.xz
similarity index 100%
rename from 
tests/fsck-tests/015-check-bad-memory-access/bko-97171-btrfs-image.raw.xz
rename to tests/fuzz-tests/images/bko-97171-btrfs-image.raw.xz
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 02/12] btrfs-progs: check: Output verbose error when fsck found a bug in any tree

2017-02-21 Thread Qu Wenruo
Although we output errors like "errors found in extent allocation tree or
chunk allocation", we lack such output for other trees, leaving
the final "found error is %d" to catch the last return value (and
sometimes it's cleared).

This patch adds extra error message for top level error path, and modify
the last "found error is %d" to "error(s) found" or "no error found".

Cc: Christoph Anton Mitterer 
Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 43 +--
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 699753fb..107359f8 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -12933,8 +12933,10 @@ int cmd_check(int argc, char **argv)
 
ret = repair_root_items(info);
err |= !!ret;
-   if (ret < 0)
+   if (ret < 0) {
+   error("failed to repair root items: %s", strerror(-ret));
goto close_out;
+   }
if (repair) {
fprintf(stderr, "Fixed %d roots.\n", ret);
ret = 0;
@@ -12957,8 +12959,13 @@ int cmd_check(int argc, char **argv)
}
ret = check_space_cache(root);
err |= !!ret;
-   if (ret)
+   if (ret) {
+   if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
+   error("errors found in free space tree");
+   else
+   error("errors found in free space cache");
goto out;
+   }
 
/*
 * We used to have to have these hole extents in between our real
@@ -12974,22 +12981,28 @@ int cmd_check(int argc, char **argv)
else
ret = check_fs_roots(root, _cache);
err |= !!ret;
-   if (ret)
+   if (ret) {
+   error("errors found in fs roots");
goto out;
+   }
 
fprintf(stderr, "checking csums\n");
ret = check_csums(root);
err |= !!ret;
-   if (ret)
+   if (ret) {
+   error("errors found in csum tree");
goto out;
+   }
 
fprintf(stderr, "checking root refs\n");
/* For low memory mode, check_fs_roots_v2 handles root refs */
if (check_mode != CHECK_MODE_LOWMEM) {
ret = check_root_refs(root, _cache);
err |= !!ret;
-   if (ret)
+   if (ret) {
+   error("errors found in root refs");
goto out;
+   }
}
 
while (repair && !list_empty(>fs_info->recow_ebs)) {
@@ -13000,8 +13013,10 @@ int cmd_check(int argc, char **argv)
list_del_init(>recow);
ret = recow_extent_buffer(root, eb);
err |= !!ret;
-   if (ret)
+   if (ret) {
+   error("fails to fix transid errors");
break;
+   }
}
 
while (!list_empty(_items)) {
@@ -13020,13 +13035,17 @@ int cmd_check(int argc, char **argv)
fprintf(stderr, "checking quota groups\n");
ret = qgroup_verify_all(info);
err |= !!ret;
-   if (ret)
+   if (ret) {
+   error("failed to check quota groups");
goto out;
+   }
report_qgroups(0);
ret = repair_qgroups(info, _repaired);
err |= !!ret;
-   if (err)
+   if (err) {
+   error("failed to repair quota groups");
goto out;
+   }
ret = 0;
}
 
@@ -13047,8 +13066,12 @@ out:
   "backup data and re-format the FS. *\n\n");
err |= 1;
}
-   printf("found %llu bytes used err is %d\n",
-  (unsigned long long)bytes_used, ret);
+   printf("found %llu bytes used, ",
+  (unsigned long long)bytes_used);
+   if (err)
+   printf("error(s) found\n");
+   else
+   printf("no error found\n");
printf("total csum bytes: %llu\n",(unsigned long long)total_csum_bytes);
printf("total tree bytes: %llu\n",
   (unsigned long long)total_btree_bytes);
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 09/12] btrfs-progs: fsck-test: Make 013 compatible with lowmem mode

2017-02-21 Thread Qu Wenruo
fsck-tests/013-extent-tree-rebuild uses "--init-extent-tree", which
implies "--repair".

But the test script doesn't specify "--repair" for lowmem mode test to
detect it.

Add it so lowmem mode test can be happy with it.

Signed-off-by: Qu Wenruo 
---
 tests/fsck-tests/013-extent-tree-rebuild/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fsck-tests/013-extent-tree-rebuild/test.sh 
b/tests/fsck-tests/013-extent-tree-rebuild/test.sh
index 37bdcd9c..08c1e50e 100755
--- a/tests/fsck-tests/013-extent-tree-rebuild/test.sh
+++ b/tests/fsck-tests/013-extent-tree-rebuild/test.sh
@@ -36,7 +36,7 @@ test_extent_tree_rebuild()
 
$SUDO_HELPER $TOP/btrfs check $TEST_DEV >& /dev/null && \
_fail "btrfs check should detect failure"
-   run_check $SUDO_HELPER $TOP/btrfs check --init-extent-tree $TEST_DEV
+   run_check $SUDO_HELPER $TOP/btrfs check --repair --init-extent-tree 
$TEST_DEV
run_check $SUDO_HELPER $TOP/btrfs check $TEST_DEV
 }
 
-- 
2.11.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mount troubles after crash

2017-02-21 Thread Patrick Schmid

Hi Qu


A known bug when qgroup is enabled.
Fixed in v4.10-rcs.

Please mount using v4.10 kernels and it will mount without problem.


I have rebooted with the v4.10 kernel and everything works again!
Thank you very much, you saved me a lot of work!

Regards Patrick
--
Patrick Schmid   support: +41 44 633 2668
IT Services Group, HPT H 8voice:   +41 44 633 3997
Departement Physik, ETH Zurich
CH-8093 Zurich, Switzerland
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] btrfs: Handle delalloc error correctly to avoid deadlock

2017-02-21 Thread Qu Wenruo
If btrfs/125 is run with the nospace_cache or space_cache=v2 mount option,
btrfs will block with the following backtrace:

Call Trace:
 __schedule+0x2d4/0xae0
 schedule+0x3d/0x90
 btrfs_start_ordered_extent+0x160/0x200 [btrfs]
 ? wake_atomic_t_function+0x60/0x60
 btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
 btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
 btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
 process_one_work+0x2af/0x720
 ? process_one_work+0x22b/0x720
 worker_thread+0x4b/0x4f0
 kthread+0x10f/0x150
 ? process_one_work+0x720/0x720
 ? kthread_create_on_node+0x40/0x40
 ret_from_fork+0x2e/0x40

The direct cause is that the error handler in run_delalloc_nocow() doesn't
handle errors from btrfs_reloc_clone_csums() well.

The error handler of run_delalloc_nocow() will clear dirty and finish IO
for the pages in that extent.
However, we have already inserted one ordered extent.
And that ordered extent relies on endio hooks to wait for all its pages
to finish, while only the first page will finish.

This makes that ordered extent never finish, thus blocking the file
system.

Although the root cause is still in RAID5/6, it won't hurt to fix the
error routine first.

This patch slightly modifies one existing function,
btrfs_endio_direct_write_update_ordered(), to handle the free space inode,
just like btrfs_writepage_end_io_hook() does.

It then uses that function as the base to implement one inline function,
btrfs_cleanup_ordered_extents(), to handle the error in
run_delalloc_nocow() and cow_file_range().

For compression, it calls writepage_end_io_hook() itself to handle
its error, and any submitted ordered extent will have its bio submitted,
so there is no need to worry about the compression part.

Suggested-by: Filipe Manana 
Signed-off-by: Qu Wenruo 
---
v2:
  Add BTRFS_ORDERED_SKIP_METADATA flag to avoid double reducing
  outstanding extents, which is already done by
  extent_clear_unlock_delalloc()
---
 fs/btrfs/inode.c| 75 +
 fs/btrfs/ordered-data.h |  2 ++
 2 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e861a063721..a0b09ff73eae 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -116,6 +116,41 @@ static struct extent_map *create_pinned_em(struct inode 
*inode, u64 start,
 
 static int btrfs_dirty_inode(struct inode *inode);
 
+static void __endio_write_update_ordered(struct inode *inode,
+const u64 offset,
+const u64 bytes,
+const int uptodate,
+const int skip_meta);
+static inline void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+  const u64 offset,
+  const u64 bytes,
+  const int uptodate)
+{
+   return __endio_write_update_ordered(inode, offset, bytes, uptodate, 0);
+}
+
+/*
+ * Set error bit and cleanup all ordered extents in specified range of @inode.
+ *
+ * This is for error case where ordered extent(s) is submitted but
+ * corresponding bio is not submitted.
+ * This can make waiter on such ordered extent never finish, as there is no
+ * endio hook called to finish such ordered extent.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+const u64 offset,
+const u64 bytes)
+{
+   /*
+* In error handler, we have extent_clear_unlock_delalloc() called
+* to reduce our metadata space reservation and outstanding extents.
+*
+* So here, we don't need finish_ordered_io() to free metadata space
+* for us, or we will underflow outstanding extents.
+*/
+   return __endio_write_update_ordered(inode, offset, bytes, 0, 1);
+}
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_inode_set_ops(struct inode *inode)
 {
@@ -237,7 +272,6 @@ static int insert_inline_extent(struct btrfs_trans_handle 
*trans,
return err;
 }
 
-
 /*
  * conditionally insert an inline extent into the file.  This
  * does the checks required to make sure the data is small enough
@@ -1096,6 +1130,7 @@ static noinline int cow_file_range(struct inode *inode,
 EXTENT_DELALLOC | EXTENT_DEFRAG,
 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+   btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
goto out;
 }
 
@@ -1538,7 +1573,7 @@ static noinline int run_delalloc_nocow(struct inode 
*inode,
if (!ret)
ret = err;
 
-   if (ret && cur_offset < end)
+   if (ret && cur_offset < end) {

[PATCH v2] btrfs: Handle delalloc error correctly to avoid deadlock

2017-02-21 Thread Qu Wenruo
If btrfs/125 is run with the nospace_cache or space_cache=v2 mount option,
btrfs will block with the following backtrace:

Call Trace:
 __schedule+0x2d4/0xae0
 schedule+0x3d/0x90
 btrfs_start_ordered_extent+0x160/0x200 [btrfs]
 ? wake_atomic_t_function+0x60/0x60
 btrfs_run_ordered_extent_work+0x25/0x40 [btrfs]
 btrfs_scrubparity_helper+0x1c1/0x620 [btrfs]
 btrfs_flush_delalloc_helper+0xe/0x10 [btrfs]
 process_one_work+0x2af/0x720
 ? process_one_work+0x22b/0x720
 worker_thread+0x4b/0x4f0
 kthread+0x10f/0x150
 ? process_one_work+0x720/0x720
 ? kthread_create_on_node+0x40/0x40
 ret_from_fork+0x2e/0x40

The direct cause is that the error handler in run_delalloc_nocow() doesn't
handle errors from btrfs_reloc_clone_csums() well.

The error handler of run_delalloc_nocow() will clear dirty and finish IO
for the pages in that extent.
However, we have already inserted one ordered extent.
And that ordered extent relies on endio hooks to wait for all its pages
to finish, while only the first page will finish.

This makes that ordered extent never finish, thus blocking the file
system.

Although the root cause is still in RAID5/6, it won't hurt to fix the
error routine first.

This patch slightly modifies one existing function,
btrfs_endio_direct_write_update_ordered(), to handle the free space inode,
just like btrfs_writepage_end_io_hook() does.

It then uses that function as the base to implement one inline function,
btrfs_cleanup_ordered_extents(), to handle the error in
run_delalloc_nocow() and cow_file_range().

For compression, it calls writepage_end_io_hook() itself to handle
its error, and any submitted ordered extent will have its bio submitted,
so there is no need to worry about the compression part.

Suggested-by: Filipe Manana 
Signed-off-by: Qu Wenruo 

Signed-off-by: Qu Wenruo 
---
v2:
  Add BTRFS_ORDERED_SKIP_METADATA flag to avoid double reducing
  outstanding extents, which is already done by
  extent_clear_unlock_delalloc()
---
 fs/btrfs/inode.c| 75 +
 fs/btrfs/ordered-data.h |  2 ++
 2 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e861a063721..a0b09ff73eae 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -116,6 +116,41 @@ static struct extent_map *create_pinned_em(struct inode 
*inode, u64 start,
 
 static int btrfs_dirty_inode(struct inode *inode);
 
+static void __endio_write_update_ordered(struct inode *inode,
+const u64 offset,
+const u64 bytes,
+const int uptodate,
+const int skip_meta);
+static inline void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+  const u64 offset,
+  const u64 bytes,
+  const int uptodate)
+{
+   return __endio_write_update_ordered(inode, offset, bytes, uptodate, 0);
+}
+
+/*
+ * Set error bit and cleanup all ordered extents in specified range of @inode.
+ *
+ * This is for error case where ordered extent(s) is submitted but
+ * corresponding bio is not submitted.
+ * This can make waiter on such ordered extent never finish, as there is no
+ * endio hook called to finish such ordered extent.
+ */
+static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
+const u64 offset,
+const u64 bytes)
+{
+   /*
+* In error handler, we have extent_clear_unlock_delalloc() called
+* to reduce our metadata space reservation and outstanding extents.
+*
+* So here, we don't need finish_ordered_io() to free metadata space
+* for us, or we will underflow outstanding extents.
+*/
+   return __endio_write_update_ordered(inode, offset, bytes, 0, 1);
+}
+
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 void btrfs_test_inode_set_ops(struct inode *inode)
 {
@@ -237,7 +272,6 @@ static int insert_inline_extent(struct btrfs_trans_handle 
*trans,
return err;
 }
 
-
 /*
  * conditionally insert an inline extent into the file.  This
  * does the checks required to make sure the data is small enough
@@ -1096,6 +1130,7 @@ static noinline int cow_file_range(struct inode *inode,
 EXTENT_DELALLOC | EXTENT_DEFRAG,
 PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+   btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
goto out;
 }
 
@@ -1538,7 +1573,7 @@ static noinline int run_delalloc_nocow(struct inode 
*inode,
if (!ret)
ret = err;
 
-   if (ret && cur_offset < end)
+