[PATCH] btrfs: Remove unused function btrfs_account_dev_extents_size()

2018-07-08 Thread Qu Wenruo
This function is never used by any one in kernel, just remove it.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/volumes.c | 85 --
 fs/btrfs/volumes.h |  2 --
 2 files changed, 87 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b33bf29130b6..e6a8e4aabc66 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1259,91 +1259,6 @@ int btrfs_scan_one_device(const char *path, fmode_t 
flags, void *holder,
return ret;
 }
 
-/* helper to account the used device space in the range */
-int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
-  u64 end, u64 *length)
-{
-   struct btrfs_key key;
-   struct btrfs_root *root = device->fs_info->dev_root;
-   struct btrfs_dev_extent *dev_extent;
-   struct btrfs_path *path;
-   u64 extent_end;
-   int ret;
-   int slot;
-   struct extent_buffer *l;
-
-   *length = 0;
-
-   if (start >= device->total_bytes ||
-   test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
-   return 0;
-
-   path = btrfs_alloc_path();
-   if (!path)
-   return -ENOMEM;
-   path->reada = READA_FORWARD;
-
-   key.objectid = device->devid;
-   key.offset = start;
-   key.type = BTRFS_DEV_EXTENT_KEY;
-
-   ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-   if (ret < 0)
-   goto out;
-   if (ret > 0) {
-   ret = btrfs_previous_item(root, path, key.objectid, key.type);
-   if (ret < 0)
-   goto out;
-   }
-
-   while (1) {
-   l = path->nodes[0];
-   slot = path->slots[0];
-   if (slot >= btrfs_header_nritems(l)) {
-   ret = btrfs_next_leaf(root, path);
-   if (ret == 0)
-   continue;
-   if (ret < 0)
-   goto out;
-
-   break;
-   }
-   btrfs_item_key_to_cpu(l, &key, slot);
-
-   if (key.objectid < device->devid)
-   goto next;
-
-   if (key.objectid > device->devid)
-   break;
-
-   if (key.type != BTRFS_DEV_EXTENT_KEY)
-   goto next;
-
-   dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-   extent_end = key.offset + btrfs_dev_extent_length(l,
- dev_extent);
-   if (key.offset <= start && extent_end > end) {
-   *length = end - start + 1;
-   break;
-   } else if (key.offset <= start && extent_end > start)
-   *length += extent_end - start;
-   else if (key.offset > start && extent_end <= end)
-   *length += extent_end - key.offset;
-   else if (key.offset > start && key.offset <= end) {
-   *length += end - key.offset + 1;
-   break;
-   } else if (key.offset > end)
-   break;
-
-next:
-   path->slots[0]++;
-   }
-   ret = 0;
-out:
-   btrfs_free_path(path);
-   return ret;
-}
-
 static int contains_pending_extent(struct btrfs_transaction *transaction,
   struct btrfs_device *device,
   u64 *start, u64 len)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 77e6004b6cb9..6d4f38ad9f5c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -384,8 +384,6 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
 }
 
-int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
-  u64 end, u64 *length);
 void btrfs_get_bbio(struct btrfs_bio *bbio);
 void btrfs_put_bbio(struct btrfs_bio *bbio);
 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] btrfs-progs: tests/fuzz: Add image for bko-200409

2018-07-08 Thread Qu Wenruo
Reported-by: Xu Wen 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200409
Signed-off-by: Qu Wenruo 
---
 tests/fuzz-tests/images/bko-200409.raw.txt | 125 +
 tests/fuzz-tests/images/bko-200409.raw.xz  | Bin 0 -> 24480 bytes
 2 files changed, 125 insertions(+)
 create mode 100644 tests/fuzz-tests/images/bko-200409.raw.txt
 create mode 100644 tests/fuzz-tests/images/bko-200409.raw.xz

diff --git a/tests/fuzz-tests/images/bko-200409.raw.txt 
b/tests/fuzz-tests/images/bko-200409.raw.txt
new file mode 100644
index ..7df7924370eb
--- /dev/null
+++ b/tests/fuzz-tests/images/bko-200409.raw.txt
@@ -0,0 +1,125 @@
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=200409
+Wen Xu 2018-07-04 17:47:09 UTC
+
+Created attachment 277173 [details]
+The (compressed) crafted image which causes crash
+
+- Reproduce
+# mkdir mnt
+# mount -t btrfs 5.img mnt
+
+- Kernel message
+[  333.770743] BTRFS: device fsid 3381d111-94a3-4ac7-8f39-611bbbdab7e6 devid 1 
transid 8 /dev/loop0
+[  333.779221] BTRFS info (device loop0): disk space caching is enabled
+[  333.779234] BTRFS info (device loop0): has skinny extents
+[  333.798081] [ cut here ]
+[  333.798090] kernel BUG at fs/btrfs/volumes.c:6564!
+[  333.799293] invalid opcode:  [#1] SMP KASAN PTI
+[  333.800355] CPU: 0 PID: 1353 Comm: mount Not tainted 4.18.0-rc1+ #8
+[  333.801652] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
Ubuntu-1.8.2-1ubuntu1 04/01/2014
+[  333.803658] RIP: 0010:read_one_chunk+0x77c/0x880
+[  333.804630] Code: e8 a9 82 fd ff 48 8b 95 70 ff ff ff 48 8b bd 60 ff ff ff 
b9 01 00 00 00 4c 89 f6 e8 2e 14 ff ff b8 fe ff ff ff e9 cb fe ff ff <0f> 0b 48 
8b bd 38 ff ff ff e8 76 82 fd ff e9 35 ff ff ff 48 8b 95
+[  333.808462] RSP: 0018:8801eedf7230 EFLAGS: 00010282
+[  333.809542] RAX: 8801f2df2100 RBX: ffef RCX: 
a5839143
+[  333.810991] RDX: 11003e5be444 RSI: e30001c0 RDI: 
8801f2df2220
+[  333.812451] RBP: 8801eedf7310 R08: ed003e5be445 R09: 
ed003e5be445
+[  333.813905] R10: 0001 R11: ed003e5be444 R12: 
8801e6788158
+[  333.815357] R13: 0001 R14: 0001 R15: 
8801f2df2220
+[  333.846990] FS:  7f2013519840() GS:8801f6e0() 
knlGS:
+[  333.848645] CS:  0010 DS:  ES:  CR0: 80050033
+[  333.849816] CR2: 7f88a3c6b760 CR3: 0001e655e000 CR4: 
06f0
+[  333.851304] Call Trace:
+[  333.851864]  ? add_missing_dev+0xc0/0xc0
+[  333.852715]  ? read_extent_buffer+0xe9/0x130
+[  333.853604]  btrfs_read_chunk_tree+0x957/0xd20
+[  333.854551]  ? free_root_pointers+0xb0/0xb0
+[  333.855435]  ? btrfs_check_rw_degradable+0x240/0x240
+[  333.856491]  ? btree_read_extent_buffer_pages+0x1e0/0x3b0
+[  333.857617]  ? run_one_async_done+0xb0/0xb0
+[  333.858498]  ? cache_state.part.32+0x10/0x40
+[  333.859430]  ? unlock_page+0x16/0x40
+[  333.860202]  ? alloc_extent_buffer+0x4a1/0x4e0
+[  333.861149]  ? memcpy+0x45/0x50
+[  333.861818]  ? read_extent_buffer+0xe9/0x130
+[  333.862711]  open_ctree+0x246c/0x35c6
+[  333.863488]  ? close_ctree+0x460/0x460
+[  333.864302]  ? bdi_register_va+0x44/0x50
+[  333.865142]  ? super_setup_bdi_name+0x11b/0x1a0
+[  333.866089]  ? kill_block_super+0x80/0x80
+[  333.866970]  ? snprintf+0x96/0xd0
+[  333.867704]  btrfs_mount_root+0xae6/0xc60
+[  333.868550]  ? btrfs_mount_root+0xae6/0xc60
+[  333.869419]  ? pcpu_block_update_hint_alloc+0x1d2/0x2a0
+[  333.870492]  ? btrfs_decode_error+0x40/0x40
+[  333.871389]  ? find_next_bit+0x57/0x90
+[  333.872206]  ? cpumask_next+0x1a/0x20
+[  333.872986]  ? pcpu_alloc+0x449/0x8c0
+[  333.873761]  ? pcpu_free_area+0x410/0x410
+[  333.874614]  ? memcg_kmem_put_cache+0x1b/0xa0
+[  333.875531]  ? memcpy+0x45/0x50
+[  333.876209]  mount_fs+0x60/0x1a0
+[  333.876892]  ? btrfs_decode_error+0x40/0x40
+[  333.877763]  ? mount_fs+0x60/0x1a0
+[  333.878492]  ? alloc_vfsmnt+0x309/0x360
+[  333.879303]  vfs_kern_mount+0x6b/0x1a0
+[  333.880121]  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
+[  333.881209]  btrfs_mount+0x209/0xb71
+[  333.881962]  ? pcpu_block_update_hint_alloc+0x1d2/0x2a0
+[  333.883044]  ? btrfs_remount+0x8e0/0x8e0
+[  333.883878]  ? find_next_zero_bit+0x2c/0xa0
+[  333.884753]  ? find_next_bit+0x57/0x90
+[  333.885538]  ? cpumask_next+0x1a/0x20
+[  333.886307]  ? pcpu_alloc+0x449/0x8c0
+[  333.887078]  ? pcpu_free_area+0x410/0x410
+[  333.887930]  ? memcg_kmem_put_cache+0x1b/0xa0
+[  333.36]  ? memcpy+0x45/0x50
+[  333.889500]  mount_fs+0x60/0x1a0
+[  333.890182]  ? btrfs_remount+0x8e0/0x8e0
+[  333.891001]  ? mount_fs+0x60/0x1a0
+[  333.891728]  ? alloc_vfsmnt+0x309/0x360
+[  333.892533]  vfs_kern_mount+0x6b/0x1a0
+[  333.893323]  do_mount+0x34a/0x18c0
+[  333.894042]  ? copy_mount_string+0x20/0x20
+[  333.894898]  ? memcg_kmem_put_cache+0x1b/0xa0
+[  333.895832]  ? kasan_check_write+0x14/0x20
+[  333.896704]  ? _copy_from_user+0x6a/0x90
+[  333.897542]  ? memdu

[PATCH 1/2] btrfs-progs: Exit gracefully when overlap chunks are detected

2018-07-08 Thread Qu Wenruo
BUG_ON() can be triggered if some image contains overlap chunks.
volumes.c:1930: read_one_chunk: BUG_ON `ret` triggered, value -17
btrfs(+0x2cf12)[0x5601efa17f12]
btrfs(+0x2fd8b)[0x5601efa1ad8b]
btrfs(btrfs_read_chunk_tree+0x2bf)[0x5601efa1b30f]
btrfs(btrfs_setup_chunk_tree_and_device_map+0xe8)[0x5601efa07718]
btrfs(+0x1c944)[0x5601efa07944]
btrfs(open_ctree_fs_info+0x90)[0x5601efa07b90]
btrfs(cmd_check+0x4d7)[0x5601efa4f8c7]
btrfs(main+0x88)[0x5601ef9fd768]
/usr/lib/libc.so.6(__libc_start_main+0xeb)[0x7f3c7787306b]
btrfs(_start+0x2a)[0x5601ef9fd88a]

Extent cache code can already detect it without problem, we only need to
remove the BUG_ON() and exit gracefully.

Reported-by: Xu Wen 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200409
Signed-off-by: Qu Wenruo 
---
 volumes.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/volumes.c b/volumes.c
index 24eb3e8b2578..21a1fc31fcba 100644
--- a/volumes.c
+++ b/volumes.c
@@ -1924,9 +1924,12 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, 
struct btrfs_key *key,
 
}
ret = insert_cache_extent(&map_tree->cache_tree, &map->ce);
-   BUG_ON(ret);
+   if (ret < 0) {
+   error("failed to add chunk map start=%llu len=%llu: %d (%s)",
+ map->ce.start, map->ce.size, ret, strerror(-ret));
+   }
 
-   return 0;
+   return ret;
 }
 
 static int fill_device_from_item(struct extent_buffer *leaf,
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] btrfs: Introduce mount time chunk <-> dev extent mapping check

2018-07-08 Thread Qu Wenruo
This patch will introduce chunk <-> dev extent mapping check, to protect
us against invalid dev extents or chunks.

Since chunk mapping is the fundamental infrastructure of btrfs, extra
check at mount time could prevent a lot of unexpected behavior (BUG_ON).

Reported-by: Xu Wen 
Links: https://bugzilla.kernel.org/show_bug.cgi?id=200403
Links: https://bugzilla.kernel.org/show_bug.cgi?id=200407
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/disk-io.c |   7 ++
 fs/btrfs/volumes.c | 173 +
 fs/btrfs/volumes.h |   2 +
 3 files changed, 182 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 205092dc9390..068ca7498e94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3075,6 +3075,13 @@ int open_ctree(struct super_block *sb,
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
 
+   ret = btrfs_verify_dev_extents(fs_info);
+   if (ret) {
+   btrfs_err(fs_info,
+ "failed to verify dev extents against chunks: %d",
+ ret);
+   goto fail_block_groups;
+   }
ret = btrfs_recover_balance(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to recover balance: %d", ret);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e6a8e4aabc66..05e418cb37f3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6440,6 +6440,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, 
struct btrfs_key *key,
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = btrfs_chunk_type(leaf, chunk);
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+   map->verified_stripes = 0;
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7295,3 +7296,175 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info 
*fs_info)
fs_devices = fs_devices->seed;
}
 }
+
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+{
+   switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+   case BTRFS_BLOCK_GROUP_RAID0:
+   return div_u64(chunk_len, num_stripes);
+   case BTRFS_BLOCK_GROUP_RAID10:
+   return div_u64(chunk_len * 2, num_stripes);
+   case BTRFS_BLOCK_GROUP_RAID5:
+   return div_u64(chunk_len, num_stripes - 1);
+   case BTRFS_BLOCK_GROUP_RAID6:
+   return div_u64(chunk_len, num_stripes - 2);
+   default:
+   return chunk_len;
+   }
+}
+static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
+u64 chunk_offset, u64 devid,
+u64 physical_offset, u64 physical_len)
+{
+   struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+   struct extent_map *em;
+   struct map_lookup *map;
+   u64 stripe_len;
+   bool found = false;
+   int ret = 0;
+   int i;
+
+   read_lock(&em_tree->lock);
+   em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+   read_unlock(&em_tree->lock);
+
+   if (!em) {
+   ret = -EUCLEAN;
+   btrfs_err(fs_info,
+   "dev extent (%llu, %llu) doesn't have corresponding chunk",
+ devid, physical_offset);
+   goto out;
+   }
+
+   map = em->map_lookup;
+   stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+   if (physical_len != stripe_len) {
+   btrfs_err(fs_info,
+"dev extent (%llu, %llu) length doesn't match with chunk %llu, have %llu 
expect %llu",
+ devid, physical_offset, em->start, physical_len,
+ stripe_len);
+   ret = -EUCLEAN;
+   goto out;
+   }
+
+   for (i = 0; i < map->num_stripes; i++) {
+   if (map->stripes[i].dev->devid == devid &&
+   map->stripes[i].physical == physical_offset) {
+   found = true;
+   if (map->verified_stripes >= map->num_stripes) {
+   btrfs_err(fs_info,
+   "too many dev extent for chunk %llu is detected",
+ em->start);
+   ret = -EUCLEAN;
+   goto out;
+   }
+   map->verified_stripes++;
+   break;
+   }
+   }
+out:
+   free_extent_map(em);
+   return ret;
+}
+
+static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
+{
+   struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+   struct extent_map *em;
+   struct rb_node *node;
+   int ret = 0;
+
+   read_lock(&em_tree->lock);
+   for (node = rb_first(&em_tree->map); node; node = rb_ne

[PATCH 0/2] btrfs: chunk and dev-extent related error handler enhancement

2018-07-08 Thread Qu Wenruo
Can be fetched with all existing tree-checker/bg<->chunk error detector
from github:
https://github.com/adam900710/linux/tree/tree_checker_enhance

Still some fuzzed images reported from Xu Wen.
This time, 2 can be fixed by chunk <-> dev extent mapping verification.
One BUG_ON() can be removed.

The remaining 2 images are all mostly about extent tree corruption,
which is not as easy to detect, since extent tree has way more complex
reference relationship.

At least, fix what we can first.

And for chunk <-> dev extent mapping verification, it will trigger
read on the whole device tree, to iterate through all DEV_EXTENT items.
This will introduce an extra overhead on mount.

However since device tree is pretty small (the same level as chunk tree),
and according to previous analyse on mount time, chunk tree iteration is
only a pretty small fraction of the whole mount time (less than 5%), it
shouldn't bring obvious impact to users.

Qu Wenruo (2):
  btrfs: Introduce mount time chunk <-> dev extent mapping check
  btrfs: Exit gracefully when failed to add chunk map

 fs/btrfs/disk-io.c |   7 ++
 fs/btrfs/volumes.c | 181 -
 fs/btrfs/volumes.h |   2 +
 3 files changed, 188 insertions(+), 2 deletions(-)

-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] btrfs: Exit gracefully when failed to add chunk map

2018-07-08 Thread Qu Wenruo
It's completely possible that a crafted btrfs image contains overlapping
chunks.

Although we can't detect such problem by tree-checker, but it's not a
catastrophic problem, current extent map can already detect such problem
and return -EEXIST.

We just only need to exit gracefully so btrfs can refuse to mount the
fs.

Reported-by: Xu Wen 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200409
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/volumes.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 05e418cb37f3..affd288bb216 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6477,10 +6477,14 @@ static int read_one_chunk(struct btrfs_fs_info 
*fs_info, struct btrfs_key *key,
write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em, 0);
write_unlock(&map_tree->map_tree.lock);
-   BUG_ON(ret); /* Tree corruption */
+   if (ret < 0) {
+   btrfs_err(fs_info,
+ "failed to add chunk map, start=%llu len=%llu: %d",
+ em->start, em->len, ret);
+   }
free_extent_map(em);
 
-   return 0;
+   return ret;
 }
 
 static void fill_device_from_item(struct extent_buffer *leaf,
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: remove unused parameter

2018-07-08 Thread Gu Jinxiang
Since parameter flags is no more used since
commit d7407606564c ("btrfs: split parse_early_options() in two"),
remove it.

Signed-off-by: Gu Jinxiang 
---
 fs/btrfs/super.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 81107ad49f3a..bf546d6c286c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -935,8 +935,8 @@ static int btrfs_parse_early_options(const char *options, 
fmode_t flags,
  *
  * The value is later passed to mount_subvol()
  */
-static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
-   char **subvol_name, u64 *subvol_objectid)
+static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
+   u64 *subvol_objectid)
 {
substring_t args[MAX_OPT_ARGS];
char *opts, *orig, *p;
@@ -1650,8 +1650,8 @@ static struct dentry *btrfs_mount(struct file_system_type 
*fs_type, int flags,
if (!(flags & SB_RDONLY))
mode |= FMODE_WRITE;
 
-   error = btrfs_parse_subvol_options(data, mode,
- &subvol_name, &subvol_objectid);
+   error = btrfs_parse_subvol_options(data, &subvol_name,
+   &subvol_objectid);
if (error) {
kfree(subvol_name);
return ERR_PTR(error);
-- 
2.17.1



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: how to best segment a big block device in resizeable btrfs filesystems?

2018-07-08 Thread Duncan
Andrei Borzenkov posted on Fri, 06 Jul 2018 07:28:48 +0300 as excerpted:

> 03.07.2018 10:15, Duncan пишет:
>> Andrei Borzenkov posted on Tue, 03 Jul 2018 07:25:14 +0300 as
>> excerpted:
>> 
>>> 02.07.2018 21:35, Austin S. Hemmelgarn пишет:
 them (trimming blocks on BTRFS gets rid of old root trees, so it's a
 bit dangerous to do it while writes are happening).
>>>
>>> Could you please elaborate? Do you mean btrfs can trim data before new
>>> writes are actually committed to disk?
>> 
>> No.
>> 
>> But normally old roots aren't rewritten for some time simply due to
>> odds (fuller filesystems will of course recycle them sooner), and the
>> btrfs mount option usebackuproot (formerly recovery, until the
>> norecovery mount option that parallels that of other filesystems was
>> added and this option was renamed to avoid confusion) can be used to
>> try an older root if the current root is too damaged to successfully
>> mount.

>> But other than simply by odds not using them again immediately, btrfs
>> has
>> no special protection for those old roots, and trim/discard will
>> recover them to hardware-unused as it does any other unused space, tho
>> whether it simply marks them for later processing or actually processes
>> them immediately is up to the individual implementation -- some do it
>> immediately, killing all chances at using the backup root because it's
>> already zeroed out, some don't.
>> 
>> 
> How is it relevant to "while writes are happening"? Will trimming old
> tress immediately after writes have stopped be any different? Why?

Define "while writes are happening" vs. "immediately after writes have 
stopped".  How soon is "immediately", and does the writes stopped 
condition account for data that has reached the device-hardware write 
buffer (so is no longer being transmitted to the device across the bus) 
but not been actually written to media, or not?

On a reasonably quiescent system, multiple empty write cycles are likely 
to have occurred since the last write barrier, and anything in-process is 
likely to have made it to media even if software is missing a write 
barrier it needs (software bug) or the hardware lies about honoring the 
write barrier (hardware bug, allegedly sometimes deliberate on hardware 
willing to gamble with your data that a crash won't happen in a critical 
moment, a somewhat rare occurrence, in ordered to improve normal 
operation performance metrics).

On an IO-maxed system, data and write-barriers are coming down as fast as 
the system can handle them, and write-barriers become critical -- crash 
after something was supposed to get to media but didn't, either because 
of a missing write barrier or because the hardware/firmware lied about 
the barrier and said the data it was supposed to ensure was on-media was, 
when it wasn't, and the btrfs atomic-cow commit guarantees of consistent 
state at each commit go out the window.

At this point it becomes useful to have a number of previous "guaranteed 
consistent state" roots to fall back on, with the /hope/ being that at 
least /one/ of them is usably consistent.  If all but the last one are 
wiped due to trim...

When the system isn't write-maxed the write will have almost certainly 
made it regardless of whether the barrier is there or not, because 
there's enough idle time to finish the current write before another one 
comes down the pipe, so the last-written root is almost certain to be 
fine regardless of barriers, and the history of past roots doesn't matter 
even if there's a crash.

If "immediately after writes have stopped" is strictly defined as a 
condition when all writes including the btrfs commit updating the current 
root and the superblock pointers to the current root have completed, with 
no new writes coming down the pipe in the mean time that might have 
delayed a critical update if a barrier was missed, then trimming old 
roots in this state should be entirely safe, and the distinction between 
that state and the "while writes are happening" is clear.

But if "immediately after writes have stopped" is less strictly defined, 
then the distinction between that state and "while writes are happening" 
remains blurry at best, and having old roots around to fall back on in 
case a write-barrier was missed (for whatever reason, hardware or 
software) becomes a very good thing.

Of course the fact that trim/discard itself is an instruction written to 
the device in the combined command/data stream complexifies the picture 
substantially.  If those write barriers get missed who knows what state 
the new root is in, and if the old ones got erased...  But again, on a 
mostly idle system, it'll probably all "just work", because the writes 
will likely all make it to media, regardless, because there's not a bunch 
of other writes competing for limited write bandwidth and making ordering 
critical.

>> In the context of the discard mount option, that can mean there's never
>> any old roots available ever