Punch hole on full fs

2018-09-19 Thread anand . jain




Test script [1] tries to punch a hole on a full FS, and it works fine as 
long as the hole size and offset are aligned with the sectorsize and the 
extent boundaries, so that it can just drop the relevant extents to create 
the hole.


The reason this test fails for a non-aligned hole size and offset is 
that, as it rightfully tries to truncate the non-aligned extents at the 
front and back of the hole, it has to create new extents, which ends 
up with ENOSPC.


Any idea how we can solve this?

xfs is fine.
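For anyone who wants to reproduce this without the helper script below, here
is a minimal C sketch of the failing call (assuming /mnt/scratch/filler
already exists on a full btrfs with 4K sectorsize):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/scratch/filler", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * An unaligned 512-byte hole at offset 0: btrfs has to split the
	 * extent at the hole boundaries, and inserting the split extents
	 * fails with ENOSPC on a full filesystem. A hole aligned to the
	 * sectorsize and extent boundaries only drops extents and succeeds.
	 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 512))
		perror("fallocate");
	close(fd);
	return 0;
}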

[1]
cat ./punch-hole-on-full-fs

cleanup()
{
umount /dev/sdb > /dev/null 2>&1
btrfs_reload
}

full_fs_setup()
{
btrfs-dyndbg disable
mkfs.$fs -b 200M -fq $mkfs_options /dev/sdb || exit
mount $mount_options /dev/sdb /mnt/scratch || exit
dd status=none if=/dev/zero of=/mnt/scratch/filler bs=512 > /dev/null 2>&1
}

test()
{
cleanup
full_fs_setup

btrfs-dyndbg enable
echo " fallocate -p -o 0 -l $punch_hole_size /mnt/scratch/filler 
"
fallocate -p -o 0 -l $punch_hole_size /mnt/scratch/filler
}

fs=btrfs; mkfs_options=""; mount_options="";
[[ $1 ]] || { echo "usage: $0 <punch_hole_size>"; exit; }
punch_hole_size=$1 && test


./punch-hole-on-full-fs 512
-- fallocate -p -o 0 -l 512 /mnt/scratch/filler --
fallocate: /mnt/scratch/filler: fallocate failed: No space left on device

./punch-hole-on-full-fs 8000
-- fallocate -p -o 0 -l 8000 /mnt/scratch/filler --
fallocate: /mnt/scratch/filler: fallocate failed: No space left on device


Thanks, Anand


[PATCH v8 5/6] Btrfs: rename get_chunk_map() and make it non-static

2018-09-19 Thread Omar Sandoval
From: Omar Sandoval 

The Btrfs swap code is going to need it, so give it a btrfs_ prefix and
make it non-static.

Reviewed-by: Nikolay Borisov 
Signed-off-by: Omar Sandoval 
---
 fs/btrfs/volumes.c | 29 ++---
 fs/btrfs/volumes.h |  2 ++
 2 files changed, 20 insertions(+), 11 deletions(-)
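For context, every converted call site in the diff below follows the same
pattern; as a sketch (not part of the patch): look up the mapping, check
IS_ERR(), and drop the reference with free_extent_map() when finished.

	struct extent_map *em;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return PTR_ERR(em);
	/* ... use em->map_lookup ... */
	free_extent_map(em);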

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a2761395ed22..fe66b635c023 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2714,8 +2714,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
 }
 
-static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
-   u64 logical, u64 length)
+/**
+ * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
+ * @logical: Logical block offset in bytes.
+ * @length: Length of extent in bytes.
+ *
+ * Return: Chunk mapping or ERR_PTR.
+ */
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+  u64 logical, u64 length)
 {
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -2752,7 +2759,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 
-   em = get_chunk_map(fs_info, chunk_offset, 1);
+   em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em)) {
/*
 * This is a logic error, but we don't want to just rely on the
@@ -4902,7 +4909,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
int i = 0;
int ret = 0;
 
-   em = get_chunk_map(fs_info, chunk_offset, chunk_size);
+   em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
if (IS_ERR(em))
return PTR_ERR(em);
 
@@ -5044,7 +5051,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
int miss_ndevs = 0;
int i;
 
-   em = get_chunk_map(fs_info, chunk_offset, 1);
+   em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
if (IS_ERR(em))
return 1;
 
@@ -5104,7 +5111,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret;
 
-   em = get_chunk_map(fs_info, logical, len);
+   em = btrfs_get_chunk_map(fs_info, logical, len);
if (IS_ERR(em))
/*
 * We could return errors for these cases, but that could get
@@ -5150,7 +5157,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
struct map_lookup *map;
unsigned long len = fs_info->sectorsize;
 
-   em = get_chunk_map(fs_info, logical, len);
+   em = btrfs_get_chunk_map(fs_info, logical, len);
 
if (!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
@@ -5167,7 +5174,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
struct map_lookup *map;
int ret = 0;
 
-   em = get_chunk_map(fs_info, logical, len);
+   em = btrfs_get_chunk_map(fs_info, logical, len);
 
if(!WARN_ON(IS_ERR(em))) {
map = em->map_lookup;
@@ -5326,7 +5333,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
/* discard always return a bbio */
ASSERT(bbio_ret);
 
-   em = get_chunk_map(fs_info, logical, length);
+   em = btrfs_get_chunk_map(fs_info, logical, length);
if (IS_ERR(em))
return PTR_ERR(em);
 
@@ -5652,7 +5659,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
return __btrfs_map_block_for_discard(fs_info, logical,
 *length, bbio_ret);
 
-   em = get_chunk_map(fs_info, logical, *length);
+   em = btrfs_get_chunk_map(fs_info, logical, *length);
if (IS_ERR(em))
return PTR_ERR(em);
 
@@ -5951,7 +5958,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 rmap_len;
int i, j, nr = 0;
 
-   em = get_chunk_map(fs_info, chunk_start, 1);
+   em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
if (IS_ERR(em))
return -EIO;
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 23e9285d88de..f4c190c2ab84 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -465,6 +465,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 u64 chunk_offset, u64 chunk_size);
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
+struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+  u64 logical, u64 length);
 
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
 

[PATCH v8 0/6] Btrfs: implement swap file support

2018-09-19 Thread Omar Sandoval
From: Omar Sandoval 

Hi,

This series implements swap file support for Btrfs.

Changes from v7 [1]:

- Expanded a few commit messages
- Added Johannes' acked-by on patches 1 and 2
- Rebased on v4.19-rc4

No functional changes.

Thanks!

1: https://www.spinics.net/lists/linux-btrfs/msg81933.html

Omar Sandoval (6):
  mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
  mm: export add_swap_extent()
  vfs: update swap_{,de}activate documentation
  Btrfs: prevent ioctls from interfering with a swap file
  Btrfs: rename get_chunk_map() and make it non-static
  Btrfs: support swap files

 Documentation/filesystems/Locking |  17 +-
 Documentation/filesystems/vfs.txt |  12 +-
 fs/btrfs/ctree.h  |  29 +++
 fs/btrfs/dev-replace.c|   8 +
 fs/btrfs/disk-io.c|   4 +
 fs/btrfs/inode.c  | 317 ++
 fs/btrfs/ioctl.c  |  31 ++-
 fs/btrfs/relocation.c |  18 +-
 fs/btrfs/volumes.c|  82 ++--
 fs/btrfs/volumes.h|   2 +
 include/linux/swap.h  |  13 +-
 mm/page_io.c  |   6 +-
 mm/swapfile.c |  14 +-
 13 files changed, 502 insertions(+), 51 deletions(-)

-- 
2.19.0



[PATCH v8 4/6] Btrfs: prevent ioctls from interfering with a swap file

2018-09-19 Thread Omar Sandoval
From: Omar Sandoval 

A later patch will implement swap file support for Btrfs, but before we
do that, we need to make sure that the various Btrfs ioctls cannot
change a swap file.

When a swap file is active, we must make sure that the extents of the
file are not moved and that they don't become shared. That means that
the following are not safe:

- chattr +c (enable compression)
- reflink
- dedupe
- snapshot
- defrag

Don't allow those to happen on an active swap file.

Additionally, balance, resize, device remove, and device replace are
also unsafe if they affect an active swapfile. Add a red-black tree of
block groups and devices which contain an active swapfile. Relocation
checks each block group against this tree and skips it or errors out for
balance or resize, respectively. Device remove and device replace check
the tree for the device they will operate on.

Note that we don't have to worry about chattr -C (disable nocow), which
we ignore for non-empty files, because an active swapfile must be
non-empty and can't be truncated. We also don't have to worry about
autodefrag because it's only done on COW files. Truncate and fallocate
are already taken care of by the generic code. Device add doesn't do
relocation so it's not an issue, either.

Signed-off-by: Omar Sandoval 
---
 fs/btrfs/ctree.h   | 29 +++
 fs/btrfs/dev-replace.c |  8 +++
 fs/btrfs/disk-io.c |  4 
 fs/btrfs/ioctl.c   | 31 +---
 fs/btrfs/relocation.c  | 18 ++
 fs/btrfs/volumes.c | 53 ++
 6 files changed, 131 insertions(+), 12 deletions(-)
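The ctree.h hunk below only declares btrfs_pinned_by_swapfile(); as a hedged
sketch of its likely shape (the body lands in volumes.c, which this archive
truncates), the lookup is an rbtree walk comparing raw pointer values under
the new spinlock:

bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;
	bool found = false;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr) {
			node = node->rb_left;
		} else if (ptr > sp->ptr) {
			node = node->rb_right;
		} else {
			found = true;
			break;
		}
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return found;
}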

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..08df61b8fc87 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -716,6 +716,28 @@ struct btrfs_fs_devices;
 struct btrfs_balance_control;
 struct btrfs_delayed_root;
 
+/*
+ * Block group or device which contains an active swapfile. Used for preventing
+ * unsafe operations while a swapfile is active.
+ *
+ * These are sorted on (ptr, inode) (note that a block group or device can
+ * contain more than one swapfile). We compare the pointer values because we
+ * don't actually care what the object is, we just need a quick check whether
+ * the object exists in the rbtree.
+ */
+struct btrfs_swapfile_pin {
+   struct rb_node node;
+   void *ptr;
+   struct inode *inode;
+   /*
+* If true, ptr points to a struct btrfs_block_group_cache. Otherwise,
+* ptr points to a struct btrfs_device.
+*/
+   bool is_block_group;
+};
+
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+
 #define BTRFS_FS_BARRIER   1
 #define BTRFS_FS_CLOSING_START 2
 #define BTRFS_FS_CLOSING_DONE  3
@@ -1121,6 +1143,10 @@ struct btrfs_fs_info {
u32 sectorsize;
u32 stripesize;
 
+   /* Block groups and devices containing active swapfiles. */
+   spinlock_t swapfile_pins_lock;
+   struct rb_root swapfile_pins;
+
 #ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1286,6 +1312,9 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+
+   /* Number of active swapfiles */
+   atomic_t nr_swapfiles;
 };
 
 struct btrfs_file_private {
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index dec01970d8c5..09d2cee2635b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -414,6 +414,14 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
if (ret)
return ret;
 
+   if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
+   btrfs_info_in_rcu(fs_info,
+ "cannot replace device %s (devid %llu) due to active swapfile",
+ btrfs_dev_name(src_device),
+ src_device->devid);
+   return -ETXTBSY;
+   }
+
+
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
if (ret)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 05dc3c17cb62..2428a73067d2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1188,6 +1188,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
atomic_set(&root->snapshot_force_cow, 0);
+   atomic_set(&root->nr_swapfiles, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -2782,6 +2783,9 @@ int open_ctree(struct super_block *sb,
fs_info->sectorsize = 4096;
fs_info->stripesize = 4096;
 
+   spin_lock_init(&fs_info->swapfile_pins_lock);
+   fs_info->swapfile_pins = RB_ROOT;
+
ret = 

[PATCH v8 6/6] Btrfs: support swap files

2018-09-19 Thread Omar Sandoval
From: Omar Sandoval 

Btrfs has not allowed swap files since commit 35054394c4b3 ("Btrfs: stop
providing a bmap operation to avoid swapfile corruptions"). However, now
that the proper restrictions are in place, Btrfs can support swap files
through the swap file a_ops, similar to iomap in commit 67482129cdab
("iomap: add a swapfile activation function").

For Btrfs, activation needs to make sure that the file can be used as a
swap file, which currently means that it must be fully allocated as
nocow with no compression on one device. It must also do the proper
tracking so that ioctls will not interfere with the swap file.
Deactivation clears this tracking.

Signed-off-by: Omar Sandoval 
---
 fs/btrfs/inode.c | 317 +++
 1 file changed, 317 insertions(+)
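The inode.c diff is truncated in this archive before it gets there, but the
activation entry points presumably end up wired into the file
address_space_operations along these lines (a sketch, not the verbatim hunk):

static const struct address_space_operations btrfs_aops = {
	/* ... existing callbacks ... */
	.swap_activate	= btrfs_swap_activate,
	.swap_deactivate = btrfs_swap_deactivate,
};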

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..0586285b1d9f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "ctree.h"
 #include "disk-io.h"
@@ -10488,6 +10489,320 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
}
 }
 
+/*
+ * Add an entry indicating a block group or device which is pinned by a
+ * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
+ * negative errno on failure.
+ */
+static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
+ bool is_block_group)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+   struct btrfs_swapfile_pin *sp, *entry;
+   struct rb_node **p;
+   struct rb_node *parent = NULL;
+
+   sp = kmalloc(sizeof(*sp), GFP_NOFS);
+   if (!sp)
+   return -ENOMEM;
+   sp->ptr = ptr;
+   sp->inode = inode;
+   sp->is_block_group = is_block_group;
+
+   spin_lock(&fs_info->swapfile_pins_lock);
+   p = &fs_info->swapfile_pins.rb_node;
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
+   if (sp->ptr < entry->ptr ||
+   (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
+   p = &(*p)->rb_left;
+   } else if (sp->ptr > entry->ptr ||
+  (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
+   p = &(*p)->rb_right;
+   } else {
+   spin_unlock(&fs_info->swapfile_pins_lock);
+   kfree(sp);
+   return 1;
+   }
+   }
+   rb_link_node(&sp->node, parent, p);
+   rb_insert_color(&sp->node, &fs_info->swapfile_pins);
+   spin_unlock(&fs_info->swapfile_pins_lock);
+   return 0;
+}
+
+/* Free all of the entries pinned by this swapfile. */
+static void btrfs_free_swapfile_pins(struct inode *inode)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+   struct btrfs_swapfile_pin *sp;
+   struct rb_node *node, *next;
+
+   spin_lock(&fs_info->swapfile_pins_lock);
+   node = rb_first(&fs_info->swapfile_pins);
+   while (node) {
+   next = rb_next(node);
+   sp = rb_entry(node, struct btrfs_swapfile_pin, node);
+   if (sp->inode == inode) {
+   rb_erase(&sp->node, &fs_info->swapfile_pins);
+   if (sp->is_block_group)
+   btrfs_put_block_group(sp->ptr);
+   kfree(sp);
+   }
+   node = next;
+   }
+   spin_unlock(&fs_info->swapfile_pins_lock);
+}
+
+struct btrfs_swap_info {
+   u64 start;
+   u64 block_start;
+   u64 block_len;
+   u64 lowest_ppage;
+   u64 highest_ppage;
+   unsigned long nr_pages;
+   int nr_extents;
+};
+
+static int btrfs_add_swap_extent(struct swap_info_struct *sis,
+struct btrfs_swap_info *bsi)
+{
+   unsigned long nr_pages;
+   u64 first_ppage, first_ppage_reported, next_ppage;
+   int ret;
+
+   first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
+   next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
+   PAGE_SIZE) >> PAGE_SHIFT;
+
+   if (first_ppage >= next_ppage)
+   return 0;
+   nr_pages = next_ppage - first_ppage;
+
+   first_ppage_reported = first_ppage;
+   if (bsi->start == 0)
+   first_ppage_reported++;
+   if (bsi->lowest_ppage > first_ppage_reported)
+   bsi->lowest_ppage = first_ppage_reported;
+   if (bsi->highest_ppage < (next_ppage - 1))
+   bsi->highest_ppage = next_ppage - 1;
+
+   ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
+   if (ret < 0)
+   return ret;
+   bsi->nr_extents += ret;
+   bsi->nr_pages += nr_pages;
+   return 0;
+}
+
+static void btrfs_swap_deactivate(struct file *file)
+{
+  

[PATCH v8 1/6] mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS

2018-09-19 Thread Omar Sandoval
From: Omar Sandoval 

The SWP_FILE flag serves two purposes: to make swap_{read,write}page()
go through the filesystem, and to make swapoff() call
->swap_deactivate(). For Btrfs, we want the latter but not the former,
so split this flag into two. This makes us always call
->swap_deactivate() if ->swap_activate() succeeded, not just if it
didn't add any swap extents itself.

This also resolves the issue of the very misleading name of SWP_FILE,
which is only used for swap files over NFS.

Reviewed-by: Nikolay Borisov 
Acked-by: Johannes Weiner 
Signed-off-by: Omar Sandoval 
---
 include/linux/swap.h | 13 +++--
 mm/page_io.c |  6 +++---
 mm/swapfile.c| 13 -
 3 files changed, 18 insertions(+), 14 deletions(-)
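A quick summary of the flag semantics after the split, as a comment sketch
derived from the hunks below:

/*
 * SWP_ACTIVATED: ->swap_activate() succeeded, so swapoff() must call
 *                ->swap_deactivate().
 * SWP_FS:        swap I/O is proxied through the filesystem via
 *                ->readpage()/->direct_IO(); only set when
 *                ->swap_activate() returned 0, i.e. swap over NFS.
 */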

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8e2c11e692ba..0fda0aa743f0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -167,13 +167,14 @@ enum {
SWP_SOLIDSTATE  = (1 << 4), /* blkdev seeks are cheap */
SWP_CONTINUED   = (1 << 5), /* swap_map has count continuation */
SWP_BLKDEV  = (1 << 6), /* its a block device */
-   SWP_FILE= (1 << 7), /* set after swap_activate success */
-   SWP_AREA_DISCARD = (1 << 8),/* single-time swap area discards */
-   SWP_PAGE_DISCARD = (1 << 9),/* freed swap page-cluster discards */
-   SWP_STABLE_WRITES = (1 << 10),  /* no overwrite PG_writeback pages */
-   SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */
+   SWP_ACTIVATED   = (1 << 7), /* set after swap_activate success */
+   SWP_FS  = (1 << 8), /* swap file goes through fs */
+   SWP_AREA_DISCARD = (1 << 9),/* single-time swap area discards */
+   SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
+   SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
+   SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
/* add others here before... */
-   SWP_SCANNING= (1 << 12),/* refcount in scan_swap_map */
+   SWP_SCANNING= (1 << 13),/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
diff --git a/mm/page_io.c b/mm/page_io.c
index aafd19ec1db4..e8653c368069 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -283,7 +283,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
 
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
-   if (sis->flags & SWP_FILE) {
+   if (sis->flags & SWP_FS) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -365,7 +365,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
 
-   if (sis->flags & SWP_FILE) {
+   if (sis->flags & SWP_FS) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
 
@@ -423,7 +423,7 @@ int swap_set_page_dirty(struct page *page)
 {
struct swap_info_struct *sis = page_swap_info(page);
 
-   if (sis->flags & SWP_FILE) {
+   if (sis->flags & SWP_FS) {
struct address_space *mapping = sis->swap_file->f_mapping;
 
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d954b71c4f9c..d3f95833d12e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -989,7 +989,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
-   if (!(si->flags & SWP_FILE))
+   if (!(si->flags & SWP_FS))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
@@ -2310,12 +2310,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
kfree(se);
}
 
-   if (sis->flags & SWP_FILE) {
+   if (sis->flags & SWP_ACTIVATED) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
 
-   sis->flags &= ~SWP_FILE;
-   mapping->a_ops->swap_deactivate(swap_file);
+   sis->flags &= ~SWP_ACTIVATED;
+   if (mapping->a_ops->swap_deactivate)
+   mapping->a_ops->swap_deactivate(swap_file);
}
 }
 
@@ -2411,8 +2412,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+   if (ret >= 0)
+   sis->flags |= SWP_ACTIVATED;
if 

[PATCH v8 2/6] mm: export add_swap_extent()

2018-09-19 Thread Omar Sandoval
From: Omar Sandoval 

Btrfs currently does not support swap files because swap's use of bmap
does not work with copy-on-write and multiple devices. See commit
35054394c4b3 ("Btrfs: stop providing a bmap operation to avoid swapfile
corruptions"). However, the swap code has a mechanism for the filesystem
to manually add swap extents using add_swap_extent() from the
->swap_activate() aop. iomap has done this since commit 67482129cdab
("iomap: add a swapfile activation function"). Btrfs will do the same in
a later patch, so export add_swap_extent().

Acked-by: Johannes Weiner 
Signed-off-by: Omar Sandoval 
---
 mm/swapfile.c | 1 +
 1 file changed, 1 insertion(+)
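To illustrate the intended use (a hedged sketch with made-up names, not from
this patch): a filesystem's ->swap_activate() reports each physically
contiguous run of pages with add_swap_extent() and returns the number of
extents it added.

static int example_swap_activate(struct swap_info_struct *sis,
				 struct file *file, sector_t *span)
{
	/* Illustrative only: one run of 16 pages starting at disk block 0. */
	unsigned long nr_pages = 16;
	int nr_extents = 0;
	int ret;

	ret = add_swap_extent(sis, 0, nr_pages, 0);
	if (ret < 0)
		return ret;
	nr_extents += ret;	/* 1 if a new extent was appended */
	*span = nr_pages;
	return nr_extents;	/* > 0 tells swap to use these extents directly */
}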

diff --git a/mm/swapfile.c b/mm/swapfile.c
index d3f95833d12e..51cb30de17bc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2365,6 +2365,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
list_add_tail(&new_se->list, &sis->first_swap_extent.list);
return 1;
 }
+EXPORT_SYMBOL_GPL(add_swap_extent);
 
 /*
  * A `swap extent' is a simple thing which maps a contiguous range of pages
-- 
2.19.0



[PATCH v8 3/6] vfs: update swap_{,de}activate documentation

2018-09-19 Thread Omar Sandoval
From: Omar Sandoval 

The documentation for these functions is wrong in several ways:

- swap_activate() is called with the inode locked
- swap_activate() takes a swap_info_struct * and a sector_t *
- swap_activate() can also return a positive number of extents it added
  itself
- swap_deactivate() does not return anything

Reviewed-by: Nikolay Borisov 
Signed-off-by: Omar Sandoval 
---
 Documentation/filesystems/Locking | 17 +++--
 Documentation/filesystems/vfs.txt | 12 
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index efea228ccd8a..b970c8c2ee22 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -210,8 +210,9 @@ prototypes:
int (*launder_page)(struct page *);
int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
int (*error_remove_page)(struct address_space *, struct page *);
-   int (*swap_activate)(struct file *);
-   int (*swap_deactivate)(struct file *);
+   int (*swap_activate)(struct swap_info_struct *, struct file *,
+sector_t *);
+   void (*swap_deactivate)(struct file *);
 
 locking rules:
All except set_page_dirty and freepage may block
@@ -235,8 +236,8 @@ putback_page:   yes
 launder_page:  yes
 is_partially_uptodate: yes
 error_remove_page: yes
-swap_activate: no
-swap_deactivate:   no
+swap_activate: yes
+swap_deactivate:   no
 
->write_begin(), ->write_end() and ->readpage() may be called from
 the request handler (/dev/loop).
@@ -333,14 +334,10 @@ cleaned, or an error value if not. Note that in order to prevent the page
 getting mapped back in and redirtied, it needs to be kept locked
 across the entire operation.
 
-   ->swap_activate will be called with a non-zero argument on
-files backing (non block device backed) swapfiles. A return value
-of zero indicates success, in which case this file can be used for
-backing swapspace. The swapspace operations will be proxied to the
-address space operations.
+   ->swap_activate is called from sys_swapon() with the inode locked.
 
->swap_deactivate() will be called in the sys_swapoff()
-path after ->swap_activate() returned success.
+path after ->swap_activate() returned success. The inode is not locked.
 
 --- file_lock_operations --
 prototypes:
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index a6c6a8af48a2..6e14db053eaa 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -652,8 +652,9 @@ struct address_space_operations {
unsigned long);
void (*is_dirty_writeback) (struct page *, bool *, bool *);
int (*error_remove_page) (struct mapping *mapping, struct page *page);
-   int (*swap_activate)(struct file *);
-   int (*swap_deactivate)(struct file *);
+   int (*swap_activate)(struct swap_info_struct *, struct file *,
+sector_t *);
+   void (*swap_deactivate)(struct file *);
 };
 
   writepage: called by the VM to write a dirty page to backing store.
@@ -830,8 +831,11 @@ struct address_space_operations {
 
   swap_activate: Called when swapon is used on a file to allocate
space if necessary and pin the block lookup information in
-   memory. A return value of zero indicates success,
-   in which case this file can be used to back swapspace.
+   memory. If this returns zero, the swap system will call the address
+   space operations ->readpage() and ->direct_IO(). Alternatively, this
+   may call add_swap_extent() and return the number of extents added, in
+   which case the swap system will use the provided blocks directly
+   instead of going through the filesystem.
 
   swap_deactivate: Called during swapoff on files where swap_activate
was successful.
-- 
2.19.0



Re: inline extents

2018-09-19 Thread Chris Murphy
Adding fsdevel@, linux-ext4, and btrfs@ (which has a separate subject
on this same issue)



On Wed, Sep 19, 2018 at 7:45 PM, Dave Chinner  wrote:
>On Wed, Sep 19, 2018 at 10:23:38AM -0600, Chris Murphy wrote:
>> Fedora 29 has a new feature to test if boot+startup fails, so the
>> bootloader can do a fallback at next boot, to a previously working
>> entry. Part of this means GRUB (the bootloader code, not the user
>> space code) uses "save_env" to overwrite the 1024 data bytes with
>> updated environment information.
>
> That's just broken. Illegal. Completely unsupportable. Doesn't
> matter what the filesystem is, nobody is allowed to write directly
> to the block device a filesystem owns.

Yeah, the word I'm thinking of is abomination.

However in their defense, grubenv and the 'save_env' command are old features:

line 3638 @node Environment block
http://git.savannah.gnu.org/cgit/grub.git/tree/docs/grub.texi

"For safety reasons, this storage is only available when installed on a plain
disk (no LVM or RAID), using a non-checksumming filesystem (no ZFS), and
using BIOS or EFI functions (no ATA, USB or IEEE1275)."

I haven't checked how it tests for this. But by now, it should list
the supported file systems, rather than what's exempt. That's a
shorter list.


> ext4 has inline data, too, so there's every chance grub will corrupt
> ext4 filesystems with its wonderful new feature. I'm not sure if
> the ext4 metadata cksums cover the entire inode and inline data, but
> if they do it's the same problem as btrfs.

I don't see inline used with a default mkfs, but I do see metadata_csum

e2fsprogs-1.44.3-1.fc29.x86_64

Filesystem features: has_journal ext_attr resize_inode dir_index
filetype extent 64bit flex_bg sparse_super large_file huge_file
dir_nlink extra_isize metadata_csum
Filesystem flags: signed_directory_hash
Default mount options: user_xattr acl

>
>> For XFS, I'm not sure how the inline extent is saved, and whether
>> metadata checksumming includes or excludes the inline extent.
>
> When XFS implements this, it will be like btrfs as the data will be
> covered by the metadata CRCs for the inode, and so writing directly
> to it would corrupt the inode and render it unreadable by the
> filesystem.

Good to know.


>
>> I'm also kinda ignoring the reflink ramifications of this behavior,
>> for now. Let's just say even if there's no corruption I'm really
>> suspicious of bootloader code writing anything, even what seems to be
>> a simple overwrite of two sectors.
>
> You're not the only one
>
> Like I said, it doesn't matter what the filesystem is, overwriting
> file data by writing directly to the block device is not
> supportable. It's essentially a filesystem corruption vector, and
> grub needs to have that functionality removed immediately.

I'm in agreement with respect to the more complex file systems. We've
already realized the folly of the bootloader being unable to do
journal replay, ergo it can't be sure it has a complete
picture of the file system anyway. That's suboptimal when it results
in boot failure. But if it were going to use stale file system
information, get a wrong idea of the file system, and then use that to
do even 1024 bytes of writes? No, no, and no.

Meanwhile, also in Fedoraland, it's one of the distros where grubenv
and grub.cfg stuff is on the EFI System partition, which is FAT. This
overwrite behavior will work there, but even this case is a kind of
betrayal that the file is being modified, without its metadata being
updated. I think it's an old era hack that by today's standards simply
isn't good enough. I'm a little surprised that all UEFI
implementations permit arbitrary writes from the pre-boot environment
to arbitrary block devices, even with Secure Boot enabled. That seems
specious.

I know some of the file systems have reserve areas for bootloader
stuff. I'm not sure if that's preferred over bootloaders just getting
their own partition and controlling it stem to stern however they
want.


-- 
Chris Murphy


Re: very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Qu Wenruo


On 2018/9/20 4:11 AM, Hans van Kranenburg wrote:
> On 09/19/2018 10:04 PM, Martin Steigerwald wrote:
>> Hans van Kranenburg - 19.09.18, 19:58:
>>> However, as soon as we remount the filesystem with space_cache=v2 -
>>>
 writes drop to just around 3-10 MB/s to each disk. If we remount to
 space_cache - lots of writes, system unresponsive. Again remount to
 space_cache=v2 - low writes, system responsive.

 That's a huuge, 10x overhead! Is it expected? Especially that
 space_cache=v1 is still the default mount option?
>>>
>>> Yes, that does not surprise me.
>>>
>>> https://events.static.linuxfound.org/sites/events/files/slides/vault20
>>> 16_0.pdf
>>>
>>> Free space cache v1 is the default because of issues with btrfs-progs,
>>> not because it's unwise to use the kernel code. I can totally
>>> recommend using it. The linked presentation above gives some good
>>> background information.
>>
>> What issues in btrfs-progs are that?
> 
> Missing code to make offline changes to a filesystem that has a free
> space tree. So when using btrfstune / repair / whatever you first need
> to remove the whole free space tree with a command, and then add it back
> on the next mount.
> 
> For me personally that's not a problem (I don't have to make offline
> changes), but I understand that having that situation out of the box for
> every new user would be a bit awkward.
> 
>> I am wondering whether to switch to freespace tree v2. Would it provide 
>> benefit for a regular / and /home filesystems as dual SSD BTRFS RAID-1 
>> on a laptop?
> 
> As shown in the linked presentation, it provides benefit on a largeish
> filesystem and if your writes are touching a lot of different block
> groups (since v1 writes out the full space cache for all of them on
> every transaction commit).

In fact that's the problem.

From the free space cache inode flags, it's
NODATASUM|NODATACOW|NOCOMPRESS|PREALLOC.

But the fact is, if it's modified, the whole file just gets CoWed.

If we could change it to follow the inode flags, we could reduce the
overhead to even less than v2's.
(v1 needs at least (1 + n) * sectorsize (4K), one sector for the header
which contains the csum, while v2 needs metadata CoW, which is at least
nodesize (16K by default).)
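(Worked example of that comparison: for a cache file whose data spans n = 4
sectors, v1 writes at least (1 + 4) * 4K = 20K per commit, while a v2 update
CoWs at least one 16K node.)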

Thanks,
Qu

> I'd say, it provides benefit as soon as you
> encounter filesystem delays because of it, and as soon as you see using
> it eases the pain a lot. So, yes, that's your case.
> 





Re: cannot mount btrfs as root

2018-09-19 Thread anand . jain




On 09/19/2018 09:19 PM, Zbigniew 'zibi' Jarosik wrote:

Hi!

I can't mount my RAID1 set as rootfs, but a normal mount to a running
system works fine. It had been working normally, then one day it died.

When booted to rootfs as
root=UUID=98c94774-d93a-400e-a275-58cc3ac2a58a rootflags=subvol=@root
i got:

BTRFS error (device bcache2): devid 5 uuid
dd09f810-717e-4f2f-97ab-26469d5adca5 is missing
BTRFS error (device bcache2): failed to read the system array: -2
BTRFS error (device bcache2): open_ctree failed
mount: mounting /dev/bcache2 on /root failed: Invalid argument



Adding the '-o degraded' mount option will make this succeed, but it 
looks like the other disk is slow to be identified, since you were able 
to mount after boot.


Thanks, Anand


and (initramfs) prompt.

Bcache devices are registered and populated in /dev.

When I boot exactly the same kernel and initrd, but rootfs is on a
pendrive ( root=/dev/sda2 ), then mounting the dataset to a subdir ( mount UUID=
98c94774-d93a-400e-a275-58cc3ac2a58a  /data ) works flawlessly.

Thanks in advance for any ideas.

Some system info:

/dev/sdb1: UUID="780cc211-2408-4594-83df-379ee23d6ed8" TYPE="bcache"
PARTLABEL="Linux filesystem"
PARTUUID="ef83f1b4-b55c-4deb-8037-69ff16909222"
/dev/sdc1: UUID="80a1b115-8a1f-41f7-b65b-dd4fdaf400f1" TYPE="bcache"
PARTLABEL="Linux filesystem"
PARTUUID="1f920d16-af68-440b-891c-5ff61c3057ca"
/dev/sdd1: UUID="a40e80dc-f7e2-4a9c-9bfe-81489a1261ca" TYPE="bcache"
PARTLABEL="Linux filesystem"
PARTUUID="4b68c314-f029-402a-b332-b0b9109ad51d"
/dev/bcache0: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
UUID_SUB="dd09f810-717e-4f2f-97ab-26469d5adca5" TYPE="btrfs"
/dev/bcache1: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
UUID_SUB="8469962d-f9ee-4b5f-b4d5-4a62190c0d8f" TYPE="btrfs"
/dev/bcache2: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
UUID_SUB="516f2bd1-3fcc-42fd-87ad-5d5fa2f84b5f" TYPE="btrfs"

Linux hp1 4.17.11-ldkb #2 SMP Thu Aug 2 09:24:58 CEST 2018 x86_64 GNU/Linux
btrfs-progs v4.7.3
Label: none  uuid: 98c94774-d93a-400e-a275-58cc3ac2a58a
 Total devices 3 FS bytes used 3.21TiB
 devid3 size 2.73TiB used 2.07TiB path /dev/bcache2
 devid4 size 2.73TiB used 2.18TiB path /dev/bcache1
 devid5 size 2.73TiB used 2.18TiB path /dev/bcache0
Data, RAID1: total=3.20TiB, used=3.20TiB
System, RAID1: total=32.00MiB, used=480.00KiB
Metadata, RAID1: total=12.00GiB, used=10.81GiB
GlobalReserve, single: total=512.00MiB, used=0.00B




Re: state of btrfs snapshot limitations?

2018-09-19 Thread James A. Robinson
On Wed, Sep 19, 2018 at 4:04 PM Pete  wrote:
> snapshots.  You need to delete it out of them as well which defeats the
> idea of read only snapshots if you are using them.

I wouldn't say it defeats the idea of read-only snapshots. If you want
to be able to "go back in time" and see what changed, you have to
pay the price, right?  At least until someone figures out a quantum
computer that can generate all possible states of data at once!

> I've since made /tmp a subvolume to prevent snap-shotting to partially
> mitigate this.  I'm wondering if I should make /lib/modules one for the
> same reason.  In previous posts on this mailing list people have

Yes, in my scheme I'm excluding things like tmp and the trash folder
when I populate the actual backup directory.

I decided on a 'opt-in' scheme where I have to specifically select
specific folders to send over to the backup server, rather than
attempt to track everything on disk.  Of course I am in danger
of missing an important directory and not realizing it until it's too
late.

> Now I've seen it I can't un-see it!

I think we should just redefine end-of-line to 0x00 to let him have
his 0x0a! :)

So with my scheme, similar to yours, I've got a bit of overlap in
that I take a snapshot under hourly as well as a daily snapshot,
but my hourly will rotate off at the end of the 24 hour period.

Here's an example list of the snapshots accumulated so far,
you can see that the positions change as time proceeds and
older snapshots are replaced with newer ones for the
hour and minute buckets.

$ sudo btrfs subvolume list /snapshot/
ID 259 gen 2271 top level 5 path c
ID 1024 gen 1064 top level 5 path d/2018/0915
ID 1106 gen 1216 top level 5 path d/2018/0916
ID 1227 gen 1435 top level 5 path d/2018/0917
ID 1348 gen 1681 top level 5 path d/2018/0918
ID 1566 gen 2017 top level 5 path h/1700
ID 1571 gen 2026 top level 5 path h/1800
ID 1576 gen 2035 top level 5 path h/1900
ID 1581 gen 2044 top level 5 path h/2000
ID 1586 gen 2053 top level 5 path h/2100
ID 1591 gen 2062 top level 5 path h/2200
ID 1596 gen 2071 top level 5 path h/2300
ID 1602 gen 2081 top level 5 path h/0000
ID 1607 gen 2090 top level 5 path h/0100
ID 1612 gen 2099 top level 5 path h/0200
ID 1617 gen 2108 top level 5 path h/0300
ID 1622 gen 2119 top level 5 path h/0400
ID 1627 gen 2133 top level 5 path h/0500
ID 1632 gen 2148 top level 5 path h/0600
ID 1637 gen 2159 top level 5 path h/0700
ID 1642 gen 2169 top level 5 path h/0800
ID 1648 gen 2182 top level 5 path h/0900
ID 1654 gen 2193 top level 5 path h/1000
ID 1660 gen 2203 top level 5 path h/1100
ID 1666 gen 2213 top level 5 path h/1200
ID 1672 gen 2224 top level 5 path h/1300
ID 1678 gen 2239 top level 5 path h/1400
ID 1684 gen 2253 top level 5 path h/1500
ID 1687 gen 2260 top level 5 path m/30
ID 1688 gen 2263 top level 5 path m/45
ID 1689 gen 2266 top level 5 path d/2018/0919
ID 1690 gen 2267 top level 5 path h/1600
ID 1691 gen 2268 top level 5 path m/00
ID 1692 gen 2271 top level 5 path m/15

The script as it currently stands:

#!/bin/bash
#
# snapshots - sync and snapshot at intervals
#
# this script is assumed to be run on a 15 minute cycle,
# at 00, 15, 30, and 45 minutes after the hour.  It will
# sync ${source} to ${volume}/c/ and then take
# snapshots:
#
# ${volume}/d/<yyyy>/<mmdd>
# ${volume}/h/<hhmm>
# ${volume}/m/<mm>
#
# The daily snapshot is taken when run at 00:00
# The hourly snapshot is taken when run at *:00 minutes.
# The minute snapshot is taken every time it is run.
#
# So the directory structure created under ${volume} will be:
#
#c: the most recently synced data from /backup
#
#d: a daily snapshot with the naming scheme yyyy/mmdd
#
#h: an hourly snapshot with the naming scheme hhmm, note
#that it is on a 24-hour cycle, so if it is currently 13:30 then
#snapshots 0000 through 1300 are from today, and snapshots
#1400 through 2300 are from yesterday.
#
#m: a minute snapshot (00, 15, 30, 45), also on a cycle meaning
#if it is 14:20 then 00 and 15 are for 14:00 and 14:15, and 30 and
#45 are from 13:30 and 13:45.
#
umask 0077

# fully qualified path to backup
source="/backup/";

# btrfs volume where snapshots are managed
volume=/snapshot

# local lock dir / pid file
lockdir="/var/tmp/snapshots.lock";
pid="${lockdir}/pid";

# compute current year, month, day of the month, hour, and minute
t=($(/bin/date +"%Y %m %d %H %M"));
year=${t[0]};
month=${t[1]};
day=${t[2]};
hour=${t[3]};
min=${t[4]};

function unlock {
rm -rf "${lockdir}"
}

# is another instance already running?
mkdir "${lockdir}" 2>/dev/null
if [ "$?" != "0" ]; then
PID=$(/bin/cat "$pid");
if /bin/kill -0 "$PID" >/dev/null 2>&1; then
exit;
fi
else
trap unlock QUIT TERM EXIT INT
echo "$$" > "${pid}";
fi

# if volume is not mounted, terminate
if ! /bin/mount | /bin/grep -q "${volume}"; then
/bin/echo "$0: snapshot aborted, ${volume} is not mounted";
exit;
fi

# update 'c' subvolume
if [ ! -d "${volume}/c" ]; then

Re: state of btrfs snapshot limitations?

2018-09-19 Thread Pete
On 09/19/2018 03:41 PM, Piotr Pawłow wrote:
> Hello,
>> If the limit is 100 or less I'd need use a more complicated
>> rotation scheme.
> 
> If you just want to thin them out over time without having selected "special" 
> monthly, yearly etc snapshots, then my favorite scheme is to just compare the 
> age of a snapshot to the distance to its neighbours, and if the distance is 
> less than age / constant then delete it. If the constant is, for example, 12, 
> then it will start thinning out hourly snapshots after around 12 hours, 
> monthly after 12 months etc.
> 
> This is how it looks after 2 years with daily snapshots and the constant=6:
> 
> backup-20160328143825
> backup-20161210043001

User not dev here, I thought I'd share my experience.  That scheme looks
really interesting.  Though achieving it sounds like it might be a little
complex, one piece of script covers it.

My approach is to snapshot on three timeframes, daily, weekly and
monthly.  I store approximately 30 days worth of daily snapshots, 1
years worth of weekly snapshots and 1 years worth of monthly snapshots.
On reflection however, if I retail 1 years worth of weekly then the
monthly snapshots are redundant.  Perhaps a little adjustment is in order.

However, there are pitfalls, which I ironically hit today.  This is not
a btrfs issue, but a simple consequence of snap-shotting a system with a
reasonable amount of changes - the volume of data stored grows owing to
the many changed files and hence free space reduces even if the 'master'
sub volumes are kept tidy.  It does not matter how tidy you keep the
'master' if the old redundant data is hiding in the
snapshots.  You need to delete it out of them as well, which defeats the
idea of read only snapshots if you are using them.  For example today I
deleted redundant kernel modules from both the root subvolume and the
snapshots, and similar for /tmp, only then did I free up 55 GB (!) to
give myself some free space.  (I have been frequently updating my kernel
and left some debugging options on resulting in many copies of far
larger kernel modules than intended).

I've since made /tmp a subvolume to prevent snap-shotting to partially
mitigate this.  I'm wondering if I should make /lib/modules one for the
same reason.  In previous posts on this mailing list people have
recommended making various cache and tmp directories separate subvolumes
to reduce loss of available disk space by snapshotting churning files
that have little value for retention.  I'm wondering if some guide to
snapshotting is appropriate to make people aware of the management
actions that might be required?


> I have a horrid perl "one-liner" to do the thinning (caution! it deletes 
> subvolumes without asking!):
> 
> perl -e 'for(@ARGV){open($in,"-|",qw(btrfs subvolume 
> show),$_);$ts{$_}=(map{/: \t+(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \+\d{4})$/ 
> or die "error parsing: $_\n";0+`date --date "$1" +%s` or die 
> $!}grep{/Creation 
> time:/}<$in>)[0]}@s=sort{$ts{$b}<=>$ts{$a}}keys%ts;while(@s>2){($s1,$s2,$s3)=@s;if(($ts{$s1}-$ts{$s3})/2<(time-$ts{$s2})/12){system(qw(btrfs
>  subvolume delete),$s2);$s[1]=$s1};shift@s}' [snapshot ...]
> 
> (hey, everything can be a one-liner if you allow unlimited line length!)
> 

Now I've seen it I can't un-see it!
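For readability, here is the same thinning rule as a small C sketch (a
paraphrase of the one-liner, assuming timestamps sorted newest first;
deletion is left as a printf):

#include <stdio.h>
#include <time.h>

#define KEEP_FACTOR 12	/* start thinning hourlies after ~12 hours, etc. */

/*
 * Drop snap[i] whenever half the gap between its neighbours is smaller
 * than its age divided by KEEP_FACTOR; the newest and oldest snapshots
 * are always kept. Returns the new count.
 */
static int thin(time_t *snap, int n, time_t now)
{
	int i = 1;

	while (i + 1 < n) {
		time_t gap = snap[i - 1] - snap[i + 1];
		time_t age = now - snap[i];

		if (gap / 2 < age / KEEP_FACTOR) {
			printf("delete snapshot at %ld\n", (long)snap[i]);
			for (int j = i; j + 1 < n; j++)
				snap[j] = snap[j + 1];
			n--;
		} else {
			i++;
		}
	}
	return n;
}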





Re: [RFC PATCH v2 0/4] btrfs-progs: build distinct binaries for specific btrfs subcommands

2018-09-19 Thread Axel Burri
In Reply to:

On 30/08/2018 04.38, Misono Tomohiro wrote:
>
> Hello,
>
> Not directly related this series and just FYI,
> I'm working to allow sub show/list to non-privileged user as long
> as he can access to the subvolume:
>   https://www.spinics.net/lists/linux-btrfs/msg79285.html
>
> Hopefully this will be merged to master in near future
> (any comments from user/dev is welcome).
>
> Thanks,
> Misono
>

I found some time to play around with your patchset, it worked fine on
4.18.8-gentoo kernel.

As far as I can see, only "btrfs subvolume show" and "btrfs subvolume
list" makes sense for a regular user?

With the "cmds-separated-fscaps-v2" patchset [1], you can build
separated binaries as follows:

# make btrfs-subvolume-list.separated
# make btrfs-subvolume-show.separated

Now not everybody wants to install these with fscaps or setuid, but it
might also make sense to provide "/usr/bin/btrfs-subvolume-{show,list}",
as they now work for a regular user. Having both root/user binaries
concurrently is not an issue (e.g. in gentoo the full-featured btrfs
command is in "/sbin/").

Last time I checked, debian installs it to "/bin/btrfs", which from my
perspective seems to be the preferred location as soon as Misonos patch
is merged.

 [1] https://github.com/digint/btrfs-progs/tree/cmds-separated-fscaps-v2


Re: very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Nikolay Borisov



On 19.09.2018 23:11, Hans van Kranenburg wrote:
> On 09/19/2018 10:04 PM, Martin Steigerwald wrote:
>> Hans van Kranenburg - 19.09.18, 19:58:
>>> However, as soon as we remount the filesystem with space_cache=v2 -
>>>
 writes drop to just around 3-10 MB/s to each disk. If we remount to
 space_cache - lots of writes, system unresponsive. Again remount to
 space_cache=v2 - low writes, system responsive.

 That's a huuge, 10x overhead! Is it expected? Especially that
 space_cache=v1 is still the default mount option?
>>>
>>> Yes, that does not surprise me.
>>>
>>> https://events.static.linuxfound.org/sites/events/files/slides/vault20
>>> 16_0.pdf
>>>
>>> Free space cache v1 is the default because of issues with btrfs-progs,
>>> not because it's unwise to use the kernel code. I can totally
>>> recommend using it. The linked presentation above gives some good
>>> background information.
>>
>> What issues in btrfs-progs are that?
> 
> Missing code to make offline changes to a filesystem that has a free
> space tree. So when using btrfstune / repair / whatever you first need
> to remove the whole free space tree with a command, and then add it back
> on the next mount.

And as a matter of fact this code has already been published on the
mailing list for review, and even some parts got merged so we are in
good shape to get it into progs and eventually switch the kernel to
default to v2 cache.
> 
> For me personally that's not a problem (I don't have to make offline
> changes), but I understand that having that situation out of the box for
> every new user would be a bit awkward.
> 
>> I am wondering whether to switch to freespace tree v2. Would it provide 
>> benefit for a regular / and /home filesystems as dual SSD BTRFS RAID-1 
>> on a laptop?
> 
> As shown in the linked presentation, it provides benefit on a largeish
> filesystem and if your writes are touching a lot of different block
> groups (since v1 writes out the full space cache for all of them on
> every transaction commit). I'd say, it provides benefit as soon as you
> encounter filesystem delays because of it, and as soon as you see using
> it eases the pain a lot. So, yes, that's your case.
> 


Re: very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Hans van Kranenburg
On 09/19/2018 10:04 PM, Martin Steigerwald wrote:
> Hans van Kranenburg - 19.09.18, 19:58:
>> However, as soon as we remount the filesystem with space_cache=v2 -
>>
>>> writes drop to just around 3-10 MB/s to each disk. If we remount to
>>> space_cache - lots of writes, system unresponsive. Again remount to
>>> space_cache=v2 - low writes, system responsive.
>>>
>>> That's a huuge, 10x overhead! Is it expected? Especially that
>>> space_cache=v1 is still the default mount option?
>>
>> Yes, that does not surprise me.
>>
>> https://events.static.linuxfound.org/sites/events/files/slides/vault20
>> 16_0.pdf
>>
>> Free space cache v1 is the default because of issues with btrfs-progs,
>> not because it's unwise to use the kernel code. I can totally
>> recommend using it. The linked presentation above gives some good
>> background information.
> 
> What issues in btrfs-progs are that?

Missing code to make offline changes to a filesystem that has a free
space tree. So when using btrfstune / repair / whatever you first need
to remove the whole free space tree with a command, and then add it back
on the next mount.

For me personally that's not a problem (I don't have to make offline
changes), but I understand that having that situation out of the box for
every new user would be a bit awkward.

> I am wondering whether to switch to freespace tree v2. Would it provide 
> benefit for a regular / and /home filesystems as dual SSD BTRFS RAID-1 
> on a laptop?

As shown in the linked presentation, it provides benefit on a largeish
filesystem and if your writes are touching a lot of different block
groups (since v1 writes out the full space cache for all of them on
every transaction commit). I'd say, it provides benefit as soon as you
encounter filesystem delays because of it, and as soon as you see using
it eases the pain a lot. So, yes, that's your case.

-- 
Hans van Kranenburg


Re: very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Martin Steigerwald
Hans van Kranenburg - 19.09.18, 19:58:
> However, as soon as we remount the filesystem with space_cache=v2 -
> 
> > writes drop to just around 3-10 MB/s to each disk. If we remount to
> > space_cache - lots of writes, system unresponsive. Again remount to
> > space_cache=v2 - low writes, system responsive.
> > 
> > That's a huuge, 10x overhead! Is it expected? Especially that
> > space_cache=v1 is still the default mount option?
> 
> Yes, that does not surprise me.
> 
> https://events.static.linuxfound.org/sites/events/files/slides/vault20
> 16_0.pdf
> 
> Free space cache v1 is the default because of issues with btrfs-progs,
> not because it's unwise to use the kernel code. I can totally
> recommend using it. The linked presentation above gives some good
> background information.

What issues in btrfs-progs are that?

I am wondering whether to switch to freespace tree v2. Would it provide 
benefit for a regular / and /home filesystems as dual SSD BTRFS RAID-1 
on a laptop?

Thanks,
-- 
Martin




Re: btrfs send hangs after partial transfer and blocks all IO

2018-09-19 Thread Jürgen Herrmann

Am 13.9.2018 14:35, schrieb Nikolay Borisov:

On 13.09.2018 15:30, Jürgen Herrmann wrote:

OK, I will install kdump later and perform a dump after the hang.

One more noob question beforehand: does this dump contain sensitive
information, for example the luks encryption key for the disk etc? A
Google search only brings up one relevant search result which can only
be viewed with a redhat subscription...



So a kdump will dump the kernel memory, so it's possible that the LUKS
encryption keys could be extracted from that image. Bummer, it's
understandable why you wouldn't want to upload it :). In this case you'd
have to also install the 'crash' utility to open the crashdump and
extract the calltrace of the btrfs process. The rough process should be:



crash 'path to vmlinux' 'path to vmcore file', then once inside the
crash utility:

set <pid of the send process>; you can acquire the pid by issuing 'ps',
which will give you a ps-like output of all running processes at the
time of crash. After the context has been set you can run 'bt', which
will give you a backtrace of the send process.





Best regards,
Jürgen

Am 13. September 2018 14:02:11 schrieb Nikolay Borisov 
:



On 13.09.2018 14:50, Jürgen Herrmann wrote:
I was echoing "w" to /proc/sysrq_trigger every 0.5s which did work also
after the hang because I started the loop before the hang. The dmesg
output should show the hanging tasks from second 346 on or so. Still not
useful?



So from 346 it's evident that transaction commit is waiting for
commit_root_sem to be acquired. So something else is holding it and not
giving the transaction a chance to finish committing. Now the only place
where send acquires this lock is in find_extent_clone, around the call
to extent_from_logical. The latter basically does an extent tree search
and doesn't loop, so it can't possibly deadlock. Furthermore I don't see
any userspace processes being hung in kernel space.

Additionally, looking at the userspace processes, they indicate that
find_extent_clone has finished and they are blocked in send_write_or_clone,
which does the write. But I guess this actually happens before the hang.



So at this point, without looking at the stacktrace of the btrfs send
process after the hang has occurred, I don't think much can be done.


I know this is probably not the correct list to ask this question, but 
maybe one of the devs can point me to the right list?


I cannot get kdump to work. The crashkernel is loaded and everything is 
set up for it, afaict. I asked a question on this over at Stack Exchange, 
but no answer yet.

https://unix.stackexchange.com/questions/469838/linux-kdump-does-not-boot-second-kernel-when-kernel-is-crashing

So I did a little digging and added some debug printk() statements to 
see what's going on, and it seems that panic() is never called. Maybe the 
second stack trace is the reason?

Screenshot is here: https://t-5.eu/owncloud/index.php/s/OegsikXo4VFLTJN

Could someone please tell me where I can report this problem and get 
some help on this topic?


Best regards,
Jürgen

--
Jürgen Herrmann
https://t-5.eu
ALbertstraße 2
94327 Bogen


Re: btrfs send hangs after partial transfer and blocks all IO

2018-09-19 Thread Jürgen Herrmann

Am 13.9.2018 18:22, schrieb Chris Murphy:

(resend to all)

On Thu, Sep 13, 2018 at 9:44 AM, Nikolay Borisov  
wrote:



On 13.09.2018 18:30, Chris Murphy wrote:

This is the 2nd or 3rd thread containing hanging btrfs send, with
kernel 4.18.x. The subject of one is "btrfs send hung in pipe_wait"
and the other I can't find at the moment. In that case though the 
hang

is reproducible in 4.14.x and weirdly it only happens when a snapshot
contains (perhaps many) reflinks. Scrub and check lowmem find nothing
wrong.

I have snapshots with a few reflinks (cp --reflink and also
deduplication), and I see maybe 15-30 second hangs where nothing is
apparently happening (in top or iotop), but I'm also not seeing any
blocked tasks or high CPU usage. Perhaps in my case it's just
recovering quickly.

Are there any kernel config options in "# Debug Lockups and Hangs"
that might hint at what's going on? Some of these are enabled in
Fedora debug kernels, which are built practically daily, e.g. right
now the latest in the build system is 4.19.0-0.rc3.git2.1 - which
translates to git 54eda9df17f3.


If it's a lock-related problem then you need Lock Debugging => Lock
debugging: prove locking correctness


OK looks like that's under a different section as CONFIG_PROVE_LOCKING
which is enabled on Fedora debug kernels.


# Debug Lockups and Hangs
CONFIG_LOCKUP_DETECTOR=y
CONFIG_SOFTLOCKUP_DETECTOR=y
# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
CONFIG_HARDLOCKUP_DETECTOR_PERF=y
CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y
CONFIG_HARDLOCKUP_DETECTOR=y
# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0
# Lock Debugging (spinlocks, mutexes, etc...)
CONFIG_LOCK_DEBUGGING_SUPPORT=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_LOCKDEP=y
# CONFIG_DEBUG_LOCKDEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
CONFIG_LOCK_TORTURE_TEST=m


Hello again!

I have CONFIG_PROVE_LOCKING enabled in my kernel, but no change to
the observed behaviour.

Best regards, Jürgen


--
Jürgen Herrmann
https://t-5.eu
ALbertstraße 2
94327 Bogen


Re: GRUB writing to grubenv outside of kernel fs code

2018-09-19 Thread Austin S. Hemmelgarn

On 2018-09-19 15:08, Goffredo Baroncelli wrote:

On 18/09/2018 19.15, Goffredo Baroncelli wrote:

b. The bootloader code would have to have sophisticated enough Btrfs
knowledge to know if the grubenv has been reflinked or snapshot,
because even if +C, it may not be valid to overwrite, and COW must
still happen, and there's no way the code in GRUB can do full blow COW
and update a bunch of metadata.



And what if GRUB ignores the possibility of CoWing and overwrites the data? Is 
it such a big problem that the data is changed in all the snapshots?
It would be interesting to see if the same problem happens for a swap file.


I had a look at Sandoval's patches implementing swap on BTRFS. This patch set 
prevents the subvolume containing the swapfile from being snapshotted (and the 
file from being balanced, and so on...); what if we added the same constraint 
to the grubenv file?
We would need to have a generalized mechanism of doing this then, 
because there's no way in hell a patch special-casing a single filename 
is going to make it into mainline.


Whatever mechanism is used, it should also:

* Force the file to not be inlined in metadata.
* Enforce the file having the NOCOW attribute being set.


Re: GRUB writing to grubenv outside of kernel fs code

2018-09-19 Thread Goffredo Baroncelli
On 18/09/2018 19.15, Goffredo Baroncelli wrote:
>> b. The bootloader code would have to have sophisticated enough Btrfs
>> knowledge to know if the grubenv has been reflinked or snapshot,
>> because even if +C, it may not be valid to overwrite, and COW must
>> still happen, and there's no way the code in GRUB can do full blow COW
>> and update a bunch of metadata.

> And what if GRUB ignores the possibility of CoWing and overwrites the data? Is 
> it such a big problem that the data is changed in all the snapshots? 
> It would be interesting to see if the same problem happens for a swap file.

I took a look at Sandoval's patches implementing swap on BTRFS. This patch set
prevents the subvolume containing the swapfile from being snapshotted (and the
file from being balanced and so on...); what if we added the same constraint to
the grubenv file?


BR
G.Baroncelli
-- 
gpg @keyserver.linux.it: Goffredo Baroncelli 
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


Re: [PATCH v7 2/6] mm: export add_swap_extent()

2018-09-19 Thread Johannes Weiner
On Wed, Sep 19, 2018 at 11:28:00AM -0700, Omar Sandoval wrote:
> On Wed, Sep 19, 2018 at 02:09:09PM -0400, Johannes Weiner wrote:
> > On Tue, Sep 11, 2018 at 03:34:45PM -0700, Omar Sandoval wrote:
> > > From: Omar Sandoval 
> > > 
> > > Btrfs will need this for swap file support.
> > > 
> > > Signed-off-by: Omar Sandoval 
> > 
> > That looks reasonable. After reading the last patch, it's somewhat
> > understandable why you cannot simply implement ->bmap and use the
> > generic activation code. But it would be good to briefly explain here
> > why you can't, to justify this patch.
> 
> I'll rewrite it to:
> 
> Btrfs currently does not support swap files because swap's use of bmap
> does not work with copy-on-write and multiple devices. See 35054394c4b3
> ("Btrfs: stop providing a bmap operation to avoid swapfile
> corruptions"). However, the swap code has a mechanism for the filesystem
> to manually add swap extents using add_swap_extent() from the
> ->swap_activate() aop. iomap has done this since 67482129cdab ("iomap:
> add a swapfile activation function"). Btrfs will do the same in a later
> patch, so export add_swap_extent().

That explains it perfectly. Thanks!



Re: [PATCH v7 1/6] mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS

2018-09-19 Thread Johannes Weiner
On Wed, Sep 19, 2018 at 11:12:02AM -0700, Omar Sandoval wrote:
> On Wed, Sep 19, 2018 at 02:02:32PM -0400, Johannes Weiner wrote:
> > On Tue, Sep 11, 2018 at 03:34:44PM -0700, Omar Sandoval wrote:
> > > @@ -2411,8 +2412,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> > >  
> > >   if (mapping->a_ops->swap_activate) {
> > >   ret = mapping->a_ops->swap_activate(sis, swap_file, span);
> > > + if (ret >= 0)
> > > + sis->flags |= SWP_ACTIVATED;
> > >   if (!ret) {
> > > - sis->flags |= SWP_FILE;
> > > + sis->flags |= SWP_FS;
> > >   ret = add_swap_extent(sis, 0, sis->max, 0);
> > 
> > Won't this single, linear extent be in conflict with the discontiguous
> > extents you set up in your swap_activate callback in the last patch?
> 
> That's only in the case that ->swap_activate() returned 0, which only
> nfs_swap_activate() will do. btrfs_swap_activate() and
> iomap_swapfile_activate() both return the number of extents they set up.

Ah yes, I missed that.

That's a little under-documented I guess, but that's not your fault.


[PATCH 8/9] btrfs: Make more generic the code for RAID 6 rebuilding

2018-09-19 Thread Goffredo Baroncelli
From: Goffredo Baroncelli 

The original code which handles the recovery of a RAID 6 disk array
assumes that all reads are multiples of 1 << GRUB_DISK_SECTOR_BITS and
that all the I/O is done via the struct grub_diskfilter_segment.
This is not true for the btrfs code. In order to reuse the native
grub_raid6_recover() code, it is modified to not call
grub_diskfilter_read_node() directly, but to call a handler passed
as an argument.

Signed-off-by: Goffredo Baroncelli 
Reviewed-by: Daniel Kiper 
---
 grub-core/disk/raid6_recover.c | 52 ++
 include/grub/diskfilter.h  |  9 ++
 2 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/grub-core/disk/raid6_recover.c b/grub-core/disk/raid6_recover.c
index aa674f6ca..0cf691ddf 100644
--- a/grub-core/disk/raid6_recover.c
+++ b/grub-core/disk/raid6_recover.c
@@ -74,14 +74,26 @@ mod_255 (unsigned x)
 }
 
 static grub_err_t
-grub_raid6_recover (struct grub_diskfilter_segment *array, int disknr, int p,
-char *buf, grub_disk_addr_t sector, grub_size_t size)
+raid6_recover_read_node (void *data, int disknr,
+   grub_uint64_t sector,
+   void *buf, grub_size_t size)
+{
+struct grub_diskfilter_segment *array = data;
+
+return grub_diskfilter_read_node (&array->nodes[disknr],
+ (grub_disk_addr_t)sector,
+ size >> GRUB_DISK_SECTOR_BITS, buf);
+}
+
+grub_err_t
+grub_raid6_recover_gen (void *data, grub_uint64_t nstripes, int disknr, int p,
+   char *buf, grub_uint64_t sector, grub_size_t size,
+   int layout, raid_recover_read_t read_func)
 {
   int i, q, pos;
   int bad1 = -1, bad2 = -1;
   char *pbuf = 0, *qbuf = 0;
 
-  size <<= GRUB_DISK_SECTOR_BITS;
   pbuf = grub_zalloc (size);
   if (!pbuf)
 goto quit;
@@ -91,17 +103,17 @@ grub_raid6_recover (struct grub_diskfilter_segment *array, int disknr, int p,
 goto quit;
 
   q = p + 1;
-  if (q == (int) array->node_count)
+  if (q == (int) nstripes)
 q = 0;
 
   pos = q + 1;
-  if (pos == (int) array->node_count)
+  if (pos == (int) nstripes)
 pos = 0;
 
-  for (i = 0; i < (int) array->node_count - 2; i++)
+  for (i = 0; i < (int) nstripes - 2; i++)
 {
   int c;
-  if (array->layout & GRUB_RAID_LAYOUT_MUL_FROM_POS)
+  if (layout & GRUB_RAID_LAYOUT_MUL_FROM_POS)
c = pos;
   else
c = i;
@@ -109,8 +121,7 @@ grub_raid6_recover (struct grub_diskfilter_segment *array, int disknr, int p,
 bad1 = c;
   else
 {
-  if (! grub_diskfilter_read_node (&array->nodes[pos], sector,
-  size >> GRUB_DISK_SECTOR_BITS, buf))
+ if (!read_func(data, pos, sector, buf, size))
 {
   grub_crypto_xor (pbuf, pbuf, buf, size);
   grub_raid_block_mulx (c, buf, size);
@@ -128,7 +139,7 @@ grub_raid6_recover (struct grub_diskfilter_segment *array, int disknr, int p,
 }
 
   pos++;
-  if (pos == (int) array->node_count)
+  if (pos == (int) nstripes)
 pos = 0;
 }
 
@@ -139,16 +150,14 @@ grub_raid6_recover (struct grub_diskfilter_segment *array, int disknr, int p,
   if (bad2 < 0)
 {
   /* One bad device */
-  if ((! grub_diskfilter_read_node (&array->nodes[p], sector,
-   size >> GRUB_DISK_SECTOR_BITS, buf)))
+  if (!read_func(data, p, sector, buf, size))
 {
   grub_crypto_xor (buf, buf, pbuf, size);
   goto quit;
 }
 
   grub_errno = GRUB_ERR_NONE;
-  if (grub_diskfilter_read_node (&array->nodes[q], sector,
-size >> GRUB_DISK_SECTOR_BITS, buf))
+  if (read_func(data, q, sector, buf, size))
 goto quit;
 
   grub_crypto_xor (buf, buf, qbuf, size);
@@ -160,14 +169,12 @@ grub_raid6_recover (struct grub_diskfilter_segment *array, int disknr, int p,
   /* Two bad devices */
   unsigned c;
 
-  if (grub_diskfilter_read_node (&array->nodes[p], sector,
-size >> GRUB_DISK_SECTOR_BITS, buf))
+  if (read_func(data, p, sector, buf, size))
 goto quit;
 
   grub_crypto_xor (pbuf, pbuf, buf, size);
 
-  if (grub_diskfilter_read_node (&array->nodes[q], sector,
-size >> GRUB_DISK_SECTOR_BITS, buf))
+  if (read_func(data, q, sector, buf, size))
 goto quit;
 
   grub_crypto_xor (qbuf, qbuf, buf, size);
@@ -190,6 +197,15 @@ quit:
   return grub_errno;
 }
 
+static grub_err_t
+grub_raid6_recover (struct grub_diskfilter_segment *array, int disknr, int p,
+char *buf, grub_disk_addr_t sector, grub_size_t size)
+{
+  return grub_raid6_recover_gen (array, array->node_count, disknr, p, buf,
+sector, size << GRUB_DISK_SECTOR_BITS,
+
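
To see how a caller outside the diskfilter layer would use the new
interface, here is a minimal sketch (the context struct and the
btrfs_read_stripe() helper are assumptions for illustration, not code
from this series):

/* Hypothetical read callback for grub_raid6_recover_gen(); data points
 * at a caller-defined context instead of a grub_diskfilter_segment. */
static grub_err_t
btrfs_raid6_read_node (void *data, int disknr, grub_uint64_t sector,
		       void *buf, grub_size_t size)
{
  struct my_raid6_ctx *ctx = data;	/* assumed context type */

  /* Note: the generic code now passes a size in bytes, not sectors. */
  return btrfs_read_stripe (ctx, disknr, sector, buf, size);
}

/* Recovering a degraded RAID 6 read then becomes: */
err = grub_raid6_recover_gen (ctx, ctx->nstripes, disknr, p, buf,
			      sector, size, ctx->layout,
			      btrfs_raid6_read_node);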

Re: btrfs panic problem

2018-09-19 Thread Liu Bo
On Mon, Sep 17, 2018 at 5:28 PM, sunny.s.zhang  wrote:
> Hi All,
>
> My OS (4.1.12) panics in kmem_cache_alloc, which is called by
> btrfs_get_or_create_delayed_node.
>
> I found that the freelist of the slub is wrong.
>
> crash> struct kmem_cache_cpu 887e7d7a24b0
>
> struct kmem_cache_cpu {
>   freelist = 0x2026,   <<< the value is id of one inode
>   tid = 29567861,
>   page = 0xea0132168d00,
>   partial = 0x0
> }
>
> And I found two different btrfs inodes pointing to the same delayed_node. It
> means that the same slab object is used twice.
>
> I think this object was freed twice, so the freelist next pointer of this
> object points to itself. That is how we get the same object twice.
>
> When that object is used again, it breaks the freelist.
>
> The following code can lead to the delayed node being freed twice, but I
> haven't found the exact sequence of events.
>
> Process A (btrfs_evict_inode)            Process B
>
> call btrfs_remove_delayed_node           call btrfs_get_delayed_node
>
>                                          node = ACCESS_ONCE(btrfs_inode->delayed_node);
>
> BTRFS_I(inode)->delayed_node = NULL;
> btrfs_release_delayed_node(delayed_node);
>
>                                          if (node) {
>                                              atomic_inc(&node->refs);
>                                              return node;
>                                          }
>
>                                          ..
>
>                                          btrfs_release_delayed_node(delayed_node);
>

Looking at the race, it seems the following commit has addressed it.

btrfs: fix refcount_t usage when deleting btrfs_delayed_nodes
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ec35e48b286959991cdbb886f1bdeda4575c80b4
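
The core lookup-side pattern the fix relies on is roughly the following
(a paraphrased sketch, not the literal commit; see the link above for
the real change):

/* Paraphrased sketch: under root->inode_lock, only take a reference if
 * the node's refcount is still non-zero; a failed
 * refcount_inc_not_zero() means we raced with the final put and must
 * treat the node as already gone. */
spin_lock(&root->inode_lock);
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
if (node && !refcount_inc_not_zero(&node->refs))
	node = NULL;
spin_unlock(&root->inode_lock);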

thanks,
liubo


>
> 1313 void btrfs_remove_delayed_node(struct inode *inode)
> 1314 {
> 1315 struct btrfs_delayed_node *delayed_node;
> 1316
> 1317 delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
> 1318 if (!delayed_node)
> 1319 return;
> 1320
> 1321 BTRFS_I(inode)->delayed_node = NULL;
> 1322 btrfs_release_delayed_node(delayed_node);
> 1323 }
>
>
>   87 static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode
> *inode)
>   88 {
>   89 struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
>   90 struct btrfs_root *root = btrfs_inode->root;
>   91 u64 ino = btrfs_ino(inode);
>   92 struct btrfs_delayed_node *node;
>   93
>   94 node = ACCESS_ONCE(btrfs_inode->delayed_node);
>   95 if (node) {
>   96 atomic_inc(&node->refs);
>   97 return node;
>   98 }
>
>
> Thanks,
>
> Sunny
>
>
> PS:
>
> 
>
> panic informations
>
> PID: 73638  TASK: 887deb586200  CPU: 38  COMMAND: "dockerd"
>  #0 [88130404f940] machine_kexec at 8105ec10
>  #1 [88130404f9b0] crash_kexec at 811145b8
>  #2 [88130404fa80] oops_end at 8101a868
>  #3 [88130404fab0] no_context at 8106ea91
>  #4 [88130404fb00] __bad_area_nosemaphore at 8106ec8d
>  #5 [88130404fb50] bad_area_nosemaphore at 8106eda3
>  #6 [88130404fb60] __do_page_fault at 8106f328
>  #7 [88130404fbd0] do_page_fault at 8106f637
>  #8 [88130404fc10] page_fault at 816f6308
> [exception RIP: kmem_cache_alloc+121]
> RIP: 811ef019  RSP: 88130404fcc8  RFLAGS: 00010286
> RAX:   RBX:   RCX: 01c32b76
> RDX: 01c32b75  RSI:   RDI: 000224b0
> RBP: 88130404fd08   R8: 887e7d7a24b0   R9: 
> R10: 8802668b6618  R11: 0002  R12: 887e3e230a00
> R13: 2026  R14: 887e3e230a00  R15: a01abf49
> ORIG_RAX:   CS: 0010  SS: 0018
>  #9 [88130404fd10] btrfs_get_or_create_delayed_node at a01abf49
> [btrfs]
> #10 [88130404fd60] btrfs_delayed_update_inode at a01aea12
> [btrfs]
> #11 [88130404fdb0] btrfs_update_inode at a015b199 [btrfs]
> #12 [88130404fdf0] btrfs_dirty_inode at a015cd11 [btrfs]
> #13 [88130404fe20] btrfs_update_time at a015fa25 [btrfs]
> #14 [88130404fe50] touch_atime at 812286d3
> #15 [88130404fe90] iterate_dir at 81221929
> #16 [88130404fee0] sys_getdents64 at 81221a19
> #17 [88130404ff50] system_call_fastpath at 816f2594
> RIP: 006b68e4  RSP: 00c866259080  RFLAGS: 0246
> RAX: ffda  RBX: 00c828dbbe00  RCX: 006b68e4
> RDX: 1000  RSI: 00c83da14000  RDI: 0011
> RBP:    R8:    R9: 
> R10:   R11: 0246  R12: 00c7
> R13: 02174e74  R14: 0555  R15: 0038
> ORIG_RAX: 00d9  CS: 0033  SS: 002b
>
>
> We also see list double-add warnings, for both n_list and p_list:
>
> [8642921.110568] [ cut here ]
> [8642921.167929] WARNING: CPU: 38 PID: 73638 at lib/list_debug.c:33
> __list_add+0xbe/0xd0()
> [8642921.263780] list_add 

Re: [PATCH v7 2/6] mm: export add_swap_extent()

2018-09-19 Thread Omar Sandoval
On Wed, Sep 19, 2018 at 02:09:09PM -0400, Johannes Weiner wrote:
> On Tue, Sep 11, 2018 at 03:34:45PM -0700, Omar Sandoval wrote:
> > From: Omar Sandoval 
> > 
> > Btrfs will need this for swap file support.
> > 
> > Signed-off-by: Omar Sandoval 
> 
> That looks reasonable. After reading the last patch, it's somewhat
> understandable why you cannot simply implement ->bmap and use the
> generic activation code. But it would be good to briefly explain here
> why you can't, to justify this patch.

I'll rewrite it to:

Btrfs currently does not support swap files because swap's use of bmap
does not work with copy-on-write and multiple devices. See 35054394c4b3
("Btrfs: stop providing a bmap operation to avoid swapfile
corruptions"). However, the swap code has a mechanism for the filesystem
to manually add swap extents using add_swap_extent() from the
->swap_activate() aop. iomap has done this since 67482129cdab ("iomap:
add a swapfile activation function"). Btrfs will do the same in a later
patch, so export add_swap_extent().


Re: [PATCH v7 1/6] mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS

2018-09-19 Thread Omar Sandoval
On Wed, Sep 19, 2018 at 02:02:32PM -0400, Johannes Weiner wrote:
> On Tue, Sep 11, 2018 at 03:34:44PM -0700, Omar Sandoval wrote:
> > @@ -2411,8 +2412,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
> >  
> > if (mapping->a_ops->swap_activate) {
> > ret = mapping->a_ops->swap_activate(sis, swap_file, span);
> > +   if (ret >= 0)
> > +   sis->flags |= SWP_ACTIVATED;
> > if (!ret) {
> > -   sis->flags |= SWP_FILE;
> > +   sis->flags |= SWP_FS;
> > ret = add_swap_extent(sis, 0, sis->max, 0);
> 
> Won't this single, linear extent be in conflict with the discontiguous
> extents you set up in your swap_activate callback in the last patch?

That's only in the case that ->swap_activate() returned 0, which only
nfs_swap_activate() will do. btrfs_swap_activate() and
iomap_swapfile_activate() both return the number of extents they set up.
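
Concretely, the resulting control flow in setup_swap_extents() is (the
code is the quoted hunk after applying the patch; the comments are
annotations, not part of the source):

	if (mapping->a_ops->swap_activate) {
		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
		if (ret >= 0)
			/* the callback exists and did not fail */
			sis->flags |= SWP_ACTIVATED;
		if (!ret) {
			/* ret == 0 (NFS): the callback added no extents,
			 * so fall back to a single linear extent. */
			sis->flags |= SWP_FS;
			ret = add_swap_extent(sis, 0, sis->max, 0);
		}
		/* ret > 0 (btrfs, iomap): the callback already added its
		 * own discontiguous extents via add_swap_extent(). */
	}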


Re: [PATCH v7 2/6] mm: export add_swap_extent()

2018-09-19 Thread Johannes Weiner
On Tue, Sep 11, 2018 at 03:34:45PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> Btrfs will need this for swap file support.
> 
> Signed-off-by: Omar Sandoval 

That looks reasonable. After reading the last patch, it's somewhat
understandable why you cannot simply implement ->bmap and use the
generic activation code. But it would be good to briefly explain here
why you can't, to justify this patch.


Re: [PATCH v7 1/6] mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS

2018-09-19 Thread Johannes Weiner
On Tue, Sep 11, 2018 at 03:34:44PM -0700, Omar Sandoval wrote:
> @@ -2411,8 +2412,10 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
>  
>   if (mapping->a_ops->swap_activate) {
>   ret = mapping->a_ops->swap_activate(sis, swap_file, span);
> + if (ret >= 0)
> + sis->flags |= SWP_ACTIVATED;
>   if (!ret) {
> - sis->flags |= SWP_FILE;
> + sis->flags |= SWP_FS;
>   ret = add_swap_extent(sis, 0, sis->max, 0);

Won't this single, linear extent be in conflict with the discontiguous
extents you set up in your swap_activate callback in the last patch?


Re: very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Hans van Kranenburg
Hi,

On 09/19/2018 10:43 AM, Tomasz Chmielewski wrote:
> I have a mysql slave which writes to a RAID-1 btrfs filesystem (with
> 4.17.14 kernel) on 3 x ~1.9 TB SSD disks; filesystem is around 40% full.
> 
> The slave receives around 0.5-1 MB/s of data from the master over the
> network, which is then saved to MySQL's relay log and executed. In ideal
> conditions (i.e. no filesystem overhead) we should expect some 1-3 MB/s
> of data written to disk.
> 
> MySQL directory and files in it are chattr +C (since the directory was
> created, so all files are really +C); there are no snapshots.
> 
> 
> Now, an interesting thing.
> 
> When the filesystem is mounted with these options in fstab:
> 
> defaults,noatime,discard
> 
> We can see a *constant* write of 25-100 MB/s to each disk. The system is
> generally unresponsive and it sometimes takes long seconds for a simple
> command executed in bash to return.

Did you already test the difference with/without 'discard'? Also, I
think that depending on the tooling you use to view disk IO, discards
may also show up in the disk write statistics.

> However, as soon as we remount the filesystem with space_cache=v2 -
> writes drop to just around 3-10 MB/s to each disk. If we remount to
> space_cache - lots of writes, system unresponsive. Again remount to
> space_cache=v2 - low writes, system responsive.
> 
> That's a huuge, 10x overhead! Is it expected? Especially that
> space_cache=v1 is still the default mount option?

Yes, that does not surprise me.

https://events.static.linuxfound.org/sites/events/files/slides/vault2016_0.pdf

Free space cache v1 is the default because of issues with btrfs-progs,
not because it's unwise to use the kernel code. I can totally recommend
using it. The linked presentation above gives some good background
information.

Another interesting thing is finding out exactly what btrfs is writing
when it's pushing that many MB/s to disk. Doing this is not trivial.

I've been spending quite some time researching these kind of issues.

Here's what I found out:
https://www.spinics.net/lists/linux-btrfs/msg70624.html (oh wow, that's
almost a year ago already)

There are a bunch of tracepoints in the kernel code that could help
debugging all of this more, but I've not yet gotten around to writing
something to conveniently to use them to live show what's happening.

I'm still using the way of combining extent allocators in btrfs that is
described under "Thanks to a bug, solved in [2]" in the above mailing
list post, to keep things workable on the larger filesystem.

-- 
Hans van Kranenburg


Re: [PATCH v7 0/6] Btrfs: implement swap file support

2018-09-19 Thread Omar Sandoval
On Tue, Sep 11, 2018 at 03:34:43PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> Hi,
> 
> This series implements swap file support for Btrfs.
> 
> Changes from v6 [1]:
> 
> - Moved btrfs_get_chunk_map() comment to function body
> - Added more comments about pinned block group/device rbtree
> - Fixed bug in patch 4 which broke resize
> 
> Based on v4.19-rc3.
> 
> Thanks!
> 
> 1: https://www.spinics.net/lists/linux-btrfs/msg81732.html
> 
> Omar Sandoval (6):
>   mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
>   mm: export add_swap_extent()
>   vfs: update swap_{,de}activate documentation
>   Btrfs: prevent ioctls from interfering with a swap file
>   Btrfs: rename get_chunk_map() and make it non-static
>   Btrfs: support swap files
> 
>  Documentation/filesystems/Locking |  17 +-
>  Documentation/filesystems/vfs.txt |  12 +-
>  fs/btrfs/ctree.h  |  29 +++
>  fs/btrfs/dev-replace.c|   8 +
>  fs/btrfs/disk-io.c|   4 +
>  fs/btrfs/inode.c  | 317 ++
>  fs/btrfs/ioctl.c  |  31 ++-
>  fs/btrfs/relocation.c |  18 +-
>  fs/btrfs/volumes.c|  82 ++--
>  fs/btrfs/volumes.h|   2 +
>  include/linux/swap.h  |  13 +-
>  mm/page_io.c  |   6 +-
>  mm/swapfile.c |  14 +-
>  13 files changed, 502 insertions(+), 51 deletions(-)

Ping, any other comments on this version?


Re: btrfs panic problem

2018-09-19 Thread Nikolay Borisov



On 19.09.2018 02:53, sunny.s.zhang wrote:
> Hi Duncan,
> 
> Thank you for your advice. I understand what you mean. But I have
> reviewed the latest btrfs code, and I think the issue still exists.
> 
> At line 71, if btrfs_get_delayed_node runs past this line and we then
> switch to another process, which runs past line 1282 and releases the
> delayed node at the end,
> 
> then when we switch back to btrfs_get_delayed_node, it finds that the
> node is not NULL and uses it as normal. That means we use freed memory,
> 
> and at some point this memory will be freed again.
> 
> latest code as below.
> 
> 1278 void btrfs_remove_delayed_node(struct btrfs_inode *inode)
> 1279 {
> 1280 struct btrfs_delayed_node *delayed_node;
> 1281
> 1282 delayed_node = READ_ONCE(inode->delayed_node);
> 1283 if (!delayed_node)
> 1284 return;
> 1285
> 1286 inode->delayed_node = NULL;
> 1287 btrfs_release_delayed_node(delayed_node);
> 1288 }
> 
> 
>   64 static struct btrfs_delayed_node *btrfs_get_delayed_node(
>   65 struct btrfs_inode *btrfs_inode)
>   66 {
>   67 struct btrfs_root *root = btrfs_inode->root;
>   68 u64 ino = btrfs_ino(btrfs_inode);
>   69 struct btrfs_delayed_node *node;
>   70
>   71 node = READ_ONCE(btrfs_inode->delayed_node);
>   72 if (node) {
>   73 refcount_inc(&node->refs);
>   74 return node;
>   75 }
>   76
>   77 spin_lock(&root->inode_lock);
>   78 node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
> 
> 

Your analysis is correct; however, it's missing one crucial point:
btrfs_remove_delayed_node is called only from btrfs_evict_inode, and
inodes are evicted only when all other references have been dropped. Check
the code in evict_inodes() - inodes are added to the dispose list when
their i_count is 0, at which point there should be no outstanding
references to the inode. This invalidates your analysis...
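
For reference, the check in question looks roughly like this
(abbreviated sketch of evict_inodes() from fs/inode.c, not an exact
quote):

/* An inode is only added to the dispose list (and later evicted) once
 * i_count is zero, i.e. nothing can still hold a reference by the time
 * btrfs_evict_inode() eventually runs. */
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
	if (atomic_read(&inode->i_count))
		continue;
	spin_lock(&inode->i_lock);
	if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
		spin_unlock(&inode->i_lock);
		continue;
	}
	inode->i_state |= I_FREEING;
	inode_lru_list_del(inode);
	spin_unlock(&inode->i_lock);
	list_add(&inode->i_lru, &dispose);
}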

> On 2018-09-18 13:05, Duncan wrote:
>> sunny.s.zhang posted on Tue, 18 Sep 2018 08:28:14 +0800 as excerpted:
>>
>>> My OS (4.1.12) panics in kmem_cache_alloc, which is called by
>>> btrfs_get_or_create_delayed_node.
>>>
>>> I found that the freelist of the slub is wrong.
>> [Not a dev, just a btrfs list regular and user, myself.  But here's a
>> general btrfs list recommendations reply...]
>>
>> You appear to mean kernel 4.1.12 -- confirmed by the version reported in
>> the posted dump:  4.1.12-112.14.13.el6uek.x86_64
>>
>> OK, so from the perspective of this forward-development-focused list,
>> kernel 4.1 is pretty ancient history, but you do have a number of
>> options.
>>
>> First let's consider the general situation.  Most people choose an
>> enterprise distro for supported stability, and that's certainly a valid
>> thing to want.  However, btrfs, while now reaching early maturity for the
>> basics (single device in single or dup mode, and multi-device in single/
>> raid0/1/10 modes, note that raid56 mode is newer and less mature),
>> remains under quite heavy development, and keeping reasonably current is
>> recommended for that reason.
>>
>> So you chose an enterprise distro presumably to lock in supported
>> stability for several years, but you chose a filesystem, btrfs, that's
>> still under heavy development, with reasonably current kernels and
>> userspace recommended as tending to have the known bugs fixed.  There's a
>> bit of a conflict there, and the /general/ recommendation would thus be
>> to consider whether one or the other of those choices are inappropriate
>> for your use-case, because it's really quite likely that if you really
>> want the stability of an enterprise distro and kernel, that btrfs isn't
>> as stable a filesystem as you're likely to want to match with it.
>> Alternatively, if you want something newer to match the still under heavy
>> development btrfs, you very likely want a distro that's not focused on
>> years-old stability just for the sake of it.  One or the other is likely
>> to be a poor match for your needs, and choosing something else that's a
>> better match is likely to be a much better experience for you.
>>
>> But perhaps you do have reason to want to run the newer and not quite to
>> traditional enterprise-distro level stability btrfs, on an otherwise
>> older and very stable enterprise distro.  That's fine, provided you know
>> what you're getting yourself into, and are prepared to deal with it.
>>
>> In that case, for best support from the list, we'd recommend running one
>> of the latest two kernels in either the current or mainline LTS tracks.
>>
>> For current track, With 4.18 being the latest kernel, that'd be 4.18 or
>> 4.17, as available on kernel.org (tho 4.17 is already EOL, no further
>> releases, at 4.17.19).
>>
>> For mainline-LTS track, 4.14 and 4.9 are the latest two LTS series
>> kernels, tho IIRC 4.19 is scheduled to be this year's LTS (or was it 4.18
>> and it's just not out of normal 

state of btrfs snapshot limitations?

2018-09-19 Thread Piotr Pawłow
Hello,
> If the limit is 100 or less I'd need to use a more complicated
> rotation scheme.

If you just want to thin them out over time without having selected "special" 
monthly, yearly etc snapshots, then my favorite scheme is to just compare the 
age of a snapshot to the distance to its neighbours, and if the distance is 
less than age / constant then delete it. If the constant is, for example, 12, 
then it will start thinning out hourly snapshots after around 12 hours, monthly 
after 12 months etc.

This is how it looks after 2 years with daily snapshots and the constant=6:

backup-20160328143825
backup-20161210043001
backup-20170424033001
backup-20170830033001
backup-20171102043001
backup-20180105043001
backup-20180310043001
backup-20180411033001
backup-20180513033001
backup-20180614033001
backup-20180630033001
backup-20180716033001
backup-20180801033001
backup-20180809033001
backup-20180817033001
backup-20180825033001
backup-20180829033001
backup-20180902033001
backup-20180906033001
backup-20180908033001
backup-20180910033001
backup-20180912033001
backup-20180914033001
backup-20180915033001
backup-20180916033001
backup-20180917033001
backup-20180918033001
backup-20180919033001

Notice how I have 6 daily snapshots (from 09-14 to 09-19), then I have at least 
1 snapshot from each month 6 months back (04 to 09) and I would have at least 1 
snapshot from each year for 6 years if I kept them longer. I delete the oldest 
snapshot when free space gets too low.

I have a horrid perl "one-liner" to do the thinning (caution! it deletes 
subvolumes without asking!):

perl -e 'for(@ARGV){open($in,"-|",qw(btrfs subvolume show),$_);$ts{$_}=(map{/: 
\t+(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \+\d{4})$/ or die "error parsing: 
$_\n";0+`date --date "$1" +%s` or die $!}grep{/Creation 
time:/}<$in>)[0]}@s=sort{$ts{$b}<=>$ts{$a}}keys%ts;while(@s>2){($s1,$s2,$s3)=@s;if(($ts{$s1}-$ts{$s3})/2<(time-$ts{$s2})/12){system(qw(btrfs
 subvolume delete),$s2);$s[1]=$s1};shift@s}' [snapshot ...]

(hey, everything can be a one-liner if you allow unlimited line length!)

I will take this opportunity to tidy it up a bit (below). Maybe someone else 
will find it useful or have some ideas for improvements. I would really like to 
avoid parsing "btrfs subvolume show" output (maybe python-btrfs can read 
subvolume creation time?)

#!/usr/bin/perl
use strict;
use warnings;

# map snapshot names to timestamps
my %ts;
for (@ARGV) {
    # run "btrfs subvolume show" for each snapshot
    open( my $in, "-|", qw(btrfs subvolume show), $_ );
    # convert "Creation time" from btrfs output to timestamp
    $ts{$_} = (
    map {
    /: \t+(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \+\d{4})$/
  or die "error parsing: $_\n";
    # using "date" command to parse "Creation time"
    # who needs modules from CPAN right? ;)
    0 + `date --date "$1" +%s` or die $!
    } grep { /Creation time:/ } <$in>
    )[0];
}
# sort snapshot names by timestamps
my @s = sort { $ts{$b} <=> $ts{$a} } keys %ts;
while ( @s > 2 ) {
    my ( $s1, $s2, $s3 ) = @s;
    # compare average distance to age / 12
    # change 12 to some other value to keep more or less snapshots
    # higher value = more snapshots
    if ( ( $ts{$s1} - $ts{$s3} ) / 2 < ( time - $ts{$s2} ) / 12 ) {
    # caution! this runs "btrfs subvolume delete"
    # you can put "echo" before "btrfs" for a "dry run"
    system( qw(btrfs subvolume delete), $s2 );
    # we deleted $s2 snapshot, so replace $s2 with $s1
    $s[1] = $s1;
    }
    shift @s;
}



Re: cannot mount btrfs as root

2018-09-19 Thread Qu Wenruo


On 2018/9/19 9:19 PM, Zbigniew 'zibi' Jarosik wrote:
> Hi!
> 
> I can't mount my RAID1 set as rootfs, but a normal mount on a running
> system works fine. It was working normally, then one day it died.
> 
> When booted to rootfs as
> root=UUID=98c94774-d93a-400e-a275-58cc3ac2a58a rootflags=subvol=@root
> i got:
> 
> BTRFS error (device bcache2): devid 5 uuid
> dd09f810-717e-4f2f-97ab-26469d5adca5 is missing
> BTRFS error (device bcache2): failed to read the system array: -2

One of your devices is missing, thus read_one_dev() refuses to continue
and returns -ENOENT (-2).

Does the initramfs do a "btrfs dev scan" to populate the btrfs device
list?

> BTRFS error (device bcache2): open_ctree failed
> mount: mounting /dev/bcache2 on /root failed: Invalid argument
> 
> and (initramfs) prompt.
> 
> Bcache devices are registered and populated in /dev.
> 
> When I boot exactly the same kernel and initrd, but rootfs is on a
> pendrive ( root=/dev/sda2 ), then mounting the filesystem on a subdir ( mount UUID=
> 98c94774-d93a-400e-a275-58cc3ac2a58a  /data ) works flawlessly.
> 
> Thanks in advance for any ideas.
> 
> Some system info:
> 
> /dev/sdb1: UUID="780cc211-2408-4594-83df-379ee23d6ed8" TYPE="bcache"
> PARTLABEL="Linux filesystem"
> PARTUUID="ef83f1b4-b55c-4deb-8037-69ff16909222"
> /dev/sdc1: UUID="80a1b115-8a1f-41f7-b65b-dd4fdaf400f1" TYPE="bcache"
> PARTLABEL="Linux filesystem"
> PARTUUID="1f920d16-af68-440b-891c-5ff61c3057ca"
> /dev/sdd1: UUID="a40e80dc-f7e2-4a9c-9bfe-81489a1261ca" TYPE="bcache"
> PARTLABEL="Linux filesystem"
> PARTUUID="4b68c314-f029-402a-b332-b0b9109ad51d"
> /dev/bcache0: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
> UUID_SUB="dd09f810-717e-4f2f-97ab-26469d5adca5" TYPE="btrfs"

The device is indeed here, so it's not actually missing; it looks like
at mount time btrfs hadn't yet scanned all devices.

You could refer to your distribution's manual to figure out how to make
the btrfs scan happen before mounting the rootfs.

Thanks,
Qu

> /dev/bcache1: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
> UUID_SUB="8469962d-f9ee-4b5f-b4d5-4a62190c0d8f" TYPE="btrfs"
> /dev/bcache2: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
> UUID_SUB="516f2bd1-3fcc-42fd-87ad-5d5fa2f84b5f" TYPE="btrfs"
> 
> Linux hp1 4.17.11-ldkb #2 SMP Thu Aug 2 09:24:58 CEST 2018 x86_64 GNU/Linux
> btrfs-progs v4.7.3
> Label: none  uuid: 98c94774-d93a-400e-a275-58cc3ac2a58a
> Total devices 3 FS bytes used 3.21TiB
> devid3 size 2.73TiB used 2.07TiB path /dev/bcache2
> devid4 size 2.73TiB used 2.18TiB path /dev/bcache1
> devid5 size 2.73TiB used 2.18TiB path /dev/bcache0
> Data, RAID1: total=3.20TiB, used=3.20TiB
> System, RAID1: total=32.00MiB, used=480.00KiB
> Metadata, RAID1: total=12.00GiB, used=10.81GiB
> GlobalReserve, single: total=512.00MiB, used=0.00B
> 
> 





cannot mount btrfs as root

2018-09-19 Thread Zbigniew 'zibi' Jarosik
Hi!

I can't mount my RAID1 set as rootfs, but a normal mount on a running
system works fine. It was working normally, then one day it died.

When booted to rootfs as
root=UUID=98c94774-d93a-400e-a275-58cc3ac2a58a rootflags=subvol=@root
i got:

BTRFS error (device bcache2): devid 5 uuid
dd09f810-717e-4f2f-97ab-26469d5adca5 is missing
BTRFS error (device bcache2): failed to read the system array: -2
BTRFS error (device bcache2): open_ctree failed
mount: mounting /dev/bcache2 on /root failed: Invalid argument

and (initramfs) prompt.

Bcache devices are registered and populated in /dev.

When I boot exactly the same kernel and initrd, but rootfs is on a
pendrive ( root=/dev/sda2 ), then mounting the filesystem on a subdir ( mount UUID=
98c94774-d93a-400e-a275-58cc3ac2a58a  /data ) works flawlessly.

Thanks in advance for any ideas.

Some system info:

/dev/sdb1: UUID="780cc211-2408-4594-83df-379ee23d6ed8" TYPE="bcache"
PARTLABEL="Linux filesystem"
PARTUUID="ef83f1b4-b55c-4deb-8037-69ff16909222"
/dev/sdc1: UUID="80a1b115-8a1f-41f7-b65b-dd4fdaf400f1" TYPE="bcache"
PARTLABEL="Linux filesystem"
PARTUUID="1f920d16-af68-440b-891c-5ff61c3057ca"
/dev/sdd1: UUID="a40e80dc-f7e2-4a9c-9bfe-81489a1261ca" TYPE="bcache"
PARTLABEL="Linux filesystem"
PARTUUID="4b68c314-f029-402a-b332-b0b9109ad51d"
/dev/bcache0: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
UUID_SUB="dd09f810-717e-4f2f-97ab-26469d5adca5" TYPE="btrfs"
/dev/bcache1: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
UUID_SUB="8469962d-f9ee-4b5f-b4d5-4a62190c0d8f" TYPE="btrfs"
/dev/bcache2: UUID="98c94774-d93a-400e-a275-58cc3ac2a58a"
UUID_SUB="516f2bd1-3fcc-42fd-87ad-5d5fa2f84b5f" TYPE="btrfs"

Linux hp1 4.17.11-ldkb #2 SMP Thu Aug 2 09:24:58 CEST 2018 x86_64 GNU/Linux
btrfs-progs v4.7.3
Label: none  uuid: 98c94774-d93a-400e-a275-58cc3ac2a58a
Total devices 3 FS bytes used 3.21TiB
devid3 size 2.73TiB used 2.07TiB path /dev/bcache2
devid4 size 2.73TiB used 2.18TiB path /dev/bcache1
devid5 size 2.73TiB used 2.18TiB path /dev/bcache0
Data, RAID1: total=3.20TiB, used=3.20TiB
System, RAID1: total=32.00MiB, used=480.00KiB
Metadata, RAID1: total=12.00GiB, used=10.81GiB
GlobalReserve, single: total=512.00MiB, used=0.00B


-- 
Zbigniew 'zibi' Jarosik
cell: +48 667956686
jid: tap...@gmail.com
gg: 2830
http://zibi.nora.pl/


Re: very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Remi Gauvin
On 2018-09-19 04:43 AM, Tomasz Chmielewski wrote:
> I have a mysql slave which writes to a RAID-1 btrfs filesystem (with
> 4.17.14 kernel) on 3 x ~1.9 TB SSD disks; filesystem is around 40% full.
> 
> The slave receives around 0.5-1 MB/s of data from the master over the
> network, which is then saved to MySQL's relay log and executed. In ideal
> conditions (i.e. no filesystem overhead) we should expect some 1-3 MB/s
> of data written to disk.
> 
> MySQL directory and files in it are chattr +C (since the directory was
> created, so all files are really +C); there are no snapshots.

Not related to the issue you are reporting, but I thought it's worth
mentioning (since not many do) that using chattr +C on a BTRFS RAID 1
is a dangerous thing.  Without COW, the 2 copies are never synchronized,
even if a scrub is executed.  So any kind of unclean shutdown that
interrupts writes (not to mention the extreme of a temporarily
disconnected drive) will result in files that are inconsistent (i.e.,
depending on which disk happens to be read at the time, the data will be
different on each read).



Re: [PATCH RFC] btrfs: delayed-inode: Use spinlock to protect btrfs_inode::delayed_node

2018-09-19 Thread Nikolay Borisov



On 19.09.2018 09:59, Qu Wenruo wrote:
> In the following case, we could trigger a use-after-free bug:
> 
>  CPU0                             |  CPU1
> ----------------------------------+-----------------------------------
> btrfs_remove_delayed_node         | btrfs_get_delayed_node
> |- delayed_node =                 | |- node = btrfs_inode->delayed_node;
> |  btrfs_inode->delayed_node      | |
> |- btrfs_release_delayed_node()   | |
>    |- refcount_dec_and_test()     | |
>    |- kmem_cache_free()           | |
>       Now delayed node is freed   | |
>                                   | |- refcount_inc(&node->refs)
> 


btrfs_remove_delayed_node is called from evict_inode, which is called
once the inode has been freed and there are no more references to this
inode (inode->i_count is 0). Also, before calling
btrfs_remove_delayed_node we have flushed all the pages and ordered
extents. So the crucial missing bit of information is: what is the
higher-level operation that requests the delayed node for a freed inode?



Re: very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Qu Wenruo


On 2018/9/19 4:43 PM, Tomasz Chmielewski wrote:
> I have a mysql slave which writes to a RAID-1 btrfs filesystem (with
> 4.17.14 kernel) on 3 x ~1.9 TB SSD disks; filesystem is around 40% full.

This sounds a little concerning.
Not about the usage percentage itself, but about the size, and how many
free space caches could be updated in each transaction.

Detail will follow below.

> 
> The slave receives around 0.5-1 MB/s of data from the master over the
> network, which is then saved to MySQL's relay log and executed. In ideal
> conditions (i.e. no filesystem overhead) we should expect some 1-3 MB/s
> of data written to disk.
> 
> MySQL directory and files in it are chattr +C (since the directory was
> created, so all files are really +C); there are no snapshots.

I'm not familiar with the space cache nor the MySQL workload, but at
least we don't need to worry about extra data CoW.

> 
> 
> Now, an interesting thing.
> 
> When the filesystem is mounted with these options in fstab:
> 
> defaults,noatime,discard
> 
> 
> We can see a *constant* write of 25-100 MB/s to each disk. The system is
> generally unresponsive and it sometimes takes long seconds for a simple
> command executed in bash to return.

The main concern here is how many metadata block groups are involved in
one transaction.

From my observation, although free space cache files (v1 space cache)
are marked NODATACOW, they in fact get updated in a COW fashion.

This means that if, say, 100 metadata block groups get updated, we need
to write around 12M of data (roughly 120KiB per block group) just for
the space cache.

On the other hand, if we fix the v1 space cache to really do NODATACOW,
it should hugely reduce the IO for the free space cache.

> 
> 
> However, as soon as we remount the filesystem with space_cache=v2 -
> writes drop to just around 3-10 MB/s to each disk. If we remount to
> space_cache - lots of writes, system unresponsive. Again remount to
> space_cache=v2 - low writes, system responsive.

Have you tried nospace_cache? I think it should behave a little worse
than the v2 space cache but much better than the *broken* v1 space cache.

And the v2 space cache is already based on a btrfs btree, which gets
CoWed like all other btrfs btrees, so there is no need to update the
whole space cache for each metadata block group. (Although in theory,
the overhead should still be larger than that of a *working* v1 cache.)

Thanks,
Qu

> 
> 
> That's a huuge, 10x overhead! Is it expected? Especially that
> space_cache=v1 is still the default mount option?
> 
> 
> Tomasz Chmielewski
> https://lxadm.com





very poor performance / a lot of writes to disk with space_cache (but not with space_cache=v2)

2018-09-19 Thread Tomasz Chmielewski
I have a mysql slave which writes to a RAID-1 btrfs filesystem (with 
4.17.14 kernel) on 3 x ~1.9 TB SSD disks; filesystem is around 40% full.


The slave receives around 0.5-1 MB/s of data from the master over the 
network, which is then saved to MySQL's relay log and executed. In ideal 
conditions (i.e. no filesystem overhead) we should expect some 1-3 MB/s 
of data written to disk.


MySQL directory and files in it are chattr +C (since the directory was 
created, so all files are really +C); there are no snapshots.



Now, an interesting thing.

When the filesystem is mounted with these options in fstab:

defaults,noatime,discard


We can see a *constant* write of 25-100 MB/s to each disk. The system is 
generally unresponsive and it sometimes takes long seconds for a simple 
command executed in bash to return.



However, as soon as we remount the filesystem with space_cache=v2 - 
writes drop to just around 3-10 MB/s to each disk. If we remount to 
space_cache - lots of writes, system unresponsive. Again remount to 
space_cache=v2 - low writes, system responsive.



That's a huuge, 10x overhead! Is it expected? Especially that 
space_cache=v1 is still the default mount option?



Tomasz Chmielewski
https://lxadm.com


Re: [PATCH RFC] btrfs: delayed-inode: Use spinlock to protect btrfs_inode::delayed_node

2018-09-19 Thread Qu Wenruo


On 2018/9/19 2:59 PM, Qu Wenruo wrote:
> In the following case, we could trigger a use-after-free bug:
> 
>  CPU0                             |  CPU1
> ----------------------------------+-----------------------------------
> btrfs_remove_delayed_node         | btrfs_get_delayed_node
> |- delayed_node =                 | |- node = btrfs_inode->delayed_node;
> |  btrfs_inode->delayed_node      | |
> |- btrfs_release_delayed_node()   | |
>    |- refcount_dec_and_test()     | |
>    |- kmem_cache_free()           | |
>       Now delayed node is freed   | |
>                                   | |- refcount_inc(&node->refs)
> 
> In that case, since delayed_node uses a kmem cache, such a use-after-free
> bug won't directly cause a problem, but it could corrupt the data
> structures of other kmem cache users.
> 
> Fix it by adding btrfs_inode::delayed_node_lock to protect such
> operation.
> 
> Reported-by: sunny.s.zhang 
> Signed-off-by: Qu Wenruo 
> ---
> Please don't merge this patch yet.

False alert.

The performance degradation is a false alert, and it's pretty awkward.

Before this test run, I refilled TEST_DEV with a special file layout
(for my qgroup balance test) to increase balance/qgroup overhead.
And the file layout also turns out to be pretty heavy for btrfs check,
which makes the test time increase.

Since it's a false alert, the RFC tag is no longer needed.

Thanks,
Qu

> 
> The patch caused random slowdowns for a lot of quick test cases.
> Tests that used to finish in 1s or so now randomly take close to 20s.
> 
> It looks like taking the spin_lock() while root->inode_lock is held is
> causing the problem, but I can't see what's going wrong, as the
> operation done with @delayed_node_lock held is literally tiny.
> 
> Any comment on this is welcomed.
> ---
>  fs/btrfs/btrfs_inode.h   |  2 ++
>  fs/btrfs/delayed-inode.c | 18 +++---
>  fs/btrfs/inode.c |  1 +
>  3 files changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1343ac57b438..c2f054223588 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -175,6 +175,8 @@ struct btrfs_inode {
>*/
>   unsigned defrag_compress;
>  
> + /* lock for grabbing/freeing @delayed_node */
> + spinlock_t delayed_node_lock;
>   struct btrfs_delayed_node *delayed_node;
>  
>   /* File creation time. */
> diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
> index f51b509f2d9b..16c405e54930 100644
> --- a/fs/btrfs/delayed-inode.c
> +++ b/fs/btrfs/delayed-inode.c
> @@ -68,19 +68,24 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
>   u64 ino = btrfs_ino(btrfs_inode);
>   struct btrfs_delayed_node *node;
>  
> - node = READ_ONCE(btrfs_inode->delayed_node);
> +	spin_lock(&btrfs_inode->delayed_node_lock);
> +	node = btrfs_inode->delayed_node;
> 	if (node) {
> 		refcount_inc(&node->refs);
> +		spin_unlock(&btrfs_inode->delayed_node_lock);
> 		return node;
> 	}
> +	spin_unlock(&btrfs_inode->delayed_node_lock);
>  
> 	spin_lock(&root->inode_lock);
> 	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
>  
>   if (node) {
> +	spin_lock(&btrfs_inode->delayed_node_lock);
> 	if (btrfs_inode->delayed_node) {
> 		refcount_inc(&node->refs);  /* can be accessed */
> 		BUG_ON(btrfs_inode->delayed_node != node);
> +		spin_unlock(&btrfs_inode->delayed_node_lock);
> 		spin_unlock(&root->inode_lock);
>   return node;
>   }
> @@ -108,6 +113,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
>   node = NULL;
>   }
>  
> +	spin_unlock(&btrfs_inode->delayed_node_lock);
> 	spin_unlock(&root->inode_lock);
>   return node;
>   }
> @@ -152,7 +158,9 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
>   radix_tree_preload_end();
>   goto again;
>   }
> +	spin_lock(&btrfs_inode->delayed_node_lock);
> 	btrfs_inode->delayed_node = node;
> +	spin_unlock(&btrfs_inode->delayed_node_lock);
> 	spin_unlock(&root->inode_lock);
>   radix_tree_preload_end();
>  
> @@ -1279,11 +1287,15 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode)
>  {
>   struct btrfs_delayed_node *delayed_node;
>  
> - delayed_node = READ_ONCE(inode->delayed_node);
> - if (!delayed_node)
> +	spin_lock(&inode->delayed_node_lock);
> +	delayed_node = inode->delayed_node;
> +	if (!delayed_node) {
> +		spin_unlock(&inode->delayed_node_lock);
> 		return;
> +	}
>  
> 	inode->delayed_node = NULL;
> +	spin_unlock(&inode->delayed_node_lock);
>   btrfs_release_delayed_node(delayed_node);
>  }
>  
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 9357a19d2bff..f438be5fecaf 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9177,6 +9177,7 @@ 

[PATCH RFC] btrfs: delayed-inode: Use spinlock to protect btrfs_inode::delayed_node

2018-09-19 Thread Qu Wenruo
In the following case, we could trigger a use-after-free bug:

 CPU0                              |  CPU1
-----------------------------------+-----------------------------------
btrfs_remove_delayed_node          | btrfs_get_delayed_node
|- delayed_node =                  | |- node = btrfs_inode->delayed_node;
|  btrfs_inode->delayed_node       | |
|- btrfs_release_delayed_node()    | |
   |- refcount_dec_and_test()      | |
   |- kmem_cache_free()            | |
      Now delayed node is freed    | |
                                   | |- refcount_inc(&node->refs)

In that case, since delayed_node uses a kmem cache, such a use-after-free
bug won't directly cause a problem, but it could corrupt the data
structures of other kmem cache users.

Fix it by adding btrfs_inode::delayed_node_lock to protect such
operation.

Reported-by: sunny.s.zhang 
Signed-off-by: Qu Wenruo 
---
Please don't merge this patch yet.

The patch caused random slowdowns for a lot of quick test cases.
Tests that used to finish in 1s or so now randomly take close to 20s.

It looks like taking the spin_lock() while root->inode_lock is held is
causing the problem, but I can't see what's going wrong, as the
operation done with @delayed_node_lock held is literally tiny.

Any comment on this is welcomed.
---
 fs/btrfs/btrfs_inode.h   |  2 ++
 fs/btrfs/delayed-inode.c | 18 +++---
 fs/btrfs/inode.c |  1 +
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..c2f054223588 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -175,6 +175,8 @@ struct btrfs_inode {
 */
unsigned defrag_compress;
 
+   /* lock for grabbing/freeing @delayed_node */
+   spinlock_t delayed_node_lock;
struct btrfs_delayed_node *delayed_node;
 
/* File creation time. */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f51b509f2d9b..16c405e54930 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -68,19 +68,24 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
struct btrfs_delayed_node *node;
 
-   node = READ_ONCE(btrfs_inode->delayed_node);
+	spin_lock(&btrfs_inode->delayed_node_lock);
+	node = btrfs_inode->delayed_node;
	if (node) {
		refcount_inc(&node->refs);
+		spin_unlock(&btrfs_inode->delayed_node_lock);
		return node;
	}
+	spin_unlock(&btrfs_inode->delayed_node_lock);
 
	spin_lock(&root->inode_lock);
	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
 
if (node) {
+	spin_lock(&btrfs_inode->delayed_node_lock);
	if (btrfs_inode->delayed_node) {
		refcount_inc(&node->refs);  /* can be accessed */
		BUG_ON(btrfs_inode->delayed_node != node);
+		spin_unlock(&btrfs_inode->delayed_node_lock);
		spin_unlock(&root->inode_lock);
return node;
}
@@ -108,6 +113,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
node = NULL;
}
 
+	spin_unlock(&btrfs_inode->delayed_node_lock);
	spin_unlock(&root->inode_lock);
return node;
}
@@ -152,7 +158,9 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
		radix_tree_preload_end();
		goto again;
	}
+	spin_lock(&btrfs_inode->delayed_node_lock);
	btrfs_inode->delayed_node = node;
+	spin_unlock(&btrfs_inode->delayed_node_lock);
	spin_unlock(&root->inode_lock);
radix_tree_preload_end();
 
@@ -1279,11 +1287,15 @@ void btrfs_remove_delayed_node(struct btrfs_inode *inode)
 {
	struct btrfs_delayed_node *delayed_node;
 
-	delayed_node = READ_ONCE(inode->delayed_node);
-	if (!delayed_node)
+	spin_lock(&inode->delayed_node_lock);
+	delayed_node = inode->delayed_node;
+	if (!delayed_node) {
+		spin_unlock(&inode->delayed_node_lock);
		return;
+	}
 
inode->delayed_node = NULL;
+	spin_unlock(&inode->delayed_node_lock);
btrfs_release_delayed_node(delayed_node);
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9357a19d2bff..f438be5fecaf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9177,6 +9177,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->last_log_commit = 0;
 
	spin_lock_init(&ei->lock);
+	spin_lock_init(&ei->delayed_node_lock);
ei->outstanding_extents = 0;
if (sb->s_magic != BTRFS_TEST_MAGIC)
	btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
-- 
2.19.0