[PATCH v2 1/1] btrfs: Add mechanism to configure automatic level-0 qgroup removal

2017-09-17 Thread Sargun Dhillon
This patch introduces a persisted sysfs knob - qgroup_autoremove.
The purpose of this knob is to cause btrfs (kernel) to automatically
remove level-0 qgroups on subvolume removal. It does not try to
traverse the qgroup tree, and delete other dangling qgroups.

The knob is  disabled by default to avoid breaking userspace.
Once the knob is enabled, it is persisted across remounts in
qgroup_flags.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ctree.h|  2 +-
 fs/btrfs/ioctl.c| 14 +++
 fs/btrfs/sysfs.c| 91 -
 include/uapi/linux/btrfs_tree.h |  7 
 4 files changed, 103 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ded3e9154a5..65f8fa246a4d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1018,7 +1018,7 @@ struct btrfs_fs_info {
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
 #endif
-   /* is qgroup tracking in a consistent state? */
+   /* qgroup configuration; is qgroup tracking in a consistent state? */
u64 qgroup_flags;
 
/* holds configuration and tracking. Protected by qgroup_lock */
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 94934901d58c..d7ef13720374 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2315,6 +2315,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file 
*file,
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
+   bool remove_qgroup = false;
u64 root_flags;
u64 qgroup_reserved;
int namelen;
@@ -2497,6 +2498,19 @@ static noinline int btrfs_ioctl_snap_destroy(struct file 
*file,
}
}
 
+
+   spin_lock(_info->qgroup_lock);
+   if (fs_info->qgroup_flags & BTRFS_QGROUP_AUTOREMOVE_FLAG)
+   remove_qgroup = true;
+   spin_unlock(_info->qgroup_lock);
+   if (remove_qgroup) {
+   ret = btrfs_remove_qgroup(trans, fs_info,
+ dest->root_key.objectid);
+   if (ret && ret != -ENOENT)
+   btrfs_warn(fs_info,
+  "Failed to cleanup qgroup. err: %d", ret);
+   }
+
 out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c2d5f3580b4c..9604778038fd 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -90,6 +90,15 @@ static int can_modify_feature(struct btrfs_feature_attr *fa)
return val;
 }
 
+static void set_pending_commit(struct btrfs_fs_info *fs_info)
+{
+   /*
+* We don't want to do full transaction commit from inside sysfs
+*/
+   btrfs_set_pending(fs_info, COMMIT);
+   wake_up_process(fs_info->transaction_kthread);
+}
+
 static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
   struct kobj_attribute *a, char *buf)
 {
@@ -165,11 +174,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject 
*kobj,
set_features(fs_info, fa->feature_set, features);
spin_unlock(_info->super_lock);
 
-   /*
-* We don't want to do full transaction commit from inside sysfs
-*/
-   btrfs_set_pending(fs_info, COMMIT);
-   wake_up_process(fs_info->transaction_kthread);
+   set_pending_commit(fs_info);
 
return count;
 }
@@ -405,11 +410,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
memcpy(fs_info->super_copy->label, buf, p_len);
spin_unlock(_info->super_lock);
 
-   /*
-* We don't want to do full transaction commit from inside sysfs
-*/
-   btrfs_set_pending(fs_info, COMMIT);
-   wake_up_process(fs_info->transaction_kthread);
+   set_pending_commit(fs_info);
 
return len;
 }
@@ -487,12 +488,82 @@ static ssize_t quota_override_store(struct kobject *kobj,
 
 BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
 
+static ssize_t qgroup_autoremove_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+   struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+   int qgroup_autoremove = 0;
+
+   mutex_lock(_info->qgroup_ioctl_lock);
+   /* Check if qgroups are enabled */
+   if (!test_bit(BTRFS_FS_QUOTA_ENABLED, _info->flags))
+   goto out;
+   if (!fs_info->quota_root)
+   goto out;
+
+   if (fs_info->qgroup_flags & BTRFS_QGROUP_AUTOREMOVE_FLAG)
+   qgroup_autoremove = 1;
+
+out:
+   mutex_unlock(_info->qgroup_ioctl_lock);
+
+   return snprintf(buf, PAGE_SIZE, "%d\n", qgroup_autoremove);
+}
+
+static ssize_t qgroup_autoremove_store(struct kobject *kobj,
+   

[PATCH v2 0/1] Add qgroup_autoremove flag

2017-09-17 Thread Sargun Dhillon
This patch makes it so level-0 qgroups are automatically deleted. The
flag that enables this behaviour is persisted in btrfs_qgroup_status_item.
Although, it may make sense to introduce a btrfs_qgroup_configuration_item,
it seems somewhat overkill to do that just for one simple knob that can
fit into the existing flags.

The reason why this is a sysfs knob, and not a mount option is because
qgroups have to be enabled after the first mount. Having to do a mount,
enable qgroups, flip the knob, and then remounting seems somewhat more
awkward than this single sysfs knob.

Changes since v1:
 * Move from mount option to sysfs knob
 * Persist the knob across remounts

Sargun Dhillon (1):
  btrfs: Add mechanism to configure automatic level-0 qgroup removal

 fs/btrfs/ctree.h|  2 +-
 fs/btrfs/ioctl.c| 14 +++
 fs/btrfs/sysfs.c| 91 -
 include/uapi/linux/btrfs_tree.h |  7 
 4 files changed, 103 insertions(+), 11 deletions(-)

-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: Report error on removing qgroup if del_qgroup_item fails

2017-09-17 Thread Sargun Dhillon
Previously, we were calling del_qgroup_item, and ignoring the return code
resulting in a potential to have divergent in-memory state without an
error. Perhaps, it makes sense to handle this error code, and put the
filesystem into a read only, or similar state.

This patch only adds reporting of the error if the error is fatal,
(any error other than qgroup not found).

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 770f667269f5..e172d4843eae 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1305,6 +1305,8 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
}
}
ret = del_qgroup_item(trans, quota_root, qgroupid);
+   if (ret && ret != -ENOENT)
+   goto out;
 
while (!list_empty(>groups)) {
list = list_first_entry(>groups,
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 2/2] btrfs: Add new ioctl uapis for qgroup creation / removal

2017-07-14 Thread Sargun Dhillon
This patch introduces two new ioctls to create, and remove qgroups. These
offer a somewhat more intuitive set of operations with the opportunity
to add flags that gate to unintentional manipulation of qgroups.

The create qgroup ioctl has a a new semantic which level-0 qgroups cannot
be created unless a matching subvolume ID is created.

The delete qgroup ioctl has the new semantic that it will not let you
delete a currently utilized (referenced by a subvolume) level-0
qgroup unless you pass the BTRFS_QGROUP_NO_SUBVOL_CHECK flag.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ioctl.c   | 100 -
 fs/btrfs/qgroup.c  |  76 +++---
 fs/btrfs/qgroup.h  |   6 ++-
 include/uapi/linux/btrfs.h |  16 
 4 files changed, 189 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fa1b78c..2eca8e5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4924,6 +4924,98 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, 
void __user *arg)
return ret;
 }
 
+static long btrfs_ioctl_qgroup_create_v2(struct file *file, void __user *uargs)
+{
+   struct btrfs_ioctl_qgroup_args_v2 args;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_fs_info *fs_info;
+   struct btrfs_root *root;
+   struct inode *inode;
+   int ret;
+   int err;
+
+   inode = file_inode(file);
+   fs_info = btrfs_sb(inode->i_sb);
+   root = BTRFS_I(inode)->root;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = copy_from_user(, uargs, sizeof(args));
+   if (ret)
+   return ret;
+
+   if (!args.qgroupid)
+   return -EINVAL;
+
+   ret = mnt_want_write_file(file);
+   if (ret)
+   return ret;
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   ret = btrfs_create_qgroup(trans, fs_info, args.qgroupid, 1);
+   err = btrfs_end_transaction(trans);
+
+   if (err && !ret)
+   ret = err;
+
+out:
+   mnt_drop_write_file(file);
+   return ret;
+}
+
+static long btrfs_ioctl_qgroup_remove_v2(struct file *file, void __user *uargs)
+{
+   struct btrfs_ioctl_qgroup_args_v2 args;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_fs_info *fs_info;
+   struct btrfs_root *root;
+   struct inode *inode;
+   int check;
+   int ret;
+   int err;
+
+   inode = file_inode(file);
+   fs_info = btrfs_sb(inode->i_sb);
+   root = BTRFS_I(inode)->root;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = copy_from_user(, uargs, sizeof(args));
+   if (ret)
+   return ret;
+
+   if (!args.qgroupid)
+   return -EINVAL;
+   check = !(args.flags & BTRFS_QGROUP_NO_SUBVOL_CHECK);
+
+   ret = mnt_want_write_file(file);
+   if (ret)
+   return ret;
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   ret = btrfs_remove_qgroup(trans, fs_info, args.qgroupid, check);
+   err = btrfs_end_transaction(trans);
+
+   if (err && !ret)
+   ret = err;
+
+out:
+   mnt_drop_write_file(file);
+   return ret;
+}
+
 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 {
struct inode *inode = file_inode(file);
@@ -4959,9 +5051,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, 
void __user *arg)
}
 
if (sa->create) {
-   ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
+   ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid, 0);
} else {
-   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
+   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid, 0);
}
 
err = btrfs_end_transaction(trans);
@@ -5632,6 +5724,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_features(file, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
+   case BTRFS_IOC_QGROUP_CREATE_V2:
+   return btrfs_ioctl_qgroup_create_v2(file, argp);
+   case BTRFS_IOC_QGROUP_REMOVE_V2:
+   return btrfs_ioctl_qgroup_remove_v2(file, argp);
}
 
return -ENOTTY;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3abc517..05bce7b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1247,8 +1247,51 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
+/*
+ * Meant to only operate on level-0 qroupid.
+ *
+ * It returns 1 if a matching subvolume is found; 0 if none is 

[PATCH v3 1/2] btrfs: Fail on removing qgroup if del_qgroup_item fails

2017-07-14 Thread Sargun Dhillon
Previously, we were calling del_qgroup_item, and ignoring the return code
resulting in a potential to have divergent in-memory state without an
error. Perhaps, it makes sense to handle this error code, and put the
filesystem into a read only, or similar state.

This patch only adds reporting of the error if the error is fatal,
(any error other than qgroup not found).

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4ce351e..3abc517 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1309,6 +1309,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
}
ret = del_qgroup_item(trans, quota_root, qgroupid);
 
+   if (ret && ret != -ENOENT)
+   goto out;
+
while (!list_empty(>groups)) {
list = list_first_entry(>groups,
struct btrfs_qgroup_list, next_group);
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 0/2] New qgroup creation / removal ioctls

2017-07-14 Thread Sargun Dhillon
This patch introduces two new ioctls for creating, and removing qgroups.
These ioctls have a new args structure that allows passing flags, and
some reserved fields for future expansion. The ioctls prevent some
operations around level-0 qgroups as well.


The create operation prevents the creation of level-0 qgroups for
subvolumes that do not exist. The delete operations prevents the
deletion of level-0 qgroups that reference active volumes.

There are two different ioctl for creation, and deletion to enable
filtering of the ioctls via tools like seccomp-bpf. Currently, there
is one "swiss-army" ioctl that's multiplexed for creation, and
deletion, and you need to be able to dereference user memory to determine
whether or not it's a destructive operation -- something that cannot
be done via seccomp nor LSMs.

Changes since v2:
  * Use a common datastructure for removal and creation
  * Remove deprecation message for old API
Changes since v1:
  * Remove creation of level-0 qgroups without subvol

Sargun Dhillon (2):
  btrfs: Fail on removing qgroup if del_qgroup_item fails
  btrfs: Add new ioctl uapis for qgroup creation / removal

 fs/btrfs/ioctl.c   | 100 -
 fs/btrfs/qgroup.c  |  79 ---
 fs/btrfs/qgroup.h  |   6 ++-
 include/uapi/linux/btrfs.h |  16 
 4 files changed, 192 insertions(+), 9 deletions(-)

-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 0/2] New qgroup creation / removal ioctl

2017-07-14 Thread Sargun Dhillon
This patch introduces two new ioctls for creating, and removing qgroups.
These ioctls have a new args structure that allows passing flags, and
some reserved fields for future expansion. The ioctls prevent some
operations around level-0 qgroups as well.


The create operation prevents the creation of level-0 qgroups for
subvolumes that do not exist. The delete operations prevents the
deletion of level-0 qgroups that reference active volumes.

There are two different ioctl for creation, and deletion to enable
filtering of the ioctls via tools like seccomp-bpf. Currently, there
is one "swiss-army" ioctl that's multiplexed for creation, and
deletion, and you need to be able to dereference user memory to determine
whether or not it's a destructive operation -- something that cannot
be done via seccomp nor LSMs.

Changes since v2:
  * Use a common datastructure for removal and creation
  * Remove deprecation message for old API
Changes since v1:
  * Remove creation of level-0 qgroups without subvol


Sargun Dhillon (2):
  btrfs: Fail on removing qgroup if del_qgroup_item fails
  btrfs: Add new ioctl uapis for qgroup creation / removal

 fs/btrfs/ioctl.c   | 100 -
 fs/btrfs/qgroup.c  |  79 ---
 fs/btrfs/qgroup.h  |   6 ++-
 include/uapi/linux/btrfs.h |  16 
 4 files changed, 192 insertions(+), 9 deletions(-)

-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Containers, Btrfs vs Btrfs + overlayfs

2017-07-13 Thread Sargun Dhillon
On Thu, Jul 13, 2017 at 7:01 PM, Qu Wenruo  wrote:
>
>
> On 2017年07月14日 07:26, Chris Murphy wrote:
>>
>> On Thu, Jul 13, 2017 at 4:32 PM, Liu Bo  wrote:
>>>
>>> On Thu, Jul 13, 2017 at 02:49:27PM -0600, Chris Murphy wrote:

 Has anyone been working with Docker and Btrfs + overlayfs? It seems
 superfluous or unnecessary to use overlayfs, but the shared page cache
 aspect and avoiding some of the problems with large numbers of Btrfs
 snapshots, might make it a useful combination. But I'm not finding
 useful information with searches. Typically it's Btrfs alone vs
 ext4/XFS + overlayfs.

 ?
We've been running Btrfs with Docker at appreciable scale for a few
months now (100-200k containers  / day ). We originally looked at the
Overlay FS route, but it turns out that one of the downsides the
shared page cache is it breaks cgroup accounting. If you want to
properly allow people to ensure their container never touches disk, it
may get complicated.
>>>
>>>
>>> Is there a reproducer for problems with large number of btrfs
>>> snapshots?
>>
>>
>> No benchmarking comparison but it's known that deletion of snapshots
>> gets more expensive when there are many snapshots due to backref
>> search and metadata updates. I have no idea how it compares to
>> overlayfs. But then also some use cases I guess it's non-trivial
>> benefit to leverage a shared page cache.
We churn through ~80 containers per instance (over a day or so), and
each container's image has 20 layers. The deletion if very expensive,
and it would be nice to be able to throttle it, but ~100GB subvolumes
(on SSD) with 1+ files are typically removed in <5s. Qgroups turn
out to have a lot of overhead here -- even with a single level.  At
least in our testing, even with qgroups, there's lower latency for I/O
and metadata during build jobs (Java or C compilation) as compared to
OverlayFS on BtrFS or AUFS on ZFS (on Linux). Without qgroups, it's
almost certainly "faster". YMMV though, because we're already paying
the network storage latency cost.

We've investigating using the blkio controller to isolate I/O per
container to avoid I/O stalls, and restrict I/O during snapshot
cleanup, but that's been unsuccessful.
>
>
> In fact, except balance and quota, I can't see much extra performance impact
> from backref walk.
>
> And if it's not snapshots, but subvolumes, then more subvolumes means
> smaller subvolume trees, and less race to lock subvolume trees.
> So, more (evenly distributed) subvolumes should in fact lead to higher
> performance.
>
>>
>>> Btrfs + overlayfs?  The copy-up coperation in overlayfs can take
>>> advantage of btrfs's clone, but this benefit applies for xfs, too.
>>
>>
>> Btrfs supports fs shrink, and also multiple device add/remove so it's
>> pretty nice for managing its storage in the cloud. And also seed
>> device might have uses. Some of it is doable with LVM but it's much
>> simpler, faster and safer with Btrfs.
>
>
> Faster? Not really.
> For metadata operation, btrfs is slower than traditional FSes.
>
> Due to metadata CoW, any metadata update will lead to superblock update.
> Such extra FUA for superblock is specially obvious for fsync heavy load but
> low concurrency case.
> Not to mention its default data CoW will lead to metadata CoW, making things
> even slower.
Since containers are ephemeral, they really shouldn't fsync. One of
the biggest (recent) problems has been workloads that use O_SYNC, or
sync after a large number of operations -- this stalls out all of the
containers (subvolumes) on the machine because the transaction lock is
under hold. This, in turn, manifests itself in soft lockups, and
operational trouble. Our plan to work around it is patch the VFS
layer, and stub out sync for certain cgroups.

>
> And race to lock fs/subvolume trees makes metadata operation even slower,
> especially for multi-thread IO.
> Unlike other FSes which use one-tree-one-inode, btrfs uses
> one-tree-one-subvoume, which makes race much hotter.
>
> Extent tree used to have the same problem, but delayed-ref (no matter you
> like it or not) did reduce race and improved performance.
>
> IIRC, some postgresql benchmark shows that XFS/Ext4 with LVM-thin provide
> much better performance than Btrfs, even ZFS-on-Linux out-performs btrfs.
>
At least in our testing, AUFS + ZFS-on-Linux did not have lower
latency than BtrFS. Stability is decent, bar the occasional soft
lockup, or hung transaction. One of the experiments that I've been
wanting to run is a custom graph driver which has XFS images in
snapshots / subvolumes on BtrFS, and mounts them over loopback -- This
makes things like limiting threads easier, and short-circuiting sync
logic per container.

>>
>> And that's why I'm kinda curious about the combination of Btrfs and
>> overlayfs. Overlayfs managed by Docker. And Btrfs for simpler and more
>> flexible storage management.
>
> Despite the performance problem, 

Re: Lock between userspace and btrfs-cleaner on extent_buffer

2017-07-12 Thread Sargun Dhillon
On Thu, Jun 29, 2017 at 11:49 AM, Jeff Mahoney <je...@suse.com> wrote:
> On 6/29/17 2:46 PM, Sargun Dhillon wrote:
>> On Thu, Jun 29, 2017 at 11:42 AM, Jeff Mahoney <je...@suse.com> wrote:
>>> On 6/28/17 6:02 PM, Sargun Dhillon wrote:
>>>> On Wed, Jun 28, 2017 at 2:55 PM, Jeff Mahoney <je...@suse.com> wrote:
>>>>> On 6/27/17 5:12 PM, Jeff Mahoney wrote:
>>>>>> On 6/13/17 9:05 PM, Sargun Dhillon wrote:
>>>>>>> On Thu, Jun 8, 2017 at 11:34 AM, Sargun Dhillon <sar...@sargun.me> 
>>>>>>> wrote:
>>>>>>>> I have a deadlock caught in the wild between two processes --
>>>>>>>> btrfs-cleaner, and userspace process (Docker). Here, you can see both
>>>>>>>> of the backtraces. btrfs-cleaner is trying to get a lock on
>>>>>>>> 9859d360caf0, which is owned by Docker's pid. Docker on the other
>>>>>>>> hand is trying to get a lock on 9859dc0f0578, which is owned by
>>>>>>>> btrfs-cleaner's Pid.
>>>>>>>>
>>>>>>>> This is on vanilla 4.11.3 without much workload. The background
>>>>>>>> workload was basically starting and stopping Docker with a medium
>>>>>>>> sized image like ubuntu:latest with sleep 5. So, snapshot creation,
>>>>>>>> destruction. And there's some stuff that's logging to btrfs.
>>>>>>
>>>>>> Hi Sargun -
>>>>>>
>>>>>> We hit this bug in testing last week.  I have a patch that I've written
>>>>>> up and have run under your reproducer for a while.  So far it hasn't
>>>>>> hit.  I'll post it shortly and CC you.  It does depend lightly on the
>>>>>> rbtree code, though.  Since we'll want this fix for -stable, I'll write
>>>>>> up a version for that too.
>>>>>
>>>>> After thinking about it a bit more, I think my patch just happens to
>>>>> make it less likely to hit but would ultimately degrade into a livelock
>>>>> where it was a deadlock previously.  I was just trylocking and
>>>>> requeuing, so both threads are allowed to do other work and maybe even
>>>>> finish but ultimately if there's a true deadlock it'll hit anyway.
>>>>>
>>>>> -Jeff
>>>>>
>>>> Does it make sense to spend the time on making it so that
>>>> btrfs-cleaner has abortable operations, and the ability to abort if
>>>> the root deletion either takes too long, or if it receives a signal?
>>>> Although, such a case may result in a livelock, to me it seems like a
>>>> lot less bad than deadlocking.
>>>
>>>
>>> For now, reverting:
>>>
>>> commit fb235dc06fac9eaa4408ade9c8b20d45d63c89b7
>>> Author: Qu Wenruo <quwen...@cn.fujitsu.com>
>>> Date:   Wed Feb 15 10:43:03 2017 +0800
>>>
>>> btrfs: qgroup: Move half of the qgroup accounting time out of commit
>>> trans
>>>
>>> ... should do the trick.
>>>
>>> -Jeff
>>>
>> I thought it was this as well, but we still saw lock-ups even after
>> reverting this change on 4.11. They were rarer, but we still saw
>> issues with locked up btrfs-transactions. It may have been due to a
>> different issue. If you want. I can try to revert this, and run a
>> workload on it to see where the exact lock-up is?
>
> Yeah, I'd be interested in those results.
>
> -Jeff
>
>
> --
> Jeff Mahoney
> SUSE Labs
>
Thanks Jeff,
Upon further analysis, it looks like rolling this back fixed the
btrfs-cleaner lock up, but the we're seeing a different hard lockup,
where num_writers on the current transaction gets stuck at 2.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Lock between userspace and btrfs-cleaner on extent_buffer

2017-06-29 Thread Sargun Dhillon
On Thu, Jun 29, 2017 at 11:42 AM, Jeff Mahoney <je...@suse.com> wrote:
> On 6/28/17 6:02 PM, Sargun Dhillon wrote:
>> On Wed, Jun 28, 2017 at 2:55 PM, Jeff Mahoney <je...@suse.com> wrote:
>>> On 6/27/17 5:12 PM, Jeff Mahoney wrote:
>>>> On 6/13/17 9:05 PM, Sargun Dhillon wrote:
>>>>> On Thu, Jun 8, 2017 at 11:34 AM, Sargun Dhillon <sar...@sargun.me> wrote:
>>>>>> I have a deadlock caught in the wild between two processes --
>>>>>> btrfs-cleaner, and userspace process (Docker). Here, you can see both
>>>>>> of the backtraces. btrfs-cleaner is trying to get a lock on
>>>>>> 9859d360caf0, which is owned by Docker's pid. Docker on the other
>>>>>> hand is trying to get a lock on 9859dc0f0578, which is owned by
>>>>>> btrfs-cleaner's Pid.
>>>>>>
>>>>>> This is on vanilla 4.11.3 without much workload. The background
>>>>>> workload was basically starting and stopping Docker with a medium
>>>>>> sized image like ubuntu:latest with sleep 5. So, snapshot creation,
>>>>>> destruction. And there's some stuff that's logging to btrfs.
>>>>
>>>> Hi Sargun -
>>>>
>>>> We hit this bug in testing last week.  I have a patch that I've written
>>>> up and have run under your reproducer for a while.  So far it hasn't
>>>> hit.  I'll post it shortly and CC you.  It does depend lightly on the
>>>> rbtree code, though.  Since we'll want this fix for -stable, I'll write
>>>> up a version for that too.
>>>
>>> After thinking about it a bit more, I think my patch just happens to
>>> make it less likely to hit but would ultimately degrade into a livelock
>>> where it was a deadlock previously.  I was just trylocking and
>>> requeuing, so both threads are allowed to do other work and maybe even
>>> finish but ultimately if there's a true deadlock it'll hit anyway.
>>>
>>> -Jeff
>>>
>> Does it make sense to spend the time on making it so that
>> btrfs-cleaner has abortable operations, and the ability to abort if
>> the root deletion either takes too long, or if it receives a signal?
>> Although, such a case may result in a livelock, to me it seems like a
>> lot less bad than deadlocking.
>
>
> For now, reverting:
>
> commit fb235dc06fac9eaa4408ade9c8b20d45d63c89b7
> Author: Qu Wenruo <quwen...@cn.fujitsu.com>
> Date:   Wed Feb 15 10:43:03 2017 +0800
>
> btrfs: qgroup: Move half of the qgroup accounting time out of commit
> trans
>
> ... should do the trick.
>
> -Jeff
>
I thought it was this as well, but we still saw lock-ups even after
reverting this change on 4.11. They were rarer, but we still saw
issues with locked up btrfs-transactions. It may have been due to a
different issue. If you want. I can try to revert this, and run a
workload on it to see where the exact lock-up is?

>>>> -Jeff
>>>>
>>>>>> crash> bt -FF
>>>>>> PID: 3423   TASK: 985ec7a16580  CPU: 2   COMMAND: "btrfs-cleaner"
>>>>>>  #0 [afca9d9078e8] __schedule at bb235729
>>>>>> afca9d9078f0:  [985eccb2e580:task_struct]
>>>>>> afca9d907900: [985ec7a16580:task_struct] 985ed949b280
>>>>>> afca9d907910: afca9d907978 __schedule+953
>>>>>> afca9d907920: btree_get_extent 9de968f0
>>>>>> afca9d907930: 985ed949b280 afca9d907958
>>>>>> afca9d907940: 0004 00a90842012fd9df
>>>>>> afca9d907950: [985ec7a16580:task_struct]
>>>>>> [9859d360cb50:btrfs_extent_buffer]
>>>>>> afca9d907960: [9859d360cb58:btrfs_extent_buffer]
>>>>>> [985ec7a16580:task_struct]
>>>>>> afca9d907970: [985ec7a16580:task_struct] afca9d907990
>>>>>> afca9d907980: schedule+54
>>>>>>  #1 [afca9d907980] schedule at bb235c96
>>>>>> afca9d907988: [9859d360caf0:btrfs_extent_buffer] 
>>>>>> afca9d9079f8
>>>>>> afca9d907998: btrfs_tree_read_lock+204
>>>>>>  #2 [afca9d907998] btrfs_tree_read_lock at c03e112c [btrfs]
>>>>>> afca9d9079a0: 985e [985ec7a16580:task_struct]
>>>>>> afca9d9079b0: autoremove_wak

Re: Lock between userspace and btrfs-cleaner on extent_buffer

2017-06-28 Thread Sargun Dhillon
On Wed, Jun 28, 2017 at 2:55 PM, Jeff Mahoney <je...@suse.com> wrote:
> On 6/27/17 5:12 PM, Jeff Mahoney wrote:
>> On 6/13/17 9:05 PM, Sargun Dhillon wrote:
>>> On Thu, Jun 8, 2017 at 11:34 AM, Sargun Dhillon <sar...@sargun.me> wrote:
>>>> I have a deadlock caught in the wild between two processes --
>>>> btrfs-cleaner, and userspace process (Docker). Here, you can see both
>>>> of the backtraces. btrfs-cleaner is trying to get a lock on
>>>> 9859d360caf0, which is owned by Docker's pid. Docker on the other
>>>> hand is trying to get a lock on 9859dc0f0578, which is owned by
>>>> btrfs-cleaner's Pid.
>>>>
>>>> This is on vanilla 4.11.3 without much workload. The background
>>>> workload was basically starting and stopping Docker with a medium
>>>> sized image like ubuntu:latest with sleep 5. So, snapshot creation,
>>>> destruction. And there's some stuff that's logging to btrfs.
>>
>> Hi Sargun -
>>
>> We hit this bug in testing last week.  I have a patch that I've written
>> up and have run under your reproducer for a while.  So far it hasn't
>> hit.  I'll post it shortly and CC you.  It does depend lightly on the
>> rbtree code, though.  Since we'll want this fix for -stable, I'll write
>> up a version for that too.
>
> After thinking about it a bit more, I think my patch just happens to
> make it less likely to hit but would ultimately degrade into a livelock
> where it was a deadlock previously.  I was just trylocking and
> requeuing, so both threads are allowed to do other work and maybe even
> finish but ultimately if there's a true deadlock it'll hit anyway.
>
> -Jeff
>
Does it make sense to spend the time on making it so that
btrfs-cleaner has abortable operations, and the ability to abort if
the root deletion either takes too long, or if it receives a signal?
Although, such a case may result in a livelock, to me it seems like a
lot less bad than deadlocking.

>> -Jeff
>>
>>>> crash> bt -FF
>>>> PID: 3423   TASK: 985ec7a16580  CPU: 2   COMMAND: "btrfs-cleaner"
>>>>  #0 [afca9d9078e8] __schedule at bb235729
>>>> afca9d9078f0:  [985eccb2e580:task_struct]
>>>> afca9d907900: [985ec7a16580:task_struct] 985ed949b280
>>>> afca9d907910: afca9d907978 __schedule+953
>>>> afca9d907920: btree_get_extent 9de968f0
>>>> afca9d907930: 985ed949b280 afca9d907958
>>>> afca9d907940: 0004 00a90842012fd9df
>>>> afca9d907950: [985ec7a16580:task_struct]
>>>> [9859d360cb50:btrfs_extent_buffer]
>>>> afca9d907960: [9859d360cb58:btrfs_extent_buffer]
>>>> [985ec7a16580:task_struct]
>>>> afca9d907970: [985ec7a16580:task_struct] afca9d907990
>>>> afca9d907980: schedule+54
>>>>  #1 [afca9d907980] schedule at bb235c96
>>>> afca9d907988: [9859d360caf0:btrfs_extent_buffer] 
>>>> afca9d9079f8
>>>> afca9d907998: btrfs_tree_read_lock+204
>>>>  #2 [afca9d907998] btrfs_tree_read_lock at c03e112c [btrfs]
>>>> afca9d9079a0: 985e [985ec7a16580:task_struct]
>>>> afca9d9079b0: autoremove_wake_function
>>>> [9859d360cb60:btrfs_extent_buffer]
>>>> afca9d9079c0: [9859d360cb60:btrfs_extent_buffer] 
>>>> 00a90842012fd9df
>>>> afca9d9079d0: [985a6ca3c370:Acpi-State]
>>>> [9859d360caf0:btrfs_extent_buffer]
>>>> afca9d9079e0: afca9d907ac0 [985e751bc000:kmalloc-8192]
>>>> afca9d9079f0: [985e751bc000:kmalloc-8192] afca9d907a48
>>>> afca9d907a00: __add_missing_keys+190
>>>>  #3 [afca9d907a00] __add_missing_keys at c040abae [btrfs]
>>>> afca9d907a08:  afca9d907a28
>>>> afca9d907a18: free_extent_buffer+75 00a90842012fd9df
>>>> afca9d907a28: afca9d907ab0 afca9d907be8
>>>> afca9d907a38:  [985e78dae540:btrfs_path]
>>>> afca9d907a48: afca9d907b28 find_parent_nodes+889
>>>>  #4 [afca9d907a50] find_parent_nodes at c040c4d9 [btrfs]
>>>> afca9d907a58: [985e751bc000:kmalloc-8192]
>>>> [9859d613cf40:kmalloc-32]
>>>> afca9d907a68: [9859d613c220:kmallo

[PATCH] btrfs: Add qgroup_auto_cleanup mount flag to automatically cleanup qgroups

2017-06-28 Thread Sargun Dhillon
This patch introduces a new mount option - qgroup_auto_cleanup.
The purpose of this mount option is to cause btrfs to automatically
delete qgroups on subvolume deletion. This only cleans up the
associated level-0 qgroup, and not qgroups that are above it.

Since this behaviour is API-changing it is opt-in. Existing software,
and scripts may be doing qgroup cleanup on subvolume deletion explicitly,
and the absence of a qgroup may cause failure.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ctree.h |  1 +
 fs/btrfs/ioctl.c | 15 +++
 fs/btrfs/super.c | 15 ++-
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5bdd3666..b3d54e4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1336,6 +1336,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct 
btrfs_fs_info *info)
 #define BTRFS_MOUNT_FRAGMENT_METADATA  (1 << 25)
 #define BTRFS_MOUNT_FREE_SPACE_TREE(1 << 26)
 #define BTRFS_MOUNT_NOLOGREPLAY(1 << 27)
+#define BTRFS_MOUNT_QGROUP_AUTO_CLEANUP(1 << 28)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL  (30)
 #define BTRFS_DEFAULT_MAX_INLINE   (2048)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fa1b78c..e9901c4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2546,6 +2546,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file 
*file,
goto out_end_trans;
}
}
+   /*
+* Attempt to automatically remove the automatically attached qgroup
+* setup in btrfs_qgroup_inherit. As a matter of convention, the id
+* is the same as the subvolume id.
+*
+* This can fail non-fatally for level 0 qgroups, therefore we do
+* not abort the transaction if this fails, nor return an error.
+*/
+   if (btrfs_test_opt(fs_info, QGROUP_AUTO_CLEANUP)) {
+   ret = btrfs_remove_qgroup(trans, fs_info,
+ dest->root_key.objectid, 0);
+   if (ret && ret != -ENOENT)
+   btrfs_warn(fs_info,
+  "Failed to cleanup qgroup. err: %d", ret);
+   }
 
 out_end_trans:
trans->block_rsv = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 74e4779..d83c841 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -321,7 +321,8 @@ enum {
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
-   Opt_nologreplay, Opt_norecovery,
+   Opt_nologreplay, Opt_norecovery, Opt_qgroup_auto_cleanup,
+   Opt_no_qgroup_auto_cleanup,
 #ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
 #endif
@@ -381,6 +382,8 @@ static const match_table_t tokens = {
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_commit_interval, "commit=%d"},
+   {Opt_qgroup_auto_cleanup, "qgroup_auto_cleanup"},
+   {Opt_no_qgroup_auto_cleanup, "no_qgroup_auto_cleanup"},
 #ifdef CONFIG_BTRFS_DEBUG
{Opt_fragment_data, "fragment=data"},
{Opt_fragment_metadata, "fragment=metadata"},
@@ -798,6 +801,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
info->commit_interval = 
BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
+   case Opt_qgroup_auto_cleanup:
+btrfs_set_and_info(info, QGROUP_AUTO_CLEANUP,
+   "enabling qgroup auto cleanup");
+   break;
+   case Opt_no_qgroup_auto_cleanup:
+   btrfs_clear_and_info(info, QGROUP_AUTO_CLEANUP,
+"disabling qgroup auto cleanup");
+   break;
 #ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
@@ -1287,6 +1298,8 @@ static int btrfs_show_options(struct seq_file *seq, 
struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+   if (btrfs_test_opt(info, QGROUP_AUTO_CLEANUP))
+   seq_puts(seq, ",qgroup_auto_cleanup");
 #ifdef CONFIG_BTRFS_DEBUG
if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/4] Qgroup uapi improvements

2017-06-23 Thread Sargun Dhillon
On Fri, May 26, 2017 at 1:44 PM, Sargun Dhillon <sar...@sargun.me> wrote:
> This patchset has several improvements around the qgroups user-facing
> API. It introduces two new ioctls for creating, and removing qgroups.
> These ioctls have a new args structure that allows passing flags, and
> some reserved fields for future expansion. The ioctls prevent some
> operations around level-0 qgroups as well.
>
> The create operation prevents the creation of level-0 qgroups for
> subvolumes that do not exist. The delete operations prevents the
> deletion of level-0 qgroups that reference active volumes.
>
> In adddition, it adds a mount option "qgroup_auto_cleanup".
> When this mount option is specified, qgroups will automatically
> be cleaned up at volume deletion time. The reason this is a mount
> option over a default behaviour is to avoid breaking old scripts
> that rely on the behaviour of the existing APIs. Users can opt
> into the new behaviour, as opposed it being an automated
> trigger on newly created qgroups. Later on, we can introduce
> a flag to subvol / qgroup create that marks the qgroup as
> automatically created, and delete the qgroup as automatically
> as well.
>
> Changes since v1:
>   * Remove creation of level-0 qgroups without subvol
>   * Add deprecation message for old API
>
> Sargun Dhillon (4):
>   btrfs: Fail on removing qgroup if del_qgroup_item fails
>   btrfs: Add new ioctl uapis for qgroup creation / removal
>   btrfs: Warn the user when the legacy btrfs_qgroup_create API is used
>   btrfs: Add qgroup_auto_cleanup mount flag to automatically cleanup
> qgroups
>
>  fs/btrfs/ctree.h   |   1 +
>  fs/btrfs/ioctl.c   | 117 
> -
>  fs/btrfs/qgroup.c  |  79 --
>  fs/btrfs/qgroup.h  |   6 ++-
>  fs/btrfs/super.c   |  15 +-
>  include/uapi/linux/btrfs.h |  22 +
>  6 files changed, 230 insertions(+), 10 deletions(-)
>
> --
> 2.9.3
>

Hey,
I wanted to see what people thought of this. This was the only way I
saw forward that wouldn't break existing scripts, and would allow
users to opt-in to this new behaviour. If y'all think that different
semantics make sense, let me know, and I'll rev this patchset.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Lock between userspace and btrfs-cleaner on extent_buffer

2017-06-13 Thread Sargun Dhillon
On Thu, Jun 8, 2017 at 11:34 AM, Sargun Dhillon <sar...@sargun.me> wrote:
> I have a deadlock caught in the wild between two processes --
> btrfs-cleaner, and userspace process (Docker). Here, you can see both
> of the backtraces. btrfs-cleaner is trying to get a lock on
> 9859d360caf0, which is owned by Docker's pid. Docker on the other
> hand is trying to get a lock on 9859dc0f0578, which is owned by
> btrfs-cleaner's Pid.
>
> This is on vanilla 4.11.3 without much workload. The background
> workload was basically starting and stopping Docker with a medium
> sized image like ubuntu:latest with sleep 5. So, snapshot creation,
> destruction. And there's some stuff that's logging to btrfs.
>
> crash> bt -FF
> PID: 3423   TASK: 985ec7a16580  CPU: 2   COMMAND: "btrfs-cleaner"
>  #0 [afca9d9078e8] __schedule at bb235729
> afca9d9078f0:  [985eccb2e580:task_struct]
> afca9d907900: [985ec7a16580:task_struct] 985ed949b280
> afca9d907910: afca9d907978 __schedule+953
> afca9d907920: btree_get_extent 9de968f0
> afca9d907930: 985ed949b280 afca9d907958
> afca9d907940: 0004 00a90842012fd9df
> afca9d907950: [985ec7a16580:task_struct]
> [9859d360cb50:btrfs_extent_buffer]
> afca9d907960: [9859d360cb58:btrfs_extent_buffer]
> [985ec7a16580:task_struct]
> afca9d907970: [985ec7a16580:task_struct] afca9d907990
> afca9d907980: schedule+54
>  #1 [afca9d907980] schedule at bb235c96
> afca9d907988: [9859d360caf0:btrfs_extent_buffer] afca9d9079f8
> afca9d907998: btrfs_tree_read_lock+204
>  #2 [afca9d907998] btrfs_tree_read_lock at c03e112c [btrfs]
> afca9d9079a0: 985e [985ec7a16580:task_struct]
> afca9d9079b0: autoremove_wake_function
> [9859d360cb60:btrfs_extent_buffer]
> afca9d9079c0: [9859d360cb60:btrfs_extent_buffer] 00a90842012fd9df
> afca9d9079d0: [985a6ca3c370:Acpi-State]
> [9859d360caf0:btrfs_extent_buffer]
> afca9d9079e0: afca9d907ac0 [985e751bc000:kmalloc-8192]
> afca9d9079f0: [985e751bc000:kmalloc-8192] afca9d907a48
> afca9d907a00: __add_missing_keys+190
>  #3 [afca9d907a00] __add_missing_keys at c040abae [btrfs]
> afca9d907a08:  afca9d907a28
> afca9d907a18: free_extent_buffer+75 00a90842012fd9df
> afca9d907a28: afca9d907ab0 afca9d907be8
> afca9d907a38:  [985e78dae540:btrfs_path]
> afca9d907a48: afca9d907b28 find_parent_nodes+889
>  #4 [afca9d907a50] find_parent_nodes at c040c4d9 [btrfs]
> afca9d907a58: [985e751bc000:kmalloc-8192]
> [9859d613cf40:kmalloc-32]
> afca9d907a68: [9859d613c220:kmalloc-32] 
> afca9d907a78: 030dc000 
> afca9d907a88: [985e78dae540:btrfs_path] 
> afca9d907a98: 000178dae540 
> afca9d907aa8: 0002 afca9d907ab0
> afca9d907ab8: afca9d907ab0 [985a6ca3c370:Acpi-State]
> afca9d907ac8: [985a6ca3ce10:Acpi-State] c000985e751bc000
> afca9d907ad8: 01a9030d 
> afca9d907ae8: a9030dc0 0001
> afca9d907af8: 00a90842012fd9df [9859d613c220:kmalloc-32]
> afca9d907b08: afca9d907be8 030dc000
> afca9d907b18: [985e751bc000:kmalloc-8192] 
> afca9d907b28: afca9d907b98 __btrfs_find_all_roots+169
>  #5 [afca9d907b30] __btrfs_find_all_roots at c040cb09 [btrfs]
> afca9d907b38:  
> afca9d907b48:  
> afca9d907b58:  [9859d5e63c10:kmalloc-64]
> afca9d907b68: 00a90842012fd9df [985e751bc788:kmalloc-8192]
> afca9d907b78: [9859d5e63140:kmalloc-64]
> [985e9dfa8ee8:btrfs_transaction]
> afca9d907b88: [985e9dfa8d80:btrfs_transaction] 0321
> afca9d907b98: afca9d907bd0 btrfs_find_all_roots+85
>  #6 [afca9d907ba0] btrfs_find_all_roots at c040cbf5 [btrfs]
> afca9d907ba8: afca9d907be8 
> afca9d907bb8: 42b93000 [985e751bc000:kmalloc-8192]
> afca9d907bc8: [985e751bc000:kmalloc-8192] afca9d907c18
> afca9d907bd8: btrfs_qgroup_trace_extent+302
>  #7 [afca9d907bd8] btrfs_qgroup_trace_extent at c04115ee [btrfs]
> afca9d907be0: 1000 [9859d

Lock between userspace and btrfs-cleaner on extent_buffer

2017-06-08 Thread Sargun Dhillon
I have a deadlock caught in the wild between two processes --
btrfs-cleaner, and userspace process (Docker). Here, you can see both
of the backtraces. btrfs-cleaner is trying to get a lock on
9859d360caf0, which is owned by Docker's pid. Docker on the other
hand is trying to get a lock on 9859dc0f0578, which is owned by
btrfs-cleaner's Pid.

This is on vanilla 4.11.3 without much workload. The background
workload was basically starting and stopping Docker with a medium
sized image like ubuntu:latest with sleep 5. So, snapshot creation,
destruction. And there's some stuff that's logging to btrfs.

crash> bt -FF
PID: 3423   TASK: 985ec7a16580  CPU: 2   COMMAND: "btrfs-cleaner"
 #0 [afca9d9078e8] __schedule at bb235729
afca9d9078f0:  [985eccb2e580:task_struct]
afca9d907900: [985ec7a16580:task_struct] 985ed949b280
afca9d907910: afca9d907978 __schedule+953
afca9d907920: btree_get_extent 9de968f0
afca9d907930: 985ed949b280 afca9d907958
afca9d907940: 0004 00a90842012fd9df
afca9d907950: [985ec7a16580:task_struct]
[9859d360cb50:btrfs_extent_buffer]
afca9d907960: [9859d360cb58:btrfs_extent_buffer]
[985ec7a16580:task_struct]
afca9d907970: [985ec7a16580:task_struct] afca9d907990
afca9d907980: schedule+54
 #1 [afca9d907980] schedule at bb235c96
afca9d907988: [9859d360caf0:btrfs_extent_buffer] afca9d9079f8
afca9d907998: btrfs_tree_read_lock+204
 #2 [afca9d907998] btrfs_tree_read_lock at c03e112c [btrfs]
afca9d9079a0: 985e [985ec7a16580:task_struct]
afca9d9079b0: autoremove_wake_function
[9859d360cb60:btrfs_extent_buffer]
afca9d9079c0: [9859d360cb60:btrfs_extent_buffer] 00a90842012fd9df
afca9d9079d0: [985a6ca3c370:Acpi-State]
[9859d360caf0:btrfs_extent_buffer]
afca9d9079e0: afca9d907ac0 [985e751bc000:kmalloc-8192]
afca9d9079f0: [985e751bc000:kmalloc-8192] afca9d907a48
afca9d907a00: __add_missing_keys+190
 #3 [afca9d907a00] __add_missing_keys at c040abae [btrfs]
afca9d907a08:  afca9d907a28
afca9d907a18: free_extent_buffer+75 00a90842012fd9df
afca9d907a28: afca9d907ab0 afca9d907be8
afca9d907a38:  [985e78dae540:btrfs_path]
afca9d907a48: afca9d907b28 find_parent_nodes+889
 #4 [afca9d907a50] find_parent_nodes at c040c4d9 [btrfs]
afca9d907a58: [985e751bc000:kmalloc-8192]
[9859d613cf40:kmalloc-32]
afca9d907a68: [9859d613c220:kmalloc-32] 
afca9d907a78: 030dc000 
afca9d907a88: [985e78dae540:btrfs_path] 
afca9d907a98: 000178dae540 
afca9d907aa8: 0002 afca9d907ab0
afca9d907ab8: afca9d907ab0 [985a6ca3c370:Acpi-State]
afca9d907ac8: [985a6ca3ce10:Acpi-State] c000985e751bc000
afca9d907ad8: 01a9030d 
afca9d907ae8: a9030dc0 0001
afca9d907af8: 00a90842012fd9df [9859d613c220:kmalloc-32]
afca9d907b08: afca9d907be8 030dc000
afca9d907b18: [985e751bc000:kmalloc-8192] 
afca9d907b28: afca9d907b98 __btrfs_find_all_roots+169
 #5 [afca9d907b30] __btrfs_find_all_roots at c040cb09 [btrfs]
afca9d907b38:  
afca9d907b48:  
afca9d907b58:  [9859d5e63c10:kmalloc-64]
afca9d907b68: 00a90842012fd9df [985e751bc788:kmalloc-8192]
afca9d907b78: [9859d5e63140:kmalloc-64]
[985e9dfa8ee8:btrfs_transaction]
afca9d907b88: [985e9dfa8d80:btrfs_transaction] 0321
afca9d907b98: afca9d907bd0 btrfs_find_all_roots+85
 #6 [afca9d907ba0] btrfs_find_all_roots at c040cbf5 [btrfs]
afca9d907ba8: afca9d907be8 
afca9d907bb8: 42b93000 [985e751bc000:kmalloc-8192]
afca9d907bc8: [985e751bc000:kmalloc-8192] afca9d907c18
afca9d907bd8: btrfs_qgroup_trace_extent+302
 #7 [afca9d907bd8] btrfs_qgroup_trace_extent at c04115ee [btrfs]
afca9d907be0: 1000 [9859d613cf40:kmalloc-32]
afca9d907bf0: 00a90842012fd9df [9859dc0f0578:btrfs_extent_buffer]
afca9d907c00: 0ce5 2fa4
afca9d907c10: [985e751bc000:kmalloc-8192] afca9d907c80
afca9d907c20: btrfs_qgroup_trace_leaf_items+279
 #8 [afca9d907c20] btrfs_qgroup_trace_leaf_items at c0411747 [btrfs]
afca9d907c28: 42b93000 [985eb63d9a40:btrfs_trans_handle]
afca9d907c38: 72ffc03848e8 

[PATCH v2] btrfs: Convert btrfs_fs_info's free_chunk_space to an atomic64_t

2017-06-07 Thread Sargun Dhillon
This patch is a small performance optimization to get rid of a spin
lock, where instead an atomic64_t can be used.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Reviewed-by: Omar Sandoval <osan...@fb.com>
---
 fs/btrfs/ctree.h   |  4 ++--
 fs/btrfs/disk-io.c |  3 +--
 fs/btrfs/extent-tree.c |  4 +---
 fs/btrfs/volumes.c | 26 +++---
 4 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d..01bf42c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -748,8 +749,7 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
 
/* keep track of unallocated space */
-   spinlock_t free_chunk_lock;
-   u64 free_chunk_space;
+   atomic64_t free_chunk_space;
 
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8685d67..3700b68 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(_info->fs_roots_radix_lock);
spin_lock_init(_info->delayed_iput_lock);
spin_lock_init(_info->defrag_inodes_lock);
-   spin_lock_init(_info->free_chunk_lock);
spin_lock_init(_info->tree_mod_seq_lock);
spin_lock_init(_info->super_lock);
spin_lock_init(_info->qgroup_op_lock);
@@ -2667,7 +2666,7 @@ int open_ctree(struct super_block *sb,
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
-   fs_info->free_chunk_space = 0;
+   atomic64_set(_info->free_chunk_space, 0);
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e390451..955733c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4645,9 +4645,7 @@ static int can_overcommit(struct btrfs_root *root,
 
used += space_info->bytes_may_use;
 
-   spin_lock(_info->free_chunk_lock);
-   avail = fs_info->free_chunk_space;
-   spin_unlock(_info->free_chunk_lock);
+   avail = atomic64_read(_info->free_chunk_space);
 
/*
 * If we have dup, raid1 or raid10 then only half of the free
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 017b67d..992eb2a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2417,9 +2417,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, 
const char *device_path
fs_info->fs_devices->total_devices++;
fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += device->total_bytes;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_add(device->total_bytes, _info->free_chunk_space);
 
if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
@@ -2874,9 +2872,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
mutex_lock(_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += dev_extent_len;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_add(dev_extent_len, 
_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
mutex_unlock(_info->chunk_mutex);
}
@@ -4409,9 +4405,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 
new_size)
btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space -= diff;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_sub(diff, _info->free_chunk_space);
}
mutex_unlock(_info->chunk_mutex);
 
@@ -4535,9 +4529,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 
new_size)
btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += diff;
-  

Re: [PATCH] btrfs: Convert btrfs_fs_info's free_chunk_space to an atomic64_t

2017-06-07 Thread Sargun Dhillon
On Wed, Jun 7, 2017 at 11:23 AM, Omar Sandoval <osan...@osandov.com> wrote:
> On Wed, Jun 07, 2017 at 06:09:33PM +0000, Sargun Dhillon wrote:
>> This patch is a small performance optimization to get rid of a spin
>> lock, where instead an atomic64_t can be used.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> [snip]
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 017b67d..0123974 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -2417,9 +2417,7 @@ int btrfs_init_new_device(struct btrfs_fs_info 
>> *fs_info, const char *device_path
>>   fs_info->fs_devices->total_devices++;
>>   fs_info->fs_devices->total_rw_bytes += device->total_bytes;
>>
>> - spin_lock(_info->free_chunk_lock);
>> - fs_info->free_chunk_space += device->total_bytes;
>> - spin_unlock(_info->free_chunk_lock);
>> + atomic64_add(device->total_bytes, _info->free_chunk_space);
>>
>>   if (!blk_queue_nonrot(q))
>>   fs_info->fs_devices->rotating = 1;
>> @@ -2874,9 +2872,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle 
>> *trans,
>>   mutex_lock(_info->chunk_mutex);
>>   btrfs_device_set_bytes_used(device,
>>   device->bytes_used - dev_extent_len);
>> - spin_lock(_info->free_chunk_lock);
>> - fs_info->free_chunk_space += dev_extent_len;
>> - spin_unlock(_info->free_chunk_lock);
>> + atomic64_add(dev_extent_len, 
>> _info->free_chunk_space);
>>   btrfs_clear_space_info_full(fs_info);
>>   mutex_unlock(_info->chunk_mutex);
>>   }
>> @@ -4409,9 +4405,7 @@ int btrfs_shrink_device(struct btrfs_device *device, 
>> u64 new_size)
>>   btrfs_device_set_total_bytes(device, new_size);
>>   if (device->writeable) {
>>   device->fs_devices->total_rw_bytes -= diff;
>> - spin_lock(_info->free_chunk_lock);
>> - fs_info->free_chunk_space -= diff;
>> - spin_unlock(_info->free_chunk_lock);
>> + atomic64_sub(diff, _info->free_chunk_space);
>>   }
>>   mutex_unlock(_info->chunk_mutex);
>>
>> @@ -4535,9 +4529,7 @@ int btrfs_shrink_device(struct btrfs_device *device, 
>> u64 new_size)
>>   btrfs_device_set_total_bytes(device, old_size);
>>   if (device->writeable)
>>   device->fs_devices->total_rw_bytes += diff;
>> - spin_lock(_info->free_chunk_lock);
>> - fs_info->free_chunk_space += diff;
>> - spin_unlock(_info->free_chunk_lock);
>> + atomic64_add(diff, _info->free_chunk_space);
>>   mutex_unlock(_info->chunk_mutex);
>>   }
>>   return ret;
>> @@ -4882,9 +4874,7 @@ static int __btrfs_alloc_chunk(struct 
>> btrfs_trans_handle *trans,
>>   btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
>>   }
>>
>> - spin_lock(>free_chunk_lock);
>> - info->free_chunk_space -= (stripe_size * map->num_stripes);
>> - spin_unlock(>free_chunk_lock);
>> + atomic64_sub((stripe_size * map->num_stripes), 
>> >free_chunk_space);
>
> Can you please get rid of the extra parentheses around the
> multiplication here? Also curious if you were able to measure any sort
> of performance difference. Besides that,
Sure.

According to lock stats, this spinlock was accounting for a little bit
more than 1% of time in transactions on our workload. The latency
benefit in synthetic testing was not significant, but the fewer
spurious locks we're measuring, means that perf locks, and bpf
profiling is at a lower cost.
>
> Reviewed-by: Omar Sandoval <osan...@fb.com>
>
>>
>>   free_extent_map(em);
>>   check_raid56_incompat_flag(info, type);
>> @@ -6684,10 +6674,8 @@ static int read_one_dev(struct btrfs_fs_info *fs_info,
>>   device->in_fs_metadata = 1;
>>   if (device->writeable && !device->is_tgtdev_for_dev_replace) {
>>   device->fs_devices->total_rw_bytes += device->total_bytes;
>> - spin_lock(_info->free_chunk_lock);
>> - fs_info->free_chunk_space += device->total_bytes -
>> - device->bytes_used;
>> - spin_unlock(_info->free_chunk_lock);
>> + atomic64_add(device->total_bytes - device->bytes_used,
>> +  _info->free_chunk_space);
>>   }
>>   ret = 0;
>>   return ret;
>> --
>> 2.9.3
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: Convert btrfs_fs_info's free_chunk_space to an atomic64_t

2017-06-07 Thread Sargun Dhillon
This patch is a small performance optimization to get rid of a spin
lock, where instead an atomic64_t can be used.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ctree.h   |  4 ++--
 fs/btrfs/disk-io.c |  3 +--
 fs/btrfs/extent-tree.c |  4 +---
 fs/btrfs/volumes.c | 26 +++---
 4 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d..01bf42c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -748,8 +749,7 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
 
/* keep track of unallocated space */
-   spinlock_t free_chunk_lock;
-   u64 free_chunk_space;
+   atomic64_t free_chunk_space;
 
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8685d67..3700b68 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2626,7 +2626,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(_info->fs_roots_radix_lock);
spin_lock_init(_info->delayed_iput_lock);
spin_lock_init(_info->defrag_inodes_lock);
-   spin_lock_init(_info->free_chunk_lock);
spin_lock_init(_info->tree_mod_seq_lock);
spin_lock_init(_info->super_lock);
spin_lock_init(_info->qgroup_op_lock);
@@ -2667,7 +2666,7 @@ int open_ctree(struct super_block *sb,
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
-   fs_info->free_chunk_space = 0;
+   atomic64_set(_info->free_chunk_space, 0);
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e390451..955733c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4645,9 +4645,7 @@ static int can_overcommit(struct btrfs_root *root,
 
used += space_info->bytes_may_use;
 
-   spin_lock(_info->free_chunk_lock);
-   avail = fs_info->free_chunk_space;
-   spin_unlock(_info->free_chunk_lock);
+   avail = atomic64_read(_info->free_chunk_space);
 
/*
 * If we have dup, raid1 or raid10 then only half of the free
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 017b67d..0123974 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2417,9 +2417,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, 
const char *device_path
fs_info->fs_devices->total_devices++;
fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += device->total_bytes;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_add(device->total_bytes, _info->free_chunk_space);
 
if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
@@ -2874,9 +2872,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
mutex_lock(_info->chunk_mutex);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += dev_extent_len;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_add(dev_extent_len, 
_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
mutex_unlock(_info->chunk_mutex);
}
@@ -4409,9 +4405,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 
new_size)
btrfs_device_set_total_bytes(device, new_size);
if (device->writeable) {
device->fs_devices->total_rw_bytes -= diff;
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space -= diff;
-   spin_unlock(_info->free_chunk_lock);
+   atomic64_sub(diff, _info->free_chunk_space);
}
mutex_unlock(_info->chunk_mutex);
 
@@ -4535,9 +4529,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 
new_size)
btrfs_device_set_total_bytes(device, old_size);
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
-   spin_lock(_info->free_chunk_lock);
-   fs_info->free_chunk_space += diff;
-   spin_unlock(_info->free_chunk_loc

Re: Debugging Deadlocks?

2017-05-31 Thread Sargun Dhillon
On Wed, May 31, 2017 at 5:54 AM, David Sterba <dste...@suse.cz> wrote:
> On Tue, May 30, 2017 at 09:12:39AM -0700, Sargun Dhillon wrote:
>> We've been running BtrFS for a couple months now in production on
>> several clusters. We're running on Canonical's 4.8 kernel, and
>> currently, in the process of moving to our own patchset atop vanilla
>> 4.10+. I'm glad to say it's been a fairly good experience for us. Bar
>> some performance issues, it's been largely smooth sailing.
>
> Yay, thanks for the feedback.
>
>> There has been one class of persistent issues that has been plaguing
>> our cluster is deadlocks. We've seen a fair number of issues where
>> there are some number of background threads and user threads are in
>> the process of performing operations where some are waiting to start a
>> transaction, and at least one background thread or user thread is in
>> the process of committing a transaction. Unfortunately, these
>> situations are ending in deadlocks, where no threads are making
>> progress.
>
> In such situations, save stacks of all processes (/proc/PID/stack). I
> don't want to play terminology here, so by a deadlock I could understand
> a system that's making progress so slow that's effectively stuck. This
> could happen if the files are freamgented, so eg. traversing extents
> takes locks and has a lot of work before it unlocks. Add some extent
 > sharing and updating references, this adds some points where the threads
> just wait.
>
> The stacktraces could give an idea of what kind of hang it is.
>
We're saving a dump of the tasks currently running. A recent dump can
be found here: http://cwillu.com:8080/50.19.255.106/1. This is the
only snapshot I have from a node that's not making any progress.

We also see the other case, where tasks are not making progress very
quickly, and it causes the kernel hung task detector to kick in. This
happens pretty often, and it's difficult to catch when it's happening,
but the symptoms can be frustrating, including failed instance
healthchecks, poor performance, and high latency for interactive
services. Some of the traces we've gotten from the stuck task detector
include:
https://gist.github.com/sargun/9643c0c380d27a147ef3486e1d51dbdb
https://gist.github.com/sargun/8858263b8d04c8ab726738022725ec12


>> We've talked about a couple ideas internally, like adding the ability
>> to timeout transactions, abort commits or start_transactions which are
>> taking too long, and adding more debugging to get insights into the
>> state of the filesystem. Unfortunately, since our usage and knowledge
>> of BtrFS is still somewhat nascent, we're unsure of what is the right
>> investment.
>
> There's a kernel-wide hung task detection, but I think a similar
> mechanism around just the transaction commits would be useful, as a
> debugging option.
>
> There are number of ways how a transaction can be blocked though, so
> we'd need to choose the starting point. Extent-related locks, waiting
> for writes, other locks, the intenral transactional logic (and possibly
> more).
>
As a first step, it'd be nice to have the transaction wrapped in a
stack frame. We can then instrument it much more easily with off the
shelf tools like simple BPF-based kprobes / kretprobes, or ftraces,
rather than having to write a custom probe that's familiar with the
innards of the txn datastructure, and does its own accounting to keep
track of what's in flight.

I'll take a cut at something as simple as an in-memory list of
transactions which is periodically scanned for transactions which are
taking too long, and log whether they're stuck starting, commiting, or
in-flight and uncommitted.

>> I'm curious, are other people seeing deadlocks crop up in production
>> often? How are you going about debugging them, and are there any good
>> pieces of advice on avoiding these for production workloads?
>
> I have seen hangs with kernel 4.9 a while back triggered by a
> long-running iozone stress test, but 4.8 was not affected, and 4.10+
> worked fine again. I don't know if/which btrfs patches the 'canonical
> 4.8' kernel has, so this might not be related.
>
> As for deadlocks (double taken lock, lock inversion), I haven't seen
> them for a long time. The testing kernels run with lockdep, so we should
> be able to see them early. You could try to run turn lockdep on if the
> performance penalty is still acceptable for you.  But there are still
> cases that lockdep does not cover IIRC, due to the higher-level
> semantics of the various btrfs trees and locking of extent buffers.
For some of these use-cases, we can pretty easily recreate the pattern
on the machine. For others, it's more complicated to find out which
containers, and datasets were scheduled to be pro

Debugging Deadlocks?

2017-05-30 Thread Sargun Dhillon
We've been running BtrFS for a couple months now in production on
several clusters. We're running on Canonical's 4.8 kernel, and
currently, in the process of moving to our own patchset atop vanilla
4.10+. I'm glad to say it's been a fairly good experience for us. Bar
some performance issues, it's been largely smooth sailing.

There has been one class of persistent issues that has been plaguing
our cluster is deadlocks. We've seen a fair number of issues where
there are some number of background threads and user threads are in
the process of performing operations where some are waiting to start a
transaction, and at least one background thread or user thread is in
the process of committing a transaction. Unfortunately, these
situations are ending in deadlocks, where no threads are making
progress.

We've talked about a couple ideas internally, like adding the ability
to timeout transactions, abort commits or start_transactions which are
taking too long, and adding more debugging to get insights into the
state of the filesystem. Unfortunately, since our usage and knowledge
of BtrFS is still somewhat nascent, we're unsure of what is the right
investment.

I'm curious, are other people seeing deadlocks crop up in production
often? How are you going about debugging them, and are there any good
pieces of advice on avoiding these for production workloads?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/4] btrfs: Warn the user when the legacy btrfs_qgroup_create API is used

2017-05-26 Thread Sargun Dhillon
This patch adds a warning to let the user know when the legacy
qgroup creation / removal API is in use. Eventually, we can
deprecate this API.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 78c8321..fba409f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5027,6 +5027,8 @@ static long btrfs_ioctl_qgroup_create(struct file *file, 
void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
 
+   pr_info_once("btrfs: Usage of deprecated btrfs_qgroup_create ioctl\n");
+
ret = mnt_want_write_file(file);
if (ret)
return ret;
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/4] Qgroup uapi improvements

2017-05-26 Thread Sargun Dhillon
This patchset has several improvements around the qgroups user-facing
API. It introduces two new ioctls for creating, and removing qgroups.
These ioctls have a new args structure that allows passing flags, and
some reserved fields for future expansion. The ioctls prevent some
operations around level-0 qgroups as well.

The create operation prevents the creation of level-0 qgroups for
subvolumes that do not exist. The delete operations prevents the
deletion of level-0 qgroups that reference active volumes.

In adddition, it adds a mount option "qgroup_auto_cleanup".
When this mount option is specified, qgroups will automatically
be cleaned up at volume deletion time. The reason this is a mount
option over a default behaviour is to avoid breaking old scripts
that rely on the behaviour of the existing APIs. Users can opt
into the new behaviour, as opposed it being an automated
trigger on newly created qgroups. Later on, we can introduce
a flag to subvol / qgroup create that marks the qgroup as
automatically created, and delete the qgroup as automatically
as well.

Changes since v1:
  * Remove creation of level-0 qgroups without subvol
  * Add deprecation message for old API

Sargun Dhillon (4):
  btrfs: Fail on removing qgroup if del_qgroup_item fails
  btrfs: Add new ioctl uapis for qgroup creation / removal
  btrfs: Warn the user when the legacy btrfs_qgroup_create API is used
  btrfs: Add qgroup_auto_cleanup mount flag to automatically cleanup
qgroups

 fs/btrfs/ctree.h   |   1 +
 fs/btrfs/ioctl.c   | 117 -
 fs/btrfs/qgroup.c  |  79 --
 fs/btrfs/qgroup.h  |   6 ++-
 fs/btrfs/super.c   |  15 +-
 include/uapi/linux/btrfs.h |  22 +
 6 files changed, 230 insertions(+), 10 deletions(-)

-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/4] btrfs: Add new ioctl uapis for qgroup creation / removal

2017-05-26 Thread Sargun Dhillon
This patch introduces two new ioctls to create, and remove qgroups. These
offer a somewhat more intuitive set of operations with the opportunity
to add flags that gate to unintentional manipulation of qgroups.

The create qgroup ioctl has a a new semantic which level-0 qgroups cannot
be created unless a matching subvolume ID is created.

The delete qgroup ioctl has the new semantic that it will not let you
delete a currently utilized (referenced by a subvolume) level-0
qgroup unless you pass the BTRFS_QGROUP_NO_SUBVOL_CHECK flag.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ioctl.c   | 100 -
 fs/btrfs/qgroup.c  |  76 +++---
 fs/btrfs/qgroup.h  |   6 ++-
 include/uapi/linux/btrfs.h |  22 ++
 4 files changed, 195 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e176375..78c8321 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4922,6 +4922,98 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, 
void __user *arg)
return ret;
 }
 
+static long btrfs_ioctl_qgroup_create_v2(struct file *file, void __user *arg)
+{
+   struct btrfs_ioctl_qgroup_create_args_v2 create_args;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_fs_info *fs_info;
+   struct btrfs_root *root;
+   struct inode *inode;
+   int ret;
+   int err;
+
+   inode = file_inode(file);
+   fs_info = btrfs_sb(inode->i_sb);
+   root = BTRFS_I(inode)->root;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = copy_from_user(_args, arg, sizeof(create_args));
+   if (ret)
+   return ret;
+
+   if (!create_args.qgroupid)
+   return -EINVAL;
+
+   ret = mnt_want_write_file(file);
+   if (ret)
+   return ret;
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   ret = btrfs_create_qgroup(trans, fs_info, create_args.qgroupid, 1);
+   err = btrfs_end_transaction(trans);
+
+   if (err && !ret)
+   ret = err;
+
+out:
+   mnt_drop_write_file(file);
+   return ret;
+}
+
+static long btrfs_ioctl_qgroup_remove_v2(struct file *file, void __user *arg)
+{
+   struct btrfs_ioctl_qgroup_remove_args_v2 remove_args;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_fs_info *fs_info;
+   struct btrfs_root *root;
+   struct inode *inode;
+   int check;
+   int ret;
+   int err;
+
+   inode = file_inode(file);
+   fs_info = btrfs_sb(inode->i_sb);
+   root = BTRFS_I(inode)->root;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = copy_from_user(_args, arg, sizeof(remove_args));
+   if (ret)
+   return ret;
+
+   if (!remove_args.qgroupid)
+   return -EINVAL;
+   check = !(remove_args.flags & BTRFS_QGROUP_NO_SUBVOL_CHECK);
+
+   ret = mnt_want_write_file(file);
+   if (ret)
+   return ret;
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   ret = btrfs_remove_qgroup(trans, fs_info, remove_args.qgroupid, check);
+   err = btrfs_end_transaction(trans);
+
+   if (err && !ret)
+   ret = err;
+
+out:
+   mnt_drop_write_file(file);
+   return ret;
+}
+
 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 {
struct inode *inode = file_inode(file);
@@ -4958,9 +5050,9 @@ static long btrfs_ioctl_qgroup_create(struct file *file, 
void __user *arg)
 
/* FIXME: check if the IDs really exist */
if (sa->create) {
-   ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
+   ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid, 0);
} else {
-   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
+   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid, 0);
}
 
err = btrfs_end_transaction(trans);
@@ -5632,6 +5724,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_features(file, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
+   case BTRFS_IOC_QGROUP_CREATE_V2:
+   return btrfs_ioctl_qgroup_create_v2(file, argp);
+   case BTRFS_IOC_QGROUP_REMOVE_V2:
+   return btrfs_ioctl_qgroup_remove_v2(file, argp);
}
 
return -ENOTTY;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d39ffcc..ba8a6ee 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1247,8 +1247,51 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle 
*trans,
return ret;

[PATCH v2 4/4] btrfs: Add qgroup_auto_cleanup mount flag to automatically cleanup qgroups

2017-05-26 Thread Sargun Dhillon
This patch introduces a new mount option - qgroup_auto_cleanup.
The purpose of this mount option is to cause btrfs to automatically
delete qgroups on subvolume deletion. This only cleans up the
associated level-0 qgroup, and not qgroups that are above it.

Since this behaviour is API-breaking, as people may already have scripts
that do this cleanup on subvolume destruction, this feature, at least
for now, must be opt-in.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ctree.h |  1 +
 fs/btrfs/ioctl.c | 15 +++
 fs/btrfs/super.c | 15 ++-
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d..0300f6f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1348,6 +1348,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct 
btrfs_fs_info *info)
 #define BTRFS_MOUNT_FRAGMENT_METADATA  (1 << 25)
 #define BTRFS_MOUNT_FREE_SPACE_TREE(1 << 26)
 #define BTRFS_MOUNT_NOLOGREPLAY(1 << 27)
+#define BTRFS_MOUNT_QGROUP_AUTO_CLEANUP(1 << 28)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL  (30)
 #define BTRFS_DEFAULT_MAX_INLINE   (2048)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fba409f..04e4022 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2543,6 +2543,21 @@ static noinline int btrfs_ioctl_snap_destroy(struct file 
*file,
goto out_end_trans;
}
}
+   /*
+* Attempt to automatically remove the automatically attached qgroup
+* setup in btrfs_qgroup_inherit. As a matter of convention, the id
+* is the same as the subvolume id.
+*
+* This can fail non-fatally for level 0 qgroups, therefore we do
+* not abort the transaction if this fails, nor return an error.
+*/
+   if (btrfs_test_opt(fs_info, QGROUP_AUTO_CLEANUP)) {
+   ret = btrfs_remove_qgroup(trans, fs_info,
+ dest->root_key.objectid, 0);
+   if (ret && ret != -ENOENT)
+   btrfs_warn(fs_info,
+  "Failed to cleanup qgroup. err: %d", ret);
+   }
 
 out_end_trans:
trans->block_rsv = NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4f1cdd5..557f53f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -321,7 +321,8 @@ enum {
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
-   Opt_nologreplay, Opt_norecovery,
+   Opt_nologreplay, Opt_norecovery, Opt_qgroup_auto_cleanup,
+   Opt_no_qgroup_auto_cleanup,
 #ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
 #endif
@@ -381,6 +382,8 @@ static const match_table_t tokens = {
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_commit_interval, "commit=%d"},
+   {Opt_qgroup_auto_cleanup, "qgroup_auto_cleanup"},
+   {Opt_no_qgroup_auto_cleanup, "no_qgroup_auto_cleanup"},
 #ifdef CONFIG_BTRFS_DEBUG
{Opt_fragment_data, "fragment=data"},
{Opt_fragment_metadata, "fragment=metadata"},
@@ -808,6 +811,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
info->commit_interval = 
BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
+   case Opt_qgroup_auto_cleanup:
+btrfs_set_and_info(info, QGROUP_AUTO_CLEANUP,
+   "enabling qgroup auto cleanup");
+   break;
+   case Opt_no_qgroup_auto_cleanup:
+   btrfs_clear_and_info(info, QGROUP_AUTO_CLEANUP,
+"disabling qgroup auto cleanup");
+   break;
 #ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
@@ -1299,6 +1310,8 @@ static int btrfs_show_options(struct seq_file *seq, 
struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+   if (btrfs_test_opt(info, QGROUP_AUTO_CLEANUP))
+   seq_puts(seq, ",qgroup_auto_cleanup");
 #ifdef CONFIG_BTRFS_DEBUG
if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/4] btrfs: Fail on removing qgroup if del_qgroup_item fails

2017-05-26 Thread Sargun Dhillon
Previously, we were calling del_qgroup_item, and ignoring the return code
resulting in a potential to have divergent in-memory state without an
error. Perhaps, it makes sense to handle this error code, and put the
filesystem into a read only, or similar state.

This patch only adds reporting of the error if the error is fatal,
(any error other than qgroup not found).

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index deffbeb..d39ffcc 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1309,6 +1309,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
}
ret = del_qgroup_item(trans, quota_root, qgroupid);
 
+   if (ret && ret != -ENOENT)
+   goto out;
+
while (!list_empty(>groups)) {
list = list_first_entry(>groups,
struct btrfs_qgroup_list, next_group);
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: QGroups Semantics

2017-05-22 Thread Sargun Dhillon
On Sun, May 21, 2017 at 5:59 PM, Qu Wenruo <quwen...@cn.fujitsu.com> wrote:
>
>
> At 05/19/2017 05:07 PM, Sargun Dhillon wrote:
>>
>> I have some questions about the *intended* qgroups semantics, and why
>> we allow certain operations:
>>
>> 1) Why can you create a level 0 qgroup for a non-existent subvolume?
>> It looks like it just gets reset when you create the subvol anyway?
>> Should we prevent this?
>
>
> Personally speaking, I didn't see much meaning of allowing this, except
> setting limit before creating the subvolume.
>
> But that can also be done by setting up higher level qgroup and attach new
> subvolume to that higher level qgroup.
>
> IIRC there are some guys hoping to disable multi-level qgroup completely, if
> one day we decide to do that, then current behavior may be quite important.
>
>>
>> 2) Why do we allow deleting a level 0 qgroup for a currently existing
>> subvolume? It seems to me that just setting the limit to 0, and/or
>> removing relations would work equally well? Perhaps a new ioctl to
>> clear the qgroup would make sense to do this.
>
>
> At least I didn't see the meaning to allow deleting qgroup for existing
> subvolume.
>
> It won't even improve any performance since we still need to do the time
> consuming backref walk.
>
>>
>> 3) Why don't we remove the associated level 0 qgroup when you remove
>> the subvol with that id?
>
> This may be related to question 1).
> Sometimes we may want to keep the limit?
>
> Despite of that, I didn't see any reason to keep the orphan level 0 qgroup.
>
>>
>> 4) I noticed in qgroup.c, one of the outstanding items is to determine
>> how to  autocleanup. Are there any stated semantics around that, or
>> opinions?
>
>
> As long as we're going to stick with multi level qgroups, then autocleanup
> seems to be a good idea.
>
> Thanks,
> Qu
>
I propose the following changes:
1) We always cleanup level-0 qgroups by default, with no opt-out.
I see absolutely no reason to keep these around.

2) We do not allow the creation of level-0 qgroups for (sub)volumes
that do not exist.
If people want to have a limit at creation time, they should create a
higher level qgroup, and add the new qgroup to that one.

3) We add two new ioctls: qgroup_create_v2, and qgroup_delete_v2, and
add a pr_warn_once message when we see usage of the old API.

4) Add a flag to the qgroup_delete_v2 ioctl, NO_SUBVOL_CHECK.
If the flag is present, it will allow you to delete qgroups which
reference active subvolumes.

Opinions?

>>
>> PS:
>> If the answer to any of these is "it just needs to be worked on" --
>> I'm currently poking around this code path for some enhancements we're
>> doing for our usecase. I'm just trying to figure out how much of what
>> I'm doing is generalizable.
>>
>> -Thanks,
>> Sargun
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/8] btrfs: autoremove qgroup by default, and add a mount flag to override

2017-05-22 Thread Sargun Dhillon
On Sun, May 21, 2017 at 7:03 PM, Qu Wenruo <quwen...@cn.fujitsu.com> wrote:
>
>
> At 05/22/2017 09:58 AM, Sargun Dhillon wrote:
>>
>> On Sun, May 21, 2017 at 6:20 PM, Qu Wenruo <quwen...@cn.fujitsu.com>
>> wrote:
>>>
>>>
>>>
>>> At 05/20/2017 04:39 PM, Sargun Dhillon wrote:
>>>>
>>>>
>>>> This change follows the change to automatically remove qgroups
>>>> if the associated subvolume has also been removed. It changes
>>>> the default behaviour to automatically remove qgroups when
>>>> a subvolume is deleted, but this can be override with the
>>>> qgroup_keep mount flag.
>>>>
>>>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>>>> ---
>>>>fs/btrfs/ctree.h |  1 +
>>>>fs/btrfs/ioctl.c | 14 ++
>>>>fs/btrfs/super.c | 16 +++-
>>>>3 files changed, 30 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>>>> index 643c70d..4d57eb6 100644
>>>> --- a/fs/btrfs/ctree.h
>>>> +++ b/fs/btrfs/ctree.h
>>>> @@ -1348,6 +1348,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const
>>>> struct
>>>> btrfs_fs_info *info)
>>>>#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
>>>>#define BTRFS_MOUNT_FREE_SPACE_TREE   (1 << 26)
>>>>#define BTRFS_MOUNT_NOLOGREPLAY   (1 << 27)
>>>> +#define BTRFS_MOUNT_QGROUP_KEEP(1 << 28)
>>>>  #define BTRFS_DEFAULT_COMMIT_INTERVAL   (30)
>>>>#define BTRFS_DEFAULT_MAX_INLINE  (2048)
>>>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>>>> index e176375..b10d7bb 100644
>>>> --- a/fs/btrfs/ioctl.c
>>>> +++ b/fs/btrfs/ioctl.c
>>>> @@ -2544,6 +2544,20 @@ static noinline int
>>>> btrfs_ioctl_snap_destroy(struct
>>>> file *file,
>>>>  }
>>>>  }
>>>>+ /*
>>>> +* Attempt to automatically remove the automatically attached
>>>> qgroup
>>>> +* setup in btrfs_qgroup_inherit. As a matter of convention, the
>>>> id
>>>> +* is the same as the subvolume id.
>>>> +*
>>>> +* This can fail non-fatally for level 0 qgroups, and
>>>> potentially
>>>> +* leave the filesystem in an awkward, (but working) state.
>>>> +*/
>>>> +   if (!btrfs_test_opt(fs_info, QGROUP_KEEP)) {
>>>> +   ret = btrfs_remove_qgroup(trans, fs_info,
>>>> + dest->root_key.objectid);
>>>> +   if (ret && ret != -ENOENT)
>>>> +   pr_info("Could not automatically delete qgroup:
>>>> %d\n", ret);
>>>> +   }
>>>>out_end_trans:
>>>>  trans->block_rsv = NULL;
>>>>  trans->bytes_reserved = 0;
>>>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>>>> index 4f1cdd5..232e1b8 100644
>>>> --- a/fs/btrfs/super.c
>>>> +++ b/fs/btrfs/super.c
>>>> @@ -321,7 +321,7 @@ enum {
>>>>  Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
>>>>  Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
>>>>  Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
>>>> -   Opt_nologreplay, Opt_norecovery,
>>>> +   Opt_nologreplay, Opt_norecovery, Opt_qgroup_keep,
>>>> Opt_qgroup_nokeep,
>>>
>>>
>>>
>>> I prefer to add new meanings for btrfs_ioctl_qgroup_create_args->create.
So, just to bring this full circle before I do a v2, we want to make
it so on creation time, or after creation time you can make a qgroup
autoremove. And we want this to be a flag that gets inherited if
SUBVOL_QGROUP_INHERIT is set?

>>
>> I have two issues with this:
>> 1) Existing software, like Docker Daemon. There's no way I can tell
>> Docker daemon to use this new flag to create qgroups. Although, I
>> could modify it, it doesn't fit into the API very well.
>> 2) How do I do this with qgroups that already exist, or qgroups that
>> are created on-demand by qgroup_inherit?
>>
>> I think both of these would be fixed if we copied (a subset) of the
>> qgrou

Re: [PATCH 6/8] btrfs: Add code to check if a qgroup's subvol exists

2017-05-21 Thread Sargun Dhillon
On Sun, May 21, 2017 at 6:39 PM, Qu Wenruo <quwen...@cn.fujitsu.com> wrote:
>
>
> At 05/20/2017 04:39 PM, Sargun Dhillon wrote:
>>
>> This patch is to prepare for following patches in this patchset. The
>> purpose is to make it so that we can prevent accidental removal of
>> qgroups that are actively in use.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>> ---
>>   fs/btrfs/ioctl.c  |  4 ++--
>>   fs/btrfs/qgroup.c | 50
>> +-
>>   fs/btrfs/qgroup.h |  3 ++-
>>   3 files changed, 53 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index b10d7bb..2b1a8c1 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -2554,7 +2554,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct
>> file *file,
>>  */
>> if (!btrfs_test_opt(fs_info, QGROUP_KEEP)) {
>> ret = btrfs_remove_qgroup(trans, fs_info,
>> - dest->root_key.objectid);
>> + dest->root_key.objectid, 0);
>> if (ret && ret != -ENOENT)
>> pr_info("Could not automatically delete qgroup:
>> %d\n", ret);
>> }
>> @@ -4974,7 +4974,7 @@ static long btrfs_ioctl_qgroup_create(struct file
>> *file, void __user *arg)
>> if (sa->create) {
>> ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
>> } else {
>> -   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
>> +   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid,
>> 0);
>> }
>> err = btrfs_end_transaction(trans);
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index 588248b..a0699fd 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -1247,6 +1247,45 @@ int btrfs_del_qgroup_relation(struct
>> btrfs_trans_handle *trans,
>> return ret;
>>   }
>>   +/*
>> + * Meant to only operate on level-0 qroupid.
>> + *
>> + * It returns 1 if a matching subvolume is found; 0 if none is found.
>> + * < 0 if there is an error.
>> + */
>> +static int btrfs_subvolume_exists(struct btrfs_fs_info *fs_info, u64
>> qgroupid)
>> +{
>> +   struct btrfs_path *path;
>> +   struct btrfs_key key;
>> +   int err, ret = 0;
>> +
>> +   path = btrfs_alloc_path();
>> +   if (!path)
>> +   return -ENOMEM;
>> +
>> +   key.objectid = qgroupid;
>> +   key.type = BTRFS_ROOT_BACKREF_KEY;
>
>
> Fs root (subvolume id equals to 5) has no ROOT_BACKREF key.
> Such search would make fs tree always treated as not existed.
Is it okay to just explicitly exempt 5, or is there a better way to do
this search?

>
> Thanks,
> Qu
>
>> +   key.offset = 0;
>> +
>> +   err = btrfs_search_slot_for_read(fs_info->tree_root, , path,
>> 1, 0);
>> +   if (err == 1)
>> +   goto out;
>> +
>> +   if (err) {
>> +   ret = err;
>> +   goto out;
>> +   }
>> +
>> +   btrfs_item_key_to_cpu(path->nodes[0], , path->slots[0]);
>> +   if (key.objectid != qgroupid || key.type !=
>> BTRFS_ROOT_BACKREF_KEY)
>> +   goto out;
>> +
>> +   ret = 1;
>> +out:
>> +   btrfs_free_path(path);
>> +   return ret;
>> +}
>> +
>>   /* Must be called with qgroup_ioctl_lock held */
>>   static int __btrfs_create_qgroup(struct btrfs_trans_handle *trans,
>>  struct btrfs_fs_info *fs_info, u64
>> qgroupid)
>> @@ -1333,10 +1372,19 @@ static int __btrfs_remove_qgroup(struct
>> btrfs_trans_handle *trans,
>>   }
>> int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
>> -   struct btrfs_fs_info *fs_info, u64 qgroupid)
>> +   struct btrfs_fs_info *fs_info, u64 qgroupid,
>> +   int check_in_use)
>>   {
>> int ret;
>>   + if (check_in_use && btrfs_qgroup_level(qgroupid) == 0) {
>> +   ret = btrfs_subvolume_exists(fs_info, qgroupid);
>> +   if (ret < 0)
>> +   return ret;
>> +   if (ret)
>> +   return -EBUSY;
>> +   }
>> +
>> mutex_lock(_info->qgroup_ioctl_lock);
>&

Re: [PATCH 4/8] btrfs: autoremove qgroup by default, and add a mount flag to override

2017-05-21 Thread Sargun Dhillon
On Sun, May 21, 2017 at 6:20 PM, Qu Wenruo <quwen...@cn.fujitsu.com> wrote:
>
>
> At 05/20/2017 04:39 PM, Sargun Dhillon wrote:
>>
>> This change follows the change to automatically remove qgroups
>> if the associated subvolume has also been removed. It changes
>> the default behaviour to automatically remove qgroups when
>> a subvolume is deleted, but this can be override with the
>> qgroup_keep mount flag.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>> ---
>>   fs/btrfs/ctree.h |  1 +
>>   fs/btrfs/ioctl.c | 14 ++
>>   fs/btrfs/super.c | 16 +++-
>>   3 files changed, 30 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 643c70d..4d57eb6 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -1348,6 +1348,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct
>> btrfs_fs_info *info)
>>   #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
>>   #define BTRFS_MOUNT_FREE_SPACE_TREE   (1 << 26)
>>   #define BTRFS_MOUNT_NOLOGREPLAY   (1 << 27)
>> +#define BTRFS_MOUNT_QGROUP_KEEP(1 << 28)
>> #define BTRFS_DEFAULT_COMMIT_INTERVAL   (30)
>>   #define BTRFS_DEFAULT_MAX_INLINE  (2048)
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index e176375..b10d7bb 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -2544,6 +2544,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct
>> file *file,
>> }
>> }
>>   + /*
>> +* Attempt to automatically remove the automatically attached
>> qgroup
>> +* setup in btrfs_qgroup_inherit. As a matter of convention, the
>> id
>> +* is the same as the subvolume id.
>> +*
>> +* This can fail non-fatally for level 0 qgroups, and potentially
>> +* leave the filesystem in an awkward, (but working) state.
>> +*/
>> +   if (!btrfs_test_opt(fs_info, QGROUP_KEEP)) {
>> +   ret = btrfs_remove_qgroup(trans, fs_info,
>> + dest->root_key.objectid);
>> +   if (ret && ret != -ENOENT)
>> +   pr_info("Could not automatically delete qgroup:
>> %d\n", ret);
>> +   }
>>   out_end_trans:
>> trans->block_rsv = NULL;
>> trans->bytes_reserved = 0;
>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>> index 4f1cdd5..232e1b8 100644
>> --- a/fs/btrfs/super.c
>> +++ b/fs/btrfs/super.c
>> @@ -321,7 +321,7 @@ enum {
>> Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
>> Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
>> Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
>> -   Opt_nologreplay, Opt_norecovery,
>> +   Opt_nologreplay, Opt_norecovery, Opt_qgroup_keep,
>> Opt_qgroup_nokeep,
>
>
> I prefer to add new meanings for btrfs_ioctl_qgroup_create_args->create.
I have two issues with this:
1) Existing software, like Docker Daemon. There's no way I can tell
Docker daemon to use this new flag to create qgroups. Although, I
could modify it, it doesn't fit into the API very well.
2) How do I do this with qgroups that already exist, or qgroups that
are created on-demand by qgroup_inherit?

I think both of these would be fixed if we copied (a subset) of the
qgroup flags. If you look at my other patches, I think we want to
remove the ability to remove level-0 qgroups (which are inherited), so
we'd also want to add an ioctl to be able to modify (some of those)
flags. Do you think that adding inheritance, and a new ioctl is a
reasonable approach?

>
> It's a u64 value while we only checks if it's zero or not, to determine if
> it's creation or deletion.
>
> We could reuse it to extent the create/delete behavior, other than a new
> mount option, which is a global flag, just to alter qgroup behavior.
>
> Thanks,
> Qu
>
>
>>   #ifdef CONFIG_BTRFS_DEBUG
>> Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
>>   #endif
>> @@ -381,6 +381,8 @@ static const match_table_t tokens = {
>> {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
>> {Opt_fatal_errors, "fatal_errors=%s"},
>> {Opt_commit_interval, "commit=%d"},
>> +   {Opt_qgroup_keep, "qgroup_keep"},
>> +   {Opt_qgroup_nokeep, "qgroup_nokeep"},
>>   #ifdef CONFIG_BTRFS_DEBUG
>> 

Re: [PATCH 2/8] btrfs: Fail on removing qgroup if del_qgroup_item fails

2017-05-21 Thread Sargun Dhillon
On Sun, May 21, 2017 at 6:14 PM, Qu Wenruo <quwen...@cn.fujitsu.com> wrote:
>
>
> At 05/20/2017 04:39 PM, Sargun Dhillon wrote:
>>
>> Previously, we were calling del_qgroup_item, and ignoring the return code
>> resulting in a potential to have divergent in-memory state without an
>> error. Perhaps, it makes sense to handle this error code, and put the
>> filesystem into a read only, or similar state.
>>
>> This patch only adds reporting of the error.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>> ---
>>   fs/btrfs/qgroup.c | 2 ++
>>   1 file changed, 2 insertions(+)
>>
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index a2add44..7e772ba 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -1303,6 +1303,8 @@ static int __btrfs_remove_qgroup(struct
>> btrfs_trans_handle *trans,
>> return -EBUSY;
>> ret = del_qgroup_item(trans, quota_root, qgroupid);
>> +   if (ret)
>> +   goto out;
>
>
> I think we should continue to remove in-memory qgroup relationship even we
> didn't find corresponding qgroup item.
>
> IIRC it's better to catch less severe error like -ENOENT and continue.
>
> Although normally previous find_qgroup_rb() should return -ENOENT before we
> continue to del_qgroup_item(), but catching -ENOENT here could make the code
> more robust.
Agreed. Will fix this. I think it also return ENOMEM.
>
> Thanks,
> Qu
>
>
>> while (!list_empty(>groups)) {
>> list = list_first_entry(>groups,
>>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/8] btrfs: autoremove qgroup by default, and add a mount flag to override

2017-05-20 Thread Sargun Dhillon
Just as a follow-up, folks are working around this in userspace:
https://github.com/moby/moby/pull/29427


On Sat, May 20, 2017 at 1:39 AM, Sargun Dhillon <sar...@sargun.me> wrote:
> This change follows the change to automatically remove qgroups
> if the associated subvolume has also been removed. It changes
> the default behaviour to automatically remove qgroups when
> a subvolume is deleted, but this can be override with the
> qgroup_keep mount flag.
>
> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> ---
>  fs/btrfs/ctree.h |  1 +
>  fs/btrfs/ioctl.c | 14 ++
>  fs/btrfs/super.c | 16 +++-
>  3 files changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 643c70d..4d57eb6 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1348,6 +1348,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct 
> btrfs_fs_info *info)
>  #define BTRFS_MOUNT_FRAGMENT_METADATA  (1 << 25)
>  #define BTRFS_MOUNT_FREE_SPACE_TREE(1 << 26)
>  #define BTRFS_MOUNT_NOLOGREPLAY(1 << 27)
> +#define BTRFS_MOUNT_QGROUP_KEEP(1 << 28)
>
>  #define BTRFS_DEFAULT_COMMIT_INTERVAL  (30)
>  #define BTRFS_DEFAULT_MAX_INLINE   (2048)
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index e176375..b10d7bb 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -2544,6 +2544,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct 
> file *file,
> }
> }
>
> +   /*
> +* Attempt to automatically remove the automatically attached qgroup
> +* setup in btrfs_qgroup_inherit. As a matter of convention, the id
> +* is the same as the subvolume id.
> +*
> +* This can fail non-fatally for level 0 qgroups, and potentially
> +* leave the filesystem in an awkward, (but working) state.
> +*/
> +   if (!btrfs_test_opt(fs_info, QGROUP_KEEP)) {
> +   ret = btrfs_remove_qgroup(trans, fs_info,
> + dest->root_key.objectid);
> +   if (ret && ret != -ENOENT)
> +   pr_info("Could not automatically delete qgroup: 
> %d\n", ret);
> +   }
>  out_end_trans:
> trans->block_rsv = NULL;
> trans->bytes_reserved = 0;
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 4f1cdd5..232e1b8 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -321,7 +321,7 @@ enum {
> Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
> Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
> Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
> -   Opt_nologreplay, Opt_norecovery,
> +   Opt_nologreplay, Opt_norecovery, Opt_qgroup_keep, Opt_qgroup_nokeep,
>  #ifdef CONFIG_BTRFS_DEBUG
> Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
>  #endif
> @@ -381,6 +381,8 @@ static const match_table_t tokens = {
> {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
> {Opt_fatal_errors, "fatal_errors=%s"},
> {Opt_commit_interval, "commit=%d"},
> +   {Opt_qgroup_keep, "qgroup_keep"},
> +   {Opt_qgroup_nokeep, "qgroup_nokeep"},
>  #ifdef CONFIG_BTRFS_DEBUG
> {Opt_fragment_data, "fragment=data"},
> {Opt_fragment_metadata, "fragment=metadata"},
> @@ -808,6 +810,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
> *options,
> info->commit_interval = 
> BTRFS_DEFAULT_COMMIT_INTERVAL;
> }
> break;
> +   case Opt_qgroup_keep:
> +   btrfs_set_and_info(info, QGROUP_KEEP,
> +  "do not automatically delete 
> qgroups");
> +   break;
> +   case Opt_qgroup_nokeep:
> +   btrfs_clear_and_info(info, QGROUP_KEEP,
> +"automatically delete qgroups");
> +   break;
>  #ifdef CONFIG_BTRFS_DEBUG
> case Opt_fragment_all:
> btrfs_info(info, "fragmenting all space");
> @@ -1299,6 +1309,10 @@ static int btrfs_show_options(struct seq_file *seq, 
> struct dentry *dentry)
> seq_puts(seq, ",fatal_errors=panic");
> if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
> seq_printf(seq, ",commit=%d", info->commit_interval);
> +   if (btrfs_test_opt(info, QGROUP_KEEP))
> +   seq_puts(seq, ",qgroup_keep");
> +   else
> +   seq_puts(seq, ",qgroup_nokeep");
>  #ifdef CONFIG_BTRFS_DEBUG
> if (btrfs_test_opt(info, FRAGMENT_DATA))
> seq_puts(seq, ",fragment=data");
> --
> 2.9.3
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/8] btrfs: Add new ioctl uapis for qgroup creation / removal

2017-05-20 Thread Sargun Dhillon
This patch ties together the work in the previous patches, to introduce
semantics around the qgroup creation / removal API that are a bit more
intuitive that the current one. It also creates two new args structures
which have reserved space for future expansion, as opposed to single
datastructure for both creation and removal.

The associated semantics are as follows:
1) You cannot create a level 0 qgroup for a subvol which does not exist
   unless you pass in an override flag.
2) You cannot delete a level 0 qgroup which refers to a subvolume, which
   currently exists unless you pass in an override flag.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ioctl.c   | 98 ++
 include/uapi/linux/btrfs.h | 23 +++
 2 files changed, 121 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5e20643..7bf34b7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4936,6 +4936,100 @@ static long btrfs_ioctl_qgroup_assign(struct file 
*file, void __user *arg)
return ret;
 }
 
+static long btrfs_ioctl_qgroup_create_v2(struct file *file, void __user *arg)
+{
+   struct btrfs_ioctl_qgroup_create_args_v2 create_args;
+   struct inode *inode = file_inode(file);
+   struct btrfs_trans_handle *trans;
+   struct btrfs_fs_info *fs_info;
+   struct btrfs_root *root;
+   int check_subvol_exists;
+   int ret, err;
+
+   fs_info = btrfs_sb(inode->i_sb);
+   root = BTRFS_I(inode)->root;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = copy_from_user(_args, arg, sizeof(create_args));
+   if (ret)
+   return ret;
+
+   if (!create_args.qgroupid)
+   return -EINVAL;
+
+   ret = mnt_want_write_file(file);
+   if (ret)
+   return ret;
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   check_subvol_exists = !(create_args.flags &
+   BTRFS_QGROUP_CREATE_IGNORE_UNUSED);
+   ret = btrfs_create_qgroup(trans, fs_info, create_args.qgroupid,
+ check_subvol_exists);
+
+   err = btrfs_end_transaction(trans);
+   if (err && !ret)
+   ret = err;
+
+out:
+   mnt_drop_write_file(file);
+   return ret;
+}
+
+static long btrfs_ioctl_qgroup_remove_v2(struct file *file, void __user *arg)
+{
+   struct btrfs_ioctl_qgroup_remove_args_v2 remove_args;
+   struct inode *inode = file_inode(file);
+   struct btrfs_trans_handle *trans;
+   struct btrfs_fs_info *fs_info;
+   struct btrfs_root *root;
+   int check_in_use;
+   int ret, err;
+
+   fs_info = btrfs_sb(inode->i_sb);
+   root = BTRFS_I(inode)->root;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = copy_from_user(_args, arg, sizeof(remove_args));
+   if (ret)
+   return ret;
+
+   if (!remove_args.qgroupid)
+   return -EINVAL;
+
+   ret = mnt_want_write_file(file);
+   if (ret)
+   return ret;
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   check_in_use = !(remove_args.flags &
+BTRFS_QGROUP_REMOVE_NO_CHECK_IN_USE);
+   ret = btrfs_remove_qgroup(trans, fs_info, remove_args.qgroupid,
+ check_in_use);
+
+   err = btrfs_end_transaction(trans);
+   if (err && !ret)
+   ret = err;
+
+out:
+   mnt_drop_write_file(file);
+   return ret;
+}
+
 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 {
struct inode *inode = file_inode(file);
@@ -5626,6 +5720,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_qgroup_assign(file, argp);
case BTRFS_IOC_QGROUP_CREATE:
return btrfs_ioctl_qgroup_create(file, argp);
+   case BTRFS_IOC_QGROUP_CREATE_V2:
+   return btrfs_ioctl_qgroup_create_v2(file, argp);
+   case BTRFS_IOC_QGROUP_REMOVE_V2:
+   return btrfs_ioctl_qgroup_remove_v2(file, argp);
case BTRFS_IOC_QGROUP_LIMIT:
return btrfs_ioctl_qgroup_limit(file, argp);
case BTRFS_IOC_QUOTA_RESCAN:
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index a456e53..0a6652d 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -653,6 +653,25 @@ struct btrfs_ioctl_qgroup_create_args {
__u64 create;
__u64 qgroupid;
 };
+
+/* Allow the user to delete qgroups even if there isn't a subvol with the id */
+#defineBTRFS_QGROUP_CREATE_IGNORE_UNUSED   (1ULL << 0)
+
+struct btrfs_ioctl_qgroup_create_args_v2 {
+   __

[PATCH 6/8] btrfs: Add code to check if a qgroup's subvol exists

2017-05-20 Thread Sargun Dhillon
This patch is to prepare for following patches in this patchset. The
purpose is to make it so that we can prevent accidental removal of
qgroups that are actively in use.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ioctl.c  |  4 ++--
 fs/btrfs/qgroup.c | 50 +-
 fs/btrfs/qgroup.h |  3 ++-
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b10d7bb..2b1a8c1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2554,7 +2554,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file 
*file,
 */
if (!btrfs_test_opt(fs_info, QGROUP_KEEP)) {
ret = btrfs_remove_qgroup(trans, fs_info,
- dest->root_key.objectid);
+ dest->root_key.objectid, 0);
if (ret && ret != -ENOENT)
pr_info("Could not automatically delete qgroup: %d\n", 
ret);
}
@@ -4974,7 +4974,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, 
void __user *arg)
if (sa->create) {
ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
} else {
-   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
+   ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid, 0);
}
 
err = btrfs_end_transaction(trans);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 588248b..a0699fd 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1247,6 +1247,45 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
+/*
+ * Meant to only operate on level-0 qroupid.
+ *
+ * It returns 1 if a matching subvolume is found; 0 if none is found.
+ * < 0 if there is an error.
+ */
+static int btrfs_subvolume_exists(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+   struct btrfs_path *path;
+   struct btrfs_key key;
+   int err, ret = 0;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   key.objectid = qgroupid;
+   key.type = BTRFS_ROOT_BACKREF_KEY;
+   key.offset = 0;
+
+   err = btrfs_search_slot_for_read(fs_info->tree_root, , path, 1, 0);
+   if (err == 1)
+   goto out;
+
+   if (err) {
+   ret = err;
+   goto out;
+   }
+
+   btrfs_item_key_to_cpu(path->nodes[0], , path->slots[0]);
+   if (key.objectid != qgroupid || key.type != BTRFS_ROOT_BACKREF_KEY)
+   goto out;
+
+   ret = 1;
+out:
+   btrfs_free_path(path);
+   return ret;
+}
+
 /* Must be called with qgroup_ioctl_lock held */
 static int __btrfs_create_qgroup(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info, u64 qgroupid)
@@ -1333,10 +1372,19 @@ static int __btrfs_remove_qgroup(struct 
btrfs_trans_handle *trans,
 }
 
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
-   struct btrfs_fs_info *fs_info, u64 qgroupid)
+   struct btrfs_fs_info *fs_info, u64 qgroupid,
+   int check_in_use)
 {
int ret;
 
+   if (check_in_use && btrfs_qgroup_level(qgroupid) == 0) {
+   ret = btrfs_subvolume_exists(fs_info, qgroupid);
+   if (ret < 0)
+   return ret;
+   if (ret)
+   return -EBUSY;
+   }
+
mutex_lock(_info->qgroup_ioctl_lock);
ret = __btrfs_remove_qgroup(trans, fs_info, qgroupid);
mutex_unlock(_info->qgroup_ioctl_lock);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index fb6c7da..fc08bdb 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -127,7 +127,8 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle 
*trans,
 int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid);
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
-   struct btrfs_fs_info *fs_info, u64 qgroupid);
+   struct btrfs_fs_info *fs_info, u64 qgroupid,
+   int check_in_use);
 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
   struct btrfs_fs_info *fs_info, u64 qgroupid,
   struct btrfs_qgroup_limit *limit);
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/8] btrfs: Add code to prevent qgroup creation for a non-existent subvol

2017-05-20 Thread Sargun Dhillon
This patch is to prepare for following patches in this patchset. The code
allows you to check if a subvol exists, and to only allow the creation
of qgroups on subvols that already exist. It doesn't make sense to allow
the creation of level 0 qgroups otherwise.

The behaviour is to inherit (create) a qgroup when you create a new
subvol with quota on. If there is an existing qgroup with this same
ID, it will be reset.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ioctl.c  |  2 +-
 fs/btrfs/qgroup.c | 11 ++-
 fs/btrfs/qgroup.h |  3 ++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2b1a8c1..5e20643 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4972,7 +4972,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, 
void __user *arg)
 
/* FIXME: check if the IDs really exist */
if (sa->create) {
-   ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
+   ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid, 0);
} else {
ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid, 0);
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a0699fd..28e2c7f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1317,10 +1317,19 @@ static int __btrfs_create_qgroup(struct 
btrfs_trans_handle *trans,
 }
 
 int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-   struct btrfs_fs_info *fs_info, u64 qgroupid)
+   struct btrfs_fs_info *fs_info, u64 qgroupid,
+   int check_subvol_exists)
 {
int ret;
 
+   if (check_subvol_exists && btrfs_qgroup_level(qgroupid) == 0) {
+   ret = btrfs_subvolume_exists(fs_info, qgroupid);
+   if (ret < 0)
+   return ret;
+   if (!ret)
+   return -ENOENT;
+   }
+
mutex_lock(_info->qgroup_ioctl_lock);
ret = __btrfs_create_qgroup(trans, fs_info, qgroupid);
mutex_unlock(_info->qgroup_ioctl_lock);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index fc08bdb..1afbe40 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -125,7 +125,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle 
*trans,
 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
  struct btrfs_fs_info *fs_info, u64 src, u64 dst);
 int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-   struct btrfs_fs_info *fs_info, u64 qgroupid);
+   struct btrfs_fs_info *fs_info, u64 qgroupid,
+   int check_subvol_exists);
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid,
int check_in_use);
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/8] btrfs: autoremove qgroup by default, and add a mount flag to override

2017-05-20 Thread Sargun Dhillon
This change follows the change to automatically remove qgroups
if the associated subvolume has also been removed. It changes
the default behaviour to automatically remove qgroups when
a subvolume is deleted, but this can be override with the
qgroup_keep mount flag.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ctree.h |  1 +
 fs/btrfs/ioctl.c | 14 ++
 fs/btrfs/super.c | 16 +++-
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d..4d57eb6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1348,6 +1348,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct 
btrfs_fs_info *info)
 #define BTRFS_MOUNT_FRAGMENT_METADATA  (1 << 25)
 #define BTRFS_MOUNT_FREE_SPACE_TREE(1 << 26)
 #define BTRFS_MOUNT_NOLOGREPLAY(1 << 27)
+#define BTRFS_MOUNT_QGROUP_KEEP(1 << 28)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL  (30)
 #define BTRFS_DEFAULT_MAX_INLINE   (2048)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e176375..b10d7bb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2544,6 +2544,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file 
*file,
}
}
 
+   /*
+* Attempt to automatically remove the automatically attached qgroup
+* setup in btrfs_qgroup_inherit. As a matter of convention, the id
+* is the same as the subvolume id.
+*
+* This can fail non-fatally for level 0 qgroups, and potentially
+* leave the filesystem in an awkward, (but working) state.
+*/
+   if (!btrfs_test_opt(fs_info, QGROUP_KEEP)) {
+   ret = btrfs_remove_qgroup(trans, fs_info,
+ dest->root_key.objectid);
+   if (ret && ret != -ENOENT)
+   pr_info("Could not automatically delete qgroup: %d\n", 
ret);
+   }
 out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4f1cdd5..232e1b8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -321,7 +321,7 @@ enum {
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
-   Opt_nologreplay, Opt_norecovery,
+   Opt_nologreplay, Opt_norecovery, Opt_qgroup_keep, Opt_qgroup_nokeep,
 #ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
 #endif
@@ -381,6 +381,8 @@ static const match_table_t tokens = {
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_commit_interval, "commit=%d"},
+   {Opt_qgroup_keep, "qgroup_keep"},
+   {Opt_qgroup_nokeep, "qgroup_nokeep"},
 #ifdef CONFIG_BTRFS_DEBUG
{Opt_fragment_data, "fragment=data"},
{Opt_fragment_metadata, "fragment=metadata"},
@@ -808,6 +810,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
info->commit_interval = 
BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
+   case Opt_qgroup_keep:
+   btrfs_set_and_info(info, QGROUP_KEEP,
+  "do not automatically delete 
qgroups");
+   break;
+   case Opt_qgroup_nokeep:
+   btrfs_clear_and_info(info, QGROUP_KEEP,
+"automatically delete qgroups");
+   break;
 #ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
btrfs_info(info, "fragmenting all space");
@@ -1299,6 +1309,10 @@ static int btrfs_show_options(struct seq_file *seq, 
struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+   if (btrfs_test_opt(info, QGROUP_KEEP))
+   seq_puts(seq, ",qgroup_keep");
+   else
+   seq_puts(seq, ",qgroup_nokeep");
 #ifdef CONFIG_BTRFS_DEBUG
if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/8] btrfs: qgroup.h whitespace change

2017-05-20 Thread Sargun Dhillon
This patch is just changing whitespace alignment in qgroup.h

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index fe04d3f..fb6c7da 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -127,7 +127,7 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle 
*trans,
 int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid);
 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid);
+   struct btrfs_fs_info *fs_info, u64 qgroupid);
 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
   struct btrfs_fs_info *fs_info, u64 qgroupid,
   struct btrfs_qgroup_limit *limit);
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/8] btrfs: Split up btrfs_remove_qgroup, no logic changes

2017-05-20 Thread Sargun Dhillon
This change is purely a style change, so that btrfs_remove_qgroup is
split. There is some whitespace changes, but this shouldn't have any
effect on the logic of the code.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.c | 49 -
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index deffbeb..a2add44 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1281,40 +1281,35 @@ int btrfs_create_qgroup(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
-   struct btrfs_fs_info *fs_info, u64 qgroupid)
+/* Must be called with qgroup_ioctl_lock held */
+static int __btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+struct btrfs_fs_info *fs_info, u64 qgroupid)
 {
+   struct btrfs_qgroup_list *list;
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
-   struct btrfs_qgroup_list *list;
-   int ret = 0;
+   int ret;
 
-   mutex_lock(_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
-   if (!quota_root) {
-   ret = -EINVAL;
-   goto out;
-   }
+   if (!quota_root)
+   return -EINVAL;
 
qgroup = find_qgroup_rb(fs_info, qgroupid);
-   if (!qgroup) {
-   ret = -ENOENT;
-   goto out;
-   } else {
-   /* check if there are no children of this qgroup */
-   if (!list_empty(>members)) {
-   ret = -EBUSY;
-   goto out;
-   }
-   }
+   if (!qgroup)
+   return -ENOENT;
+
+   /* check if there are no children of this qgroup */
+   if (!list_empty(>members))
+   return -EBUSY;
+
ret = del_qgroup_item(trans, quota_root, qgroupid);
 
while (!list_empty(>groups)) {
list = list_first_entry(>groups,
struct btrfs_qgroup_list, next_group);
ret = __del_qgroup_relation(trans, fs_info,
-  qgroupid,
-  list->group->qgroupid);
+   qgroupid,
+   list->group->qgroupid);
if (ret)
goto out;
}
@@ -1322,8 +1317,20 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
spin_lock(_info->qgroup_lock);
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(_info->qgroup_lock);
+
 out:
+   return ret;
+}
+
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+   struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+   int ret;
+
+   mutex_lock(_info->qgroup_ioctl_lock);
+   ret = __btrfs_remove_qgroup(trans, fs_info, qgroupid);
mutex_unlock(_info->qgroup_ioctl_lock);
+
return ret;
 }
 
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/8] btrfs: Split up btrfs_create_qgroup, no logic changes

2017-05-20 Thread Sargun Dhillon
This change is purely a style change, so that btrfs_create_qgroup is
split. There is some whitespace changes, but this shouldn't have any
effect on the logic of the code.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.c | 32 
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7e772ba..588248b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1247,24 +1247,21 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
-   struct btrfs_fs_info *fs_info, u64 qgroupid)
+/* Must be called with qgroup_ioctl_lock held */
+static int __btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+struct btrfs_fs_info *fs_info, u64 qgroupid)
 {
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
-   int ret = 0;
+   int ret;
 
-   mutex_lock(_info->qgroup_ioctl_lock);
quota_root = fs_info->quota_root;
-   if (!quota_root) {
-   ret = -EINVAL;
-   goto out;
-   }
+   if (!quota_root)
+   return -EINVAL;
+
qgroup = find_qgroup_rb(fs_info, qgroupid);
-   if (qgroup) {
-   ret = -EEXIST;
-   goto out;
-   }
+   if (qgroup)
+   return -EEXIST;
 
ret = add_qgroup_item(trans, quota_root, qgroupid);
if (ret)
@@ -1277,7 +1274,18 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
if (IS_ERR(qgroup))
ret = PTR_ERR(qgroup);
 out:
+   return ret;
+}
+
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+   struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+   int ret;
+
+   mutex_lock(_info->qgroup_ioctl_lock);
+   ret = __btrfs_create_qgroup(trans, fs_info, qgroupid);
mutex_unlock(_info->qgroup_ioctl_lock);
+
return ret;
 }
 
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/8] btrfs: Fail on removing qgroup if del_qgroup_item fails

2017-05-20 Thread Sargun Dhillon
Previously, we were calling del_qgroup_item, and ignoring the return code
resulting in a potential to have divergent in-memory state without an
error. Perhaps, it makes sense to handle this error code, and put the
filesystem into a read only, or similar state.

This patch only adds reporting of the error.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a2add44..7e772ba 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1303,6 +1303,8 @@ static int __btrfs_remove_qgroup(struct 
btrfs_trans_handle *trans,
return -EBUSY;
 
ret = del_qgroup_item(trans, quota_root, qgroupid);
+   if (ret)
+   goto out;
 
while (!list_empty(>groups)) {
list = list_first_entry(>groups,
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/8] BtrFS: QGroups uapi improvements

2017-05-20 Thread Sargun Dhillon
This patchset contains some improvements to qgroups. It changes the
semantics around how qgroups are dealt with when subvolumes are
deleted, and it also adds two new ioctls for qgroup deletion
and addition.

The new semantic around qgroup removal is that when the qgroup_nokeep
mount flag is set, it when a subvolume is deleted, the associated
level-0 qgroup will also be removed. This does not trickle up to
high level qgroups.

In addition, it adds two new ioctls for qgroup addition and removal
which have flags to protect against creating qgroups for non-existent
volumes, and in addition flags to prevent the deletion of qgroups
that are associated with volumes.

Sargun Dhillon (8):
  btrfs: Split up btrfs_remove_qgroup, no logic changes
  btrfs: Fail on removing qgroup if del_qgroup_item fails
  btrfs: Split up btrfs_create_qgroup, no logic changes
  btrfs: autoremove qgroup by default, and add a mount flag to override
  btrfs: qgroup.h whitespace change
  btrfs: Add code to check if a qgroup's subvol exists
  btrfs: Add code to prevent qgroup creation for a non-existent subvol
  btrfs: Add new ioctl uapis for qgroup creation / removal

 fs/btrfs/ctree.h   |   1 +
 fs/btrfs/ioctl.c   | 116 -
 fs/btrfs/qgroup.c  | 140 ++---
 fs/btrfs/qgroup.h  |   6 +-
 fs/btrfs/super.c   |  16 +-
 include/uapi/linux/btrfs.h |  23 
 6 files changed, 264 insertions(+), 38 deletions(-)

-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


QGroups Semantics

2017-05-19 Thread Sargun Dhillon
I have some questions about the *intended* qgroups semantics, and why
we allow certain operations:

1) Why can you create a level 0 qgroup for a non-existent subvolume?
It looks like it just gets reset when you create the subvol anyway?
Should we prevent this?

2) Why do we allow deleting a level 0 qgroup for a currently existing
subvolume? It seems to me that just setting the limit to 0, and/or
removing relations would work equally well? Perhaps a new ioctl to
clear the qgroup would make sense to do this.

3) Why don't we remove the associated level 0 qgroup when you remove
the subvol with that id?

4) I noticed in qgroup.c, one of the outstanding items is to determine
how to autocleanup. Are there any stated semantics around that, or
opinions?

PS:
If the answer to any of these is "it just needs to be worked on" --
I'm currently poking around this code path for some enhancements we're
doing for our usecase. I'm just trying to figure out how much of what
I'm doing is generalizable.

-Thanks,
Sargun
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 0/2] btrfs: allow mechanism to override quota

2017-05-19 Thread Sargun Dhillon
On Fri, May 12, 2017 at 7:48 AM, David Sterba <dste...@suse.cz> wrote:
> On Thu, May 11, 2017 at 09:17:01PM +0000, Sargun Dhillon wrote:
>> This patchset makes it so that on a per-filesystem basis one can disable
>> quota enforcement for users with cap_sys_resource. This patchset can
>> likely later be extended to per-qgroup, or a per-volume basis. I'm
>> thinking of extending the sysfs interface to list the qgroups and
>> this same interface for the qgroups themselves.
>>
>> Changes since v1:
>>   -Rather than a separate member of btrfs_fs_info, use the existing
>>flags field
>
> Looks good to me, thanks.
I'm curious as to whether this approach is fine to get an Acked-by, or
if I need to figure out how to make it more leak-tolerant. I don't
think modifying the overridden extents inflight is a problem. I'm not
sure of a way a user would be able to create *new* chunks of data.
Alternatively, I'd be quite happy making this applicable to metadata
only, for file xattrs, creation, deletion, etc..

Opinions?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/2] btrfs: Add quota_override knob into sysfs

2017-05-11 Thread Sargun Dhillon
This patch adds the read-write attribute quota_override into sysfs.
Any process which has cap_sys_resource can set this flag to on, and
once it is set to true, processes with cap_sys_resource can exceed
the quota.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/sysfs.c | 41 +
 1 file changed, 41 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fb..c2d5f35 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -447,11 +447,52 @@ static ssize_t btrfs_clone_alignment_show(struct kobject 
*kobj,
 
 BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
 
+static ssize_t quota_override_show(struct kobject *kobj,
+  struct kobj_attribute *a, char *buf)
+{
+   struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+   int quota_override;
+
+   quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, _info->flags);
+   return snprintf(buf, PAGE_SIZE, "%d\n", quota_override);
+}
+
+static ssize_t quota_override_store(struct kobject *kobj,
+   struct kobj_attribute *a,
+   const char *buf, size_t len)
+{
+   struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+   unsigned long knob;
+   int err;
+
+   if (!fs_info)
+   return -EPERM;
+
+   if (!capable(CAP_SYS_RESOURCE))
+   return -EPERM;
+
+   err = kstrtoul(buf, 10, );
+   if (err)
+   return err;
+   if (knob > 1)
+   return -EINVAL;
+
+   if (knob)
+   set_bit(BTRFS_FS_QUOTA_OVERRIDE, _info->flags);
+   else
+   clear_bit(BTRFS_FS_QUOTA_OVERRIDE, _info->flags);
+
+   return len;
+}
+
+BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
+
 static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
BTRFS_ATTR_PTR(clone_alignment),
+   BTRFS_ATTR_PTR(quota_override),
NULL,
 };
 
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/2] btrfs: add quota override flag to enable quota override for sys_resource

2017-05-11 Thread Sargun Dhillon
This patch introduces the quota override flag to btrfs_fs_info, and
a change to quota limit checking code to temporarily allow for quota
to be overridden for processes with cap_sys_resource.

It's useful for administrative programs, such as log rotation,
that may need to temporarily use more disk space in order to free up
a greater amount of overall disk space without yielding more disk
space to the rest of userland.

Eventually, we may want to add the idea of an operator-specific
quota, operator reserved space, or something else to allow for
administrative override, but this is perhaps the simplest
solution.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ctree.h  | 2 ++
 fs/btrfs/qgroup.c | 5 +
 2 files changed, 7 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d..e86cb7c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -716,6 +716,8 @@ struct btrfs_delayed_root;
 #define BTRFS_FS_BTREE_ERR 11
 #define BTRFS_FS_LOG1_ERR  12
 #define BTRFS_FS_LOG2_ERR  13
+#define BTRFS_FS_QUOTA_OVERRIDE14
+
 /*
  * Indicate that a whole-filesystem exclusive operation is running
  * (device replace, resize, device add/delete, balance)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index deffbeb..458fec0 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2338,6 +2338,11 @@ static int qgroup_reserve(struct btrfs_root *root, u64 
num_bytes, bool enforce)
 
if (num_bytes == 0)
return 0;
+
+   if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, _info->flags) &&
+   capable(CAP_SYS_RESOURCE))
+   enforce = false;
+
 retry:
spin_lock(_info->qgroup_lock);
quota_root = fs_info->quota_root;
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/2] btrfs: allow mechanism to override quota

2017-05-11 Thread Sargun Dhillon
This patchset makes it so that on a per-filesystem basis one can disable
quota enforcement for users with cap_sys_resource. This patchset can
likely later be extended to per-qgroup, or a per-volume basis. I'm
thinking of extending the sysfs interface to list the qgroups and
this same interface for the qgroups themselves.

Changes since v1:
  -Rather than a separate member of btrfs_fs_info, use the existing
   flags field

Sargun Dhillon (2):
  btrfs: add quota override flag to enable quota override for
sys_resource
  btrfs: Add quota_override knob into sysfs

 fs/btrfs/ctree.h  |  2 ++
 fs/btrfs/qgroup.c |  5 +
 fs/btrfs/sysfs.c  | 41 +
 3 files changed, 48 insertions(+)

-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: allow processes with cap_sys_resource to exceed quota

2017-05-05 Thread Sargun Dhillon
If you see my follow-on patch, it allows disabling the quota limit for
folks with cap_sys_resource per filesystem. I don't want to have any
process to be able to turn off quota limits, but just the process that
is the logrotator (and has the proper capabilities). Unfortunately,
most folks don't lock down their capabilities, so I agree, making it
blindly based on capabilities seems like a poor idea.

On Fri, May 5, 2017 at 11:22 AM, David Sterba <dste...@suse.cz> wrote:
> On Fri, Apr 21, 2017 at 07:27:23AM -0500, Sargun Dhillon wrote:
>> What do you think about putting this behaviour behind a sysctl? Seems
>> better than to start introducing a new mechanism of marking tasks?
>
> Technically it's easy to add own btrfs-specific ioctl, temporarily
> turning off quota limits, but I'm still not sure about all the
> consequences as this would apply to the whole system, no? The
> capabilities are per process, much more fine grained. I don't have other
> ideas how to address the problem though.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs: add quota override attribute

2017-04-23 Thread Sargun Dhillon
On Sun, Apr 23, 2017 at 8:42 PM, Qu Wenruo <quwen...@cn.fujitsu.com> wrote:
>
>
> At 04/22/2017 07:12 AM, Sargun Dhillon wrote:
>>
>> This patch introduces the quota override flag to btrfs_fs_info, and
>> a change to quota limit checking code to temporarily allow for quota
>> to be overridden for processes with cap_sys_resource.
>>
>> It's useful for administrative programs, such as log rotation,
>> that may need to temporarily use more disk space in order to free up
>> a greater amount of overall disk space without yielding more disk
>> space to the rest of userland.
>>
>> Eventually, we may want to add the idea of an operator-specific
>> quota, operator reserved space, or something else to allow for
>> administrative override, but this is perhaps the simplest
>> solution.
>
>
> Indeed simplest method yet.
>
> But considering that reserved data space can be used by none privileged
> user, I'm not sure if it's a good idea.
>
> For example:
>
> If root want to write a 64M file with new data, then it reserved 64M data
> space.
>
> And another none privileged user also want to write that 64M file with
> different data, then the user won't need to reserve data space.
> (Although metadata space is still needed).
>
> Won't this cause some method to escaping the qgroup limit?
This is more of a failure-avoidance mechanism. We run containers that
don't have cap_sys_resource. The log rotator, on the other hand, has a
full-set of capabilities in the root user namespace. Given that we'd
only flip quota_override if the system gets into a state where the log
rotator cannot run, I don't see it being particularly problematic.

At least looking at my systems, none of my users have cap_sys_resource
in their capabilities set, and it seems to be the closest capability
that maps to disk quota logic. I'd hate to drop this into the bucket
of cap_sys_admin. Are you perhaps suggesting per-uid and per-gid
qgroups? Or being able to have a quota_override_uid value? I thought
about adding an extended attribute, but that would require the attr to
be set at file creation time, not necessarily when I need it for an
escape. Another idea was to do what ext does, in adding a special
"operator" reserved space which processes with uid == 0 &&
cap_sys_resource can use. This would require changing the on-disk
qgroup format.

You're right, we would (intentionally) "escape" the qgroup limit. A
process with cap_sys_resource would be able to allocate more disk
space temporarily If I'm understanding you correctly, I don't think
that they would be able to rewrite the file's contents because that'd
be considered changed extents, no?

>
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>> ---
>>   fs/btrfs/ctree.h  | 3 +++
>>   fs/btrfs/qgroup.c | 9 +++--
>>   2 files changed, 10 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index c411590..01a095b 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -1098,6 +1098,9 @@ struct btrfs_fs_info {
>> u32 nodesize;
>> u32 sectorsize;
>> u32 stripesize;
>> +
>> +   /* Allow tasks with cap_sys_resource to override the quota */
>> +   bool quota_override;
>
>
> Why not use existing fs_info->qgroup_flags?
Isn't that persisted? I don't want this to surprise users across reboots.

>
> Thanks,
> Qu
>
>
>>   };
>> static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index a59801d..0328492 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -2347,8 +2347,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle
>> *trans,
>> return ret;
>>   }
>>   -static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64
>> num_bytes)
>> +static bool qgroup_check_limits(const struct btrfs_qgroup *qg,
>> +   u64 num_bytes, bool quota_override)
>>   {
>> +   if (quota_override && capable(CAP_SYS_RESOURCE))
>> +   return true;
>> +
>> if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
>> qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer)
>> return false;
>> @@ -2366,6 +2370,7 @@ static int qgroup_reserve(struct btrfs_root *root,
>> u64 num_bytes, bool enforce)
>> struct btrfs_qgroup *qgroup;
>> struct btrfs_fs_info *fs_info = root->fs_info;
>> u64 ref_root = root->root_key.objectid;
>> + 

[PATCH 2/2] btrfs: Add quota_override knob into sysfs

2017-04-21 Thread Sargun Dhillon
This patch adds the read-write attribute quota_override into sysfs.
Any process which has cap_sys_resource can set this flag to on, and
once it is set to true, processes with cap_sys_resource can exceed
the quota.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/sysfs.c | 36 
 1 file changed, 36 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1f157fb..c62acca 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -447,11 +447,47 @@ static ssize_t btrfs_clone_alignment_show(struct kobject 
*kobj,
 
 BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
 
+static ssize_t quota_override_show(struct kobject *kobj,
+  struct kobj_attribute *a, char *buf)
+{
+   struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+   return snprintf(buf, PAGE_SIZE, "%d\n", fs_info->quota_override);
+}
+
+static ssize_t quota_override_store(struct kobject *kobj,
+   struct kobj_attribute *a,
+   const char *buf, size_t len)
+{
+   struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+   unsigned long knob;
+   int err;
+
+   if (!fs_info)
+   return -EPERM;
+
+   if (!capable(CAP_SYS_RESOURCE))
+   return -EPERM;
+
+   err = kstrtoul(buf, 10, );
+   if (err)
+   return err;
+   if (knob > 1)
+   return -EINVAL;
+
+   fs_info->quota_override = !!knob;
+
+   return len;
+}
+
+BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store);
+
 static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(label),
BTRFS_ATTR_PTR(nodesize),
BTRFS_ATTR_PTR(sectorsize),
BTRFS_ATTR_PTR(clone_alignment),
+   BTRFS_ATTR_PTR(quota_override),
NULL,
 };
 
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] btrfs: add quota override attribute

2017-04-21 Thread Sargun Dhillon
This patch introduces the quota override flag to btrfs_fs_info, and
a change to quota limit checking code to temporarily allow for quota
to be overridden for processes with cap_sys_resource.

It's useful for administrative programs, such as log rotation,
that may need to temporarily use more disk space in order to free up
a greater amount of overall disk space without yielding more disk
space to the rest of userland.

Eventually, we may want to add the idea of an operator-specific
quota, operator reserved space, or something else to allow for
administrative override, but this is perhaps the simplest
solution.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/ctree.h  | 3 +++
 fs/btrfs/qgroup.c | 9 +++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c411590..01a095b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1098,6 +1098,9 @@ struct btrfs_fs_info {
u32 nodesize;
u32 sectorsize;
u32 stripesize;
+
+   /* Allow tasks with cap_sys_resource to override the quota */
+   bool quota_override;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a59801d..0328492 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2347,8 +2347,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
-static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg,
+   u64 num_bytes, bool quota_override)
 {
+   if (quota_override && capable(CAP_SYS_RESOURCE))
+   return true;
+
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
@@ -2366,6 +2370,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 
num_bytes, bool enforce)
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
+   bool qo = fs_info->quota_override;
int ret = 0;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2401,7 +2406,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 
num_bytes, bool enforce)
 
qg = unode_aux_to_qgroup(unode);
 
-   if (enforce && !qgroup_check_limits(qg, num_bytes)) {
+   if (enforce && !qgroup_check_limits(qg, num_bytes, qo)) {
ret = -EDQUOT;
goto out;
}
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] btrfs: allow processes with exceed quota with override

2017-04-21 Thread Sargun Dhillon
This patchset makes it so that on a per-filesystem basis one can disable
quota enforcement for users with cap_sys_resource. This patchset can
likely later be extended to per-qgroup, or a per-volume basis. I'm
thinking of extending the sysfs interface to list the qgroups and
this same interface for the qgroups themselves.

Sargun Dhillon (2):
  btrfs: add quota override attribute
  btrfs: Add quota_override knob into sysfs

 fs/btrfs/ctree.h  |  3 +++
 fs/btrfs/qgroup.c |  9 +++--
 fs/btrfs/sysfs.c  | 36 
 3 files changed, 46 insertions(+), 2 deletions(-)

-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: allow processes with cap_sys_resource to exceed quota

2017-04-21 Thread Sargun Dhillon
What do you think about putting this behaviour behind a sysctl? Seems
better than to start introducing a new mechanism of marking tasks?

On Fri, Apr 21, 2017 at 6:05 AM, Adam Borowski <kilob...@angband.pl> wrote:
> On Fri, Apr 21, 2017 at 10:09:46AM +0000, Sargun Dhillon wrote:
>> This patch allows processes with CAP_SYS_RESOURCE to exceed the qgroup
>> limit. It's useful for administrative programs, such as log rotation,
>> that may need to temporarily use more disk space in order to free up
>> a greater amount of overall disk space without yielding more disk
>> space to the rest of userland.
>
>>  static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 
>> num_bytes)
>>  {
>> + if (capable(CAP_SYS_RESOURCE))
>> + return true;
>> +
>
> I don't think it's a good idea to make random root-uid processes ignore
> qgroups completely.  Just because the daemon in question doesn't use a
> separate uid is no reason to not protect you from it consuming all the disk
> space.
>
> A temporary request "please let me exceed limits" would make sense, though.
>
> The problem with CAP_SYS_RESOURCE is that it's always on unless explicitly
> dropped.
>
> --
> ⢀⣴⠾⠻⢶⣦⠀ Meow!
> ⣾⠁⢠⠒⠀⣿⡁
> ⢿⡄⠘⠷⠚⠋⠀ Collisions shmolisions, let's see them find a collision or second
> ⠈⠳⣄ preimage for double rot13!
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: allow processes with cap_sys_resource to exceed quota

2017-04-21 Thread Sargun Dhillon
The log rotation code that I have requires creating the new file
before it calls FICLONERANGE. Since it tries to copy close to a
newline and not the filesystem block boundary, it means that some of
the clone isn't a lazy copy, and instead has to be cloned byte for
byte. At a minimum I need a mechanism in order to circumvent the
metadata quota.

Since we don't have the notion of "operator reserved space", I'm not
sure of another straightforward way to do this. The other idea I had
was to introduce the idea of an operator UID to btrfs_qgroup, and
introduce a rsv_operator_rfer, and rsv_operator_excl, but that seems
to be killing the mosquito with the cannon.

On Fri, Apr 21, 2017 at 5:09 AM, Sargun Dhillon <sar...@sargun.me> wrote:
> This patch allows processes with CAP_SYS_RESOURCE to exceed the qgroup
> limit. It's useful for administrative programs, such as log rotation,
> that may need to temporarily use more disk space in order to free up
> a greater amount of overall disk space without yielding more disk
> space to the rest of userland.
>
> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> ---
>  fs/btrfs/qgroup.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index a59801d..b0af39c 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2349,6 +2349,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
> *trans,
>
>  static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
>  {
> +   if (capable(CAP_SYS_RESOURCE))
> +   return true;
> +
> if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
> qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer)
> return false;
> --
> 2.9.3
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: allow processes with cap_sys_resource to exceed quota

2017-04-21 Thread Sargun Dhillon
This patch allows processes with CAP_SYS_RESOURCE to exceed the qgroup
limit. It's useful for administrative programs, such as log rotation,
that may need to temporarily use more disk space in order to free up
a greater amount of overall disk space without yielding more disk
space to the rest of userland.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 fs/btrfs/qgroup.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a59801d..b0af39c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2349,6 +2349,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 
 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
 {
+   if (capable(CAP_SYS_RESOURCE))
+   return true;
+
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH ping] btrfs: warn about RAID5/6 being experimental at mount time

2017-04-20 Thread Sargun Dhillon
On Thu, Apr 20, 2017 at 3:13 PM, Duncan <1i5t5.dun...@cox.net> wrote:
> Adam Borowski posted on Wed, 19 Apr 2017 23:07:45 +0200 as excerpted:
>
>> Too many people come complaining about losing their data -- and indeed,
>> there's no warning outside a wiki and the mailing list tribal knowledge.
>> Message severity chosen for consistency with XFS -- "alert" makes dmesg
>> produce nice red background which should get the point across.
>
> Commenting on the idea and comment, because as a non-coder list regular,
> that's what I can evaluate fairly. =:^)
>
> A kernel dmesg warning like this makes more sense to me than trying to
> put it in, for instance, mkfs.btrfs, because the instability is primarily
> kernel code and at least the message can stay synced with it, being
> removed when considered appropriate, unlike userspace code which can't,
> because people often run userspace and kernelspace versions well out of
> sync with each other.
>
Seconded. As someone who's been trying to get BtrFS adopted, the
biggest hurdle has been around perception. Rarely do people use the
userspace tools directly, but rather through multiple layers of
abstraction where they don't see any warnings coming from it. I think
adding these warnings to kernel logs is an excellent suggestion.

>> I intend to ask for inclusion of this one (or an equivalent) in 4.9,
>> either in Debian or via GregKH -- while for us kernels "that old" are
>> history, regular users expect stable releases to be free of known
>> serious data loss bugs.
>
> Arguably it should go in the LTS-4.4 series as well, because we at least
> try to support the last two LTS series on-list, more or less giving up
> beyond that, and that's the relatively new 4.9 and the now going stale
> but we really should be still trying to support it 4.4.  Older than that,
> 4.1 was the only LTS after initial code completion, but since it should
> be simple enough even before that and certainly would be true, queuing
> the patch for any still being updated LTS back to initial partial support
> (3.9 IIRC) is arguably worthwhile.
>
> --
> Duncan - List replies preferred.   No HTML msgs.
> "Every nonfree program has a lord, a master --
> and if you use the program, he is your master."  Richard Stallman
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS as a GlusterFS storage back-end, and what I've learned from using it as such.

2017-04-12 Thread Sargun Dhillon
Not to change the topic too much but, is there a suite of tracing
scripts that one can attach to their BtrFS installation to gather
metrics about tree locking performance? We see an awful lot of
machines with a task waiting on btrfs_tree_lock, and a bunch of other
tasks that are also in disk sleep waiting on BtrFS. We also see a
bunch of hung timeouts around btrfs_destroy_inode -- We're running
Kernel 4.8, so we can pretty easily plug in BPF based probes into the
kernel to get this information, and aggregate it.

Rather than doing this work ourselves, I'm wondering if anyone else
has a good set of tools to collect perf data about BtrFS performance
and lock contention?

On Tue, Apr 11, 2017 at 10:49 PM, Qu Wenruo  wrote:
>
>
> At 04/11/2017 11:40 PM, Austin S. Hemmelgarn wrote:
>>
>> About a year ago now, I decided to set up a small storage cluster to store
>> backups (and partially replace Dropbox for my usage, but that's a separate
>> story).  I ended up using GlusterFS as the clustering software itself, and
>> BTRFS as the back-end storage.
>>
>> GlusterFS itself is actually a pretty easy workload as far as cluster
>> software goes.  It does some processing prior to actually storing the data
>> (a significant amount in fact), but the actual on-device storage on any
>> given node is pretty simple.  You have the full directory structure for the
>> whole volume, and whatever files happen to be on that node are located
>> within that tree exactly like they are in the GlusterFS volume. Beyond the
>> basic data, gluster only stores 2-4 xattrs per-file (which are used to track
>> synchronization, and also for it's internal data scrubbing), and a directory
>> called .glusterfs in the top of the back-end storage location for the volume
>> which contains the data required to figure out which node a file is on.
>> Overall, the access patterns mostly mirror whatever is using the Gluster
>> volume, or are reduced to slow streaming writes (when writing files and the
>> back-end nodes are computationally limited instead of I/O limited), with the
>> addition of some serious metadata operations in the .glusterfs directory
>> (lots of stat calls there, together with large numbers of small files).
>
>
> Any real world experience is welcomed to share.
>
>>
>> As far as overall performance, BTRFS is actually on par for this usage
>> with both ext4 and XFS (at least, on my hardware it is), and I actually see
>> more SSD friendly access patterns when using BTRFS in this case than any
>> other FS I tried.
>
>
> We also find that, for pure buffered read/write, btrfs is no worse than
> traditional fs.
>
> In our PostgreSQL test, btrfs can even get a little better performance than
> ext4/xfs when handling DB files.
>
> But if using btrfs for PostgreSQL Write Ahead Log (WAL), then it's
> completely another thing.
> Btrfs falls far behind ext4/xfs on HDD, only half of the TPC performance for
> low concurrency load.
>
> Due to btrfs CoW, btrfs causes extra IO for fsync.
> For example, if only to fsync 4K data, btrfs can cause 64K metadata write
> for default mkfs options.
> (One tree block for log root tree, one tree block for log tree, multiple by
> 2 for default DUP profile)
>
>>
>> After some serious experimentation with various configurations for this
>> during the past few months, I've noticed a handful of other things:
>>
>> 1. The 'ssd' mount option does not actually improve performance on these
>> SSD's.  To a certain extent, this actually surprised me at first, but having
>> seen Hans' e-mail and what he found about this option, it actually makes
>> sense, since erase-blocks on these devices are 4MB, not 2MB, and the drives
>> have a very good FTL (so they will aggregate all the little writes
>> properly).
>>
>> Given this, I'm beginning to wonder if it actually makes sense to not
>> automatically enable this on mount when dealing with certain types of
>> storage (for example, most SATA and SAS SSD's have reasonably good FTL's, so
>> I would expect them to have similar behavior).  Extrapolating further, it
>> might instead make sense to just never automatically enable this, and expose
>> the value this option is manipulating as a mount option as there are other
>> circumstances where setting specific values could improve performance (for
>> example, if you're on hardware RAID6, setting this to the stripe size would
>> probably improve performance on many cheaper controllers).
>>
>> 2. Up to a certain point, running a single larger BTRFS volume with
>> multiple sub-volumes is more computationally efficient than running multiple
>> smaller BTRFS volumes.  More specifically, there is lower load on the system
>> and lower CPU utilization by BTRFS itself without much noticeable difference
>> in performance (in my tests it was about 0.5-1% performance difference,
>> YMMV).  To a certain extent this makes some sense, but the turnover point
>> was actually a lot higher than I expected (with this workload,