Re: [PATCH 01/20] btrfs-progs: quota: Add -W option to rescan to wait without starting rescan

2018-05-02 Thread Qu Wenruo
Gentle ping.

This patch can be applied independently and is pretty critical for
exposing the rescan/transaction race.

It would be an important part of crafting a test case.
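
For reference, a rough sketch of the xfstests-style usage this enables
(the $SCRATCH_DEV/$SCRATCH_MNT variables follow xfstests conventions;
illustrative only, not part of the patch):

  mkfs.btrfs -f $SCRATCH_DEV
  mount $SCRATCH_DEV $SCRATCH_MNT
  btrfs quota enable $SCRATCH_MNT     # kernel kicks off a rescan internally
  btrfs quota rescan -W $SCRATCH_MNT  # wait for it without starting a new one
  btrfs qgroup show $SCRATCH_MNT      # qgroup numbers are now settled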

Thanks,
Qu

On 2018-03-08 10:40, je...@suse.com wrote:
> From: Jeff Mahoney 
> 
> This patch adds a new -W option to wait for a rescan without starting a
> new operation.  This is useful for things like xfstests where we want
> to do a "btrfs quota enable" and not continue until the subsequent
> rescan has finished.
> 
> In addition to documenting the new option in the man page, I've cleaned
> up the rescan entry to document the -w option a bit better.
> 
> Reviewed-by: Qu Wenruo 
> Signed-off-by: Jeff Mahoney 
> ---
>  Documentation/btrfs-quota.asciidoc | 10 +++---
>  cmds-quota.c   | 20 ++--
>  2 files changed, 21 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/btrfs-quota.asciidoc 
> b/Documentation/btrfs-quota.asciidoc
> index 85ebf729..0b64a69b 100644
> --- a/Documentation/btrfs-quota.asciidoc
> +++ b/Documentation/btrfs-quota.asciidoc
> @@ -238,15 +238,19 @@ Disable subvolume quota support for a filesystem.
>  *enable* ::
>  Enable subvolume quota support for a filesystem.
>  
> -*rescan* [-s] ::
> +*rescan* [-s|-w|-W] ::
>  Trash all qgroup numbers and scan the metadata again with the current config.
>  +
>  `Options`
>  +
>  -s
> -show status of a running rescan operation.
> +Show status of a running rescan operation.
> +
>  -w
> -wait for rescan operation to finish(can be already in progress).
> +Start rescan operation and wait until it has finished before exiting.  If a 
> rescan is already running, wait until it finishes and then exit without 
> starting a new one.
> +
> +-W
> +Wait for rescan operation to finish and then exit.  If a rescan is not 
> already running, exit silently.
>  
>  EXIT STATUS
>  ---
> diff --git a/cmds-quota.c b/cmds-quota.c
> index 745889d1..7f933495 100644
> --- a/cmds-quota.c
> +++ b/cmds-quota.c
> @@ -120,14 +120,20 @@ static int cmd_quota_rescan(int argc, char **argv)
>   int wait_for_completion = 0;
>  
>   while (1) {
> - int c = getopt(argc, argv, "sw");
> + int c = getopt(argc, argv, "swW");
>   if (c < 0)
>   break;
>   switch (c) {
>   case 's':
>   ioctlnum = BTRFS_IOC_QUOTA_RESCAN_STATUS;
>   break;
> + case 'W':
> + ioctlnum = 0;
> + wait_for_completion = 1;
> + break;
>   case 'w':
> + /* Reset it in case the user did both -W and -w */
> + ioctlnum = BTRFS_IOC_QUOTA_RESCAN;
>   wait_for_completion = 1;
>   break;
>   default:
> @@ -135,8 +141,8 @@ static int cmd_quota_rescan(int argc, char **argv)
>   }
>   }
>  
> - if (ioctlnum != BTRFS_IOC_QUOTA_RESCAN && wait_for_completion) {
> - error("switch -w cannot be used with -s");
> + if (ioctlnum == BTRFS_IOC_QUOTA_RESCAN_STATUS && wait_for_completion) {
> + error("switches -w/-W cannot be used with -s");
>   return 1;
>   }
>  
> @@ -150,8 +156,10 @@ static int cmd_quota_rescan(int argc, char **argv)
>   if (fd < 0)
>   return 1;
>  
> - ret = ioctl(fd, ioctlnum, &args);
> - e = errno;
> + if (ioctlnum) {
> + ret = ioctl(fd, ioctlnum, &args);
> + e = errno;
> + }
>  
>   if (ioctlnum == BTRFS_IOC_QUOTA_RESCAN_STATUS) {
>   close_file_or_dir(fd, dirstream);
> @@ -167,7 +175,7 @@ static int cmd_quota_rescan(int argc, char **argv)
>   return 0;
>   }
>  
> - if (ret == 0) {
> + if (ioctlnum == BTRFS_IOC_QUOTA_RESCAN && ret == 0) {
>   printf("quota rescan started\n");
>   fflush(stdout);
>   } else if (ret < 0 && (!wait_for_completion || e != EINPROGRESS)) {
> 





[PATCH] btrfs: qgroup: Allow trace_btrfs_qgroup_account_extent() to record its transid

2018-05-02 Thread Qu Wenruo
When debugging the quota rescan race, sometimes btrfs rescan could
account some old (committed) leaf and then re-account the newly
committed leaf in the next generation.

This race needs the extra transid to locate, so add @transid to
trace_btrfs_qgroup_account_extent() for such debugging.
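
For reference, a sketch of how the new field might be consumed while
debugging (assuming tracefs is mounted at /sys/kernel/debug/tracing and a
filesystem mounted at /mnt; both paths are illustrative):

  cd /sys/kernel/debug/tracing
  echo 1 > events/btrfs/btrfs_qgroup_account_extent/enable
  btrfs quota rescan -w /mnt
  grep btrfs_qgroup_account_extent trace
  # each entry now carries transid=..., so the same bytenr accounted in
  # two different transactions shows up with two different transids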

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/qgroup.c|  4 ++--
 include/trace/events/btrfs.h | 20 
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a5742e9e9a14..4baa4ba2d630 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2014,8 +2014,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle 
*trans,
 
BUG_ON(!fs_info->quota_root);
 
-   trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes,
- nr_old_roots, nr_new_roots);
+   trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
+   num_bytes, nr_old_roots, nr_new_roots);
 
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index cfcbcd2eb338..59bb885b557e 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1578,12 +1578,14 @@ DEFINE_EVENT(btrfs_qgroup_extent, 
btrfs_qgroup_trace_extent,
 
 TRACE_EVENT(btrfs_qgroup_account_extent,
 
-   TP_PROTO(const struct btrfs_fs_info *fs_info, u64 bytenr,
+   TP_PROTO(const struct btrfs_fs_info *fs_info, u64 transid, u64 bytenr,
 u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
 
-   TP_ARGS(fs_info, bytenr, num_bytes, nr_old_roots, nr_new_roots),
+   TP_ARGS(fs_info, transid, bytenr, num_bytes, nr_old_roots,
+   nr_new_roots),
 
TP_STRUCT__entry_btrfs(
+   __field(u64,  transid   )
__field(u64,  bytenr)
__field(u64,  num_bytes )
__field(u64,  nr_old_roots  )
@@ -1591,18 +1593,20 @@ TRACE_EVENT(btrfs_qgroup_account_extent,
),
 
TP_fast_assign_btrfs(fs_info,
+   __entry->transid= transid;
__entry->bytenr = bytenr;
__entry->num_bytes  = num_bytes;
__entry->nr_old_roots   = nr_old_roots;
__entry->nr_new_roots   = nr_new_roots;
),
 
-   TP_printk_btrfs("bytenr=%llu num_bytes=%llu nr_old_roots=%llu "
- "nr_new_roots=%llu",
- __entry->bytenr,
- __entry->num_bytes,
- __entry->nr_old_roots,
- __entry->nr_new_roots)
+   TP_printk_btrfs(
+"transid=%llu bytenr=%llu num_bytes=%llu nr_old_roots=%llu nr_new_roots=%llu",
+   __entry->transid,
+   __entry->bytenr,
+   __entry->num_bytes,
+   __entry->nr_old_roots,
+   __entry->nr_new_roots)
 );
 
 TRACE_EVENT(qgroup_update_counters,
-- 
2.17.0



Re: RAID56 - 6 parity raid

2018-05-02 Thread Duncan
Goffredo Baroncelli posted on Wed, 02 May 2018 22:40:27 +0200 as
excerpted:

> Anyway, my "rant" started when Duncan put the missing parity checksum
> alongside the write hole. The first might be a performance problem;
> the write hole, instead, could lead to losing data. My intention was to
> highlight that the parity checksum is not related to the reliability and
> safety of raid5/6.

Thanks for making that point... and to everyone else for the vigorous 
thread debating it, as I'm learning quite a lot! =:^)

From your first reply:

>> Why is the fact that the parity is not checksummed a problem?
>> I read several times that this is a problem. However, each time the
>> thread reached the conclusion that... it is not a problem.

I must have missed those threads, or at least, missed that conclusion 
from them (maybe believing they were about something rather narrower, or 
conflating... for instance), because AFAICT, this is the first time I've 
seen the practical merits of checksummed parity actually debated, at 
least in terms I as a non-dev can reasonably understand.  To my mind it 
was settled (or I'd have worded my original claim rather differently) and 
only now am I learning different.

And... to my credit... given the healthy vigor of the debate, it seems 
I'm not the only one that missed them...

But I'm surely learning of it now, and indeed, I had somewhat conflated 
parity-checksumming with the in-place-stripe-read-modify-write atomicity 
issue.  I'll leave the parity-checksumming debate (now that I know it at 
least remains debatable) to those more knowledgeable than myself, but in 
addition to what I've learned of it, I've definitely learned that I can't 
properly conflate it with the in-place stripe-rmw atomicity issue, so 
thanks!

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman



Re: RAID56 - 6 parity raid

2018-05-02 Thread Duncan
Gandalf Corvotempesta posted on Wed, 02 May 2018 19:25:41 +0000 as
excerpted:

> On 05/02/2018 03:47 AM, Duncan wrote:
>> Meanwhile, have you looked at zfs? Perhaps they have something like
>> that?
> 
> Yes, I've looked at ZFS and I'm using it on some servers but I don't
> like it too much for multiple reasons, for example:
> 
> 1) it is not officially in the kernel; we have to build a module every
> time with DKMS

FWIW zfs is excluded from my choice domain as well, due to the well known 
license issues.  Regardless of strict legal implications, because Oracle 
holds the copyrights they could easily solve that problem, and the fact that 
they haven't strongly suggests they have no interest in doing so.  That 
in turn means they have no interest in people like me running zfs, which 
means I have no interest in it either.

But because it does remain effectively the nearest to btrfs features and 
potential features "working now" solution out there, for those who simply 
_must_ have it and/or find it a more acceptable solution than cobbling 
together a multi-layer solution out of a standard filesystem on top of 
device-mapper or whatever, it's what I and others point to when people 
wonder about missing or unstable btrfs features.

> I'm new to BTRFS (in fact, I'm not using it) and I've seen in the status
> page that "it's almost ready".
> The only real missing part is a stable, secure and properly working
> RAID56,
> so I'm wondering why most effort isn't directed at fixing RAID56?

Well, they are.  But finding and fixing corner-case bugs takes time and 
early-adopter deployments, and btrfs doesn't have the engineering 
resources to simply assign to the problem that Sun had with zfs.

Despite that, as I stated, to the best of my/list knowledge the current 
btrfs raid56 code is now reasonably ready, tho it'll take another year or 
two without serious bug reports to actually demonstrate that.  It simply 
has the well known write hole that applies to all parity-raid unless 
they've taken specific measures such as partial-stripe-write logging 
(slow), writing a full stripe even if it's partially empty (wastes space 
and needs periodic maintenance to reclaim it), or variable-stripe-widths 
(needs periodic maintenance and is more complex than always writing full 
stripes even if they're partially empty) (both of the latter avoiding the 
problem by avoiding the in-place read-modify-write cycle entirely).

So to a large degree what's left is simply time for testing to 
demonstrate stability on the one hand, and a well known problem with 
parity-raid in general on the other.  There's the small detail that said 
well-known write hole has additional implementation-detail implications 
on btrfs, but at its root it's the same problem all parity-raid has, and 
people choosing parity-raid as a solution are already choosing to either 
live with it or ameliorate it in some other way (tho some parity-raid 
solutions have that amelioration built-in).

> There are some environments where a RAID1/10 is too expensive and a
> RAID6 is mandatory,
> but with the current state of RAID56, BTRFS can't be used for valuable
> data

Not entirely true.  Btrfs, even btrfs raid56 mode, _can_ be used for 
"valuable" data, it simply requires astute /practical/ definitions of 
"valuable", as opposed to simple claims that don't actually stand up in 
practice.

Here's what I mean:  The sysadmin's first rule of backups defines 
"valuable data" by the number of backups it's worth making of that data.  
If there's no backups, then by definition the data is worth less than the 
time/hassle/resources necessary to have that backup, because it's not a 
question of if, but rather when, something's going to go wrong with the 
working copy and it won't be available any longer.

Additional layers of backup and whether one keeps geographically 
separated off-site backups as well are simply extensions of the first-
level-backup case/rule.  The more valuable the data, the more backups 
it's worth having of it, and the more effort is justified in ensuring 
that single or even multiple disasters aren't going to leave no working 
backup.

With this view, it's perfectly fine to use btrfs raid56 mode for 
"valuable" data, because that data is backed up and that backup can be 
used as a fallback if necessary.  True, the "working copy" might not be 
as reliable as it is in some cases, but statistically, that simply brings 
the 50% chance of failure rate (or whatever other percentage chance you 
choose) closer, to say once a year, or once a month, rather than perhaps 
once or twice a decade.  Working copy failure is GOING to happen in any 
case, it's just a matter of playing the chance game as to when, and using 
a not yet fully demonstrated reliable filesystem mode simply brings up 
the chances a bit.

But if the data really *is* defined as "valuable", not simply /claimed/ 
to be valuable, then by that same definition, it *will* have a backup.

In the worst case, when some 

Re: Strange behavior (possible bugs) in btrfs

2018-05-02 Thread Vijay Chidambaram
On Mon, Apr 30, 2018 at 11:56 AM, Jayashree Mohan
 wrote:
> Hi Jeff,
>
> On Mon, Apr 30, 2018 at 11:51 AM, Jeff Mahoney  wrote:
>> On 4/30/18 12:04 PM, Vijay Chidambaram wrote:
>>> Hi,
>>>
>>> We found two more cases where the btrfs behavior is a little strange.
>>> In one case, an fsync-ed file goes missing after a crash. In the
>>> other, a renamed file shows up in both directories after a crash.
>>
>> Hi Vijay -
>>
>> What kernel version did you observe these with?  These seem like bugs
>> Filipe has already fixed.
>>
>> -Jeff
>>
>
> These bugs were observed on Kernel 4.16.0-041600-generic.
>

To add on to what Jayashree said, these bugs are still present in the
latest stable release of btrfs. We aren't sure if they have been
patched on a development branch.

We should note that similar bugs have been fixed before in btrfs, but
those patches don't seem to handle these cases.


Re: RAID56 - 6 parity raid

2018-05-02 Thread Goffredo Baroncelli
On 05/02/2018 11:20 PM, waxhead wrote:
> 
> 
[...]
> 
> Ok, before attempting an answer I have to admit that I do not know enough 
> about how RAID56 is laid out on disk in BTRFS terms. Is data checksummed per 
> stripe or per disk? Is parity calculated on the data only or is it calculated 
> on the data+checksum ?!
> 

Data is checksummed per block. The parity is invisible to the checksum. The 
parity is allocated in an "address space" parallel to the data "address space" 
exposed by the block group (BG).

BR
G.Baroncelli

-- 
gpg @keyserver.linux.it: Goffredo Baroncelli 
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


Re: RAID56 - 6 parity raid

2018-05-02 Thread waxhead



Andrei Borzenkov wrote:

On 02.05.2018 21:17, waxhead wrote:

Goffredo Baroncelli wrote:

On 05/02/2018 06:55 PM, waxhead wrote:


So again, which problem would be solved by having the parity checksummed?
To the best of my knowledge, none. In any case the data is
checksummed so it is impossible to return corrupted data (modulo bug
:-) ).


I am not a BTRFS dev, but this should be quite easy to answer.
Unless you checksum the parity there is no way to verify that
the data (parity) you use to reconstruct other data is correct.


In any case you could catch that the computed data is wrong, because
the data is always checksummed. And in any case you must check the
data against their checksum.


What if you lost an entire disk?


How does it matter exactly? RAID is per chunk anyway.

It does not matter. I was wrong, got bitten by thinking about BTRFS 
"RAID5" as normal RAID5. Again a good reason to change the naming for it 
I think...



or had corruption for both data AND checksum?


By the same logic you may have corrupted parity and its checksum.


Yup. Indeed


How do you plan to safely reconstruct that without checksummed
parity?



Define "safely". The main problem of current RAID56 implementation is
that stripe is not updated atomically (at least, that is what I
understood from the past discussions) and this is not solved by having
extra parity checksum. So how exactly "safety" is improved here? You
still need overall checksum to verify result of reconstruction, what
exactly extra parity checksum buys you?


> [...]




Again - please describe when having a parity checksum would be beneficial
over the current implementation. You do not reconstruct anything as long as
all data strips are there, so the parity checksum will not be used. If one
data strip fails (including checksum) it will be reconstructed and
verified. If the parity itself is corrupted, checksum verification fails
(hopefully). How is that different from verifying the parity checksum before
reconstructing? In both cases the data cannot be reconstructed, end of story.


Ok, before attempting an answer I have to admit that I do not know 
enough about how RAID56 is laid out on disk in BTRFS terms. Is data 
checksummed per stripe or per disk? Is parity calculated on the data 
only or is it calculated on the data+checksum ?!



[PATCH 2/3] btrfs: qgroups, remove unnecessary memset before btrfs_init_work

2018-05-02 Thread jeffm
From: Jeff Mahoney 

btrfs_init_work clears the work struct except for ->wq, so the memset
before calling btrfs_init_work in queue_rescan_worker is unnecessary.

We'll also initialize ->wq in btrfs_init_work so that it's obvious.

Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/async-thread.c | 1 +
 fs/btrfs/qgroup.c   | 3 ---
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d5540749f0e5..c614fb7b9b9d 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -354,6 +354,7 @@ void btrfs_init_work(struct btrfs_work *work, 
btrfs_work_func_t uniq_func,
INIT_WORK(&work->normal_work, uniq_func);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
+   work->wq = NULL;
 }
 
 static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 466744741873..3d47700c6a30 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2102,9 +2102,6 @@ static void queue_rescan_worker(struct btrfs_fs_info 
*fs_info)
init_completion(&fs_info->qgroup_rescan_completion);
mutex_unlock(&fs_info->qgroup_rescan_lock);
 
-   memset(&fs_info->qgroup_rescan_work, 0,
-  sizeof(fs_info->qgroup_rescan_work));
-
btrfs_init_work(&fs_info->qgroup_rescan_work,
btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
-- 
2.12.3



[PATCH 3/3] btrfs: qgroup, don't try to insert status item after ENOMEM in rescan worker

2018-05-02 Thread jeffm
From: Jeff Mahoney 

If we fail to allocate memory for a path, don't bother trying to
insert the qgroup status item.  We haven't done anything yet and it'll
fail also.  Just print an error and be done with it.

Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/qgroup.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3d47700c6a30..44d5e3da835a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2666,7 +2666,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work 
*work)
btrfs_end_transaction(trans);
}
 
-out:
btrfs_free_path(path);
 
mutex_lock(&fs_info->qgroup_rescan_lock);
@@ -2702,13 +2701,13 @@ static void btrfs_qgroup_rescan_worker(struct 
btrfs_work *work)
 
if (btrfs_fs_closing(fs_info)) {
btrfs_info(fs_info, "qgroup scan paused");
-   } else if (err >= 0) {
+   err = 0;
+   } else if (err >= 0)
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
-   } else {
+out:
+   if (err < 0)
btrfs_err(fs_info, "qgroup scan failed with %d", err);
-   }
-
 done:
mutex_lock(&fs_info->qgroup_rescan_lock);
fs_info->qgroup_rescan_running = false;
-- 
2.12.3



[PATCH v3 0/3] btrfs: qgroup rescan races (part 1)

2018-05-02 Thread jeffm
From: Jeff Mahoney 

Hi Dave -

Here's the updated patchset for the rescan races.  This fixes the issue
where we'd try to start multiple workers.  It introduces a new "ready"
bool that we set during initialization and clear while queuing the worker.
The queuer is also now responsible for most of the initialization.

I have a separate patch set started that gets rid of the racy mess surrounding
the rescan worker startup.  We can handle it in btrfs_run_qgroups and
just set a flag to start it everywhere else.

-Jeff

---

Jeff Mahoney (3):
  btrfs: qgroups, fix rescan worker running races
  btrfs: qgroups, remove unnecessary memset before btrfs_init_work
  btrfs: qgroup, don't try to insert status item after ENOMEM in rescan
worker

 fs/btrfs/async-thread.c |   1 +
 fs/btrfs/ctree.h|   2 +
 fs/btrfs/qgroup.c   | 100 +++-
 3 files changed, 60 insertions(+), 43 deletions(-)

-- 
2.12.3



[PATCH 1/3] btrfs: qgroups, fix rescan worker running races

2018-05-02 Thread jeffm
From: Jeff Mahoney 

Commit 8d9eddad194 (Btrfs: fix qgroup rescan worker initialization)
fixed the issue with BTRFS_IOC_QUOTA_RESCAN_WAIT being racy, but
ended up reintroducing the hang-on-unmount bug that the commit it
intended to fix addressed.

The race this time is between qgroup_rescan_init setting
->qgroup_rescan_running = true and the worker starting.  There are
many scenarios where we initialize the worker and never start it.  The
completion btrfs_ioctl_quota_rescan_wait waits for will never come.
This can happen even without involving error handling, since mounting
the file system read-only returns between initializing the worker and
queueing it.

The right place to do it is when we're queuing the worker.  The flag
really just means that btrfs_ioctl_quota_rescan_wait should wait for
a completion.

Since the BTRFS_QGROUP_STATUS_FLAG_RESCAN flag is overloaded to
refer to both runtime behavior and on-disk state, we introduce a new
fs_info->qgroup_rescan_ready to indicate that we're initialized and
waiting to start.

This patch introduces a new helper, queue_rescan_worker, that handles
most of the initialization, the two flags, and queuing the worker,
including races with unmount.

While we're at it, ->qgroup_rescan_running is protected only by the
->qgroup_rescan_mutex.  btrfs_ioctl_quota_rescan_wait doesn't need
to take the spinlock too.

Fixes: 8d9eddad194 (Btrfs: fix qgroup rescan worker initialization)
Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/ctree.h  |  2 ++
 fs/btrfs/qgroup.c | 94 +--
 2 files changed, 58 insertions(+), 38 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index da308774b8a4..4003498bb714 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1045,6 +1045,8 @@ struct btrfs_fs_info {
struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
+   /* qgroup rescan worker is running or queued to run */
+   bool qgroup_rescan_ready;
bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
 
/* filesystem state */
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index aa259d6986e1..466744741873 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -101,6 +101,7 @@ static int
 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
   int init_flags);
 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+static void btrfs_qgroup_rescan_worker(struct btrfs_work *work);
 
 /* must be called with qgroup_ioctl_lock held */
 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -2072,6 +2073,46 @@ int btrfs_qgroup_account_extents(struct 
btrfs_trans_handle *trans,
return ret;
 }
 
+static void queue_rescan_worker(struct btrfs_fs_info *fs_info)
+{
+   mutex_lock(&fs_info->qgroup_rescan_lock);
+   if (btrfs_fs_closing(fs_info)) {
+   mutex_unlock(&fs_info->qgroup_rescan_lock);
+   return;
+   }
+
+   if (WARN_ON(!fs_info->qgroup_rescan_ready)) {
+   btrfs_warn(fs_info, "rescan worker not ready");
+   mutex_unlock(&fs_info->qgroup_rescan_lock);
+   return;
+   }
+   fs_info->qgroup_rescan_ready = false;
+
+   if (WARN_ON(fs_info->qgroup_rescan_running)) {
+   btrfs_warn(fs_info, "rescan worker already queued");
+   mutex_unlock(&fs_info->qgroup_rescan_lock);
+   return;
+   }
+
+   /*
+* Being queued is enough for btrfs_qgroup_wait_for_completion
+* to need to wait.
+*/
+   fs_info->qgroup_rescan_running = true;
+   init_completion(&fs_info->qgroup_rescan_completion);
+   mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+   memset(&fs_info->qgroup_rescan_work, 0,
+  sizeof(fs_info->qgroup_rescan_work));
+
+   btrfs_init_work(&fs_info->qgroup_rescan_work,
+   btrfs_qgroup_rescan_helper,
+   btrfs_qgroup_rescan_worker, NULL, NULL);
+
+   btrfs_queue_work(fs_info->qgroup_rescan_workers,
+&fs_info->qgroup_rescan_work);
+}
+
 /*
  * called from commit_transaction. Writes all changed qgroups to disk.
  */
@@ -2123,8 +2164,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
-   btrfs_queue_work(fs_info->qgroup_rescan_workers,
-&fs_info->qgroup_rescan_work);
+   queue_rescan_worker(fs_info);
}
ret = 0;
}
@@ -2607,6 +2647,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work 
*work)
if (!path)
goto out;
 
+   mutex_lock(&fs_info->qgroup_rescan_lock);
+   

Re: RAID56 - 6 parity raid

2018-05-02 Thread Goffredo Baroncelli
On 05/02/2018 09:29 PM, Austin S. Hemmelgarn wrote:
> On 2018-05-02 13:25, Goffredo Baroncelli wrote:
>> On 05/02/2018 06:55 PM, waxhead wrote:

 So again, which problem would be solved by having the parity checksummed ? To the 
 best of my knowledge, none. In any case the data is checksummed so it is 
 impossible to return corrupted data (modulo bug :-) ).

>>> I am not a BTRFS dev, but this should be quite easy to answer. Unless you 
>>> checksum the parity there is no way to verify that the data (parity) 
>>> you use to reconstruct other data is correct.
>>
>> In any case you could catch that the computed data is wrong, because the data 
>> is always checksummed. And in any case you must check the data against their 
>> checksum.
>>
>> My point is that storing the checksum is a cost that you pay *every time*. 
>> Every time you update a part of a stripe you need to update the parity, and 
>> then in turn the parity checksum. It is not a problem of space occupied nor 
>> a computational problem. It is a problem of write amplification...
>>
>> The only gain is to avoid trying to use the parity when
>> a) you need it (i.e. when the data is missing and/or corrupted)
>> and b) it is corrupted.
>> But the likelihood of this case is very low. And you can catch it during the 
>> data checksum check (which has to be performed in any case !).
>>
>> So from one side you have a *cost every time* (the write amplification); on the 
>> other side you have a gain (cpu-time) *only in case* the parity is 
>> corrupted and you need it (eg. scrub or corrupted data).
>>
>> IMHO the costs are much higher than the gain, and the likelihood of the gain is 
>> very low compared to the likelihood (=100%, or always) of the cost.
> You do realize that a write is already rewriting checksums elsewhere? It 
> would be pretty trivial to make sure that the checksums for every part of a 
> stripe end up in the same metadata block, at which point the only cost is 
> computing the checksum (because when a checksum gets updated, the whole block 
> it's in gets rewritten, period, because that's how CoW works).
> 
> Looking at this another way (all the math below uses SI units):
> 
[...]
Good point: precomputing the checksum of the parity saves a lot of time for the 
scrub process. You can see this in a simpler way by saying that the parity 
calculation (which is dominated by the memory bandwidth) is like O(n) (where n 
is the number of disks); checking the parity (which again is dominated by the 
memory bandwidth) against a checksum is like O(1). And when the data written is 
two orders of magnitude less than the data stored, the effort required to 
precompute the checksum is negligible.

Anyway, my "rant" started when Ducan put near the missing of parity checksum 
and the write hole. The first might be a performance problem. Instead the write 
hole could lead to a loosing data. My intention was to highlight that the 
parity-checksum is not related to the reliability and safety of raid5/6.

> 
> So, let's look at data usage:
> 
> 1GB of data translates to 62500 16kB blocks of data, which equates to an 
> additional 15625 blocks for parity.  Adding parity checksums adds a 25% 
> overhead to checksums being written, but that actually doesn't translate to a 
> huge increase in the number of _blocks_ of checksums written.  One 16k block 
> can hold roughly 500 checksums, so it would take 125 blocks worth of 
> checksums without parity, and 157 (technically 156.25, but you can't write a 
> quarter block) with parity checksums. Thus, without parity checksums, writing 
> 1GB of data involves writing 78250 blocks, while doing the same with parity 
> checksums involves writing 78282 blocks, a net change of only 32 blocks, or 
> **0.0409%**.

How would you store the checksum?
I ask because I am not sure that we could use the "standard" btrfs 
metadata to store the checksum of the parity. Doing so you could face some 
pathological effects like:
- update a block(1) in a stripe(1)
- update the parity of stripe(1) containing block(1)
- update the checksum of parity stripe (1), which is contained in another 
stripe(2) [**]

- update the parity of stripe (2) containing the checksum of parity stripe(1)
- update the checksum of parity stripe (2), which is contained in another 
stripe(3)

and so on...

[**] pay attention that the checksum and the parity *have* to be in different 
stripes, otherwise you have the chicken/egg problem: compute the parity, then 
update the checksum, then update the parity again because the checksum has 
changed

In order to avoid that, I fear that you can't store the checksums in a raid5/6 
BG whose parity is itself checksummed.

It is a bit late and I am a bit tired out, so it may be that I am wrong; however I 
fear that the above "write amplification problem" may be a big problem. A 
possible solution would be storing the checksum in an N-mirror BG (where N is 1 
for raid5, 2 for raid6)

BR
G.Baroncelli

-- 

Re: fs/btrfs/ctree.h:1559 warnings

2018-05-02 Thread Paul Richards
On 2 May 2018 at 20:43, Paul Richards  wrote:
> The issue I have now is that the filesystem cannot be unmounted.
> "umount" reports "target is busy", but I cannot find anything relevant
> with "lsof" or "fuser" (this is a very quiet home NAS).  Is this
> something related to the resize?
>

Oh please ignore me.  I forgot to unmount "/storage/.snapshots" before
"/storage".

Everything looks fine.


Re: fs/btrfs/ctree.h:1559 warnings

2018-05-02 Thread Paul Richards
On 29 April 2018 at 02:50, Qu Wenruo  wrote:
>
>
> On 2018-04-29 04:16, Paul Richards wrote:
>> On 28 April 2018 at 20:39, Patrik Lundquist  
>> wrote:
>>> On 28 April 2018 at 20:54, Paul Richards  wrote:

 Hi,
 I recently upgraded from Linux 4.4.0 to 4.13.0 (Ubuntu 16.04 stock to
 hwe kernel).

 Since then, I've noticed lots of btrfs warnings in dmesg (example at
 the end).  I believe these warnings to be benign, and they relate to
 my partition not being a multiple of 4KiB in size (I confirmed that
 alignment is okay in this instance).
>>>
>>>
>>> Run btrfs rescue fix-device-size .
>>>
>>
>>
>> Excellent, that looks like exactly what I need.  It's a shame my
>> google-fu didn't uncover it before I posted.
>>
>> However, the "fix-device-size" sub-command is not available in
>> btrfs-tools (v4.4) from Ubuntu 16.04.  I guess my original question
>> still stands.  Is this warning safe for me to ignore?
>
> Safe to ignore.
>
> And there is another way to solve it.
>
> Shrink your fs by 4K, and the newer kernel will do the rounding down for
> you, resulting in an aligned device size.
>
> Thanks,
> Qu
>


Thanks - this fixed the issue for me (modulo my problem below..).  I
thought I should follow up with exactly what I did for anyone else
looking to fix this.

I used this command to identify which devices were not exact multiples
of 4KiB in size:

% btrfs fi show --raw /storage
Label: none  uuid: c4b33374-9006-47d7-b6f9-2136dc988f9a
Total devices 3 FS bytes used 7027891666944
devid 1 size 8001560055808 used 4691211583488 path /dev/mapper/storage1
devid 3 size 8001560059392 used 4691211583488 path /dev/mapper/storage3
devid 4 size 8001560059392 used 4690104287232 path /dev/mapper/storage4

I used a calculator to determine that devids 3 and 4 were the
offenders (they don't divide by 4096 nicely).


I then ran:
% btrfs fi resize 3:-1K /storage
% btrfs fi resize 4:-1K /storage

Which has resulted in this:
% btrfs fi show --raw /storage
Label: none  uuid: c4b33374-9006-47d7-b6f9-2136dc988f9a
Total devices 3 FS bytes used 7027891666944
devid 1 size 8001560055808 used 4691211583488 path /dev/mapper/storage1
devid 3 size 8001560055808 used 4691211583488 path /dev/mapper/storage3
devid 4 size 8001560055808 used 4690104287232 path /dev/mapper/storage4

PERFECT! :)
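
For anyone repeating this without a calculator, a quick sketch to spot
misaligned devices (field positions assume the --raw output shown above
and may differ between btrfs-progs versions):

% btrfs fi show --raw /storage | \
    awk '$1 == "devid" && $4 % 4096 { print "devid", $2, "is not 4KiB aligned" }'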


The issue I have now is that the filesystem cannot be unmounted.
"umount" reports "target is busy", but I cannot find anything relevant
with "lsof" or "fuser" (this is a very quiet home NAS).  Is this
something related to the resize?



For what it's worth, here are the sizes of the underlying devices
(output somewhat redacted):

% lsblk -b
NAME     MAJ:MIN RM          SIZE RO TYPE  MOUNTPOINT
storage3 253:2    0 8001560059392  0 crypt
storage4 253:1    0 8001560059392  0 crypt
storage1 253:0    0 8001560059392  0 crypt /storage/.snapshots

It looks like all the devices are an odd size.  One of these disks was
added after the others, presumably with a newer kernel that rounded
the btrfs dev size downwards.


Re: RAID56 - 6 parity raid

2018-05-02 Thread Austin S. Hemmelgarn

On 2018-05-02 13:25, Goffredo Baroncelli wrote:

On 05/02/2018 06:55 PM, waxhead wrote:


So again, which problem would be solved by having the parity checksummed ? To the best 
of my knowledge, none. In any case the data is checksummed so it is 
impossible to return corrupted data (modulo bug :-) ).


I am not a BTRFS dev, but this should be quite easy to answer. Unless you 
checksum the parity there is no way to verify that the data (parity) you 
use to reconstruct other data is correct.


In any case you could catch that the computed data is wrong, because the data is 
always checksummed. And in any case you must check the data against their 
checksum.

My point is that storing the checksum is a cost that you pay *every time*. 
Every time you update a part of a stripe you need to update the parity, and 
then in turn the parity checksum. It is not a problem of space occupied nor a 
computational problem. It is a problem of write amplification...

The only gain is to avoid trying to use the parity when
a) you need it (i.e. when the data is missing and/or corrupted)
and b) it is corrupted.
But the likelihood of this case is very low. And you can catch it during the 
data checksum check (which has to be performed in any case !).

So from one side you have a *cost every time* (the write amplification); on the 
other side you have a gain (cpu-time) *only in case* the parity is corrupted 
and you need it (eg. scrub or corrupted data).

IMHO the costs are much higher than the gain, and the likelihood of the gain is 
very low compared to the likelihood (=100%, or always) of the cost.
You do realize that a write is already rewriting checksums elsewhere? 
It would be pretty trivial to make sure that the checksums for every 
part of a stripe end up in the same metadata block, at which point the 
only cost is computing the checksum (because when a checksum gets 
updated, the whole block it's in gets rewritten, period, because that's 
how CoW works).


Looking at this another way (all the math below uses SI units):

Assume you have a BTRFS raid5 volume consisting of 6 8TB disks (which 
gives you 40TB of usable space).  You're storing roughly 20TB of data on 
it, using a 16kB block size, and it sees about 1GB of writes a day, with 
no partial stripe writes.  You, for reasons of argument, want to scrub 
it every week, because the data in question matters a lot to you.


With a decent CPU, lets say you can compute 1.5GB/s worth of checksums, 
and can compute the parity at a rate of 1.25G/s (the ratio here is about 
the average across the almost 50 systems I have quick access to check, 
including a number of server and workstation systems less than a year 
old, though the numbers themselves are artificially low to accentuate 
the point here).


At this rate, scrubbing by computing parity requires processing:

* Checksums for 20TB of data, at a rate of 1.5GB/s, which would take 
13333 seconds, or 222 minutes, or about 3.7 hours.
* Parity for 20TB of data, at a rate of 1.25GB/s, which would take 16000 
seconds, or 267 minutes, or roughly 4.4 hours.


So, over a week, you would be spending 8.1 hours processing data solely 
for data integrity, or roughly 4.8214% of your time.


Now assume instead that you're doing checksummed parity:

* Scrubbing data is the same, 3.7 hours.
* Scrubbing parity turns into computing checksums for 4TB of data, which 
would take 3200 seconds, or 53 minutes, or roughly 0.88 hours.
* Computing parity for the 7GB of data you write each week takes 5.6 
_SECONDS_.


So, over a week, you would spend just over 4.58 hours processing data 
solely for data integrity, or roughly 2.7262% of your time.


So, in terms of just time spent, it's almost twice as fast to use 
checksummed parity (roughly 43% faster to be more specific).
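
A quick sanity check of the arithmetic above (a sketch using bc; SI units
and the rates assumed in the scenario):

  echo '20*10^12/(1.5*10^9)/3600'  | bc -l  # ~3.70h: checksum 20TB at 1.5GB/s
  echo '20*10^12/(1.25*10^9)/3600' | bc -l  # ~4.44h: recompute parity at 1.25GB/s
  echo '4*10^12/(1.25*10^9)/3600'  | bc -l  # ~0.89h: checksum 4TB of parity instead
  echo '7*10^9/(1.25*10^9)'        | bc -l  # ~5.6s: parity for a week of writes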


So, let's look at data usage:

1GB of data translates to 62500 16kB blocks of data, which equates to 
an additional 15625 blocks for parity.  Adding parity checksums adds a 
25% overhead to checksums being written, but that actually doesn't 
translate to a huge increase in the number of _blocks_ of checksums 
written.  One 16k block can hold roughly 500 checksums, so it would take 
125 blocks worth of checksums without parity, and 157 (technically 
156.25, but you can't write a quarter block) with parity checksums. 
Thus, without parity checksums, writing 1GB of data involves writing 
78250 blocks, while doing the same with parity checksums involves 
writing 78282 blocks, a net change of only 32 blocks, or **0.0409%**.


Note that the difference in the amount of checksums written is a simple 
linear function directly proportionate to the amount of data being 
written provided that all rewrites only rewrite full stripes (because 
that's equivalent for this to just adding new data).  In other words, 
even if we were to increase the total amount of data that array was 
getting in a day, the net change from having parity checksumming would 
still stay within the range of 

Re: RAID56 - 6 parity raid

2018-05-02 Thread Gandalf Corvotempesta
On 05/02/2018 03:47 AM, Duncan wrote:
> Meanwhile, have you looked at zfs? Perhaps they have something like that?

Yes, I've looked at ZFS and I'm using it on some servers but I don't like
it too much for multiple reasons, for example:

1) it is not officially in the kernel; we have to build a module every time
with DKMS
2) it does not forgive: if you add the wrong device to a pool, you are
gone; you can't remove it without migrating all data and creating the new
pool from scratch. If, by mistake, you add a single device to a RAID-Z3,
you totally lose the whole redundancy, and so on.
3) it doesn't support expansion of RAID-Z one disk at a time. If you want to
expand a RAID-Z, you have to create another pool and then stripe over it.

I'm new to BTRFS (in fact, I'm not using it) and I've seen in the status
page that "it's almost ready".
The only real missing part is a stable, secure and properly working RAID56,
so I'm wondering why most effort isn't directed at fixing RAID56?

There are some environments where a RAID1/10 is too expensive and a RAID6
is mandatory, but with the current state of RAID56, BTRFS can't be used for
valuable data.

Also, I've seen that to fix the write hole, a dedicated disk is needed? Is
this true?
Can't I create a RAID6 with only 6 disks and no write hole, like
with ZFS?


Re: RAID56 - 6 parity raid

2018-05-02 Thread Goffredo Baroncelli
On 05/02/2018 08:17 PM, waxhead wrote:
> Goffredo Baroncelli wrote:
>> On 05/02/2018 06:55 PM, waxhead wrote:

 So again, which problem would be solved by having the parity checksummed ? To the 
 best of my knowledge, none. In any case the data is checksummed so it is 
 impossible to return corrupted data (modulo bug :-) ).

>>> I am not a BTRFS dev, but this should be quite easy to answer. Unless you 
>>> checksum the parity there is no way to verify that the data (parity) 
>>> you use to reconstruct other data is correct.
>>
>> In any case you could catch that the computed data is wrong, because the data 
>> is always checksummed. And in any case you must check the data against their 
>> checksum.
>>
> What if you lost an entire disk? or had corruption for both data AND 
> checksum? How do you plan to safely reconstruct that without checksummed 
> parity?

As a general rule, the returned data is *always* checked against its checksum. So 
in any case wrong data is never returned. Let me say it in other words: 
having the parity checksummed doesn't increase the reliability and/or the 
safety of the RAID rebuilding. I want to repeat again: even if the parity is 
corrupted, the rebuilt (wrong) data fails the check against its checksum and 
is not returned!

Back to your questions:

1) Losing 1 disk -> 1 fault
2) Losing both data and checksum -> 2 faults

RAID5 is single-fault proof. So if case #2 happens, raid5 can't protect you. 
However BTRFS is capable of detecting that the data is wrong thanks to the checksum.
In case #1, there is no problem, because for each stripe you have enough data 
to rebuild the missing part.

Because I read several times that the parity checksum would increase the 
reliability and/or the safety of the raid5/6 profile, let me explain the 
logic:

read_from_disk() {
data = read_data()
if (data != ERROR && check_checksum(data))
return data;
/* checksum mismatch or data is missing */
full_stripe = read_full_stripe()
if (raid5_profile) {
/* raid5 has only one way of rebuilding data */
data = rebuild_raid5_data(full_stripe)
if (data != ERROR && check_checksum(data)) {
rebuild_stripe(data, full_stripe)
return data;
}
/* parity and/or another data is corrupted/missed */
return ERROR
}

for_each raid6_rebuilding_strategies(full_stripe) {
/* 
 * raid6 might have more than one way of rebuilding data 
 * depending by the degree of failure
 */
data = rebuild_raid6_data(full_stripe)
if (data != ERROR && check_checksum(data)) {
rebuild_stripe(data, full_stripe)
/* data is good, return it */
return data;
}
}
return ERROR
}

In case of raid5, there is only one way of rebuilding the data. In case of raid6 
and 1 fault, there are several ways of rebuilding the data (however, in case of 
two failures, there is only one way). So more possibilities have to be tested 
when rebuilding the data.
If the parity is corrupted, the rebuilt data is corrupted too, and it fails the 
check against its checksum.


> 
>> My point is that storing the checksum is a cost that you pay *every time*. 
>> Every time you update a part of a stripe you need to update the parity, and 
>> then in turn the parity checksum. It is not a problem of space occupied nor 
>> a computational problem. It is a problem of write amplification...
> How much of a problem is this? No benchmarks have been run since the feature 
> is not yet there, I suppose.

It is simple: for each stripe touched you need to update the parity(1); then 
you need to update the parity(1) checksum (which in turn requires an update 
of the parity(2) of the stripe where the parity(1) checksum is stored, which 
in turn requires updating the parity(2) checksum... and so on)

> 
>>
>> The only gain is to avoid trying to use the parity when
>> a) you need it (i.e. when the data is missing and/or corrupted)
> I'm not sure I can make out your argument here, but with RAID5/6 you don't 
> have another copy to restore from. You *have* to use the parity to 
> reconstruct data and it is a good thing if this data is trusted.
I never say the opposite

> 
>> and b) it is corrupted.
>> But the likelihood of this case is very low. And you can catch it during the 
>> data checksum check (which has to be performed in any case !).
>>
>> So from one side you have a *cost every time* (the write amplification); on the 
>> other side you have a gain (cpu-time) *only in case* the parity is 
>> corrupted and you need it (eg. scrub or corrupted data).
>>
>> IMHO the costs are much higher than the gain, and the likelihood of the gain is 
>> very low compared to 

Re: RAID56 - 6 parity raid

2018-05-02 Thread waxhead

Goffredo Baroncelli wrote:

On 05/02/2018 06:55 PM, waxhead wrote:


So again, which problem would be solved by having the parity checksummed ? To the best 
of my knowledge, none. In any case the data is checksummed so it is 
impossible to return corrupted data (modulo bug :-) ).


I am not a BTRFS dev, but this should be quite easy to answer. Unless you 
checksum the parity there is no way to verify that the data (parity) you 
use to reconstruct other data is correct.


In any case you could catch that the computed data is wrong, because the data is 
always checksummed. And in any case you must check the data against their 
checksum.

What if you lost an entire disk? or had corruption for both data AND 
checksum? How do you plan to safely reconstruct that without checksummed 
parity?



My point is that storing the checksum is a cost that you pay *every time*. 
Every time you update a part of a stripe you need to update the parity, and 
then in turn the parity checksum. It is not a problem of space occupied nor a 
computational problem. It is a problem of write amplification...
How much of a problem is this? No benchmarks have been run since the 
feature is not yet there, I suppose.




The only gain is to avoid trying to use the parity when
a) you need it (i.e. when the data is missing and/or corrupted)
I'm not sure I can make out your argument here, but with RAID5/6 you 
don't have another copy to restore from. You *have* to use the parity to 
reconstruct data and it is a good thing if this data is trusted.



and b) it is corrupted.
But the likelihood of this case is very low. And you can catch it during the 
data checksum check (which has to be performed in any case !).

So from one side you have a *cost every time* (the write amplification); on the 
other side you have a gain (cpu-time) *only in case* the parity is corrupted 
and you need it (eg. scrub or corrupted data).

IMHO the costs are much higher than the gain, and the likelihood of the gain is 
very low compared to the likelihood (=100%, or always) of the cost.

Then run benchmarks and consider making parity checksums optional 
(but pretty please dipped in syrup with sugar on top - keep it on by 
default).




Re: RAID56 - 6 parity raid

2018-05-02 Thread Goffredo Baroncelli
On 05/02/2018 06:55 PM, waxhead wrote:
>>
>> So again, which problem would be solved by having the parity checksummed ? To the 
>> best of my knowledge, none. In any case the data is checksummed so it is 
>> impossible to return corrupted data (modulo bug :-) ).
>>
> I am not a BTRFS dev, but this should be quite easy to answer. Unless you 
> checksum the parity there is no way to verify that the data (parity) you 
> use to reconstruct other data is correct.

In any case you could catch that the computed data is wrong, because the data is 
always checksummed. And in any case you must check the data against their 
checksum.

My point is that storing the checksum is a cost that you pay *every time*. 
Every time you update a part of a stripe you need to update the parity, and 
then in turn the parity checksum. It is not a problem of space occupied nor a 
computational problem. It is a problem of write amplification...

The only gain is to avoid trying to use the parity when 
a) you need it (i.e. when the data is missing and/or corrupted)
and b) it is corrupted. 
But the likelihood of this case is very low. And you can catch it during the 
data checksum check (which has to be performed in any case !).

So from one side you have a *cost every time* (the write amplification); on the 
other side you have a gain (cpu-time) *only in case* the parity is corrupted 
and you need it (eg. scrub or corrupted data).

IMHO the costs are much higher than the gain, and the likelihood of the gain is 
very low compared to the likelihood (=100%, or always) of the cost.


BR
G.Baroncelli


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli 
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


Re: RAID56 - 6 parity raid

2018-05-02 Thread Austin S. Hemmelgarn

On 2018-05-02 12:55, waxhead wrote:

Goffredo Baroncelli wrote:

Hi
On 05/02/2018 03:47 AM, Duncan wrote:

Gandalf Corvotempesta posted on Tue, 01 May 2018 21:57:59 +0000 as
excerpted:


Hi to all, I've found some patches from Andrea Mazzoleni that add
support for up to 6-parity raid.
Why weren't these merged?
With modern disk sizes, having something greater than 2 parity would be
great.

1) [...] the parity isn't checksummed, 


Why is the fact that the parity is not checksummed a problem?
I read several times that this is a problem. However, each time the 
thread reached the conclusion that... it is not a problem.


So again, which problem would be solved by having the parity checksummed ? To 
the best of my knowledge, none. In any case the data is checksummed 
so it is impossible to return corrupted data (modulo bug :-) ).


I am not a BTRFS dev, but this should be quite easy to answer. Unless 
you checksum the parity there is no way to verify that the data 
(parity) you use to reconstruct other data is correct.
While this is the biggest benefit (and it's a _huge_ one, because it 
means you don't have to waste time doing the parity reconstruction if 
you know the result won't be right), there's also a rather nice benefit 
for scrubbing the array, namely that you don't have to recompute parity 
to check if it's right or not (and thus can avoid wasting time 
recomputing it for every stripe in the common case of almost every 
stripe being correct).


On the other side, having the parity checksummed would increase both the code 
complexity and the write amplification, because every time a part of 
the stripe is touched not only the parity has to be updated, but also 
its checksum..
Which is a good thing. BTRFS's main selling point is that you can feel 
pretty confident that whatever you put in is exactly what you get out.



Re: RAID56 - 6 parity raid

2018-05-02 Thread waxhead

Goffredo Baroncelli wrote:

Hi
On 05/02/2018 03:47 AM, Duncan wrote:

Gandalf Corvotempesta posted on Tue, 01 May 2018 21:57:59 + as
excerpted:


Hi to all, I've found some patches from Andrea Mazzoleni that add
support for up to 6-parity raid.
Why weren't these merged?
With modern disk sizes, having something greater than 2 parity would be
great.

1) [...] the parity isn't checksummed, 


Why is the fact that the parity is not checksummed a problem?
I read several times that this is a problem. However, each time the thread 
reached the conclusion that... it is not a problem.

So again, which problem would be solved by having the parity checksummed ? To the best 
of my knowledge, none. In any case the data is checksummed so it is 
impossible to return corrupted data (modulo bug :-) ).

I am not a BTRFS dev, but this should be quite easy to answer. Unless 
you checksum the parity there is no way to verify that the data 
(parity) you use to reconstruct other data is correct.



On the other side, having the parity checksummed would increase both the code complexity 
and the write amplification, because every time a part of the stripe is touched 
not only the parity has to be updated, but also its checksum..
Which is a good thing. BTRFS's main selling point is that you can feel 
pretty confident that whatever you put in is exactly what you get out.



Re: RAID56 - 6 parity raid

2018-05-02 Thread Goffredo Baroncelli
Hi
On 05/02/2018 03:47 AM, Duncan wrote:
> Gandalf Corvotempesta posted on Tue, 01 May 2018 21:57:59 +0000 as
> excerpted:
> 
>> Hi to all, I've found some patches from Andrea Mazzoleni that add
>> support for up to 6-parity raid.
>> Why weren't these merged?
>> With modern disk sizes, having something greater than 2 parity would be
>> great.
> 1) [...] the parity isn't checksummed, 

Why is the fact that the parity is not checksummed a problem?
I read several times that this is a problem. However, each time the thread 
reached the conclusion that... it is not a problem.

So again, which problem would be solved by having the parity checksummed ? To the best 
of my knowledge, none. In any case the data is checksummed so it is 
impossible to return corrupted data (modulo bug :-) ).

On the other side, having the parity checksummed would increase both the code complexity 
and the write amplification, because every time a part of the stripe is touched 
not only the parity has to be updated, but also its checksum..


BR
G.Baroncelli

-- 
gpg @keyserver.linux.it: Goffredo Baroncelli 
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5


Re: [PATCH 1/3] btrfs: qgroups, fix rescan worker running races

2018-05-02 Thread Jeff Mahoney
On 5/2/18 9:15 AM, David Sterba wrote:
> On Wed, May 02, 2018 at 12:29:28PM +0200, David Sterba wrote:
>> On Thu, Apr 26, 2018 at 03:23:49PM -0400, je...@suse.com wrote:
>>> From: Jeff Mahoney 
>>> +static void queue_rescan_worker(struct btrfs_fs_info *fs_info)
>>> +{
>>> +   mutex_lock(&fs_info->qgroup_rescan_lock);
>>> +   if (btrfs_fs_closing(fs_info)) {
>>> +   mutex_unlock(&fs_info->qgroup_rescan_lock);
>>> +   return;
>>> +   }
>>> +   if (WARN_ON(fs_info->qgroup_rescan_running)) {
>>
>> The warning is quite noisy, I see it after tests btrfs/017, 022, 124,
>> 139, 153. Is it necessary for non-debugging builds?
>>
>> The tested branch was full for-next so it could be your patchset
>> interacting with other fixes, but the warning noise level question still
>> stands.
> 
> So it must be something in the rest of the misc-next or for-next patches;
> the current for-4.17 queue does not show the warning at all, and the patch
> is OK for merge.
>
You might have something that makes it noisier, but it looks like it
should be possible to hit on 4.16.  The warning is supposed to detect
and complain about multiple rescan threads starting.  What I think it's
actually doing here is (correctly) identifying a different race: at the
end of btrfs_qgroup_rescan_worker, we clear the rescan status flag, drop
the lock, commit the status item transaction, and only then update
->qgroup_rescan_running.  If a rescan is requested before the lock is
reacquired, we'll try to start it up and then hit that warning.
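
Roughly, as a simplified sketch of that tail (based on the description
above, not the verbatim code):

	mutex_lock(&fs_info->qgroup_rescan_lock);
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	btrfs_commit_transaction(trans);	/* commit the status item */

	/*
	 * Window: the flag already says "no rescan running", so a new
	 * rescan can be queued here while ->qgroup_rescan_running is
	 * still true, and queue_rescan_worker() trips its WARN_ON().
	 */

	mutex_lock(&fs_info->qgroup_rescan_lock);
	fs_info->qgroup_rescan_running = false;
	mutex_unlock(&fs_info->qgroup_rescan_lock);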

So, the warning is doing its job.  Please hold off on merging this patch.

IMO the root cause is overloading fs_info->qgroup_flags to both mirror
the on-disk item and control runtime behavior.  I've been meaning to fix
that for a while, so I'll do that now.
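
Purely illustrative of that direction (hypothetical layout, not the
actual patch): qgroup_flags would only mirror the on-disk status item,
with runtime state tracked separately under qgroup_rescan_lock:

	struct btrfs_fs_info {
		/* ... */
		u64 qgroup_flags;	/* mirrors the on-disk status item only */
		/* runtime state, protected by qgroup_rescan_lock */
		struct mutex qgroup_rescan_lock;
		bool qgroup_rescan_running;
		bool qgroup_rescan_queued;	/* hypothetical */
		/* ... */
	};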

-Jeff

-- 
Jeff Mahoney
SUSE Labs


Re: [PATCH 1/3] btrfs: qgroups, fix rescan worker running races

2018-05-02 Thread David Sterba
On Wed, May 02, 2018 at 12:29:28PM +0200, David Sterba wrote:
> On Thu, Apr 26, 2018 at 03:23:49PM -0400, je...@suse.com wrote:
> > From: Jeff Mahoney 
> > +static void queue_rescan_worker(struct btrfs_fs_info *fs_info)
> > +{
> > +   mutex_lock(&fs_info->qgroup_rescan_lock);
> > +   if (btrfs_fs_closing(fs_info)) {
> > +   mutex_unlock(&fs_info->qgroup_rescan_lock);
> > +   return;
> > +   }
> > +   if (WARN_ON(fs_info->qgroup_rescan_running)) {
> 
> The warning is quite noisy, I see it after tests btrfs/017, 022, 124,
> 139, 153. Is it necessary for non-debugging builds?
> 
> The tested branch was full for-next so it could be your patchset
> interacting with other fixes, but the warning noise level question still
> stands.

So it must be something in the rest of the misc-next or for-next patches;
the current for-4.17 queue does not show the warning at all, and the patch
is OK for merge.


Re: [PATCH] btrfs-progs: Use exclude_super_stripes instead of account_super_bytes

2018-05-02 Thread Qu Wenruo


On 2018-05-02 20:49, Nikolay Borisov wrote:
> 
> 
> On  2.05.2018 15:29, Qu Wenruo wrote:
>>
>>
>> On 2018-05-02 19:52, Nikolay Borisov wrote:
>>> Originally commit 2681e00f00fe ("btrfs-progs: check for matching
>>> free space in cache") added the account_super_bytes function to prevent
>>> false negatives when running btrfs check. It turns out this function is
>>> really a copy of exclude_super_stripes, minus the calls to
>>> add_excluded_extent. Later commit e4797df6a9fa ("btrfs-progs: check
>>> the free space tree in btrfsck") introduced a proper version of
>>> exclude_super_stripes. Instead of duplicating the function, just remove
>>> account_super_bytes and use exclude_super_stripes.
>>> This also has the benefit of bringing the userspace code a bit closer
>>> to the kernel counterpart.
>>>
>>> Signed-off-by: Nikolay Borisov 
>>
>> Since these two functions are the same, it's completely fine to remove one.
> 
> As I mentioned, they are not "completely" the same. The difference is
> that exclude_super_stripes will mark the stripes as
> EXTENT_UPTODATE in fs_info->pinned_extents via add_excluded_extent.
> Dunno if this has any repercussions on functionality. I've run the progs
> test suite and didn't observe any regressions. Also, looking at the usage
> of fs_info->pinned_extents, I didn't see anything conspicuous.

Yeah, still the same conclusion here.
All the pinned_extents usage I found is either really for pinned extents of
the current transaction (EXTENT_DIRTY) or this excluded usage (EXTENT_UPTODATE).
And unlike EXTENT_DIRTY, EXTENT_UPTODATE won't be removed after
transaction commit, so if I didn't miss anything important, it should be OK.

Just adding Su for this, as he worked on pinning down tree blocks for
lowmem mode extent tree re-init; he may be more experienced in this field.

That said, such abuse of EXTENT_* bits in different trees at least
needs an extra comment later.

Thanks,
Qu

> 
>>
>> Reviewed-by: Qu Wenruo 
>>
>> Thanks,
>> Qu
>>
>>> ---
>>>  extent-tree.c | 52 ++--
>>>  1 file changed, 2 insertions(+), 50 deletions(-)
>>>
>>> diff --git a/extent-tree.c b/extent-tree.c
>>> index ea205ccf4c30..391f0a784710 100644
>>> --- a/extent-tree.c
>>> +++ b/extent-tree.c
>>> @@ -3164,54 +3164,6 @@ static int find_first_block_group(struct btrfs_root 
>>> *root,
>>> return ret;
>>>  }
>>>  
>>> -static void account_super_bytes(struct btrfs_fs_info *fs_info,
>>> -   struct btrfs_block_group_cache *cache)
>>> -{
>>> -   u64 bytenr;
>>> -   u64 *logical;
>>> -   int stripe_len;
>>> -   int i, nr, ret;
>>> -
>>> -   if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
>>> -   stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
>>> -   cache->bytes_super += stripe_len;
>>> -   }
>>> -
>>> -   for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
>>> -   bytenr = btrfs_sb_offset(i);
>>> -   ret = btrfs_rmap_block(fs_info,
>>> -  cache->key.objectid, bytenr,
>>> -  0, &logical, &nr, &stripe_len);
>>> -   if (ret)
>>> -   return;
>>> -
>>> -   while (nr--) {
>>> -   u64 start, len;
>>> -
>>> -   if (logical[nr] > cache->key.objectid +
>>> -   cache->key.offset)
>>> -   continue;
>>> -
>>> -   if (logical[nr] + stripe_len <= cache->key.objectid)
>>> -   continue;
>>> -
>>> -   start = logical[nr];
>>> -   if (start < cache->key.objectid) {
>>> -   start = cache->key.objectid;
>>> -   len = (logical[nr] + stripe_len) - start;
>>> -   } else {
>>> -   len = min_t(u64, stripe_len,
>>> -   cache->key.objectid +
>>> -   cache->key.offset - start);
>>> -   }
>>> -
>>> -   cache->bytes_super += len;
>>> -   }
>>> -
>>> -   kfree(logical);
>>> -   }
>>> -}
>>> -
>>>  int btrfs_read_block_groups(struct btrfs_root *root)
>>>  {
>>> struct btrfs_path *path;
>>> @@ -3287,7 +3239,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
>>> if (btrfs_chunk_readonly(info, cache->key.objectid))
>>> cache->ro = 1;
>>>  
>>> -   account_super_bytes(info, cache);
>>> +   exclude_super_stripes(root, cache);
>>>  
>>> ret = update_space_info(info, cache->flags, found_key.offset,
>>> btrfs_block_group_used(&cache->item),
>>> @@ -3331,7 +3283,7 @@ btrfs_add_block_group(struct btrfs_fs_info *fs_info, 
>>> u64 bytes_used, u64 type,
>>> cache->flags = type;
>>> btrfs_set_block_group_flags(&cache->item, type);
>>>  
>>> -   account_super_bytes(fs_info, 

Re: [PATCH] btrfs-progs: Use exclude_super_stripes instead of account_super_bytes

2018-05-02 Thread Nikolay Borisov


On  2.05.2018 15:29, Qu Wenruo wrote:
> 
> 
> On 2018-05-02 19:52, Nikolay Borisov wrote:
>> Originally commit 2681e00f00fe ("btrfs-progs: check for matching
>> free space in cache") added the account_super_bytes function to prevent
>> false negatives when running btrfs check. It turns out this function is
>> really a copy of exclude_super_stripes, minus the calls to
>> add_excluded_extent. Later commit e4797df6a9fa ("btrfs-progs: check
>> the free space tree in btrfsck") introduced a proper version of
>> exclude_super_stripes. Instead of duplicating the function, just remove
>> account_super_bytes and use exclude_super_stripes.
>> This also has the benefit of bringing the userspace code a bit closer
>> to the kernel counterpart.
>>
>> Signed-off-by: Nikolay Borisov 
> 
> Since these two functions are the same, it's completely fine to remove one.

As I mentioned, they are not "completely" the same. The difference is
that exclude_super_stripes will mark the stripes as
EXTENT_UPTODATE in fs_info->pinned_extents via add_excluded_extent.
Dunno if this has any repercussions on functionality. I've run the progs
test suite and didn't observe any regressions. Also, looking at the usage
of fs_info->pinned_extents, I didn't see anything conspicuous.

> 
> Reviewed-by: Qu Wenruo 
> 
> Thanks,
> Qu
> 
>> ---
>>  extent-tree.c | 52 ++--
>>  1 file changed, 2 insertions(+), 50 deletions(-)
>>
>> diff --git a/extent-tree.c b/extent-tree.c
>> index ea205ccf4c30..391f0a784710 100644
>> --- a/extent-tree.c
>> +++ b/extent-tree.c
>> @@ -3164,54 +3164,6 @@ static int find_first_block_group(struct btrfs_root 
>> *root,
>>  return ret;
>>  }
>>  
>> -static void account_super_bytes(struct btrfs_fs_info *fs_info,
>> -struct btrfs_block_group_cache *cache)
>> -{
>> -u64 bytenr;
>> -u64 *logical;
>> -int stripe_len;
>> -int i, nr, ret;
>> -
>> -if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
>> -stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
>> -cache->bytes_super += stripe_len;
>> -}
>> -
>> -for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
>> -bytenr = btrfs_sb_offset(i);
>> -ret = btrfs_rmap_block(fs_info,
>> -   cache->key.objectid, bytenr,
>> -   0, &logical, &nr, &stripe_len);
>> -if (ret)
>> -return;
>> -
>> -while (nr--) {
>> -u64 start, len;
>> -
>> -if (logical[nr] > cache->key.objectid +
>> -cache->key.offset)
>> -continue;
>> -
>> -if (logical[nr] + stripe_len <= cache->key.objectid)
>> -continue;
>> -
>> -start = logical[nr];
>> -if (start < cache->key.objectid) {
>> -start = cache->key.objectid;
>> -len = (logical[nr] + stripe_len) - start;
>> -} else {
>> -len = min_t(u64, stripe_len,
>> -cache->key.objectid +
>> -cache->key.offset - start);
>> -}
>> -
>> -cache->bytes_super += len;
>> -}
>> -
>> -kfree(logical);
>> -}
>> -}
>> -
>>  int btrfs_read_block_groups(struct btrfs_root *root)
>>  {
>>  struct btrfs_path *path;
>> @@ -3287,7 +3239,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
>>  if (btrfs_chunk_readonly(info, cache->key.objectid))
>>  cache->ro = 1;
>>  
>> -account_super_bytes(info, cache);
>> +exclude_super_stripes(root, cache);
>>  
>>  ret = update_space_info(info, cache->flags, found_key.offset,
>>  btrfs_block_group_used(&cache->item),
>> @@ -3331,7 +3283,7 @@ btrfs_add_block_group(struct btrfs_fs_info *fs_info, 
>> u64 bytes_used, u64 type,
>>  cache->flags = type;
>>  btrfs_set_block_group_flags(&cache->item, type);
>>  
>> -account_super_bytes(fs_info, cache);
>> +exclude_super_stripes(fs_info->extent_root, cache);
>>  ret = update_space_info(fs_info, cache->flags, size, bytes_used,
>>  &cache->space_info);
>>  BUG_ON(ret);
>>
> 


Re: [PATCH] btrfs-progs: Use exclude_super_stripes instead of account_super_bytes

2018-05-02 Thread Qu Wenruo


On 2018-05-02 19:52, Nikolay Borisov wrote:
> Originally commit 2681e00f00fe ("btrfs-progs: check for matching
> free space in cache") added the account_super_bytes function to prevent
> false negatives when running btrfs check. It turns out this function is
> really a copy of exclude_super_stripes, minus the calls to
> add_excluded_extent. Later commit e4797df6a9fa ("btrfs-progs: check
> the free space tree in btrfsck") introduced a proper version of
> exclude_super_stripes. Instead of duplicating the function, just remove
> account_super_bytes and use exclude_super_stripes.
> This also has the benefit of bringing the userspace code a bit closer
> to the kernel counterpart.
> 
> Signed-off-by: Nikolay Borisov 

Since these two functions are the same, it's completely fine to remove one.

Reviewed-by: Qu Wenruo 

Thanks,
Qu

> ---
>  extent-tree.c | 52 ++--
>  1 file changed, 2 insertions(+), 50 deletions(-)
> 
> diff --git a/extent-tree.c b/extent-tree.c
> index ea205ccf4c30..391f0a784710 100644
> --- a/extent-tree.c
> +++ b/extent-tree.c
> @@ -3164,54 +3164,6 @@ static int find_first_block_group(struct btrfs_root 
> *root,
>   return ret;
>  }
>  
> -static void account_super_bytes(struct btrfs_fs_info *fs_info,
> - struct btrfs_block_group_cache *cache)
> -{
> - u64 bytenr;
> - u64 *logical;
> - int stripe_len;
> - int i, nr, ret;
> -
> - if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
> - stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
> - cache->bytes_super += stripe_len;
> - }
> -
> - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
> - bytenr = btrfs_sb_offset(i);
> - ret = btrfs_rmap_block(fs_info,
> -cache->key.objectid, bytenr,
> -0, &logical, &nr, &stripe_len);
> - if (ret)
> - return;
> -
> - while (nr--) {
> - u64 start, len;
> -
> - if (logical[nr] > cache->key.objectid +
> - cache->key.offset)
> - continue;
> -
> - if (logical[nr] + stripe_len <= cache->key.objectid)
> - continue;
> -
> - start = logical[nr];
> - if (start < cache->key.objectid) {
> - start = cache->key.objectid;
> - len = (logical[nr] + stripe_len) - start;
> - } else {
> - len = min_t(u64, stripe_len,
> - cache->key.objectid +
> - cache->key.offset - start);
> - }
> -
> - cache->bytes_super += len;
> - }
> -
> - kfree(logical);
> - }
> -}
> -
>  int btrfs_read_block_groups(struct btrfs_root *root)
>  {
>   struct btrfs_path *path;
> @@ -3287,7 +3239,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
>   if (btrfs_chunk_readonly(info, cache->key.objectid))
>   cache->ro = 1;
>  
> - account_super_bytes(info, cache);
> + exclude_super_stripes(root, cache);
>  
>   ret = update_space_info(info, cache->flags, found_key.offset,
> btrfs_block_group_used(&cache->item),
> @@ -3331,7 +3283,7 @@ btrfs_add_block_group(struct btrfs_fs_info *fs_info, 
> u64 bytes_used, u64 type,
>   cache->flags = type;
> btrfs_set_block_group_flags(&cache->item, type);
>  
> - account_super_bytes(fs_info, cache);
> + exclude_super_stripes(fs_info->extent_root, cache);
>   ret = update_space_info(fs_info, cache->flags, size, bytes_used,
> &cache->space_info);
>   BUG_ON(ret);
> 


[PATCH v3 2/2] btrfs: Factor out write portion of btrfs_get_blocks_direct

2018-05-02 Thread Nikolay Borisov
Now that the read side is extracted into its own function, do the same
for the write side. This leaves btrfs_get_blocks_direct with the sole
purpose of handling the common locking required. Also flip the
condition in btrfs_get_blocks_direct so that the write case comes first
and we check for if (create) rather than if (!create). This is purely
subjective but I believe it makes the code read a bit more "linearly".
No functional changes.

Signed-off-by: Nikolay Borisov 
---

V3:
 * Nothing's changed. 

v2:
 * Removed __ prefix from function name.
 * Use a ret variable in btrfs_get_blocks_direct_write and a goto label to
   implement the return strategy.
 fs/btrfs/inode.c | 207 +--
 1 file changed, 108 insertions(+), 99 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ce1bcfc4255..c49bfda61a84 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7795,6 +7795,104 @@ static int btrfs_get_blocks_direct_read(struct 
extent_map *em,
return 0;
 }
 
+static int btrfs_get_blocks_direct_write(struct extent_map **map,
+struct buffer_head *bh_result,
+struct inode *inode,
+struct btrfs_dio_data *dio_data,
+u64 start, u64 len)
+{
+   struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+   struct extent_map *em = *map;
+   int ret = 0;
+
+   /*
+* We don't allocate a new extent in the following cases
+*
+* 1) The inode is marked as NODATACOW. In this case we'll just use the
+* existing extent.
+* 2) The extent is marked as PREALLOC. We're good to go here and can
+* just use the extent.
+*
+*/
+   if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+   ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+em->block_start != EXTENT_MAP_HOLE)) {
+   int type;
+   u64 block_start, orig_start, orig_block_len, ram_bytes;
+
+   if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+   type = BTRFS_ORDERED_PREALLOC;
+   else
+   type = BTRFS_ORDERED_NOCOW;
+   len = min(len, em->len - (start - em->start));
+   block_start = em->block_start + (start - em->start);
+
+   if (can_nocow_extent(inode, start, &len, &orig_start,
+&orig_block_len, &ram_bytes) == 1 &&
+   btrfs_inc_nocow_writers(fs_info, block_start)) {
+   struct extent_map *em2;
+
+   em2 = btrfs_create_dio_extent(inode, start, len,
+ orig_start, block_start,
+ len, orig_block_len,
+ ram_bytes, type);
+   btrfs_dec_nocow_writers(fs_info, block_start);
+   if (type == BTRFS_ORDERED_PREALLOC) {
+   free_extent_map(em);
+   *map = em = em2;
+   }
+
+   if (em2 && IS_ERR(em2)) {
+   ret = PTR_ERR(em2);
+   goto out;
+   }
+   /*
+* For inode marked NODATACOW or extent marked PREALLOC,
+* use the existing or preallocated extent, so does not
+* need to adjust btrfs_space_info's bytes_may_use.
+*/
+   btrfs_free_reserved_data_space_noquota(inode, start,
+  len);
+   goto skip_cow;
+   }
+   }
+
+   /* this will cow the extent */
+   len = bh_result->b_size;
+   free_extent_map(em);
+   *map = em = btrfs_new_extent_direct(inode, start, len);
+   if (IS_ERR(em)) {
+   ret = PTR_ERR(em);
+   goto out;
+   }
+
+   len = min(len, em->len - (start - em->start));
+
+skip_cow:
+   bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+   inode->i_blkbits;
+   bh_result->b_size = len;
+   bh_result->b_bdev = em->bdev;
+   set_buffer_mapped(bh_result);
+
+   if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+   set_buffer_new(bh_result);
+
+   /*
+* Need to update the i_size under the extent lock so buffered
+* readers will get the updated i_size when we unlock.
+*/
+   if (!dio_data->overwrite && start + len > i_size_read(inode))
+   i_size_write(inode, start + len);
+
+   WARN_ON(dio_data->reserve < len);
+   dio_data->reserve -= len;
+   dio_data->unsubmitted_oe_range_end = start + len;
+   

[PATCH v3 1/2] btrfs: Factor out read portion of btrfs_get_blocks_direct

2018-05-02 Thread Nikolay Borisov
Currently this function handles both the READ and WRITE dio cases. This
is facilitated by a bunch of 'if' statements, a goto short-circuit
statement and a very perverse aliasing of the "!create" (READ) case
by setting lockstart = lockend and checking for lockstart < lockend to
detect the write. Let's simplify this mess by extracting the
READ-only code into a separate btrfs_get_blocks_direct_read function.
This is only the first step; the next one will be to factor out the
write side as well. The end goal is to have the common locking/
unlocking code in btrfs_get_blocks_direct, which will then call either
the read or write subvariant. No functional changes.
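
For illustration, the end state across both patches would have roughly
this shape (a sketch of the stated goal, not code from this series):

	static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
					   struct buffer_head *bh_result, int create)
	{
		/* common: look up the extent_map and lock the range */
		if (create)
			ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
							    dio_data, start, len);
		else
			ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
							   start, len);
		/* common: unlock whatever part of the range is not in use */
		return ret;
	}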

Signed-off-by: Nikolay Borisov 
---

V3:
 * Add code to unlock unused portions of the file (if any) in the read case.
 * Improve the comment about the locking in the !create case as well, making
   it more explicit.

v2:
 * Removed __ prefix from function name
 fs/btrfs/inode.c | 56 +++-
 1 file changed, 43 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d42cd8e6e8a6..1ce1bcfc4255 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7774,6 +7774,27 @@ static struct extent_map *create_io_em(struct inode 
*inode, u64 start, u64 len,
return em;
 }
 
+
+static int btrfs_get_blocks_direct_read(struct extent_map *em,
+   struct buffer_head *bh_result,
+   struct inode *inode,
+   u64 start, u64 len)
+{
+   if (em->block_start == EXTENT_MAP_HOLE ||
+   test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+   return -ENOENT;
+
+   len = min(len, em->len - (start - em->start));
+
+   bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+   inode->i_blkbits;
+   bh_result->b_size = len;
+   bh_result->b_bdev = em->bdev;
+   set_buffer_mapped(bh_result);
+
+   return 0;
+}
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
   struct buffer_head *bh_result, int create)
 {
@@ -7842,11 +7863,29 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
sector_t iblock,
goto unlock_err;
}
 
-   /* Just a good old fashioned hole, return */
-   if (!create && (em->block_start == EXTENT_MAP_HOLE ||
-   test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+   if (!create) {
+   ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
+  start, len);
+   /* Can be negative only if we read from a hole */
+   if (ret < 0) {
+   ret = 0;
+   free_extent_map(em);
+   goto unlock_err;
+   }
+   /*
+* We need to unlock only the end area that we aren't using.
+* The rest is going to be unlocked by the endio routine.
+*/
+   lockstart = start + bh_result->b_size;
+   if (lockstart < lockend) {
+   clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+lockend, unlock_bits, 1, 0,
+&cached_state);
+   } else {
+   free_extent_state(cached_state);
+   }
free_extent_map(em);
-   goto unlock_err;
+   return 0;
}
 
/*
@@ -7858,12 +7897,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
sector_t iblock,
 * just use the extent.
 *
 */
-   if (!create) {
-   len = min(len, em->len - (start - em->start));
-   lockstart = start + len;
-   goto unlock;
-   }
-
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
 em->block_start != EXTENT_MAP_HOLE)) {
@@ -7950,10 +7983,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, 
sector_t iblock,
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
 lockend, unlock_bits, 1, 0,
&cached_state);
-   } else {
-   free_extent_state(cached_state);
}
-
free_extent_map(em);
 
return 0;
-- 
2.7.4



[PATCH] btrfs-progs: Use exclude_super_stripes instead of account_super_bytes

2018-05-02 Thread Nikolay Borisov
Originally commit 2681e00f00fe ("btrfs-progs: check for matching
free space in cache") added the account_super_bytes function to prevent
false negatives when running btrfs check. It turns out this function is
really a copy of exclude_super_stripes, minus the calls to
add_excluded_extent. Later commit e4797df6a9fa ("btrfs-progs: check
the free space tree in btrfsck") introduced a proper version of
exclude_super_stripes. Instead of duplicating the function, just remove
account_super_bytes and use exclude_super_stripes.
This also has the benefit of bringing the userspace code a bit closer
to the kernel counterpart.

Signed-off-by: Nikolay Borisov 
---
 extent-tree.c | 52 ++--
 1 file changed, 2 insertions(+), 50 deletions(-)

diff --git a/extent-tree.c b/extent-tree.c
index ea205ccf4c30..391f0a784710 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -3164,54 +3164,6 @@ static int find_first_block_group(struct btrfs_root 
*root,
return ret;
 }
 
-static void account_super_bytes(struct btrfs_fs_info *fs_info,
-   struct btrfs_block_group_cache *cache)
-{
-   u64 bytenr;
-   u64 *logical;
-   int stripe_len;
-   int i, nr, ret;
-
-   if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
-   stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
-   cache->bytes_super += stripe_len;
-   }
-
-   for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-   bytenr = btrfs_sb_offset(i);
-   ret = btrfs_rmap_block(fs_info,
-  cache->key.objectid, bytenr,
-  0, &logical, &nr, &stripe_len);
-   if (ret)
-   return;
-
-   while (nr--) {
-   u64 start, len;
-
-   if (logical[nr] > cache->key.objectid +
-   cache->key.offset)
-   continue;
-
-   if (logical[nr] + stripe_len <= cache->key.objectid)
-   continue;
-
-   start = logical[nr];
-   if (start < cache->key.objectid) {
-   start = cache->key.objectid;
-   len = (logical[nr] + stripe_len) - start;
-   } else {
-   len = min_t(u64, stripe_len,
-   cache->key.objectid +
-   cache->key.offset - start);
-   }
-
-   cache->bytes_super += len;
-   }
-
-   kfree(logical);
-   }
-}
-
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
struct btrfs_path *path;
@@ -3287,7 +3239,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
if (btrfs_chunk_readonly(info, cache->key.objectid))
cache->ro = 1;
 
-   account_super_bytes(info, cache);
+   exclude_super_stripes(root, cache);
 
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
@@ -3331,7 +3283,7 @@ btrfs_add_block_group(struct btrfs_fs_info *fs_info, u64 
bytes_used, u64 type,
cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
 
-   account_super_bytes(fs_info, cache);
+   exclude_super_stripes(fs_info->extent_root, cache);
ret = update_space_info(fs_info, cache->flags, size, bytes_used,
&cache->space_info);
BUG_ON(ret);
-- 
2.7.4



Re: [PATCH 1/3] btrfs: qgroups, fix rescan worker running races

2018-05-02 Thread David Sterba
On Thu, Apr 26, 2018 at 03:23:49PM -0400, je...@suse.com wrote:
> From: Jeff Mahoney 
> +static void queue_rescan_worker(struct btrfs_fs_info *fs_info)
> +{
> + mutex_lock(&fs_info->qgroup_rescan_lock);
> + if (btrfs_fs_closing(fs_info)) {
> + mutex_unlock(&fs_info->qgroup_rescan_lock);
> + return;
> + }
> + if (WARN_ON(fs_info->qgroup_rescan_running)) {

The warning is quite noisy, I see it after tests btrfs/017, 022, 124,
139, 153. Is it necessary for non-debugging builds?

The tested branch was full for-next so it could be your patchset
interacting with other fixes, but the warning noise level question still
stands.

> + btrfs_warn(fs_info, "rescan worker already queued");
> + mutex_unlock(&fs_info->qgroup_rescan_lock);
> + return;
> + }
> +
> + /*
> +  * Being queued is enough for btrfs_qgroup_wait_for_completion
> +  * to need to wait.
> +  */
> + fs_info->qgroup_rescan_running = true;
> + mutex_unlock(&fs_info->qgroup_rescan_lock);
> +
> + btrfs_queue_work(fs_info->qgroup_rescan_workers,
> +  &fs_info->qgroup_rescan_work);
> +}


Re: [PATCH] btrfs: qgroup: More meaningful qgroup_rescan_init error message

2018-05-02 Thread Nikolay Borisov


On  2.05.2018 08:28, Qu Wenruo wrote:
> Error message from qgroup_rescan_init() mostly looks like:
> 
> --
> BTRFS info (device nvme0n1p1): qgroup_rescan_init failed with -115
> --
> 
> This is far from meaningful, and sometimes confusing: for the above
> -EINPROGRESS it's mostly (despite the init race) harmless, but it can
> also indicate a real problem if the return value is -EINVAL.
> 
> Change it to more meaningful messages like:
> 
> --
> BTRFS info (device nvme0n1p1): qgroup rescan is already in progress
> --
> 
> And
> 
> --
> BTRFS err(device nvme0n1p1): qgroup_rescan_init failed, qgroup is not enabled
> --
> 
> Signed-off-by: Qu Wenruo 

Much better indeed:

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/qgroup.c | 34 +++---
>  1 file changed, 19 insertions(+), 15 deletions(-)
> 
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index ec2339a49ec3..a5742e9e9a14 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2760,26 +2760,37 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 
> progress_objectid,
>  {
>   int ret = 0;
>  
> - if (!init_flags &&
> - (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) ||
> -  !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))) {
> - ret = -EINVAL;
> - goto err;
> + if (!init_flags) {
> + /* we're resuming qgroup rescan at mount time */
> + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN))
> + btrfs_err(fs_info, "%s failed, qgroup is not enabled",
> +   __func__);
> + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
> + btrfs_err(fs_info,
> +   "%s failed, qgroup rescan is not queued",
> +   __func__);
> + return -EINVAL;
>   }
>  
>   mutex_lock(&fs_info->qgroup_rescan_lock);
>   spin_lock(&fs_info->qgroup_lock);
>  
>   if (init_flags) {
> - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
> + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
> + btrfs_info(fs_info,
> +"qgroup rescan is already in progress");
>   ret = -EINPROGRESS;
> - else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
> + } else if (!(fs_info->qgroup_flags &
> +  BTRFS_QGROUP_STATUS_FLAG_ON)) {
> + btrfs_err(fs_info, "%s failed, qgroup is not enabled",
> +   __func__);
>   ret = -EINVAL;
> + }
>  
>   if (ret) {
>   spin_unlock(&fs_info->qgroup_lock);
>   mutex_unlock(&fs_info->qgroup_rescan_lock);
> - goto err;
> + return ret;
>   }
>   fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
>   }
> @@ -2798,13 +2809,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 
> progress_objectid,
>   btrfs_init_work(&fs_info->qgroup_rescan_work,
>   btrfs_qgroup_rescan_helper,
>   btrfs_qgroup_rescan_worker, NULL, NULL);
> -
> - if (ret) {
> -err:
> - btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
> - return ret;
> - }
> -
>   return 0;
>  }
>  
> 