Re: [PATCH V2] block-throttle: avoid double charge

2017-12-20 Thread Shaohua Li
On Wed, Dec 20, 2017 at 02:42:20PM +0800, xuejiufei wrote:
> Hi Shaohua,
> 
> I noticed that the split bio will go to the scheduler directly while
> the cloned bio enters generic_make_request again. So can we just
> leave the BIO_THROTTLED flag in the original bio and not copy the
> flag to the new split bio, so that it is not necessary to remove the flag
> in bio_set_dev()? Or are there other situations?

But we can still send the original bio to a different bdev, for example with
stacked disks. That bit would prevent throttling for the underlying disks.
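
To make the flag flow concrete, here is a toy userspace sketch (illustrative
only, not kernel code; every name below is a stand-in for the real structures).
A split/cloned bio keeps BIO_THROTTLED so the same disk does not charge it
twice, while redirecting the bio to a different disk clears the flag so the
lower disk's throttle still sees it:

/* toy model of the flag flow -- not kernel code */
#include <stdbool.h>
#include <stdio.h>

struct toy_bio {
	int  disk;		/* stands in for bio->bi_disk */
	bool throttled;		/* stands in for BIO_THROTTLED */
};

/* roughly mirrors __bio_clone_fast(): the clone inherits the flag */
static void toy_clone(struct toy_bio *clone, const struct toy_bio *src)
{
	clone->disk = src->disk;
	clone->throttled = src->throttled;
}

/* roughly mirrors bio_set_dev(): changing the disk drops the flag */
static void toy_set_dev(struct toy_bio *bio, int new_disk)
{
	if (bio->disk != new_disk)
		bio->throttled = false;
	bio->disk = new_disk;
}

/* roughly mirrors blk_throtl_bio(): charge only if not already flagged */
static void toy_throtl(struct toy_bio *bio)
{
	if (!bio->throttled)
		printf("disk %d charges the bio\n", bio->disk);
	else
		printf("disk %d skips the bio (already charged)\n", bio->disk);
	bio->throttled = true;
}

int main(void)
{
	struct toy_bio orig = { .disk = 0 }, split;

	toy_throtl(&orig);      /* first pass: charged on disk 0 */
	toy_clone(&split, &orig);
	toy_throtl(&split);     /* resubmitted split: skipped, no double charge */
	toy_set_dev(&split, 1); /* stacked device remaps to disk 1 */
	toy_throtl(&split);     /* the lower disk still gets to charge it */
	return 0;
}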

Thanks,
Shaohua
 
> On 2017/11/14 上午4:37, Shaohua Li wrote:
> > If a bio is throttled and split after throttling, the bio could be
> > resubmitted and enter throttling again. This will cause part of the
> > bio to be charged multiple times. If the cgroup has an IO limit, the double
> > charge will significantly harm the performance. Bio splits have become
> > quite common since the arbitrary bio size change.
> > 
> > To fix this, we always set the BIO_THROTTLED flag if a bio is throttled.
> > If the bio is cloned/split, we copy the flag to the new bio too to avoid
> > a double charge. However, a cloned bio could be directed to a new disk, and
> > keeping the flag would cause problems. The observation is that we always set
> > a new disk for the bio in this case, so we can clear the flag in
> > bio_set_dev().
> > 
> > This issue has existed for a long time; the arbitrary bio size change makes
> > it worse, so this should go into stable, at least since v4.2.
> > 
> > V1-> V2: Not add extra field in bio based on discussion with Tejun
> > 
> > Cc: Tejun Heo <t...@kernel.org>
> > Cc: Vivek Goyal <vgo...@redhat.com>
> > Cc: sta...@vger.kernel.org
> > Signed-off-by: Shaohua Li <s...@fb.com>
> > ---
> >  block/bio.c  | 2 ++
> >  block/blk-throttle.c | 8 +---
> >  include/linux/bio.h  | 2 ++
> >  3 files changed, 5 insertions(+), 7 deletions(-)
> > 
> > diff --git a/block/bio.c b/block/bio.c
> > index 8338304..d1d4d51 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -598,6 +598,8 @@ void __bio_clone_fast(struct bio *bio, struct bio 
> > *bio_src)
> >  */
> > bio->bi_disk = bio_src->bi_disk;
> > bio_set_flag(bio, BIO_CLONED);
> > +   if (bio_flagged(bio_src, BIO_THROTTLED))
> > +   bio_set_flag(bio, BIO_THROTTLED);
> > bio->bi_opf = bio_src->bi_opf;
> > bio->bi_write_hint = bio_src->bi_write_hint;
> > bio->bi_iter = bio_src->bi_iter;
> > diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> > index ee6d7b0..f90fec1 100644
> > --- a/block/blk-throttle.c
> > +++ b/block/blk-throttle.c
> > @@ -,13 +,7 @@ bool blk_throtl_bio(struct request_queue *q, struct 
> > blkcg_gq *blkg,
> >  out_unlock:
> > spin_unlock_irq(q->queue_lock);
> >  out:
> > -   /*
> > -* As multiple blk-throtls may stack in the same issue path, we
> > -* don't want bios to leave with the flag set.  Clear the flag if
> > -* being issued.
> > -*/
> > -   if (!throttled)
> > -   bio_clear_flag(bio, BIO_THROTTLED);
> > +   bio_set_flag(bio, BIO_THROTTLED);
> >  
> >  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
> > if (throttled || !td->track_bio_latency)
> > diff --git a/include/linux/bio.h b/include/linux/bio.h
> > index 9c75f58..27b5bac 100644
> > --- a/include/linux/bio.h
> > +++ b/include/linux/bio.h
> > @@ -504,6 +504,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx);
> >  
> >  #define bio_set_dev(bio, bdev) \
> >  do {   \
> > +   if ((bio)->bi_disk != (bdev)->bd_disk)  \
> > +   bio_clear_flag(bio, BIO_THROTTLED);\
> > (bio)->bi_disk = (bdev)->bd_disk;   \
> > (bio)->bi_partno = (bdev)->bd_partno;   \
> >  } while (0)
> > 


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-19 Thread Shaohua Li
On Tue, Dec 19, 2017 at 10:17:43AM -0600, Bruno Wolff III wrote:
> On Sun, Dec 17, 2017 at 21:43:50 +0800,
>  weiping zhang  wrote:
> > Hi, thanks for testing. I think you first reproduced this issue (got the
> > WARNING at device_add_disk) with your own build, then added my debug patch.
> 
> The problem is still in rc4. Reverting the commit still fixes the problem. I
> tested that warning level messages should appear using lkdtm. While there
> could be something weird relating to the WARN_ON macro, more likely there is
> something different about the boots with the kernels I build (the exact way
> initramfs is built is probably different) and probably that (WARN_ON) code
> is not getting executed.

Not sure if this is MD related, but could you please check if this debug patch
changes anything?

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e4dee0..c365179 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -518,7 +518,6 @@ static void mddev_put(struct mddev *mddev)
mddev->ctime == 0 && !mddev->hold_active) {
/* Array is not configured at all, and not held active,
 * so destroy it */
-   list_del_init(>all_mddevs);
bs = mddev->bio_set;
sync_bs = mddev->sync_set;
mddev->bio_set = NULL;
@@ -5210,6 +5209,10 @@ static void md_free(struct kobject *ko)
}
> percpu_ref_exit(&mddev->writes_pending);
 
+   spin_lock(_mddevs_lock);
+   list_del_init(>all_mddevs);
+   spin_unlock(_mddevs_lock);
+
kfree(mddev);
 }
 


Re: [PATCH v2 2/2] blk-throttle: fix wrong initialization in case of dm device

2017-11-20 Thread Shaohua Li
On Mon, Nov 20, 2017 at 04:02:03PM -0500, Mike Snitzer wrote:
> On Sun, Nov 19, 2017 at 9:00 PM, Joseph Qi  wrote:
> > From: Joseph Qi 
> >
> > dm device sets QUEUE_FLAG_NONROT in resume, which is after the queue is
> > registered. That is to say, the previous initialization in
> > blk_throtl_register_queue is wrong in this case.
> > Fix it by checking and then updating the info during root tg
> > initialization, as we don't have a better choice.
> 
> Given DM motivated this change, curious why you didn't send this to dm-devel?
> 
> In any case, not sure why you need to reference "resume".  Very few
> people will appreciate that detail.
> 
> Better to just say: DM device sets QUEUE_FLAG_NONROT after the queue
> is registered.
> 
> > Signed-off-by: Joseph Qi 
> > ---
> >  block/blk-throttle.c | 17 +
> >  1 file changed, 17 insertions(+)
> >
> > diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> > index bf52035..6d6b220 100644
> > --- a/block/blk-throttle.c
> > +++ b/block/blk-throttle.c
> > @@ -541,6 +541,23 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
> > if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
> > sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
> > tg->td = td;
> > +
> > +   /*
> > +* dm device set QUEUE_FLAG_NONROT in resume, which is after 
> > resister
> > +* queue, so the previous initialization is wrong in this case. 
> > Check
> > +* and update it here.
> > +*/
> 
> typo: s/resister/register/
> But again, best to say:
> DM device sets QUEUE_FLAG_NONROT after the queue is registered, ...
> 
> Also, an alternative to your patch would be to have DM's
> dm_table_set_restrictions() call a blk-throttle function to setup
> blk-throttle after it is done setting queue flags?
> Saves all other drivers from having to run this 2 stage initialization
> code.. just a thought.

Though the patch is a workaround, I'd prefer limiting the code to
blk-throttle itself. Pushing it out to the drivers is a layering violation to
me. The initialization only runs once, so I think it's ok.

Thanks,
Shaohua


Re: [PATCH v2 2/2] blk-throttle: fix wrong initialization in case of dm device

2017-11-20 Thread Shaohua Li
On Mon, Nov 20, 2017 at 10:00:27AM +0800, Joseph Qi wrote:
> From: Joseph Qi <qijiang...@alibaba-inc.com>
> 
> dm device sets QUEUE_FLAG_NONROT in resume, which is after the queue is
> registered. That is to say, the previous initialization in
> blk_throtl_register_queue is wrong in this case.
> Fix it by checking and then updating the info during root tg
> initialization, as we don't have a better choice.
> 
> Signed-off-by: Joseph Qi <qijiang...@alibaba-inc.com>
> ---
>  block/blk-throttle.c | 17 +
>  1 file changed, 17 insertions(+)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index bf52035..6d6b220 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -541,6 +541,23 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
>   if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
>   sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
>   tg->td = td;
> +
> + /*
> +  * dm device set QUEUE_FLAG_NONROT in resume, which is after resister
> +  * queue, so the previous initialization is wrong in this case. Check
> +  * and update it here.
> +  */
> + if (blk_queue_nonrot(blkg->q) &&
> + td->filtered_latency != LATENCY_FILTERED_SSD) {
> + int i;
> +
> + td->throtl_slice = DFL_THROTL_SLICE_SSD;

If CONFIG_BLK_DEV_THROTTLING_LOW isn't set, we use the old slice; can you do
the same thing here? Otherwise,
Reviewed-by: Shaohua Li <s...@kernel.org>

> + td->filtered_latency = LATENCY_FILTERED_SSD;
> + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
> + td->avg_buckets[READ][i].latency = 0;
> + td->avg_buckets[WRITE][i].latency = 0;
> + }
> + }
>  }
>  
>  /*
> -- 
> 1.9.4


Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-17 Thread Shaohua Li
On Thu, Nov 16, 2017 at 08:25:58PM -0800, Khazhismel Kumykov wrote:
> On Thu, Nov 16, 2017 at 8:50 AM, Shaohua Li <s...@kernel.org> wrote:
> > On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
> >> Allows configuring additional bytes or ios before a throttle is
> >> triggered.
> >>
> >> This allows implementation of a bucket style rate-limit/throttle on a
> >> block device. Previously, bursting to a device was limited to allowance
> >> granted in a single throtl_slice (similar to a bucket with limit N and
> >> refill rate N/slice).
> >>
> >> Additional parameters bytes/io_burst_conf defined for tg, which define a
> >> number of bytes/ios that must be depleted before throttling happens. A
> >> tg that does not deplete this allowance functions as though it has no
> >> configured limits. tgs earn additional allowance at rate defined by
> >> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
> >> kicks in. If a tg is idle for a while, it will again have some burst
> >> allowance before it gets throttled again.
> >>
> >> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
> >> when all "used" burst allowance would be earned back. trim_slice still
> >> does progress slice_start as before and decrements *_disp as before, and
> >> tgs continue to get bytes/ios in throtl_slice intervals.
> >
> > Can you describe why we need this? It would be great if you could describe
> > the usage model and give an example. Does this work for io.low, io.max, or both?
> >
> > Thanks,
> > Shaohua
> >
> 
> The use case that brought this up was configuring limits for a remote
> shared device. Bursting beyond io.max is desired, but only up to a point
> before the limit kicks in; afterwards, with sustained usage, throughput
> is capped. (This proactively avoids remote-side limits.) In that case
> one would configure in a root container io.max + io.burst, and
> configure low/other limits on descendants sharing the resource on the
> same node.
> 
> With this patch, so long as tg has not dispatched more than the burst,
> no limit is applied at all by that tg, including limit imposed by
> io.low in tg_iops_limit, etc.

I'd appreciate it if you could give more details about the 'why'. 'Configuring
limits for a remote shared device' doesn't justify the change.
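
For reference, the burst behaviour described in the RFC amounts to a token
bucket. Below is a minimal userspace sketch (illustrative only, with assumed
numbers; it is not the proposed kernel implementation, and the names merely
echo bytes_burst_conf and the configured bps limit):

/* token-bucket sketch of the burst allowance -- userspace illustration only */
#include <stdint.h>
#include <stdio.h>

struct bucket {
	uint64_t bps;		/* sustained rate, bytes/sec */
	uint64_t burst;		/* extra allowance before throttling */
	uint64_t disp;		/* bytes dispatched and not yet earned back */
	double   last;		/* time of last update, seconds */
};

/* earn allowance back at the configured rate; assumes 'now' never goes backwards */
static void bucket_update(struct bucket *b, double now)
{
	uint64_t earned = (uint64_t)((now - b->last) * b->bps);

	b->disp = earned > b->disp ? 0 : b->disp - earned;
	b->last = now;
}

/* returns how long (seconds) an IO of 'bytes' must wait before dispatch */
static double bucket_wait(struct bucket *b, double now, uint64_t bytes)
{
	bucket_update(b, now);
	b->disp += bytes;
	if (b->disp <= b->burst)
		return 0.0;	/* still within the burst: no throttling */
	return (double)(b->disp - b->burst) / b->bps;
}

int main(void)
{
	/* assumed numbers: 100 MB/s sustained, 50 MB burst */
	struct bucket b = { .bps = 100 << 20, .burst = 50 << 20 };

	printf("first 50MB waits %.2fs\n", bucket_wait(&b, 0.0, 50 << 20));
	printf("next 10MB waits %.2fs\n", bucket_wait(&b, 0.0, 10 << 20));
	printf("after 1s idle, 10MB waits %.2fs\n",
	       bucket_wait(&b, 1.0, 10 << 20));
	return 0;
}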


Re: [RFC] md: make queue limits depending on limits of RAID members

2017-11-17 Thread Shaohua Li
On Wed, Nov 15, 2017 at 11:25:12AM +0100, Mariusz Dabrowski wrote:
> Hi all,
> 
> In order to be compliant with pass-through drive behavior, RAID queue
> limits should be calculated in a way that minimal io, optimal io and
> discard granularity size are met from a single-drive perspective.
> Currently the MD driver ignores queue limits reported by members and all
> calculations are based on the chunk (stripe) size.

We do use blk_stack_limits, which takes care of these.

> I am working on a patch which
> changes this. Since kernel 4.13, drives which support streams (write hints)
> might report a discard_alignment or discard_granularity bigger than the array
> stripe (for more details see the NVMe 1.3 specification, chapter 9.3 Streams),
> so the result of the current calculation is not optimal for those drives. For
> example, for a 4-disk RAID 0 with a chunk size smaller than the
> discard_granularity of the member drives, the discard_granularity of the RAID
> array should be equal to 4 * member_discard_granularity.
> 
> The problem is that for some drives there is a risk that the result of this
> calculation exceeds the unsigned int range. The current implementation of
> nvme_config_discard() in the NVMe driver can also suffer from the same problem:
> 
>   if (ctrl->nr_streams && ns->sws && ns->sgs) {
>   unsigned int sz = logical_block_size * ns->sws * ns->sgs;
> 
>   ns->queue->limits.discard_alignment = sz;
>   ns->queue->limits.discard_granularity = sz;
>   }
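
(For illustration only, with assumed numbers rather than values from any
specific drive: with a 4096-byte logical block, sws = 1024 and sgs = 2048,
sz = 4096 * 1024 * 2048 = 2^33 bytes = 8 GiB, which no longer fits in the
32-bit discard_granularity and discard_alignment fields.)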
> 
> One potential solution for that is to change the type of some queue limits
> (at least discard_granularity and discard_alignment, maybe more, like
> max_discard_sectors?) from 32 bits to 64 bits.
> 
> What are your thoughts about this? Do you expect any trouble with
> changing this in the block layer? Would you be willing to make such a change?
> 
> Signed-off-by: Mariusz Dabrowski 
> ---
> 
>  drivers/md/md.c | 24 
>  drivers/md/md.h |  1 +
>  drivers/md/raid0.c  | 23 ---
>  drivers/md/raid10.c | 30 +-
>  drivers/md/raid5.c  | 25 +++--
>  5 files changed, 89 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0ff1bbf6c90e..e0dc114cab19 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -66,6 +66,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include "md.h"
> @@ -679,6 +680,29 @@ struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, 
> int nr)
>  }
>  EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
>  
> +void md_rdev_get_queue_limits(struct mddev *mddev, struct queue_limits 
> *limits)
> +{
> + struct md_rdev *rdev;
> +
> + memset(limits, 0, sizeof(struct queue_limits));
> +
> + rdev_for_each(rdev, mddev) {
> + struct queue_limits *rdev_limits;
> +
> + rdev_limits = &rdev->bdev->bd_queue->limits;
> + limits->io_min = max(limits->io_min, rdev_limits->io_min);
> + limits->io_opt = lcm_not_zero(limits->io_opt,
> +   rdev_limits->io_opt);
> + limits->discard_granularity =
> + max(limits->discard_granularity,
> + rdev_limits->discard_granularity);
> + limits->discard_alignment =
> + max(limits->discard_alignment,
> + rdev_limits->discard_alignment);
> + }
> +}

isn't this exactly what blk_stack_limits does?

> +EXPORT_SYMBOL_GPL(md_rdev_get_queue_limits);
> +
>  static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
>  {
>   struct md_rdev *rdev;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index d8287d3cd1bf..5b514b9bb535 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -702,6 +702,7 @@ extern void md_reload_sb(struct mddev *mddev, int 
> raid_disk);
>  extern void md_update_sb(struct mddev *mddev, int force);
>  extern void md_kick_rdev_from_array(struct md_rdev * rdev);
>  struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
> +void md_rdev_get_queue_limits(struct mddev *mddev, struct queue_limits 
> *limits);
>  
>  static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev 
> *mddev)
>  {
> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> index 5a00fc118470..f1dcf7fd3eb1 100644
> --- a/drivers/md/raid0.c
> +++ b/drivers/md/raid0.c
> @@ -382,15 +382,31 @@ static int raid0_run(struct mddev *mddev)
>   if (mddev->queue) {
>   struct md_rdev *rdev;
>   bool discard_supported = false;
> + struct queue_limits limits;
> + unsigned int io_min, io_opt;
> + unsigned int granularity, alignment;
> + unsigned int chunk_size = mddev->chunk_sectors << 9;
>  
> + md_rdev_get_queue_limits(mddev, &limits);
>   blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>   

Re: [RFC PATCH] blk-throttle: add burst allowance.

2017-11-16 Thread Shaohua Li
On Tue, Nov 14, 2017 at 03:10:22PM -0800, Khazhismel Kumykov wrote:
> Allows configuring additional bytes or ios before a throttle is
> triggered.
> 
> This allows implementation of a bucket style rate-limit/throttle on a
> block device. Previously, bursting to a device was limited to allowance
> granted in a single throtl_slice (similar to a bucket with limit N and
> refill rate N/slice).
> 
> Additional parameters bytes/io_burst_conf defined for tg, which define a
> number of bytes/ios that must be depleted before throttling happens. A
> tg that does not deplete this allowance functions as though it has no
> configured limits. tgs earn additional allowance at rate defined by
> bps/iops for the tg. Once a tg has *_disp > *_burst_conf, throttling
> kicks in. If a tg is idle for a while, it will again have some burst
> allowance before it gets throttled again.
> 
> slice_end for a tg is extended until io_disp/byte_disp would fall to 0,
> when all "used" burst allowance would be earned back. trim_slice still
> does progress slice_start as before and decrements *_disp as before, and
> tgs continue to get bytes/ios in throtl_slice intervals.

Can you describe why we need this? It would be great if you could describe the
usage model and give an example. Does this work for io.low, io.max, or both?

Thanks,
Shaohua
 
> Signed-off-by: Khazhismel Kumykov 
> ---
>  block/Kconfig|  11 +++
>  block/blk-throttle.c | 192 
> +++
>  2 files changed, 189 insertions(+), 14 deletions(-)
> 
> diff --git a/block/Kconfig b/block/Kconfig
> index 28ec55752b68..fbd05b419f93 100644
> --- a/block/Kconfig
> +++ b/block/Kconfig
> @@ -128,6 +128,17 @@ config BLK_DEV_THROTTLING_LOW
>  
>   Note, this is an experimental interface and could be changed someday.
>  
> +config BLK_DEV_THROTTLING_BURST
> +bool "Block throttling .burst allowance interface"
> +depends on BLK_DEV_THROTTLING
> +default n
> +---help---
> +Add .burst allowance for block throttling. Burst allowance allows for
> +additional unthrottled usage, while still limiting speed for sustained
> +usage.
> +
> +If in doubt, say N.
> +
>  config BLK_CMDLINE_PARSER
>   bool "Block device command line partition parser"
>   default n
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 96ad32623427..27c084312772 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -157,6 +157,11 @@ struct throtl_grp {
>   /* Number of bio's dispatched in current slice */
>   unsigned int io_disp[2];
>  
> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
> + uint64_t bytes_burst_conf[2];
> + unsigned int io_burst_conf[2];
> +#endif
> +
>   unsigned long last_low_overflow_time[2];
>  
>   uint64_t last_bytes_disp[2];
> @@ -507,6 +512,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t 
> gfp, int node)
>   tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
>   tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
>   tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
> + tg->bytes_burst_conf[READ] = 0;
> + tg->bytes_burst_conf[WRITE] = 0;
> + tg->io_burst_conf[READ] = 0;
> + tg->io_burst_conf[WRITE] = 0;
> +#endif
>   /* LIMIT_LOW will have default value 0 */
>  
>   tg->latency_target = DFL_LATENCY_TARGET;
> @@ -800,6 +811,26 @@ static inline void throtl_start_new_slice(struct 
> throtl_grp *tg, bool rw)
>  tg->slice_end[rw], jiffies);
>  }
>  
> +/*
> + * When current slice should end.
> + *
> + * With CONFIG_BLK_DEV_THROTTLING_BURST, we will wait longer than min_wait
> + * for slice to recover used burst allowance. (*_disp -> 0). Setting 
> slice_end
> + * before this would result in tg receiving additional burst allowance.
> + */
> +static inline unsigned long throtl_slice_wait(struct throtl_grp *tg, bool rw,
> + unsigned long min_wait)
> +{
> + unsigned long bytes_wait = 0, io_wait = 0;
> +#ifdef CONFIG_BLK_DEV_THROTTLING_BURST
> + if (tg->bytes_burst_conf[rw])
> + bytes_wait = (tg->bytes_disp[rw] * HZ) / tg_bps_limit(tg, rw);
> + if (tg->io_burst_conf[rw])
> + io_wait = (tg->io_disp[rw] * HZ) / tg_iops_limit(tg, rw);
> +#endif
> + return max(min_wait, max(bytes_wait, io_wait));
> +}
> +
>  static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
>   unsigned long jiffy_end)
>  {
> @@ -849,7 +880,8 @@ static inline void throtl_trim_slice(struct throtl_grp 
> *tg, bool rw)
>* is bad because it does not allow new slice to start.
>*/
>  
> - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
> + throtl_set_slice_end(tg, rw,
> + jiffies + throtl_slice_wait(tg, rw, tg->td->throtl_slice));
>  
>   time_elapsed = jiffies - tg->slice_start[rw];
>  
> @@ -889,7 +921,7 @@ static bool 

[PATCH] blk-throttle: allow configure 0 for some settings --resend

2017-11-15 Thread Shaohua Li
For io.low, latency target 0 is legit. 0 for rbps/wbps/rios/wios is ok
too. And we use 0 to clear io.low settings.

Cc: Tejun Heo <t...@kernel.org>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad326..7040285 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1633,7 +1633,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
goto out_finish;
 
ret = -ERANGE;
-   if (!val)
+   if (!val && off != LIMIT_LOW)
goto out_finish;
 
ret = -EINVAL;
-- 
2.9.5



[PATCH] blkcg: correctly update stat

2017-11-14 Thread Shaohua Li
blkcg_bio_issue_check() checks throtl for the stat update, which isn't good
because the bio could be split into small bios; the small bios go
into blkcg_bio_issue_check() again and we update the stat for the small bios,
so the stat is double charged. To fix this, we only update the stat if
the bio doesn't have the BIO_THROTTLED flag. The fix may update the stat before
the bio is dispatched by blk-throttle, but eventually the stat is correct.

This patch is on top of patch:
https://marc.info/?l=linux-block=151060860608914=2

Cc: Tejun Heo <t...@kernel.org>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 include/linux/blk-cgroup.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 58f3d25..6fbd9ea 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -690,7 +690,8 @@ static inline bool blkcg_bio_issue_check(struct 
request_queue *q,
 {
struct blkcg *blkcg;
struct blkcg_gq *blkg;
-   bool throtl = false;
+   bool throtl;
+   bool charged;
 
rcu_read_lock();
blkcg = bio_blkcg(bio);
@@ -707,9 +708,10 @@ static inline bool blkcg_bio_issue_check(struct 
request_queue *q,
spin_unlock_irq(q->queue_lock);
}
 
+   charged = bio_flagged(bio, BIO_THROTTLED);
throtl = blk_throtl_bio(q, blkg, bio);
 
-   if (!throtl) {
+   if (!charged) {
blkg = blkg ?: q->root_blkg;
blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
bio->bi_iter.bi_size);
-- 
2.9.5



[PATCH V2] block-throttle: avoid double charge

2017-11-13 Thread Shaohua Li
If a bio is throttled and split after throttling, the bio could be
resubmitted and enter throttling again. This will cause part of the
bio to be charged multiple times. If the cgroup has an IO limit, the double
charge will significantly harm the performance. Bio splits have become
quite common since the arbitrary bio size change.

To fix this, we always set the BIO_THROTTLED flag if a bio is throttled.
If the bio is cloned/split, we copy the flag to the new bio too to avoid
a double charge. However, a cloned bio could be directed to a new disk, and
keeping the flag would cause problems. The observation is that we always set
a new disk for the bio in this case, so we can clear the flag in
bio_set_dev().

This issue has existed for a long time; the arbitrary bio size change makes
it worse, so this should go into stable, at least since v4.2.

V1-> V2: Not add extra field in bio based on discussion with Tejun

Cc: Tejun Heo <t...@kernel.org>
Cc: Vivek Goyal <vgo...@redhat.com>
Cc: sta...@vger.kernel.org
Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/bio.c  | 2 ++
 block/blk-throttle.c | 8 +---
 include/linux/bio.h  | 2 ++
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 8338304..d1d4d51 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -598,6 +598,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 */
bio->bi_disk = bio_src->bi_disk;
bio_set_flag(bio, BIO_CLONED);
+   if (bio_flagged(bio_src, BIO_THROTTLED))
+   bio_set_flag(bio, BIO_THROTTLED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ee6d7b0..f90fec1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -,13 +,7 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
 out_unlock:
spin_unlock_irq(q->queue_lock);
 out:
-   /*
-* As multiple blk-throtls may stack in the same issue path, we
-* don't want bios to leave with the flag set.  Clear the flag if
-* being issued.
-*/
-   if (!throttled)
-   bio_clear_flag(bio, BIO_THROTTLED);
+   bio_set_flag(bio, BIO_THROTTLED);
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9c75f58..27b5bac 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -504,6 +504,8 @@ extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #define bio_set_dev(bio, bdev) \
 do {   \
+   if ((bio)->bi_disk != (bdev)->bd_disk)  \
+   bio_clear_flag(bio, BIO_THROTTLED);\
(bio)->bi_disk = (bdev)->bd_disk;   \
(bio)->bi_partno = (bdev)->bd_partno;   \
 } while (0)
-- 
2.9.5



Re: [PATCH] null_blk: fix dev->badblocks leak

2017-11-08 Thread Shaohua Li
On Wed, Nov 08, 2017 at 05:29:44PM +0100, David Disseldorp wrote:
> null_alloc_dev() allocates memory for dev->badblocks, but cleanup
> currently only occurs in the configfs release codepath, missing a number
> of other places.
> 
> This bug was found running the blktests block/010 test, alongside
> kmemleak:
> rapido1:/blktests# ./check block/010
> ...
> rapido1:/blktests# echo scan > /sys/kernel/debug/kmemleak
> [  306.966708] kmemleak: 32 new suspected memory leaks (see 
> /sys/kernel/debug/kmemleak)
> rapido1:/blktests# cat /sys/kernel/debug/kmemleak
> unreferenced object 0x88001f86d000 (size 4096):
>   comm "modprobe", pid 231, jiffies 4294892415 (age 318.252s)
>   hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
>   backtrace:
> [] kmemleak_alloc+0x49/0xa0
> [] kmem_cache_alloc+0x9f/0xe0
> [] badblocks_init+0x2f/0x60
> [] 0xa0019fae
> [] nullb_device_badblocks_store+0x63/0x130 [null_blk]
> [] do_one_initcall+0x3d/0x170
> [] do_init_module+0x56/0x1e9
> [] load_module+0x1c47/0x26a0
> [] SyS_finit_module+0xa9/0xd0
> [] entry_SYSCALL_64_fastpath+0x13/0x94
> 
> Fixes: 2f54a613c942 ("nullb: badbblocks support")
> Signed-off-by: David Disseldorp <dd...@suse.de>

Reviewed-by: Shaohua Li <s...@fb.com>
> ---
>  drivers/block/null_blk.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
> index 8042c26ea9e6..fc80354477c6 100644
> --- a/drivers/block/null_blk.c
> +++ b/drivers/block/null_blk.c
> @@ -467,7 +467,6 @@ static void nullb_device_release(struct config_item *item)
>  {
>   struct nullb_device *dev = to_nullb_device(item);
>  
> - badblocks_exit(&dev->badblocks);
>   null_free_device_storage(dev, false);
>   null_free_dev(dev);
>  }
> @@ -578,6 +577,10 @@ static struct nullb_device *null_alloc_dev(void)
>  
>  static void null_free_dev(struct nullb_device *dev)
>  {
> + if (!dev)
> + return;
> +
> + badblocks_exit(&dev->badblocks);
>   kfree(dev);
>  }
>  
> -- 
> 2.13.6
> 


Re: [PATCH] badblocks: fix wrong return value in badblocks_set if badblocks are disabled

2017-11-03 Thread Shaohua Li
On Fri, Nov 03, 2017 at 10:13:38AM -0600, Liu Bo wrote:
> Hi Shaohua,
> 
> Given it's related to md, can you please take this thru your tree?

Yes, the patch makes sense. Can you resend the patch to me? I can't find it in 
my inbox

Thanks,
Shaohua
 
> Thanks,
> 
> -liubo
> 
> On Wed, Sep 27, 2017 at 04:13:17PM -0600, Liu Bo wrote:
> > MD's rdev_set_badblocks() expects that badblocks_set() returns 1 if
> > badblocks are disabled; otherwise, rdev_set_badblocks() will record
> > superblock changes and return success in that case, and md will fail to
> > report an IO error when it should.
> > 
> > This bug has existed since badblocks were introduced in commit
> > 9e0e252a048b ("badblocks: Add core badblock management code").
> > 
> > Signed-off-by: Liu Bo 
> > ---
> >  block/badblocks.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/block/badblocks.c b/block/badblocks.c
> > index 43c7116..91f7bcf 100644
> > --- a/block/badblocks.c
> > +++ b/block/badblocks.c
> > @@ -178,7 +178,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int 
> > sectors,
> >  
> > if (bb->shift < 0)
> > /* badblocks are disabled */
> > -   return 0;
> > +   return 1;
> >  
> > if (bb->shift) {
> > /* round the start down, and the end up */
> > -- 
> > 2.9.4
> > 


Re: [PATCH V3 0/3] block/loop: handle discard/zeroout error

2017-10-18 Thread Shaohua Li
On Wed, Oct 04, 2017 at 07:52:42AM -0700, Shaohua Li wrote:
> From: Shaohua Li <s...@fb.com>
> 
> Fix some problems when setting up a loop device with a block device as the
> backing file and creating/mounting ext4 on the loop device.

Jens,
can you look at these patches?

Thanks,
Shaohua
> 
> Thanks,
> Shaohua
> 
> Shaohua Li (3):
>   block/loop: don't hijack error number
>   block/loop: use FALLOC_FL_ZERO_RANGE for REQ_OP_WRITE_ZEROES
>   block: don't print message for discard error
> 
>  block/blk-core.c | 2 ++
>  drivers/block/loop.c | 9 ++---
>  2 files changed, 8 insertions(+), 3 deletions(-)
> 
> -- 
> 2.9.5
> 


Re: [PATCH v8 03/10] md: Neither resync nor reshape while the system is frozen

2017-10-12 Thread Shaohua Li
On Thu, Oct 12, 2017 at 04:59:01PM +, Bart Van Assche wrote:
> On Wed, 2017-10-11 at 12:32 -0700, Shaohua Li wrote:
> > On Wed, Oct 11, 2017 at 05:17:56PM +, Bart Van Assche wrote:
> > > On Tue, 2017-10-10 at 18:48 -0700, Shaohua Li wrote:
> > > > The problem is __md_stop_writes set some bit like MD_RECOVERY_FROZEN, 
> > > > which
> > > > will prevent md_check_recovery restarting resync/reshape. I think we 
> > > > need
> > > > preserve mddev->recovery in suspend prepare and restore after resume
> > > 
> > > Hello Shaohua,
> > > 
> > > As far as I can see __md_stop_writes() sets MD_RECOVERY_FROZEN and can set
> > > MD_RECOVERY_INTR. Since md_check_recovery() clears MD_RECOVERY_INTR I 
> > > think
> > > it should be sufficient to save and restore the state of the
> > > MD_RECOVERY_FROZEN flag. However, when I ran the following test:
> > > * echo frozen > /sys/block/md0/md/sync_action
> > > * cat /sys/block/md0/md/sync_action
> > > * systemctl hibernate
> > > * (power on system again)
> > > * cat /sys/block/md0/md/sync_action
> > > 
> > > the output that appeared was as follows:
> > > 
> > > frozen
> > > idle 
> > > Does that mean that a hibernate or suspend followed by a resume is 
> > > sufficient
> > > to clear the MD_RECOVERY_FROZEN flag for the md drivers and hence that the
> > > patch at the start of this e-mail thread does not need any further
> > > modifications?
> > 
> > Have no idea why it shows 'idle'. From my understanding, after resume, 
> > previous
> > memory is restored and the MD_RECOVERY_FROZEN bit should be set. I was trying to
> > understand what happens.
> 
> Hello Shaohua,
> 
> I think a user space action causes the MD_RECOVERY_FROZEN bit to be cleared.
> After I had added a few debug statements in the md driver this is what I
> found in the system log:
> 
> Oct 12 09:38:39 kernel: action_store: storing action check
> Oct 12 09:38:39 kernel: md: data-check of RAID array md0
> Oct 12 09:38:40 systemd[1]: Starting Hibernate...
> Oct 12 09:38:40 kernel: md: md0: data-check interrupted.
> [ powered on system manually ]
> Oct 12 09:39:05 kernel: action_store: storing action check
> Oct 12 09:39:05 kernel: md: data-check of RAID array md0
> Oct 12 09:39:11 kernel: action_store: storing action idle
> Oct 12 09:39:11 kernel: md: md0: data-check interrupted.
> 
> Anyway, how about the patch below? I have verified by adding a debug pr_info()
> statement that the newly added code really clears the MD_RECOVERY_FROZEN bit.

Ok, for patch 1-3:
Reviewed-by: Shaohua Li <s...@kernel.org>

> Subject: [PATCH] md: Neither resync nor reshape while the system is frozen
> 
> Some people use the md driver on laptops and use the suspend and
> resume functionality. Since it is essential that submitting of
> new I/O requests stops before a hibernation image is created,
> interrupt the md resync and reshape actions if the system is
> being frozen. Note: the resync and reshape will restart after
> the system is resumed and a message similar to the following
> will appear in the system log:
> 
> md: md0: data-check interrupted.
> 
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Shaohua Li <s...@kernel.org>
> Cc: linux-r...@vger.kernel.org
> Cc: Ming Lei <ming@redhat.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: Johannes Thumshirn <jthumsh...@suse.de>
> ---
>  drivers/md/md.c | 50 +-
>  drivers/md/md.h |  8 
>  2 files changed, 57 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index b99584e5d6b1..8b2fc750f97f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -66,6 +66,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #include 
>  #include "md.h"
> @@ -7439,6 +7441,7 @@ static int md_thread(void *arg)
>*/
>  
>   allow_signal(SIGKILL);
> + set_freezable();
>   while (!kthread_should_stop()) {
>  
>   /* We need to wait INTERRUPTIBLE so that
> @@ -7449,7 +7452,7 @@ static int md_thread(void *arg)
>   if (signal_pending(current))
>   flush_signals(current);
>  
> - wait_event_interruptible_timeout
> + wait_event_freezable_timeout
>   (thread->wqueue,
>    test_bit(THREAD_WAKEUP, &thread->flags)
>|| 

Re: [PATCH v2] blk-throttle: track read and write request individually

2017-10-12 Thread Shaohua Li
On Thu, Oct 12, 2017 at 05:15:46PM +0800, Joseph Qi wrote:
> From: Joseph Qi <qijiang...@alibaba-inc.com>
> 
> In a mixed read/write workload on an SSD, write latency is much lower than
> read latency. But now we only track and record read latency and then use it
> as the threshold base for both read and write io latency accounting. As a
> result, write io latency will always be considered good and
> bad_bio_cnt is much smaller than 20% of bio_cnt. That is to say, the
> tg to be checked will be treated as idle most of the time and will still let
> others dispatch more ios, even if it is truly running under its low limit and
> wants its low limit to be guaranteed, which is not what we expect.
> So track read and write requests individually, which can bring more
> precise latency control for low limit idle detection.
> 
> Signed-off-by: Joseph Qi <qijiang...@alibaba-inc.com>

Reviewed-by: Shaohua Li <s...@fb.com>
> ---
>  block/blk-throttle.c | 134 
> ++-
>  1 file changed, 79 insertions(+), 55 deletions(-)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 17816a0..0897378 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -215,9 +215,9 @@ struct throtl_data
>  
>   unsigned int scale;
>  
> - struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
> - struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
> - struct latency_bucket __percpu *latency_buckets;
> + struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
> + struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
> + struct latency_bucket __percpu *latency_buckets[2];
>   unsigned long last_calculate_time;
>   unsigned long filtered_latency;
>  
> @@ -2040,10 +2040,10 @@ static void blk_throtl_update_idletime(struct 
> throtl_grp *tg)
>  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
>  static void throtl_update_latency_buckets(struct throtl_data *td)
>  {
> - struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
> - int i, cpu;
> - unsigned long last_latency = 0;
> - unsigned long latency;
> + struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
> + int i, cpu, rw;
> + unsigned long last_latency[2] = { 0 };
> + unsigned long latency[2];
>  
>   if (!blk_queue_nonrot(td->queue))
>   return;
> @@ -2052,56 +2052,67 @@ static void throtl_update_latency_buckets(struct 
> throtl_data *td)
>   td->last_calculate_time = jiffies;
>  
>   memset(avg_latency, 0, sizeof(avg_latency));
> - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
> - struct latency_bucket *tmp = &td->tmp_buckets[i];
> -
> - for_each_possible_cpu(cpu) {
> - struct latency_bucket *bucket;
> -
> - /* this isn't race free, but ok in practice */
> - bucket = per_cpu_ptr(td->latency_buckets, cpu);
> - tmp->total_latency += bucket[i].total_latency;
> - tmp->samples += bucket[i].samples;
> - bucket[i].total_latency = 0;
> - bucket[i].samples = 0;
> - }
> + for (rw = READ; rw <= WRITE; rw++) {
> + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
> + struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
> +
> + for_each_possible_cpu(cpu) {
> + struct latency_bucket *bucket;
> +
> + /* this isn't race free, but ok in practice */
> + bucket = per_cpu_ptr(td->latency_buckets[rw],
> + cpu);
> + tmp->total_latency += bucket[i].total_latency;
> + tmp->samples += bucket[i].samples;
> + bucket[i].total_latency = 0;
> + bucket[i].samples = 0;
> + }
>  
> - if (tmp->samples >= 32) {
> - int samples = tmp->samples;
> + if (tmp->samples >= 32) {
> + int samples = tmp->samples;
>  
> - latency = tmp->total_latency;
> + latency[rw] = tmp->total_latency;
>  
> - tmp->total_latency = 0;
> - tmp->samples = 0;
> - latency /= samples;
> - if (latency == 0)
> - continue;
> - avg_latency[i].latency = latency;
> +  

[PATCH] blk-throttle: allow configure 0 for some settings

2017-10-11 Thread Shaohua Li
For io.low, latency target 0 is legit. 0 for rbps/wbps/rios/wios is ok
too as long as not all of them are 0.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0fea76a..ee6d7b0 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1632,7 +1632,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
goto out_finish;
 
ret = -ERANGE;
-   if (!val)
+   if (!val && off != LIMIT_LOW)
goto out_finish;
 
ret = -EINVAL;
-- 
2.9.5



Re: [PATCH v8 03/10] md: Neither resync nor reshape while the system is frozen

2017-10-11 Thread Shaohua Li
On Wed, Oct 11, 2017 at 05:17:56PM +, Bart Van Assche wrote:
> On Tue, 2017-10-10 at 18:48 -0700, Shaohua Li wrote:
> > The problem is __md_stop_writes set some bit like MD_RECOVERY_FROZEN, which
> > will prevent md_check_recovery restarting resync/reshape. I think we need
> > preserve mddev->recovery in suspend prepare and restore after resume
> 
> Hello Shaohua,
> 
> As far as I can see __md_stop_writes() sets MD_RECOVERY_FROZEN and can set
> MD_RECOVERY_INTR. Since md_check_recovery() clears MD_RECOVERY_INTR I think
> it should be sufficient to save and restore the state of the
> MD_RECOVERY_FROZEN flag. However, when I ran the following test:
> * echo frozen > /sys/block/md0/md/sync_action
> * cat /sys/block/md0/md/sync_action
> * systemctl hibernate
> * (power on system again)
> * cat /sys/block/md0/md/sync_action
> 
> the output that appeared was as follows:
> 
> frozen
> idle 
> Does that mean that a hibernate or suspend followed by a resume is sufficient
> to clear the MD_RECOVERY_FROZEN flag for the md drivers and hence that the
> patch at the start of this e-mail thread does not need any further
> modifications?

I have no idea why it shows 'idle'. From my understanding, after resume, the
previous memory is restored and the MD_RECOVERY_FROZEN bit should be set. I was
trying to understand what happens.

Thanks,
Shaohua


Re: [PATCH] blk-throttle: track read and write request individually

2017-10-11 Thread Shaohua Li
On Wed, Oct 11, 2017 at 05:53:34PM +0800, Joseph Qi wrote:
> From: Joseph Qi 
> 
> In a mixed read/write workload on an SSD, write latency is much lower than
> read latency. But now we only track and record read latency and then use it
> as the threshold base for both read and write io latency accounting. As a
> result, write io latency will always be considered good and
> bad_bio_cnt is much smaller than 20% of bio_cnt. That is to say, the
> tg to be checked will be treated as idle most of the time and will still let
> others dispatch more ios, even if it is truly running under its low limit and
> wants its low limit to be guaranteed, which is not what we expect.
> So track read and write requests individually, which can bring more
> precise latency control for low limit idle detection.

Looks good. Originally I thought we didn't need very precise latency control,
so the read/write base latency difference didn't matter too much, but tracking
the write base latency shouldn't hurt.
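
As a concrete illustration of the skew (all numbers assumed): suppose a
cgroup's IO is 90% writes and 10% reads, the threshold derived from read
latency is 400us, and writes complete in about 50us. Every write is then
counted as good, so even if every single read misses the threshold,
bad_bio_cnt/bio_cnt tops out at 10%, well below 20% of bio_cnt, and the tg
keeps being treated as idle while others dispatch more ios.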
 
> Signed-off-by: Joseph Qi 
> ---
>  block/blk-throttle.c | 134 
> ++-
>  1 file changed, 79 insertions(+), 55 deletions(-)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 17816a0..0897378 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -215,9 +215,9 @@ struct throtl_data
>  
>   unsigned int scale;
>  
> - struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
> - struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
> - struct latency_bucket __percpu *latency_buckets;
> + struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
> + struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
> + struct latency_bucket __percpu *latency_buckets[2];
>   unsigned long last_calculate_time;
>   unsigned long filtered_latency;
>  
> @@ -2040,10 +2040,10 @@ static void blk_throtl_update_idletime(struct 
> throtl_grp *tg)
>  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
>  static void throtl_update_latency_buckets(struct throtl_data *td)
>  {
> - struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
> - int i, cpu;
> - unsigned long last_latency = 0;
> - unsigned long latency;
> + struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
> + int i, cpu, rw;
> + unsigned long last_latency[2] = { 0 };
> + unsigned long latency[2];
>  
>   if (!blk_queue_nonrot(td->queue))
>   return;
> @@ -2052,56 +2052,67 @@ static void throtl_update_latency_buckets(struct 
> throtl_data *td)
>   td->last_calculate_time = jiffies;
>  
>   memset(avg_latency, 0, sizeof(avg_latency));
> - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
> - struct latency_bucket *tmp = &td->tmp_buckets[i];
> -
> - for_each_possible_cpu(cpu) {
> - struct latency_bucket *bucket;
> -
> - /* this isn't race free, but ok in practice */
> - bucket = per_cpu_ptr(td->latency_buckets, cpu);
> - tmp->total_latency += bucket[i].total_latency;
> - tmp->samples += bucket[i].samples;
> - bucket[i].total_latency = 0;
> - bucket[i].samples = 0;
> - }
> + for (rw = READ; rw <= WRITE; rw++) {
> + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
> + struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
> +
> + for_each_possible_cpu(cpu) {
> + struct latency_bucket *bucket;
> +
> + /* this isn't race free, but ok in practice */
> + bucket = per_cpu_ptr(td->latency_buckets[rw],
> + cpu);
> + tmp->total_latency += bucket[i].total_latency;
> + tmp->samples += bucket[i].samples;
> + bucket[i].total_latency = 0;
> + bucket[i].samples = 0;
> + }
>  
> - if (tmp->samples >= 32) {
> - int samples = tmp->samples;
> + if (tmp->samples >= 32) {
> + int samples = tmp->samples;
>  
> - latency = tmp->total_latency;
> + latency[rw] = tmp->total_latency;
>  
> - tmp->total_latency = 0;
> - tmp->samples = 0;
> - latency /= samples;
> - if (latency == 0)
> - continue;
> - avg_latency[i].latency = latency;
> + tmp->total_latency = 0;
> + tmp->samples = 0;
> + latency[rw] /= samples;
> + if (latency[rw] == 0)
> + continue;
> +  

Re: [PATCH v8 03/10] md: Neither resync nor reshape while the system is frozen

2017-10-10 Thread Shaohua Li
On Tue, Oct 10, 2017 at 11:33:06PM +, Bart Van Assche wrote:
> On Tue, 2017-10-10 at 15:30 -0700, Shaohua Li wrote:
> > On Tue, Oct 10, 2017 at 02:03:39PM -0700, Bart Van Assche wrote:
> > > Some people use the md driver on laptops and use the suspend and
> > > resume functionality. Since it is essential that submitting of
> > > new I/O requests stops before a hibernation image is created,
> > > interrupt the md resync and reshape actions if the system is
> > > being frozen. Note: the resync and reshape will restart after
> > > the system is resumed and a message similar to the following
> > > will appear in the system log:
> > > 
> > > md: md0: data-check interrupted.
> > 
> > Where do we restart resync and reshape?
> 
> Hello Shaohua,
> 
> My understanding of the md driver is as follows:
> - Before any write occurs, md_write_start() is called. That function clears
>   mddev->safemode and checks mddev->in_sync. If that variable is set, it is
>   cleared and md_write_start() wakes up mddev->thread and waits until the
>   superblock has been written to disk.
> - All mddev->thread implementations call md_check_recovery(). That function
>   updates the superblock by calling md_update_sb() if needed.
>   md_check_recovery() also starts a resync thread if the array is not in
>   sync. See also the comment block above md_check_recovery().
> - md_do_sync() performs the resync. If it completes successfully the
>   "in_sync" state is set (set_in_sync()). If it is interrupted the "in_sync"
>   state is not modified.
> - super_90_sync() sets the MD_SB_CLEAN flag in the on-disk superblock if the
>   "in_sync" flag is set.
> 
> In other words: if the array is in sync the MD_SB_CLEAN flag is set in the
> on-disk superblock. If a resync is needed that flag is not set. The
> MD_SB_CLEAN flag is checked when the superblock is loaded. If that flag is
> not set a resync is started.

The problem is that __md_stop_writes sets some bits like MD_RECOVERY_FROZEN, which
will prevent md_check_recovery from restarting the resync/reshape. I think we need
to preserve mddev->recovery in suspend prepare and restore it after resume.

Thanks,
Shaohua


Re: [PATCH v2 0/9] Nowait support for stacked block devices

2017-10-10 Thread Shaohua Li
On Fri, Oct 06, 2017 at 07:01:19AM -0500, Goldwyn Rodrigues wrote:
> 
> 
> On 10/05/2017 12:19 PM, Shaohua Li wrote:
> > On Wed, Oct 04, 2017 at 08:55:02AM -0500, Goldwyn Rodrigues wrote:
> >> This is a continuation of the nowait support which was incorporated
> >> a while back. We introduced REQ_NOWAIT which would return immediately
> >> if the call would block at the block layer. Request based-devices
> >> do not wait. However, bio based devices (the ones which exclusively
> >> call make_request_fn) need to be trained to handle REQ_NOWAIT.
> >>
> >> This effort covers the devices under MD and DM which would block
> >> for any reason. If there should be more devices or situations
> >> which need to be covered, please let me know.
> >>
> >> The problem with partial writes discussed during v1 turned out
> >> to be a bug in partial writes during direct I/O and is fixed
> >> by the submitted patch[1].
> >>
> >> Changes since v1:
> >>  - mddev to return early in case the device is suspended, within the md 
> >> code as opposed to ->make_request()
> >>  - Check for nowait support with all the lower devices. Same with if 
> >> adding a device which does not support nowait.
> >>  - Nowait under each raid is checked before the final I/O submission for 
> >> the entire I/O.
> >>
> >> [1] https://patchwork.kernel.org/patch/9979887/
> > 
> > Does this fix the partial IO issue we discussed before? It doesn't look like
> > it to me. The
> > partial IO bailed out could be any part of an IO, so simply returning the
> > succeeded size doesn't help. Am I missing anything? I didn't follow the
> > discussion; maybe Jens knows.
> > 
> 
> If the partial IO bailed out is any part of the IO, isn't it supposed to
> return the size of the IO that succeeded _so far_? If a latter part of the IO
> succeeds (with a failure in between), what are you supposed to return to the
> user in case of direct write()s? Would that even be correct in case it
> is a file overwrite?

I wasn't arguing about the return value. To me the partial IO issue can't be
fixed simply by any choice of 'return value'.


Re: [PATCH v8 03/10] md: Neither resync nor reshape while the system is frozen

2017-10-10 Thread Shaohua Li
On Tue, Oct 10, 2017 at 02:03:39PM -0700, Bart Van Assche wrote:
> Some people use the md driver on laptops and use the suspend and
> resume functionality. Since it is essential that submitting of
> new I/O requests stops before a hibernation image is created,
> interrupt the md resync and reshape actions if the system is
> being frozen. Note: the resync and reshape will restart after
> the system is resumed and a message similar to the following
> will appear in the system log:
> 
> md: md0: data-check interrupted.

Where do we restart resync and reshape?
 
> Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
> Cc: Shaohua Li <s...@kernel.org>
> Cc: linux-r...@vger.kernel.org
> Cc: Ming Lei <ming@redhat.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: Hannes Reinecke <h...@suse.com>
> Cc: Johannes Thumshirn <jthumsh...@suse.de>
> ---
>  drivers/md/md.c | 30 +-
>  1 file changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index b99584e5d6b1..d712d3320c1d 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -66,6 +66,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  
>  #include 
>  #include "md.h"
> @@ -7439,6 +7441,7 @@ static int md_thread(void *arg)
>*/
>  
>   allow_signal(SIGKILL);
> + set_freezable();
>   while (!kthread_should_stop()) {
>  
>   /* We need to wait INTERRUPTIBLE so that
> @@ -7449,7 +7452,7 @@ static int md_thread(void *arg)
>   if (signal_pending(current))
>   flush_signals(current);
>  
> - wait_event_interruptible_timeout
> + wait_event_freezable_timeout
>   (thread->wqueue,
>    test_bit(THREAD_WAKEUP, &thread->flags)
>|| kthread_should_stop() || kthread_should_park(),
> @@ -8963,6 +8966,29 @@ static void md_stop_all_writes(void)
>   mdelay(1000*1);
>  }
>  
> +/*
> + * Ensure that neither resyncing nor reshaping occurs while the system is
> + * frozen.
> + */
> +static int md_notify_pm(struct notifier_block *bl, unsigned long state,
> + void *unused)
> +{
> + pr_debug("%s: state = %ld\n", __func__, state);
> +
> + switch (state) {
> + case PM_HIBERNATION_PREPARE:
> + case PM_SUSPEND_PREPARE:
> + case PM_RESTORE_PREPARE:
> + md_stop_all_writes();
> + break;
> + }
> + return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block md_pm_notifier = {
> + .notifier_call  = md_notify_pm,
> +};
> +
>  static int md_notify_reboot(struct notifier_block *this,
>   unsigned long code, void *x)
>  {
> @@ -9009,6 +9035,7 @@ static int __init md_init(void)
>   md_probe, NULL, NULL);
>  
>   register_reboot_notifier(&md_reboot_notifier);
> + register_pm_notifier(&md_pm_notifier);
>   raid_table_header = register_sysctl_table(raid_root_table);
>  
>   md_geninit();
> @@ -9248,6 +9275,7 @@ static __exit void md_exit(void)
>  
>   unregister_blkdev(MD_MAJOR,"md");
>   unregister_blkdev(mdp_major, "mdp");
> + unregister_pm_notifier(&md_pm_notifier);
>   unregister_reboot_notifier(&md_reboot_notifier);
>   unregister_sysctl_table(raid_table_header);
>  
> -- 
> 2.14.2
> 


Re: [PATCH] blk-throttle: fix null pointer dereference while throttling writeback IOs

2017-10-10 Thread Shaohua Li
On Tue, Oct 10, 2017 at 12:48:38PM -0600, Jens Axboe wrote:
> On 10/10/2017 12:13 PM, Shaohua Li wrote:
> > On Tue, Oct 10, 2017 at 11:13:32AM +0800, xuejiufei wrote:
> >> From: Jiufei Xue <jiufei@alibaba-inc.com>
> >>
> >> A null pointer dereference can occur when blkcg is removed manually
> >> with writeback IOs inflight. This is caused by the following case:
> >>
> >> The writeback kworker submits the bio and sets bio->bi_cg_private to the tg
> >> in blk_throtl_assoc_bio().
> >> Then we remove the block cgroup manually; the blkg and tg would be
> >> freed if there is no request inflight.
> >> When the submitted bio comes back, blk_throtl_bio_endio() fetches the tg,
> >> which was already freed.
> >>
> >> Fix this by increasing the refcount of the blkg in the function
> >> blk_throtl_assoc_bio() so that the blkg will not be freed until
> >> bio_endio is called.
> >>
> >> Signed-off-by: Jiufei Xue <jiufei@alibaba-inc.com>
> >> ---
> >>  block/blk-throttle.c | 12 ++--
> >>  1 file changed, 10 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> >> index 17816a0..d80c3f0 100644
> >> --- a/block/blk-throttle.c
> >> +++ b/block/blk-throttle.c
> >> @@ -2112,8 +2112,12 @@ static inline void 
> >> throtl_update_latency_buckets(struct throtl_data *td)
> >>  static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
> >>  {
> >>  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
> >> -  if (bio->bi_css)
> >> +  if (bio->bi_css) {
> >> +  if (bio->bi_cg_private)
> >> +  blkg_put(tg_to_blkg(bio->bi_cg_private));
> >>bio->bi_cg_private = tg;
> >> +  blkg_get(tg_to_blkg(tg));
> >> +  }
> >>    blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
> >>  #endif
> >>  }
> >> @@ -2283,8 +2287,10 @@ void blk_throtl_bio_endio(struct bio *bio)
> >>  
> >>    start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
> >>finish_time = __blk_stat_time(finish_time_ns) >> 10;
> >> -  if (!start_time || finish_time <= start_time)
> >> +  if (!start_time || finish_time <= start_time) {
> >> +  blkg_put(tg_to_blkg(tg));
> >>return;
> >> +  }
> >>  
> >>lat = finish_time - start_time;
> >>/* this is only for bio based driver */
> >> @@ -2314,6 +2320,8 @@ void blk_throtl_bio_endio(struct bio *bio)
> >>tg->bio_cnt /= 2;
> >>tg->bad_bio_cnt /= 2;
> >>}
> >> +
> >> +  blkg_put(tg_to_blkg(tg));
> >>  }
> >>  #endif
> > 
> > Reviewed-by: Shaohua Li <s...@fb.com>
> 
> I was going to queue this up for 4.15, but was wondering if there's a
> strong reason to include it for 4.14 instead?

Either is ok with me; this isn't easy to trigger, but on the other hand it's
quite safe.


Re: [PATCH V2 3/3] blockcg: export latency info for each cgroup

2017-10-10 Thread Shaohua Li
On Wed, Oct 11, 2017 at 01:35:51AM +0800, weiping zhang wrote:
> On Fri, Oct 06, 2017 at 05:56:01PM -0700, Shaohua Li wrote:
> > From: Shaohua Li <s...@fb.com>
> > 
> > Export the latency info to user. The latency is a good sign to indicate
> > if IO is congested or not. User can use the info to make decisions like
> > adjust cgroup settings.
> Hi Shaohua,
> How do you check whether IO is congested or not by latency? Different SSDs have
> different latencies, especially with mixed sequential and random IO
> operations.

There is no magic here; you should know the SSD characteristics first. The idea
is to observe the latency when the system isn't overcommitted, for example by
running the workload in a one-cgroup setup, and then use the observed latency to
guide the configuration for multi-cgroup settings or to check whether IO is
healthy in the cgroups.
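
As a concrete (entirely assumed) example: if a workload running alone in one
cgroup on an otherwise idle SSD shows a mean read latency of about 200us in
these per-cgroup stats, then in a shared multi-cgroup setup a sustained mean
several times that baseline is a reasonable sign of congestion, and an io.low
latency target for the protected cgroup would be set somewhat above the 200us
baseline rather than at an arbitrary value.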
 
> > 
> > Existing io.stat shows accumulated IO bytes and requests, but
> > accumulated value for latency doesn't make much sense. This patch
> How should I understand "accumulated value for latency doesn't make much
> sense"? Could you give an example?

rbytes and wbytes in io.stat are accumulated values. Such an accumulated value
for latency isn't valuable.

> I think iostat's r_await and w_await
> are nearly equal to rlat_mean and wlat_mean if not too many
> requests are starved.

yes, pretty similar. io.stat is per-cgroup though

Thanks,
Shaohua


Re: [PATCH] blk-throttle: fix null pointer dereference while throttling writeback IOs

2017-10-10 Thread Shaohua Li
On Tue, Oct 10, 2017 at 11:13:32AM +0800, xuejiufei wrote:
> From: Jiufei Xue <jiufei@alibaba-inc.com>
> 
> A null pointer dereference can occur when blkcg is removed manually
> with writeback IOs inflight. This is caused by the following case:
> 
> The writeback kworker submits the bio and sets bio->bi_cg_private to the tg
> in blk_throtl_assoc_bio().
> Then we remove the block cgroup manually; the blkg and tg would be
> freed if there is no request inflight.
> When the submitted bio comes back, blk_throtl_bio_endio() fetches the tg,
> which was already freed.
> 
> Fix this by increasing the refcount of the blkg in the function
> blk_throtl_assoc_bio() so that the blkg will not be freed until
> bio_endio is called.
> 
> Signed-off-by: Jiufei Xue <jiufei@alibaba-inc.com>
> ---
>  block/blk-throttle.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 17816a0..d80c3f0 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -2112,8 +2112,12 @@ static inline void 
> throtl_update_latency_buckets(struct throtl_data *td)
>  static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
>  {
>  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
> - if (bio->bi_css)
> + if (bio->bi_css) {
> + if (bio->bi_cg_private)
> + blkg_put(tg_to_blkg(bio->bi_cg_private));
>   bio->bi_cg_private = tg;
> + blkg_get(tg_to_blkg(tg));
> + }
>   blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
>  #endif
>  }
> @@ -2283,8 +2287,10 @@ void blk_throtl_bio_endio(struct bio *bio)
>  
>   start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
>   finish_time = __blk_stat_time(finish_time_ns) >> 10;
> - if (!start_time || finish_time <= start_time)
> + if (!start_time || finish_time <= start_time) {
> + blkg_put(tg_to_blkg(tg));
>   return;
> + }
>  
>   lat = finish_time - start_time;
>   /* this is only for bio based driver */
> @@ -2314,6 +2320,8 @@ void blk_throtl_bio_endio(struct bio *bio)
>   tg->bio_cnt /= 2;
>   tg->bad_bio_cnt /= 2;
>   }
> +
> + blkg_put(tg_to_blkg(tg));
>  }
>  #endif

Reviewed-by: Shaohua Li <s...@fb.com>


[PATCH V2 1/3] blk-stat: delete useless code

2017-10-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Fix two issues:
- the per-cpu stat flush is unnecessary; nobody uses the per-cpu stat except
  to sum it into the global stat. We can do the calculation there (see the
  small example after this list). The flush just wastes cpu time.
- some fields are signed int/s64. I don't see the point.
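
To spell out what "do the calculation there" means: the per-cpu side only
keeps a running sum (the "batch") and a sample count, and the weighted mean is
computed when the per-cpu stat is folded into the global one. A standalone
illustration with simplified names (not the kernel code itself):

#include <stdio.h>

struct sample_stat {
	unsigned long long sum;		/* per-cpu: sum of samples (the batch) */
	unsigned long long mean;	/* global side only */
	unsigned int nr_samples;
};

/* same weighting as the new blk_stat_sum() */
void fold(struct sample_stat *dst, const struct sample_stat *src)
{
	if (!src->nr_samples)
		return;
	dst->mean = (src->sum + dst->mean * dst->nr_samples) /
		    (dst->nr_samples + src->nr_samples);
	dst->nr_samples += src->nr_samples;
}

int main(void)
{
	struct sample_stat global = { 0, 0, 0 };
	struct sample_stat cpu0 = { .sum = 300, .nr_samples = 3 };	/* 100, 100, 100 */
	struct sample_stat cpu1 = { .sum = 160, .nr_samples = 1 };	/* 160 */

	fold(&global, &cpu0);
	fold(&global, &cpu1);
	printf("mean=%llu nr=%u\n", global.mean, global.nr_samples);	/* mean=115 nr=4 */
	return 0;
}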

Cc: Omar Sandoval <osan...@fb.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-stat.c  | 45 +++--
 include/linux/blk_types.h |  5 ++---
 2 files changed, 9 insertions(+), 41 deletions(-)

diff --git a/block/blk-stat.c b/block/blk-stat.c
index c52356d..3a2f3c9 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -11,8 +11,6 @@
 #include "blk-mq.h"
 #include "blk.h"
 
-#define BLK_RQ_STAT_BATCH  64
-
 struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
@@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat)
 {
stat->min = -1ULL;
stat->max = stat->nr_samples = stat->mean = 0;
-   stat->batch = stat->nr_batch = 0;
-}
-
-static void blk_stat_flush_batch(struct blk_rq_stat *stat)
-{
-   const s32 nr_batch = READ_ONCE(stat->nr_batch);
-   const s32 nr_samples = READ_ONCE(stat->nr_samples);
-
-   if (!nr_batch)
-   return;
-   if (!nr_samples)
-   stat->mean = div64_s64(stat->batch, nr_batch);
-   else {
-   stat->mean = div64_s64((stat->mean * nr_samples) +
-   stat->batch,
-   nr_batch + nr_samples);
-   }
-
-   stat->nr_samples += nr_batch;
-   stat->nr_batch = stat->batch = 0;
+   stat->batch = 0;
 }
 
+/* src is a per-cpu stat, mean isn't initialized */
 static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 {
-   blk_stat_flush_batch(src);
-
if (!src->nr_samples)
return;
 
dst->min = min(dst->min, src->min);
dst->max = max(dst->max, src->max);
 
-   if (!dst->nr_samples)
-   dst->mean = src->mean;
-   else {
-   dst->mean = div64_s64((src->mean * src->nr_samples) +
-   (dst->mean * dst->nr_samples),
-   dst->nr_samples + src->nr_samples);
-   }
+   dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
+   dst->nr_samples + src->nr_samples);
+
dst->nr_samples += src->nr_samples;
 }
 
@@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 
value)
 {
stat->min = min(stat->min, value);
stat->max = max(stat->max, value);
-
-   if (stat->batch + value < stat->batch ||
-   stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
-   blk_stat_flush_batch(stat);
-
stat->batch += value;
-   stat->nr_batch++;
+   stat->nr_samples++;
 }
 
 void blk_stat_add(struct request *rq)
@@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq)
struct blk_stat_callback *cb;
struct blk_rq_stat *stat;
int bucket;
-   s64 now, value;
+   u64 now, value;
 
now = __blk_stat_time(ktime_to_ns(ktime_get()));
	if (now < blk_stat_time(&rq->issue_stat))
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a2d2aa7..3385c89 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
 }
 
 struct blk_rq_stat {
-   s64 mean;
+   u64 mean;
u64 min;
u64 max;
-   s32 nr_samples;
-   s32 nr_batch;
+   u32 nr_samples;
u64 batch;
 };
 
-- 
2.9.5



[PATCH V2 3/3] blockcg: export latency info for each cgroup

2017-10-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Export the latency info to user. The latency is a good sign to indicate
if IO is congested or not. User can use the info to make decisions like
adjust cgroup settings.

Existing io.stat shows accumulated IO bytes and requests, but
accumulated value for latency doesn't make much sense. This patch
exports the latency info in a 100ms interval.

To reduce overhead, latency info of children is propagated to parents
every 10ms. This means a parent's latency could miss up to 10ms worth of its
children's info within a 100ms window. This should be ok, as we don't need
precise latency info.
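
Roughly, the windowing works like the standalone sketch below. The struct and
helper names are made up for illustration and the propagation is shown for a
single level only; the real code operates on the per-cpu blk_rq_stat embedded
in each blkg:

#include <stdio.h>

typedef unsigned long long u64;

#define PROPAGATE_NS (10ULL * 1000 * 1000)	/* fold child into parent every 10ms */
#define EXPORT_NS    (100ULL * 1000 * 1000)	/* io.stat reports the last 100ms window */

struct lat_win {
	u64 win_sum, win_nr, mean;	/* current reporting window + published mean */
	u64 pend_sum, pend_nr;		/* samples not yet folded into the parent */
	u64 last_propagate, last_export;
};

void lat_add(struct lat_win *g, struct lat_win *parent, u64 lat, u64 now)
{
	g->win_sum += lat;
	g->win_nr++;
	g->pend_sum += lat;
	g->pend_nr++;

	/* children -> parent at most once per 10ms */
	if (parent && now - g->last_propagate >= PROPAGATE_NS) {
		parent->win_sum += g->pend_sum;
		parent->win_nr += g->pend_nr;
		g->pend_sum = g->pend_nr = 0;
		g->last_propagate = now;
	}

	/* close the 100ms window: publish the mean and start over */
	if (now - g->last_export >= EXPORT_NS) {
		g->mean = g->win_nr ? g->win_sum / g->win_nr : 0;
		g->win_sum = g->win_nr = 0;
		g->last_export = now;
	}
}

int main(void)
{
	struct lat_win parent = { 0 }, child = { 0 };
	u64 ms;

	for (ms = 0; ms < 200; ms++)		/* one 2000ns IO per ms */
		lat_add(&child, &parent, 2000, ms * 1000 * 1000);
	printf("child mean=%llu parent samples=%llu\n", child.mean, parent.win_nr);
	return 0;
}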

A micro benchmark running fio test against null_blk in a sixth level
cgroup doesn't show obvious regression. perf shows a little bit overhead
in blk_stat_add (~1%) and blkg_lookup (~1%), which is unavoidable right
now.

With this patch, the io.stat will show:
8:0 rbytes=7282688 wbytes=0 rios=83 wios=0 rlat_mean=2720 rlat_min=183 rlat_max=14880 wlat_mean=0 wlat_min=0 wlat_max=0
The new fields display the read/write average/minimum/maximum latency within
the last 100ms window. The latency is in microseconds (us).

Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-cgroup.c |  29 +-
 block/blk-stat.c   | 135 -
 block/blk.h|   5 ++
 include/linux/blk-cgroup.h |   9 +++
 4 files changed, 175 insertions(+), 3 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56ba..89c5075 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -78,6 +78,7 @@ static void blkg_free(struct blkcg_gq *blkg)
 
	blkg_rwstat_exit(&blkg->stat_ios);
	blkg_rwstat_exit(&blkg->stat_bytes);
+   blkg_rq_stat_exit(blkg);
kfree(blkg);
 }
 
@@ -104,6 +105,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
	    blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
goto err_free;
 
+   if (blkg_rq_stat_init(blkg, gfp_mask))
+   goto err_free;
blkg->q = q;
	INIT_LIST_HEAD(&blkg->q_node);
blkg->blkcg = blkcg;
@@ -952,6 +955,8 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
const char *dname;
struct blkg_rwstat rwstat;
u64 rbytes, wbytes, rios, wios;
+   u64 rmean = 0, rmin = 0, rmax = 0;
+   u64 wmean = 0, wmin = 0, wmax = 0;
 
dname = blkg_dev_name(blkg);
if (!dname)
@@ -969,11 +974,30 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
	rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
	wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
 
+   if (blkg->rq_stat.stat[0].nr_samples) {
+   rmean = blkg->rq_stat.stat[0].mean;
+   do_div(rmean, 1000);
+   rmin = blkg->rq_stat.stat[0].min;
+   do_div(rmin, 1000);
+   rmax = blkg->rq_stat.stat[0].max;
+   do_div(rmax, 1000);
+   }
+   if (blkg->rq_stat.stat[1].nr_samples) {
+   wmean = blkg->rq_stat.stat[1].mean;
+   do_div(wmean, 1000);
+   wmin = blkg->rq_stat.stat[1].min;
+   do_div(wmin, 1000);
+   wmax = blkg->rq_stat.stat[1].max;
+   do_div(wmax, 1000);
+   }
spin_unlock_irq(blkg->q->queue_lock);
 
if (rbytes || wbytes || rios || wios)
-		seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
-			   dname, rbytes, wbytes, rios, wios);
+		seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu "
+			   "rlat_mean=%llu rlat_min=%llu rlat_max=%llu "
+			   "wlat_mean=%llu wlat_min=%llu wlat_max=%llu\n",
+			   dname, rbytes, wbytes, rios, wios,
+			   rmean, rmin, rmax, wmean, wmin, wmax);
}
 
rcu_read_unlock();
@@ -1167,6 +1191,7 @@ int blkcg_init_queue(struct request_queue *q)
blkg_destroy_all(q);
spin_unlock_irq(q->queue_lock);
}
+   blk_stat_enable_accounting(q);
return ret;
 }
 
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 3a2f3c9..12fd356 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "blk-stat.h"
 #include "blk-mq.h"
@@ -47,6 +48,135 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 
value)
stat->nr_samples++;
 }
 
+#ifdef CONFIG_BLK_CGROUP
+#define BLKCG_FLUSH_WINDOW (1000 * 1000 * 100)
+#define BLKCG_PROPAGATE_WINDOW (1000 * 1000 * 10)
+static void blkg_rq_stat_flush_per

[PATCH V2 0/3] block: export latency info for cgroups

2017-10-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Hi,

Latency info is a good sign to determine if IO is healthy. The patches export
such info to the cgroup io.stat file.

I sent the first patch separately before, but since the later patches depend on
it, I include it here.

Thanks,
Shaohua

V1->V2: improve the scalability

Shaohua Li (3):
  blk-stat: delete useless code
  block: set request_list for request
  blockcg: export latency info for each cgroup

 block/blk-cgroup.c |  29 +++-
 block/blk-core.c   |   2 +-
 block/blk-mq.c |   5 ++
 block/blk-stat.c   | 178 +++--
 block/blk.h|   5 ++
 include/linux/blk-cgroup.h |   9 +++
 include/linux/blk_types.h  |   5 +-
 7 files changed, 189 insertions(+), 44 deletions(-)

-- 
2.9.5



[PATCH V2 2/3] block: set request_list for request

2017-10-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

The legacy queue sets a request's request_list; mq doesn't. This makes mq do
the same thing, so we can find the cgroup of a request. Note, we really only
use the blkg field of request_list; it's pointless to allocate a mempool for
request_list in the mq case.
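
For illustration, with the request_list set on mq requests as above, resolving
a request's blkcg no longer needs its bios. rq_blkcg() below is a made-up
helper, not something this patch adds; it only relies on blk_rq_rl() from
block/blk.h and the blkg field of struct request_list:

#ifdef CONFIG_BLK_CGROUP
/* made-up helper: resolve a request's blkcg via its request_list */
static inline struct blkcg *rq_blkcg(struct request *rq)
{
	struct request_list *rl = blk_rq_rl(rq);	/* rq->rl, set at bio-to-request time */

	return rl ? rl->blkg->blkcg : NULL;
}
#endif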

Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-core.c | 2 +-
 block/blk-mq.c   | 5 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index adb064a..f88afd1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -718,7 +718,7 @@ static void free_request_size(void *element, void *data)
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask)
 {
-   if (unlikely(rl->rq_pool))
+   if (unlikely(rl->rq_pool) || q->mq_ops)
return 0;
 
rl->q = q;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7f01d69..c3b5876 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -481,6 +481,9 @@ void blk_mq_free_request(struct request *rq)
 
	wbt_done(q->rq_wb, &rq->issue_stat);
 
+   if (blk_rq_rl(rq))
+   blk_put_rl(blk_rq_rl(rq));
+
	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
if (rq->tag != -1)
@@ -1504,6 +1507,8 @@ static void blk_mq_bio_to_request(struct request *rq, 
struct bio *bio)
 {
blk_init_request_from_bio(rq, bio);
 
+   blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
+
blk_account_io_start(rq, true);
 }
 
-- 
2.9.5



[PATCH] blk-stat: delete useless code

2017-10-05 Thread Shaohua Li
Fix two issues:
- the per-cpu stat flush is unnecessary; nobody uses the per-cpu stat except
  to sum it into the global stat. We can do the calculation there. The flush
  just wastes cpu time.
- some fields are signed int/s64. I don't see the point.

Cc: Omar Sandoval <osan...@fb.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-stat.c  | 45 +++--
 include/linux/blk_types.h |  5 ++---
 2 files changed, 9 insertions(+), 41 deletions(-)

diff --git a/block/blk-stat.c b/block/blk-stat.c
index c52356d..3a2f3c9 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -11,8 +11,6 @@
 #include "blk-mq.h"
 #include "blk.h"
 
-#define BLK_RQ_STAT_BATCH  64
-
 struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
@@ -23,45 +21,21 @@ static void blk_stat_init(struct blk_rq_stat *stat)
 {
stat->min = -1ULL;
stat->max = stat->nr_samples = stat->mean = 0;
-   stat->batch = stat->nr_batch = 0;
-}
-
-static void blk_stat_flush_batch(struct blk_rq_stat *stat)
-{
-   const s32 nr_batch = READ_ONCE(stat->nr_batch);
-   const s32 nr_samples = READ_ONCE(stat->nr_samples);
-
-   if (!nr_batch)
-   return;
-   if (!nr_samples)
-   stat->mean = div64_s64(stat->batch, nr_batch);
-   else {
-   stat->mean = div64_s64((stat->mean * nr_samples) +
-   stat->batch,
-   nr_batch + nr_samples);
-   }
-
-   stat->nr_samples += nr_batch;
-   stat->nr_batch = stat->batch = 0;
+   stat->batch = 0;
 }
 
+/* src is a per-cpu stat, mean isn't initialized */
 static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 {
-   blk_stat_flush_batch(src);
-
if (!src->nr_samples)
return;
 
dst->min = min(dst->min, src->min);
dst->max = max(dst->max, src->max);
 
-   if (!dst->nr_samples)
-   dst->mean = src->mean;
-   else {
-   dst->mean = div64_s64((src->mean * src->nr_samples) +
-   (dst->mean * dst->nr_samples),
-   dst->nr_samples + src->nr_samples);
-   }
+   dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples,
+   dst->nr_samples + src->nr_samples);
+
dst->nr_samples += src->nr_samples;
 }
 
@@ -69,13 +43,8 @@ static void __blk_stat_add(struct blk_rq_stat *stat, u64 
value)
 {
stat->min = min(stat->min, value);
stat->max = max(stat->max, value);
-
-   if (stat->batch + value < stat->batch ||
-   stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
-   blk_stat_flush_batch(stat);
-
stat->batch += value;
-   stat->nr_batch++;
+   stat->nr_samples++;
 }
 
 void blk_stat_add(struct request *rq)
@@ -84,7 +53,7 @@ void blk_stat_add(struct request *rq)
struct blk_stat_callback *cb;
struct blk_rq_stat *stat;
int bucket;
-   s64 now, value;
+   u64 now, value;
 
now = __blk_stat_time(ktime_to_ns(ktime_get()));
	if (now < blk_stat_time(&rq->issue_stat))
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a2d2aa7..3385c89 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -329,11 +329,10 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
 }
 
 struct blk_rq_stat {
-   s64 mean;
+   u64 mean;
u64 min;
u64 max;
-   s32 nr_samples;
-   s32 nr_batch;
+   u32 nr_samples;
u64 batch;
 };
 
-- 
2.9.5



Re: [PATCH v2 0/9] Nowait support for stacked block devices

2017-10-05 Thread Shaohua Li
On Wed, Oct 04, 2017 at 08:55:02AM -0500, Goldwyn Rodrigues wrote:
> This is a continuation of the nowait support which was incorporated
> a while back. We introduced REQ_NOWAIT which would return immediately
> if the call would block at the block layer. Request based-devices
> do not wait. However, bio based devices (the ones which exclusively
> call make_request_fn) need to be trained to handle REQ_NOWAIT.
> 
> This effort covers the devices under MD and DM which would block
> for any reason. If there should be more devices or situations
> which need to be covered, please let me know.
> 
> The problem with partial writes discussed during v1 turned out
> to be a bug in partial writes during direct I/O and is fixed
> by the submitted patch[1].
> 
> Changes since v1:
>  - mddev to return early in case the device is suspended, within the md code 
> as opposed to ->make_request()
>  - Check for nowait support with all the lower devices. Same with if adding a 
> device which does not support nowait.
>  - Nowait under each raid is checked before the final I/O submission for the 
> entire I/O.
> 
> [1] https://patchwork.kernel.org/patch/9979887/

Does this fix the partial IO issue we discussed before? It doesn't look like it
to me. The partial IO that is bailed out could be any part of an IO, so simply
returning the succeeded size doesn't help. Am I missing anything? I didn't
follow the discussion, maybe Jens knew.

Thanks,
Shaohua


Re: [RFC 2/2] blockcg: export latency info for each cgroup

2017-10-05 Thread Shaohua Li
On Wed, Oct 04, 2017 at 11:04:39AM -0700, Tejun Heo wrote:
> Hello,
> 
> On Wed, Oct 04, 2017 at 10:41:20AM -0700, Shaohua Li wrote:
> > Export the latency info to user. The latency is a good sign to indicate
> > if IO is congested or not. User can use the info to make decisions like
> > adjust cgroup settings.
> 
> Nice, yeah, this can be really useful.
> 
> > Existing io.stat shows accumulated IO bytes and requests, but
> > accumulated value for latency doesn't make much sense. This patch
> > exports the latency info in a 100ms interval.
> 
> We probably want running avg of a few intervals.
> 
> > A micro benchmark running fio test against null_blk in a third level
> > cgroup shows around 4% regression. If I only do the latency accouting
> 
> Heh, that's quite a bit.
> 
> > for leaf cgroup, the regression seems to disappear. So not quite sure if
> > we should do the accounting for intermediate nodes or if the whole thing
> > should be enabled optionally.
> 
> I suspect that if we make the calculations and propagations lazy, the
> overhead will likely become negligible.  Can you please take a look at
> how the basic resource accounting code in kernel/cgroup/stat.c?  It's
> an infra code for collecting stats without burdening hot path.  It's
> currently only used for CPU but we can easily expand it to cover other
> resources.  Having to do running avgs might interfere with the lazy
> propagation a bit but I think we can make it work if we make the right
> trade-offs.

Looks like that is similar to how io.stat exposes bytes/ios: it does the
propagation at the time the user reads the status file. However, doing the same
for latency is meaningless; we shouldn't accumulate latency over a long time. If
we want to do it lazily, the alternatives are:
- export total requests and total latency. The user can calculate the average
  over any interval. We can't export min/max latency then.
- export avg/min/max since the last time the user read io.stat. We clear all
  statistics once the user reads io.stat and re-account from scratch (see the
  sketch below).
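
For the second alternative, the bookkeeping would be something like the sketch
below (made-up names, plain C, just to show the reset-on-read idea):

#include <string.h>

typedef unsigned long long u64;

/* stats gathered since the last time io.stat was read */
struct lat_since_read {
	u64 min, max, sum, nr;
};

void lat_account(struct lat_since_read *s, u64 lat)
{
	if (!s->nr || lat < s->min)
		s->min = lat;
	if (lat > s->max)
		s->max = lat;
	s->sum += lat;
	s->nr++;
}

/* what the io.stat read side would report, then re-account from scratch */
void lat_read_and_reset(struct lat_since_read *s, u64 *avg, u64 *min, u64 *max)
{
	*avg = s->nr ? s->sum / s->nr : 0;
	*min = s->nr ? s->min : 0;
	*max = s->max;
	memset(s, 0, sizeof(*s));
}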

Thanks,
Shaohua


Re: [RFC 1/2] block: record blkcss in request

2017-10-04 Thread Shaohua Li
On Wed, Oct 04, 2017 at 10:51:49AM -0700, Tejun Heo wrote:
> Hello, Shaohua.
> 
> On Wed, Oct 04, 2017 at 10:41:19AM -0700, Shaohua Li wrote:
> > From: Shaohua Li <s...@fb.com>
> > 
> > Currently we record block css info in bio but not in request. Normally
> > we can get a request's css from its bio, but in some situations, we
> > can't access request's bio, for example, after blk_update_request. Add
> > the css to request, so we can access css through the life cycle of a
> > request.
> 
> Each request comes from cgroup specific request_list, so given a
> request, its blkcg membership can is accessible through
> 
>   request->rl->blkg->blkcg

Nice. Seems only used for legacy queue though.

Thanks,
Shaohua


[RFC 1/2] block: record blkcss in request

2017-10-04 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Currently we record block css info in bio but not in request. Normally
we can get a request's css from its bio, but in some situations, we
can't access request's bio, for example, after blk_update_request. Add
the css to request, so we can access css through the life cycle of a
request.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-core.c   | 12 
 include/linux/blkdev.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index adb064a..07f8f7e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1551,6 +1551,11 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
return;
}
 
+#ifdef CONFIG_BLK_CGROUP
+   if (req->css)
+   css_put(req->css);
+#endif
+
lockdep_assert_held(q->queue_lock);
 
blk_pm_put_request(req);
@@ -3094,6 +3099,13 @@ void blk_rq_bio_prep(struct request_queue *q, struct 
request *rq,
rq->rq_flags |= RQF_QUIET;
if (bio->bi_disk)
rq->rq_disk = bio->bi_disk;
+#ifdef CONFIG_BLK_CGROUP
+   rq->css = NULL;
+   if (bio->bi_css) {
+   rq->css = bio->bi_css;
+   css_get(rq->css);
+   }
+#endif
 }
 
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 02fa42d..cdd3aeb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -208,6 +208,7 @@ struct request {
struct request_list *rl;/* rl this rq is alloced from */
unsigned long long start_time_ns;
unsigned long long io_start_time_ns;/* when passed to hardware */
+   struct cgroup_subsys_state *css;
 #endif
/* Number of scatter-gather DMA addr+len pairs after
 * physical address coalescing is performed.
-- 
2.9.5



[RFC 0/2] block: export latency info for cgroups

2017-10-04 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Hi,

Latency info is a good sign to determine if IO is healthy. The patches export
such info to the cgroup io.stat file.

Thanks,
Shaohua

Shaohua Li (2):
  block: record blkcss in request
  blockcg: export latency info for each cgroup

 block/blk-cgroup.c | 29 +-
 block/blk-core.c   | 12 ++
 block/blk-stat.c   | 95 +-
 block/blk.h|  5 +++
 include/linux/blk-cgroup.h |  7 
 include/linux/blkdev.h |  1 +
 6 files changed, 146 insertions(+), 3 deletions(-)

-- 
2.9.5



[PATCH V3 2/3] block/loop: use FALLOC_FL_ZERO_RANGE for REQ_OP_WRITE_ZEROES

2017-10-04 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

REQ_OP_WRITE_ZEROES really means zero the data. And in blkdev_fallocate,
FALLOC_FL_ZERO_RANGE will retry (fall back to writing zeroes) but
FALLOC_FL_PUNCH_HOLE will not, even though the loop request doesn't have
BLKDEV_ZERO_NOFALLBACK set.

Signed-off-by: Shaohua Li <s...@fb.com>
Reviewed-by: Ming Lei <ming@redhat.com>
---
 drivers/block/loop.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6aa739f..7269341 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -426,6 +426,9 @@ static int lo_discard(struct loop_device *lo, struct 
request *rq, loff_t pos)
int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;
 
+   if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+   mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE;
+
if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
ret = -EOPNOTSUPP;
goto out;
-- 
2.9.5



[PATCH V3 0/3] block/loop: handle discard/zeroout error

2017-10-04 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Fix some problems when setting up loop device with a block device as back file
and create/mount ext4 in the loop device.

Thanks,
Shaohua

Shaohua Li (3):
  block/loop: don't hijack error number
  block/loop: use FALLOC_FL_ZERO_RANGE for REQ_OP_WRITE_ZEROES
  block: don't print message for discard error

 block/blk-core.c | 2 ++
 drivers/block/loop.c | 9 ++---
 2 files changed, 8 insertions(+), 3 deletions(-)

-- 
2.9.5



[PATCH V3 3/3] block: don't print message for discard error

2017-10-04 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

discard error isn't fatal, don't flood discard error messages.

Suggested-by: Ming Lei <tom.leim...@gmail.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 14f7674..adb064a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3090,6 +3090,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct 
request *rq,
rq->__data_len = bio->bi_iter.bi_size;
rq->bio = rq->biotail = bio;
 
+   if (bio_op(bio) == REQ_OP_DISCARD)
+   rq->rq_flags |= RQF_QUIET;
if (bio->bi_disk)
rq->rq_disk = bio->bi_disk;
 }
-- 
2.9.5



[PATCH V3 1/3] block/loop: don't hijack error number

2017-10-04 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

If the bio returns -EOPNOTSUPP, we shouldn't hijack it and return -EIO

Signed-off-by: Shaohua Li <s...@fb.com>
Reviewed-by: Ming Lei <ming@redhat.com>
---
 drivers/block/loop.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bc8e615..6aa739f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -460,7 +460,7 @@ static void lo_complete_rq(struct request *rq)
zero_fill_bio(bio);
}
 
-   blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
+   blk_mq_end_request(rq, errno_to_blk_status(cmd->ret));
 }
 
 static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
@@ -478,7 +478,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
 
if (cmd->css)
css_put(cmd->css);
-   cmd->ret = ret;
+   cmd->ret = ret > 0 ? 0 : ret;
lo_rw_aio_do_completion(cmd);
 }
 
@@ -1719,7 +1719,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
/* complete non-aio request */
if (!cmd->use_aio || ret) {
-   cmd->ret = ret ? -EIO : 0;
+   cmd->ret = ret;
blk_mq_complete_request(cmd->rq);
}
 }
-- 
2.9.5



Re: [PATCH] null_blk: change configfs dependency to select

2017-10-03 Thread Shaohua Li
On Tue, Oct 03, 2017 at 03:50:16PM -0600, Jens Axboe wrote:
> A recent commit made null_blk depend on configfs, which is kind of
> annoying since you now have to find this dependency and enable that
> as well. Discovered this since I no longer had null_blk available
> on a box I needed to debug, since it got killed when the config
> updated after the configfs change was merged.
> 
> Fixes: 3bf2bd20734e ("nullb: add configfs interface")
> Signed-off-by: Jens Axboe <ax...@kernel.dk>

Reviewed-by: Shaohua Li <s...@fb.com> 
> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
> index 4a438b8abe27..2dfe99b328f8 100644
> --- a/drivers/block/Kconfig
> +++ b/drivers/block/Kconfig
> @@ -17,7 +17,7 @@ if BLK_DEV
>  
>  config BLK_DEV_NULL_BLK
>   tristate "Null test block driver"
> - depends on CONFIGFS_FS
> + select CONFIGFS_FS
>  
>  config BLK_DEV_FD
>   tristate "Normal floppy disk support"
> 


Re: [PATCH v2] blk-throttle: fix possible io stall when upgrade to max

2017-10-01 Thread Shaohua Li
On Sat, Sep 30, 2017 at 02:38:49PM +0800, Joseph Qi wrote:
> From: Joseph Qi <qijiang...@alibaba-inc.com>
> 
> There is a case which will lead to io stall. The case is described as
> follows. 
> /test1
>   |-subtest1
> /test2
>   |-subtest2
> And subtest1 and subtest2 each has 32 queued bios already.
> 
> Now upgrade to max. In throtl_upgrade_state, it will try to dispatch
> bios as follows:
> 1) tg=subtest1, do nothing;
> 2) tg=test1, transfer 32 queued bios from subtest1 to test1; no pending
> left, no need to schedule next dispatch;
> 3) tg=subtest2, do nothing;
> 4) tg=test2, transfer 32 queued bios from subtest2 to test2; no pending
> left, no need to schedule next dispatch;
> 5) tg=/, transfer 8 queued bios from test1 to /, 8 queued bios from
> test2 to /, 8 queued bios from test1 to /, and 8 queued bios from test2
> to /; note that test1 and test2 each still has 16 queued bios left;
> 6) tg=/, try to schedule next dispatch, but since disptime is now
> (update in tg_update_disptime, wait=0), pending timer is not scheduled
> in fact;
> 7) In throtl_upgrade_state it totally dispatches 32 queued bios and with
> 32 left. test1 and test2 each has 16 queued bios;
> 8) throtl_pending_timer_fn sees the left over bios, but could do
> nothing, because throtl_select_dispatch returns 0, and test1/test2 has
> no pending tg.
> 
> The blktrace shows the following:
> 8,32   00 2.539007641 0  m   N throtl upgrade to max
> 8,32   00 2.539072267 0  m   N throtl /test2 dispatch nr_queued=16 read=0 write=16
> 8,32   70 2.539077142 0  m   N throtl /test1 dispatch nr_queued=16 read=0 write=16
> 
> So force schedule dispatch if there are pending children.
>
> Signed-off-by: Joseph Qi <qijiang...@alibaba-inc.com>
> ---
>  block/blk-throttle.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 0fea76a..17816a0 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -1911,11 +1911,11 @@ static void throtl_upgrade_state(struct throtl_data 
> *td)
>  
>   tg->disptime = jiffies - 1;
>   throtl_select_dispatch(sq);
> - throtl_schedule_next_dispatch(sq, false);
> + throtl_schedule_next_dispatch(sq, true);
>   }
>   rcu_read_unlock();
>   throtl_select_dispatch(&td->service_queue);
> - throtl_schedule_next_dispatch(&td->service_queue, false);
> + throtl_schedule_next_dispatch(&td->service_queue, true);
>   queue_work(kthrotld_workqueue, &td->dispatch_work);
>  }

Reviewed-by: Shaohua Li <s...@fb.com> 


Re: [PATCH] blk-throttle: fix possible io stall when doing upgrade

2017-09-28 Thread Shaohua Li
On Thu, Sep 28, 2017 at 07:19:45PM +0800, Joseph Qi wrote:
> 
> 
> On 17/9/28 11:48, Joseph Qi wrote:
> > Hi Shahua,
> > 
> > On 17/9/28 05:38, Shaohua Li wrote:
> >> On Tue, Sep 26, 2017 at 11:16:05AM +0800, Joseph Qi wrote:
> >>>
> >>>
> >>> On 17/9/26 10:48, Shaohua Li wrote:
> >>>> On Tue, Sep 26, 2017 at 09:06:57AM +0800, Joseph Qi wrote:
> >>>>> Hi Shaohua,
> >>>>>
> >>>>> On 17/9/26 01:22, Shaohua Li wrote:
> >>>>>> On Mon, Sep 25, 2017 at 06:46:42PM +0800, Joseph Qi wrote:
> >>>>>>> From: Joseph Qi <qijiang...@alibaba-inc.com>
> >>>>>>>
> >>>>>>> Currently it will try to dispatch bio in throtl_upgrade_state. This 
> >>>>>>> may
> >>>>>>> lead to io stall in the following case.
> >>>>>>> Say the hierarchy is like:
> >>>>>>> /-test1
> >>>>>>>   |-subtest1
> >>>>>>> and subtest1 has 32 queued bios now.
> >>>>>>>
> >>>>>>> throtl_pending_timer_fn        throtl_upgrade_state
> >>>>>>> 
> >>>>>>>                                upgrade to max
> >>>>>>>                                throtl_select_dispatch
> >>>>>>>                                throtl_schedule_next_dispatch
> >>>>>>> throtl_select_dispatch
> >>>>>>> throtl_schedule_next_dispatch
> >>>>>>>
> >>>>>>> Since throtl_select_dispatch will move queued bios from subtest1 to
> >>>>>>> test1 in throtl_upgrade_state, it will then just do nothing in
> >>>>>>> throtl_pending_timer_fn. As a result, queued bios won't be dispatched
> >>>>>>> any more if no proper timer scheduled.
> >>>>>>
> >>>>>> Sorry, didn't get it. If throtl_pending_timer_fn does nothing (because
> >>>>>> throtl_upgrade_state already moves bios to parent), there is no pending
> >>>>>> blkcg/bio, not rearming the timer wouldn't lose anything. Am I missing
> >>>>>> anything? could you please describe the failure in details?
> >>>>>>
> >>>>>> Thanks,
> >>>>>> Shaohua
> >>>>>> In normal case, throtl_pending_timer_fn tries to move bios from
> >>>>> subtest1 to test1, and finally do the real issueing work when reach
> >>>>> the top-level.
> >>>>> But int the case above, throtl_select_dispatch in
> >>>>> throtl_pending_timer_fn returns 0, because the work is done by
> >>>>> throtl_upgrade_state. Then throtl_pending_timer_fn *thinks* there is
> >>>>> nothing to do, but the queued bios are still in service queue of
> >>>>> test1.
> >>>>
> >>>> Still didn't get, sorry. If there are pending bios in test1, why
> >>>> throtl_schedule_next_dispatch in throtl_pending_timer_fn doesn't setup 
> >>>> the
> >>>> timer?
> >>>>
> >>>
> >>> throtl_schedule_next_dispatch doesn't setup timer because there is no
> >>> pending children left, all the queued bios are moved to parent test1
> >>> now. IMO, this is used in case that it cannot dispatch all queued bios
> >>> in one round.
> >>> And if the select dispatch is done by timer, it will then do propagate
> >>> dispatch in parent till reach the top-level.
> >>> But in the case above, it breaks this logic.
> >>> Please point out if I am understanding wrong.
> >>
> >> I read your reply again. So if the bios are move to test1, why don't we
> >> dispatch bios of test1? throtl_upgrade_state does a post-order traversal, 
> >> so it
> >> handles subtest1 and then test1. Anything I missed? Please describe in 
> >> details,
> >> thanks! Did you see a real stall or is this based on code analysis?
> >>
> >> Thanks,
> >> Shaohua
> >>
> > 
> > Sorry for the unclear description and the misunderstanding brought in.
> > I backported your patches to my kernel 3.10 and did the test. I tested
> > with libaio and iodepth 32. Most time it worked well, but occasionally
> > it w

Re: [PATCH] blk-throttle: fix possible io stall when doing upgrade

2017-09-27 Thread Shaohua Li
On Tue, Sep 26, 2017 at 11:16:05AM +0800, Joseph Qi wrote:
> 
> 
> On 17/9/26 10:48, Shaohua Li wrote:
> > On Tue, Sep 26, 2017 at 09:06:57AM +0800, Joseph Qi wrote:
> >> Hi Shaohua,
> >>
> >> On 17/9/26 01:22, Shaohua Li wrote:
> >>> On Mon, Sep 25, 2017 at 06:46:42PM +0800, Joseph Qi wrote:
> >>>> From: Joseph Qi <qijiang...@alibaba-inc.com>
> >>>>
> >>>> Currently it will try to dispatch bio in throtl_upgrade_state. This may
> >>>> lead to io stall in the following case.
> >>>> Say the hierarchy is like:
> >>>> /-test1
> >>>>   |-subtest1
> >>>> and subtest1 has 32 queued bios now.
> >>>>
> >>>> throtl_pending_timer_fn        throtl_upgrade_state
> >>>> 
> >>>>                                upgrade to max
> >>>>                                throtl_select_dispatch
> >>>>                                throtl_schedule_next_dispatch
> >>>> throtl_select_dispatch
> >>>> throtl_schedule_next_dispatch
> >>>>
> >>>> Since throtl_select_dispatch will move queued bios from subtest1 to
> >>>> test1 in throtl_upgrade_state, it will then just do nothing in
> >>>> throtl_pending_timer_fn. As a result, queued bios won't be dispatched
> >>>> any more if no proper timer scheduled.
> >>>
> >>> Sorry, didn't get it. If throtl_pending_timer_fn does nothing (because
> >>> throtl_upgrade_state already moves bios to parent), there is no pending
> >>> blkcg/bio, not rearming the timer wouldn't lose anything. Am I missing
> >>> anything? could you please describe the failure in details?
> >>>
> >>> Thanks,
> >>> Shaohua
> >>> In normal case, throtl_pending_timer_fn tries to move bios from
> >> subtest1 to test1, and finally do the real issueing work when reach
> >> the top-level.
> >> But int the case above, throtl_select_dispatch in
> >> throtl_pending_timer_fn returns 0, because the work is done by
> >> throtl_upgrade_state. Then throtl_pending_timer_fn *thinks* there is
> >> nothing to do, but the queued bios are still in service queue of
> >> test1.
> > 
> > Still didn't get, sorry. If there are pending bios in test1, why
> > throtl_schedule_next_dispatch in throtl_pending_timer_fn doesn't setup the
> > timer?
> > 
> 
> throtl_schedule_next_dispatch doesn't setup timer because there is no
> pending children left, all the queued bios are moved to parent test1
> now. IMO, this is used in case that it cannot dispatch all queued bios
> in one round.
> And if the select dispatch is done by timer, it will then do propagate
> dispatch in parent till reach the top-level.
> But in the case above, it breaks this logic.
> Please point out if I am understanding wrong.

I read your reply again. So if the bios are move to test1, why don't we
dispatch bios of test1? throtl_upgrade_state does a post-order traversal, so it
handles subtest1 and then test1. Anything I missed? Please describe in details,
thanks! Did you see a real stall or is this based on code analysis?

Thanks,
Shaohua


[PATCH] block: fix a build error

2017-09-26 Thread Shaohua Li
The code is only for blkcg, not for all cgroups.

Reported-by: kbuild test robot <fengguang...@intel.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c| 2 +-
 include/linux/kthread.h | 2 +-
 kernel/kthread.c| 8 
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 37c4da7..7269341 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1695,7 +1695,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx 
*hctx,
}
 
/* always use the first bio's css */
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
cmd->css = cmd->rq->bio->bi_css;
css_get(cmd->css);
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index bd4369c..fb20184 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct 
kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 void kthread_associate_blkcg(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *kthread_blkcg(void);
 #else
diff --git a/kernel/kthread.c b/kernel/kthread.c
index a8b4e83..ed95828 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -46,7 +46,7 @@ struct kthread {
void *data;
struct completion parked;
struct completion exited;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
 #endif
 };
@@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k)
 * or if kmalloc() in kthread() failed.
 */
kthread = to_kthread(k);
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
WARN_ON_ONCE(kthread && kthread->blkcg_css);
 #endif
kfree(kthread);
@@ -224,7 +224,7 @@ static int kthread(void *_create)
self->data = data;
	init_completion(&self->exited);
	init_completion(&self->parked);
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
self->blkcg_css = NULL;
 #endif
	current->vfork_done = &self->exited;
@@ -1165,7 +1165,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 /**
  * kthread_associate_blkcg - associate blkcg to current kthread
  * @css: the cgroup info
-- 
2.9.5



Re: [PATCH] blk-throttle: fix possible io stall when doing upgrade

2017-09-25 Thread Shaohua Li
On Tue, Sep 26, 2017 at 09:06:57AM +0800, Joseph Qi wrote:
> Hi Shaohua,
> 
> On 17/9/26 01:22, Shaohua Li wrote:
> > On Mon, Sep 25, 2017 at 06:46:42PM +0800, Joseph Qi wrote:
> >> From: Joseph Qi <qijiang...@alibaba-inc.com>
> >>
> >> Currently it will try to dispatch bio in throtl_upgrade_state. This may
> >> lead to io stall in the following case.
> >> Say the hierarchy is like:
> >> /-test1
> >>   |-subtest1
> >> and subtest1 has 32 queued bios now.
> >>
> >> throtl_pending_timer_fn        throtl_upgrade_state
> >> 
> >>                                upgrade to max
> >>                                throtl_select_dispatch
> >>                                throtl_schedule_next_dispatch
> >> throtl_select_dispatch
> >> throtl_schedule_next_dispatch
> >>
> >> Since throtl_select_dispatch will move queued bios from subtest1 to
> >> test1 in throtl_upgrade_state, it will then just do nothing in
> >> throtl_pending_timer_fn. As a result, queued bios won't be dispatched
> >> any more if no proper timer scheduled.
> > 
> > Sorry, didn't get it. If throtl_pending_timer_fn does nothing (because
> > throtl_upgrade_state already moves bios to parent), there is no pending
> > blkcg/bio, not rearming the timer wouldn't lose anything. Am I missing
> > anything? could you please describe the failure in details?
> > 
> > Thanks,
> > Shaohua
> >In normal case, throtl_pending_timer_fn tries to move bios from
> subtest1 to test1, and finally do the real issueing work when reach
> the top-level.
> But int the case above, throtl_select_dispatch in
> throtl_pending_timer_fn returns 0, because the work is done by
> throtl_upgrade_state. Then throtl_pending_timer_fn *thinks* there is
> nothing to do, but the queued bios are still in service queue of
> test1.

Still didn't get, sorry. If there are pending bios in test1, why
throtl_schedule_next_dispatch in throtl_pending_timer_fn doesn't setup the
timer?


Re: [PATCH] blk-throttle: fix possible io stall when doing upgrade

2017-09-25 Thread Shaohua Li
On Mon, Sep 25, 2017 at 06:46:42PM +0800, Joseph Qi wrote:
> From: Joseph Qi 
> 
> Currently it will try to dispatch bio in throtl_upgrade_state. This may
> lead to io stall in the following case.
> Say the hierarchy is like:
> /-test1
>   |-subtest1
> and subtest1 has 32 queued bios now.
> 
> throtl_pending_timer_fn        throtl_upgrade_state
> 
>                                upgrade to max
>                                throtl_select_dispatch
>                                throtl_schedule_next_dispatch
> throtl_select_dispatch
> throtl_schedule_next_dispatch
> 
> Since throtl_select_dispatch will move queued bios from subtest1 to
> test1 in throtl_upgrade_state, it will then just do nothing in
> throtl_pending_timer_fn. As a result, queued bios won't be dispatched
> any more if no proper timer scheduled.

Sorry, didn't get it. If throtl_pending_timer_fn does nothing (because
throtl_upgrade_state already moves bios to parent), there is no pending
blkcg/bio, not rearming the timer wouldn't lose anything. Am I missing
anything? could you please describe the failure in details?

Thanks,
Shaohua

> Fix this issue by just scheduling dispatch now and let the dispatch be
> done only in timer.
> 
> Signed-off-by: Joseph Qi 
> ---
>  block/blk-throttle.c | 8 +++-
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 0fea76a..29d282f 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -1909,13 +1909,11 @@ static void throtl_upgrade_state(struct throtl_data 
> *td)
>   struct throtl_grp *tg = blkg_to_tg(blkg);
>   struct throtl_service_queue *sq = &tg->service_queue;
>  
> - tg->disptime = jiffies - 1;
> - throtl_select_dispatch(sq);
> - throtl_schedule_next_dispatch(sq, false);
> + tg->disptime = jiffies;
> + throtl_schedule_next_dispatch(sq, true);
>   }
>   rcu_read_unlock();
> - throtl_select_dispatch(&td->service_queue);
> - throtl_schedule_next_dispatch(&td->service_queue, false);
> + throtl_schedule_next_dispatch(&td->service_queue, true);
>   queue_work(kthrotld_workqueue, &td->dispatch_work);
>  }
>  
> -- 
> 1.9.4


Re: [PATCH V3 0/4] block: make loop block device cgroup aware

2017-09-25 Thread Shaohua Li
On Thu, Sep 14, 2017 at 02:02:03PM -0700, Shaohua Li wrote:
> From: Shaohua Li <s...@fb.com>
> 
> Hi,
> 
> The IO dispatched to the under-layer disk by the loop block device isn't cloned
> from the original bio, so the IO loses the cgroup information of the original
> bio. Such IO escapes from cgroup control. The patches try to address this
> issue. The idea is quite generic, but we currently only make it work for blkcg.

Ping! how do we proceed with this patch set?

Thanks,
Shaohua

> 
> Thanks,
> Shaohua
> 
> V2->V3:
> - Make the API more robust pointed out by Tejun
> 
> V1->V2:
> - Address a couple of issues pointed out by Tejun
> 
> 
> Shaohua Li (4):
>   kthread: add a mechanism to store cgroup info
>   blkcg: delete unused APIs
>   block: make blkcg aware of kthread stored original cgroup info
>   block/loop: make loop cgroup aware
> 
>  block/bio.c| 31 --
>  drivers/block/loop.c   | 13 +
>  drivers/block/loop.h   |  1 +
>  include/linux/bio.h|  2 --
>  include/linux/blk-cgroup.h | 25 +-
>  include/linux/kthread.h| 11 
>  kernel/kthread.c   | 66 
> --
>  7 files changed, 96 insertions(+), 53 deletions(-)
> 
> -- 
> 2.9.5
> 


[PATCH] block: fix a crash caused by wrong API

2017-09-21 Thread Shaohua Li
part_stat_show takes a part device, not a disk, so we should use
part_to_disk.

Fixes: d62e26b3ffd2 ("block: pass in queue to inflight accounting")
Cc: Bart Van Assche <bart.vanass...@wdc.com>
Cc: Omar Sandoval <osan...@fb.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/partition-generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/partition-generic.c b/block/partition-generic.c
index 86e8fe1..88c555d 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -112,7 +112,7 @@ ssize_t part_stat_show(struct device *dev,
   struct device_attribute *attr, char *buf)
 {
struct hd_struct *p = dev_to_part(dev);
-   struct request_queue *q = dev_to_disk(dev)->queue;
+   struct request_queue *q = part_to_disk(p)->queue;
unsigned int inflight[2];
int cpu;
 
-- 
2.9.5



[PATCH V3 1/4] kthread: add a mechanism to store cgroup info

2017-09-14 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

kthread usually runs jobs on behalf of other threads. The jobs should be
charged to the cgroup of the original threads. But the jobs run in a kthread,
where we lose the cgroup context of the original threads. The patch adds a
mechanism to record the cgroup info of the original threads in the kthread
context. Later we can retrieve the cgroup info and attach it to jobs.

Since this mechanism is only required by kthread, we store the cgroup
info in kthread data instead of generic task_struct.
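
For illustration, a consumer built on kthread_worker would bracket each piece
of work with the stored css, along the lines of the sketch below. example_cmd,
its css field and handle_one_cmd() are made-up placeholders; patch 4 does the
real version of this in the loop driver:

/* made-up example: a kthread work item carrying the submitter's css */
struct example_cmd {
	struct kthread_work work;
	struct cgroup_subsys_state *css;	/* css of the thread that queued the work */
};

static void example_work_fn(struct kthread_work *work)
{
	struct example_cmd *cmd = container_of(work, struct example_cmd, work);

	kthread_associate_blkcg(cmd->css);	/* charge IO below to the submitter */
	handle_one_cmd(cmd);			/* may submit bios on the submitter's behalf */
	kthread_associate_blkcg(NULL);		/* drop the association afterwards */
}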

Signed-off-by: Shaohua Li <s...@fb.com>
---
 include/linux/kthread.h | 11 +
 kernel/kthread.c| 66 +++--
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 82e197e..bd4369c 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -3,6 +3,7 @@
 /* Simple interface for creating and stopping kernel threads without mess. */
 #include 
 #include 
+#include 
 
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct 
kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
+#ifdef CONFIG_CGROUPS
+void kthread_associate_blkcg(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *kthread_blkcg(void);
+#else
+static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
+static inline struct cgroup_subsys_state *kthread_blkcg(void)
+{
+   return NULL;
+}
+#endif
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528..a8b4e83 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
void *data;
struct completion parked;
struct completion exited;
+#ifdef CONFIG_CGROUPS
+   struct cgroup_subsys_state *blkcg_css;
+#endif
 };
 
 enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct 
*k)
 
 void free_kthread_struct(struct task_struct *k)
 {
+   struct kthread *kthread;
+
/*
 * Can be NULL if this kthread was created by kernel_thread()
 * or if kmalloc() in kthread() failed.
 */
-   kfree(to_kthread(k));
+   kthread = to_kthread(k);
+#ifdef CONFIG_CGROUPS
+   WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+   kfree(kthread);
 }
 
 /**
@@ -216,6 +224,9 @@ static int kthread(void *_create)
self->data = data;
	init_completion(&self->exited);
	init_completion(&self->parked);
+#ifdef CONFIG_CGROUPS
+   self->blkcg_css = NULL;
+#endif
	current->vfork_done = &self->exited;
 
/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -1153,3 +1164,54 @@ void kthread_destroy_worker(struct kthread_worker 
*worker)
kfree(worker);
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+   struct kthread *kthread;
+
+   if (!(current->flags & PF_KTHREAD))
+   return;
+   kthread = to_kthread(current);
+   if (!kthread)
+   return;
+
+   if (kthread->blkcg_css) {
+   css_put(kthread->blkcg_css);
+   kthread->blkcg_css = NULL;
+   }
+   if (css) {
+   css_get(css);
+   kthread->blkcg_css = css;
+   }
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+   struct kthread *kthread;
+
+   if (current->flags & PF_KTHREAD) {
+   kthread = to_kthread(current);
+   if (kthread)
+   return kthread->blkcg_css;
+   }
+   return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
-- 
2.9.5



[PATCH V3 2/4] blkcg: delete unused APIs

2017-09-14 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Nobody uses the APIs right now.

Acked-by: Tejun Heo <t...@kernel.org>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/bio.c| 31 ---
 include/linux/bio.h|  2 --
 include/linux/blk-cgroup.h | 12 
 3 files changed, 45 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 6745759..9271fa3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2033,37 +2033,6 @@ int bio_associate_blkcg(struct bio *bio, struct 
cgroup_subsys_state *blkcg_css)
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
 /**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet.  Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released.  The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
-   struct io_context *ioc;
-
-   if (bio->bi_css)
-   return -EBUSY;
-
-   ioc = current->io_context;
-   if (!ioc)
-   return -ENOENT;
-
-   get_io_context_active(ioc);
-   bio->bi_ioc = ioc;
-   bio->bi_css = task_get_css(current, io_cgrp_id);
-   return 0;
-}
-EXPORT_SYMBOL_GPL(bio_associate_current);
-
-/**
  * bio_disassociate_task - undo bio_associate_current()
  * @bio: target bio
  */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a8fe793..d795cdd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -514,13 +514,11 @@ do {  \
 
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state 
*blkcg_css);
-int bio_associate_current(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else  /* CONFIG_BLK_CGROUP */
 static inline int bio_associate_blkcg(struct bio *bio,
struct cgroup_subsys_state *blkcg_css) { return 0; }
-static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkcg_association(struct bio *dst,
struct bio *src) { }
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 9d92153..0cfa8d2 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -235,12 +235,6 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
return task_blkcg(current);
 }
 
-static inline struct cgroup_subsys_state *
-task_get_blkcg_css(struct task_struct *task)
-{
-   return task_get_css(task, io_cgrp_id);
-}
-
 /**
  * blkcg_parent - get the parent of a blkcg
  * @blkcg: blkcg of interest
@@ -735,12 +729,6 @@ struct blkcg_policy {
 
 #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
-static inline struct cgroup_subsys_state *
-task_get_blkcg_css(struct task_struct *task)
-{
-   return NULL;
-}
-
 #ifdef CONFIG_BLOCK
 
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { 
return NULL; }
-- 
2.9.5



[PATCH V3 4/4] block/loop: make loop cgroup aware

2017-09-14 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

loop block device handles IO in a separate thread. The actual IO
dispatched isn't cloned from the IO loop device received, so the
dispatched IO loses the cgroup context.

I'm ignoring the buffered IO case for now, which is quite complicated. Making
the loop thread aware of the cgroup context doesn't really help there: the loop
device only writes to a single file, and in the current writeback cgroup
implementation the file can only belong to one cgroup.

For the direct IO case, we could work around the issue in theory. For example,
say we assign cgroup1 5M/s BW for the loop device and cgroup2 10M/s. We can
create a special cgroup for the loop thread and assign it at least 15M/s on the
under-layer disk. In this way, we correctly throttle the two cgroups. But this
is tricky to set up.

This patch tries to address the issue. We record the bio's css in the loop
command. When the loop thread is handling the command, we then use the API
provided in patch 1 to set the css for the current task. The bio layer will
use that css for new IO (from patch 3).

Acked-by: Tejun Heo <t...@kernel.org>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 13 +
 drivers/block/loop.h |  1 +
 2 files changed, 14 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 8934e25..37c4da7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -479,6 +479,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
+   if (cmd->css)
+   css_put(cmd->css);
cmd->ret = ret > 0 ? 0 : ret;
lo_rw_aio_do_completion(cmd);
 }
@@ -538,6 +540,8 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
cmd->iocb.ki_filp = file;
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
+   if (cmd->css)
+   kthread_associate_blkcg(cmd->css);
 
if (rw == WRITE)
		ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -545,6 +549,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
		ret = call_read_iter(file, &cmd->iocb, &iter);
 
lo_rw_aio_do_completion(cmd);
+   kthread_associate_blkcg(NULL);
 
if (ret != -EIOCBQUEUED)
		cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -1689,6 +1694,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
break;
}
 
+   /* always use the first bio's css */
+#ifdef CONFIG_CGROUPS
+   if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
+   cmd->css = cmd->rq->bio->bi_css;
+   css_get(cmd->css);
+   } else
+#endif
+   cmd->css = NULL;
	kthread_queue_work(&lo->worker, &cmd->work);
 
return BLK_STS_OK;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index f68c1d5..d93b669 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -74,6 +74,7 @@ struct loop_cmd {
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
+   struct cgroup_subsys_state *css;
 };
 
 /* Support for loadable transfer modules */
-- 
2.9.5



[PATCH V3 0/4] block: make loop block device cgroup aware

2017-09-14 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Hi,

The IO dispatched to the under-layer disk by the loop block device isn't cloned
from the original bio, so the IO loses the cgroup information of the original
bio. Such IO escapes from cgroup control. The patches try to address this issue.
The idea is quite generic, but we currently only make it work for blkcg.

Thanks,
Shaohua

V2->V3:
- Make the API more robust pointed out by Tejun

V1->V2:
- Address a couple of issues pointed out by Tejun


Shaohua Li (4):
  kthread: add a mechanism to store cgroup info
  blkcg: delete unused APIs
  block: make blkcg aware of kthread stored original cgroup info
  block/loop: make loop cgroup aware

 block/bio.c| 31 --
 drivers/block/loop.c   | 13 +
 drivers/block/loop.h   |  1 +
 include/linux/bio.h|  2 --
 include/linux/blk-cgroup.h | 25 +-
 include/linux/kthread.h| 11 
 kernel/kthread.c   | 66 --
 7 files changed, 96 insertions(+), 53 deletions(-)

-- 
2.9.5



[PATCH V3 3/4] block: make blkcg aware of kthread stored original cgroup info

2017-09-14 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

bio_blkcg is the only API to get cgroup info for a bio right now. If
bio_blkcg finds the current task is a kthread with an original blkcg
associated, it will use that css instead of associating the bio with the
current task. This makes it possible for a kthread to dispatch bios on
behalf of other threads.

Acked-by: Tejun Heo <t...@kernel.org>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 include/linux/blk-cgroup.h | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 0cfa8d2..f57e54d 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH(INT_MAX / 2)
@@ -223,16 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct 
cgroup_subsys_state *css)
return css ? container_of(css, struct blkcg, css) : NULL;
 }
 
-static inline struct blkcg *task_blkcg(struct task_struct *tsk)
-{
-   return css_to_blkcg(task_css(tsk, io_cgrp_id));
-}
-
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
+   struct cgroup_subsys_state *css;
+
if (bio && bio->bi_css)
return css_to_blkcg(bio->bi_css);
-   return task_blkcg(current);
+   css = kthread_blkcg();
+   if (css)
+   return css_to_blkcg(css);
+   return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
 /**
-- 
2.9.5



Re: [PATCH V2 1/4] kthread: add a mechanism to store cgroup info

2017-09-13 Thread Shaohua Li
On Wed, Sep 13, 2017 at 02:38:20PM -0700, Tejun Heo wrote:
> Hello,
> 
> On Wed, Sep 13, 2017 at 02:01:26PM -0700, Shaohua Li wrote:
> > diff --git a/kernel/kthread.c b/kernel/kthread.c
> > index 26db528..3107eee 100644
> > --- a/kernel/kthread.c
> > +++ b/kernel/kthread.c
> > @@ -20,7 +20,6 @@
> >  #include 
> >  #include 
> >  #include 
> > -#include 
> >  #include 
> >  
> >  static DEFINE_SPINLOCK(kthread_create_lock);
> > @@ -47,6 +46,7 @@ struct kthread {
> > void *data;
> > struct completion parked;
> > struct completion exited;
> 
> maybe #ifdef CONFIG_CGROUPS?
> 
> > +   struct cgroup_subsys_state *blkcg_css;

Ah, I thought cgroup_subsys_state is always defined, let me double check.

> >  };
> ...
> > +void kthread_associate_blkcg(struct cgroup_subsys_state *css)
> > +{
> > +   struct kthread *kthread;
> > +
> > +   if (!(current->flags & PF_KTHREAD) || !current->set_child_tid)
> > +   return;
> > +   kthread = to_kthread(current);
> > +   if (css) {
> > +   css_get(css);
> > +   kthread->blkcg_css = css;
> > +   } else if (kthread->blkcg_css) {
> > +   css_put(kthread->blkcg_css);
> > +   kthread->blkcg_css = NULL;
> > +   }
> > +}
> > +EXPORT_SYMBOL(kthread_associate_blkcg);
> 
> Maybe doing sth like the following is less error-prone?
> 
> kthread_associate_blkcg(@css)
> {
>   if (current's kthread->blkcg_css)
>   put kthread->blkcg_css and set it to NULL;
>   if (@css)
>   get @css and set kthread->blkcg;
> }

Sounds good.
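
Something along these lines, I guess (untested sketch only, keeping the V2
field names and checks):

void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
	struct kthread *kthread;

	if (!(current->flags & PF_KTHREAD) || !current->set_child_tid)
		return;
	kthread = to_kthread(current);

	/* drop any previously stored css first */
	if (kthread->blkcg_css) {
		css_put(kthread->blkcg_css);
		kthread->blkcg_css = NULL;
	}
	/* then take a reference on the new one, if any */
	if (css) {
		css_get(css);
		kthread->blkcg_css = css;
	}
}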

Thanks,
Shaohua


[PATCH V2 1/4] kthread: add a mechanism to store cgroup info

2017-09-13 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

A kthread usually runs jobs on behalf of other threads. The jobs should be
charged to the cgroup of the original threads. But the jobs run in a kthread,
where we lose the cgroup context of the original threads. The patch adds a
mechanism to record the cgroup info of the original threads in the kthread
context. Later we can retrieve the cgroup info and attach it to the jobs.

Since this mechanism is only required by kthreads, we store the cgroup
info in the kthread data instead of the generic task_struct.
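
For illustration only (not part of the diff below), a driver that dispatches
IO from a kthread on behalf of another task would use the API roughly like
this, where orig_css is a hypothetical pointer to the css saved from the
originating bio:

	/* in the kthread, before issuing IO on behalf of the original task */
	kthread_associate_blkcg(orig_css);

	/* issue the IO, e.g. via call_read_iter()/call_write_iter() */

	/* drop the association (and the css reference) when done */
	kthread_associate_blkcg(NULL);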

Signed-off-by: Shaohua Li <s...@fb.com>
---
 include/linux/kthread.h | 11 +++
 kernel/kthread.c| 44 +++-
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 82e197e..bd4369c 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -3,6 +3,7 @@
 /* Simple interface for creating and stopping kernel threads without mess. */
 #include 
 #include 
+#include 
 
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct 
kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
+#ifdef CONFIG_CGROUPS
+void kthread_associate_blkcg(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *kthread_blkcg(void);
+#else
+static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
+static inline struct cgroup_subsys_state *kthread_blkcg(void)
+{
+   return NULL;
+}
+#endif
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528..3107eee 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,7 @@ struct kthread {
void *data;
struct completion parked;
struct completion exited;
+   struct cgroup_subsys_state *blkcg_css;
 };
 
 enum KTHREAD_BITS {
@@ -1153,3 +1153,45 @@ void kthread_destroy_worker(struct kthread_worker 
*worker)
kfree(worker);
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+   struct kthread *kthread;
+
+   if (!(current->flags & PF_KTHREAD) || !current->set_child_tid)
+   return;
+   kthread = to_kthread(current);
+   if (css) {
+   css_get(css);
+   kthread->blkcg_css = css;
+   } else if (kthread->blkcg_css) {
+   css_put(kthread->blkcg_css);
+   kthread->blkcg_css = NULL;
+   }
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+   if ((current->flags & PF_KTHREAD) && current->set_child_tid)
+   return to_kthread(current)->blkcg_css;
+   return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
-- 
2.9.5



[PATCH V2 3/4] block: make blkcg aware of kthread stored original cgroup info

2017-09-13 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

bio_blkcg is the only API to get cgroup info for a bio right now. If
bio_blkcg finds that the current task is a kthread with an original blkcg
associated, it will use that css instead of associating the bio to the
current task. This makes it possible for a kthread to dispatch bios on
behalf of other threads.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 include/linux/blk-cgroup.h | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 0cfa8d2..f57e54d 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH(INT_MAX / 2)
@@ -223,16 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct 
cgroup_subsys_state *css)
return css ? container_of(css, struct blkcg, css) : NULL;
 }
 
-static inline struct blkcg *task_blkcg(struct task_struct *tsk)
-{
-   return css_to_blkcg(task_css(tsk, io_cgrp_id));
-}
-
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
+   struct cgroup_subsys_state *css;
+
if (bio && bio->bi_css)
return css_to_blkcg(bio->bi_css);
-   return task_blkcg(current);
+   css = kthread_blkcg();
+   if (css)
+   return css_to_blkcg(css);
+   return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
 /**
-- 
2.9.5



Re: [PATCH 3/3] block/loop: make loop cgroup aware

2017-09-08 Thread Shaohua Li
On Fri, Sep 08, 2017 at 07:48:09AM -0700, Tejun Heo wrote:
> Hello,
> 
> On Wed, Sep 06, 2017 at 07:00:53PM -0700, Shaohua Li wrote:
> > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > index 9d4545f..9850b27 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -482,6 +482,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
> > ret, long ret2)
> >  {
> > struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
> >  
> > +   if (cmd->css)
> > +   css_put(cmd->css);
> > cmd->ret = ret > 0 ? 0 : ret;
> > lo_rw_aio_do_completion(cmd);
> 
> The fact that we're forwarding explicitly in loop still bothers me a
> bit.  Can you please elaborate why we don't want to do this
> generically through aio?

I think we must forward in loop, because each cmd could come from a different
cgroup, so we must explicitly forward for each cmd.

The main reason not to do the forwarding in aio is complexity. We have at least
3 different implementations for dio:
- __blockdev_direct_IO for ext4 and btrfs
- iomap dio for xfs
- blockdev dio implementation

Forwarding in dio means hooking the cgroup association for each bio dispatched
in the implementations, which is a little messy. I'd like to avoid this if
there is no strong reason to do it.
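
For illustration only, doing it generically would mean each dio implementation
adding something like this for every bio it builds before submit_bio() (a
rough sketch using the helpers from patch 1, not a proposal):

	struct cgroup_subsys_state *css = kthread_get_orig_css();

	/* only kthreads with a stored css get redirected; others are unchanged */
	if (css)
		bio_associate_blkcg(bio, css);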

Thanks,
Shaohua


Re: [PATCH 1/3] kthread: add a mechanism to store cgroup info

2017-09-08 Thread Shaohua Li
On Fri, Sep 08, 2017 at 07:35:37AM -0700, Tejun Heo wrote:
> Hello,
> 
> On Wed, Sep 06, 2017 at 07:00:51PM -0700, Shaohua Li wrote:
> > +#ifdef CONFIG_CGROUPS
> > +void kthread_set_orig_css(struct cgroup_subsys_state *css);
> > +struct cgroup_subsys_state *kthread_get_orig_css(void);
> > +void kthread_reset_orig_css(void);
> 
> * It's a bit weird to associate a kthread with a css without being
>   specific.  If what's needed is a generic association (this kthread
>   is temporarily servicing this cgroup), it should be associated with
>   the cgroup.  But, I think it'd be better to make it specific instead
>   - ie. kthread_set_io_css().
> 
> * Do we need the reset function to be separate?  Can't we just clear
>   it when the set function is called with NULL css?
> 
> * For the accessor, can we do sth like kthread_orig_css() (or
>   kthread_io_css())?  "get" is overloaded between set/get and get/put,
>   so it can get confusing.
> 
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index c05ac5f..ab2295d 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1276,6 +1276,7 @@ extern struct pid *cad_pid;
> >  #define PF_SWAPWRITE   0x0080  /* Allowed to write to 
> > swap */
> >  #define PF_NO_SETAFFINITY  0x0400  /* Userland is not allowed to 
> > meddle with cpus_allowed */
> >  #define PF_MCE_EARLY   0x0800  /* Early kill for mce 
> > process policy */
> > +#define PF_KTHREAD_HAS_CSS 0x1000  /* kthread has css info 
> > attached */
> 
> Do we need a separate flag for this?  kthreads already have PF_KTHREAD
> set.  I'm not sure why we'd need another flag.  Once we know it's a
> kthread, we can just access its css pointer field, right?

Ok, all suggestions are good, will update.

Thanks,
Shaohua


BDI_CAP_STABLE_WRITES for stacked device (Re: Enable skip_copy can cause data integrity issue in some storage stack)

2017-09-07 Thread Shaohua Li
On Thu, Sep 07, 2017 at 11:11:24AM +1000, Neil Brown wrote:
> On Wed, Sep 06 2017, Shaohua Li wrote:
> 
> > On Fri, Sep 01, 2017 at 03:26:41PM +0800, alexwu wrote:
> >> Hi,
> >> 
> >> Recently a data integrity issue about skip_copy was found. We are able
> >> to reproduce it and found the root cause. This data integrity issue
> >> might happen if there are other layers between file system and raid5.
> >> 
> >> [How to Reproduce]
> >> 
> >> 1. Create a raid5 named md0 first (with skip_copy enable), and wait md0
> >>resync done which ensures that all data and parity are synchronized
> >> 2. Use lvm tools to create a logical volume named lv-md0 over md0
> >> 3. Format an ext4 file system on lv-md0 and mount on /mnt
> >> 4. Do some db operations (e.g. sqlite insert) to write data through /mnt
> >> 5. When those db operations finished, we do the following command
> >>"echo check > /sys/block/md0/md/sync_action" to check mismatch_cnt,
> >>it is very likely that we got mismatch_cnt != 0 when check finished
> >> 
> >> [Root Cause]
> >> 
> >> After tracing code and more experiments, it is more proper to say that
> >> it's a problem about backing_dev_info (bdi) instead of a bug about
> >> skip_copy.
> >> 
> >> We notice that:
> >>1. skip_copy counts on BDI_CAP_STABLE_WRITES to ensure that bio's page
> >>   will not be modified before raid5 completes I/O. Thus we can skip 
> >> copy
> >>   page from bio to stripe cache
> >>2. The ext4 file system will call wait_for_stable_page() to ask whether
> >>   the mapped bdi requires stable writes
> >> 
> >> Data integrity happens because:
> >>1. When raid5 enable skip_copy, it will only set it's own bdi required
> >>   BDI_CAP_STABLE_WRITES, but this information will not propagate to
> >>   other bdi between file system and md
> >>2. When ext4 file system check stable writes requirement by calling
> >>   wait_for_stable_page(), it can only see the underlying bdi's
> >> capability
> >>   and cannot see all related bdi
> >> 
> >> Thus, skip_copy works fine if we format file system directly on md.
> >> But data integrity issue happens if there are some other block layers (e.g.
> >> dm)
> >> between file system and md.
> >> 
> >> [Result]
> >> 
> >> We do more tests on different storage stacks, here are the results.
> >> 
> >> The following settings can pass the test thousand times:
> >>1. raid5 with skip_copy enable + ext4
> >>2. raid5 with skip_copy disable + ext4
> >>3. raid5 with skip_copy disable + lvm + ext4
> >> 
> >> The following setting will fail the test in 10 rounds:
> >>1. raid5 with skip_copy enable + lvm + ext4
> >> 
> >> I think the solution might be let all bdi can communicate through different
> >> block layers,
> >> then we can pass BDI_CAP_STABLE_WRITES information if we enable skip_copy.
> >> But the current bdi structure is not allowed us to do that.
> >> 
> >> What would you suggest to do if we want to make skip_copy more reliable ?
> >
> > Very interesting problem. Looks this isn't raid5 specific. stacked device 
> > with
> > block integrity enabled could expose the same issue.
> >
> > I'm cc Jens and Neil to check if they have good idea. Basically the problem 
> > is
> > for stacked disk, the under layer disk has BDI_CAP_STABLE_WRITES enabled, 
> > but
> > upper layer doesn't enable it. The under layer disk could be a raid5 with
> > skip_copy enabled or could be any disk with block integrity enabled (which 
> > will
> > enable BDI_CAP_STABLE_WRITES). Currently we don't propagate the flag to 
> > upper
> > layer disk.
> >
> > A solution is blk_set_stacking_limits propagate the flag at queue setup, we
> > then don't allow the flag changing in runtime later. The raid5 skip_copy
> > interface changes the flag in runtime which probably isn't safe.
> >
> > Thanks,
> > Shaohua
> 
> It has always been messy to handle dependencies from the bottom of the
> stack affecting things closer to the filesystem.
> We used to have issues with alignments and request sizes, and got rid of
> those problems by requiring all devices to accept all bio sizes, and
> providing support for them to split as necessary.
> This is a similar sort of problem, but also quite different.

Re: [PATCH V2 0/3] block/loop: handle discard/zeroout error

2017-09-07 Thread Shaohua Li
On Thu, Sep 07, 2017 at 03:20:01PM +0200, Ilya Dryomov wrote:
> Hi Shaohua,
> 
> You wrote:
> > BTW: blkdev_issue_zeroout retries if we immediately find the device doesn't
> > support zeroout, but it doesn't retry if submit_bio_wait returns 
> > -EOPNOTSUPP.
> > Is this correct behavior?
> 
> I sent a patch for that yesterday, see "[PATCH] block: cope with WRITE
> SAME failing in blkdev_issue_zeroout()" on linux-block.  It checks for
> -EREMOTEIO, because that's what I hit, but I wonder if it should check
> for -EOPNOTSUPP from submit_bio_wait() as well.  Can you think of
> a case where, given bdev_write_zeroes_sectors() != 0, submit_bio_wait()
> would fail with -EOPNOTSUPP and we would want to do explicit zeroing?

The loop block device returned -EOPNOTSUPP for zeroout before. With the second
patch, it doesn't any more though.


Re: [PATCH V2 3/3] block/loop: suppress discard IO error message

2017-09-07 Thread Shaohua Li
On Thu, Sep 07, 2017 at 05:16:21PM +0800, Ming Lei wrote:
> On Thu, Sep 7, 2017 at 8:13 AM, Shaohua Li <s...@kernel.org> wrote:
> > From: Shaohua Li <s...@fb.com>
> >
> > We don't know if fallocate really supports FALLOC_FL_PUNCH_HOLE till
> > fallocate is called. If it doesn't support, loop will return -EOPNOTSUPP
> > and we see a lot of error message printed by blk_update_request. Failure
> > for discard IO isn't a big problem, so we just return 0 in loop which
> > will suppress the IO error message.
> 
> Setting RQF_QUIET for discard IO should be more clean for suppressing error,
> and upper layer can get the failure notification too.

Hmm, I forgot why I didn't do this; I did consider it before. Probably because
nobody checks the return value of discard requests. Probably we should skip the
error message for all discard IO in the block layer.

I'll repost a patch to fix this issue.
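
For the block-layer variant, a rough sketch (against the current
blk_update_request(), assuming we only want to silence the message while still
reporting the error to callers) would be something like:

	/* sketch: don't log failed discards; callers still see the error */
	if (unlikely(error && !blk_rq_is_passthrough(req) &&
		     !(req->rq_flags & RQF_QUIET) &&
		     req_op(req) != REQ_OP_DISCARD))
		print_req_error(req, error);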

Thanks,
Shaohua


[PATCH 1/3] kthread: add a mechanism to store cgroup info

2017-09-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

A kthread usually runs jobs on behalf of other threads. The jobs should be
charged to the cgroup of the original threads. But the jobs run in a kthread,
where we lose the cgroup context of the original threads. The patch adds a
mechanism to record the cgroup info of the original threads in the kthread
context. Later we can retrieve the cgroup info and attach it to the jobs.

Since this mechanism is only required by kthreads, we store the cgroup
info in the kthread data instead of the generic task_struct.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 include/linux/kthread.h | 13 +++
 include/linux/sched.h   |  1 +
 kernel/kthread.c| 57 -
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 82e197e..3eb24d1 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -3,6 +3,7 @@
 /* Simple interface for creating and stopping kernel threads without mess. */
 #include 
 #include 
+#include 
 
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,4 +199,16 @@ bool kthread_cancel_delayed_work_sync(struct 
kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
+#ifdef CONFIG_CGROUPS
+void kthread_set_orig_css(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *kthread_get_orig_css(void);
+void kthread_reset_orig_css(void);
+#else
+static inline void kthread_set_orig_css(struct cgroup_subsys_state *css) { }
+static inline struct cgroup_subsys_state *kthread_get_orig_css(void)
+{
+   return NULL;
+}
+static inline void kthread_reset_orig_css(void) { }
+#endif
 #endif /* _LINUX_KTHREAD_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c05ac5f..ab2295d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1276,6 +1276,7 @@ extern struct pid *cad_pid;
 #define PF_SWAPWRITE   0x0080  /* Allowed to write to swap */
 #define PF_NO_SETAFFINITY  0x0400  /* Userland is not allowed to 
meddle with cpus_allowed */
 #define PF_MCE_EARLY   0x0800  /* Early kill for mce process 
policy */
+#define PF_KTHREAD_HAS_CSS 0x1000  /* kthread has css info 
attached */
 #define PF_MUTEX_TESTER0x2000  /* Thread belongs to 
the rt mutex tester */
 #define PF_FREEZER_SKIP0x4000  /* Freezer should not 
count it as freezable */
 #define PF_SUSPEND_TASK0x8000  /* This thread called 
freeze_processes() and should not be frozen */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528..d084ef3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,7 @@ struct kthread {
void *data;
struct completion parked;
struct completion exited;
+   struct cgroup_subsys_state *orig_css;
 };
 
 enum KTHREAD_BITS {
@@ -1153,3 +1153,58 @@ void kthread_destroy_worker(struct kthread_worker 
*worker)
kfree(worker);
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * kthread_set_orig_css - set the original css for current thread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_set_orig_css(struct cgroup_subsys_state *css)
+{
+   struct kthread *kthread = to_kthread(current);
+
+   if (css) {
+   css_get(css);
+   kthread->orig_css = css;
+   current->flags |= PF_KTHREAD_HAS_CSS;
+   }
+}
+EXPORT_SYMBOL(kthread_set_orig_css);
+
+/**
+ * kthread_get_orig_css - get the stored original cgroup info
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_get_orig_css(void)
+{
+   if (current->flags & PF_KTHREAD_HAS_CSS)
+   return to_kthread(current)->orig_css;
+   return NULL;
+}
+EXPORT_SYMBOL(kthread_get_orig_css);
+
+/**
+ * kthread_reset_orig_css - clear stored cgroup info
+ *
+ * Current thread must be a kthread.
+ */
+void kthread_reset_orig_css(void)
+{
+   struct kthread *kthread = to_kthread(current);
+   struct cgroup_subsys_state *css;
+
+   css = kthread->orig_css;
+   if (css)
+   css_put(css);
+   kthread->orig_css = NULL;
+   current->flags &= ~PF_KTHREAD_HAS_CSS;
+}
+EXPORT_SYMBOL(kthread_reset_orig_css);
+#endif
-- 
2.9.5



[PATCH 2/3] block: make blkcg aware of kthread stored original cgroup info

2017-09-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Several blkcg APIs are deprecated. After removing them, bio_blkcg is the
only API to get cgroup info for a bio. If bio_blkcg finds current task
is a kthread and has original css recorded, it will use the css instead
of associating the bio to current task.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/bio.c| 31 ---
 include/linux/bio.h|  2 --
 include/linux/blk-cgroup.h | 25 +++--
 3 files changed, 7 insertions(+), 51 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 6745759..9271fa3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2033,37 +2033,6 @@ int bio_associate_blkcg(struct bio *bio, struct 
cgroup_subsys_state *blkcg_css)
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
 /**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet.  Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released.  The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
-   struct io_context *ioc;
-
-   if (bio->bi_css)
-   return -EBUSY;
-
-   ioc = current->io_context;
-   if (!ioc)
-   return -ENOENT;
-
-   get_io_context_active(ioc);
-   bio->bi_ioc = ioc;
-   bio->bi_css = task_get_css(current, io_cgrp_id);
-   return 0;
-}
-EXPORT_SYMBOL_GPL(bio_associate_current);
-
-/**
  * bio_disassociate_task - undo bio_associate_current()
  * @bio: target bio
  */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a8fe793..d795cdd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -514,13 +514,11 @@ do {  \
 
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state 
*blkcg_css);
-int bio_associate_current(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else  /* CONFIG_BLK_CGROUP */
 static inline int bio_associate_blkcg(struct bio *bio,
struct cgroup_subsys_state *blkcg_css) { return 0; }
-static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkcg_association(struct bio *dst,
struct bio *src) { }
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 9d92153..0cdcf6b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH(INT_MAX / 2)
@@ -223,22 +224,16 @@ static inline struct blkcg *css_to_blkcg(struct 
cgroup_subsys_state *css)
return css ? container_of(css, struct blkcg, css) : NULL;
 }
 
-static inline struct blkcg *task_blkcg(struct task_struct *tsk)
-{
-   return css_to_blkcg(task_css(tsk, io_cgrp_id));
-}
-
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
+   struct cgroup_subsys_state *css;
+
if (bio && bio->bi_css)
return css_to_blkcg(bio->bi_css);
-   return task_blkcg(current);
-}
-
-static inline struct cgroup_subsys_state *
-task_get_blkcg_css(struct task_struct *task)
-{
-   return task_get_css(task, io_cgrp_id);
+   css = kthread_get_orig_css();
+   if (css)
+   return css_to_blkcg(css);
+   return css_to_blkcg(task_css(current, io_cgrp_id));
 }
 
 /**
@@ -735,12 +730,6 @@ struct blkcg_policy {
 
 #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
 
-static inline struct cgroup_subsys_state *
-task_get_blkcg_css(struct task_struct *task)
-{
-   return NULL;
-}
-
 #ifdef CONFIG_BLOCK
 
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { 
return NULL; }
-- 
2.9.5



[PATCH 3/3] block/loop: make loop cgroup aware

2017-09-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

loop block device handles IO in a separate thread. The actual IO
dispatched isn't cloned from the IO loop device received, so the
dispatched IO loses the cgroup context.

I'm ignoring the buffered IO case for now, which is quite complicated. Making
the loop thread aware of cgroup context doesn't really help. The loop device
only writes to a single file. In the current writeback cgroup
implementation, the file can only belong to one cgroup.

For the direct IO case, we could work around the issue in theory. For
example, say we assign cgroup1 5M/s BW for the loop device and cgroup2
10M/s. We can create a special cgroup for the loop thread and assign at
least 15M/s for the underlayer disk. In this way, we correctly throttle
the two cgroups. But this is tricky to set up.

This patch tries to address the issue. We record bio's css in loop
command. When loop thread is handling the command, we then use the API
provided in patch 1 to set the css for current task. The bio layer will
use the css for new IO.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 13 +
 drivers/block/loop.h |  1 +
 2 files changed, 14 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9d4545f..9850b27 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -482,6 +482,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
+   if (cmd->css)
+   css_put(cmd->css);
cmd->ret = ret > 0 ? 0 : ret;
lo_rw_aio_do_completion(cmd);
 }
@@ -541,6 +543,8 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
cmd->iocb.ki_filp = file;
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
+   if (cmd->css)
+   kthread_set_orig_css(cmd->css);
 
if (rw == WRITE)
ret = call_write_iter(file, &cmd->iocb, &iter);
@@ -548,6 +552,7 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
ret = call_read_iter(file, &cmd->iocb, &iter);
 
lo_rw_aio_do_completion(cmd);
+   kthread_reset_orig_css();
 
if (ret != -EIOCBQUEUED)
cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
@@ -1692,6 +1697,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx 
*hctx,
break;
}
 
+   /* always use the first bio's css */
+#ifdef CONFIG_CGROUPS
+   if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
+   cmd->css = cmd->rq->bio->bi_css;
+   css_get(cmd->css);
+   } else
+#endif
+   cmd->css = NULL;
kthread_queue_work(&lo->worker, &cmd->work);
 
return BLK_STS_OK;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index f68c1d5..d93b669 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -74,6 +74,7 @@ struct loop_cmd {
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
+   struct cgroup_subsys_state *css;
 };
 
 /* Support for loadable transfer modules */
-- 
2.9.5



[PATCH 0/3] block: make loop block device cgroup aware

2017-09-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Hi,

The IO dispatched to the under layer disk by the loop block device isn't cloned
from the original bio, so the IO loses the cgroup information of the original
bio. These IOs escape from cgroup control. The patches try to address this
issue. The idea is quite generic, but we currently only make it work for blkcg.

Thanks,
Shaohua

Shaohua Li (3):
  kthread: add a mechanism to store cgroup info
  block: make blkcg aware of kthread stored original cgroup info
  block/loop: make loop cgroup aware

 block/bio.c| 31 -
 drivers/block/loop.c   | 13 +++
 drivers/block/loop.h   |  1 +
 include/linux/bio.h|  2 --
 include/linux/blk-cgroup.h | 25 ++--
 include/linux/kthread.h| 13 +++
 include/linux/sched.h  |  1 +
 kernel/kthread.c   | 57 +-
 8 files changed, 91 insertions(+), 52 deletions(-)

-- 
2.9.5



[PATCH V2 1/3] block/loop: don't hijack error number

2017-09-06 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

If the bio returns -EOPNOTSUPP, we shouldn't hijack it and return -EIO

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 85de673..715b762 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -460,7 +460,7 @@ static void lo_complete_rq(struct request *rq)
zero_fill_bio(bio);
}
 
-   blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
+   blk_mq_end_request(rq, errno_to_blk_status(cmd->ret));
 }
 
 static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
@@ -476,7 +476,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
-   cmd->ret = ret;
+   cmd->ret = ret > 0 ? 0 : ret;
lo_rw_aio_do_completion(cmd);
 }
 
@@ -1706,7 +1706,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
/* complete non-aio request */
if (!cmd->use_aio || ret) {
-   cmd->ret = ret ? -EIO : 0;
+   cmd->ret = ret;
blk_mq_complete_request(cmd->rq);
}
 }
-- 
2.9.5



Re: [PATCH V6 00/18] blk-throttle: add .low limit

2017-09-06 Thread Shaohua Li
On Wed, Sep 06, 2017 at 09:12:20AM +0800, Joseph Qi wrote:
> Hi Shaohua,
> 
> On 17/9/6 05:02, Shaohua Li wrote:
> > On Thu, Aug 31, 2017 at 09:24:23AM +0200, Paolo VALENTE wrote:
> >>
> >>> Il giorno 15 gen 2017, alle ore 04:42, Shaohua Li <s...@fb.com> ha 
> >>> scritto:
> >>>
> >>> Hi,
> >>>
> >>> cgroup still lacks a good iocontroller. CFQ works well for hard disk, but 
> >>> not
> >>> much for SSD. This patch set try to add a conservative limit for 
> >>> blk-throttle.
> >>> It isn't a proportional scheduling, but can help prioritize cgroups. 
> >>> There are
> >>> several advantages we choose blk-throttle:
> >>> - blk-throttle resides early in the block stack. It works for both bio and
> >>>  request based queues.
> >>> - blk-throttle is light weight in general. It still takes queue lock, but 
> >>> it's
> >>>  not hard to implement a per-cpu cache and remove the lock contention.
> >>> - blk-throttle doesn't use 'idle disk' mechanism, which is used by 
> >>> CFQ/BFQ. The
> >>>  mechanism is proved to harm performance for fast SSD.
> >>>
> >>> The patch set add a new io.low limit for blk-throttle. It's only for 
> >>> cgroup2.
> >>> The existing io.max is a hard limit throttling. cgroup with a max limit 
> >>> never
> >>> dispatch more IO than its max limit. While io.low is a best effort 
> >>> throttling.
> >>> cgroups with 'low' limit can run above their 'low' limit at appropriate 
> >>> time.
> >>> Specifically, if all cgroups reach their 'low' limit, all cgroups can run 
> >>> above
> >>> their 'low' limit. If any cgroup runs under its 'low' limit, all other 
> >>> cgroups
> >>> will run according to their 'low' limit. So the 'low' limit could act as 
> >>> two
> >>> roles, it allows cgroups using free bandwidth and it protects cgroups from
> >>> their 'low' limit.
> >>>
> >>> An example usage is we have a high prio cgroup with high 'low' limit and 
> >>> a low
> >>> prio cgroup with low 'low' limit. If the high prio cgroup isn't running, 
> >>> the low
> >>> prio can run above its 'low' limit, so we don't waste the bandwidth. When 
> >>> the
> >>> high prio cgroup runs and is below its 'low' limit, low prio cgroup will 
> >>> run
> >>> under its 'low' limit. This will protect high prio cgroup to get more
> >>> resources.
> >>>
> >>
> >> Hi Shaohua,
> > 
> > Hi,
> > 
> > Sorry for the late response.
> >> I would like to ask you some questions, to make sure I fully
> >> understand how the 'low' limit and the idle-group detection work in
> >> your above scenario.  Suppose that: the drive has a random-I/O peak
> >> rate of 100MB/s, the high prio group has a 'low' limit of 90 MB/s, and
> >> the low prio group has a 'low' limit of 10 MB/s.  If
> >> - the high prio process happens to do, say, only 5 MB/s for a given
> >>   long time
> >> - the low prio process constantly does greedy I/O
> >> - the idle-group detection is not being used
> >> then the low prio process is limited to 10 MB/s during all this time
> >> interval.  And only 10% of the device bandwidth is utilized.
> >>
> >> To recover lost bandwidth through idle-group detection, we need to set
> >> a target IO latency for the high-prio group.  The high prio group
> >> should happen to be below the threshold, and thus to be detected as
> >> idle, leaving the low prio group free too use all the bandwidth.
> >>
> >> Here are my questions:
> >> 1) Is all I wrote above correct?
> > 
> > Yes
> >> 2) In particular, maybe there are other better mechanism to saturate
> >> the bandwidth in the above scenario?
> > 
> > Assume it's the 4) below.
> >> If what I wrote above is correct:
> >> 3) Doesn't fluctuation occur?  I mean: when the low prio group gets
> >> full bandwidth, the latency threshold of the high prio group may be
> >> overcome, causing the high prio group to not be considered idle any
> >> longer, and thus the low prio group to be limited again; this in turn
> >> will cause the threshold to not be overcome any longer, and so on.
> > 
> > That's true. We try to mitigate the fluctuation by increasing the low prio
> &

Re: [PATCH V6 00/18] blk-throttle: add .low limit

2017-09-05 Thread Shaohua Li
On Thu, Aug 31, 2017 at 09:24:23AM +0200, Paolo VALENTE wrote:
> 
> > Il giorno 15 gen 2017, alle ore 04:42, Shaohua Li <s...@fb.com> ha scritto:
> > 
> > Hi,
> > 
> > cgroup still lacks a good iocontroller. CFQ works well for hard disk, but 
> > not
> > much for SSD. This patch set try to add a conservative limit for 
> > blk-throttle.
> > It isn't a proportional scheduling, but can help prioritize cgroups. There 
> > are
> > several advantages we choose blk-throttle:
> > - blk-throttle resides early in the block stack. It works for both bio and
> >  request based queues.
> > - blk-throttle is light weight in general. It still takes queue lock, but 
> > it's
> >  not hard to implement a per-cpu cache and remove the lock contention.
> > - blk-throttle doesn't use 'idle disk' mechanism, which is used by CFQ/BFQ. 
> > The
> >  mechanism is proved to harm performance for fast SSD.
> > 
> > The patch set add a new io.low limit for blk-throttle. It's only for 
> > cgroup2.
> > The existing io.max is a hard limit throttling. cgroup with a max limit 
> > never
> > dispatch more IO than its max limit. While io.low is a best effort 
> > throttling.
> > cgroups with 'low' limit can run above their 'low' limit at appropriate 
> > time.
> > Specifically, if all cgroups reach their 'low' limit, all cgroups can run 
> > above
> > their 'low' limit. If any cgroup runs under its 'low' limit, all other 
> > cgroups
> > will run according to their 'low' limit. So the 'low' limit could act as two
> > roles, it allows cgroups using free bandwidth and it protects cgroups from
> > their 'low' limit.
> > 
> > An example usage is we have a high prio cgroup with high 'low' limit and a 
> > low
> > prio cgroup with low 'low' limit. If the high prio cgroup isn't running, 
> > the low
> > prio can run above its 'low' limit, so we don't waste the bandwidth. When 
> > the
> > high prio cgroup runs and is below its 'low' limit, low prio cgroup will run
> > under its 'low' limit. This will protect high prio cgroup to get more
> > resources.
> > 
> 
> Hi Shaohua,

Hi,

Sorry for the late response.
> I would like to ask you some questions, to make sure I fully
> understand how the 'low' limit and the idle-group detection work in
> your above scenario.  Suppose that: the drive has a random-I/O peak
> rate of 100MB/s, the high prio group has a 'low' limit of 90 MB/s, and
> the low prio group has a 'low' limit of 10 MB/s.  If
> - the high prio process happens to do, say, only 5 MB/s for a given
>   long time
> - the low prio process constantly does greedy I/O
> - the idle-group detection is not being used
> then the low prio process is limited to 10 MB/s during all this time
> interval.  And only 10% of the device bandwidth is utilized.
> 
> To recover lost bandwidth through idle-group detection, we need to set
> a target IO latency for the high-prio group.  The high prio group
> should happen to be below the threshold, and thus to be detected as
> idle, leaving the low prio group free too use all the bandwidth.
> 
> Here are my questions:
> 1) Is all I wrote above correct?

Yes
> 2) In particular, maybe there are other better mechanism to saturate
> the bandwidth in the above scenario?

Assume it's the 4) below.
> If what I wrote above is correct:
> 3) Doesn't fluctuation occur?  I mean: when the low prio group gets
> full bandwidth, the latency threshold of the high prio group may be
> overcome, causing the high prio group to not be considered idle any
> longer, and thus the low prio group to be limited again; this in turn
> will cause the threshold to not be overcome any longer, and so on.

That's true. We try to mitigate the fluctuation by increasing the low prio
cgroup bandwidth gradually though.

> 4) Is there a way to compute an appropriate target latency of the high
> prio group, if it is a generic group, for which the latency
> requirements of the processes it contains are only partially known or
> completely unknown?  By appropriate target latency, I mean a target
> latency that enables the framework to fully utilize the device
> bandwidth while the high prio group is doing less I/O than its limit.

Not sure how we can do this. The device max bandwidth varies based on request
size and read/write ratio. We don't know when the max bandwidth is reached.
Also I think we must consider the case where the workloads never use the full
bandwidth of a disk, which is pretty common for SSDs (at least in our
environment).

Thanks,
Shaohua


[PATCH V2 1/2] block/loop: fix use after free

2017-09-01 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

lo_rw_aio->call_read_iter->
1   aops->direct_IO
2   iov_iter_revert
lo_rw_aio_complete could happen between 1 and 2, so the bio and bvec could
be freed before 2, which accesses the bvec.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 16 +---
 drivers/block/loop.h |  5 -
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3a35121..78c47c4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -463,14 +463,21 @@ static void lo_complete_rq(struct request *rq)
blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
 }
 
+static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
+{
if (!atomic_dec_and_test(&cmd->ref))
+   return;
+   kfree(cmd->bvec);
+   cmd->bvec = NULL;
+   blk_mq_complete_request(cmd->rq);
+}
+
 static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
-   kfree(cmd->bvec);
-   cmd->bvec = NULL;
cmd->ret = ret;
-   blk_mq_complete_request(cmd->rq);
+   lo_rw_aio_do_completion(cmd);
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -518,6 +525,7 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
segments = bio_segments(bio);
}
+   atomic_set(&cmd->ref, 2);
 
iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
  segments, blk_rq_bytes(rq));
@@ -533,6 +541,8 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
else
ret = call_read_iter(file, &cmd->iocb, &iter);
 
+   lo_rw_aio_do_completion(cmd);
+
if (ret != -EIOCBQUEUED)
cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
return 0;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 43d20d3..b0ba4a5 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -68,7 +68,10 @@ struct loop_cmd {
struct kthread_work work;
struct request *rq;
struct list_head list;
-   bool use_aio;   /* use AIO interface to handle I/O */
+   union {
+   bool use_aio; /* use AIO interface to handle I/O */
+   atomic_t ref; /* only for aio */
+   };
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
-- 
2.9.5



[PATCH V2 2/2] block/loop: remove unused field

2017-09-01 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

nobody uses the list.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index b0ba4a5..f68c1d5 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -67,7 +67,6 @@ struct loop_device {
 struct loop_cmd {
struct kthread_work work;
struct request *rq;
-   struct list_head list;
union {
bool use_aio; /* use AIO interface to handle I/O */
atomic_t ref; /* only for aio */
-- 
2.9.5



[PATCH V3 1/2] block/loop: set hw_sectors

2017-08-31 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Loop can handle any size of request. Limiting it to 255 sectors just
burns CPU on bio splitting and request merging for the underlayer disk and
also causes bad fs block allocation in directio mode.

Reviewed-by: Omar Sandoval <osan...@fb.com>
Reviewed-by: Ming Lei <ming@redhat.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f6c204f6..9eff4d3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1736,6 +1736,7 @@ static int loop_add(struct loop_device **l, int i)
 
blk_queue_physical_block_size(lo->lo_queue, PAGE_SIZE);
 
+   blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
/*
 * It doesn't make sense to enable merge because the I/O
 * submitted to backing file is handled page by page.
-- 
2.9.5



Re: [PATCH] block/loop: fix use after free

2017-08-30 Thread Shaohua Li
On Wed, Aug 30, 2017 at 02:51:05PM -0700, Shaohua Li wrote:
> lo_rw_aio->call_read_iter->
> 1 aops->direct_IO
> 2 iov_iter_revert
> lo_rw_aio_complete could happen between 1 and 2, the bio and bvec could
> be freed before 2, which accesses bvec.

please ignore this one, I accidentally sent it out. The correct fix is in
another patch.
 
> This conflicts with my direcio performance improvement patches, which
> I'll resend.
> 
> Signed-off-by: Shaohua Li <s...@fb.com>
> ---
>  drivers/block/loop.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index ef83349..153ab3c 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -490,6 +490,7 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
>   iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
> bio_segments(bio), blk_rq_bytes(cmd->rq));
> + bio_inc_remaining(bio);
>   /*
>* This bio may be started from the middle of the 'bvec'
>* because of bio splitting, so offset from the bvec must
> @@ -507,6 +508,7 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>   else
>   ret = call_read_iter(file, &cmd->iocb, &iter);
>  
> + bio_endio(bio);
>   if (ret != -EIOCBQUEUED)
>   cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
>   return 0;
> -- 
> 2.9.5
> 


Re: [PATCH V2 2/2] block/loop: allow request merge for directio mode

2017-08-30 Thread Shaohua Li
On Wed, Aug 30, 2017 at 02:43:40PM +0800, Ming Lei wrote:
> On Tue, Aug 29, 2017 at 09:43:20PM -0700, Shaohua Li wrote:
> > On Wed, Aug 30, 2017 at 10:51:21AM +0800, Ming Lei wrote:
> > > On Tue, Aug 29, 2017 at 08:13:39AM -0700, Shaohua Li wrote:
> > > > On Tue, Aug 29, 2017 at 05:56:05PM +0800, Ming Lei wrote:
> > > > > On Thu, Aug 24, 2017 at 12:24:53PM -0700, Shaohua Li wrote:
> > > > > > From: Shaohua Li <s...@fb.com>
> > > > > > 
> > > > > > Currently loop disables merge. While it makes sense for buffer IO 
> > > > > > mode,
> > > > > > directio mode can benefit from request merge. Without merge, loop 
> > > > > > could
> > > > > > send small size IO to underlayer disk and harm performance.
> > > > > 
> > > > > Hi Shaohua,
> > > > > 
> > > > > IMO no matter if merge is used, loop always sends page by page
> > > > > to VFS in both dio or buffer I/O.
> > > > 
> > > > Why do you think so?
> > > 
> > > do_blockdev_direct_IO() still handles page by page from iov_iter, and
> > > with bigger request, I guess it might be the plug merge working.
> > 
> > This is not true. directio sends big size bio directly, not because of plug
> > merge. Please at least check the code before you complain.
> 
> I complain nothing, just try to understand the idea behind,
> never mind, :-)
> 
> > 
> > > >  
> > > > > Also if merge is enabled on loop, that means merge is run
> > > > > on both loop and low level block driver, and not sure if we
> > > > > can benefit from that.
> > > > 
> > > > why does merge still happen in low level block driver?
> > > 
> > > Because scheduler is still working on low level disk. My question
> > > is that why the scheduler in low level disk doesn't work now
> > > if scheduler on loop can merge?
> > 
> > The low level disk can still do merge, but since this is directio, the upper
> > layer already dispatches request as big as possible. There is very little
> > chance the requests can be merged again.
> 
> That is true, but these requests need to enter scheduler queue and
> be tried to merge again, even though it is less possible to succeed.
> Double merge may take extra CPU utilization.
> 
> Looks it doesn't answer my question.
> 
> Without this patch, the requests dispatched to loop won't be merged,
> so they may be small and their sectors may be continuous, my question
> is why dio bios converted from these small loop requests can't be
> merged in block layer when queuing these dio bios to low level device?

The loop thread doesn't have a plug there. Even if we had a plug there, it's
still a bad idea to do the merge in the low level layer. If we run direct_IO
for every 4k, the overhead is much, much higher than bio merge: direct_IO will
call into fs code, take different mutexes, update metadata for writes and so on.


[PATCH] block/loop: fix use after free

2017-08-30 Thread Shaohua Li
lo_rw_aio->call_read_iter->
1   aops->direct_IO
2   iov_iter_revert
lo_rw_aio_complete could happen between 1 and 2, so the bio and bvec could
be freed before 2, which accesses the bvec.

This conflicts with my direcio performance improvement patches, which
I'll resend.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ef83349..153ab3c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -490,6 +490,7 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
  bio_segments(bio), blk_rq_bytes(cmd->rq));
+   bio_inc_remaining(bio);
/*
 * This bio may be started from the middle of the 'bvec'
 * because of bio splitting, so offset from the bvec must
@@ -507,6 +508,7 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
else
ret = call_read_iter(file, &cmd->iocb, &iter);
 
+   bio_endio(bio);
if (ret != -EIOCBQUEUED)
cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
return 0;
-- 
2.9.5



[PATCH 3/3] block/loop: suppress discard IO error message

2017-08-30 Thread Shaohua Li
We don't know if fallocate really supports FALLOC_FL_PUNCH_HOLE till
fallocate is called. If it isn't supported, loop will return
-EOPNOTSUPP and we see a lot of error messages printed by
blk_update_request. Failure for discard IO isn't a problem, so we just
return 0 in loop, which will suppress the IO error messages.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a30aa45..15f51e3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -441,6 +441,9 @@ static int lo_discard(struct loop_device *lo, struct 
request *rq, loff_t pos)
ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
ret = -EIO;
+
+   if (req_op(rq) == REQ_OP_DISCARD && ret == -EOPNOTSUPP)
+   ret = 0;
  out:
return ret;
 }
-- 
2.9.5



[PATCH 0/3]block/loop: handle discard/zeroout error

2017-08-30 Thread Shaohua Li
Fix some problems when setting up a loop device with a block device as the
backing file and creating/mounting ext4 on the loop device.

BTW: blkdev_issue_zeroout retries if we immediately find the device doesn't
support zeroout, but it doesn't retry if submit_bio_wait returns -EOPNOTSUPP.
Is this correct behavior?

Thanks,
Shaohua


Shaohua Li (3):
  block/loop: don't hijack error number
  block/loop: use FALLOC_FL_ZERO_RANGE for REQ_OP_WRITE_ZEROES
  block/loop: suppress discard IO error message

 drivers/block/loop.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

-- 
2.9.5



[PATCH 1/3] block/loop: don't hijack error number

2017-08-30 Thread Shaohua Li
If the bio returns -EOPNOTSUPP, we shouldn't hijack it and return -EIO

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ef83349..054dccc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -464,7 +464,7 @@ static void lo_complete_rq(struct request *rq)
zero_fill_bio(bio);
}
 
-   blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
+   blk_mq_end_request(rq, errno_to_blk_status(cmd->ret));
 }
 
 static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
@@ -1725,7 +1725,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
/* complete non-aio request */
if (!cmd->use_aio || ret) {
-   cmd->ret = ret ? -EIO : 0;
+   cmd->ret = ret;
blk_mq_complete_request(cmd->rq);
}
 }
-- 
2.9.5



[PATCH 2/3] block/loop: use FALLOC_FL_ZERO_RANGE for REQ_OP_WRITE_ZEROES

2017-08-30 Thread Shaohua Li
REQ_OP_WRITE_ZEROES really means zeroing the data. And in
blkdev_fallocate, FALLOC_FL_ZERO_RANGE will retry but
FALLOC_FL_PUNCH_HOLE will not, even though the loop request doesn't have
BLKDEV_ZERO_NOFALLBACK set.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 054dccc..a30aa45 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -430,6 +430,9 @@ static int lo_discard(struct loop_device *lo, struct 
request *rq, loff_t pos)
int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;
 
+   if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+   mode = FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE;
+
if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
ret = -EOPNOTSUPP;
goto out;
-- 
2.9.5



Re: [RFC] block/loop: make loop cgroup aware

2017-08-29 Thread Shaohua Li
On Tue, Aug 29, 2017 at 08:28:09AM -0700, Tejun Heo wrote:
> Hello, Shaohua.
> 
> On Tue, Aug 29, 2017 at 08:22:36AM -0700, Shaohua Li wrote:
> > > Yeah, writeback tracks the most active cgroup and associates writeback
> > > ios with that cgroup.  For buffered loop devices, I think it'd be fine
> > > to make the loop driver forward the cgroup association and let the
> > > writeback layer determine the matching association as usual.
> > 
> > Doing this means we must forward cgroup info to page, not bio. I need to 
> > check
> > if we can make the mechanism work for memcg.
> 
> The association is already coming from the page.  We just need to make
> sure that going through loop driver doesn't get in the way of the
> membership getting propagated to the underlying device.

I think there is confusion. The app writes files in the upper layer fs on loop.
memcg establishes membership for the pages of these files. Then writeback does
the write, and loop then writes these pages to the under layer fs. The write is
done in the loop thread and will allocate new page cache for the under layer fs
files. The issue is we must forward memcg info from the upper layer files' page
cache to the under layer files' page cache.

> > > Hmm... why do we need double forwarding (original issuer -> aio cmd ->
> > > ios) in loop?  If we need this, doesn't this mean that we're missing
> > > ownership forwarding in aio paths and should make the forwarding
> > > happen there?
> > 
> > what do you mean double forwarding?
> 
> So, this looks like the loop driver is explicitly forwarding the
> association from the original issuer to the aio command and then from
> the aio command to the ios to the underlying device.  I'm wondering
> whether the right way to do this is making aio forward the association
> by default, instead of the loop driver doing it explicitly.  That way,
> all aio users can benefit from the forwarding instead of just loop.

That's possible. The downside of doing this in aio is that we must audit all
filesystems to make sure all bios have the association. I'd like to avoid doing
this if there is no other loop-like cgroup usage.

Thanks,
Shaohua


Re: [PATCH V2 2/2] block/loop: allow request merge for directio mode

2017-08-29 Thread Shaohua Li
On Wed, Aug 30, 2017 at 10:51:21AM +0800, Ming Lei wrote:
> On Tue, Aug 29, 2017 at 08:13:39AM -0700, Shaohua Li wrote:
> > On Tue, Aug 29, 2017 at 05:56:05PM +0800, Ming Lei wrote:
> > > On Thu, Aug 24, 2017 at 12:24:53PM -0700, Shaohua Li wrote:
> > > > From: Shaohua Li <s...@fb.com>
> > > > 
> > > > Currently loop disables merge. While it makes sense for buffer IO mode,
> > > > directio mode can benefit from request merge. Without merge, loop could
> > > > send small size IO to underlayer disk and harm performance.
> > > 
> > > Hi Shaohua,
> > > 
> > > IMO no matter if merge is used, loop always sends page by page
> > > to VFS in both dio or buffer I/O.
> > 
> > Why do you think so?
> 
> do_blockdev_direct_IO() still handles page by page from iov_iter, and
> with bigger request, I guess it might be the plug merge working.

This is not true. directio sends big bios directly, not because of plug
merge. Please at least check the code before you complain.

> >  
> > > Also if merge is enabled on loop, that means merge is run
> > > on both loop and low level block driver, and not sure if we
> > > can benefit from that.
> > 
> > why does merge still happen in low level block driver?
> 
> Because scheduler is still working on low level disk. My question
> is that why the scheduler in low level disk doesn't work now
> if scheduler on loop can merge?

The low level disk can still do merge, but since this is directio, the upper
layer already dispatches requests as big as possible. There is very little
chance the requests can be merged again.

> > 
> > > 
> > > So Could you provide some performance data about this patch?
> > 
> > In my virtual machine, a workload improves from ~20M/s to ~50M/s. And I 
> > clearly
> > see the request size becomes bigger.
> 
> Could you share us what the low level disk is?

It's a SATA ssd.

Thanks,
Shaohua


Re: [RFC] block/loop: make loop cgroup aware

2017-08-29 Thread Shaohua Li
On Mon, Aug 28, 2017 at 03:54:59PM -0700, Tejun Heo wrote:
> Hello, Shaohua.
> 
> On Wed, Aug 23, 2017 at 11:15:15AM -0700, Shaohua Li wrote:
> > loop block device handles IO in a separate thread. The actual IO
> > dispatched isn't cloned from the IO loop device received, so the
> > dispatched IO loses the cgroup context.
> 
> Ah, right.  Thanks for spotting this.
> 
> > I'm ignoring buffer IO case now, which is quite complicated.  Making the
> > loop thread aware of cgroup context doesn't really help. The loop device
> > only writes to a single file. In current writeback cgroup
> > implementation, the file can only belong to one cgroup.
> 
> Yeah, writeback tracks the most active cgroup and associates writeback
> ios with that cgroup.  For buffered loop devices, I think it'd be fine
> to make the loop driver forward the cgroup association and let the
> writeback layer determine the matching association as usual.

Doing this means we must forward cgroup info to page, not bio. I need to check
if we can make the mechanism work for memcg.
 
> > For direct IO case, we could workaround the issue in theory. For
> > example, say we assign cgroup1 5M/s BW for loop device and cgroup2
> > 10M/s. We can create a special cgroup for loop thread and assign at
> > least 15M/s for the underlayer disk. In this way, we correctly throttle
> > the two cgroups. But this is tricky to setup.
> > 
> > This patch tries to address the issue. When loop thread is handling IO,
> > it declares the IO is on behalf of the original task, then in block IO
> > we use the original task to find cgroup. The concept probably works for
> > other scenarios too, but right now I don't make it generic yet.
> 
> The overall approach looks sound to me.  Some comments below.
> 
> > @@ -2058,7 +2058,10 @@ int bio_associate_current(struct bio *bio)
> >  
> > get_io_context_active(ioc);
> > bio->bi_ioc = ioc;
> > -   bio->bi_css = task_get_css(current, io_cgrp_id);
> > +   if (current->cgroup_task)
> > +   bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
> > +   else
> > +   bio->bi_css = task_get_css(current, io_cgrp_id);
> 
> Do we need a full pointer field for this?  I think we should be able
> to mark lo writers with a flag (PF or whatever) and then to
> kthread_data() to get the lo struct which contains the target css.
> Oh, let's do csses instead of tasks for consistency, correctness
> (please see below) and performance (csses are cheaper to pin).

Forwarding css sounds better.

> > @@ -502,11 +504,16 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> > loop_cmd *cmd,
> > cmd->iocb.ki_complete = lo_rw_aio_complete;
> > cmd->iocb.ki_flags = IOCB_DIRECT;
> >  
> > +   if (cmd->cgroup_task)
> > +   current->cgroup_task = cmd->cgroup_task;
> > +
> > if (rw == WRITE)
> > ret = call_write_iter(file, &cmd->iocb, &iter);
> > else
> > ret = call_read_iter(file, &cmd->iocb, &iter);
> >  
> > +   current->cgroup_task = NULL;
> > +
> > if (ret != -EIOCBQUEUED)
> > cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
> > return 0;
> 
> Hmm... why do we need double forwarding (original issuer -> aio cmd ->
> ios) in loop?  If we need this, doesn't this mean that we're missing
> ownership forwarding in aio paths and should make the forwarding
> happen there?

What do you mean by double forwarding?
> 
> > @@ -1705,6 +1712,11 @@ static blk_status_t loop_queue_rq(struct 
> > blk_mq_hw_ctx *hctx,
> > break;
> > }
> >  
> > +   if (cmd->use_aio) {
> > +   cmd->cgroup_task = current;
> > +   get_task_struct(current);
> > +   } else
> > +   cmd->cgroup_task = NULL;
> 
> What if %current isn't the original issuer of the io?  It could be a
> writeback worker trying to flush to a loop device and we don't want to
> attribute those ios to the writeback worker.  We should forward the
> bi_css not the current task.

Makes sense.
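
Something like this, then (untested sketch; cmd->css would replace the
cmd->cgroup_task field, and the helper names are just for illustration):

/*
 * Sketch: pin the css carried by the request's bio instead of %current's
 * task, so a writeback worker submitting to loop isn't charged itself.
 * Called from loop_queue_rq() before kthread_queue_work(), and from
 * lo_rw_aio_complete() respectively.
 */
static void loop_cmd_get_css(struct loop_cmd *cmd)
{
	struct bio *bio = cmd->rq->bio;

	cmd->css = NULL;
	if (cmd->use_aio && bio && bio->bi_css) {
		cmd->css = bio->bi_css;
		css_get(cmd->css);
	}
}

static void loop_cmd_put_css(struct loop_cmd *cmd)
{
	if (cmd->css) {
		css_put(cmd->css);
		cmd->css = NULL;
	}
}

The aio submission path would then hand cmd->css to the block layer when
associating the dispatched bios, instead of consulting current->cgroup_task.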

Thanks,
Shaohua


Re: [PATCH V2 2/2] block/loop: allow request merge for directio mode

2017-08-29 Thread Shaohua Li
On Tue, Aug 29, 2017 at 05:56:05PM +0800, Ming Lei wrote:
> On Thu, Aug 24, 2017 at 12:24:53PM -0700, Shaohua Li wrote:
> > From: Shaohua Li <s...@fb.com>
> > 
> > Currently loop disables merge. While it makes sense for buffer IO mode,
> > directio mode can benefit from request merge. Without merge, loop could
> > send small size IO to underlayer disk and harm performance.
> 
> Hi Shaohua,
> 
> IMO no matter if merge is used, loop always submits to the VFS page by
> page, in both dio and buffered I/O.

Why do you think so?
 
> Also if merge is enabled on loop, that means merge is run
> on both loop and low level block driver, and not sure if we
> can benefit from that.

why does merge still happen in low level block driver?

> 
> So Could you provide some performance data about this patch?

In my virtual machine, a workload improves from ~20M/s to ~50M/s. And I clearly
see the request size becomes bigger.

> > 
> > Reviewed-by: Omar Sandoval <osan...@fb.com>
> > Signed-off-by: Shaohua Li <s...@fb.com>
> > ---
> >  drivers/block/loop.c | 66 
> > 
> >  drivers/block/loop.h |  1 +
> >  2 files changed, 52 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > index 428da07..75a8f6e 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -213,10 +213,13 @@ static void __loop_update_dio(struct loop_device *lo, 
> > bool dio)
> >  */
> > blk_mq_freeze_queue(lo->lo_queue);
> > lo->use_dio = use_dio;
> > -   if (use_dio)
> > +   if (use_dio) {
> > +   queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
> > lo->lo_flags |= LO_FLAGS_DIRECT_IO;
> > -   else
> > +   } else {
> > +   queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
> > lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
> > +   }
> > blk_mq_unfreeze_queue(lo->lo_queue);
> >  }
> >  
> > @@ -464,6 +467,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
> > ret, long ret2)
> >  {
> > struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
> >  
> > +   kfree(cmd->bvec);
> > +   cmd->bvec = NULL;
> > cmd->ret = ret;
> > blk_mq_complete_request(cmd->rq);
> >  }
> > @@ -473,22 +478,50 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> > loop_cmd *cmd,
> >  {
> > struct iov_iter iter;
> > struct bio_vec *bvec;
> > -   struct bio *bio = cmd->rq->bio;
> > +   struct request *rq = cmd->rq;
> > +   struct bio *bio = rq->bio;
> > struct file *file = lo->lo_backing_file;
> > +   unsigned int offset;
> > +   int segments = 0;
> > int ret;
> >  
> > -   /* nomerge for loop request queue */
> > -   WARN_ON(cmd->rq->bio != cmd->rq->biotail);
> > +   if (rq->bio != rq->biotail) {
> > +   struct req_iterator iter;
> > +   struct bio_vec tmp;
> > +
> > +   __rq_for_each_bio(bio, rq)
> > +   segments += bio_segments(bio);
> > +   bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_KERNEL);
> 
> The allocation should have been GFP_NOIO.

Sounds good. Making this completely correct isn't easy though, since we are
calling into the underlayer fs operations, which can do allocations themselves.
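
One way to cover both this kmalloc() and the allocations done inside the
backing filesystem would be to scope the whole submission with
PF_MEMALLOC_NOIO, roughly like this (untested sketch; the wrapper is purely
illustrative and not part of the patch):

#include <linux/sched/mm.h>

/* Sketch: make every allocation during the dio submission implicitly GFP_NOIO */
static int lo_rw_aio_noio(struct loop_device *lo, struct loop_cmd *cmd,
			  loff_t pos, bool rw)
{
	unsigned int noio_flags;
	int ret;

	noio_flags = memalloc_noio_save();
	ret = lo_rw_aio(lo, cmd, pos, rw);
	memalloc_noio_restore(noio_flags);

	return ret;
}

Whether that extra restriction is worth it for loop is a separate question;
switching the bvec allocation itself to GFP_NOIO is the minimal fix.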

Thanks,
Shaohua


Re: [PATCH] block/nullb: delete unnecessary memory free

2017-08-28 Thread Shaohua Li
On Mon, Aug 28, 2017 at 02:55:59PM -0600, Jens Axboe wrote:
> On 08/28/2017 02:49 PM, Shaohua Li wrote:
> > Commit 2984c86(nullb: factor disk parameters) has a typo. The
> > nullb_device allocation/free is done outside of null_add_dev. The commit
> > accidentally frees the nullb_device in error code path.
> 
> Is the code in nullb_device_power_store() correct after this?

Yes, for runtime disk configuration, we allocate the device in
nullb_group_make_item and free the device in nullb_device_release.
nullb_device_power_store only does null_add_dev/null_del_dev; it doesn't
allocate or free the dev.

> Also, looks like the second else if in there should be using
> 'dev' instead of to_nullb_device(item)->power.

Ah, yes, it should be 'dev->power'. dev == to_nullb_device(item). Do you want
me to send a separate patch to do the cleanup?

Thanks,
Shaohua


[PATCH] block/nullb: delete unnecessary memory free

2017-08-28 Thread Shaohua Li
Commit 2984c86(nullb: factor disk parameters) has a typo. The
nullb_device allocation/free is done outside of null_add_dev. The commit
accidentally frees the nullb_device in error code path.

Reported-by: Dan Carpenter <dan.carpen...@oracle.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/null_blk.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 4d328e3..18f2cb3 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -1909,7 +1909,6 @@ static int null_add_dev(struct nullb_device *dev)
 out_free_nullb:
kfree(nullb);
 out:
-   null_free_dev(dev);
return rv;
 }
 
-- 
2.9.5



[PATCH] block/nullb: fix NULL deference

2017-08-25 Thread Shaohua Li
Dan reported this:

The patch 2984c8684f96: "nullb: factor disk parameters" from Aug 14,
2017, leads to the following Smatch complaint:

drivers/block/null_blk.c:1759 null_init_tag_set()
 error: we previously assumed 'nullb' could be null (see line
1750)

  1755  set->cmd_size   = sizeof(struct nullb_cmd);
  1756  set->flags = BLK_MQ_F_SHOULD_MERGE;
  1757  set->driver_data = NULL;
  1758
  1759  if (nullb->dev->blocking)

And an unchecked dereference.

nullb could be NULL here.

Reported-by: Dan Carpenter <dan.carpen...@oracle.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/null_blk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 2032360..4d328e3 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -1756,7 +1756,7 @@ static int null_init_tag_set(struct nullb *nullb, struct 
blk_mq_tag_set *set)
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->driver_data = NULL;
 
-   if (nullb->dev->blocking)
+   if ((nullb && nullb->dev->blocking) || g_blocking)
set->flags |= BLK_MQ_F_BLOCKING;
 
return blk_mq_alloc_tag_set(set);
-- 
2.9.5



[PATCH V2 2/2] block/loop: allow request merge for directio mode

2017-08-24 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Currently loop disables merge. While it makes sense for buffer IO mode,
directio mode can benefit from request merge. Without merge, loop could
send small size IO to underlayer disk and harm performance.

Reviewed-by: Omar Sandoval <osan...@fb.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 66 
 drivers/block/loop.h |  1 +
 2 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 428da07..75a8f6e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -213,10 +213,13 @@ static void __loop_update_dio(struct loop_device *lo, 
bool dio)
 */
blk_mq_freeze_queue(lo->lo_queue);
lo->use_dio = use_dio;
-   if (use_dio)
+   if (use_dio) {
+   queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
-   else
+   } else {
+   queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+   }
blk_mq_unfreeze_queue(lo->lo_queue);
 }
 
@@ -464,6 +467,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
+   kfree(cmd->bvec);
+   cmd->bvec = NULL;
cmd->ret = ret;
blk_mq_complete_request(cmd->rq);
 }
@@ -473,22 +478,50 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
 {
struct iov_iter iter;
struct bio_vec *bvec;
-   struct bio *bio = cmd->rq->bio;
+   struct request *rq = cmd->rq;
+   struct bio *bio = rq->bio;
struct file *file = lo->lo_backing_file;
+   unsigned int offset;
+   int segments = 0;
int ret;
 
-   /* nomerge for loop request queue */
-   WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+   if (rq->bio != rq->biotail) {
+   struct req_iterator iter;
+   struct bio_vec tmp;
+
+   __rq_for_each_bio(bio, rq)
+   segments += bio_segments(bio);
+   bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_KERNEL);
+   if (!bvec)
+   return -EIO;
+   cmd->bvec = bvec;
+
+   /*
+* The bios of the request may be started from the middle of
+* the 'bvec' because of bio splitting, so we can't directly
+* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
+* API will take care of all details for us.
+*/
+   rq_for_each_segment(tmp, rq, iter) {
+   *bvec = tmp;
+   bvec++;
+   }
+   bvec = cmd->bvec;
+   offset = 0;
+   } else {
+   /*
+* Same here, this bio may be started from the middle of the
+* 'bvec' because of bio splitting, so offset from the bvec
+* must be passed to iov iterator
+*/
+   offset = bio->bi_iter.bi_bvec_done;
+   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+   segments = bio_segments(bio);
+   }
 
-   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
- bio_segments(bio), blk_rq_bytes(cmd->rq));
-   /*
-* This bio may be started from the middle of the 'bvec'
-* because of bio splitting, so offset from the bvec must
-* be passed to iov iterator
-*/
-   iter.iov_offset = bio->bi_iter.bi_bvec_done;
+ segments, blk_rq_bytes(rq));
+   iter.iov_offset = offset;
 
cmd->iocb.ki_pos = pos;
cmd->iocb.ki_filp = file;
@@ -1800,9 +1833,12 @@ static int loop_add(struct loop_device **l, int i)
lo->lo_queue->queuedata = lo;
 
blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
+
/*
-* It doesn't make sense to enable merge because the I/O
-* submitted to backing file is handled page by page.
+* By default, we do buffer IO, so it doesn't make sense to enable
+* merge because the I/O submitted to backing file is handled page by
+* page. For directio mode, merge does help to dispatch bigger request
+* to underlayer disk. We will enable merge once directio is enabled.
 */
queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
 
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index fecd3f9..bc9cf91 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -72,6 +72,7 @@ struct loop_cmd {
bool use_aio;   /* use AIO interface to handle I/O */

[PATCH V2 0/2] block/loop: improve performance

2017-08-24 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Two small patches to improve performance for loop in directio mode. The goal
is to increase the IO size sent to underlayer disks. As Omar pointed out, the
patches conflict slightly with his, but that should be easy to fix.

Thanks,
Shaohua

Shaohua Li (2):
  block/loop: set hw_sectors
  block/loop: allow request merge for directio mode

 drivers/block/loop.c | 67 
 drivers/block/loop.h |  1 +
 2 files changed, 53 insertions(+), 15 deletions(-)

-- 
2.9.5



[PATCH V2 1/2] block/loop: set hw_sectors

2017-08-24 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Loop can handle any size of request. Limiting it to 255 sectors just burns
CPU on bio splitting and request merging for the underlayer disk, and also
causes bad fs block allocation in directio mode.

Reviewed-by: Omar Sandoval <osan...@fb.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b55a1f8..428da07 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1799,6 +1799,7 @@ static int loop_add(struct loop_device **l, int i)
}
lo->lo_queue->queuedata = lo;
 
+   blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
/*
 * It doesn't make sense to enable merge because the I/O
 * submitted to backing file is handled page by page.
-- 
2.9.5



Re: [PATCH 2/2] block/loop: allow request merge for directio mode

2017-08-24 Thread Shaohua Li
On Thu, Aug 24, 2017 at 10:57:39AM -0700, Omar Sandoval wrote:
> On Wed, Aug 23, 2017 at 04:49:24PM -0700, Shaohua Li wrote:
> > From: Shaohua Li <s...@fb.com>
> > 
> > Currently loop disables merge. While it makes sense for buffer IO mode,
> > directio mode can benefit from request merge. Without merge, loop could
> > send small size IO to underlayer disk and harm performance.
> 
> A couple of nits on comments below, besides that
> 
> Reviewed-by: Omar Sandoval <osan...@fb.com>
> 
> > Signed-off-by: Shaohua Li <s...@fb.com>
> > ---
> >  drivers/block/loop.c | 43 +++
> >  drivers/block/loop.h |  1 +
> >  2 files changed, 36 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > index 428da07..1bfa3ff 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -213,10 +213,13 @@ static void __loop_update_dio(struct loop_device *lo, 
> > bool dio)
> >  */
> > blk_mq_freeze_queue(lo->lo_queue);
> > lo->use_dio = use_dio;
> > -   if (use_dio)
> > +   if (use_dio) {
> > +   queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
> > lo->lo_flags |= LO_FLAGS_DIRECT_IO;
> > -   else
> > +   } else {
> > +   queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
> > lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
> > +   }
> 
> Could you please also update the comment in loop_add() where we set
> QUEUE_FLAG_NOMERGES to say something like
> 
>   /*
>* By default we do buffered I/O, so it doesn't make sense to
>* enable merging because the I/O submitted to backing file is
>* handled page by page. We enable merging when switching to
>* direct I/O mode.
>*/

sure, will add
 
> > blk_mq_unfreeze_queue(lo->lo_queue);
> >  }
> >  
> > @@ -464,6 +467,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
> > ret, long ret2)
> >  {
> > struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
> >  
> > +   kfree(cmd->bvec);
> > +   cmd->bvec = NULL;
> > cmd->ret = ret;
> > blk_mq_complete_request(cmd->rq);
> >  }
> > @@ -473,22 +478,44 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> > loop_cmd *cmd,
> >  {
> > struct iov_iter iter;
> > struct bio_vec *bvec;
> > -   struct bio *bio = cmd->rq->bio;
> > +   struct request *rq = cmd->rq;
> > +   struct bio *bio = rq->bio;
> > struct file *file = lo->lo_backing_file;
> > +   unsigned int offset;
> > +   int segments = 0;
> > int ret;
> >  
> > -   /* nomerge for loop request queue */
> > -   WARN_ON(cmd->rq->bio != cmd->rq->biotail);
> > +   if (rq->bio != rq->biotail) {
> > +   struct req_iterator iter;
> > +   struct bio_vec tmp;
> > +
> > +   __rq_for_each_bio(bio, rq)
> > +   segments += bio_segments(bio);
> > +   bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_KERNEL);
> > +   if (!bvec)
> > +   return -EIO;
> > +   cmd->bvec = bvec;
> > +
> > +   rq_for_each_segment(tmp, rq, iter) {
> > +   *bvec = tmp;
> > +   bvec++;
> > +   }
> > +   bvec = cmd->bvec;
> > +   offset = 0;
> > +   } else {
> > +   offset = bio->bi_iter.bi_bvec_done;
> > +   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> > +   segments = bio_segments(bio);
> > +   }
> >  
> > -   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> > iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
> > - bio_segments(bio), blk_rq_bytes(cmd->rq));
> > + segments, blk_rq_bytes(rq));
> > /*
> >  * This bio may be started from the middle of the 'bvec'
> >  * because of bio splitting, so offset from the bvec must
> >  * be passed to iov iterator
> >  */
> 
> This comment should be moved above to where you do
> 
>   offset = bio->bi_iter.bi_bvec_done;

This comment actually applies to the 'rq->bio != rq->biotail' case too, for
each bio of the request. That's why I don't just copy bio->bi_io_vec. I'll
explain this in a different way.

Thanks,
Shaohua


[PATCH 0/2] block/loop: improve performance

2017-08-23 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Two small patches to improve performance for loop in directio mode. The goal
is to increase the IO size sent to underlayer disks.

Thanks,
Shaohua

Shaohua Li (2):
  block/loop: set hw_sectors
  block/loop: allow request merge for directio mode

 drivers/block/loop.c | 44 
 drivers/block/loop.h |  1 +
 2 files changed, 37 insertions(+), 8 deletions(-)

-- 
2.9.5



[PATCH 2/2] block/loop: allow request merge for directio mode

2017-08-23 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Currently loop disables merge. While it makes sense for buffer IO mode,
directio mode can benefit from request merge. Without merge, loop could
send small size IO to underlayer disk and harm performance.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 43 +++
 drivers/block/loop.h |  1 +
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 428da07..1bfa3ff 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -213,10 +213,13 @@ static void __loop_update_dio(struct loop_device *lo, 
bool dio)
 */
blk_mq_freeze_queue(lo->lo_queue);
lo->use_dio = use_dio;
-   if (use_dio)
+   if (use_dio) {
+   queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
-   else
+   } else {
+   queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+   }
blk_mq_unfreeze_queue(lo->lo_queue);
 }
 
@@ -464,6 +467,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
+   kfree(cmd->bvec);
+   cmd->bvec = NULL;
cmd->ret = ret;
blk_mq_complete_request(cmd->rq);
 }
@@ -473,22 +478,44 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
 {
struct iov_iter iter;
struct bio_vec *bvec;
-   struct bio *bio = cmd->rq->bio;
+   struct request *rq = cmd->rq;
+   struct bio *bio = rq->bio;
struct file *file = lo->lo_backing_file;
+   unsigned int offset;
+   int segments = 0;
int ret;
 
-   /* nomerge for loop request queue */
-   WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+   if (rq->bio != rq->biotail) {
+   struct req_iterator iter;
+   struct bio_vec tmp;
+
+   __rq_for_each_bio(bio, rq)
+   segments += bio_segments(bio);
+   bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_KERNEL);
+   if (!bvec)
+   return -EIO;
+   cmd->bvec = bvec;
+
+   rq_for_each_segment(tmp, rq, iter) {
+   *bvec = tmp;
+   bvec++;
+   }
+   bvec = cmd->bvec;
+   offset = 0;
+   } else {
+   offset = bio->bi_iter.bi_bvec_done;
+   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+   segments = bio_segments(bio);
+   }
 
-   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
- bio_segments(bio), blk_rq_bytes(cmd->rq));
+ segments, blk_rq_bytes(rq));
/*
 * This bio may be started from the middle of the 'bvec'
 * because of bio splitting, so offset from the bvec must
 * be passed to iov iterator
 */
-   iter.iov_offset = bio->bi_iter.bi_bvec_done;
+   iter.iov_offset = offset;
 
cmd->iocb.ki_pos = pos;
cmd->iocb.ki_filp = file;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index fecd3f9..bc9cf91 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -72,6 +72,7 @@ struct loop_cmd {
bool use_aio;   /* use AIO interface to handle I/O */
long ret;
struct kiocb iocb;
+   struct bio_vec *bvec;
 };
 
 /* Support for loadable transfer modules */
-- 
2.9.5



[PATCH 1/2] block/loop: set hw_sectors

2017-08-23 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Loop can handle any size of request. Limiting it to 255 sectors just burns
CPU on bio splitting and request merging for the underlayer disk, and also
causes bad fs block allocation in directio mode.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/loop.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b55a1f8..428da07 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1799,6 +1799,7 @@ static int loop_add(struct loop_device **l, int i)
}
lo->lo_queue->queuedata = lo;
 
+   blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
/*
 * It doesn't make sense to enable merge because the I/O
 * submitted to backing file is handled page by page.
-- 
2.9.5



Re: [RFC] block/loop: make loop cgroup aware

2017-08-23 Thread Shaohua Li
On Wed, Aug 23, 2017 at 03:21:25PM -0400, Vivek Goyal wrote:
> On Wed, Aug 23, 2017 at 11:15:15AM -0700, Shaohua Li wrote:
> > From: Shaohua Li <s...@fb.com>
> > 
> > Not a merge request, for discussion only.
> > 
> > loop block device handles IO in a separate thread. The actual IO
> > dispatched isn't cloned from the IO loop device received, so the
> > dispatched IO loses the cgroup context.
> > 
> > I'm ignoring buffer IO case now, which is quite complicated.  Making the
> > loop thread aware of cgroup context doesn't really help. The loop device
> > only writes to a single file. In current writeback cgroup
> > implementation, the file can only belong to one cgroup.
> > 
> > For direct IO case, we could workaround the issue in theory. For
> > example, say we assign cgroup1 5M/s BW for loop device and cgroup2
> > 10M/s. We can create a special cgroup for loop thread and assign at
> > least 15M/s for the underlayer disk. In this way, we correctly throttle
> > the two cgroups. But this is tricky to setup.
> > 
> > This patch tries to address the issue. When loop thread is handling IO,
> > it declares the IO is on behalf of the original task, then in block IO
> > we use the original task to find cgroup. The concept probably works for
> > other scenarios too, but right now I don't make it generic yet.
> 
> Hi Shaohua, 
> 
> Can the worker thread switch/move to the cgroup the bio is in, do the
> submission, and then switch back? That way IO submitted by the worker
> should be accounted to the cgroup the bio belongs to. Just a thought. I
> don't even know if that's feasible.

That was my first attempt, but it looks like moving a thread to a cgroup is
an expensive operation.
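
For reference, the attach/detach approach would look roughly like this around
the dio submission (untested sketch; the wrapper name is just for
illustration, and the two cgroup migrations per request are where the cost
shows up):

#include <linux/cgroup.h>
#include <linux/sched/task.h>	/* init_task */

/*
 * Sketch: move the loop worker into the issuer's cgroups for one
 * submission, then park it back in the root cgroups.
 */
static int lo_rw_aio_in_issuer_cgroups(struct loop_device *lo,
				       struct loop_cmd *cmd, loff_t pos, bool rw)
{
	struct task_struct *issuer = cmd->cgroup_task;	/* saved at queue_rq time */
	int ret;

	if (issuer)
		cgroup_attach_task_all(issuer, current);

	ret = lo_rw_aio(lo, cmd, pos, rw);

	if (issuer)
		cgroup_attach_task_all(&init_task, current);

	return ret;
}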

Thanks,
Shaohua

> 
> Vivek
> 
> > 
> > Signed-off-by: Shaohua Li <s...@fb.com>
> > ---
> >  block/bio.c|  5 -
> >  drivers/block/loop.c   | 14 +-
> >  drivers/block/loop.h   |  1 +
> >  include/linux/blk-cgroup.h |  3 ++-
> >  include/linux/sched.h  |  1 +
> >  5 files changed, 21 insertions(+), 3 deletions(-)
> > 
> > diff --git a/block/bio.c b/block/bio.c
> > index e241bbc..8f0df3c 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -2058,7 +2058,10 @@ int bio_associate_current(struct bio *bio)
> >  
> > get_io_context_active(ioc);
> > bio->bi_ioc = ioc;
> > -   bio->bi_css = task_get_css(current, io_cgrp_id);
> > +   if (current->cgroup_task)
> > +   bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
> > +   else
> > +   bio->bi_css = task_get_css(current, io_cgrp_id);
> > return 0;
> >  }
> >  EXPORT_SYMBOL_GPL(bio_associate_current);
> > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > index ef83349..fefede3 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -77,7 +77,7 @@
> >  #include 
> >  #include 
> >  #include "loop.h"
> > -
> > +#include 
> >  #include 
> >  
> >  static DEFINE_IDR(loop_index_idr);
> > @@ -471,6 +471,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
> > ret, long ret2)
> >  {
> > struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
> >  
> > +   if (cmd->cgroup_task)
> > +   put_task_struct(cmd->cgroup_task);
> > cmd->ret = ret;
> > blk_mq_complete_request(cmd->rq);
> >  }
> > @@ -502,11 +504,16 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> > loop_cmd *cmd,
> > cmd->iocb.ki_complete = lo_rw_aio_complete;
> > cmd->iocb.ki_flags = IOCB_DIRECT;
> >  
> > +   if (cmd->cgroup_task)
> > +   current->cgroup_task = cmd->cgroup_task;
> > +
> > if (rw == WRITE)
> > ret = call_write_iter(file, &cmd->iocb, &iter);
> > else
> > ret = call_read_iter(file, &cmd->iocb, &iter);
> >  
> > +   current->cgroup_task = NULL;
> > +
> > if (ret != -EIOCBQUEUED)
> > cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
> > return 0;
> > @@ -1705,6 +1712,11 @@ static blk_status_t loop_queue_rq(struct 
> > blk_mq_hw_ctx *hctx,
> > break;
> > }
> >  
> > +   if (cmd->use_aio) {
> > +   cmd->cgroup_task = current;
> > +   get_task_struct(current);
> > +   } else
> > +   cmd->cgroup_task = NULL;
> > kthread_queue_work(&lo->worker, &cmd->work);
> >  
> > return BLK_STS_OK;

Re: [PATCH 2/6] raid5: remove a call to get_start_sect

2017-08-23 Thread Shaohua Li
On Wed, Aug 23, 2017 at 07:10:28PM +0200, Christoph Hellwig wrote:
> The block layer always remaps partitions before calling into the
> ->make_request methods of drivers.  Thus the call to get_start_sect in
> in_chunk_boundary will always return 0 and can be removed.
> 
> Signed-off-by: Christoph Hellwig <h...@lst.de>
> ---
>  drivers/md/raid5.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 0fc2748aaf95..d687aeb1b538 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -5092,10 +5092,12 @@ static int raid5_congested(struct mddev *mddev, int 
> bits)
>  static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
>  {
>   struct r5conf *conf = mddev->private;
> - sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
> + sector_t sector = bio->bi_iter.bi_sector;
>   unsigned int chunk_sectors;
>   unsigned int bio_sectors = bio_sectors(bio);
>  
> + WARN_ON_ONCE(bio->bi_partno);
> +
>   chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
>   return  chunk_sectors >=
>   ((sector & (chunk_sectors - 1)) + bio_sectors);

Reviewed-by: Shaohua Li <s...@fb.com>


[RFC] block/loop: make loop cgroup aware

2017-08-23 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Not a merge request, for discussion only.

loop block device handles IO in a separate thread. The actual IO
dispatched isn't cloned from the IO loop device received, so the
dispatched IO loses the cgroup context.

I'm ignoring buffer IO case now, which is quite complicated.  Making the
loop thread aware of cgroup context doesn't really help. The loop device
only writes to a single file. In current writeback cgroup
implementation, the file can only belong to one cgroup.

For direct IO case, we could workaround the issue in theory. For
example, say we assign cgroup1 5M/s BW for loop device and cgroup2
10M/s. We can create a special cgroup for loop thread and assign at
least 15M/s for the underlayer disk. In this way, we correctly throttle
the two cgroups. But this is tricky to setup.

This patch tries to address the issue. When loop thread is handling IO,
it declares the IO is on behalf of the original task, then in block IO
we use the original task to find cgroup. The concept probably works for
other scenarios too, but right now I don't make it generic yet.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/bio.c|  5 -
 drivers/block/loop.c   | 14 +-
 drivers/block/loop.h   |  1 +
 include/linux/blk-cgroup.h |  3 ++-
 include/linux/sched.h  |  1 +
 5 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index e241bbc..8f0df3c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2058,7 +2058,10 @@ int bio_associate_current(struct bio *bio)
 
get_io_context_active(ioc);
bio->bi_ioc = ioc;
-   bio->bi_css = task_get_css(current, io_cgrp_id);
+   if (current->cgroup_task)
+   bio->bi_css = task_get_css(current->cgroup_task, io_cgrp_id);
+   else
+   bio->bi_css = task_get_css(current, io_cgrp_id);
return 0;
 }
 EXPORT_SYMBOL_GPL(bio_associate_current);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ef83349..fefede3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,7 +77,7 @@
 #include 
 #include 
 #include "loop.h"
-
+#include 
 #include 
 
 static DEFINE_IDR(loop_index_idr);
@@ -471,6 +471,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long 
ret, long ret2)
 {
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
+   if (cmd->cgroup_task)
+   put_task_struct(cmd->cgroup_task);
cmd->ret = ret;
blk_mq_complete_request(cmd->rq);
 }
@@ -502,11 +504,16 @@ static int lo_rw_aio(struct loop_device *lo, struct 
loop_cmd *cmd,
cmd->iocb.ki_complete = lo_rw_aio_complete;
cmd->iocb.ki_flags = IOCB_DIRECT;
 
+   if (cmd->cgroup_task)
+   current->cgroup_task = cmd->cgroup_task;
+
if (rw == WRITE)
ret = call_write_iter(file, &cmd->iocb, &iter);
else
ret = call_read_iter(file, &cmd->iocb, &iter);
 
+   current->cgroup_task = NULL;
+
if (ret != -EIOCBQUEUED)
cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
return 0;
@@ -1705,6 +1712,11 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx 
*hctx,
break;
}
 
+   if (cmd->use_aio) {
+   cmd->cgroup_task = current;
+   get_task_struct(current);
+   } else
+   cmd->cgroup_task = NULL;
kthread_queue_work(&lo->worker, &cmd->work);
 
return BLK_STS_OK;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 2c096b9..eb98d4d 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -73,6 +73,7 @@ struct loop_cmd {
bool use_aio;   /* use AIO interface to handle I/O */
long ret;
struct kiocb iocb;
+   struct task_struct *cgroup_task;
 };
 
 /* Support for loadable transfer modules */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 9d92153..38a5517 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -232,7 +232,8 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
if (bio && bio->bi_css)
return css_to_blkcg(bio->bi_css);
-   return task_blkcg(current);
+   return task_blkcg(current->cgroup_task ?
+   current->cgroup_task : current);
 }
 
 static inline struct cgroup_subsys_state *
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8337e2d..a5958b0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -897,6 +897,7 @@ struct task_struct {
struct css_set __rcu*cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_headcg_list;
+   struct task_struct  *cgroup_task;
 #endif
 #ifdef CONFIG_INTEL_RDT_A
int closid;
-- 
2.9.5



[PATCH V2] blk-throttle: cap discard request size

2017-08-18 Thread Shaohua Li
A discard request is usually very big and easily uses up all the bandwidth
budget of a cgroup. The discard request size doesn't really reflect the
amount of data written, so it doesn't make sense to account it against the
bandwidth budget. Jens pointed out that treating the size as 0 doesn't make
sense either, because a discard request does have a cost. But it's not easy
to find the actual cost. This patch simply accounts each discard as one
sector.

Signed-off-by: Shaohua Li <s...@fb.com>
---
 block/blk-throttle.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6a4c4c4..0fea76a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -380,6 +380,14 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, 
int rw)
}   \
 } while (0)
 
+static inline unsigned int throtl_bio_data_size(struct bio *bio)
+{
+   /* assume it's one sector */
+   if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
+   return 512;
+   return bio->bi_iter.bi_size;
+}
+
 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
 {
INIT_LIST_HEAD(&qn->node);
@@ -932,6 +940,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, 
struct bio *bio,
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+   unsigned int bio_size = throtl_bio_data_size(bio);
 
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 
@@ -945,14 +954,14 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, 
struct bio *bio,
do_div(tmp, HZ);
bytes_allowed = tmp;
 
-   if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
+   if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
if (wait)
*wait = 0;
return true;
}
 
/* Calc approx time to dispatch */
-   extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
+   extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
 
if (!jiffy_wait)
@@ -1032,11 +1041,12 @@ static bool tg_may_dispatch(struct throtl_grp *tg, 
struct bio *bio,
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
bool rw = bio_data_dir(bio);
+   unsigned int bio_size = throtl_bio_data_size(bio);
 
/* Charge the bio to the group */
-   tg->bytes_disp[rw] += bio->bi_iter.bi_size;
+   tg->bytes_disp[rw] += bio_size;
tg->io_disp[rw]++;
-   tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+   tg->last_bytes_disp[rw] += bio_size;
tg->last_io_disp[rw]++;
 
/*
-- 
2.9.5



Re: [PATCH] blk-throttle: ignore discard request size

2017-08-18 Thread Shaohua Li
On Fri, Aug 18, 2017 at 01:15:15PM -0600, Jens Axboe wrote:
> On 08/18/2017 01:12 PM, Shaohua Li wrote:
> > On Fri, Aug 18, 2017 at 01:06:46PM -0600, Jens Axboe wrote:
> >> On 08/18/2017 10:28 AM, Shaohua Li wrote:
> >>> On Fri, Aug 18, 2017 at 09:35:01AM -0600, Jens Axboe wrote:
> >>>> On 08/18/2017 09:13 AM, Shaohua Li wrote:
> >>>>> discard request usually is very big and easily use all bandwidth budget
> >>>>> of a cgroup. discard request size doesn't really mean the size of data
> >>>>> written, so it doesn't make sense to account it into bandwidth budget.
> >>>>> This patch ignores discard requests size. It makes sense to account
> >>>>> discard request into iops budget though.
> >>>>
> >> Some (most) devices do touch media for a discard operation, but the
> >>>> cost tends to be fairly constant and independent of discard size.
> >>>> Would it make sense to just treat it as a constant cost? Zero
> >>>> cost seems wrong.
> >>>
> >>> that would be hard to find the cost. Would this make sense?
> >>>
> >>> min_t(unsigned int, bio->bi_iter.bi_size, queue_max_sectors(q) << 9)
> >>
> >> It's all going to be approximations, for sure, unfortunately it isn't
> >> an exact science. Why not just use a constant small value? If we assume
> >> that a 4k and 8M discard end up writing roughly the same to media, then
> >> it would follow that just using a smaller constant value (regardless of
> >> actual discard command size) would be useful.
> > 
> > Sounds good. what number do you suggest? queue_max_sectors or a
> > random number?
> 
> Not sure why you want to go that large? Isn't the idea to throttle on
> actual device bandwidth used? In which case a much smaller number should
> be a lot closer to reality, say like 64 bytes per discard, regardless
> of actual size. That still gives you some throttling instead of just
> ignoring it, but at a more reasonable rate.

Hmm, my guess is that discard is more costly than a normal write in some
drivers, but that's just a guess. I'll make it 512B then, to make sure
nothing blows up.

Thanks,
Shaohua


Re: [PATCH] blk-throttle: ignore discard request size

2017-08-18 Thread Shaohua Li
On Fri, Aug 18, 2017 at 01:06:46PM -0600, Jens Axboe wrote:
> On 08/18/2017 10:28 AM, Shaohua Li wrote:
> > On Fri, Aug 18, 2017 at 09:35:01AM -0600, Jens Axboe wrote:
> >> On 08/18/2017 09:13 AM, Shaohua Li wrote:
> >>> discard request usually is very big and easily use all bandwidth budget
> >>> of a cgroup. discard request size doesn't really mean the size of data
> >>> written, so it doesn't make sense to account it into bandwidth budget.
> >>> This patch ignores discard requests size. It makes sense to account
> >>> discard request into iops budget though.
> >>
> >> Some (most) devices do touch media for a discard operation, but the
> >> cost tends to be fairly constant and independent of discard size.
> >> Would it make sense to just treat it as a constant cost? Zero
> >> cost seems wrong.
> > 
> > that would be hard to find the cost. Would this make sense?
> > 
> > min_t(unsigned int, bio->bi_iter.bi_size, queue_max_sectors(q) << 9)
> 
> It's all going to be approximations, for sure, unfortunately it isn't
> an exact science. Why not just use a constant small value? If we assume
> that a 4k and 8M discard end up writing roughly the same to media, then
> it would follow that just using a smaller constant value (regardless of
> actual discard command size) would be useful.

Sounds good. What number do you suggest? queue_max_sectors or a
random number?

Thanks,
Shaohua


Re: [PATCH] blk-throttle: ignore discard request size

2017-08-18 Thread Shaohua Li
On Fri, Aug 18, 2017 at 09:35:01AM -0600, Jens Axboe wrote:
> On 08/18/2017 09:13 AM, Shaohua Li wrote:
> > discard request usually is very big and easily use all bandwidth budget
> > of a cgroup. discard request size doesn't really mean the size of data
> > written, so it doesn't make sense to account it into bandwidth budget.
> > This patch ignores discard requests size. It makes sense to account
> > discard request into iops budget though.
> 
> Some (most) devices do touch media for a discard operation, but the
> cost tends to be fairly constant and independent of discard size.
> Would it make sense to just treat it as a constant cost? Zero
> cost seems wrong.

that would be hard to find the cost. Would this make sense?

min_t(unsigned int, bio->bi_iter.bi_size, queue_max_sectors(q) << 9)
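
For concreteness, that would slot into a helper roughly like this (untested
sketch; the V2 patch above ends up charging a fixed 512-byte cost instead,
and the tg parameter here exists only to reach the queue):

/* Sketch: charge a discard at most the queue's max request size */
static inline unsigned int throtl_bio_data_size(struct throtl_grp *tg,
						struct bio *bio)
{
	struct request_queue *q = tg->td->queue;

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
		return min_t(unsigned int, bio->bi_iter.bi_size,
			     queue_max_sectors(q) << 9);

	return bio->bi_iter.bi_size;
}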


[PATCH V2 6/9] nullb: support discard

2017-08-14 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

Discard makes sense for a memory backed disk. It is also useful to test
whether the upper layer supports discard correctly.

The user configures the 'discard' attribute to enable/disable discard support.

Based on original patch from Kyungchan Koh

Signed-off-by: Kyungchan Koh <kkc6...@fb.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/null_blk.c | 42 +-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 45e0b56..1f3cf25 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -88,6 +88,7 @@ struct nullb_device {
bool use_per_node_hctx; /* use per-node allocation for hardware context 
*/
bool power; /* power on/off the device */
bool memory_backed; /* if data is stored in memory */
+   bool discard; /* if support discard */
 };
 
 struct nullb {
@@ -318,6 +319,7 @@ NULLB_DEVICE_ATTR(use_lightnvm, bool);
 NULLB_DEVICE_ATTR(blocking, bool);
 NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
 NULLB_DEVICE_ATTR(memory_backed, bool);
+NULLB_DEVICE_ATTR(discard, bool);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@ -373,6 +375,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_use_per_node_hctx,
&nullb_device_attr_power,
&nullb_device_attr_memory_backed,
+   &nullb_device_attr_discard,
NULL,
 };
 
@@ -425,7 +428,7 @@ nullb_group_drop_item(struct config_group *group, struct 
config_item *item)
 
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
-   return snprintf(page, PAGE_SIZE, "memory_backed\n");
+   return snprintf(page, PAGE_SIZE, "memory_backed,discard\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
@@ -815,6 +818,20 @@ static int copy_from_nullb(struct nullb *nullb, struct 
page *dest,
return 0;
 }
 
+static void null_handle_discard(struct nullb *nullb, sector_t sector, size_t n)
+{
+   size_t temp;
+
+   spin_lock_irq(&nullb->lock);
+   while (n > 0) {
+   temp = min_t(size_t, n, nullb->dev->blocksize);
+   null_free_sector(nullb, sector);
+   sector += temp >> SECTOR_SHIFT;
+   n -= temp;
+   }
+   spin_unlock_irq(&nullb->lock);
+}
+
 static int null_transfer(struct nullb *nullb, struct page *page,
unsigned int len, unsigned int off, bool is_write, sector_t sector)
 {
@@ -843,6 +860,11 @@ static int null_handle_rq(struct nullb_cmd *cmd)
 
sector = blk_rq_pos(rq);
 
+   if (req_op(rq) == REQ_OP_DISCARD) {
+   null_handle_discard(nullb, sector, blk_rq_bytes(rq));
+   return 0;
+   }
+
spin_lock_irq(&nullb->lock);
rq_for_each_segment(bvec, rq, iter) {
len = bvec.bv_len;
@@ -871,6 +893,12 @@ static int null_handle_bio(struct nullb_cmd *cmd)
 
sector = bio->bi_iter.bi_sector;
 
+   if (bio_op(bio) == REQ_OP_DISCARD) {
+   null_handle_discard(nullb, sector,
+   bio_sectors(bio) << SECTOR_SHIFT);
+   return 0;
+   }
+
spin_lock_irq(&nullb->lock);
bio_for_each_segment(bvec, bio, iter) {
len = bvec.bv_len;
@@ -1207,6 +1235,16 @@ static void null_del_dev(struct nullb *nullb)
dev->nullb = NULL;
 }
 
+static void null_config_discard(struct nullb *nullb)
+{
+   if (nullb->dev->discard == false)
+   return;
+   nullb->q->limits.discard_granularity = nullb->dev->blocksize;
+   nullb->q->limits.discard_alignment = nullb->dev->blocksize;
+   blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
+   queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nullb->q);
+}
+
 static int null_open(struct block_device *bdev, fmode_t mode)
 {
return 0;
@@ -1448,6 +1486,8 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_logical_block_size(nullb->q, dev->blocksize);
blk_queue_physical_block_size(nullb->q, dev->blocksize);
 
+   null_config_discard(nullb);
+
sprintf(nullb->disk_name, "nullb%d", nullb->index);
 
if (dev->use_lightnvm)
-- 
2.9.5



[PATCH V2 5/9] nullb: support memory backed store

2017-08-14 Thread Shaohua Li
From: Shaohua Li <s...@fb.com>

This adds memory backed store in nullb.

The user configures the 'memory_backed' attribute for this. By default, a
nullb disk doesn't use the memory backed store.

Based on original patch from Kyungchan Koh

Signed-off-by: Kyungchan Koh <kkc6...@fb.com>
Signed-off-by: Shaohua Li <s...@fb.com>
---
 drivers/block/null_blk.c | 339 +--
 1 file changed, 330 insertions(+), 9 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 2f66627..45e0b56 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -15,6 +15,14 @@
 #include 
 #include 
 
+#define SECTOR_SHIFT   9
+#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS   (1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_SIZE(1 << SECTOR_SHIFT)
+#define SECTOR_MASK(PAGE_SECTORS - 1)
+
+#define FREE_BATCH 16
+
 struct nullb_cmd {
struct list_head list;
struct llist_node ll_list;
@@ -24,6 +32,7 @@ struct nullb_cmd {
unsigned int tag;
struct nullb_queue *nq;
struct hrtimer timer;
+   blk_status_t error;
 };
 
 struct nullb_queue {
@@ -46,9 +55,23 @@ enum nullb_device_flags {
NULLB_DEV_FL_UP = 1,
 };
 
+/*
+ * nullb_page is a page in memory for nullb devices.
+ *
+ * @page:  The page holding the data.
+ * @bitmap:The bitmap represents which sector in the page has data.
+ * Each bit represents one block size. For example, sector 8
+ * will use the 7th bit
+ */
+struct nullb_page {
+   struct page *page;
+   unsigned long bitmap;
+};
+
 struct nullb_device {
struct nullb *nullb;
struct config_item item;
+   struct radix_tree_root data; /* data stored in the disk */
unsigned long flags; /* device flags */
 
unsigned long size; /* device size in MB */
@@ -64,6 +87,7 @@ struct nullb_device {
bool blocking; /* blocking blk-mq device */
bool use_per_node_hctx; /* use per-node allocation for hardware context 
*/
bool power; /* power on/off the device */
+   bool memory_backed; /* if data is stored in memory */
 };
 
 struct nullb {
@@ -197,6 +221,7 @@ static struct nullb_device *null_alloc_dev(void);
 static void null_free_dev(struct nullb_device *dev);
 static void null_del_dev(struct nullb *nullb);
 static int null_add_dev(struct nullb_device *dev);
+static void null_free_device_storage(struct nullb_device *dev);
 
 static inline struct nullb_device *to_nullb_device(struct config_item *item)
 {
@@ -292,6 +317,7 @@ NULLB_DEVICE_ATTR(index, uint);
 NULLB_DEVICE_ATTR(use_lightnvm, bool);
 NULLB_DEVICE_ATTR(blocking, bool);
 NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
+NULLB_DEVICE_ATTR(memory_backed, bool);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@ -346,12 +372,16 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_blocking,
&nullb_device_attr_use_per_node_hctx,
&nullb_device_attr_power,
+   &nullb_device_attr_memory_backed,
NULL,
 };
 
 static void nullb_device_release(struct config_item *item)
 {
-   null_free_dev(to_nullb_device(item));
+   struct nullb_device *dev = to_nullb_device(item);
+
+   null_free_device_storage(dev);
+   null_free_dev(dev);
 }
 
 static struct configfs_item_operations nullb_device_ops = {
@@ -395,7 +425,7 @@ nullb_group_drop_item(struct config_group *group, struct 
config_item *item)
 
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
-   return snprintf(page, PAGE_SIZE, "\n");
+   return snprintf(page, PAGE_SIZE, "memory_backed\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
@@ -432,6 +462,7 @@ static struct nullb_device *null_alloc_dev(void)
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return NULL;
+   INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
dev->size = g_gb * 1024;
dev->completion_nsec = g_completion_nsec;
dev->submit_queues = g_submit_queues;
@@ -532,13 +563,14 @@ static void end_cmd(struct nullb_cmd *cmd)
 
switch (queue_mode)  {
case NULL_Q_MQ:
-   blk_mq_end_request(cmd->rq, BLK_STS_OK);
+   blk_mq_end_request(cmd->rq, cmd->error);
return;
case NULL_Q_RQ:
INIT_LIST_HEAD(&cmd->rq->queuelist);
-   blk_end_request_all(cmd->rq, BLK_STS_OK);
+   blk_end_request_all(cmd->rq, cmd->error);
break;
case NULL_Q_BIO:
+   cmd->bio->bi_status = cmd->error;
bio_endio(cmd->bio);
break;
}
@@ -579,12 +611,297 @@ static void null_softirq_done_fn(struct request *rq)
end_cmd(rq->special);
 }
 
-static inline void null_handle_cmd
