Re: [PATCH 5/6] genhd: Fix BUG in blkdev_open()

2018-02-12 Thread Hou Tao
Hi Jan,

On 2018/2/7 0:05, Jan Kara wrote:
> When two blkdev_open() calls for a partition race with device removal
> and recreation, we can hit BUG_ON(!bd_may_claim(bdev, whole, holder)) in
> blkdev_open(). The race can happen as follows:
> 
> CPU0                          CPU1                        CPU2
>   del_gendisk()
> 
> bdev_unhash_inode(part1);
> 
> blkdev_open(part1, O_EXCL)        blkdev_open(part1, O_EXCL)
>   bdev = bd_acquire()   bdev = bd_acquire()
>   blkdev_get(bdev)
> bd_start_claiming(bdev)
>   - finds old inode 'whole'
>   bd_prepare_to_claim() -> 0
> 
> bdev_unhash_inode(whole);
>   <device removed>
>   <new device under same
>    number created>
> blkdev_get(bdev);
>   bd_start_claiming(bdev)
> - finds new inode 'whole'
> bd_prepare_to_claim()
>   - this also succeeds as we have
> different 'whole' here...
>   - bad things happen now as we
> have two exclusive openers of
> the same bdev
> 
> The problem here is that block device opens can see various intermediate
> states while gendisk is shutting down and then being recreated.
> 
> We fix the problem by introducing new lookup_sem in gendisk that
> synchronizes gendisk deletion with get_gendisk() and furthermore by
> making sure that get_gendisk() does not return gendisk that is being (or
> has been) deleted. This makes sure that once we ever manage to look up
> newly created bdev inode, we are also guaranteed that following
> get_gendisk() will either return failure (and we fail open) or it
> returns gendisk for the new device and following bdget_disk() will
> return new bdev inode (i.e., blkdev_open() follows the path as if it is
> completely run after new device is created).
> 
> Reported-and-analyzed-by: Hou Tao <hout...@huawei.com>
> Signed-off-by: Jan Kara <j...@suse.cz>
> ---
>  block/genhd.c | 21 ++++++++++++++++++++-
>  include/linux/genhd.h |  1 +
>  2 files changed, 21 insertions(+), 1 deletion(-)
> 

Before applying the patch set, the BUG_ON() in blkdev_open() could be reproduced
in about 10 minutes or less. After applying the patch set and running the test
for about 8 hours or more, the bug is no longer reproducible, so

Tested-by: Hou Tao <hout...@huawei.com>

Based on the test result, it seems that this patch alone cannot fix the BUG_ON()
in blkdev_open(). Patch 6 is also needed to fix the problem.

Regards,
Tao

> diff --git a/block/genhd.c b/block/genhd.c
> index 64c323549a22..c6f68c332bfe 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -703,6 +703,11 @@ void del_gendisk(struct gendisk *disk)
>   blk_integrity_del(disk);
>   disk_del_events(disk);
>  
> + /*
> +  * Block lookups of the disk until all bdevs are unhashed and the
> +  * disk is marked as dead (GENHD_FL_UP cleared).
> +  */
> + down_write(&disk->lookup_sem);
>   /* invalidate stuff */
>   disk_part_iter_init(&piter, disk,
>DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
> @@ -717,6 +722,7 @@ void del_gendisk(struct gendisk *disk)
>   bdev_unhash_inode(disk_devt(disk));
>   set_capacity(disk, 0);
>   disk->flags &= ~GENHD_FL_UP;
> + up_write(&disk->lookup_sem);
>  
>   if (!(disk->flags & GENHD_FL_HIDDEN))
>   sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
> @@ -801,9 +807,21 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
>   spin_unlock_bh(&ext_devt_lock);
>   }
>  
> - if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) {
> + if (!disk)
> + return NULL;
> +
> + /*
> +  * Synchronize with del_gendisk() to not return disk that is being
> +  * destroyed.
> +  */
> + down_read(&disk->lookup_sem);
> + if (unlikely((disk->flags & GENHD_FL_HIDDEN) ||
> +  !(disk->flags & GENHD_FL_UP))) {
> + up_read(&disk->lookup_sem);
>   put_disk_and_module(disk);
>   disk = NULL;
> + } else {
> + up_read(&disk->lookup_sem);
>   }
>   return disk;
>  }
> @@ -1403,6 +1421,7 @@ stru

Re: [PATCH 4/6] genhd: Fix use after free in __blkdev_get()

2018-02-12 Thread Hou Tao
Hi Jan,

On 2018/2/7 0:05, Jan Kara wrote:
> When two blkdev_open() calls race with device removal and recreation,
> __blkdev_get() can use looked up gendisk after it is freed:
> 
> CPU0                          CPU1                        CPU2
>   del_gendisk(disk);
> 
> bdev_unhash_inode(inode);
> blkdev_open() blkdev_open()
>   bdev = bd_acquire(inode);
> - creates and returns new inode
> bdev = bd_acquire(inode);
>   - returns the same inode
>   __blkdev_get(devt)        __blkdev_get(devt)
> disk = get_gendisk(devt);
>   - got structure of device going away
>   
>   <new device created under the same
>    device number>
> disk = get_gendisk(devt);
>   - got new device structure
> if (!bdev->bd_openers) {
>   does the first open
> }
> if (!bdev->bd_openers)
>   - false
> } else {
>   put_disk_and_module(disk)
> - remember this was old device - this was last ref and disk is
>   now freed
> }
> disk_unblock_events(disk); -> oops
> 
> Fix the problem by making sure we drop reference to disk in
> __blkdev_get() only after we are really done with it.
> 
> Reported-by: Hou Tao <hout...@huawei.com>
> Signed-off-by: Jan Kara <j...@suse.cz>
> ---
>  fs/block_dev.c | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 

After applying the patch, the use-after-free problem is gone, so

Tested-by: Hou Tao <hout...@huawei.com>

> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 1dbbf847911a..fe41a76769fa 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1409,6 +1409,7 @@ static int __blkdev_get(struct block_device *bdev, 
> fmode_t mode, int for_part)
>   int ret;
>   int partno;
>   int perm = 0;
> + bool first_open = false;
>  
>   if (mode & FMODE_READ)
>   perm |= MAY_READ;
> @@ -1435,6 +1436,7 @@ static int __blkdev_get(struct block_device *bdev, 
> fmode_t mode, int for_part)
>   disk_block_events(disk);
>   mutex_lock_nested(&bdev->bd_mutex, for_part);
>   if (!bdev->bd_openers) {
> + first_open = true;
>   bdev->bd_disk = disk;
>   bdev->bd_queue = disk->queue;
>   bdev->bd_contains = bdev;
> @@ -1520,14 +1522,15 @@ static int __blkdev_get(struct block_device *bdev, 
> fmode_t mode, int for_part)
>   if (ret)
>   goto out_unlock_bdev;
>   }
> - /* only one opener holds refs to the module and disk */
> - put_disk_and_module(disk);
>   }
>   bdev->bd_openers++;
>   if (for_part)
>   bdev->bd_part_count++;
>   mutex_unlock(&bdev->bd_mutex);
>   disk_unblock_events(disk);
> + /* only one opener holds refs to the module and disk */
> + if (!first_open)
> + put_disk_and_module(disk);
>   return 0;
>  
>   out_clear:
> 



Re: [PATCH] blktrace: support enabling cgroup info per-device

2018-01-22 Thread Hou Tao
Hi Jens,

Could you please look at this patch and the related patch set for blktrace [1],
and give some feedback?

Regards,
Tao

[1]: https://www.spinics.net/lists/linux-btrace/msg00790.html

On 2018/1/17 14:10, Hou Tao wrote:
> Hi Jens,
> 
> Any comments on this patch and the related patch set for blktrace [1] ?
> 
> Regards,
> Tao
> 
> [1]: https://www.spinics.net/lists/linux-btrace/msg00790.html
> 
> On 2018/1/11 12:09, Hou Tao wrote:
>> Now blktrace supports outputting cgroup info for trace action and
>> trace message, however, it can only be enabled globally by writing
>> "blk_cgroup" to trace_options file, and there is no per-device API
>> for the new functionality.
>>
>> Adding a new field (enable_cg_info) by using the pad after act_mask
>> in struct blk_user_trace_setup and a new attr file (cgroup_info) under
>> /sys/block/$dev/trace dir, so BLKTRACESETUP ioctl and sysfs file
>> can be used to enable cgroup info for selected block devices.
>>
>> Signed-off-by: Hou Tao <hout...@huawei.com>
>> ---
>>  include/linux/blktrace_api.h  |  2 ++
>>  include/uapi/linux/blktrace_api.h |  1 +
>>  kernel/trace/blktrace.c   | 14 ++++++++++++--
>>  3 files changed, 15 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
>> index 8804753..f120c6a 100644
>> --- a/include/linux/blktrace_api.h
>> +++ b/include/linux/blktrace_api.h
>> @@ -18,6 +18,7 @@ struct blk_trace {
>>  unsigned long __percpu *sequence;
>>  unsigned char __percpu *msg_data;
>>  u16 act_mask;
>> +bool enable_cg_info;
>>  u64 start_lba;
>>  u64 end_lba;
>>  u32 pid;
>> @@ -102,6 +103,7 @@ static inline int blk_trace_init_sysfs(struct device 
>> *dev)
>>  struct compat_blk_user_trace_setup {
>>  char name[BLKTRACE_BDEV_SIZE];
>>  u16 act_mask;
>> +u8  enable_cg_info;
>>  u32 buf_size;
>>  u32 buf_nr;
>>  compat_u64 start_lba;
>> diff --git a/include/uapi/linux/blktrace_api.h 
>> b/include/uapi/linux/blktrace_api.h
>> index 20d1490d..d9d9fca 100644
>> --- a/include/uapi/linux/blktrace_api.h
>> +++ b/include/uapi/linux/blktrace_api.h
>> @@ -136,6 +136,7 @@ enum {
>>  struct blk_user_trace_setup {
>>  char name[BLKTRACE_BDEV_SIZE];  /* output */
>>  __u16 act_mask; /* input */
>> +__u8  enable_cg_info;   /* input */
>>  __u32 buf_size; /* input */
>>  __u32 buf_nr;   /* input */
>>  __u64 start_lba;
>> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
>> index 987d9a9a..f420400 100644
>> --- a/kernel/trace/blktrace.c
>> +++ b/kernel/trace/blktrace.c
>> @@ -180,7 +180,8 @@ void __trace_note_message(struct blk_trace *bt, struct 
>> blkcg *blkcg,
>>  n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
>>  va_end(args);
>>  
>> -if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
>> +if (!((blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP) ||
>> +  bt->enable_cg_info))
>>  blkcg = NULL;
>>  #ifdef CONFIG_BLK_CGROUP
>>  trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
>> @@ -549,6 +550,7 @@ static int do_blk_trace_setup(struct request_queue *q, 
>> char *name, dev_t dev,
>>  bt->act_mask = buts->act_mask;
>>  if (!bt->act_mask)
>>  bt->act_mask = (u16) -1;
>> +bt->enable_cg_info = buts->enable_cg_info;
>>  
>>  blk_trace_setup_lba(bt, bdev);
>>  
>> @@ -625,6 +627,7 @@ static int compat_blk_trace_setup(struct request_queue 
>> *q, char *name,
>>  
>>  buts = (struct blk_user_trace_setup) {
>>  .act_mask = cbuts.act_mask,
>> +.enable_cg_info = cbuts.enable_cg_info,
>>  .buf_size = cbuts.buf_size,
>>  .buf_nr = cbuts.buf_nr,
>>  .start_lba = cbuts.start_lba,
>> @@ -773,7 +776,8 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct 
>> bio *bio)
>>  {
>>  struct blk_trace *bt = q->blk_trace;
>>  
>> -if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
>> +if (!(bt && (bt->enable_cg_info ||
>> +  (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))))
>>  return NULL;
>>  
>>  if (!bio->bi_css)
>> @@ -1664,6 +1668,7 @@ static BLK_TRACE_DEVICE_ATTR(act_mask);
>>  static BLK_TRACE_DEVICE_ATTR(pid);
>>  stat

Re: [PATCH] blktrace: support enabling cgroup info per-device

2018-01-16 Thread Hou Tao
Hi Jens,

Any comments on this patch and the related patch set for blktrace [1] ?

Regards,
Tao

[1]: https://www.spinics.net/lists/linux-btrace/msg00790.html

On 2018/1/11 12:09, Hou Tao wrote:
> Now blktrace supports outputting cgroup info for trace action and
> trace message, however, it can only be enabled globally by writing
> "blk_cgroup" to trace_options file, and there is no per-device API
> for the new functionality.
> 
> Adding a new field (enable_cg_info) by using the pad after act_mask
> in struct blk_user_trace_setup and a new attr file (cgroup_info) under
> /sys/block/$dev/trace dir, so BLKTRACESETUP ioctl and sysfs file
> can be used to enable cgroup info for selected block devices.
> 
> Signed-off-by: Hou Tao <hout...@huawei.com>
> ---
>  include/linux/blktrace_api.h  |  2 ++
>  include/uapi/linux/blktrace_api.h |  1 +
>  kernel/trace/blktrace.c   | 14 ++++++++++++--
>  3 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
> index 8804753..f120c6a 100644
> --- a/include/linux/blktrace_api.h
> +++ b/include/linux/blktrace_api.h
> @@ -18,6 +18,7 @@ struct blk_trace {
>   unsigned long __percpu *sequence;
>   unsigned char __percpu *msg_data;
>   u16 act_mask;
> + bool enable_cg_info;
>   u64 start_lba;
>   u64 end_lba;
>   u32 pid;
> @@ -102,6 +103,7 @@ static inline int blk_trace_init_sysfs(struct device *dev)
>  struct compat_blk_user_trace_setup {
>   char name[BLKTRACE_BDEV_SIZE];
>   u16 act_mask;
> + u8  enable_cg_info;
>   u32 buf_size;
>   u32 buf_nr;
>   compat_u64 start_lba;
> diff --git a/include/uapi/linux/blktrace_api.h 
> b/include/uapi/linux/blktrace_api.h
> index 20d1490d..d9d9fca 100644
> --- a/include/uapi/linux/blktrace_api.h
> +++ b/include/uapi/linux/blktrace_api.h
> @@ -136,6 +136,7 @@ enum {
>  struct blk_user_trace_setup {
>   char name[BLKTRACE_BDEV_SIZE];  /* output */
>   __u16 act_mask; /* input */
> + __u8  enable_cg_info;   /* input */
>   __u32 buf_size; /* input */
>   __u32 buf_nr;   /* input */
>   __u64 start_lba;
> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
> index 987d9a9a..f420400 100644
> --- a/kernel/trace/blktrace.c
> +++ b/kernel/trace/blktrace.c
> @@ -180,7 +180,8 @@ void __trace_note_message(struct blk_trace *bt, struct 
> blkcg *blkcg,
>   n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
>   va_end(args);
>  
> - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
> + if (!((blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP) ||
> +   bt->enable_cg_info))
>   blkcg = NULL;
>  #ifdef CONFIG_BLK_CGROUP
>   trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
> @@ -549,6 +550,7 @@ static int do_blk_trace_setup(struct request_queue *q, 
> char *name, dev_t dev,
>   bt->act_mask = buts->act_mask;
>   if (!bt->act_mask)
>   bt->act_mask = (u16) -1;
> + bt->enable_cg_info = buts->enable_cg_info;
>  
>   blk_trace_setup_lba(bt, bdev);
>  
> @@ -625,6 +627,7 @@ static int compat_blk_trace_setup(struct request_queue 
> *q, char *name,
>  
>   buts = (struct blk_user_trace_setup) {
>   .act_mask = cbuts.act_mask,
> + .enable_cg_info = cbuts.enable_cg_info,
>   .buf_size = cbuts.buf_size,
>   .buf_nr = cbuts.buf_nr,
>   .start_lba = cbuts.start_lba,
> @@ -773,7 +776,8 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct 
> bio *bio)
>  {
>   struct blk_trace *bt = q->blk_trace;
>  
> - if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
> + if (!(bt && (bt->enable_cg_info ||
> +  (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))))
>   return NULL;
>  
>   if (!bio->bi_css)
> @@ -1664,6 +1668,7 @@ static BLK_TRACE_DEVICE_ATTR(act_mask);
>  static BLK_TRACE_DEVICE_ATTR(pid);
>  static BLK_TRACE_DEVICE_ATTR(start_lba);
>  static BLK_TRACE_DEVICE_ATTR(end_lba);
> +static BLK_TRACE_DEVICE_ATTR(cgroup_info);
>  
>  static struct attribute *blk_trace_attrs[] = {
>   &dev_attr_enable.attr,
> @@ -1671,6 +1676,7 @@ static struct attribute *blk_trace_attrs[] = {
>   &dev_attr_pid.attr,
>   &dev_attr_start_lba.attr,
>   &dev_attr_end_lba.attr,
> + &dev_attr_cgroup_info.attr,
>   NULL
>  };
>  
> @@ -1794,6 +1800,8 @@ static ssize_t sysfs_blk_trace_attr_show(struct device 
> *dev,
>   ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
&g

Re: blkdev loop UAF

2018-01-11 Thread Hou Tao
Hi,

On 2018/1/11 16:24, Dan Carpenter wrote:
> Thanks for your report and the patch.  I am sending it to the
> linux-block devs since it's already public.
> 
> regards,
> dan carpenter

The User-after-free problem is not specific for loop device, it can also
be reproduced on scsi device, and there are more race problems caused by
the race between bdev open and gendisk shutdown [1].

The cause of the UAF problem is that there are two instances of gendisk which 
share
the same bdev. After the process owning the new gendisk increases 
bdev->bd_openers,
the other process which owns the older gendisk will find bdev->bd_openers is 
not zero
and will put the last reference of the older gendisk and cause User-after-free.

I had proposed a patch for the problem, but it's still an incomplete fix for 
the race
between gendisk shutdown and bdev opening.

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4a181fc..5ecdb9f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1510,6 +1510,11 @@ static int __blkdev_get(struct block_device *bdev, 
fmode_t mode, int for_part)
if (bdev->bd_bdi == &noop_backing_dev_info)
bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
} else {
+   if (bdev->bd_disk != disk) {
+   ret = -ENXIO;
+   goto out_unlock_bdev;
+   }
+
if (bdev->bd_contains == bdev) {
ret = 0;
if (bdev->bd_disk->fops->open)


As far as I know, Jan Kara is working on these problems. So, Jan, any
suggestions?

Regards
Tao

[1]: https://www.spinics.net/lists/linux-block/msg20066.html

> On Thu, Jan 11, 2018 at 03:51:06PM +0800, Foy wrote:
>> BUG:
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 4a181fc..db919a9 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -1430,12 +1430,15 @@ static int __blkdev_get(struct block_device *bdev, 
>> fmode_t mode, int for_part)
>>   restart:
>>  
>> ret = -ENXIO;
>> +   //2. Process C: loop_control_ioctl ==> LOOP_CTL_REMOVE ==> idr_remove
>> +   //3. Process B: get_gendisk ==> get_gendisk ==> kobj_lookup ==> loop_probe ==> loop_add, get a new disk(2)
>> disk = get_gendisk(bdev->bd_dev, &partno);
>> if (!disk)
>> goto out;
>> owner = disk->fops->owner;
>>  
>> disk_block_events(disk);
>> +   //1. Process A get the disk(1), before the mutex_lock_nested. And then be scheduled
>> mutex_lock_nested(&bdev->bd_mutex, for_part);
>> if (!bdev->bd_openers) {
>> bdev->bd_disk = disk;
>> @@ -1524,14 +1527,17 @@ static int __blkdev_get(struct block_device *bdev, 
>> fmode_t mode, int for_part)
>> if (ret)
>> goto out_unlock_bdev;
>> }
>> +   //5. Process A disk(1) will be free, because disk(1)'s refs == 1
>> /* only one opener holds refs to the module and disk */
>> put_disk(disk);
>> module_put(owner);
>> }
>> +   //4. Process B: bdev->bd_openers != 0
>> bdev->bd_openers++;
>> if (for_part)
>> bdev->bd_part_count++;
>> mutex_unlock(&bdev->bd_mutex);
>> +   //6. Process A the disk(1) will be use
>> disk_unblock_events(disk);
>> return 0;
>>
>>
>>
>>
>> Patch:
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 4a181fc..1f5c7bf 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -1526,13 +1526,15 @@ static int __blkdev_get(struct block_device *bdev, 
>> fmode_t mode, int for_part)
>>  }
>>  /* only one opener holds refs to the module and disk */
>>  put_disk(disk);
>> +disk = NULL;
>>  module_put(owner);
>>  }
>>  bdev->bd_openers++;
>>  if (for_part)
>>  bdev->bd_part_count++;
>>  mutex_unlock(&bdev->bd_mutex);
>> -disk_unblock_events(disk);
>> +if (disk)
>> +disk_unblock_events(disk);
>>  return 0;
>>  
>>   out_clear:
>>
>>
>>
>>
>> Crash:
>> ==
>> BUG: KASAN: use-after-free in disk_unblock_events+0x4b/0x50 
>> block/genhd.c:1657
>> Read of size 8 at addr 880035c273f8 by task syz-executor6/21165
>>
>>
>> CPU: 0 PID: 21165 Comm: syz-executor6 Not tainted 4.15.0-rc6 #18
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
>> Ubuntu-1.8.2-1ubuntu1 04/01/2014
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:17 [inline]
>>  dump_stack+0x104/0x1c5 lib/dump_stack.c:53
>>  print_address_description+0x6e/0x280 mm/kasan/report.c:252
>>  kasan_report_error mm/kasan/report.c:351 [inline]
>>  kasan_report+0x254/0x340 mm/kasan/report.c:409
>>  disk_unblock_events+0x4b/0x50 block/genhd.c:1657
>>  __blkdev_get+0x5f5/0xdb0 fs/block_dev.c:1535
>>  blkdev_get+0x338/0x9e0 fs/block_dev.c:1591
>>  blkdev_open+0x1bd/0x240 fs/block_dev.c:1749
>>  do_dentry_open+0x682/0xd80 fs/open.c:752
>>  vfs_open+0x107/0x220 fs/open.c:866
>>  do_last 

[PATCH] blktrace: support enabling cgroup info per-device

2018-01-10 Thread Hou Tao
Now blktrace supports outputting cgroup info for trace action and
trace message, however, it can only be enabled globally by writing
"blk_cgroup" to trace_options file, and there is no per-device API
for the new functionality.

Adding a new field (enable_cg_info) by using the pad after act_mask
in struct blk_user_trace_setup and a new attr file (cgroup_info) under
/sys/block/$dev/trace dir, so BLKTRACESETUP ioctl and sysfs file
can be used to enable cgroup info for selected block devices.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 include/linux/blktrace_api.h  |  2 ++
 include/uapi/linux/blktrace_api.h |  1 +
 kernel/trace/blktrace.c   | 14 ++++++++++++--
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 8804753..f120c6a 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -18,6 +18,7 @@ struct blk_trace {
unsigned long __percpu *sequence;
unsigned char __percpu *msg_data;
u16 act_mask;
+   bool enable_cg_info;
u64 start_lba;
u64 end_lba;
u32 pid;
@@ -102,6 +103,7 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 struct compat_blk_user_trace_setup {
char name[BLKTRACE_BDEV_SIZE];
u16 act_mask;
+   u8  enable_cg_info;
u32 buf_size;
u32 buf_nr;
compat_u64 start_lba;
diff --git a/include/uapi/linux/blktrace_api.h 
b/include/uapi/linux/blktrace_api.h
index 20d1490d..d9d9fca 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -136,6 +136,7 @@ enum {
 struct blk_user_trace_setup {
char name[BLKTRACE_BDEV_SIZE];  /* output */
__u16 act_mask; /* input */
+   __u8  enable_cg_info;   /* input */
__u32 buf_size; /* input */
__u32 buf_nr;   /* input */
__u64 start_lba;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 987d9a9a..f420400 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -180,7 +180,8 @@ void __trace_note_message(struct blk_trace *bt, struct 
blkcg *blkcg,
n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
va_end(args);
 
-   if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+   if (!((blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP) ||
+ bt->enable_cg_info))
blkcg = NULL;
 #ifdef CONFIG_BLK_CGROUP
trace_note(bt, 0, BLK_TN_MESSAGE, buf, n,
@@ -549,6 +550,7 @@ static int do_blk_trace_setup(struct request_queue *q, char 
*name, dev_t dev,
bt->act_mask = buts->act_mask;
if (!bt->act_mask)
bt->act_mask = (u16) -1;
+   bt->enable_cg_info = buts->enable_cg_info;
 
blk_trace_setup_lba(bt, bdev);
 
@@ -625,6 +627,7 @@ static int compat_blk_trace_setup(struct request_queue *q, 
char *name,
 
buts = (struct blk_user_trace_setup) {
.act_mask = cbuts.act_mask,
+   .enable_cg_info = cbuts.enable_cg_info,
.buf_size = cbuts.buf_size,
.buf_nr = cbuts.buf_nr,
.start_lba = cbuts.start_lba,
@@ -773,7 +776,8 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio 
*bio)
 {
struct blk_trace *bt = q->blk_trace;
 
-   if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
+   if (!(bt && (bt->enable_cg_info ||
+(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))))
return NULL;
 
if (!bio->bi_css)
@@ -1664,6 +1668,7 @@ static BLK_TRACE_DEVICE_ATTR(act_mask);
 static BLK_TRACE_DEVICE_ATTR(pid);
 static BLK_TRACE_DEVICE_ATTR(start_lba);
 static BLK_TRACE_DEVICE_ATTR(end_lba);
+static BLK_TRACE_DEVICE_ATTR(cgroup_info);
 
 static struct attribute *blk_trace_attrs[] = {
&dev_attr_enable.attr,
@@ -1671,6 +1676,7 @@ static struct attribute *blk_trace_attrs[] = {
&dev_attr_pid.attr,
&dev_attr_start_lba.attr,
&dev_attr_end_lba.attr,
+   &dev_attr_cgroup_info.attr,
NULL
 };
 
@@ -1794,6 +1800,8 @@ static ssize_t sysfs_blk_trace_attr_show(struct device 
*dev,
ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
else if (attr == &dev_attr_end_lba)
ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+   else if (attr == &dev_attr_cgroup_info)
+   ret = sprintf(buf, "%u\n", q->blk_trace->enable_cg_info);
 
 out_unlock_bdev:
mutex_unlock(&q->blk_trace_mutex);
@@ -1861,6 +1869,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device 
*dev,
q->blk_trace->start_lba = value;
else if (attr == &dev_attr_end_lba)
q->blk_trace->end_lba = value;
+   else if (attr == &dev_attr_cgroup_info)
+   q->blk_trace->enable_cg_info = !!value;
}
 
 out_unlock_bdev:
-- 
2.9.5
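
For reference, a minimal usage sketch of the per-device interface added by the
patch above, assuming the patch is applied and using "sda" as a stand-in for
the traced block device (the cgroup_info attribute and the enable_cg_info
field come from the patch itself):

# enable cgroup info in blktrace output for /dev/sda only,
# without touching the global blk_cgroup trace option
echo 1 > /sys/block/sda/trace/cgroup_info
# check the current per-device setting
cat /sys/block/sda/trace/cgroup_info
# disable it again for this device
echo 0 > /sys/block/sda/trace/cgroup_info

Programs using the BLKTRACESETUP ioctl would instead set the new enable_cg_info
field of struct blk_user_trace_setup before issuing the ioctl.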



[PATCH] blktrace: output io cgroup name for cgroup v1

2017-12-27 Thread Hou Tao
Now the output of io cgroup name in blktrace is controlled by
blk_cgroup & blk_cgname options in trace_options files. When
using cgroup v1 for io controller, there is no output of cgroup
name in trace file, because cgroup_path_from_kernfs_id() uses
cgrp_dfl_root.kf_root to find the cgroup file and cgrp_dfl_root
is only valid for cgroup v2.

So fix cgroup_path_from_kernfs_id() to support both cgroup v1 and v2.

Fixes: 69fd5c3 ("blktrace: add an option to allow displaying cgroup path")
Signed-off-by: Hou Tao <hout...@huawei.com>
---
 include/linux/cgroup.h  |  6 +++---
 kernel/cgroup/cgroup.c  | 39 ---
 kernel/trace/blktrace.c |  4 ++--
 3 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0..ed80490 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -651,7 +651,7 @@ static inline union kernfs_node_id 
*cgroup_get_kernfs_id(struct cgroup *cgrp)
return &cgrp->kn->id;
 }
 
-void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+void cgroup_path_from_kernfs_id(int ssid, const union kernfs_node_id *id,
char *buf, size_t buflen);
 #else /* !CONFIG_CGROUPS */
 
@@ -686,8 +686,8 @@ static inline bool task_under_cgroup_hierarchy(struct 
task_struct *task,
return true;
 }
 
-static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
-   char *buf, size_t buflen) {}
+static inline void cgroup_path_from_kernfs_id(int ssid,
+   const union kernfs_node_id *id, char *buf, size_t buflen) {}
 #endif /* !CONFIG_CGROUPS */
 
 /*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe1..49d63c6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5354,16 +5354,49 @@ static int __init cgroup_wq_init(void)
 }
 core_initcall(cgroup_wq_init);
 
-void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
+void cgroup_path_from_kernfs_id(int ssid, const union kernfs_node_id *id,
char *buf, size_t buflen)
 {
+   struct kernfs_root *root;
struct kernfs_node *kn;
+   struct cgroup *root_cgrp = NULL;
 
-   kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
-   if (!kn)
+   if (ssid >= CGROUP_SUBSYS_COUNT)
return;
+
+   if (likely(static_key_enabled(cgroup_subsys_on_dfl_key[ssid]))) {
+   root = cgrp_dfl_root.kf_root;
+   } else {
+   struct cgroup_subsys *subsys = cgroup_subsys[ssid];
+
+   /*
+* It seems we can not use rcu_read_lock() to protect
+* the liveness check of subsys->root->cgrp. Although
+* root->cgrp is freed by RCU, when we dereference the
+* old root, the old root may be being destroyed by
+* cgroup_destroy_root().
+*/
+   mutex_lock(&cgroup_mutex);
+   if (percpu_ref_tryget_live(&subsys->root->cgrp.self.refcnt)) {
+   root_cgrp = &subsys->root->cgrp;
+   root = subsys->root->kf_root;
+   }
+   mutex_unlock(&cgroup_mutex);
+
+   if (!root_cgrp)
+   goto out;
+   }
+
+   kn = kernfs_get_node_by_id(root, id);
+   if (!kn)
+   goto out;
+
kernfs_path(kn, buf, buflen);
kernfs_put(kn);
+
+out:
+   if (root_cgrp)
+   cgroup_put(root_cgrp);
 }
 
 /*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 987d9a9a..79890e0 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1279,8 +1279,8 @@ static void blk_log_action(struct trace_iterator *iter, 
const char *act,
if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) {
char blkcg_name_buf[NAME_MAX + 1] = "<...>";
 
-   cgroup_path_from_kernfs_id(id, blkcg_name_buf,
-   sizeof(blkcg_name_buf));
+   cgroup_path_from_kernfs_id(io_cgrp_id, id,
+   blkcg_name_buf, sizeof(blkcg_name_buf));
trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ",
 MAJOR(t->device), MINOR(t->device),
 blkcg_name_buf, act, rwbs);
-- 
2.9.5
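
For context, a hedged sketch of how the cgroup output that this patch fixes is
enabled from userspace, assuming debugfs is mounted at /sys/kernel/debug and
"sda" stands in for the traced device:

# select the blk tracer, then turn on its cgroup-related options
echo blk > /sys/kernel/debug/tracing/current_tracer
echo blk_cgroup > /sys/kernel/debug/tracing/trace_options
echo blk_cgname > /sys/kernel/debug/tracing/trace_options
# enable tracing for the device and read the trace
echo 1 > /sys/block/sda/trace/enable
cat /sys/kernel/debug/tracing/trace_pipe

Without this patch, on a cgroup v1 mount of the io controller the printed
cgroup path stays at the "<...>" placeholder, because
cgroup_path_from_kernfs_id() only looked up cgrp_dfl_root.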



Re: [PATCH 02/11] block: Fix race of bdev open with gendisk shutdown

2017-11-23 Thread Hou Tao
Hi Jan,

On 2017/11/21 0:43, Jan Kara wrote:
> Hi Tao!
> 
> On Fri 17-11-17 14:51:18, Hou Tao wrote:
>> On 2017/3/13 23:14, Jan Kara wrote:
>>> blkdev_open() may race with gendisk shutdown in two different ways.
>>> Either del_gendisk() has already unhashed block device inode (and thus
>>> bd_acquire() will end up creating new block device inode) however
>>> gen_gendisk() will still return the gendisk that is being destroyed.
>>> Or bdev returned by bd_acquire() will get unhashed and gendisk destroyed
>>> before we get to get_gendisk() and get_gendisk() will return new gendisk
>>> that got allocated for device that reused the device number.
>>>
>>> In both cases this will result in possible inconsistencies between
>>> bdev->bd_disk and bdev->bd_bdi (in the first case after gendisk gets
>>> destroyed and device number reused, in the second case immediately).
>>
>>> Fix the problem by checking whether the gendisk is still alive and inode
>>> hashed when associating bdev inode with it and its bdi. That way we are
>>> sure that we will not associate bdev inode with disk that got past
>>> blk_unregister_region() in del_gendisk() (and thus device number can get
>>> reused). Similarly, we will not associate bdev that was once associated
>>> with gendisk that is going away (and thus the corresponding bdev inode
>>> will get unhashed in del_gendisk()) with a new gendisk that is just
>>> reusing the device number.
>>
>> I have run some extended tests based on Omar Sandoval's script [1] these 
>> days,
>> and found two new problems related with the race between bdev open and 
>> gendisk
>> shutdown.
>>
>> The first problem lies in __blkdev_get(), and will cause use-after-free, or
>> worse oops. It happens because there may be two instances of gendisk related
>> with one bdev. When the process which owns the newer gendisk completes the
>> invocation of __blkdev_get() firstly, the other process which owns the older
>> gendisk will put the last reference of the older gendisk and cause 
>> use-after-free.
>>
>> The following call sequences illustrate the problem:
>>
>> unhash the bdev_inode of /dev/sda
>>
>> process A (blkdev_open()):
>>  bd_acquire() returns new bdev_inode of /dev/sda
>>  get_gendisk() returns gendisk (v1 gendisk)
>>
>> remove gendisk from bdev_map
>> device number is reused
> 
> ^^ this is through blk_unregister_region() in del_gendisk(), isn't it?

Yes, both the unhash of inode and the removal of gendisk from bdev_map
happen in del_gendisk().

>> process B (blkdev_open()):
>>  bd_acquire() returns new bdev_inode of /dev/sda
>>  get_gendisk() returns a new gendisk (v2 gendisk)
>>  increase bdev->bd_openers
> 
> So this needs to be a bit more complex - bd_acquire() must happen before
> bdev_unhash_inode() in del_gendisk() above happened. And get_gendisk() must
> happen after blk_unregister_region() from del_gendisk() happened. But yes,
> this seems possible.
> 
>> process A (blkdev_open()):
>>  find bdev->bd_openers != 0
>>  put_disk(disk) // this is the last reference to v1 gendisk
>>  disk_unblock_events(disk) // use-after-free occurs
>>
>> The problem can be fixed by an extra check showed in the following snippet:
>>
>> static int __blkdev_get(struct block_device *bdev, fmode_t mode, int 
>> for_part)
>> {
>>  if (!bdev->bd_openers) {
>>  } else {
>>  if (bdev->bd_disk != disk) {
>>  ret = -ENXIO;
>>  goto out_unlock_bdev;
>>  }
>>  }
>> }
>>
>> And we also can simplify the check in your patch by moving
>> blk_unregister_region() before bdev_unhash_inode(), so whenever we get a
>> new bdev_inode, the gendisk returned by get_gendisk() will either be NULL
>> or a new gendisk.
> 
> I don't think we can do that. If we call blk_unregister_region() before
> bdev_unhash_inode(), the device number can get reused while bdev inode is
> still visible. So you can get that bdev inode v1 associated with gendisk
> v2. And open following unhashing of bdev inode v1 will get bdev inode v2
> associated with gendisk v2. So you have two different bdev inodes
> associated with the same gendisk and that will cause data integrity issues.

Even if we do not move blk_unregister_region(), the data integrity issue still
exists: bd_acquire() is invoked before bdev_unhash_inode() and get_gendisk()
is invoked after blk_unregister_region(). The move is used to simplify 

Re: [PATCH 02/11] block: Fix race of bdev open with gendisk shutdown

2017-11-16 Thread Hou Tao
Hi Jan,

On 2017/3/13 23:14, Jan Kara wrote:
> blkdev_open() may race with gendisk shutdown in two different ways.
> Either del_gendisk() has already unhashed block device inode (and thus
> bd_acquire() will end up creating new block device inode) however
> gen_gendisk() will still return the gendisk that is being destroyed.
> Or bdev returned by bd_acquire() will get unhashed and gendisk destroyed
> before we get to get_gendisk() and get_gendisk() will return new gendisk
> that got allocated for device that reused the device number.
> 
> In both cases this will result in possible inconsistencies between
> bdev->bd_disk and bdev->bd_bdi (in the first case after gendisk gets
> destroyed and device number reused, in the second case immediately).

> Fix the problem by checking whether the gendisk is still alive and inode
> hashed when associating bdev inode with it and its bdi. That way we are
> sure that we will not associate bdev inode with disk that got past
> blk_unregister_region() in del_gendisk() (and thus device number can get
> reused). Similarly, we will not associate bdev that was once associated
> with gendisk that is going away (and thus the corresponding bdev inode
> will get unhashed in del_gendisk()) with a new gendisk that is just
> reusing the device number.

I have run some extended tests based on Omar Sandoval's script [1] these days,
and found two new problems related to the race between bdev open and gendisk
shutdown.

The first problem lies in __blkdev_get(), and will cause a use-after-free or,
worse, an oops. It happens because there may be two instances of gendisk related
to one bdev. When the process which owns the newer gendisk completes the
invocation of __blkdev_get() first, the other process which owns the older
gendisk will put the last reference of the older gendisk and cause the
use-after-free.

The following call sequences illustrate the problem:

unhash the bdev_inode of /dev/sda

process A (blkdev_open()):
bd_acquire() returns new bdev_inode of /dev/sda
get_gendisk() returns gendisk (v1 gendisk)

remove gendisk from bdev_map
device number is reused

process B (blkdev_open()):
bd_acquire() returns new bdev_inode of /dev/sda
get_gendisk() returns a new gendisk (v2 gendisk)
increase bdev->bd_openers

process A (blkdev_open()):
find bdev->bd_openers != 0
put_disk(disk) // this is the last reference to v1 gendisk
disk_unblock_events(disk) // use-after-free occurs

The problem can be fixed by an extra check shown in the following snippet:

static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
        if (!bdev->bd_openers) {
        } else {
                if (bdev->bd_disk != disk) {
                        ret = -ENXIO;
                        goto out_unlock_bdev;
                }
        }
}

And we also can simplify the check in your patch by moving blk_unregister_region()
before bdev_unhash_inode(), so whenever we get a new bdev_inode, the gendisk
returned by get_gendisk() will either be NULL or a new gendisk. And the only check
we need to do in __blkdev_get() is checking the hash status of the bdev_inode
after we get a gendisk from get_gendisk(). The check can be done like the
following snippet:

static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
{
        /* .. */
        ret = -ENXIO;
        disk = get_gendisk(bdev->bd_dev, &partno);
        if (!disk)
                goto out;
        owner = disk->fops->owner;

        if (inode_unhashed(bdev->bd_inode))
                goto out_unmatched;
}

The second problem lies in bd_start_claiming(), and will trigger the BUG_ON
assert in blkdev_get(): BUG_ON(!bd_may_claim(bdev, whole, holder)). It occurs
when testing the exclusive open and close of the disk and its partitions.

The cause of the problem is that a bdev_inode of a disk partition corresponds
with two instances of bdev_inode of the whole disk simultaneously. When one pair
of the partition inode and disk inode completes the claiming, the other pair
will trip over the BUG_ON assert in blkdev_get() because bdev->bd_holder is no
longer NULL.

The following sequences illustrate the problem:

unhash the bdev_inode of /dev/sda2

process A (blkdev_open()):
bd_acquire() returns a new bdev_inode for /dev/sda2
bd_start_claiming() returns bdev_inode of /dev/sda

process B (blkdev_open()):
bd_acquire() returns the new bdev_inode for /dev/sda2

unhash the bdev_inode of /dev/sda
remove gendisk from bdev_map
device number is reused

process B (blkdev_open()):
bd_start_claiming() returns a new bdev_inode for /dev/sda

process A (blkdev_open()):
__blkdev_get() returns successfully
finish claiming  // bdev->bd_holder = holder

process B (blkdev_open()):
__blkdev_get() returns successfully
trigger BUG_ON(!bd_may_claim(bdev, whole, holder))

And the problem can be fixed by adding more checks in 

Re: [PATCH] block, bfq: dispatch request to prevent queue stalling after the request completion

2017-07-12 Thread Hou Tao


On 2017/7/12 17:41, Paolo Valente wrote:
> 
>> Il giorno 11 lug 2017, alle ore 15:58, Hou Tao <hout...@huawei.com> ha 
>> scritto:
>>
>> There are mq devices (eg., virtio-blk, nbd and loopback) which don't
>> invoke blk_mq_run_hw_queues() after the completion of a request.
>> If bfq is enabled on these devices and the slice_idle attribute or
>> strict_guarantees attribute is set as zero,
> 
> I guess you meant slice_idle > 0 or strict_guarantees equal to 1 here.
Sorry for the typo, I mean slice_idle = 0 or strict_guarantees = 1.
Setting slice_idle as 0 alone could also trigger the latency problem.

>> it is possible that
>> after a request completion the remaining requests of busy bfq queue
>> will stalled in the bfq schedule until a new request arrives.
>>
>> To fix the scheduler latency problem, we need to check whether or not
>> all issued requests have completed and dispatch more requests to driver
>> if there is no request in driver.
>>
>> Signed-off-by: Hou Tao <hout...@huawei.com>
>>
> 
> Thanks for this fix.  My only (possible) concern is whether there
> would be some more coherent and possibly more efficient solution to
> this problem, outside BFQ.  For example, would it make sense to call
> blk_mq_run_hw_queues(), after a request completion, from the offended
> devices too?  This would make the behavior of these devices coherent
> with that of the other devices.  Unfortunately I have no sound answer
> to such a question.  Apart from this concern:
> 
> Reviewed-by: Paolo Valente <paolo.vale...@linaro.org>
Thanks for your review.

The inconsistencies between the different mq drivers also make me confused.
I don't know the reason why scsi-mq dispatches the remaining requests
after each request completion and virtio-blk does not.

I'm not sure whether or not fixing this in MQ is a better idea, but I have a
proposal. If the BLK_MQ_S_SCHED_RESTART flag has been set, MQ would invoke
blk_mq_run_hw_queues() after the request completion. The BLK_MQ_S_SCHED_RESTART
flag will be set by blk_mq_sched_dispatch_requests() when it finds out that
there are residual requests in its dispatch list. We can let .dispatch_request()
return a flag to set the BLK_MQ_S_SCHED_RESTART flag if the underlying scheduler
needs it, so after the request completion, the MQ core will pull the remaining
requests from BFQ.

Regards,
Tao

> 
>> The problem can be reproduced by running the following script
>> on a virtio-blk device with nr_hw_queues as 1:
>>
>> #!/bin/sh
>>
>> dev=vdb
>> # mount point for dev
>> mp=/tmp/mnt
>> cd $mp
>>
>> job=strict.job
>> cat > $job <<EOF
>> [global]
>> direct=1
>> bs=4k
>> size=256M
>> rw=write
>> ioengine=libaio
>> iodepth=128
>> runtime=5
>> time_based
>>
>> [1]
>> filename=1.data
>>
>> [2]
>> new_group
>> filename=2.data
>> EOF
>>
>> echo bfq > /sys/block/$dev/queue/scheduler
>> echo 1 > /sys/block/$dev/queue/iosched/strict_guarantees
>> fio $job
>> ---
>> block/bfq-iosched.c | 3 +++
>> 1 file changed, 3 insertions(+)
>>
>> diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
>> index 12bbc6b..94cd682 100644
>> --- a/block/bfq-iosched.c
>> +++ b/block/bfq-iosched.c
>> @@ -4293,6 +4293,9 @@ static void bfq_completed_request(struct bfq_queue 
>> *bfqq, struct bfq_data *bfqd)
>>  bfq_bfqq_expire(bfqd, bfqq, false,
>>  BFQQE_NO_MORE_REQUESTS);
>>  }
>> +
>> +if (!bfqd->rq_in_driver)
>> +bfq_schedule_dispatch(bfqd);
>> }
>>
>> static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
>> -- 
>> 2.5.0
>>
> 
> 
> .
> 



[PATCH] block, bfq: fix typos in comments about B-WF2Q+ algorithm

2017-07-12 Thread Hou Tao
The start time of an eligible entity should be less than or equal to
the current virtual time, and an entity in the idle tree has a finish
time greater than the current virtual time.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 block/bfq-iosched.h | 2 +-
 block/bfq-wf2q.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 5c3bf98..32feddab 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -52,7 +52,7 @@ struct bfq_entity;
 struct bfq_service_tree {
/* tree for active entities (i.e., those backlogged) */
struct rb_root active;
-   /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
+   /* tree for idle entities (i.e., not backlogged, with V < F_i)*/
struct rb_root idle;
 
/* idle entity with minimum F_i */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 8726ede..d03d7b6 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1268,7 +1268,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st, 
u64 new_value)
  *
  * This function searches the first schedulable entity, starting from the
  * root of the tree and going on the left every time on this side there is
- * a subtree with at least one eligible (start >= vtime) entity. The path on
+ * a subtree with at least one eligible (start <= vtime) entity. The path on
  * the right is followed only if a) the left subtree contains no eligible
  * entities and b) no eligible entity has been found yet.
  */
-- 
2.5.0



[PATCH] block, bfq: dispatch request to prevent queue stalling after the request completion

2017-07-11 Thread Hou Tao
There are mq devices (eg., virtio-blk, nbd and loopback) which don't
invoke blk_mq_run_hw_queues() after the completion of a request.
If bfq is enabled on these devices and the slice_idle attribute or
strict_guarantees attribute is set as zero, it is possible that
after a request completion the remaining requests of a busy bfq queue
will stall in the bfq scheduler until a new request arrives.

To fix the scheduler latency problem, we need to check whether or not
all issued requests have completed and dispatch more requests to driver
if there is no request in driver.

Signed-off-by: Hou Tao <hout...@huawei.com>

The problem can be reproduced by running the following script
on a virtio-blk device with nr_hw_queues as 1:

#!/bin/sh

dev=vdb
# mount point for dev
mp=/tmp/mnt
cd $mp

job=strict.job
cat > $job <<EOF
[global]
direct=1
bs=4k
size=256M
rw=write
ioengine=libaio
iodepth=128
runtime=5
time_based

[1]
filename=1.data

[2]
new_group
filename=2.data
EOF

echo bfq > /sys/block/$dev/queue/scheduler
echo 1 > /sys/block/$dev/queue/iosched/strict_guarantees
fio $job
---
 block/bfq-iosched.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 12bbc6b..94cd682 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4293,6 +4293,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, 
struct bfq_data *bfqd)
bfq_bfqq_expire(bfqd, bfqq, false,
BFQQE_NO_MORE_REQUESTS);
}
+
+   if (!bfqd->rq_in_driver)
+   bfq_schedule_dispatch(bfqd);
 }
 
 static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
-- 
2.5.0



Re: [PATCH v2] cfq-iosched: fix the delay of cfq_group's vdisktime under iops mode

2017-05-30 Thread Hou Tao
Hi Jens,

I didn't find the patch in your linux-block git tree or in the vanilla git tree.
Maybe you have forgotten this CFQ fix?

Regards,
Tao

On 2017/3/9 19:22, Hou Tao wrote:
> On 2017/3/8 22:05, Jan Kara wrote:
>> On Wed 08-03-17 20:16:55, Hou Tao wrote:
>>> When adding a cfq_group into the cfq service tree, we use CFQ_IDLE_DELAY
>>> as the delay of cfq_group's vdisktime if there have been other cfq_groups
>>> already.
>>>
>>> When cfq is under iops mode, commit 9a7f38c42c2b ("cfq-iosched: Convert
>>> from jiffies to nanoseconds") could result in a large iops delay and
>>> lead to an abnormal io schedule delay for the added cfq_group. To fix
>>> it, we just need to revert to the old CFQ_IDLE_DELAY value: HZ / 5
>>> when iops mode is enabled.
>>>
>>> Despite having the same value, the delay of a cfq_queue in idle class
>>> and the delay of cfq_queue are different things, so I define two new
>>> macros for the delay of a cfq_group under time-slice mode and IOPs mode.
>>>
>>> Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
>>> Cc: <sta...@vger.kernel.org> # 4.8+
>>> Signed-off-by: Hou Tao <hout...@huawei.com>
>>
>> OK, the number 200 is somewhat arbitrary but so is HZ/5 so I guess we are
>> OK. You can add:
>>
>> Acked-by: Jan Kara <j...@suse.cz>
> Oops, sorry for the arbitrary 200. My intention was to use HZ / 5 instead of 200
> to keep consistent with the old CFQ_IDLE_DELAY. And I spot two typos in the commit
> message: "the delay of cfq_queue" -> "the delay of cfq_group" and "IOPs" -> "iops".
> 
> Jan, could you please fix them? And I also attach a revised patch to fix them.
> 
> 
> From: Hou Tao <hout...@huawei.com>
> Date: Wed, 1 Mar 2017 09:02:33 +0800
> Subject: [PATCH] cfq-iosched: fix the delay of cfq_group's vdisktime under
>  iops mode
> 
> When adding a cfq_group into the cfq service tree, we use CFQ_IDLE_DELAY
> as the delay of cfq_group's vdisktime if there have been other cfq_groups
> already.
> 
> When cfq is under iops mode, commit 9a7f38c42c2b ("cfq-iosched: Convert
> from jiffies to nanoseconds") could result in a large iops delay and
> lead to an abnormal io schedule delay for the added cfq_group. To fix
> it, we just need to revert to the old CFQ_IDLE_DELAY value: HZ / 5
> when iops mode is enabled.
> 
> Despite having the same value, the delay of a cfq_queue in idle class
> and the delay of cfq_group are different things, so I define two new
> macros for the delay of a cfq_group under time-slice mode and iops mode.
> 
> Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
> Cc: <sta...@vger.kernel.org> # 4.8+
> Signed-off-by: Hou Tao <hout...@huawei.com>
> Acked-by: Jan Kara <j...@suse.cz>
> ---
>  block/cfq-iosched.c | 17 +++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index 440b95e..2762505 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -38,9 +38,13 @@ static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 
> 3/10; /* 300 ms */
>  static const int cfq_hist_divisor = 4;
> 
>  /*
> - * offset from end of service tree
> + * offset from end of queue service tree for idle class
>   */
>  #define CFQ_IDLE_DELAY   (NSEC_PER_SEC / 5)
> +/* offset from end of group service tree under time slice mode */
> +#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
> +/* offset from end of group service under IOPS mode */
> +#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
> 
>  /*
>   * below this threshold, we consider thinktime immediate
> @@ -1362,6 +1366,14 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, 
> struct cfq_group *cfqg)
>   cfqg->vfraction = max_t(unsigned, vfr, 1);
>  }
> 
> +static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
> +{
> + if (!iops_mode(cfqd))
> + return CFQ_SLICE_MODE_GROUP_DELAY;
> + else
> + return CFQ_IOPS_MODE_GROUP_DELAY;
> +}
> +
>  static void
>  cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>  {
> @@ -1381,7 +1393,8 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, 
> struct cfq_group *cfqg)
>   n = rb_last(&st->rb);
>   if (n) {
>   __cfqg = rb_entry_cfqg(n);
> - cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
> + cfqg->vdisktime = __cfqg->vdisktime +
> + cfq_get_cfqg_vdisktime_delay(cfqd);
>   } else
>   cfqg->vdisktime = st->min_vdisktime;
>   cfq_group_service_tree_add(st, cfqg);
> 



BFQ: the purpose of idle rb-tree in bfq_service_tree

2017-05-25 Thread Hou Tao
Hi Paolo,

I am reading the code of BFQ scheduler and having a question about the purpose
of idle rb-tree in bfq_service_tree.

From the comment in the code, the idle rb-tree is used to keep the bfq_queue which
doesn't have any request and has a finish time greater than the vtime of the
service tree.

The only function that I can find which reveals the purpose of the idle rb-tree is
__bfq_activate_entity(). In this function, when the activated queue had been on
the idle tree, the start time of the queue will be reset to the greater value
between min_vstart and the finish time. It seems to me that the idle rb-tree is
used to delay the schedule of the queue which issues requests periodically, but I
don't know why the delay is needed. Could you please explain the purpose of the
idle rb-tree in bfq_service_tree and its main scenario?

Thanks,
Tao



[PATCH v2] cfq-iosched: fix the delay of cfq_group's vdisktime under iops mode

2017-03-08 Thread Hou Tao
When adding a cfq_group into the cfq service tree, we use CFQ_IDLE_DELAY
as the delay of cfq_group's vdisktime if there have been other cfq_groups
already.

When cfq is under iops mode, commit 9a7f38c42c2b ("cfq-iosched: Convert
from jiffies to nanoseconds") could result in a large iops delay and
lead to an abnormal io schedule delay for the added cfq_group. To fix
it, we just need to revert to the old CFQ_IDLE_DELAY value: HZ / 5
when iops mode is enabled.

Despite having the same value, the delay of a cfq_queue in idle class
and the delay of a cfq_group are different things, so I define two new
macros for the delay of a cfq_group under time-slice mode and iops mode.

Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
Cc: <sta...@vger.kernel.org> # 4.8+
Signed-off-by: Hou Tao <hout...@huawei.com>
---
 block/cfq-iosched.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

v2:
- use constant instead of "nsecs_to_jiffies64(CFQ_IDLE_DELAY)"
  as suggested by Jan Kara

v1:
https://www.spinics.net/lists/stable/msg160580.html

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 440b95e..69754e8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -38,9 +38,13 @@ static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 
3/10; /* 300 ms */
 static const int cfq_hist_divisor = 4;
 
 /*
- * offset from end of service tree
+ * offset from end of queue service tree for idle class
  */
 #define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5)
+/* offset from end of group service tree under time slice mode */
+#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
+/* offset from end of group service under IOPS mode */
+#define CFQ_IOPS_MODE_GROUP_DELAY (200)
 
 /*
  * below this threshold, we consider thinktime immediate
@@ -1362,6 +1366,14 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, 
struct cfq_group *cfqg)
cfqg->vfraction = max_t(unsigned, vfr, 1);
 }
 
+static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
+{
+   if (!iops_mode(cfqd))
+   return CFQ_SLICE_MODE_GROUP_DELAY;
+   else
+   return CFQ_IOPS_MODE_GROUP_DELAY;
+}
+
 static void
 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
@@ -1381,7 +1393,8 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct 
cfq_group *cfqg)
n = rb_last(&st->rb);
if (n) {
__cfqg = rb_entry_cfqg(n);
-   cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+   cfqg->vdisktime = __cfqg->vdisktime +
+   cfq_get_cfqg_vdisktime_delay(cfqd);
} else
cfqg->vdisktime = st->min_vdisktime;
cfq_group_service_tree_add(st, cfqg);
-- 
2.5.0
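
As a rough back-of-the-envelope illustration of the regression fixed above
(assuming HZ=1000 purely for the arithmetic): before commit 9a7f38c42c2b the
offset added to a newly added cfq_group's vdisktime was CFQ_IDLE_DELAY = HZ / 5,
while after the conversion it became NSEC_PER_SEC / 5:

echo $((1000 / 5))          # old jiffies-based offset: 200
echo $((1000000000 / 5))    # nanosecond-based offset:  200000000

Under iops mode vdisktime is not measured in nanoseconds, so the converted
constant pushes a newly added group roughly a million times further behind the
existing groups than before, which shows up as the abnormal io schedule delay
described in the changelog; the patch restores the HZ / 5 offset for iops mode.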



cfq-iosched: two questions about the hrtimer version of CFQ

2017-03-06 Thread Hou Tao
Hi Jan and list,

When testing the hrtimer version of CFQ, we found a performance degradation
problem which seems to be caused by commit 0b31c10 ("cfq-iosched: Charge at
least 1 jiffie instead of 1 ns").

The following is the test process:

* filesystem and block device
* XFS + /dev/sda mounted on /tmp/sda
* CFQ configuration
* default configuration
* run "fio ./cfq.job"
* fio job configuration cfq.job
[global]
bs=4k
ioengine=psync
iodepth=1
direct=1
rw=randwrite
time_based
runtime=15
cgroup_nodelete=1
group_reporting=1

[cfq_a]
filename=/tmp/sda/cfq_a.dat
size=2G
cgroup_weight=500
cgroup=cfq_a
thread=1
numjobs=2

[cfq_b]
new_group
filename=/tmp/sda/cfq_b.dat
size=2G
rate=4m
cgroup_weight=500
cgroup=cfq_b
thread=1
numjobs=2

The following is the test result:
* with 0b31c10:
* fio report
cfq_a: bw=5312.6KB/s, iops=1328
cfq_b: bw=8192.6KB/s, iops=2048

* blkcg debug files
./cfq_a/blkio.group_wait_time:8:0 12062571233
./cfq_b/blkio.group_wait_time:8:0 155841600
./cfq_a/blkio.io_serviced:Total 19922
./cfq_b/blkio.io_serviced:Total 30722
./cfq_a/blkio.time:8:0 19406083246
./cfq_b/blkio.time:8:0 19417146869

* without 0b31c10:
* fio report
cfq_a: bw=21670KB/s, iops=5417
cfq_b: bw=8191.2KB/s, iops=2047

* blkcg debug files
./cfq_a/blkio.group_wait_time:8:0 5798452504
./cfq_b/blkio.group_wait_time:8:0 5131844007
./cfq_a/blkio.io_serviced:8:0 Write 81261
./cfq_b/blkio.io_serviced:8:0 Write 30722
./cfq_a/blkio.time:8:0 5642608173
./cfq_b/blkio.time:8:0 5849949812

We want to know the reason why you reverted the minimal used slice to 1 jiffy
when the slice has not been allocated. Does it lead to some performance
regressions or something similar? If not, I think we could revert the minimal
slice to 1 ns again.

Another problem is about the time comparison in the CFQ code. The no-hrtimer
version of CFQ uses time_after or time_before when possible. Why doesn't the
hrtimer version use the equivalent time_after64/time_before64? Can ktime_get_ns()
ensure there will be no wrapping problem?

Thanks very much.

Regards,

Tao



Re: [PATCH] cfq-iosched: fix the delay of cfq_group's vdisktime under iops mode

2017-03-06 Thread Hou Tao
Hi Vivek,

On 2017/3/4 3:53, Vivek Goyal wrote:
> On Fri, Mar 03, 2017 at 09:20:44PM +0800, Hou Tao wrote:
> 
> [..]
>>> Frankly, vdisktime is in fixed-point precision shifted by
>>> CFQ_SERVICE_SHIFT so using CFQ_IDLE_DELAY does not make much sense in any
>>> case and just adding 1 to maximum vdisktime should be fine in all the
>>> cases. But that would require more testing whether I did not miss anything
>>> subtle.
> 
> I think even 1 will work. But in the beginning IIRC I took the idea
> from cpu scheduler. Adding a value bigger than 1 will allow you to add
> some other group later before this group. (If you want to give that group
> higher priority).
I still don't understand why using a value bigger than 1 will allow a later added
group to have a vdisktime less than the firstly added group's. Could you explain
it in more detail?

Regards,

Tao

> Thanks
> Vivek
> 
> .
> 



Re: [PATCH] cfq-iosched: fix the delay of cfq_group's vdisktime under iops mode

2017-03-03 Thread Hou Tao

On 2017/3/2 18:29, Jan Kara wrote:
> On Wed 01-03-17 10:07:44, Hou Tao wrote:
>> When adding a cfq_group into the cfq service tree, we use CFQ_IDLE_DELAY
>> as the delay of cfq_group's vdisktime if there have been other cfq_groups
>> already.
>>
>> When cfq is under iops mode, commit 9a7f38c42c2b ("cfq-iosched: Convert
>> from jiffies to nanoseconds") could result in a large iops delay and
>> lead to an abnormal io schedule delay for the added cfq_group. To fix
>> it, we just need to revert to the old CFQ_IDLE_DELAY value: HZ / 5
>> when iops mode is enabled.
>>
>> Cc: <sta...@vger.kernel.org> # 4.8+
>> Signed-off-by: Hou Tao <hout...@huawei.com>
> 
> OK, I agree my commit broke the logic in this case. Thanks for the fix.
> Please add also tag:
> 
> Fixes: 9a7f38c42c2b92391d9dabaf9f51df7cfe5608e4
> 
> I somewhat disagree with the fix though. See below:
> 
>> +static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
>> +{
>> +if (!iops_mode(cfqd))
>> +return CFQ_IDLE_DELAY;
>> +else
>> +return nsecs_to_jiffies64(CFQ_IDLE_DELAY);
>> +}
>> +
> 
> So using nsecs_to_jiffies64(CFQ_IDLE_DELAY) when in iops mode just does not
> make any sense. AFAIU the code in cfq_group_notify_queue_add() we just want
> to add the cfqg as the last one in the tree. So returning 1 from
> cfq_get_cfqg_vdisktime_delay() in iops mode should be fine as well.
Yes, nsecs_to_jiffies64(CFQ_IDLE_DELAY) is odd here, the better way is to
define a new macro with a value of 1 or 200 and use it directly, but I still
prefer to use 200 to be consistent with the no-hrtimer configuration.

> Frankly, vdisktime is in fixed-point precision shifted by
> CFQ_SERVICE_SHIFT so using CFQ_IDLE_DELAY does not make much sense in any
> case and just adding 1 to maximum vdisktime should be fine in all the
> cases. But that would require more testing whether I did not miss anything
> subtle.
Although the current implementation has done this, I don't think we should
add the cfq_group as the last one in the service tree. In some test cases,
I found that the delayed vdisktime of cfq_group is smaller than its last
vdisktime when the cfq_group was removed from the service_tree, and I think
it hurts the fairness. Maybe we can learn from CFS and calculate the delay
dynamically, but it would be the topic of another thread.

Regards,

Tao

> 
>>  static void
>>  cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  {
>> @@ -1380,7 +1388,8 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>>  n = rb_last(&st->rb);
>>  if (n) {
>>  __cfqg = rb_entry_cfqg(n);
>> -cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
>> +cfqg->vdisktime = __cfqg->vdisktime +
>> +cfq_get_cfqg_vdisktime_delay(cfqd);
>>  } else
>>  cfqg->vdisktime = st->min_vdisktime;
>>  cfq_group_service_tree_add(st, cfqg);
>> -- 
>> 2.5.0
>>



[PATCH] block: use put_io_context_active() to disassociate bio from a task

2017-03-02 Thread Hou Tao
bio_associate_current() invokes get_io_context_active() to tell the
CFQ scheduler that the current io_context is still issuing IOs by
increasing active_ref. When the bio is done, we also need to invoke
put_io_context_active() to decrease active_ref, otherwise the
associated io_context will stay active forever.
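
The reference pairing, as I understand it (a sketch for review, not taken
verbatim from the sources):

	/*
	 * get_io_context_active(ioc):  refcount++, active_ref++
	 * put_io_context(ioc):         refcount--
	 * put_io_context_active(ioc):  active_ref--, then put_io_context()
	 *
	 * bio_associate_current() takes the "active" reference, so
	 * bio_disassociate_task() has to drop it with
	 * put_io_context_active(); dropping only the plain refcount
	 * leaves active_ref elevated forever.
	 */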

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 block/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e0..d8ed36f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2072,7 +2072,7 @@ EXPORT_SYMBOL_GPL(bio_associate_current);
 void bio_disassociate_task(struct bio *bio)
 {
if (bio->bi_ioc) {
-   put_io_context(bio->bi_ioc);
+   put_io_context_active(bio->bi_ioc);
bio->bi_ioc = NULL;
}
if (bio->bi_css) {
-- 
2.5.0



[PATCH] cfq-iosched: fix the delay of cfq_group's vdisktime under iops mode

2017-02-28 Thread Hou Tao
When adding a cfq_group into the cfq service tree, we use CFQ_IDLE_DELAY
as the delay of cfq_group's vdisktime if there have been other cfq_groups
already.

When cfq is under iops mode, commit 9a7f38c42c2b ("cfq-iosched: Convert
from jiffies to nanoseconds") could result in a large iops delay and
lead to an abnormal io schedule delay for the added cfq_group. To fix
it, we just need to revert to the old CFQ_IDLE_DELAY value: HZ / 5
when iops mode is enabled.
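
To make the magnitude concrete (a rough illustration, assuming HZ=1000 and
the current definition CFQ_IDLE_DELAY = NSEC_PER_SEC / 5):

	/*
	 * before 9a7f38c42c2b: CFQ_IDLE_DELAY = HZ / 5           =       200
	 * after  9a7f38c42c2b: CFQ_IDLE_DELAY = NSEC_PER_SEC / 5 = 200000000
	 *
	 * Under iops mode, vdisktime advances by (weight-scaled) request
	 * counts rather than by nanoseconds, so placing a new group
	 * 2*10^8 ticks behind the last one delays it abnormally long.
	 */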

Cc: <sta...@vger.kernel.org> # 4.8+
Signed-off-by: Hou Tao <hout...@huawei.com>
---
 block/cfq-iosched.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1379447..fdeb70b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1361,6 +1361,14 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
cfqg->vfraction = max_t(unsigned, vfr, 1);
 }
 
+static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
+{
+   if (!iops_mode(cfqd))
+   return CFQ_IDLE_DELAY;
+   else
+   return nsecs_to_jiffies64(CFQ_IDLE_DELAY);
+}
+
 static void
 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
@@ -1380,7 +1388,8 @@ cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
n = rb_last(&st->rb);
if (n) {
__cfqg = rb_entry_cfqg(n);
-   cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
+   cfqg->vdisktime = __cfqg->vdisktime +
+   cfq_get_cfqg_vdisktime_delay(cfqd);
} else
cfqg->vdisktime = st->min_vdisktime;
cfq_group_service_tree_add(st, cfqg);
-- 
2.5.0



rate-capped fio jobs in a CFQ group degrade performance of fio jobs in another CFQ group with the same weight

2017-02-09 Thread Hou Tao
Hi all,

During our tests of the CFQ group scheduler, we found a performance-related
problem. Rate-capped fio jobs in one CFQ group degrade the performance of fio
jobs in another CFQ group, even though both CFQ groups have the same
blkio.weight.

We launch two fio instances in different terminals. The following is the
content of the two job files:

# cat a.job
[global]
bs=4k
ioengine=psync
iodepth=1
direct=1
rw=randwrite
time_based
runtime=30
cgroup_nodelete=1
group_reporting=1

[test1]
filename=test1.dat
size=2G
cgroup_weight=500
cgroup=testA
thread=1
numjobs=2

[test3]
stonewall
filename=test3.dat
size=2G
rate=4m
cgroup_weight=500
cgroup=testA
thread=1
numjobs=2


# cat b.job
[global]
bs=4k
ioengine=psync
iodepth=1
direct=1
rw=randwrite
time_based
runtime=60
cgroup_nodelete=1
group_reporting=1

[test2]
filename=test2.dat
size=2G
cgroup_weight=500
cgroup=testB
thread=1
numjobs=2

In the first 30 seconds, the iops of both "test1" and "test2" is ~5000 and
the rate is about 20MBps.

In the last 30 seconds, we start "test3", whose rate is limited to 8MBps
(4MBps * 2 jobs). The cap works fine, but the iops of "test2" also degrades
from ~5000 to ~2040 or lower. This may seem reasonable for the CFQ group
scheduler, because it needs to dispatch disk time equally to the two CFQ
groups, but it does not fully utilize the disk throughput.

It can be reproduced on 4.4 and 4.10.0-rc7; the fio version is 2.2.4.
The CFQ configuration options are left untouched.

I am not sure whether this situation is a bug or not. Are there any
configuration options we can use to alleviate the performance degradation?
And how does the CFQ group scheduler handle the trade-off between fairness
and performance?

Any suggestion will be appreciated.

Regards,

Tao



[PATCH RESEND] blkcg: fix double free of new_blkg in blkcg_init_queue

2017-02-03 Thread Hou Tao
If blkg_create fails, new_blkg passed as an argument will
be freed by blkg_create, so there is no need to free it again.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 block/blk-cgroup.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8ba0af7..f570f38 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1079,10 +1079,8 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded)
radix_tree_preload_end();
 
-   if (IS_ERR(blkg)) {
-   blkg_free(new_blkg);
+   if (IS_ERR(blkg))
return PTR_ERR(blkg);
-   }
 
q->root_blkg = blkg;
q->root_rl.blkg = blkg;
-- 
2.5.0



[PATCH] blkcg: fix double free of new_blkg in blkcg_init_queue

2017-01-24 Thread Hou Tao
If blkg_create fails, new_blkg passed as an argument will
be freed by blkg_create, so there is no need to free it again.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 block/blk-cgroup.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8ba0af7..58fb0dd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1080,7 +1080,6 @@ int blkcg_init_queue(struct request_queue *q)
radix_tree_preload_end();
 
if (IS_ERR(blkg)) {
-   blkg_free(new_blkg);
return PTR_ERR(blkg);
}
 
-- 
2.5.0



[PATCH RFC 4/4] dm thin: associate bio with current task if keep_bio_blkcg is enabled

2017-01-20 Thread Hou Tao
If keep_bio_blkcg is enabled, assign the io_context and the blkcg of the
current task to the bio before processing it.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 drivers/md/dm-thin.c |  5 +
 drivers/md/dm-thin.h | 17 +
 2 files changed, 22 insertions(+)
 create mode 100644 drivers/md/dm-thin.h

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 140cdae..0efbdbe 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -22,6 +22,8 @@
 #include 
 #include 
 
+#include "dm-thin.h"
+
#define DM_MSG_PREFIX   "thin"
 
 /*
@@ -2629,6 +2631,9 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
struct dm_bio_prison_cell *virt_cell, *data_cell;
struct dm_cell_key key;
 
+   if (keep_pool_bio_blkcg(tc->pool))
+   thin_keep_bio_blkcg(bio);
+
thin_hook_bio(tc, bio);
 
if (tc->requeue_mode) {
diff --git a/drivers/md/dm-thin.h b/drivers/md/dm-thin.h
new file mode 100644
index 000..09e920a
--- /dev/null
+++ b/drivers/md/dm-thin.h
@@ -0,0 +1,17 @@
+#ifndef DM_THIN_H
+#define DM_THIN_H
+
+#include 
+
+#ifdef CONFIG_BLK_CGROUP
+static inline void thin_keep_bio_blkcg(struct bio *bio)
+{
+   if (!bio->bi_css)
+   bio_associate_current(bio);
+}
+#else
+static inline void thin_keep_bio_blkcg(struct bio *bio) {}
+#endif /* CONFIG_BLK_CGROUP */
+
+#endif
+
-- 
2.5.0



[PATCH RFC 0/4] dm thin: support blk-throttle on data and metadata device

2017-01-20 Thread Hou Tao
Hi all,

We need to throttle O_DIRECT IO on the data and metadata devices of a
dm-thin pool and have encountered some problems. If we set the limit on
the root blkcg, the throttling works. If we set the limit on a child
blkcg, the throttling doesn't work well.

The reason why the throttling doesn't work is that dm-thin defers the
processing of a bio when its physical block has not been allocated yet.
The bio is then submitted by the pool worker, so the blkcg of the bio
becomes the blkcg of the pool worker, namely the root blkcg, instead of
the blkcg of the original IO thread. Since we only set a limit on the
blkcg of the original IO thread, blk-throttle doesn't work well.

In order to handle this situation, we add a "keep_bio_blkcg" feature to
dm-thin. If the feature is enabled, the original blkcg of the bio is
saved at thin_map() time and is used later by blk-throttle.
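
For completeness, this is how I would expect the feature to be enabled from
userspace (a sketch based on the usual thin-pool target line; the device
names, sizes and low water mark below are made up):

  # last two fields: <#feature args> <arg>
  dmsetup create pool --table \
      "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 keep_bio_blkcg"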

Tao

Hou Tao (4):
  dm thin: add a pool feature "keep_bio_blkcg"
  dm thin: parse "keep_bio_blkcg" from userspace tools
  dm thin: show the enabled status of keep_bio_blkcg feature
  dm thin: associate bio with current task if keep_bio_blkcg is enabled

 drivers/md/dm-thin.c | 26 --
 drivers/md/dm-thin.h | 17 +
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 drivers/md/dm-thin.h

-- 
2.5.0



[PATCH RFC 3/4] dm thin: show the enabled status of keep_bio_blkcg feature

2017-01-20 Thread Hou Tao
If the keep_bio_blkcg feature is enabled, this can be verified with the
STATUSTYPE_TABLE or STATUSTYPE_INFO command.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 drivers/md/dm-thin.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 57d6202..140cdae 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3760,7 +3760,7 @@ static void emit_flags(struct pool_features *pf, char *result,
 {
unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
-   pf->error_if_no_space;
+   pf->error_if_no_space + pf->keep_bio_blkcg;
DMEMIT("%u ", count);
 
if (!pf->zero_new_blocks)
@@ -3777,6 +3777,9 @@ static void emit_flags(struct pool_features *pf, char *result,
 
if (pf->error_if_no_space)
DMEMIT("error_if_no_space ");
+
+   if (pf->keep_bio_blkcg)
+   DMEMIT("keep_bio_blkcg ");
 }
 
 /*
@@ -3885,6 +3888,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
 
+   if (pool->pf.keep_bio_blkcg)
+   DMEMIT("keep_bio_blkcg ");
+
if (dm_pool_metadata_needs_check(pool->pmd))
DMEMIT("needs_check ");
else
-- 
2.5.0



[PATCH RFC 2/4] dm thin: parse "keep_bio_blkcg" from userspace tools

2017-01-20 Thread Hou Tao
The keep_bio_blkcg feature is off by default; it can be turned on by
passing the "keep_bio_blkcg" pool feature argument.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 drivers/md/dm-thin.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 8178ee8..57d6202 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2824,6 +2824,7 @@ static void pool_features_init(struct pool_features *pf)
pf->discard_enabled = true;
pf->discard_passdown = true;
pf->error_if_no_space = false;
+   pf->keep_bio_blkcg = false;
 }
 
 static void __pool_destroy(struct pool *pool)
@@ -3053,7 +3054,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
const char *arg_name;
 
static struct dm_arg _args[] = {
-   {0, 4, "Invalid number of pool feature arguments"},
+   {0, 5, "Invalid number of pool feature arguments"},
};
 
/*
@@ -3085,6 +3086,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
else if (!strcasecmp(arg_name, "error_if_no_space"))
pf->error_if_no_space = true;
 
+   else if (!strcasecmp(arg_name, "keep_bio_blkcg"))
+   pf->keep_bio_blkcg = true;
+
else {
ti->error = "Unrecognised pool feature requested";
r = -EINVAL;
-- 
2.5.0



[PATCH] blk-throttle: fix infinite throttling caused by non-cascading timer wheel

2016-09-12 Thread Hou Tao
Due to commit 500462a9de65 ("timers: Switch to a non-cascading wheel"),
the slack of a timer increases as its timeout increases:
 So for HZ=250 we end up with the following granularity levels:
  Level  Offset  Granularity  Range
    0       0        4 ms        0 ms -   252 ms
    1      64       32 ms      256 ms -  2044 ms (256ms - ~2s)
    2     128      256 ms     2048 ms - 16380 ms (~2s - ~16s)

When the slack is bigger than throtl_slice (100ms), there is a problem:
throtl_slice_used() will always return true, a new slice will always be
generated, and the bio will be throttled forever.

The following is an example:

 echo 253:0 512 > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
 fio --readonly --direct=1 --filename=/dev/vda --size=4K --rate=4K \
 --rw=read --ioengine=libaio --iodepth=16 --name 1

 the slack of the 8s timer is about 302ms.

 throtl / [R] bio. bdisp=0 sz=4096 bps=512 iodisp=0 iops=4294967295 queued=0/0
 throtl schedule timer. delay=8000 jiffies=4295784850
 throtl / dispatch nr_queued=1 read=1 write=0, bdisp=0/0, iodisp=0/0
 throtl / [R] new slice start=4295793152 end=4295793252 jiffies=4295793152
 throtl / [R] extend slice start=4295793152 end=4295801200 jiffies=4295793152
 throtl schedule timer. delay=8000 jiffies=4295793152
 throtl / dispatch nr_queued=1 read=1 write=0, bdisp=0/0, iodisp=0/0
 throtl / [R] new slice start=4295801344 end=4295801444 jiffies=4295801344
 throtl / [R] extend slice start=4295801344 end=4295809400 jiffies=4295801344
 throtl schedule timer. delay=8000 jiffies=4295801344
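
My reading of the loop in the log above (numbers taken from the trace and
rounded; at bps=512 a 4096-byte bio needs ~8s of slice time):

	/*
	 * t0           new slice [t0, t0+100ms], extended to [t0, t0+8048ms],
	 *              dispatch timer armed with an 8000ms delay
	 * ~t0+8200ms   the timer fires late because of the slack; jiffies is
	 *              already past slice_end, so throtl_slice_used() returns
	 *              true, a new slice is started and bytes_disp is reset
	 * ...          the same thing repeats on every timer expiry, so the
	 *              bio is never dispatched
	 */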

Fix it by checking for a delayed dispatch in tg_may_dispatch():
1. If any bio has been dispatched, the time slice must have been used,
   so it is OK to renew the time slice.
2. If there is no queued bio, the time slice must have expired,
   so it is OK to renew the time slice.

Signed-off-by: Hou Tao <hout...@huawei.com>
---
 block/blk-throttle.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f1aba26..91f8140 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -591,13 +591,20 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
   tg->slice_end[rw], jiffies);
 }
 
+static bool throtl_is_delayed_disp(struct throtl_grp *tg, bool rw)
+{
+   return (time_after(jiffies, tg->slice_end[rw]) &&
+   !tg->bytes_disp[rw] && !tg->io_disp[rw] &&
+   tg->service_queue.nr_queued[rw]) ? true : false;
+}
+
 /* Determine if previously allocated or extended slice is complete or not */
 static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
 {
if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
return false;
 
-   return 1;
+   return true;
 }
 
 /* Trim the used slices and adjust slice start accordingly */
@@ -782,7 +789,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
 * existing slice to make sure it is at least throtl_slice interval
 * long since now.
 */
-   if (throtl_slice_used(tg, rw))
+   if (throtl_slice_used(tg, rw) && !throtl_is_delayed_disp(tg, rw))
throtl_start_new_slice(tg, rw);
else {
if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
-- 
2.5.5
