[PATCH] block/loop: Don't hold lock while rereading partition.

2018-09-24 Thread Tetsuo Handa
syzbot is reporting a circular locking dependency between bdev->bd_mutex and
lo->lo_ctl_mutex [1] which is caused by calling blkdev_reread_part() with the
lock held. Don't hold loop_ctl_mutex while calling blkdev_reread_part().
Also, move the bdgrab() call in loop_set_fd() to before loop_reread_partitions()
in case loop_clr_fd() is called while blkdev_reread_part() from
loop_set_fd() is in progress.
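
The shape of that change can be sketched in plain userspace C with pthread
mutexes. Everything below (struct dev, reread_partitions(), ...) is an
illustrative stand-in, not the loop driver's code:

/*
 * Minimal userspace sketch of the pattern used above: snapshot the fields
 * the warning needs, drop the lock, call the function that takes the other
 * mutex, then re-acquire.  Build with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t ctl_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t bd_mutex  = PTHREAD_MUTEX_INITIALIZER;

struct dev {
        int  number;
        char file_name[64];
};

/* Stand-in for blkdev_reread_part(): takes bd_mutex internally. */
static int reread_partitions(void)
{
        pthread_mutex_lock(&bd_mutex);
        /* ... rescan partitions ... */
        pthread_mutex_unlock(&bd_mutex);
        return 0;
}

/* Called with ctl_mutex held; returns with ctl_mutex held. */
static void dev_reread(struct dev *d)
{
        char filename[64];
        int num = d->number;
        int rc;

        /* Copy what the warning below needs while still locked. */
        memcpy(filename, d->file_name, sizeof(filename));

        pthread_mutex_unlock(&ctl_mutex);  /* never hold it across bd_mutex */
        rc = reread_partitions();
        pthread_mutex_lock(&ctl_mutex);

        if (rc)
                fprintf(stderr, "partition scan of dev%d (%s) failed (rc=%d)\n",
                        num, filename, rc);
}

int main(void)
{
        struct dev d = { .number = 0, .file_name = "backing.img" };

        pthread_mutex_lock(&ctl_mutex);
        dev_reread(&d);
        pthread_mutex_unlock(&ctl_mutex);
        return 0;
}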

[1] 
https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889

Signed-off-by: Tetsuo Handa 
Reported-by: syzbot 

---
 drivers/block/loop.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 920cbb1..877cca8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -632,7 +632,12 @@ static void loop_reread_partitions(struct loop_device *lo,
   struct block_device *bdev)
 {
int rc;
+   char filename[LO_NAME_SIZE];
+   const int num = lo->lo_number;
+   const int count = atomic_read(&lo->lo_refcnt);
 
+   memcpy(filename, lo->lo_file_name, sizeof(filename));
+   mutex_unlock(&loop_ctl_mutex);
/*
 * bd_mutex has been held already in release path, so don't
 * acquire it if this function is called in such case.
@@ -641,13 +646,14 @@ static void loop_reread_partitions(struct loop_device *lo,
 * must be at least one and it can only become zero when the
 * current holder is released.
 */
-   if (!atomic_read(&lo->lo_refcnt))
+   if (!count)
rc = __blkdev_reread_part(bdev);
else
rc = blkdev_reread_part(bdev);
+   mutex_lock(&loop_ctl_mutex);
if (rc)
pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
-   __func__, lo->lo_number, lo->lo_file_name, rc);
+   __func__, num, filename, rc);
 }
 
 static inline int is_loop_device(struct file *file)
@@ -971,16 +977,18 @@ static int loop_set_fd(struct loop_device *lo, fmode_t 
mode,
set_blocksize(bdev, S_ISBLK(inode->i_mode) ?
  block_size(inode->i_bdev) : PAGE_SIZE);
 
+   /*
+* Grab the block_device to prevent its destruction after we
+* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
+*/
+   bdgrab(bdev);
+
lo->lo_state = Lo_bound;
if (part_shift)
lo->lo_flags |= LO_FLAGS_PARTSCAN;
if (lo->lo_flags & LO_FLAGS_PARTSCAN)
loop_reread_partitions(lo, bdev);
 
-   /* Grab the block_device to prevent its destruction after we
-* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
-*/
-   bdgrab(bdev);
return 0;
 
  out_putf:
-- 
1.8.3.1



[PATCH v2] block/loop: Use global lock for ioctl() operation.

2018-09-24 Thread Tetsuo Handa
syzbot is reporting a NULL pointer dereference [1] which is caused by a
race condition between ioctl(loop_fd, LOOP_CLR_FD, 0) and
ioctl(other_loop_fd, LOOP_SET_FD, loop_fd) due to traversing other
loop devices in loop_validate_file() without holding the corresponding
lo->lo_ctl_mutex locks.

Since ioctl() requests on loop devices are not frequent operations, we don't
need fine-grained locking. Let's use a global lock in order to allow safe
traversal in loop_validate_file().
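
A minimal userspace sketch of that idea (the names here are invented and this
is not the loop driver's code): because every device shares one global mutex,
walking a chain of backing devices cannot race with another thread clearing
one of them.

/*
 * Userspace sketch of the "one global lock" idea.  Walking a chain of
 * backing devices is safe because every mutation takes the same mutex.
 */
#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t ctl_mutex = PTHREAD_MUTEX_INITIALIZER;

struct dev {
        struct dev *backing;    /* next device in the chain, or NULL */
        int bound;
};

/* LOOP_SET_FD-like path: validate the whole chain under the global lock. */
static int validate_chain(struct dev *d)
{
        int err = 0;

        pthread_mutex_lock(&ctl_mutex);
        for (; d; d = d->backing) {
                if (!d->bound) {        /* cleared by the CLR_FD-like path */
                        err = -1;
                        break;
                }
        }
        pthread_mutex_unlock(&ctl_mutex);
        return err;
}

/* LOOP_CLR_FD-like path: mutations take the very same global lock. */
static void clear_dev(struct dev *d)
{
        pthread_mutex_lock(&ctl_mutex);
        d->bound = 0;
        d->backing = NULL;
        pthread_mutex_unlock(&ctl_mutex);
}

int main(void)
{
        struct dev inner = { .backing = NULL, .bound = 1 };
        struct dev outer = { .backing = &inner, .bound = 1 };

        int ok = (validate_chain(&outer) == 0);
        clear_dev(&inner);
        return (ok && validate_chain(&outer) != 0) ? 0 : 1;
}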

Note that syzbot is also reporting a circular locking dependency between
bdev->bd_mutex and lo->lo_ctl_mutex [2] which is caused by calling
blkdev_reread_part() with the lock held. This patch does not address it.

[1] 
https://syzkaller.appspot.com/bug?id=f3cfe26e785d85f9ee259f385515291d21bd80a3
[2] 
https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889

v2: Don't call mutex_init() upon loop_add() request.

Signed-off-by: Tetsuo Handa 
Reported-by: syzbot 
---
 drivers/block/loop.c | 58 ++--
 drivers/block/loop.h |  1 -
 2 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c2745e6..920cbb1 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -85,6 +85,7 @@
 
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_index_mutex);
+static DEFINE_MUTEX(loop_ctl_mutex);
 
 static int max_part;
 static int part_shift;
@@ -1048,7 +1049,7 @@ static int loop_clr_fd(struct loop_device *lo)
 */
if (atomic_read(&lo->lo_refcnt) > 1) {
lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return 0;
}
 
@@ -1101,12 +1102,12 @@ static int loop_clr_fd(struct loop_device *lo)
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo);
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
/*
-* Need not hold lo_ctl_mutex to fput backing file.
-* Calling fput holding lo_ctl_mutex triggers a circular
+* Need not hold loop_ctl_mutex to fput backing file.
+* Calling fput holding loop_ctl_mutex triggers a circular
 * lock dependency possibility warning as fput can take
-* bd_mutex which is usually taken before lo_ctl_mutex.
+* bd_mutex which is usually taken before loop_ctl_mutex.
 */
fput(filp);
return 0;
@@ -1211,7 +1212,7 @@ static int loop_clr_fd(struct loop_device *lo)
int ret;
 
if (lo->lo_state != Lo_bound) {
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return -ENXIO;
}
 
@@ -1230,10 +1231,10 @@ static int loop_clr_fd(struct loop_device *lo)
   lo->lo_encrypt_key_size);
}
 
-   /* Drop lo_ctl_mutex while we call into the filesystem. */
+   /* Drop loop_ctl_mutex while we call into the filesystem. */
path = lo->lo_backing_file->f_path;
path_get(&path);
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
if (!ret) {
info->lo_device = huge_encode_dev(stat.dev);
@@ -1325,7 +1326,7 @@ static int loop_clr_fd(struct loop_device *lo)
int err;
 
if (!arg) {
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return -EINVAL;
}
err = loop_get_status(lo, &info64);
@@ -1343,7 +1344,7 @@ static int loop_clr_fd(struct loop_device *lo)
int err;
 
if (!arg) {
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return -EINVAL;
}
err = loop_get_status(lo, &info64);
@@ -1401,7 +1402,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t 
mode,
struct loop_device *lo = bdev->bd_disk->private_data;
int err;
 
-   err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1);
+   err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
if (err)
goto out_unlocked;
 
@@ -1413,7 +1414,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t 
mode,
err = loop_change_fd(lo, bdev, arg);
break;
case LOOP_CLR_FD:
-   /* loop_clr_fd would have unlocked lo_ctl_mutex on success */
+   /* loop_clr_fd would have unlocked loop_ctl_mutex on success */
err = loop_clr_fd(lo);
if (!err)
goto out_unlocked;
@@ -1426,7 +1427,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t 
mode,
break;
case LOOP_GET_STATUS:
err = loop_get_status_old(lo, (struct loop_info __user *) arg);
-   /* loop_get_statu

[PATCH] block/loop: Use global lock for ioctl() operation.

2018-09-24 Thread Tetsuo Handa
syzbot is reporting a NULL pointer dereference [1] which is caused by a
race condition between ioctl(loop_fd, LOOP_CLR_FD, 0) and
ioctl(other_loop_fd, LOOP_SET_FD, loop_fd) due to traversing other
loop devices in loop_validate_file() without holding the corresponding
lo->lo_ctl_mutex locks.

Since ioctl() requests on loop devices are not frequent operations, we don't
need fine-grained locking. Let's use a global lock in order to allow safe
traversal in loop_validate_file().

Note that syzbot is also reporting a circular locking dependency between
bdev->bd_mutex and lo->lo_ctl_mutex [2] which is caused by calling
blkdev_reread_part() with the lock held. This patch does not address it.

[1] 
https://syzkaller.appspot.com/bug?id=f3cfe26e785d85f9ee259f385515291d21bd80a3
[2] 
https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889

Signed-off-by: Tetsuo Handa 
Reported-by: syzbot 
---
 drivers/block/loop.c | 59 ++--
 drivers/block/loop.h |  1 -
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c2745e6..1477f52e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -85,6 +85,7 @@
 
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_index_mutex);
+static DEFINE_MUTEX(loop_ctl_mutex);
 
 static int max_part;
 static int part_shift;
@@ -1048,7 +1049,7 @@ static int loop_clr_fd(struct loop_device *lo)
 */
if (atomic_read(&lo->lo_refcnt) > 1) {
lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return 0;
}
 
@@ -1101,12 +1102,12 @@ static int loop_clr_fd(struct loop_device *lo)
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo);
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
/*
-* Need not hold lo_ctl_mutex to fput backing file.
-* Calling fput holding lo_ctl_mutex triggers a circular
+* Need not hold loop_ctl_mutex to fput backing file.
+* Calling fput holding loop_ctl_mutex triggers a circular
 * lock dependency possibility warning as fput can take
-* bd_mutex which is usually taken before lo_ctl_mutex.
+* bd_mutex which is usually taken before loop_ctl_mutex.
 */
fput(filp);
return 0;
@@ -1211,7 +1212,7 @@ static int loop_clr_fd(struct loop_device *lo)
int ret;
 
if (lo->lo_state != Lo_bound) {
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return -ENXIO;
}
 
@@ -1230,10 +1231,10 @@ static int loop_clr_fd(struct loop_device *lo)
   lo->lo_encrypt_key_size);
}
 
-   /* Drop lo_ctl_mutex while we call into the filesystem. */
+   /* Drop loop_ctl_mutex while we call into the filesystem. */
path = lo->lo_backing_file->f_path;
path_get(&path);
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
if (!ret) {
info->lo_device = huge_encode_dev(stat.dev);
@@ -1325,7 +1326,7 @@ static int loop_clr_fd(struct loop_device *lo)
int err;
 
if (!arg) {
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return -EINVAL;
}
err = loop_get_status(lo, &info64);
@@ -1343,7 +1344,7 @@ static int loop_clr_fd(struct loop_device *lo)
int err;
 
if (!arg) {
-   mutex_unlock(&lo->lo_ctl_mutex);
+   mutex_unlock(&loop_ctl_mutex);
return -EINVAL;
}
err = loop_get_status(lo, &info64);
@@ -1401,7 +1402,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t 
mode,
struct loop_device *lo = bdev->bd_disk->private_data;
int err;
 
-   err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1);
+   err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
if (err)
goto out_unlocked;
 
@@ -1413,7 +1414,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t 
mode,
err = loop_change_fd(lo, bdev, arg);
break;
case LOOP_CLR_FD:
-   /* loop_clr_fd would have unlocked lo_ctl_mutex on success */
+   /* loop_clr_fd would have unlocked loop_ctl_mutex on success */
err = loop_clr_fd(lo);
if (!err)
goto out_unlocked;
@@ -1426,7 +1427,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t 
mode,
break;
case LOOP_GET_STATUS:
err = loop_get_status_old(lo, (struct loop_info __user *) arg);
-   /* loop_get_status() unlocks lo_ctl_mutex */
+   /* loop_g

[PATCH] block/loop: Don't grab "struct file" for vfs_getattr() operation.

2018-09-24 Thread Tetsuo Handa
vfs_getattr() needs "struct path" rather than "struct file".
Let's use path_get()/path_put() rather than get_file()/fput().

Signed-off-by: Tetsuo Handa 
---
 drivers/block/loop.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index abad6d1..c2745e6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1206,7 +1206,7 @@ static int loop_clr_fd(struct loop_device *lo)
 static int
 loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 {
-   struct file *file;
+   struct path path;
struct kstat stat;
int ret;
 
@@ -1231,16 +1231,16 @@ static int loop_clr_fd(struct loop_device *lo)
}
 
/* Drop lo_ctl_mutex while we call into the filesystem. */
-   file = get_file(lo->lo_backing_file);
+   path = lo->lo_backing_file->f_path;
+   path_get(&path);
mutex_unlock(&lo->lo_ctl_mutex);
-   ret = vfs_getattr(&file->f_path, &stat, STATX_INO,
- AT_STATX_SYNC_AS_STAT);
+   ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
if (!ret) {
info->lo_device = huge_encode_dev(stat.dev);
info->lo_inode = stat.ino;
info->lo_rdevice = huge_encode_dev(stat.rdev);
}
-   fput(file);
+   path_put(&path);
return ret;
 }
 
-- 
1.8.3.1



Re: [PATCH] blk-mq: Allow blocking queue tag iter callbacks

2018-09-24 Thread jianchao.wang
Hi Bart

On 09/25/2018 10:20 AM, Bart Van Assche wrote:
> On 9/24/18 7:11 PM, jianchao.wang wrote:
>> Hi Keith
>>
>> On 09/25/2018 05:09 AM, Keith Busch wrote:
>>> -    /* A deadlock might occur if a request is stuck requiring a
>>> - * timeout at the same time a queue freeze is waiting
>>> - * completion, since the timeout code would not be able to
>>> - * acquire the queue reference here.
>>> - *
>>> - * That's why we don't use blk_queue_enter here; instead, we use
>>> - * percpu_ref_tryget directly, because we need to be able to
>>> - * obtain a reference even in the short window between the queue
>>> - * starting to freeze, by dropping the first reference in
>>> - * blk_freeze_queue_start, and the moment the last request is
>>> - * consumed, marked by the instant q_usage_counter reaches
>>> - * zero.
>>> - */
>>> -    if (!percpu_ref_tryget(&q->q_usage_counter))
>>> +    if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next))
>>>   return;
>>
>> We cannot discard the percpu_ref_tryget here.
>>
>> There following code in blk_mq_timeout_work still need it:
>>
>> if (next != 0) {
>>     mod_timer(&q->timeout, next);
>> } else {
>>     queue_for_each_hw_ctx(q, hctx, i) {
>>     /* the hctx may be unmapped, so check it here */
>>     if (blk_mq_hw_queue_mapped(hctx))
>>     blk_mq_tag_idle(hctx);
>>     }
>> }
> 
> Hi Jianchao,
> 
> Had you noticed that the percpu_ref_tryget() call has been moved into
> blk_mq_queue_tag_busy_iter()?
> 

Yes.

But the issue is that the remaining part of blk_mq_timeout_work is moved out
of the protection of the queue refcount.

Thanks
Jianchao


Re: [PATCH] blk-mq: Allow blocking queue tag iter callbacks

2018-09-24 Thread Bart Van Assche

On 9/24/18 7:11 PM, jianchao.wang wrote:

Hi Keith

On 09/25/2018 05:09 AM, Keith Busch wrote:

-   /* A deadlock might occur if a request is stuck requiring a
-* timeout at the same time a queue freeze is waiting
-* completion, since the timeout code would not be able to
-* acquire the queue reference here.
-*
-* That's why we don't use blk_queue_enter here; instead, we use
-* percpu_ref_tryget directly, because we need to be able to
-* obtain a reference even in the short window between the queue
-* starting to freeze, by dropping the first reference in
-* blk_freeze_queue_start, and the moment the last request is
-* consumed, marked by the instant q_usage_counter reaches
-* zero.
-*/
-   if (!percpu_ref_tryget(&q->q_usage_counter))
+   if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next))
return;


We cannot discard the percpu_ref_tryget here.

The following code in blk_mq_timeout_work still needs it:

if (next != 0) {
mod_timer(&q->timeout, next);
} else {
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
}


Hi Jianchao,

Had you noticed that the percpu_ref_tryget() call has been moved into
blk_mq_queue_tag_busy_iter()?

Bart.



Re: [PATCH] blk-mq: Allow blocking queue tag iter callbacks

2018-09-24 Thread jianchao.wang
Hi Keith

On 09/25/2018 05:09 AM, Keith Busch wrote:
> - /* A deadlock might occur if a request is stuck requiring a
> -  * timeout at the same time a queue freeze is waiting
> -  * completion, since the timeout code would not be able to
> -  * acquire the queue reference here.
> -  *
> -  * That's why we don't use blk_queue_enter here; instead, we use
> -  * percpu_ref_tryget directly, because we need to be able to
> -  * obtain a reference even in the short window between the queue
> -  * starting to freeze, by dropping the first reference in
> -  * blk_freeze_queue_start, and the moment the last request is
> -  * consumed, marked by the instant q_usage_counter reaches
> -  * zero.
> -  */
> - if (!percpu_ref_tryget(&q->q_usage_counter))
> + if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next))
>   return;

We cannot discard the percpu_ref_tryget here.

The following code in blk_mq_timeout_work still needs it:

if (next != 0) {
mod_timer(&q->timeout, next);
} else {
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
}

Thanks
Jianchao


Re: clean up physical merging helpers V2

2018-09-24 Thread Ming Lei
On Mon, Sep 24, 2018 at 09:43:45AM +0200, Christoph Hellwig wrote:
> Hi Jens,
> 
> this series moves various helpers related to merging based on physical
> addresses from the public headers into block/, cleans up the code a bit
> and removes unnecessary includes from the block headers.
> 
> Change since V1:
>  - dropped the Xen related changes which are moved into a new series
>to be sent after this one

Looks fine:

Reviewed-by: Ming Lei 

Thanks,
Ming


[PATCH] blk-mq: Allow blocking queue tag iter callbacks

2018-09-24 Thread Keith Busch
A recent commit had tag iterator callbacks run under the rcu read
lock. Existing callbacks do not satisfy the rcu non-blocking
requirement.

The commit intended to prevent an iterator from accessing a queue that's
being modified. This patch fixes the original issue by taking a queue
reference if the queue is not frozen instead of an rcu read lock.
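
A rough userspace analogue of that pattern, with a C11 atomic counter standing
in for q_usage_counter (every name below is made up): the iterator takes the
reference itself and returns a bool so the caller knows whether the callbacks
ever ran.

/*
 * The iterator tries to take a reference, reports false without iterating
 * if the object is already drained, and drops the reference when done.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct queue {
        atomic_int usage;       /* 0 means drained, like a frozen queue */
        int tags[4];
};

static bool queue_tryget(struct queue *q)
{
        int old = atomic_load(&q->usage);

        do {
                if (old == 0)
                        return false;   /* already drained, don't revive it */
        } while (!atomic_compare_exchange_weak(&q->usage, &old, old + 1));
        return true;
}

static void queue_put(struct queue *q)
{
        atomic_fetch_sub(&q->usage, 1);
}

/* Returns false if the queue could not be referenced (nothing was iterated). */
static bool busy_iter(struct queue *q, void (*fn)(int tag))
{
        if (!queue_tryget(q))
                return false;
        for (int i = 0; i < 4; i++)
                fn(q->tags[i]);
        queue_put(q);
        return true;
}

static void print_tag(int tag)
{
        printf("tag %d\n", tag);
}

int main(void)
{
        struct queue q = { .usage = 1, .tags = { 1, 2, 3, 4 } };

        if (!busy_iter(&q, print_tag))
                return 1;       /* caller skips its follow-up work */
        return 0;
}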

Fixes: f5bbbbe4d635 ("blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter")
Cc: Jianchao Wang 
Signed-off-by: Keith Busch 
---
 block/blk-mq-tag.c | 30 +-
 block/blk-mq-tag.h |  2 +-
 block/blk-mq.c | 22 +-
 3 files changed, 19 insertions(+), 35 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 94e1ed667b6e..63542642a017 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -314,24 +314,27 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set 
*tagset,
 }
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
 
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+bool blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv)
 {
struct blk_mq_hw_ctx *hctx;
int i;
 
-   /*
-* __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
-* queue_hw_ctx after freeze the queue. So we could use q_usage_counter
-* to avoid race with it. __blk_mq_update_nr_hw_queues will users
-* synchronize_rcu to ensure all of the users go out of the critical
-* section below and see zeroed q_usage_counter.
+   /* A deadlock might occur if a request is stuck requiring a
+* timeout at the same time a queue freeze is waiting
+* completion, since the timeout code would not be able to
+* acquire the queue reference here.
+*
+* That's why we don't use blk_queue_enter here; instead, we use
+* percpu_ref_tryget directly, because we need to be able to
+* obtain a reference even in the short window between the queue
+* starting to freeze, by dropping the first reference in
+* blk_freeze_queue_start, and the moment the last request is
+* consumed, marked by the instant q_usage_counter reaches
+* zero.
 */
-   rcu_read_lock();
-   if (percpu_ref_is_zero(&q->q_usage_counter)) {
-   rcu_read_unlock();
-   return;
-   }
+   if (!percpu_ref_tryget(&q->q_usage_counter))
+   return false;
 
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
@@ -347,7 +350,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, 
busy_iter_fn *fn,
bt_for_each(hctx, &tags->breserved_tags, fn, priv, 
true);
bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
}
-   rcu_read_unlock();
+   blk_queue_exit(q);
+   return true;
 }
 
 static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..36b3bc90e867 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -33,7 +33,7 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags,
unsigned int depth, bool can_grow);
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+bool blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 85a1c1a59c72..019f9b169887 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -848,24 +848,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   /* A deadlock might occur if a request is stuck requiring a
-* timeout at the same time a queue freeze is waiting
-* completion, since the timeout code would not be able to
-* acquire the queue reference here.
-*
-* That's why we don't use blk_queue_enter here; instead, we use
-* percpu_ref_tryget directly, because we need to be able to
-* obtain a reference even in the short window between the queue
-* starting to freeze, by dropping the first reference in
-* blk_freeze_queue_start, and the moment the last request is
-* consumed, marked by the instant q_usage_counter reaches
-* zero.
-*/
-   if (!percpu_ref_tryget(&q->q_usage_counter))
+   if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next))
return;
 
-   blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
-
if (next != 0) {
mod_timer(&q->timeout, next);
} else {
@@ -881,7 +866,6 @@ stat

Re: [PATCH v4] block/loop: Serialize ioctl operations.

2018-09-24 Thread Tetsuo Handa
On 2018/09/25 3:47, Jan Kara wrote:
>> +/*
>> + * unlock_loop - Unlock loop_mutex as needed.
>> + *
>> + * Explicitly call this function before calling fput() or 
>> blkdev_reread_part()
>> + * in order to avoid circular lock dependency. After this function is 
>> called,
>> + * current thread is no longer allowed to access "struct loop_device" 
>> memory,
>> + * for another thread would access that memory as soon as loop_mutex is 
>> held.
>> + */
>> +static void unlock_loop(void)
>> +{
>> +if (loop_mutex_owner == current) {
> 
> Urgh, why this check? Conditional locking / unlocking is evil so it has to
> have *very* good reasons and there is not any explanation here. So far I
> don't see a reason why this is needed at all.

Yeah, this is why Jens hates this patch. But any alternative?



>> @@ -630,7 +669,12 @@ static void loop_reread_partitions(struct loop_device 
>> *lo,
>> +unlock_loop();
> 
> Unlocking in loop_reread_partitions() makes the locking rules ugly. And
> unnecessarily AFAICT. Can't we just use lo_refcnt to protect us against
> loop_clr_fd() and freeing of 'lo' structure itself?

Really? I think that just elevating lo->lo_refcnt will cause another lockdep
warning because __blkdev_reread_part() requires bdev->bd_mutex to be held.
Don't we need to drop the lock in order to solve the original lockdep warning
at [2]?

int __blkdev_reread_part(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;

if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;

lockdep_assert_held(&bdev->bd_mutex);

return rescan_partitions(disk, bdev);
}

int blkdev_reread_part(struct block_device *bdev)
{
int res;

mutex_lock(&bdev->bd_mutex);
res = __blkdev_reread_part(bdev);
mutex_unlock(&bdev->bd_mutex);

return res;
}



Re: [PATCH v10 5/8] percpu-refcount: Introduce percpu_ref_resurrect()

2018-09-24 Thread Bart Van Assche
On Mon, 2018-09-24 at 11:01 -0700, Tejun Heo wrote:
> Hello, Bart.
> 
> On Fri, Sep 21, 2018 at 01:31:19PM -0700, Bart Van Assche wrote:
> > +void percpu_ref_resurrect(struct percpu_ref *ref)
> > +{
> > +   unsigned long __percpu *percpu_count;
> > unsigned long flags;
> >  
> > spin_lock_irqsave(&percpu_ref_switch_lock, flags);
> >  
> > -   WARN_ON_ONCE(!percpu_ref_is_zero(ref));
> > +   WARN_ON_ONCE(!(ref->percpu_count_ptr & __PERCPU_REF_DEAD));
> > +   WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count));
> 
> So, this in itself is fine but I'm a bit worried that this might make it
> easier to abuse percpu_ref's dying mode.  More specifically, it isn't
> a good idea to depend on percpu_ref's implementation details from
> outside - ie. the only guarantee there ever is is that percpu_ref won't
> give out new refs after ->percpu_ref is called - e.g. users can't
> piggyback on implied rcu grace period.
> 
> Can you please note that in the function comment?  Provided that,
> 
> Acked-by: Tejun Heo 

Hi Tejun,

Thanks for the review. But could you please clarify what "after ->percpu_ref
is called" refers to?

Thanks,

Bart.



Re: Regression caused by f5bbbbe4d635

2018-09-24 Thread Keith Busch
On Mon, Sep 24, 2018 at 12:51:07PM -0700, Bart Van Assche wrote:
> On Mon, 2018-09-24 at 13:13 -0600, Keith Busch wrote:
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 85a1c1a59c72..28d128450621 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -848,22 +848,6 @@ static void blk_mq_timeout_work(struct work_struct 
> > *work)
> > struct blk_mq_hw_ctx *hctx;
> > int i;
> >  
> > -   /* A deadlock might occur if a request is stuck requiring a
> > -* timeout at the same time a queue freeze is waiting
> > -* completion, since the timeout code would not be able to
> > -* acquire the queue reference here.
> > -*
> > -* That's why we don't use blk_queue_enter here; instead, we use
> > -* percpu_ref_tryget directly, because we need to be able to
> > -* obtain a reference even in the short window between the queue
> > -* starting to freeze, by dropping the first reference in
> > -* blk_freeze_queue_start, and the moment the last request is
> > -* consumed, marked by the instant q_usage_counter reaches
> > -* zero.
> > -*/
> > -   if (!percpu_ref_tryget(&q->q_usage_counter))
> > -   return;
> > -
> > blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
> >  
> > if (next != 0) {
> > @@ -881,7 +865,6 @@ static void blk_mq_timeout_work(struct work_struct 
> > *work)
> > blk_mq_tag_idle(hctx);
> > }
> > }
> > -   blk_queue_exit(q);
> >  }
> 
> Hi Keith,
> 
> The above introduces a behavior change: if the percpu_ref_tryget() call inside
> blk_mq_queue_tag_busy_iter() fails then blk_mq_timeout_work() will now call
> blk_mq_tag_idle(). I think that's wrong if the percpu_ref_tryget() call fails
> due to the queue having been frozen. Please make blk_mq_queue_tag_busy_iter()
> return a bool that indicates whether or not it has iterated over the request
> queue.

Good point, thanks for the feedback.


Re: Regression caused by f5bbbbe4d635

2018-09-24 Thread Bart Van Assche
On Mon, 2018-09-24 at 13:13 -0600, Keith Busch wrote:
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 85a1c1a59c72..28d128450621 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -848,22 +848,6 @@ static void blk_mq_timeout_work(struct work_struct *work)
>   struct blk_mq_hw_ctx *hctx;
>   int i;
>  
> - /* A deadlock might occur if a request is stuck requiring a
> -  * timeout at the same time a queue freeze is waiting
> -  * completion, since the timeout code would not be able to
> -  * acquire the queue reference here.
> -  *
> -  * That's why we don't use blk_queue_enter here; instead, we use
> -  * percpu_ref_tryget directly, because we need to be able to
> -  * obtain a reference even in the short window between the queue
> -  * starting to freeze, by dropping the first reference in
> -  * blk_freeze_queue_start, and the moment the last request is
> -  * consumed, marked by the instant q_usage_counter reaches
> -  * zero.
> -  */
> - if (!percpu_ref_tryget(&q->q_usage_counter))
> - return;
> -
>   blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
>  
>   if (next != 0) {
> @@ -881,7 +865,6 @@ static void blk_mq_timeout_work(struct work_struct *work)
>   blk_mq_tag_idle(hctx);
>   }
>   }
> - blk_queue_exit(q);
>  }

Hi Keith,

The above introduces a behavior change: if the percpu_ref_tryget() call inside
blk_mq_queue_tag_busy_iter() fails then blk_mq_timeout_work() will now call
blk_mq_tag_idle(). I think that's wrong if the percpu_ref_tryget() call fails
due to the queue having been frozen. Please make blk_mq_queue_tag_busy_iter()
return a bool that indicates whether or not it has iterated over the request
queue.

Thanks,

Bart.


Re: Regression caused by f5bbbbe4d635

2018-09-24 Thread Keith Busch
On Mon, Sep 24, 2018 at 12:44:13PM -0600, Jens Axboe wrote:
> Hi,
> 
> This commit introduced a rcu_read_lock() inside
> blk_mq_queue_tag_busy_iter() - this is problematic for the timeout code,
> since we now end up holding the RCU read lock over the timeout code. As
> just one example, nvme ends up doing:
> 
> nvme_timeout()
>   nvme_dev_disable()
>   mutex_lock(&dev->shutdown_lock);
> 
> and things are then obviously unhappy...

Yah, there's never been a requirement that tag iterator callbacks be
non-blocking as far as I remember.

The queue's reference in blk_mq_timeout_work looks applicable to any
blk_mq_queue_tag_busy_iter user, so just moving it there looks like it
should do what f5bbbbe4d635 was trying to fix.

---
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 94e1ed667b6e..850577a3de6d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -320,18 +320,21 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, 
busy_iter_fn *fn,
struct blk_mq_hw_ctx *hctx;
int i;
 
-   /*
-* __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
-* queue_hw_ctx after freeze the queue. So we could use q_usage_counter
-* to avoid race with it. __blk_mq_update_nr_hw_queues will users
-* synchronize_rcu to ensure all of the users go out of the critical
-* section below and see zeroed q_usage_counter.
+   /* A deadlock might occur if a request is stuck requiring a
+* timeout at the same time a queue freeze is waiting
+* completion, since the timeout code would not be able to
+* acquire the queue reference here.
+*
+* That's why we don't use blk_queue_enter here; instead, we use
+* percpu_ref_tryget directly, because we need to be able to
+* obtain a reference even in the short window between the queue
+* starting to freeze, by dropping the first reference in
+* blk_freeze_queue_start, and the moment the last request is
+* consumed, marked by the instant q_usage_counter reaches
+* zero.
 */
-   rcu_read_lock();
-   if (percpu_ref_is_zero(&q->q_usage_counter)) {
-   rcu_read_unlock();
+   if (!percpu_ref_tryget(&q->q_usage_counter))
return;
-   }
 
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
@@ -347,7 +350,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, 
busy_iter_fn *fn,
bt_for_each(hctx, &tags->breserved_tags, fn, priv, 
true);
bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
}
-   rcu_read_unlock();
+   blk_queue_exit(q);
 }
 
 static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 85a1c1a59c72..28d128450621 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -848,22 +848,6 @@ static void blk_mq_timeout_work(struct work_struct *work)
struct blk_mq_hw_ctx *hctx;
int i;
 
-   /* A deadlock might occur if a request is stuck requiring a
-* timeout at the same time a queue freeze is waiting
-* completion, since the timeout code would not be able to
-* acquire the queue reference here.
-*
-* That's why we don't use blk_queue_enter here; instead, we use
-* percpu_ref_tryget directly, because we need to be able to
-* obtain a reference even in the short window between the queue
-* starting to freeze, by dropping the first reference in
-* blk_freeze_queue_start, and the moment the last request is
-* consumed, marked by the instant q_usage_counter reaches
-* zero.
-*/
-   if (!percpu_ref_tryget(&q->q_usage_counter))
-   return;
-
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
 
if (next != 0) {
@@ -881,7 +865,6 @@ static void blk_mq_timeout_work(struct work_struct *work)
blk_mq_tag_idle(hctx);
}
}
-   blk_queue_exit(q);
 }
 
 struct flush_busy_ctx_data {
@@ -2974,10 +2957,7 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
 
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
-   /*
-* Sync with blk_mq_queue_tag_busy_iter.
-*/
-   synchronize_rcu();
+
/*
 * Switch IO scheduler to 'none', cleaning up the data associated
 * with the previous scheduler. We will switch back once we are done
--


Regression caused by f5bbbbe4d635

2018-09-24 Thread Jens Axboe
Hi,

This commit introduced a rcu_read_lock() inside
blk_mq_queue_tag_busy_iter() - this is problematic for the timeout code,
since we now end up holding the RCU read lock over the timeout code. As
just one example, nvme ends up doing:

nvme_timeout()
nvme_dev_disable()
mutex_lock(&dev->shutdown_lock);

and things are then obviously unhappy...

-- 
Jens Axboe



Re: clean up physical merging helpers V2

2018-09-24 Thread Jens Axboe
On 9/24/18 1:43 AM, Christoph Hellwig wrote:
> Hi Jens,
> 
> this series moves various helpers related to merging based on physical
> addresses from the public headers into block/, cleans up the code a bit
> and removes unnecessary includes from the block headers.
> 
> Change since V1:
>  - dropped the Xen related changes which are moved into a new series
>to be sent after this one

Looks good to me - #1 should just go to the arm guys separately, there's
no point in me carrying this arch patch.

-- 
Jens Axboe



Re: [PATCH v10 5/8] percpu-refcount: Introduce percpu_ref_resurrect()

2018-09-24 Thread Tejun Heo
Hello, Bart.

On Fri, Sep 21, 2018 at 01:31:19PM -0700, Bart Van Assche wrote:
> +void percpu_ref_resurrect(struct percpu_ref *ref)
> +{
> + unsigned long __percpu *percpu_count;
>   unsigned long flags;
>  
>   spin_lock_irqsave(&percpu_ref_switch_lock, flags);
>  
> - WARN_ON_ONCE(!percpu_ref_is_zero(ref));
> + WARN_ON_ONCE(!(ref->percpu_count_ptr & __PERCPU_REF_DEAD));
> + WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count));

So, this in itself is fine but I'm a bit worried that this might make it
easier to abuse percpu_ref's dying mode.  More specifically, it isn't
a good idea to depend on percpu_ref's implementation details from
outside - ie. the only guarantee there ever is is that percpu_ref won't
give out new refs after ->percpu_ref is called - e.g. users can't
piggyback on implied rcu grace period.

Can you please note that in the function comment?  Provided that,

Acked-by: Tejun Heo 

Thanks.

-- 
tejun


Re: [PATCH] block: use nanosecond resolution for iostat

2018-09-24 Thread Omar Sandoval
On Fri, Sep 21, 2018 at 08:27:17PM -0600, Jens Axboe wrote:
> On 9/21/18 5:44 PM, Omar Sandoval wrote:
> > From: Omar Sandoval 
> > 
> > Klaus Kusche reported that the I/O busy time in /proc/diskstats was not
> > updating properly on 4.18. This is because we started using ktime to
> > track elapsed time, and we convert nanoseconds to jiffies when we update
> > the partition counter. However, this gets rounded down, so any I/Os that
> > take less than a jiffy are not accounted for. Previously in this case,
> > the value of jiffies would sometimes increment while we were doing I/O,
> > so at least some I/Os were accounted for.
> > 
> > Let's convert the stats to use nanoseconds internally. We still report
> > milliseconds as before, now more accurately than ever. The value is
> > still truncated to 32 bits for backwards compatibility.
> 
> Thanks Omar, applied for 4.19.

Thanks, Jens. I also just pushed a regression test to blktests.


[PATCH 06/10] block: add a missing BIOVEC_SEG_BOUNDARY check in bio_add_pc_page

2018-09-24 Thread Christoph Hellwig
The actual recalculation of segments in __blk_recalc_rq_segments will
do this check, so there is no point in forcing it if we know it won't
succeed.

Signed-off-by: Christoph Hellwig 
---
 block/bio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 81d90b839e05..c254e5aa331f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -731,7 +731,9 @@ int bio_add_pc_page(struct request_queue *q, struct bio 
*bio, struct page
}
 
/* If we may be able to merge these biovecs, force a recount */
-   if (bio->bi_vcnt > 1 && biovec_phys_mergeable(bvec-1, bvec))
+   if (bio->bi_vcnt > 1 &&
+   biovec_phys_mergeable(bvec - 1, bvec) &&
+   BIOVEC_SEG_BOUNDARY(q, bvec - 1, bvec))
bio_clear_flag(bio, BIO_SEG_VALID);
 
  done:
-- 
2.19.0



[PATCH 05/10] block: simplify BIOVEC_PHYS_MERGEABLE

2018-09-24 Thread Christoph Hellwig
Turn the macro into an inline, move it to blk.h and simplify the
arch hooks a bit.

Also rename the function to biovec_phys_mergeable as there is no need
to shout.
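
For reference, the general shape of that conversion, sketched with invented
names (this is not the actual block layer code):

/*
 * Converting a function-like macro into a static inline: the inline gains
 * type checking and evaluates its arguments exactly once.
 */
#include <stdbool.h>
#include <stdio.h>

struct vec {
        unsigned long addr;
        unsigned long len;
};

/* Before: a shouting macro, no type checking, arguments may be evaluated
 * more than once.
 */
#define VECS_CONTIGUOUS(v1, v2) \
        (((v1)->addr + (v1)->len) == (v2)->addr)

/* After: the same logic as a lowercase static inline with real types. */
static inline bool vecs_contiguous(const struct vec *v1, const struct vec *v2)
{
        return v1->addr + v1->len == v2->addr;
}

int main(void)
{
        struct vec a = { .addr = 0x1000, .len = 0x200 };
        struct vec b = { .addr = 0x1200, .len = 0x100 };

        printf("macro: %d, inline: %d\n",
               VECS_CONTIGUOUS(&a, &b), vecs_contiguous(&a, &b));
        return 0;
}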

Signed-off-by: Christoph Hellwig 
---
 arch/arm/include/asm/io.h   |  5 ++---
 arch/arm64/include/asm/io.h |  5 ++---
 arch/x86/include/asm/io.h   |  5 ++---
 block/bio.c |  2 +-
 block/blk-integrity.c   |  4 ++--
 block/blk-merge.c   | 10 +-
 block/blk.h | 14 ++
 include/linux/bio.h | 13 -
 8 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 6774553dc214..e58ca25eddb7 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -462,9 +462,8 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem 
*addr);
 struct bio_vec;
 extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
  const struct bio_vec *vec2);
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)  \
-   (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
-(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#define ARCH_BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))
 
 #ifdef CONFIG_MMU
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 35b2e50f17fb..774e03ea1bb0 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -208,9 +208,8 @@ extern int devmem_is_allowed(unsigned long pfn);
 struct bio_vec;
 extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
  const struct bio_vec *vec2);
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)  \
-   (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
-(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#define ARCH_BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_IO_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 6de64840dd22..7c6106216d9c 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -376,9 +376,8 @@ struct bio_vec;
 extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
  const struct bio_vec *vec2);
 
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)  \
-   (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
-(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#define ARCH_BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))
 #endif /* CONFIG_XEN */
 
 #define IO_SPACE_LIMIT 0x
diff --git a/block/bio.c b/block/bio.c
index 1cd47f218200..81d90b839e05 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -731,7 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio 
*bio, struct page
}
 
/* If we may be able to merge these biovecs, force a recount */
-   if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+   if (bio->bi_vcnt > 1 && biovec_phys_mergeable(bvec-1, bvec))
bio_clear_flag(bio, BIO_SEG_VALID);
 
  done:
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 6121611e1316..0f7267916509 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -49,7 +49,7 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct 
bio *bio)
bio_for_each_integrity_vec(iv, bio, iter) {
 
if (prev) {
-   if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
+   if (!biovec_phys_mergeable(&ivprv, &iv))
goto new_segment;
 
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
@@ -95,7 +95,7 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct 
bio *bio,
bio_for_each_integrity_vec(iv, bio, iter) {
 
if (prev) {
-   if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
+   if (!biovec_phys_mergeable(&ivprv, &iv))
goto new_segment;
 
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ad8a226347a6..5e63e8259f92 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -21,7 +21,7 @@ static inline bool bios_segs_mergeable(struct request_queue 
*q,
struct bio *prev, struct bio_vec *prev_last_bv,
struct bio_vec *next_first_bv)
 {
-   if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv))
+   if (!biovec_phys_mergeable(prev_last_bv, next_first_bv))
return false;
if (!BIOVEC_SEG_BOUNDAR

[PATCH 10/10] block: don't include bug.h from bio.h

2018-09-24 Thread Christoph Hellwig
No need to pull in the BUG() definition.

Signed-off-by: Christoph Hellwig 
---
 include/linux/bio.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index b3d47862b1b4..f447b0ebb288 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -21,7 +21,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #ifdef CONFIG_BLOCK
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
-- 
2.19.0



[PATCH 08/10] block: remove bvec_to_phys

2018-09-24 Thread Christoph Hellwig
We only use it in biovec_phys_mergeable and an m68k paravirt driver,
so just opencode it there.  Also remove the pointless unsigned long cast
for the offset in the opencoded instances.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Geert Uytterhoeven 
---
 arch/m68k/emu/nfblock.c | 2 +-
 block/blk.h | 4 ++--
 include/linux/bio.h | 5 -
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index e9110b9b8bcd..38049357d6d3 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -73,7 +73,7 @@ static blk_qc_t nfhd_make_request(struct request_queue 
*queue, struct bio *bio)
len = bvec.bv_len;
len >>= 9;
nfhd_read_write(dev->id, 0, dir, sec >> shift, len >> shift,
-   bvec_to_phys(&bvec));
+   page_to_phys(bvec.bv_page) + bvec.bv_offset);
sec += len;
}
bio_endio(bio);
diff --git a/block/blk.h b/block/blk.h
index 8f7229b6f63e..50f74ce60453 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -157,8 +157,8 @@ static inline bool biovec_phys_mergeable(struct 
request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
 {
unsigned long mask = queue_segment_boundary(q);
-   phys_addr_t addr1 = bvec_to_phys(vec1);
-   phys_addr_t addr2 = bvec_to_phys(vec2);
+   phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
+   phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;
 
if (addr1 + vec1->bv_len != addr2)
return false;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9b580e1cb2e8..9ad4b0a487a4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -132,11 +132,6 @@ static inline bool bio_full(struct bio *bio)
return bio->bi_vcnt >= bio->bi_max_vecs;
 }
 
-/*
- * will die
- */
-#define bvec_to_phys(bv)   (page_to_phys((bv)->bv_page) + (unsigned long) 
(bv)->bv_offset)
-
 /*
  * drivers should _never_ use the all version - the bio may have been split
  * before it got to the driver and the driver won't own all of it
-- 
2.19.0



[PATCH 09/10] block: don't include io.h from bio.h

2018-09-24 Thread Christoph Hellwig
Now that we don't need an override for BIOVEC_PHYS_MERGEABLE there is
no need to drag this header in.

Signed-off-by: Christoph Hellwig 
---
 include/linux/bio.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9ad4b0a487a4..b3d47862b1b4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -24,9 +24,6 @@
 #include 
 
 #ifdef CONFIG_BLOCK
-
-#include 
-
 /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
 #include 
 
-- 
2.19.0



[PATCH 03/10] block: move req_gap_{back,front}_merge to blk-merge.c

2018-09-24 Thread Christoph Hellwig
Keep it close to the actual users instead of exposing the function to all
drivers.

Signed-off-by: Christoph Hellwig 
---
 block/blk-merge.c  | 65 +++
 include/linux/blkdev.h | 69 --
 2 files changed, 65 insertions(+), 69 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index aaec38cc37b8..ad8a226347a6 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -12,6 +12,71 @@
 
 #include "blk.h"
 
+/*
+ * Check if the two bvecs from two bios can be merged to one segment.  If yes,
+ * no need to check gap between the two bios since the 1st bio and the 1st bvec
+ * in the 2nd bio can be handled in one segment.
+ */
+static inline bool bios_segs_mergeable(struct request_queue *q,
+   struct bio *prev, struct bio_vec *prev_last_bv,
+   struct bio_vec *next_first_bv)
+{
+   if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv))
+   return false;
+   if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv))
+   return false;
+   if (prev->bi_seg_back_size + next_first_bv->bv_len >
+   queue_max_segment_size(q))
+   return false;
+   return true;
+}
+
+static inline bool bio_will_gap(struct request_queue *q,
+   struct request *prev_rq, struct bio *prev, struct bio *next)
+{
+   struct bio_vec pb, nb;
+
+   if (!bio_has_data(prev) || !queue_virt_boundary(q))
+   return false;
+
+   /*
+* Don't merge if the 1st bio starts with non-zero offset, otherwise it
+* is quite difficult to respect the sg gap limit.  We work hard to
+* merge a huge number of small single bios in case of mkfs.
+*/
+   if (prev_rq)
+   bio_get_first_bvec(prev_rq->bio, &pb);
+   else
+   bio_get_first_bvec(prev, &pb);
+   if (pb.bv_offset)
+   return true;
+
+   /*
+* We don't need to worry about the situation that the merged segment
+* ends in unaligned virt boundary:
+*
+* - if 'pb' ends aligned, the merged segment ends aligned
+* - if 'pb' ends unaligned, the next bio must include
+*   one single bvec of 'nb', otherwise the 'nb' can't
+*   merge with 'pb'
+*/
+   bio_get_last_bvec(prev, &pb);
+   bio_get_first_bvec(next, &nb);
+   if (bios_segs_mergeable(q, prev, &pb, &nb))
+   return false;
+   return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+}
+
+static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
+{
+   return bio_will_gap(req->q, req, req->biotail, bio);
+}
+
+static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
+{
+   return bio_will_gap(req->q, NULL, bio, req->bio);
+}
+
 static struct bio *blk_bio_discard_split(struct request_queue *q,
 struct bio *bio,
 struct bio_set *bs,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bc534c857344..b7e676bb01bc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1695,75 +1695,6 @@ static inline bool bvec_gap_to_prev(struct request_queue 
*q,
return __bvec_gap_to_prev(q, bprv, offset);
 }
 
-/*
- * Check if the two bvecs from two bios can be merged to one segment.
- * If yes, no need to check gap between the two bios since the 1st bio
- * and the 1st bvec in the 2nd bio can be handled in one segment.
- */
-static inline bool bios_segs_mergeable(struct request_queue *q,
-   struct bio *prev, struct bio_vec *prev_last_bv,
-   struct bio_vec *next_first_bv)
-{
-   if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv))
-   return false;
-   if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv))
-   return false;
-   if (prev->bi_seg_back_size + next_first_bv->bv_len >
-   queue_max_segment_size(q))
-   return false;
-   return true;
-}
-
-static inline bool bio_will_gap(struct request_queue *q,
-   struct request *prev_rq,
-   struct bio *prev,
-   struct bio *next)
-{
-   if (bio_has_data(prev) && queue_virt_boundary(q)) {
-   struct bio_vec pb, nb;
-
-   /*
-* don't merge if the 1st bio starts with non-zero
-* offset, otherwise it is quite difficult to respect
-* sg gap limit. We work hard to merge a huge number of small
-* single bios in case of mkfs.
-*/
-   if (prev_rq)
-   bio_get_first_bvec(prev_rq->bio, &pb);
-   else
-   bio_get_first_bvec(prev, &pb);
-   if (pb.bv_offset)
-   return true;
-
-   /*
-* We don't

[PATCH 04/10] block: move req_gap_back_merge to blk.h

2018-09-24 Thread Christoph Hellwig
No need to expose these helpers outside the block layer.

Signed-off-by: Christoph Hellwig 
---
 block/blk.h| 19 +++
 include/linux/blkdev.h | 19 ---
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/block/blk.h b/block/blk.h
index 441c2de1d4b9..63035c95689c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -149,6 +149,25 @@ static inline void blk_queue_enter_live(struct 
request_queue *q)
percpu_ref_get(&q->q_usage_counter);
 }
 
+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+   struct bio_vec *bprv, unsigned int offset)
+{
+   return offset ||
+   ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
+/*
+ * Check if adding a bio_vec after bprv with offset would create a gap in
+ * the SG list. Most drivers don't care about this, but some do.
+ */
+static inline bool bvec_gap_to_prev(struct request_queue *q,
+   struct bio_vec *bprv, unsigned int offset)
+{
+   if (!queue_virt_boundary(q))
+   return false;
+   return __bvec_gap_to_prev(q, bprv, offset);
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 void blk_flush_integrity(void);
 bool __bio_integrity_endio(struct bio *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b7e676bb01bc..1d5e14139795 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1676,25 +1676,6 @@ static inline void put_dev_sector(Sector p)
put_page(p.v);
 }
 
-static inline bool __bvec_gap_to_prev(struct request_queue *q,
-   struct bio_vec *bprv, unsigned int offset)
-{
-   return offset ||
-   ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
-}
-
-/*
- * Check if adding a bio_vec after bprv with offset would create a gap in
- * the SG list. Most drivers don't care about this, but some do.
- */
-static inline bool bvec_gap_to_prev(struct request_queue *q,
-   struct bio_vec *bprv, unsigned int offset)
-{
-   if (!queue_virt_boundary(q))
-   return false;
-   return __bvec_gap_to_prev(q, bprv, offset);
-}
-
 int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_work_on(int cpu, struct work_struct *work);
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned 
long delay);
-- 
2.19.0



[PATCH 07/10] block: merge BIOVEC_SEG_BOUNDARY into biovec_phys_mergeable

2018-09-24 Thread Christoph Hellwig
These two checks should always be performed together, so merge them into
a single helper.

Signed-off-by: Christoph Hellwig 
---
 block/bio.c   |  4 +---
 block/blk-integrity.c | 12 ++--
 block/blk-merge.c | 29 +
 block/blk.h   | 12 +---
 include/linux/bio.h   |  8 
 5 files changed, 17 insertions(+), 48 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index c254e5aa331f..e9f92b50724d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -731,9 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio 
*bio, struct page
}
 
/* If we may be able to merge these biovecs, force a recount */
-   if (bio->bi_vcnt > 1 &&
-   biovec_phys_mergeable(bvec - 1, bvec) &&
-   BIOVEC_SEG_BOUNDARY(q, bvec - 1, bvec))
+   if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec))
bio_clear_flag(bio, BIO_SEG_VALID);
 
  done:
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 0f7267916509..d1ab089e0919 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -49,12 +49,8 @@ int blk_rq_count_integrity_sg(struct request_queue *q, 
struct bio *bio)
bio_for_each_integrity_vec(iv, bio, iter) {
 
if (prev) {
-   if (!biovec_phys_mergeable(&ivprv, &iv))
+   if (!biovec_phys_mergeable(q, &ivprv, &iv))
goto new_segment;
-
-   if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
-   goto new_segment;
-
if (seg_size + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
 
@@ -95,12 +91,8 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct 
bio *bio,
bio_for_each_integrity_vec(iv, bio, iter) {
 
if (prev) {
-   if (!biovec_phys_mergeable(&ivprv, &iv))
+   if (!biovec_phys_mergeable(q, &ivprv, &iv))
goto new_segment;
-
-   if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
-   goto new_segment;
-
if (sg->length + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 5e63e8259f92..42a46744c11b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -21,9 +21,7 @@ static inline bool bios_segs_mergeable(struct request_queue 
*q,
struct bio *prev, struct bio_vec *prev_last_bv,
struct bio_vec *next_first_bv)
 {
-   if (!biovec_phys_mergeable(prev_last_bv, next_first_bv))
-   return false;
-   if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv))
+   if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv))
return false;
if (prev->bi_seg_back_size + next_first_bv->bv_len >
queue_max_segment_size(q))
@@ -199,9 +197,7 @@ static struct bio *blk_bio_segment_split(struct 
request_queue *q,
if (bvprvp && blk_queue_cluster(q)) {
if (seg_size + bv.bv_len > queue_max_segment_size(q))
goto new_segment;
-   if (!biovec_phys_mergeable(bvprvp, &bv))
-   goto new_segment;
-   if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
+   if (!biovec_phys_mergeable(q, bvprvp, &bv))
goto new_segment;
 
seg_size += bv.bv_len;
@@ -332,9 +328,7 @@ static unsigned int __blk_recalc_rq_segments(struct 
request_queue *q,
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
goto new_segment;
-   if (!biovec_phys_mergeable(&bvprv, &bv))
-   goto new_segment;
-   if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
+   if (!biovec_phys_mergeable(q, &bvprv, &bv))
goto new_segment;
 
seg_size += bv.bv_len;
@@ -414,17 +408,7 @@ static int blk_phys_contig_segment(struct request_queue 
*q, struct bio *bio,
bio_get_last_bvec(bio, &end_bv);
bio_get_first_bvec(nxt, &nxt_bv);
 
-   if (!biovec_phys_mergeable(&end_bv, &nxt_bv))
-   return 0;
-
-   /*
-* bio and nxt are contiguous in memory; check if the queue allows
-* these two to be merged into one
-*/
-   if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
-   return 1;
-
-   return 0;
+   return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
 }
 
 static inline void
@@ -438,10 +422,7 @@ __blk_segment_map_sg(struct request_queue *q, struct 
bio_vec *bvec,

[PATCH 01/10] arm: remove the unused BIOVEC_MERGEABLE define

2018-09-24 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
---
 arch/arm/include/asm/io.h | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 2cfbc531f63b..6774553dc214 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -459,13 +459,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem 
*addr);
 
 #include 
 
-/*
- * can the hardware map this into one segment or not, given no other
- * constraints.
- */
-#define BIOVEC_MERGEABLE(vec1, vec2)   \
-   ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
-
 struct bio_vec;
 extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
  const struct bio_vec *vec2);
-- 
2.19.0



[PATCH 02/10] block: move integrity_req_gap_{back,front}_merge to blk.h

2018-09-24 Thread Christoph Hellwig
No need to expose these to drivers.

Signed-off-by: Christoph Hellwig 
---
 block/blk.h| 35 +--
 include/linux/blkdev.h | 31 ---
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/block/blk.h b/block/blk.h
index 9db4e389582c..441c2de1d4b9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -158,7 +158,38 @@ static inline bool bio_integrity_endio(struct bio *bio)
return __bio_integrity_endio(bio);
return true;
 }
-#else
+
+static inline bool integrity_req_gap_back_merge(struct request *req,
+   struct bio *next)
+{
+   struct bio_integrity_payload *bip = bio_integrity(req->bio);
+   struct bio_integrity_payload *bip_next = bio_integrity(next);
+
+   return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+   bip_next->bip_vec[0].bv_offset);
+}
+
+static inline bool integrity_req_gap_front_merge(struct request *req,
+   struct bio *bio)
+{
+   struct bio_integrity_payload *bip = bio_integrity(bio);
+   struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
+
+   return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+   bip_next->bip_vec[0].bv_offset);
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline bool integrity_req_gap_back_merge(struct request *req,
+   struct bio *next)
+{
+   return false;
+}
+static inline bool integrity_req_gap_front_merge(struct request *req,
+   struct bio *bio)
+{
+   return false;
+}
+
 static inline void blk_flush_integrity(void)
 {
 }
@@ -166,7 +197,7 @@ static inline bool bio_integrity_endio(struct bio *bio)
 {
return true;
 }
-#endif
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 void blk_timeout_work(struct work_struct *work);
 unsigned long blk_rq_timeout(unsigned long timeout);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d6869e0e2b64..bc534c857344 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1843,26 +1843,6 @@ queue_max_integrity_segments(struct request_queue *q)
return q->limits.max_integrity_segments;
 }
 
-static inline bool integrity_req_gap_back_merge(struct request *req,
-   struct bio *next)
-{
-   struct bio_integrity_payload *bip = bio_integrity(req->bio);
-   struct bio_integrity_payload *bip_next = bio_integrity(next);
-
-   return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
-   bip_next->bip_vec[0].bv_offset);
-}
-
-static inline bool integrity_req_gap_front_merge(struct request *req,
-struct bio *bio)
-{
-   struct bio_integrity_payload *bip = bio_integrity(bio);
-   struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
-
-   return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
-   bip_next->bip_vec[0].bv_offset);
-}
-
 /**
  * bio_integrity_intervals - Return number of integrity intervals for a bio
  * @bi:blk_integrity profile for device
@@ -1947,17 +1927,6 @@ static inline bool blk_integrity_merge_bio(struct 
request_queue *rq,
return true;
 }
 
-static inline bool integrity_req_gap_back_merge(struct request *req,
-   struct bio *next)
-{
-   return false;
-}
-static inline bool integrity_req_gap_front_merge(struct request *req,
-struct bio *bio)
-{
-   return false;
-}
-
 static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
   unsigned int sectors)
 {
-- 
2.19.0



clean up physical merging helpers V2

2018-09-24 Thread Christoph Hellwig
Hi Jens,

this series moves various helpers related to merging based on physical
addresses from the public headers into block/, cleans up the code a bit
and removes unnecessary includes from the block headers.

Change since V1:
 - dropped the Xen related changes which are moved into a new series
   to be sent after this one