[PATCH] block/loop: Don't hold lock while rereading partition.
syzbot is reporting a circular locking dependency between bdev->bd_mutex and lo->lo_ctl_mutex [1] which is caused by calling blkdev_reread_part() with the lock held. Don't hold loop_ctl_mutex while calling blkdev_reread_part(). Also, move the bdgrab() call in loop_set_fd() to before loop_reread_partitions(), in case loop_clr_fd() is called while the blkdev_reread_part() from loop_set_fd() is in progress. [1] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889 Signed-off-by: Tetsuo Handa Reported-by: syzbot --- drivers/block/loop.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 920cbb1..877cca8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -632,7 +632,12 @@ static void loop_reread_partitions(struct loop_device *lo, struct block_device *bdev) { int rc; + char filename[LO_NAME_SIZE]; + const int num = lo->lo_number; + const int count = atomic_read(&lo->lo_refcnt); + memcpy(filename, lo->lo_file_name, sizeof(filename)); + mutex_unlock(&loop_ctl_mutex); /* * bd_mutex has been held already in release path, so don't * acquire it if this function is called in such case. @@ -641,13 +646,14 @@ static void loop_reread_partitions(struct loop_device *lo, * must be at least one and it can only become zero when the * current holder is released. */ - if (!atomic_read(&lo->lo_refcnt)) + if (!count) rc = __blkdev_reread_part(bdev); else rc = blkdev_reread_part(bdev); + mutex_lock(&loop_ctl_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", - __func__, lo->lo_number, lo->lo_file_name, rc); + __func__, num, filename, rc); } static inline int is_loop_device(struct file *file) @@ -971,16 +977,18 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, set_blocksize(bdev, S_ISBLK(inode->i_mode) ? block_size(inode->i_bdev) : PAGE_SIZE); + /* +* Grab the block_device to prevent its destruction after we +* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). +*/ + bdgrab(bdev); + lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) loop_reread_partitions(lo, bdev); - /* Grab the block_device to prevent its destruction after we -* put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). -*/ - bdgrab(bdev); return 0; out_putf: -- 1.8.3.1
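The shape of the fix is worth spelling out: everything the warning printout needs is copied while loop_ctl_mutex is held, the mutex is dropped across the call that takes bd_mutex, then reacquired, so this path never establishes a loop_ctl_mutex -> bd_mutex ordering opposite to the bd_mutex -> loop_ctl_mutex ordering used by the open/release path. A minimal userspace sketch of the pattern (ctl_lock, dev_state and do_rescan are illustrative stand-ins, not kernel names):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t ctl_lock = PTHREAD_MUTEX_INITIALIZER;

struct dev_state {
	int number;
	char name[64];
};

/* Called with ctl_lock held; returns with ctl_lock held. */
static void rescan_unlocked(struct dev_state *st, int (*do_rescan)(void))
{
	char name[64];
	int num = st->number;
	int rc;

	memcpy(name, st->name, sizeof(name));	/* snapshot under the lock */

	pthread_mutex_unlock(&ctl_lock);	/* don't hold it across the callout */
	rc = do_rescan();			/* may take the "bd_mutex" side */
	pthread_mutex_lock(&ctl_lock);

	if (rc)
		fprintf(stderr, "partition scan of dev%d (%s) failed (rc=%d)\n",
			num, name, rc);
}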
[PATCH v2] block/loop: Use global lock for ioctl() operation.
syzbot is reporting a NULL pointer dereference [1] which is caused by a race condition between ioctl(loop_fd, LOOP_CLR_FD, 0) versus ioctl(other_loop_fd, LOOP_SET_FD, loop_fd) due to traversing other loop devices at loop_validate_file() without holding the corresponding lo->lo_ctl_mutex locks. Since ioctl() requests on loop devices are not frequent operations, we don't need fine-grained locking. Let's use a global lock in order to allow safe traversal at loop_validate_file(). Note that syzbot is also reporting a circular locking dependency between bdev->bd_mutex and lo->lo_ctl_mutex [2] which is caused by calling blkdev_reread_part() with the lock held. This patch does not address it. [1] https://syzkaller.appspot.com/bug?id=f3cfe26e785d85f9ee259f385515291d21bd80a3 [2] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889 v2: Don't call mutex_init() upon loop_add() request. Signed-off-by: Tetsuo Handa Reported-by: syzbot --- drivers/block/loop.c | 58 ++-- drivers/block/loop.h | 1 - 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c2745e6..920cbb1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -85,6 +85,7 @@ static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_index_mutex); +static DEFINE_MUTEX(loop_ctl_mutex); static int max_part; static int part_shift; @@ -1048,7 +1049,7 @@ static int loop_clr_fd(struct loop_device *lo) */ if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return 0; } @@ -1101,12 +1102,12 @@ static int loop_clr_fd(struct loop_device *lo) if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; loop_unprepare_queue(lo); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); /* -* Need not hold lo_ctl_mutex to fput backing file. -* Calling fput holding lo_ctl_mutex triggers a circular +* Need not hold loop_ctl_mutex to fput backing file. +* Calling fput holding loop_ctl_mutex triggers a circular * lock dependency possibility warning as fput can take -* bd_mutex which is usually taken before lo_ctl_mutex. +* bd_mutex which is usually taken before loop_ctl_mutex. */ fput(filp); return 0; @@ -1211,7 +1212,7 @@ static int loop_clr_fd(struct loop_device *lo) int ret; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -ENXIO; } @@ -1230,10 +1231,10 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_encrypt_key_size); } - /* Drop lo_ctl_mutex while we call into the filesystem. */ + /* Drop loop_ctl_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; path_get(&path); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); @@ -1325,7 +1326,7 @@ static int loop_clr_fd(struct loop_device *lo) int err; if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -EINVAL; } err = loop_get_status(lo, &info64); @@ -1343,7 +1344,7 @@ static int loop_clr_fd(struct loop_device *lo) int err; if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -EINVAL; } err = loop_get_status(lo, &info64); @@ -1401,7 +1402,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, struct loop_device *lo = bdev->bd_disk->private_data; int err; - err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1); + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); if (err) goto out_unlocked; @@ -1413,7 +1414,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, err = loop_change_fd(lo, bdev, arg); break; case LOOP_CLR_FD: - /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ + /* loop_clr_fd would have unlocked loop_ctl_mutex on success */ err = loop_clr_fd(lo); if (!err) goto out_unlocked; @@ -1426,7 +1427,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, break; case LOOP_GET_STATUS: err = loop_get_status_old(lo, (struct loop_info __user *) arg); - /* loop_get_statu
[PATCH] block/loop: Use global lock for ioctl() operation.
syzbot is reporting a NULL pointer dereference [1] which is caused by a race condition between ioctl(loop_fd, LOOP_CLR_FD, 0) versus ioctl(other_loop_fd, LOOP_SET_FD, loop_fd) due to traversing other loop devices at loop_validate_file() without holding the corresponding lo->lo_ctl_mutex locks. Since ioctl() requests on loop devices are not frequent operations, we don't need fine-grained locking. Let's use a global lock in order to allow safe traversal at loop_validate_file(). Note that syzbot is also reporting a circular locking dependency between bdev->bd_mutex and lo->lo_ctl_mutex [2] which is caused by calling blkdev_reread_part() with the lock held. This patch does not address it. [1] https://syzkaller.appspot.com/bug?id=f3cfe26e785d85f9ee259f385515291d21bd80a3 [2] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889 Signed-off-by: Tetsuo Handa Reported-by: syzbot --- drivers/block/loop.c | 59 ++-- drivers/block/loop.h | 1 - 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c2745e6..1477f52e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -85,6 +85,7 @@ static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_index_mutex); +static DEFINE_MUTEX(loop_ctl_mutex); static int max_part; static int part_shift; @@ -1048,7 +1049,7 @@ static int loop_clr_fd(struct loop_device *lo) */ if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return 0; } @@ -1101,12 +1102,12 @@ static int loop_clr_fd(struct loop_device *lo) if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; loop_unprepare_queue(lo); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); /* -* Need not hold lo_ctl_mutex to fput backing file. -* Calling fput holding lo_ctl_mutex triggers a circular +* Need not hold loop_ctl_mutex to fput backing file. +* Calling fput holding loop_ctl_mutex triggers a circular * lock dependency possibility warning as fput can take -* bd_mutex which is usually taken before lo_ctl_mutex. +* bd_mutex which is usually taken before loop_ctl_mutex. */ fput(filp); return 0; @@ -1211,7 +1212,7 @@ static int loop_clr_fd(struct loop_device *lo) int ret; if (lo->lo_state != Lo_bound) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -ENXIO; } @@ -1230,10 +1231,10 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_encrypt_key_size); } - /* Drop lo_ctl_mutex while we call into the filesystem. */ + /* Drop loop_ctl_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; path_get(&path); - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); @@ -1325,7 +1326,7 @@ static int loop_clr_fd(struct loop_device *lo) int err; if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -EINVAL; } err = loop_get_status(lo, &info64); @@ -1343,7 +1344,7 @@ static int loop_clr_fd(struct loop_device *lo) int err; if (!arg) { - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&loop_ctl_mutex); return -EINVAL; } err = loop_get_status(lo, &info64); @@ -1401,7 +1402,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, struct loop_device *lo = bdev->bd_disk->private_data; int err; - err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1); + err = mutex_lock_killable_nested(&loop_ctl_mutex, 1); if (err) goto out_unlocked; @@ -1413,7 +1414,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, err = loop_change_fd(lo, bdev, arg); break; case LOOP_CLR_FD: - /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ + /* loop_clr_fd would have unlocked loop_ctl_mutex on success */ err = loop_clr_fd(lo); if (!err) goto out_unlocked; @@ -1426,7 +1427,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, break; case LOOP_GET_STATUS: err = loop_get_status_old(lo, (struct loop_info __user *) arg); - /* loop_get_status() unlocks lo_ctl_mutex */ + /* loop_g
[PATCH] block/loop: Don't grab "struct file" for vfs_getattr() operation.
vfs_getattr() needs "struct path" rather than "struct file". Let's use path_get()/path_put() rather than get_file()/fput(). Signed-off-by: Tetsuo Handa --- drivers/block/loop.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index abad6d1..c2745e6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1206,7 +1206,7 @@ static int loop_clr_fd(struct loop_device *lo) static int loop_get_status(struct loop_device *lo, struct loop_info64 *info) { - struct file *file; + struct path path; struct kstat stat; int ret; @@ -1231,16 +1231,16 @@ static int loop_clr_fd(struct loop_device *lo) } /* Drop lo_ctl_mutex while we call into the filesystem. */ - file = get_file(lo->lo_backing_file); + path = lo->lo_backing_file->f_path; + path_get(&path); mutex_unlock(&lo->lo_ctl_mutex); - ret = vfs_getattr(&file->f_path, &stat, STATX_INO, - AT_STATX_SYNC_AS_STAT); + ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); info->lo_inode = stat.ino; info->lo_rdevice = huge_encode_dev(stat.rdev); } - fput(file); + path_put(&path); return ret; } -- 1.8.3.1
Re: [PATCH] blk-mq: Allow blocking queue tag iter callbacks
Hi Bart On 09/25/2018 10:20 AM, Bart Van Assche wrote: > On 9/24/18 7:11 PM, jianchao.wang wrote: >> Hi Keith >> >> On 09/25/2018 05:09 AM, Keith Busch wrote: >>> - /* A deadlock might occur if a request is stuck requiring a >>> - * timeout at the same time a queue freeze is waiting >>> - * completion, since the timeout code would not be able to >>> - * acquire the queue reference here. >>> - * >>> - * That's why we don't use blk_queue_enter here; instead, we use >>> - * percpu_ref_tryget directly, because we need to be able to >>> - * obtain a reference even in the short window between the queue >>> - * starting to freeze, by dropping the first reference in >>> - * blk_freeze_queue_start, and the moment the last request is >>> - * consumed, marked by the instant q_usage_counter reaches >>> - * zero. >>> - */ >>> - if (!percpu_ref_tryget(&q->q_usage_counter)) >>> + if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next)) >>> return; >> >> We cannot discard the percpu_ref_tryget here. >> >> The following code in blk_mq_timeout_work still needs it: >> >> if (next != 0) { >> mod_timer(&q->timeout, next); >> } else { >> queue_for_each_hw_ctx(q, hctx, i) { >> /* the hctx may be unmapped, so check it here */ >> if (blk_mq_hw_queue_mapped(hctx)) >> blk_mq_tag_idle(hctx); >> } >> } > > Hi Jianchao, > > Had you noticed that the percpu_ref_tryget() call has been moved into > blk_mq_queue_tag_busy_iter()? > Yes. But the issue is that the remaining part of blk_mq_timeout_work has been moved out of the protection of the queue refcount. Thanks Jianchao
Re: [PATCH] blk-mq: Allow blocking queue tag iter callbacks
On 9/24/18 7:11 PM, jianchao.wang wrote: Hi Keith On 09/25/2018 05:09 AM, Keith Busch wrote: - /* A deadlock might occur if a request is stuck requiring a -* timeout at the same time a queue freeze is waiting -* completion, since the timeout code would not be able to -* acquire the queue reference here. -* -* That's why we don't use blk_queue_enter here; instead, we use -* percpu_ref_tryget directly, because we need to be able to -* obtain a reference even in the short window between the queue -* starting to freeze, by dropping the first reference in -* blk_freeze_queue_start, and the moment the last request is -* consumed, marked by the instant q_usage_counter reaches -* zero. -*/ - if (!percpu_ref_tryget(&q->q_usage_counter)) + if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next)) return; We cannot discard the percpu_ref_tryget here. The following code in blk_mq_timeout_work still needs it: if (next != 0) { mod_timer(&q->timeout, next); } else { queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } Hi Jianchao, Had you noticed that the percpu_ref_tryget() call has been moved into blk_mq_queue_tag_busy_iter()? Bart.
Re: [PATCH] blk-mq: Allow blocking queue tag iter callbacks
Hi Keith On 09/25/2018 05:09 AM, Keith Busch wrote: > - /* A deadlock might occur if a request is stuck requiring a > - * timeout at the same time a queue freeze is waiting > - * completion, since the timeout code would not be able to > - * acquire the queue reference here. > - * > - * That's why we don't use blk_queue_enter here; instead, we use > - * percpu_ref_tryget directly, because we need to be able to > - * obtain a reference even in the short window between the queue > - * starting to freeze, by dropping the first reference in > - * blk_freeze_queue_start, and the moment the last request is > - * consumed, marked by the instant q_usage_counter reaches > - * zero. > - */ > - if (!percpu_ref_tryget(&q->q_usage_counter)) > + if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next)) > return; We cannot discard the percpu_ref_tryget here. The following code in blk_mq_timeout_work still needs it: if (next != 0) { mod_timer(&q->timeout, next); } else { queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } Thanks Jianchao
Re: clean up physical merging helpers V2
On Mon, Sep 24, 2018 at 09:43:45AM +0200, Christoph Hellwig wrote: > Hi Jens, > > this series moves various helpers related to merging based on physical > addresses from the public headers into block/, cleans up the code a bit > and removes unnecessary includes from the block headers. > > Change since V1: > - dropped the Xen related changes which are moved into a new series > to be sent after this one Looks fine: Reviewed-by: Ming Lei Thanks, Ming
[PATCH] blk-mq: Allow blocking queue tag iter callbacks
A recent commit had tag iterator callbacks run under the RCU read lock. Existing callbacks exist that do not satisfy the RCU non-blocking requirement. The commit intended to prevent an iterator from accessing a queue that's being modified. This patch fixes the original issue by taking a queue reference if the queue is not frozen instead of an RCU read lock. Fixes: f5bbbbe4d635 ("blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter") Cc: Jianchao Wang Signed-off-by: Keith Busch --- block/blk-mq-tag.c | 30 +- block/blk-mq-tag.h | 2 +- block/blk-mq.c | 22 +- 3 files changed, 19 insertions(+), 35 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 94e1ed667b6e..63542642a017 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -314,24 +314,27 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +bool blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { struct blk_mq_hw_ctx *hctx; int i; - /* -* __blk_mq_update_nr_hw_queues will update the nr_hw_queues and -* queue_hw_ctx after freeze the queue. So we could use q_usage_counter -* to avoid race with it. __blk_mq_update_nr_hw_queues will users -* synchronize_rcu to ensure all of the users go out of the critical -* section below and see zeroed q_usage_counter. + /* A deadlock might occur if a request is stuck requiring a +* timeout at the same time a queue freeze is waiting +* completion, since the timeout code would not be able to +* acquire the queue reference here. +* +* That's why we don't use blk_queue_enter here; instead, we use +* percpu_ref_tryget directly, because we need to be able to +* obtain a reference even in the short window between the queue +* starting to freeze, by dropping the first reference in +* blk_freeze_queue_start, and the moment the last request is +* consumed, marked by the instant q_usage_counter reaches +* zero. */ - rcu_read_lock(); - if (percpu_ref_is_zero(&q->q_usage_counter)) { - rcu_read_unlock(); - return; - } + if (!percpu_ref_tryget(&q->q_usage_counter)) + return false; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; @@ -347,7 +350,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - rcu_read_unlock(); + blk_queue_exit(q); + return true; } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 61deab0b5a5a..36b3bc90e867 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -33,7 +33,7 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +bool blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, diff --git a/block/blk-mq.c b/block/blk-mq.c index 85a1c1a59c72..019f9b169887 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -848,24 +848,9 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_hw_ctx *hctx; int i; - /* A deadlock might occur if a request is stuck requiring a -* timeout at the same time a queue freeze is waiting -* completion, since the timeout code would not be able to -* acquire the queue reference here. -* -* That's why we don't use blk_queue_enter here; instead, we use -* percpu_ref_tryget directly, because we need to be able to -* obtain a reference even in the short window between the queue -* starting to freeze, by dropping the first reference in -* blk_freeze_queue_start, and the moment the last request is -* consumed, marked by the instant q_usage_counter reaches -* zero. -*/ - if (!percpu_ref_tryget(&q->q_usage_counter)) + if (!blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next)) return; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); - if (next != 0) { mod_timer(&q->timeout, next); } else { @@ -881,7 +866,6 @@ stat
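Stepping back from the diff, the trade is: code inside rcu_read_lock()/rcu_read_unlock() must not sleep, whereas a percpu_ref reference lets the callbacks block and still keeps __blk_mq_update_nr_hw_queues() from tearing down queue_hw_ctx underneath the iterator. Schematically (kernel-context fragments, simplified from the patch, with fn standing in for a busy_iter_fn such as blk_mq_check_expired):

	/* Old shape: iteration is an RCU read-side critical section. */
	rcu_read_lock();
	fn(hctx, rq, priv, reserved);	/* nvme_timeout() may mutex_lock(): illegal */
	rcu_read_unlock();

	/* New shape: pin the queue with a real reference instead. */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return false;		/* frozen or going away: skip, tell the caller */
	fn(hctx, rq, priv, reserved);	/* sleeping is fine here */
	blk_queue_exit(q);		/* drops q_usage_counter */
	return true;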
Re: [PATCH v4] block/loop: Serialize ioctl operations.
On 2018/09/25 3:47, Jan Kara wrote: >> +/* >> + * unlock_loop - Unlock loop_mutex as needed. >> + * >> + * Explicitly call this function before calling fput() or >> blkdev_reread_part() >> + * in order to avoid circular lock dependency. After this function is >> called, >> + * current thread is no longer allowed to access "struct loop_device" >> memory, >> + * for another thread would access that memory as soon as loop_mutex is >> held. >> + */ >> +static void unlock_loop(void) >> +{ >> +if (loop_mutex_owner == current) { > > Urgh, why this check? Conditional locking / unlocking is evil so it has to > have *very* good reasons and there is not any explanation here. So far I > don't see a reason why this is needed at all. Yeah, this is why Jens hates this patch. But is there any alternative? >> @@ -630,7 +669,12 @@ static void loop_reread_partitions(struct loop_device >> *lo, >> +unlock_loop(); > > Unlocking in loop_reread_partitions() makes the locking rules ugly. And > unnecessarily AFAICT. Can't we just use lo_refcnt to protect us against > loop_clr_fd() and freeing of 'lo' structure itself? Really? I think that just elevating lo->lo_refcnt will cause another lockdep warning, because __blkdev_reread_part() requires bdev->bd_mutex to be held. Don't we need to drop the lock in order to solve the original lockdep warning at [2]? int __blkdev_reread_part(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; lockdep_assert_held(&bdev->bd_mutex); return rescan_partitions(disk, bdev); } int blkdev_reread_part(struct block_device *bdev) { int res; mutex_lock(&bdev->bd_mutex); res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); return res; }
Re: [PATCH v10 5/8] percpu-refcount: Introduce percpu_ref_resurrect()
On Mon, 2018-09-24 at 11:01 -0700, Tejun Heo wrote: > Hello, Bart. > > On Fri, Sep 21, 2018 at 01:31:19PM -0700, Bart Van Assche wrote: > > +void percpu_ref_resurrect(struct percpu_ref *ref) > > +{ > > + unsigned long __percpu *percpu_count; > > unsigned long flags; > > > > spin_lock_irqsave(&percpu_ref_switch_lock, flags); > > > > - WARN_ON_ONCE(!percpu_ref_is_zero(ref)); > > + WARN_ON_ONCE(!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)); > > + WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); > > So, this in itself is fine but I'm a bit worried that this might make it > easier to abuse percpu_ref's dying mode. More specifically, it isn't > a good idea to depend on percpu_ref's implementation details from > outside - ie. the only guarantee there ever is that percpu_ref won't > give out new refs after ->percpu_ref is called - e.g. users can't > piggy back on implied rcu grace period. > > Can you please note that in the function comment? Provided that, > > Acked-by: Tejun Heo Hi Tejun, Thanks for the review. But could you please clarify what "after ->percpu_ref is called" refers to? Thanks, Bart.
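For context, the lifecycle the new helper serves is the blk-mq freeze/unfreeze cycle, roughly as follows (a simplified sketch of the intended usage; the real wait plumbing lives in blk_freeze_queue_start() and blk_mq_freeze_queue_wait()):

	/* Freeze: kill the ref (marks it DEAD, switches to atomic mode),
	 * then wait for the count to drain to zero. */
	percpu_ref_kill(&q->q_usage_counter);
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));

	/* ... queue frozen: safe to rework hardware queues, etc. ... */

	/* Unfreeze: percpu_ref_resurrect() only requires that the ref was
	 * killed; unlike percpu_ref_reinit() it does not insist the count
	 * still be zero - exactly the distinction the two WARN_ON_ONCE()
	 * checks above encode. */
	percpu_ref_resurrect(&q->q_usage_counter);
	wake_up_all(&q->mq_freeze_wq);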
Re: Regression caused by f5bbbbe4d635
On Mon, Sep 24, 2018 at 12:51:07PM -0700, Bart Van Assche wrote: > On Mon, 2018-09-24 at 13:13 -0600, Keith Busch wrote: > > diff --git a/block/blk-mq.c b/block/blk-mq.c > > index 85a1c1a59c72..28d128450621 100644 > > --- a/block/blk-mq.c > > +++ b/block/blk-mq.c > > @@ -848,22 +848,6 @@ static void blk_mq_timeout_work(struct work_struct > > *work) > > struct blk_mq_hw_ctx *hctx; > > int i; > > > > - /* A deadlock might occur if a request is stuck requiring a > > -* timeout at the same time a queue freeze is waiting > > -* completion, since the timeout code would not be able to > > -* acquire the queue reference here. > > -* > > -* That's why we don't use blk_queue_enter here; instead, we use > > -* percpu_ref_tryget directly, because we need to be able to > > -* obtain a reference even in the short window between the queue > > -* starting to freeze, by dropping the first reference in > > -* blk_freeze_queue_start, and the moment the last request is > > -* consumed, marked by the instant q_usage_counter reaches > > -* zero. > > -*/ > > - if (!percpu_ref_tryget(&q->q_usage_counter)) > > - return; > > - > > blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); > > > > if (next != 0) { > > @@ -881,7 +865,6 @@ static void blk_mq_timeout_work(struct work_struct > > *work) > > blk_mq_tag_idle(hctx); > > } > > } > > - blk_queue_exit(q); > > } > > Hi Keith, > > The above introduces a behavior change: if the percpu_ref_tryget() call inside > blk_mq_queue_tag_busy_iter() fails then blk_mq_timeout_work() will now call > blk_mq_tag_idle(). I think that's wrong if the percpu_ref_tryget() call fails > due to the queue having been frozen. Please make blk_mq_queue_tag_busy_iter() > return a bool that indicates whether or not it has iterated over the request > queue. Good point, thanks for the feedback.
Re: Regression caused by f5bbbbe4d635
On Mon, 2018-09-24 at 13:13 -0600, Keith Busch wrote: > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 85a1c1a59c72..28d128450621 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -848,22 +848,6 @@ static void blk_mq_timeout_work(struct work_struct *work) > struct blk_mq_hw_ctx *hctx; > int i; > > - /* A deadlock might occur if a request is stuck requiring a > - * timeout at the same time a queue freeze is waiting > - * completion, since the timeout code would not be able to > - * acquire the queue reference here. > - * > - * That's why we don't use blk_queue_enter here; instead, we use > - * percpu_ref_tryget directly, because we need to be able to > - * obtain a reference even in the short window between the queue > - * starting to freeze, by dropping the first reference in > - * blk_freeze_queue_start, and the moment the last request is > - * consumed, marked by the instant q_usage_counter reaches > - * zero. > - */ > - if (!percpu_ref_tryget(&q->q_usage_counter)) > - return; > - > blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); > > if (next != 0) { > @@ -881,7 +865,6 @@ static void blk_mq_timeout_work(struct work_struct *work) > blk_mq_tag_idle(hctx); > } > } > - blk_queue_exit(q); > } Hi Keith, The above introduces a behavior change: if the percpu_ref_tryget() call inside blk_mq_queue_tag_busy_iter() fails then blk_mq_timeout_work() will now call blk_mq_tag_idle(). I think that's wrong if the percpu_ref_tryget() call fails due to the queue having been frozen. Please make blk_mq_queue_tag_busy_iter() return a bool that indicates whether or not it has iterated over the request queue. Thanks, Bart.
Re: Regression caused by f5bbbbe4d635
On Mon, Sep 24, 2018 at 12:44:13PM -0600, Jens Axboe wrote: > Hi, > > This commit introduced a rcu_read_lock() inside > blk_mq_queue_tag_busy_iter() - this is problematic for the timeout code, > since we now end up holding the RCU read lock over the timeout code. As > just one example, nvme ends up doing: > > nvme_timeout() > nvme_dev_disable() > mutex_lock(&dev->shutdown_lock); > > and things are then obviously unhappy... Yah, there's never been a requirement that tag iterator callbacks be non-blocking as far as I remember. The queue's reference in blk_mq_timeout_work looks applicable to any blk_mq_queue_tag_busy_iter user, so just moving it there looks like it should do what f5bbbbe4d635 was trying to fix. --- diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 94e1ed667b6e..850577a3de6d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -320,18 +320,21 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, struct blk_mq_hw_ctx *hctx; int i; - /* -* __blk_mq_update_nr_hw_queues will update the nr_hw_queues and -* queue_hw_ctx after freeze the queue. So we could use q_usage_counter -* to avoid race with it. __blk_mq_update_nr_hw_queues will users -* synchronize_rcu to ensure all of the users go out of the critical -* section below and see zeroed q_usage_counter. + /* A deadlock might occur if a request is stuck requiring a +* timeout at the same time a queue freeze is waiting +* completion, since the timeout code would not be able to +* acquire the queue reference here. +* +* That's why we don't use blk_queue_enter here; instead, we use +* percpu_ref_tryget directly, because we need to be able to +* obtain a reference even in the short window between the queue +* starting to freeze, by dropping the first reference in +* blk_freeze_queue_start, and the moment the last request is +* consumed, marked by the instant q_usage_counter reaches +* zero. */ - rcu_read_lock(); - if (percpu_ref_is_zero(&q->q_usage_counter)) { - rcu_read_unlock(); + if (!percpu_ref_tryget(&q->q_usage_counter)) return; - } queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; @@ -347,7 +350,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - rcu_read_unlock(); + blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, diff --git a/block/blk-mq.c b/block/blk-mq.c index 85a1c1a59c72..28d128450621 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -848,22 +848,6 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_hw_ctx *hctx; int i; - /* A deadlock might occur if a request is stuck requiring a -* timeout at the same time a queue freeze is waiting -* completion, since the timeout code would not be able to -* acquire the queue reference here. -* -* That's why we don't use blk_queue_enter here; instead, we use -* percpu_ref_tryget directly, because we need to be able to -* obtain a reference even in the short window between the queue -* starting to freeze, by dropping the first reference in -* blk_freeze_queue_start, and the moment the last request is -* consumed, marked by the instant q_usage_counter reaches -* zero. -*/ - if (!percpu_ref_tryget(&q->q_usage_counter)) - return; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); if (next != 0) { @@ -881,7 +865,6 @@ static void blk_mq_timeout_work(struct work_struct *work) blk_mq_tag_idle(hctx); } } - blk_queue_exit(q); } struct flush_busy_ctx_data { @@ -2974,10 +2957,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); - /* -* Sync with blk_mq_queue_tag_busy_iter. -*/ - synchronize_rcu(); + /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done --
Regression caused by f5bbbbe4d635
Hi, This commit introduced a rcu_read_lock() inside blk_mq_queue_tag_busy_iter() - this is problematic for the timeout code, since we now end up holding the RCU read lock over the timeout code. As just one example, nvme ends up doing: nvme_timeout() nvme_dev_disable() mutex_lock(&dev->shutdown_lock); and things are then obviously unhappy... -- Jens Axboe
Re: clean up physical merging helpers V2
On 9/24/18 1:43 AM, Christoph Hellwig wrote: > Hi Jens, > > this series moves various helpers related to merging based on physical > addresses from the public headers into block/, cleans up the code a bit > and removes unnecessary includes from the block headers. > > Change since V1: > - dropped the Xen related changes which are moved into a new series > to be sent after this one Looks good to me - #1 should just go to the arm guys separately, there's no point in me carrying this arch patch. -- Jens Axboe
Re: [PATCH v10 5/8] percpu-refcount: Introduce percpu_ref_resurrect()
Hello, Bart. On Fri, Sep 21, 2018 at 01:31:19PM -0700, Bart Van Assche wrote: > +void percpu_ref_resurrect(struct percpu_ref *ref) > +{ > + unsigned long __percpu *percpu_count; > unsigned long flags; > > spin_lock_irqsave(&percpu_ref_switch_lock, flags); > > - WARN_ON_ONCE(!percpu_ref_is_zero(ref)); > + WARN_ON_ONCE(!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)); > + WARN_ON_ONCE(__ref_is_percpu(ref, &percpu_count)); So, this in itself is fine but I'm a bit worried that this might make it easier to abuse percpu_ref's dying mode. More specifically, it isn't a good idea to depend on percpu_ref's implementation details from outside - ie. the only guarantee there ever is that percpu_ref won't give out new refs after ->percpu_ref is called - e.g. users can't piggy back on implied rcu grace period. Can you please note that in the function comment? Provided that, Acked-by: Tejun Heo Thanks. -- tejun
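The abuse being warned about, made concrete (a hypothetical user; obj->ref and old_table are illustrative names): after killing a ref, the only documented guarantee is that no new references are handed out. The RCU work percpu_ref does internally to switch modes is an implementation detail, so a caller that needs a grace period for its own structures must take one explicitly:

	percpu_ref_kill(&obj->ref);	/* guaranteed: no new refs after this */

	/*
	 * WRONG: assuming the kill's internal mode-switch RCU callback
	 * already acted as a grace period for our own RCU-protected data.
	 * RIGHT: own the grace period explicitly before freeing.
	 */
	synchronize_rcu();
	kfree(old_table);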
Re: [PATCH] block: use nanosecond resolution for iostat
On Fri, Sep 21, 2018 at 08:27:17PM -0600, Jens Axboe wrote: > On 9/21/18 5:44 PM, Omar Sandoval wrote: > > From: Omar Sandoval > > > > Klaus Kusche reported that the I/O busy time in /proc/diskstats was not > > updating properly on 4.18. This is because we started using ktime to > > track elapsed time, and we convert nanoseconds to jiffies when we update > > the partition counter. However, this gets rounded down, so any I/Os that > > take less than a jiffy are not accounted for. Previously in this case, > > the value of jiffies would sometimes increment while we were doing I/O, > > so at least some I/Os were accounted for. > > > > Let's convert the stats to use nanoseconds internally. We still report > > milliseconds as before, now more accurately than ever. The value is > > still truncated to 32 bits for backwards compatibility. > > Thanks Omar, applied for 4.19. Thanks, Jens. I also just pushed a regression test to blktests.
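A concrete illustration of the rounding bug, with HZ assumed to be 250 for the example (one jiffy = 4 ms): converting each completed I/O's elapsed nanoseconds to jiffies truncates every sub-jiffy I/O to zero busy time, while accumulating nanoseconds and converting only at reporting time keeps it:

#include <stdio.h>

int main(void)
{
	const unsigned long hz = 250;				/* example config */
	const unsigned long nsec_per_jiffy = 1000000000UL / hz;	/* 4 ms */
	unsigned long io_ns = 3UL * 1000 * 1000;		/* one 3 ms I/O */

	/* Old behavior: per-I/O ns -> jiffies truncates to 0. */
	printf("per-I/O jiffies: %lu\n", io_ns / nsec_per_jiffy);

	/* Fixed behavior: accumulate ns, convert when reporting. */
	unsigned long long total_ns = 1000ULL * io_ns;		/* 1000 such I/Os */
	printf("reported busy ms: %llu\n", total_ns / 1000000);	/* 3000 */
	return 0;
}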
[PATCH 06/10] block: add a missing BIOVEC_SEG_BOUNDARY check in bio_add_pc_page
The actual recalculation of segments in __blk_recalc_rq_segments will do this check, so there is no point in forcing it if we know it won't succeed. Signed-off-by: Christoph Hellwig --- block/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 81d90b839e05..c254e5aa331f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -731,7 +731,9 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt > 1 && biovec_phys_mergeable(bvec-1, bvec)) + if (bio->bi_vcnt > 1 && + biovec_phys_mergeable(bvec - 1, bvec) && + BIOVEC_SEG_BOUNDARY(q, bvec - 1, bvec)) bio_clear_flag(bio, BIO_SEG_VALID); done: -- 2.19.0
[PATCH 05/10] block: simplify BIOVEC_PHYS_MERGEABLE
Turn the macro into an inline, move it to blk.h and simplify the arch hooks a bit. Also rename the function to biovec_phys_mergeable as there is no need to shout. Signed-off-by: Christoph Hellwig --- arch/arm/include/asm/io.h | 5 ++--- arch/arm64/include/asm/io.h | 5 ++--- arch/x86/include/asm/io.h | 5 ++--- block/bio.c | 2 +- block/blk-integrity.c | 4 ++-- block/blk-merge.c | 10 +- block/blk.h | 14 ++ include/linux/bio.h | 13 - 8 files changed, 28 insertions(+), 30 deletions(-) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 6774553dc214..e58ca25eddb7 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -462,9 +462,8 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr); struct bio_vec; extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, const struct bio_vec *vec2); -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ -(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) +#define ARCH_BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ +(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)) #ifdef CONFIG_MMU #define ARCH_HAS_VALID_PHYS_ADDR_RANGE diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 35b2e50f17fb..774e03ea1bb0 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -208,9 +208,8 @@ extern int devmem_is_allowed(unsigned long pfn); struct bio_vec; extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, const struct bio_vec *vec2); -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ -(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) +#define ARCH_BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ +(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)) #endif /* __KERNEL__ */ #endif /* __ASM_IO_H */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 6de64840dd22..7c6106216d9c 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -376,9 +376,8 @@ struct bio_vec; extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, const struct bio_vec *vec2); -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ -(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) +#define ARCH_BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ +(!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)) #endif /* CONFIG_XEN */ #define IO_SPACE_LIMIT 0x diff --git a/block/bio.c b/block/bio.c index 1cd47f218200..81d90b839e05 100644 --- a/block/bio.c +++ b/block/bio.c @@ -731,7 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && biovec_phys_mergeable(bvec-1, bvec)) bio_clear_flag(bio, BIO_SEG_VALID); done: diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 6121611e1316..0f7267916509 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -49,7 +49,7 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { - if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) + if (!biovec_phys_mergeable(&ivprv, &iv)) goto new_segment; if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) @@ -95,7 +95,7 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { - if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv)) + if (!biovec_phys_mergeable(&ivprv, &iv)) goto new_segment; if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) diff --git a/block/blk-merge.c b/block/blk-merge.c index ad8a226347a6..5e63e8259f92 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,7 +21,7 @@ static inline bool bios_segs_mergeable(struct request_queue *q, struct bio *prev, struct bio_vec *prev_last_bv, struct bio_vec *next_first_bv) { - if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) + if (!biovec_phys_mergeable(prev_last_bv, next_first_bv)) return false; if (!BIOVEC_SEG_BOUNDAR
[PATCH 10/10] block: don't include bug.h from bio.h
No need to pull in the BUG() definition. Signed-off-by: Christoph Hellwig --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index b3d47862b1b4..f447b0ebb288 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -21,7 +21,6 @@ #include #include #include -#include <linux/bug.h> #ifdef CONFIG_BLOCK /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ -- 2.19.0
[PATCH 08/10] block: remove bvec_to_phys
We only use it in biovec_phys_mergeable and a m68k paravirt driver, so just opencode it there. Also remove the pointless unsigned long cast for the offset in the opencoded instances. Signed-off-by: Christoph Hellwig Reviewed-by: Geert Uytterhoeven --- arch/m68k/emu/nfblock.c | 2 +- block/blk.h | 4 ++-- include/linux/bio.h | 5 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index e9110b9b8bcd..38049357d6d3 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -73,7 +73,7 @@ static blk_qc_t nfhd_make_request(struct request_queue *queue, struct bio *bio) len = bvec.bv_len; len >>= 9; nfhd_read_write(dev->id, 0, dir, sec >> shift, len >> shift, - bvec_to_phys(&bvec)); + page_to_phys(bvec.bv_page) + bvec.bv_offset); sec += len; } bio_endio(bio); diff --git a/block/blk.h b/block/blk.h index 8f7229b6f63e..50f74ce60453 100644 --- a/block/blk.h +++ b/block/blk.h @@ -157,8 +157,8 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { unsigned long mask = queue_segment_boundary(q); - phys_addr_t addr1 = bvec_to_phys(vec1); - phys_addr_t addr2 = bvec_to_phys(vec2); + phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; + phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; if (addr1 + vec1->bv_len != addr2) return false; diff --git a/include/linux/bio.h b/include/linux/bio.h index 9b580e1cb2e8..9ad4b0a487a4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -132,11 +132,6 @@ static inline bool bio_full(struct bio *bio) return bio->bi_vcnt >= bio->bi_max_vecs; } -/* - * will die - */ -#define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) - /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it -- 2.19.0
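A small worked example of the predicate this series keeps refining: a biovec's physical address is page_to_phys(bv_page) + bv_offset, and two vectors are physically mergeable only when the first ends exactly where the second begins (toy addresses, userspace sketch):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long long phys_addr_t;

static bool phys_contiguous(phys_addr_t addr1, unsigned int len1,
			    phys_addr_t addr2)
{
	return addr1 + len1 == addr2;
}

int main(void)
{
	/* vec1 covers [0x10000, 0x10200), vec2 starts at 0x10200: mergeable */
	printf("%d\n", phys_contiguous(0x10000, 0x200, 0x10200));	/* 1 */
	/* a hole before a vec2 at 0x10400: not mergeable */
	printf("%d\n", phys_contiguous(0x10000, 0x200, 0x10400));	/* 0 */
	return 0;
}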
[PATCH 09/10] block: don't include io.h from bio.h
Now that we don't need an override for BIOVEC_PHYS_MERGEABLE there is no need to drag this header in. Signed-off-by: Christoph Hellwig --- include/linux/bio.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 9ad4b0a487a4..b3d47862b1b4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -24,9 +24,6 @@ #include #ifdef CONFIG_BLOCK - -#include <asm/io.h> - /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include -- 2.19.0
[PATCH 03/10] block: move req_gap_{back,front}_merge to blk-merge.c
Keep it close to the actual users instead of exposing the function to all drivers. Signed-off-by: Christoph Hellwig --- block/blk-merge.c | 65 +++ include/linux/blkdev.h | 69 -- 2 files changed, 65 insertions(+), 69 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index aaec38cc37b8..ad8a226347a6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -12,6 +12,71 @@ #include "blk.h" +/* + * Check if the two bvecs from two bios can be merged to one segment. If yes, + * no need to check gap between the two bios since the 1st bio and the 1st bvec + * in the 2nd bio can be handled in one segment. + */ +static inline bool bios_segs_mergeable(struct request_queue *q, + struct bio *prev, struct bio_vec *prev_last_bv, + struct bio_vec *next_first_bv) +{ + if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) + return false; + if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) + return false; + if (prev->bi_seg_back_size + next_first_bv->bv_len > + queue_max_segment_size(q)) + return false; + return true; +} + +static inline bool bio_will_gap(struct request_queue *q, + struct request *prev_rq, struct bio *prev, struct bio *next) +{ + struct bio_vec pb, nb; + + if (!bio_has_data(prev) || !queue_virt_boundary(q)) + return false; + + /* +* Don't merge if the 1st bio starts with non-zero offset, otherwise it +* is quite difficult to respect the sg gap limit. We work hard to +* merge a huge number of small single bios in case of mkfs. +*/ + if (prev_rq) + bio_get_first_bvec(prev_rq->bio, &pb); + else + bio_get_first_bvec(prev, &pb); + if (pb.bv_offset) + return true; + + /* +* We don't need to worry about the situation that the merged segment +* ends in unaligned virt boundary: +* +* - if 'pb' ends aligned, the merged segment ends aligned +* - if 'pb' ends unaligned, the next bio must include +* one single bvec of 'nb', otherwise the 'nb' can't +* merge with 'pb' +*/ + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); + if (bios_segs_mergeable(q, prev, &pb, &nb)) + return false; + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); +} + +static inline bool req_gap_back_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, req, req->biotail, bio); +} + +static inline bool req_gap_front_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, NULL, bio, req->bio); +} + static struct bio *blk_bio_discard_split(struct request_queue *q, struct bio *bio, struct bio_set *bs, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc534c857344..b7e676bb01bc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1695,75 +1695,6 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } -/* - * Check if the two bvecs from two bios can be merged to one segment. - * If yes, no need to check gap between the two bios since the 1st bio - * and the 1st bvec in the 2nd bio can be handled in one segment. - */ -static inline bool bios_segs_mergeable(struct request_queue *q, - struct bio *prev, struct bio_vec *prev_last_bv, - struct bio_vec *next_first_bv) -{ - if (!BIOVEC_PHYS_MERGEABLE(prev_last_bv, next_first_bv)) - return false; - if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) - return false; - if (prev->bi_seg_back_size + next_first_bv->bv_len > - queue_max_segment_size(q)) - return false; - return true; -} - -static inline bool bio_will_gap(struct request_queue *q, - struct request *prev_rq, - struct bio *prev, - struct bio *next) -{ - if (bio_has_data(prev) && queue_virt_boundary(q)) { - struct bio_vec pb, nb; - - /* -* don't merge if the 1st bio starts with non-zero -* offset, otherwise it is quite difficult to respect -* sg gap limit. We work hard to merge a huge number of small -* single bios in case of mkfs. -*/ - if (prev_rq) - bio_get_first_bvec(prev_rq->bio, &pb); - else - bio_get_first_bvec(prev, &pb); - if (pb.bv_offset) - return true; - - /* -* We don't
[PATCH 04/10] block: move req_gap_back_merge to blk.h
No need to expose these helpers outside the block layer. Signed-off-by: Christoph Hellwig --- block/blk.h| 19 +++ include/linux/blkdev.h | 19 --- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/block/blk.h b/block/blk.h index 441c2de1d4b9..63035c95689c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -149,6 +149,25 @@ static inline void blk_queue_enter_live(struct request_queue *q) percpu_ref_get(&q->q_usage_counter); } +static inline bool __bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + return offset || + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); +} + +/* + * Check if adding a bio_vec after bprv with offset would create a gap in + * the SG list. Most drivers don't care about this, but some do. + */ +static inline bool bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + if (!queue_virt_boundary(q)) + return false; + return __bvec_gap_to_prev(q, bprv, offset); +} + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b7e676bb01bc..1d5e14139795 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1676,25 +1676,6 @@ static inline void put_dev_sector(Sector p) put_page(p.v); } -static inline bool __bvec_gap_to_prev(struct request_queue *q, - struct bio_vec *bprv, unsigned int offset) -{ - return offset || - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); -} - -/* - * Check if adding a bio_vec after bprv with offset would create a gap in - * the SG list. Most drivers don't care about this, but some do. - */ -static inline bool bvec_gap_to_prev(struct request_queue *q, - struct bio_vec *bprv, unsigned int offset) -{ - if (!queue_virt_boundary(q)) - return false; - return __bvec_gap_to_prev(q, bprv, offset); -} - int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); -- 2.19.0
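To make the moved check concrete: with a hypothetical virt boundary mask of 0xfff (4 KiB, the NVMe PRP case), a later vector must start at offset 0 and the previous one must end on the boundary, otherwise the pair "gaps" and cannot share a request. A standalone sketch mirroring __bvec_gap_to_prev():

#include <stdbool.h>
#include <stdio.h>

/* offset is the next vector's bv_offset; mask is queue_virt_boundary(q). */
static bool gap_to_prev(unsigned int prv_offset, unsigned int prv_len,
			unsigned int next_offset, unsigned long mask)
{
	return next_offset || ((prv_offset + prv_len) & mask);
}

int main(void)
{
	/* prev ends 4 KiB-aligned and next starts at 0: no gap */
	printf("%d\n", gap_to_prev(0, 4096, 0, 0xfff));	/* 0 */
	/* prev ends mid-boundary: gap */
	printf("%d\n", gap_to_prev(0, 512, 0, 0xfff));	/* 1 */
	return 0;
}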
[PATCH 07/10] block: merge BIOVEC_SEG_BOUNDARY into biovec_phys_mergeable
These two checks should always be performed together, so merge them into a single helper. Signed-off-by: Christoph Hellwig --- block/bio.c | 4 +--- block/blk-integrity.c | 12 ++-- block/blk-merge.c | 29 + block/blk.h | 12 +--- include/linux/bio.h | 8 5 files changed, 17 insertions(+), 48 deletions(-) diff --git a/block/bio.c b/block/bio.c index c254e5aa331f..e9f92b50724d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -731,9 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt > 1 && - biovec_phys_mergeable(bvec - 1, bvec) && - BIOVEC_SEG_BOUNDARY(q, bvec - 1, bvec) + if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec)) bio_clear_flag(bio, BIO_SEG_VALID); done: diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 0f7267916509..d1ab089e0919 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -49,12 +49,8 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { - if (!biovec_phys_mergeable(&ivprv, &iv)) + if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; - - if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) - goto new_segment; - if (seg_size + iv.bv_len > queue_max_segment_size(q)) goto new_segment; @@ -95,12 +91,8 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, bio_for_each_integrity_vec(iv, bio, iter) { if (prev) { - if (!biovec_phys_mergeable(&ivprv, &iv)) + if (!biovec_phys_mergeable(q, &ivprv, &iv)) goto new_segment; - - if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv)) - goto new_segment; - if (sg->length + iv.bv_len > queue_max_segment_size(q)) goto new_segment; diff --git a/block/blk-merge.c b/block/blk-merge.c index 5e63e8259f92..42a46744c11b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,9 +21,7 @@ static inline bool bios_segs_mergeable(struct request_queue *q, struct bio *prev, struct bio_vec *prev_last_bv, struct bio_vec *next_first_bv) { - if (!biovec_phys_mergeable(prev_last_bv, next_first_bv)) - return false; - if (!BIOVEC_SEG_BOUNDARY(q, prev_last_bv, next_first_bv)) + if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv)) return false; if (prev->bi_seg_back_size + next_first_bv->bv_len > queue_max_segment_size(q)) @@ -199,9 +197,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, if (bvprvp && blk_queue_cluster(q)) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!biovec_phys_mergeable(bvprvp, &bv)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) + if (!biovec_phys_mergeable(q, bvprvp, &bv)) goto new_segment; seg_size += bv.bv_len; @@ -332,9 +328,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; - if (!biovec_phys_mergeable(&bvprv, &bv)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) + if (!biovec_phys_mergeable(q, &bvprv, &bv)) goto new_segment; seg_size += bv.bv_len; @@ -414,17 +408,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, bio_get_last_bvec(bio, &end_bv); bio_get_first_bvec(nxt, &nxt_bv); - if (!biovec_phys_mergeable(&end_bv, &nxt_bv)) - return 0; - - /* -* bio and nxt are contiguous in memory; check if the queue allows -* these two to be merged into one -*/ - if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv)) - return 1; - - return 0; + return biovec_phys_mergeable(q, &end_bv, &nxt_bv); } static inline void @@ -438,10 +422,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
[PATCH 01/10] arm: remove the unused BIOVEC_MERGEABLE define
Signed-off-by: Christoph Hellwig --- arch/arm/include/asm/io.h | 7 --- 1 file changed, 7 deletions(-) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 2cfbc531f63b..6774553dc214 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -459,13 +459,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr); #include -/* - * can the hardware map this into one segment or not, given no other - * constraints. - */ -#define BIOVEC_MERGEABLE(vec1, vec2) \ - ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) - struct bio_vec; extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, const struct bio_vec *vec2); -- 2.19.0
[PATCH 02/10] block: move integrity_req_gap_{back,front}_merge to blk.h
No need to expose these to drivers. Signed-off-by: Christoph Hellwig --- block/blk.h| 35 +-- include/linux/blkdev.h | 31 --- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/block/blk.h b/block/blk.h index 9db4e389582c..441c2de1d4b9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -158,7 +158,38 @@ static inline bool bio_integrity_endio(struct bio *bio) return __bio_integrity_endio(bio); return true; } -#else + +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + struct bio_integrity_payload *bip = bio_integrity(req->bio); + struct bio_integrity_payload *bip_next = bio_integrity(next); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} + +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_payload *bip_next = bio_integrity(req->bio); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + return false; +} +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + return false; +} + static inline void blk_flush_integrity(void) { } @@ -166,7 +197,7 @@ static inline bool bio_integrity_endio(struct bio *bio) { return true; } -#endif +#endif /* CONFIG_BLK_DEV_INTEGRITY */ void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d6869e0e2b64..bc534c857344 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1843,26 +1843,6 @@ queue_max_integrity_segments(struct request_queue *q) return q->limits.max_integrity_segments; } -static inline bool integrity_req_gap_back_merge(struct request *req, - struct bio *next) -{ - struct bio_integrity_payload *bip = bio_integrity(req->bio); - struct bio_integrity_payload *bip_next = bio_integrity(next); - - return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], - bip_next->bip_vec[0].bv_offset); -} - -static inline bool integrity_req_gap_front_merge(struct request *req, -struct bio *bio) -{ - struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_integrity_payload *bip_next = bio_integrity(req->bio); - - return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], - bip_next->bip_vec[0].bv_offset); -} - /** * bio_integrity_intervals - Return number of integrity intervals for a bio * @bi:blk_integrity profile for device @@ -1947,17 +1927,6 @@ static inline bool blk_integrity_merge_bio(struct request_queue *rq, return true; } -static inline bool integrity_req_gap_back_merge(struct request *req, - struct bio *next) -{ - return false; -} -static inline bool integrity_req_gap_front_merge(struct request *req, -struct bio *bio) -{ - return false; -} - static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { -- 2.19.0
clean up physical merging helpers V2
Hi Jens, this series moves various helpers related to merging based on physical addresses from the public headers into block/, cleans up the code a bit and removes unnecessary includes from the block headers. Change since V1: - dropped the Xen related changes which are moved into a new series to be sent after this one