[PATCH V4 14/14] blk-mq: improve bio merge from blk-mq sw queue
This patch uses hash table to do bio merge from sw queue, then we can align to blk-mq scheduler/block legacy's way for bio merge. Turns out bio merge via hash table is more efficient than simple merge on the last 8 requests in sw queue. On SCSI SRP, it is observed ~10% IOPS is increased in sequential IO test with this patch. It is also one step forward to real 'none' scheduler, in which way the blk-mq scheduler framework can be more clean. Signed-off-by: Ming Lei--- block/blk-mq-sched.c | 49 - block/blk-mq.c | 28 +--- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index afa7d9a258e4..df41cba49866 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -266,50 +266,25 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); -/* - * Reverse check our software queue for entries that we could potentially - * merge with. Currently includes a hand-wavy stop count of 8, to not spend - * too much time checking for merges. 
- */ -static bool blk_mq_attempt_merge(struct request_queue *q, +static bool blk_mq_ctx_try_merge(struct request_queue *q, struct blk_mq_ctx *ctx, struct bio *bio) { - struct request *rq; - int checked = 8; + struct request *rq, *free = NULL; + enum elv_merge type; + bool merged; lockdep_assert_held(>lock); - list_for_each_entry_reverse(rq, >rq_list, queuelist) { - bool merged = false; - - if (!checked--) - break; - - if (!blk_rq_merge_ok(rq, bio)) - continue; + type = elv_merge_ctx(q, , bio, ctx); + merged = __blk_mq_try_merge(q, bio, , rq, type); - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(q, rq, bio); - break; - case ELEVATOR_FRONT_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(q, rq, bio); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - continue; - } + if (free) + blk_mq_free_request(free); - if (merged) - ctx->rq_merged++; - return merged; - } + if (merged) + ctx->rq_merged++; - return false; + return merged; } bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) @@ -327,7 +302,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) { /* default per sw-queue merge */ spin_lock(>lock); - ret = blk_mq_attempt_merge(q, ctx, bio); + ret = blk_mq_ctx_try_merge(q, ctx, bio); spin_unlock(>lock); } diff --git a/block/blk-mq.c b/block/blk-mq.c index fc3d26bbfc1a..d935f15c54da 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -847,6 +847,18 @@ static void blk_mq_timeout_work(struct work_struct *work) blk_queue_exit(q); } +static void blk_mq_ctx_remove_rq_list(struct blk_mq_ctx *ctx, + struct list_head *head) +{ + struct request *rq; + + lockdep_assert_held(>lock); + + list_for_each_entry(rq, head, queuelist) + rqhash_del(rq); + ctx->last_merge = NULL; +} + struct flush_busy_ctx_data { struct 
blk_mq_hw_ctx *hctx; struct list_head *list; @@ -861,6 +873,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) sbitmap_clear_bit(sb, bitnr); spin_lock(>lock); list_splice_tail_init(>rq_list, flush_data->list); + blk_mq_ctx_remove_rq_list(ctx, flush_data->list); spin_unlock(>lock); return true; } @@ -890,17 +903,23 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *d struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + struct request *rq = NULL; spin_lock(>lock); if (unlikely(!list_empty(>rq_list))) { - dispatch_data->rq = list_entry_rq(ctx->rq_list.next); - list_del_init(_data->rq->queuelist); + rq = list_entry_rq(ctx->rq_list.next); + list_del_init(>queuelist); + rqhash_del(rq); if (list_empty(>rq_list))
[PATCH V4 12/14] block: introduce .last_merge and .hash to blk_mq_ctx
Prepare for supporting bio merge to sw queue if no blk-mq io scheduler is taken. Signed-off-by: Ming Lei--- block/blk-mq.h | 4 block/blk.h | 3 +++ block/elevator.c | 22 +++--- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.h b/block/blk-mq.h index 0277f9771fab..1b9742eb7399 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -18,6 +18,10 @@ struct blk_mq_ctx { unsigned long rq_dispatched[2]; unsigned long rq_merged; + /* bio merge via request hash table */ + struct request *last_merge; + DECLARE_HASHTABLE(hash, ELV_HASH_BITS); + /* incremented at completion time */ unsigned long cacheline_aligned_in_smp rq_completed[2]; diff --git a/block/blk.h b/block/blk.h index eb3436d4a73f..fa4f232afc18 100644 --- a/block/blk.h +++ b/block/blk.h @@ -198,6 +198,9 @@ static inline struct request *rqhash_find(struct hlist_head *hash, sector_t offs return NULL; } +enum elv_merge elv_merge_ctx(struct request_queue *q, struct request **req, +struct bio *bio, struct blk_mq_ctx *ctx); + void blk_insert_flush(struct request *rq); static inline struct request *__elv_next_request(struct request_queue *q) diff --git a/block/elevator.c b/block/elevator.c index 2424aea85393..0e13e5c18982 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -471,6 +471,13 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, return __elv_merge(q, req, bio, q->elevator->hash, q->last_merge); } +enum elv_merge elv_merge_ctx(struct request_queue *q, struct request **req, + struct bio *bio, struct blk_mq_ctx *ctx) +{ + WARN_ON_ONCE(!q->mq_ops); + return __elv_merge(q, req, bio, ctx->hash, ctx->last_merge); +} + /* * Attempt to do an insertion back merge. 
Only check for the case where * we can append 'rq' to an existing request, so we can throw 'rq' away @@ -516,16 +523,25 @@ void elv_merged_request(struct request_queue *q, struct request *rq, enum elv_merge type) { struct elevator_queue *e = q->elevator; + struct hlist_head *hash = e->hash; + + /* we do bio merge on blk-mq sw queue */ + if (q->mq_ops && !e) { + rq->mq_ctx->last_merge = rq; + hash = rq->mq_ctx->hash; + goto reposition; + } + + q->last_merge = rq; if (e->uses_mq && e->type->ops.mq.request_merged) e->type->ops.mq.request_merged(q, rq, type); else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) e->type->ops.sq.elevator_merged_fn(q, rq, type); + reposition: if (type == ELEVATOR_BACK_MERGE) - elv_rqhash_reposition(q, rq); - - q->last_merge = rq; + rqhash_reposition(hash, rq); } void elv_merge_requests(struct request_queue *q, struct request *rq, -- 2.9.5
[PATCH V4 13/14] blk-mq-sched: refactor blk_mq_sched_try_merge()
This patch introduces one function __blk_mq_try_merge() which will be reused for bio merge to sw queue in the following patch. No functional change. Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei --- block/blk-mq-sched.c | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 1ff6f9bedd1a..afa7d9a258e4 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -228,12 +228,11 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } } -bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, - struct request **merged_request) +static bool __blk_mq_try_merge(struct request_queue *q, + struct bio *bio, struct request **merged_request, + struct request *rq, enum elv_merge type) { - struct request *rq; - - switch (elv_merge(q, &rq, bio)) { + switch (type) { case ELEVATOR_BACK_MERGE: if (!blk_mq_sched_allow_merge(q, rq, bio)) return false; @@ -256,6 +255,15 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, return false; } } + +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + struct request **merged_request) +{ + struct request *rq; + enum elv_merge type = elv_merge(q, &rq, bio); + + return __blk_mq_try_merge(q, bio, merged_request, rq, type); +} EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); /* -- 2.9.5
[PATCH V4 10/14] block: move actual bio merge code into __elv_merge
So that we can reuse __elv_merge() to merge bio into requests from sw queue in the following patches. Signed-off-by: Ming Lei--- block/elevator.c | 19 +-- 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index 824cc3e69ac3..e11c7873fc21 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -409,8 +409,9 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(elv_dispatch_add_tail); -enum elv_merge elv_merge(struct request_queue *q, struct request **req, - struct bio *bio) +static enum elv_merge __elv_merge(struct request_queue *q, + struct request **req, struct bio *bio, + struct hlist_head *hash, struct request *last_merge) { struct elevator_queue *e = q->elevator; struct request *__rq; @@ -427,11 +428,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, /* * First try one-hit cache. */ - if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) { - enum elv_merge ret = blk_try_merge(q->last_merge, bio); + if (last_merge && elv_bio_merge_ok(last_merge, bio)) { + enum elv_merge ret = blk_try_merge(last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { - *req = q->last_merge; + *req = last_merge; return ret; } } @@ -442,7 +443,7 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, /* * See if our hash lookup can find a potential backmerge. */ - __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); + __rq = rqhash_find(hash, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_BACK_MERGE; @@ -456,6 +457,12 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } +enum elv_merge elv_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + return __elv_merge(q, req, bio, q->elevator->hash, q->last_merge); +} + /* * Attempt to do an insertion back merge. 
Only check for the case where * we can append 'rq' to an existing request, so we can throw 'rq' away -- 2.9.5
[PATCH V4 11/14] block: add checks on elevator for supporting bio merge via hashtable from blk-mq sw queue
blk_mq_sched_try_merge() will be reused in following patches to support bio merge to blk-mq sw queue, so add checkes to related functions which are called from blk_mq_sched_try_merge(). Signed-off-by: Ming Lei--- block/elevator.c | 16 1 file changed, 16 insertions(+) diff --git a/block/elevator.c b/block/elevator.c index e11c7873fc21..2424aea85393 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -71,6 +71,10 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) if (!blk_rq_merge_ok(rq, bio)) return false; + /* We need to support to merge bio from sw queue */ + if (!rq->q->elevator) + return true; + if (!elv_iosched_allow_bio_merge(rq, bio)) return false; @@ -449,6 +453,10 @@ static enum elv_merge __elv_merge(struct request_queue *q, return ELEVATOR_BACK_MERGE; } + /* no elevator when merging bio to blk-mq sw queue */ + if (!e) + return ELEVATOR_NO_MERGE; + if (e->uses_mq && e->type->ops.mq.request_merge) return e->type->ops.mq.request_merge(q, req, bio); else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) @@ -711,6 +719,10 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + /* no elevator when merging bio to blk-mq sw queue */ + if (!e) + return NULL; + if (e->uses_mq && e->type->ops.mq.next_request) return e->type->ops.mq.next_request(q, rq); else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) @@ -723,6 +735,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; + /* no elevator when merging bio to blk-mq sw queue */ + if (!e) + return NULL; + if (e->uses_mq && e->type->ops.mq.former_request) return e->type->ops.mq.former_request(q, rq); if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) -- 2.9.5
[PATCH V4 09/14] block: introduce rqhash helpers
We need this helpers for supporting to use hashtable to improve bio merge from sw queue in the following patches. No functional change. Signed-off-by: Ming Lei--- block/blk.h | 52 block/elevator.c | 36 +++- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/block/blk.h b/block/blk.h index fcb9775b997d..eb3436d4a73f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -146,6 +146,58 @@ static inline void blk_clear_rq_complete(struct request *rq) */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) +/* + * Merge hash stuff. + */ +#define rq_hash_key(rq)(blk_rq_pos(rq) + blk_rq_sectors(rq)) + +#define bucket(head, key) &((head)[hash_min((key), ELV_HASH_BITS)]) + +static inline void __rqhash_del(struct request *rq) +{ + hash_del(>hash); + rq->rq_flags &= ~RQF_HASHED; +} + +static inline void rqhash_del(struct request *rq) +{ + if (ELV_ON_HASH(rq)) + __rqhash_del(rq); +} + +static inline void rqhash_add(struct hlist_head *hash, struct request *rq) +{ + BUG_ON(ELV_ON_HASH(rq)); + hlist_add_head(>hash, bucket(hash, rq_hash_key(rq))); + rq->rq_flags |= RQF_HASHED; +} + +static inline void rqhash_reposition(struct hlist_head *hash, struct request *rq) +{ + __rqhash_del(rq); + rqhash_add(hash, rq); +} + +static inline struct request *rqhash_find(struct hlist_head *hash, sector_t offset) +{ + struct hlist_node *next; + struct request *rq = NULL; + + hlist_for_each_entry_safe(rq, next, bucket(hash, offset), hash) { + BUG_ON(!ELV_ON_HASH(rq)); + + if (unlikely(!rq_mergeable(rq))) { + __rqhash_del(rq); + continue; + } + + if (rq_hash_key(rq) == offset) + return rq; + } + + return NULL; +} + void blk_insert_flush(struct request *rq); static inline struct request *__elv_next_request(struct request_queue *q) diff --git a/block/elevator.c b/block/elevator.c index 153926a90901..824cc3e69ac3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -47,11 +47,6 @@ static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); /* - * Merge hash stuff. 
- */ -#define rq_hash_key(rq)(blk_rq_pos(rq) + blk_rq_sectors(rq)) - -/* * Query io scheduler to see if the current process issuing bio may be * merged with rq. */ @@ -268,14 +263,12 @@ EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { - hash_del(>hash); - rq->rq_flags &= ~RQF_HASHED; + __rqhash_del(rq); } void elv_rqhash_del(struct request_queue *q, struct request *rq) { - if (ELV_ON_HASH(rq)) - __elv_rqhash_del(rq); + rqhash_del(rq); } EXPORT_SYMBOL_GPL(elv_rqhash_del); @@ -283,37 +276,22 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - BUG_ON(ELV_ON_HASH(rq)); - hash_add(e->hash, >hash, rq_hash_key(rq)); - rq->rq_flags |= RQF_HASHED; + rqhash_add(e->hash, rq); } EXPORT_SYMBOL_GPL(elv_rqhash_add); void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { - __elv_rqhash_del(rq); - elv_rqhash_add(q, rq); + struct elevator_queue *e = q->elevator; + + rqhash_reposition(e->hash, rq); } struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; - struct hlist_node *next; - struct request *rq; - - hash_for_each_possible_safe(e->hash, rq, next, hash, offset) { - BUG_ON(!ELV_ON_HASH(rq)); - if (unlikely(!rq_mergeable(rq))) { - __elv_rqhash_del(rq); - continue; - } - - if (rq_hash_key(rq) == offset) - return rq; - } - - return NULL; + return rqhash_find(e->hash, offset); } /* -- 2.9.5
[PATCH V4 08/14] blk-mq-sched: use q->queue_depth as hint for q->nr_requests
SCSI sets q->queue_depth from shost->cmd_per_lun, and q->queue_depth is per request_queue and more related to scheduler queue compared with hw queue depth, which can be shared by queues, such as TAG_SHARED. This patch tries to use q->queue_depth as hint for computing q->nr_requests, which should be more effective than current way. Reviewed-by: Bart Van AsscheReviewed-by: Christoph Hellwig Signed-off-by: Ming Lei --- block/blk-mq-sched.h | 18 +++--- block/blk-mq.c | 27 +-- block/blk-mq.h | 1 + block/blk-settings.c | 2 ++ 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 1d47f3fda1d0..906b10c54f78 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -99,12 +99,24 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) static inline unsigned blk_mq_sched_queue_depth(struct request_queue *q) { /* -* Default to double of smaller one between hw queue_depth and 128, +* q->queue_depth is more close to scheduler queue, so use it +* as hint for computing scheduler queue depth if it is valid +*/ + unsigned q_depth = q->queue_depth ?: q->tag_set->queue_depth; + + /* +* Default to double of smaller one between queue depth and 128, * since we don't split into sync/async like the old code did. * Additionally, this is a per-hw queue depth. */ - return 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_MAX_RQ); + q_depth = 2 * min_t(unsigned int, q_depth, BLKDEV_MAX_RQ); + + /* +* when queue depth of driver is too small, we set queue depth +* of scheduler queue as 128 which is the default setting of +* block legacy code. 
+*/ + return max_t(unsigned, q_depth, BLKDEV_MAX_RQ); } #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 6af56a71c1cd..fc3d26bbfc1a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2650,7 +2650,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_free_tag_set); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +static int __blk_mq_update_nr_requests(struct request_queue *q, + bool sched_only, + unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; @@ -2669,7 +2671,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ - if (!hctx->sched_tags) { + if (!sched_only && !hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, >tags, min(nr, set->queue_depth), false); @@ -2689,6 +2691,27 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + return __blk_mq_update_nr_requests(q, false, nr); +} + +/* + * When drivers update q->queue_depth, this API is called so that + * we can use this queue depth as hint for adjusting scheduler + * queue depth. 
+ */ +int blk_mq_update_sched_queue_depth(struct request_queue *q) +{ + unsigned nr; + + if (!q->mq_ops || !q->elevator) + return 0; + + nr = blk_mq_sched_queue_depth(q); + return __blk_mq_update_nr_requests(q, true, nr); +} + static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { diff --git a/block/blk-mq.h b/block/blk-mq.h index e42748bfb959..0277f9771fab 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -37,6 +37,7 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); struct request *blk_mq_dispatch_rq_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); +int blk_mq_update_sched_queue_depth(struct request_queue *q); /* * Internal helpers for allocating/freeing the request map diff --git a/block/blk-settings.c b/block/blk-settings.c index 8559e9563c52..c2db38d2ec2b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -878,6 +878,8 @@ void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; wbt_set_queue_depth(q->rq_wb, depth); + + WARN_ON(blk_mq_update_sched_queue_depth(q)); } EXPORT_SYMBOL(blk_set_queue_depth); -- 2.9.5
[PATCH V4 03/14] blk-mq: introduce blk_mq_dispatch_rq_from_ctx()
This function is introduced for dequeuing request from sw queue so that we can dispatch it in scheduler's way. More importantly, some SCSI devices may set q->queue_depth, which is a per-request_queue limit, and applied on pending I/O from all hctxs. This function is introduced for avoiding to dequeue too many requests from sw queue when ->dispatch isn't flushed completely. Reviewed-by: Bart Van AsscheSigned-off-by: Ming Lei --- block/blk-mq.c | 38 ++ block/blk-mq.h | 2 ++ 2 files changed, 40 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3f18cff80050..f063dd0f197f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -880,6 +880,44 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + spin_lock(>lock); + if (unlikely(!list_empty(>rq_list))) { + dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + list_del_init(_data->rq->queuelist); + if (list_empty(>rq_list)) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(>lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dispatch_rq_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? 
start->index_hw : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + static inline unsigned int queued_to_index(unsigned int queued) { if (!queued) diff --git a/block/blk-mq.h b/block/blk-mq.h index 98252b79b80b..e42748bfb959 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, bool wait); +struct request *blk_mq_dispatch_rq_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); /* * Internal helpers for allocating/freeing the request map -- 2.9.5
[PATCH V4 06/14] blk-mq-sched: don't dequeue request until all in ->dispatch are flushed
During dispatching, we moved all requests from hctx->dispatch to one temporary list, then dispatch them one by one from this list. Unfortunately during this period, run queue from other contexts may think the queue is idle, then start to dequeue from sw/scheduler queue and still try to dispatch because ->dispatch is empty. This way hurts sequential I/O performance because requests are dequeued when lld queue is busy. This patch introduces the state of BLK_MQ_S_DISPATCH_BUSY to make sure that request isn't dequeued until ->dispatch is flushed. Reviewed-by: Bart Van AsscheSigned-off-by: Ming Lei --- block/blk-mq-debugfs.c | 1 + block/blk-mq-sched.c | 58 +++--- block/blk-mq.c | 6 ++ include/linux/blk-mq.h | 1 + 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e73095643..7a27f262c96a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -182,6 +182,7 @@ static const char *const hctx_state_name[] = { HCTX_STATE_NAME(SCHED_RESTART), HCTX_STATE_NAME(TAG_WAITING), HCTX_STATE_NAME(START_ON_RUN), + HCTX_STATE_NAME(DISPATCH_BUSY), }; #undef HCTX_STATE_NAME diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 735e432294ab..97e7a4fe3a32 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -146,7 +146,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -177,8 +176,33 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(_list)) { blk_mq_sched_mark_restart_hctx(hctx); - do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); - } else if (!has_sched_dispatch && !q->queue_depth) { + blk_mq_dispatch_rq_list(q, _list); + + /* +* We may clear DISPATCH_BUSY just 
after it +* is set from another context, the only cost +* is that one request is dequeued a bit early, +* we can survive that. Given the window is +* small enough, no need to worry about performance +* effect. +*/ + if (list_empty_careful(>dispatch)) + clear_bit(BLK_MQ_S_DISPATCH_BUSY, >state); + } + + /* +* If DISPATCH_BUSY is set, that means hw queue is busy +* and requests in the list of hctx->dispatch need to +* be flushed first, so return early. +* +* Wherever DISPATCH_BUSY is set, blk_mq_run_hw_queue() +* will be run to try to make progress, so it is always +* safe to check the state here. +*/ + if (test_bit(BLK_MQ_S_DISPATCH_BUSY, >state)) + return; + + if (!has_sched_dispatch) { /* * If there is no per-request_queue depth, we * flush all requests in this hw queue, otherwise @@ -187,22 +211,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * is busy, which can be triggered easily by * per-request_queue queue depth */ - blk_mq_flush_busy_ctxs(hctx, _list); - blk_mq_dispatch_rq_list(q, _list); - } - - if (!do_sched_dispatch) - return; + if (!q->queue_depth) { + blk_mq_flush_busy_ctxs(hctx, _list); + blk_mq_dispatch_rq_list(q, _list); + } else { + blk_mq_do_dispatch_ctx(q, hctx); + } + } else { - /* -* We want to dispatch from the scheduler if we had no work left -* on the dispatch list, OR if we did have work but weren't able -* to make progress. -*/ - if (has_sched_dispatch) + /* +* We want to dispatch from the scheduler if we had no work left +* on the dispatch list, OR if we did have work but weren't able +* to make progress. +*/ blk_mq_do_dispatch_sched(q, e, hctx); - else - blk_mq_do_dispatch_ctx(q, hctx); + } } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, @@ -330,6 +353,7 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, */ spin_lock(>lock); list_add(>queuelist, >dispatch); + set_bit(BLK_MQ_S_DISPATCH_BUSY, >state); spin_unlock(>lock); return true; } diff --git
[PATCH V4 07/14] blk-mq-sched: introduce blk_mq_sched_queue_depth()
The following patch will use one hint to figure out default queue depth for scheduler queue, so introduce the helper of blk_mq_sched_queue_depth() for this purpose. Reviewed-by: Christoph HellwigReviewed-by: Bart Van Assche Signed-off-by: Ming Lei --- block/blk-mq-sched.c | 8 +--- block/blk-mq-sched.h | 11 +++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 97e7a4fe3a32..1ff6f9bedd1a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -595,13 +595,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; } - /* -* Default to double of smaller one between hw queue_depth and 128, -* since we don't split into sync/async like the old code did. -* Additionally, this is a per-hw queue depth. -*/ - q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_MAX_RQ); + q->nr_requests = blk_mq_sched_queue_depth(q); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 9267d0b7c197..1d47f3fda1d0 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -96,4 +96,15 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) return test_bit(BLK_MQ_S_SCHED_RESTART, >state); } +static inline unsigned blk_mq_sched_queue_depth(struct request_queue *q) +{ + /* +* Default to double of smaller one between hw queue_depth and 128, +* since we don't split into sync/async like the old code did. +* Additionally, this is a per-hw queue depth. +*/ + return 2 * min_t(unsigned int, q->tag_set->queue_depth, + BLKDEV_MAX_RQ); +} + #endif -- 2.9.5
[PATCH V4 04/14] blk-mq-sched: move actual dispatching into one helper
So that it becomes easy to support to dispatch from sw queue in the following patch. No functional change. Reviewed-by: Bart Van AsscheSigned-off-by: Ming Lei --- block/blk-mq-sched.c | 28 ++-- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 845e5baf8af1..f69752961a34 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,6 +89,22 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } +static void blk_mq_do_dispatch(struct request_queue *q, + struct elevator_queue *e, + struct blk_mq_hw_ctx *hctx) +{ + LIST_HEAD(rq_list); + + do { + struct request *rq; + + rq = e->type->ops.mq.dispatch_request(hctx); + if (!rq) + break; + list_add(>queuelist, _list); + } while (blk_mq_dispatch_rq_list(q, _list)); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -136,16 +152,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * on the dispatch list, OR if we did have work but weren't able * to make progress. */ - if (do_sched_dispatch && has_sched_dispatch) { - do { - struct request *rq; - - rq = e->type->ops.mq.dispatch_request(hctx); - if (!rq) - break; - list_add(>queuelist, _list); - } while (blk_mq_dispatch_rq_list(q, _list)); - } + if (do_sched_dispatch && has_sched_dispatch) + blk_mq_do_dispatch(q, e, hctx); } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, -- 2.9.5
[PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. Meantime there is also queue depth for each lun(.cmd_per_lun), which is often small. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver, unfortunately it is easy to cause queue busy because of the small per-lun queue depth. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can participate into these requests, and sequential IO performance is hurted. This patch improves dispatching from sw queue when there is per-request-queue queue depth by taking request one by one from sw queue, just like the way of IO scheduler. Reviewed-by: Bart Van AsscheSigned-off-by: Ming Lei --- block/blk-mq-sched.c | 61 +- include/linux/blk-mq.h | 2 ++ 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index f69752961a34..735e432294ab 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -static void blk_mq_do_dispatch(struct request_queue *q, - struct elevator_queue *e, - struct blk_mq_hw_ctx *hctx) +static void blk_mq_do_dispatch_sched(struct request_queue *q, +struct elevator_queue *e, +struct blk_mq_hw_ctx *hctx) { LIST_HEAD(rq_list); @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q, } while (blk_mq_dispatch_rq_list(q, _list)); } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +static void blk_mq_do_dispatch_ctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + bool dispatched; + + do { + struct request *rq; + + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); 
+ if (!rq) + break; + list_add(>queuelist, _list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + dispatched = blk_mq_dispatch_rq_list(q, _list); + } while (dispatched); + + if (!dispatched) + WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx)); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (!list_empty(_list)) { blk_mq_sched_mark_restart_hctx(hctx); do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); - } else if (!has_sched_dispatch) { + } else if (!has_sched_dispatch && !q->queue_depth) { + /* +* If there is no per-request_queue depth, we +* flush all requests in this hw queue, otherwise +* pick up request one by one from sw queue for +* avoiding to mess up I/O merge when dispatch +* is busy, which can be triggered easily by +* per-request_queue queue depth +*/ blk_mq_flush_busy_ctxs(hctx, _list); blk_mq_dispatch_rq_list(q, _list); } + if (!do_sched_dispatch) + return; + /* * We want to dispatch from the scheduler if we had no work left * on the dispatch list, OR if we did have work but weren't able * to make progress. */ - if (do_sched_dispatch && has_sched_dispatch) - blk_mq_do_dispatch(q, e, hctx); + if (has_sched_dispatch) + blk_mq_do_dispatch_sched(q, e, hctx); + else + blk_mq_do_dispatch_ctx(q, hctx); } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..7b7a366a97f3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned intnr_ctx; -- 2.9.5
[PATCH V4 02/14] sbitmap: introduce __sbitmap_for_each_set()
We need to iterate ctx starting from any ctx in round robin way, so introduce this helper. Cc: Omar SandovalSigned-off-by: Ming Lei --- include/linux/sbitmap.h | 54 - 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a1904aadbc45..2329b9e1a0e2 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb); */ bool sbitmap_any_bit_clear(const struct sbitmap *sb); +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** * sbitmap_for_each_set() - Iterate over each set bit in a sbitmap. + * @off: Where to start the iteration * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. @@ -222,35 +226,57 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. 
*/ -static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, - void *data) +static inline void __sbitmap_for_each_set(struct sbitmap *sb, + unsigned int off, + sb_for_each_fn fn, void *data) { - unsigned int i; + unsigned int index = SB_NR_TO_INDEX(sb, off); + unsigned int nr = SB_NR_TO_BIT(sb, off); + unsigned int scanned = 0; - for (i = 0; i < sb->map_nr; i++) { - struct sbitmap_word *word = >map[i]; - unsigned int off, nr; + while (1) { + struct sbitmap_word *word = >map[index]; + unsigned int depth = min_t(unsigned int, word->depth - nr, + sb->depth - scanned); + scanned += depth; if (!word->word) - continue; + goto next; - nr = 0; - off = i << sb->shift; + depth += nr; + off = index << sb->shift; while (1) { - nr = find_next_bit(>word, word->depth, nr); - if (nr >= word->depth) + nr = find_next_bit(>word, depth, nr); + if (nr >= depth) break; - if (!fn(sb, off + nr, data)) return; nr++; } + next: + if (scanned >= sb->depth) + break; + nr = 0; + if (++index >= sb->map_nr) + index = 0; } } -#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) -#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) +/** + * sbitmap_for_each_set() - Iterate over each set bit in a sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + * + * This is inline even though it's non-trivial so that the function calls to the + * callback will hopefully get optimized away. + */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + __sbitmap_for_each_set(sb, 0, fn, data); +} static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) -- 2.9.5
[PATCH V4 01/14] blk-mq-sched: fix scheduler bad performance
When hw queue is busy, we shouldn't take requests from scheduler queue any more, otherwise it is difficult to do IO merge. This patch fixes the awful IO performance on some SCSI devices(lpfc, qla2xxx, ...) when mq-deadline/kyber is used by not taking requests if hw queue is busy. Reviewed-by: Bart Van AsscheSigned-off-by: Ming Lei --- block/blk-mq-sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4ab69435708c..845e5baf8af1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; - bool did_work = false; + bool do_sched_dispatch = true; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -125,7 +125,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(_list)) { blk_mq_sched_mark_restart_hctx(hctx); - did_work = blk_mq_dispatch_rq_list(q, _list); + do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); } else if (!has_sched_dispatch) { blk_mq_flush_busy_ctxs(hctx, _list); blk_mq_dispatch_rq_list(q, _list); @@ -136,7 +136,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) * on the dispatch list, OR if we did have work but weren't able * to make progress. */ - if (!did_work && has_sched_dispatch) { + if (do_sched_dispatch && has_sched_dispatch) { do { struct request *rq; -- 2.9.5
Re: [PATCH V3 0/8] block/scsi: safe SCSI quiescing
Again, Tested-by: Oleksandr NatalenkoOn sobota 2. září 2017 15:08:32 CEST Ming Lei wrote: > Hi, > > The current SCSI quiesce isn't safe and easy to trigger I/O deadlock. > > Once SCSI device is put into QUIESCE, no new request except for RQF_PREEMPT > can be dispatched to SCSI successfully, and scsi_device_quiesce() just > simply waits for completion of I/Os dispatched to SCSI stack. It isn't > enough at all. > > Because new request still can be allocated, but all the allocated > requests can't be dispatched successfully, so request pool can be > consumed up easily. > > Then request with RQF_PREEMPT can't be allocated, and system may > hang forever, such as during system suspend or SCSI domain alidation. > > Both IO hang inside system suspend[1] or SCSI domain validation > were reported before. > > This patch introduces preempt freez, and tries to solve the issue > by preempt freezing block queue during SCSI quiesce, and allows > to allocate request of RQF_PREEMPT when queue is preempt-frozen. > > Both SCSI and SCSI_MQ have this IO deadlock issue, this patch fixes > them all by introducing blk_freeze_queue_preempt() and > blk_unfreeze_queue_preempt(), also unifying current interfaces for > freezing queue between block legacy and blk-mq. > > Oleksandr has verified that this patchset V2 fixes his I/O hang > during suspend/resume cycle. 
> > V3: > - introduce q->preempt_unfreezing to fix one bug of preempt freeze > - call blk_queue_enter_live() only when queue is preempt frozen > - cleanup a bit on the implementation of preempt freeze > - only patch 6 and 7 are changed > > V2: > - drop the 1st patch in V1 because percpu_ref_is_dying() is > enough as pointed by Tejun > - introduce preempt version of blk_[freeze|unfreeze]_queue > - sync between preempt freeze and normal freeze > - fix warning from percpu-refcount as reported by Oleksandr > > > [1] https://marc.info/?t=150340250100013=3=2 > > > > Ming Lei (8): > blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue > blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue > blk-mq: only run hw queues for blk-mq > blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait > block: tracking request allocation with q_usage_counter > block: introduce preempt version of blk_[freeze|unfreeze]_queue > block: allow to allocate req with REQF_PREEMPT when queue is preempt > frozen > SCSI: preempt freeze block queue when SCSI device is put into quiesce > > block/bfq-iosched.c | 2 +- > block/blk-cgroup.c | 8 +-- > block/blk-core.c | 53 --- > block/blk-mq.c | 170 > +++ block/blk-mq.h | > 1 - > block/blk.h | 17 + > block/elevator.c | 4 +- > drivers/block/loop.c | 16 ++--- > drivers/block/rbd.c | 2 +- > drivers/nvme/host/core.c | 8 +-- > drivers/scsi/scsi_lib.c | 22 +- > include/linux/blk-mq.h | 15 +++-- > include/linux/blkdev.h | 21 +- > 13 files changed, 273 insertions(+), 66 deletions(-)
Re: [PATCH V3 7/8] block: allow to allocate req with REQF_PREEMPT when queue is preempt frozen
On Sat, Sep 02, 2017 at 09:08:39PM +0800, Ming Lei wrote: > REQF_PREEMPT is a bit special because the request is required > to be dispatched to lld even when SCSI device is quiesced. > > So this patch introduces __blk_get_request() to allow block > layer to allocate request when queue is preempt frozen, since we > will preempt freeze queue before quiescing SCSI device in the > following patch for supporting safe SCSI quiescing. > > Signed-off-by: Ming Lei> --- > block/blk-core.c | 28 > block/blk-mq.c | 14 -- > include/linux/blk-mq.h | 7 --- > include/linux/blkdev.h | 17 +++-- > 4 files changed, 51 insertions(+), 15 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 2549b0a0535d..f7a6fbb87dea 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1404,7 +1404,8 @@ static struct request *get_request(struct request_queue > *q, unsigned int op, > } > > static struct request *blk_old_get_request(struct request_queue *q, > -unsigned int op, gfp_t gfp_mask) > +unsigned int op, gfp_t gfp_mask, > +unsigned int flags) > { > struct request *rq; > int ret = 0; > @@ -1414,9 +1415,20 @@ static struct request *blk_old_get_request(struct > request_queue *q, > /* create ioc upfront */ > create_io_context(gfp_mask, q->node); > > - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); > + /* > + * We need to allocate req of REQF_PREEMPT in preempt freezing. > + * No normal freezing can be started when preempt freezing > + * is in-progress, and queue dying is checked before starting > + * preempt freezing, so it is safe to use blk_queue_enter_live() > + * in case of preempt freezing. 
> + */ > + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) > + blk_queue_enter_live(q); > + else > + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); > if (ret) > return ERR_PTR(ret); > + > spin_lock_irq(q->queue_lock); > rq = get_request(q, op, NULL, gfp_mask); > if (IS_ERR(rq)) { > @@ -1432,26 +1444,26 @@ static struct request *blk_old_get_request(struct > request_queue *q, > return rq; > } > > -struct request *blk_get_request(struct request_queue *q, unsigned int op, > - gfp_t gfp_mask) > +struct request *__blk_get_request(struct request_queue *q, unsigned int op, > + gfp_t gfp_mask, unsigned int flags) > { > struct request *req; > > if (q->mq_ops) { > req = blk_mq_alloc_request(q, op, > - (gfp_mask & __GFP_DIRECT_RECLAIM) ? > - 0 : BLK_MQ_REQ_NOWAIT); > + flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ? > + 0 : BLK_MQ_REQ_NOWAIT)); > if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) > q->mq_ops->initialize_rq_fn(req); > } else { > - req = blk_old_get_request(q, op, gfp_mask); > + req = blk_old_get_request(q, op, gfp_mask, flags); > if (!IS_ERR(req) && q->initialize_rq_fn) > q->initialize_rq_fn(req); > } > > return req; > } > -EXPORT_SYMBOL(blk_get_request); > +EXPORT_SYMBOL(__blk_get_request); > > /** > * blk_requeue_request - put a request back on queue > diff --git a/block/blk-mq.c b/block/blk-mq.c > index 54b8d8b9f40e..e81001d1da27 100644 > --- a/block/blk-mq.c > +++ b/block/blk-mq.c > @@ -496,9 +496,19 @@ struct request *blk_mq_alloc_request(struct > request_queue *q, unsigned int op, > { > struct blk_mq_alloc_data alloc_data = { .flags = flags }; > struct request *rq; > - int ret; > + int ret = 0; > > - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); > + /* > + * We need to allocate req of REQF_PREEMPT in preempt freezing. 
> + * No normal freezing can be started when preempt freezing > + * is in-progress, and queue dying is checked before starting > + * preempt freezing, so it is safe to use blk_queue_enter_live() > + * in case of preempt freezing. > + */ > + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) > + blk_queue_enter_live(q); > + else > + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); > if (ret) > return ERR_PTR(ret); > > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index 5ae8c82d6273..596f433eb54c 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -200,9 +200,10 @@ void blk_mq_free_request(struct request *rq); > bool blk_mq_can_queue(struct blk_mq_hw_ctx *); > > enum { > - BLK_MQ_REQ_NOWAIT = (1
[PATCH V3 6/8] block: introduce preempt version of blk_[freeze|unfreeze]_queue
The two APIs are required to allow request allocation of RQF_PREEMPT when queue is preempt frozen. The following two points have to be guaranteed for one queue: 1) preempt freezing can be started only after all in-progress normal & preempt freezings are completed 2) normal freezing can be started only if in-progress preempt freezing is completed Because for normal freezing, once blk_mq_freeze_queue_wait() is returned, we have to make sure no request is entering queue any more. rwsem should have been perfect for this kind of sync, but we need to support nested normal freeze, so spin_lock and normal_freezing & preempt_freezing flag are used for the sync between normal freeze and preempt freeze. Signed-off-by: Ming Lei--- block/blk-core.c | 2 + block/blk-mq.c | 120 +++-- block/blk.h| 16 +++ include/linux/blk-mq.h | 2 + include/linux/blkdev.h | 4 ++ 5 files changed, 141 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 85b15833a7a5..2549b0a0535d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -899,6 +899,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (blkcg_init_queue(q)) goto fail_ref; + spin_lock_init(>freeze_lock); + return q; fail_ref: diff --git a/block/blk-mq.c b/block/blk-mq.c index 24de78afbe9a..54b8d8b9f40e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -118,16 +118,75 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, ); } -void blk_freeze_queue_start(struct request_queue *q) +static bool queue_freeze_is_over(struct request_queue *q, bool preempt) +{ + /* +* For preempt freeze, we simply call blk_queue_enter_live() +* before allocating one request of RQF_PREEMPT, so we have +* to check if queue is dead, otherwise we may hang on dead +* queue. +* +* For normal freeze, no need to check blk_queue_dying() +* because it is checked in blk_queue_enter(). 
+*/ + if (preempt) + return !(q->normal_freezing + q->preempt_freezing) || + blk_queue_dying(q); + return !q->preempt_freezing; +} + +static bool __blk_freeze_queue_start(struct request_queue *q, bool preempt) { int freeze_depth; + bool start_freeze = true; + + /* +* Wait for completion of another kind of freezing. +* +* We have to sync between normal freeze and preempt +* freeze. preempt freeze can only be started iff all +* pending normal & preempt freezing are completed, +* meantime normal freeze can be started only if there +* isn't pending preempt freezing. +* +* rwsem should have been perfect for this kind of sync, +* but we need to support nested normal freeze, so use +* spin_lock with two flag for syncing between normal +* freeze and preempt freeze. +*/ + spin_lock(>freeze_lock); + wait_event_cmd(q->mq_freeze_wq, + queue_freeze_is_over(q, preempt), + spin_unlock(>freeze_lock), + spin_lock(>freeze_lock)); + + if (preempt && blk_queue_dying(q)) { + start_freeze = false; + goto unlock; + } freeze_depth = atomic_inc_return(>mq_freeze_depth); if (freeze_depth == 1) { + if (preempt) { + q->preempt_freezing = 1; + q->preempt_unfreezing = 0; + } else + q->normal_freezing = 1; + spin_unlock(>freeze_lock); + percpu_ref_kill(>q_usage_counter); if (q->mq_ops) blk_mq_run_hw_queues(q, false); - } + } else + unlock: + spin_unlock(>freeze_lock); + + return start_freeze; +} + +void blk_freeze_queue_start(struct request_queue *q) +{ + __blk_freeze_queue_start(q, false); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -166,7 +225,7 @@ void blk_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_freeze_queue); -void blk_unfreeze_queue(struct request_queue *q) +static void __blk_unfreeze_queue(struct request_queue *q, bool preempt) { int freeze_depth; @@ -174,12 +233,67 @@ void blk_unfreeze_queue(struct request_queue *q) WARN_ON_ONCE(freeze_depth < 0); if (!freeze_depth) { percpu_ref_reinit(>q_usage_counter); + + /* +* clearing the freeze flag so that any pending +* 
freeze can move on +*/ + spin_lock(>freeze_lock); + if (preempt) + q->preempt_freezing = 0; + else +
[PATCH V3 8/8] SCSI: preempt freeze block queue when SCSI device is put into quiesce
Simply quiescing the SCSI device and waiting for completion of IO dispatched to SCSI queue isn't safe, it is easy to use up requests because all these allocated requests can't be dispatched when device is put in QUIESCE. Then no request can be allocated for RQF_PREEMPT, and system may hang somewhere, such as when sending commands of sync_cache or start_stop during system suspend path. Before quiescing SCSI, this patch freezes block queue in preempt mode first, so no new normal request can enter queue any more, and all pending requests are drained too once blk_freeze_queue_preempt is returned. And only RQF_PREEMPT can be allocated in preempt freeze. This patch also uses __blk_get_request() for allocating request with RQF_PREEMPT, so that the allocation can succeed even though block queue is preempt frozen. Signed-off-by: Ming Lei--- drivers/scsi/scsi_lib.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index f6097b89d5d3..e1ad135cb209 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -243,10 +243,12 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, struct request *req; struct scsi_request *rq; int ret = DRIVER_ERROR << 24; + unsigned flag = sdev->sdev_state == SDEV_QUIESCE ? BLK_REQ_PREEMPT : 0; - req = blk_get_request(sdev->request_queue, + req = __blk_get_request(sdev->request_queue, data_direction == DMA_TO_DEVICE ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM, + flag); if (IS_ERR(req)) return ret; rq = scsi_req(req); @@ -2890,6 +2892,20 @@ scsi_device_quiesce(struct scsi_device *sdev) { int err; + /* +* Simply quiesing SCSI device isn't safe, it is easy +* to use up requests because all these allocated requests +* can't be dispatched when device is put in QIUESCE. +* Then no request can be allocated and we may hang +* somewhere, such as system suspend/resume. 
+* +* So we freeze block queue in preempt mode first, no new +* normal request can enter queue any more, and all pending +* requests are drained once blk_freeze_queue is returned. +* Only RQF_PREEMPT is allowed in preempt freeze. +*/ + blk_freeze_queue_preempt(sdev->request_queue); + mutex_lock(>state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); mutex_unlock(>state_mutex); @@ -2926,6 +2942,8 @@ void scsi_device_resume(struct scsi_device *sdev) scsi_device_set_state(sdev, SDEV_RUNNING) == 0) scsi_run_queue(sdev->request_queue); mutex_unlock(>state_mutex); + + blk_unfreeze_queue_preempt(sdev->request_queue); } EXPORT_SYMBOL(scsi_device_resume); -- 2.9.5
[PATCH V3 5/8] block: tracking request allocation with q_usage_counter
This usage is basically the same as blk-mq, so that we can support freezing the queue easily. Signed-off-by: Ming Lei--- block/blk-core.c | 8 1 file changed, 8 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index ce2d3b6f6c62..85b15833a7a5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1405,16 +1405,21 @@ static struct request *blk_old_get_request(struct request_queue *q, unsigned int op, gfp_t gfp_mask) { struct request *rq; + int ret = 0; WARN_ON_ONCE(q->mq_ops); /* create ioc upfront */ create_io_context(gfp_mask, q->node); + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); + if (ret) + return ERR_PTR(ret); spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); + blk_queue_exit(q); return rq; } @@ -1586,6 +1591,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) blk_free_request(rl, req); freed_request(rl, sync, rq_flags); blk_put_rl(rl); + blk_queue_exit(q); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1867,8 +1873,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ + blk_queue_enter_live(q); req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + blk_queue_exit(q); __wbt_done(q->rq_wb, wb_acct); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; -- 2.9.5
[PATCH V3 4/8] blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait
The only change on legacy is that blk_drain_queue() is run from blk_freeze_queue(), which is called in blk_cleanup_queue(). So this patch removes the explicite __blk_drain_queue() in blk_cleanup_queue(). Signed-off-by: Ming Lei--- block/blk-core.c | 17 +++-- block/blk-mq.c | 8 +--- block/blk.h | 1 + drivers/nvme/host/core.c | 2 +- include/linux/blk-mq.h | 2 +- 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index d579501f24ba..ce2d3b6f6c62 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -530,6 +530,21 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) } /** + * blk_drain_queue - drain requests from request_queue + * @q: queue to drain + * + * Drain requests from @q. All pending requests are drained. + * The caller is responsible for ensuring that no new requests + * which need to be drained are queued. + */ +void blk_drain_queue(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + __blk_drain_queue(q, true); + spin_unlock_irq(q->queue_lock); +} + +/** * blk_queue_bypass_start - enter queue bypass mode * @q: queue of interest * @@ -653,8 +668,6 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); spin_lock_irq(lock); - if (!q->mq_ops) - __blk_drain_queue(q, true); queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c532d8612e1..24de78afbe9a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -131,11 +131,13 @@ void blk_freeze_queue_start(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); -void blk_mq_freeze_queue_wait(struct request_queue *q) +void blk_freeze_queue_wait(struct request_queue *q) { + if (!q->mq_ops) + blk_drain_queue(q); wait_event(q->mq_freeze_wq, percpu_ref_is_zero(>q_usage_counter)); } -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); +EXPORT_SYMBOL_GPL(blk_freeze_queue_wait); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout) @@ 
-160,7 +162,7 @@ void blk_freeze_queue(struct request_queue *q) * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); - blk_mq_freeze_queue_wait(q); + blk_freeze_queue_wait(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue); diff --git a/block/blk.h b/block/blk.h index 6847c5435cca..242486e26a81 100644 --- a/block/blk.h +++ b/block/blk.h @@ -64,6 +64,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); void blk_queue_bypass_end(struct request_queue *q); +void blk_drain_queue(struct request_queue *q); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 986f2b4f9760..d34a9ffaa940 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2778,7 +2778,7 @@ void nvme_wait_freeze(struct nvme_ctrl *ctrl) mutex_lock(>namespaces_mutex); list_for_each_entry(ns, >namespaces, list) - blk_mq_freeze_queue_wait(ns->queue); + blk_freeze_queue_wait(ns->queue); mutex_unlock(>namespaces_mutex); } EXPORT_SYMBOL_GPL(nvme_wait_freeze); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8ae77e088c01..f90d78eb85df 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -259,7 +259,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, void blk_freeze_queue(struct request_queue *q); void blk_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); -void blk_mq_freeze_queue_wait(struct request_queue *q); +void blk_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, -- 2.9.5
[PATCH V3 7/8] block: allow to allocate req with REQF_PREEMPT when queue is preempt frozen
REQF_PREEMPT is a bit special because the request is required to be dispatched to lld even when SCSI device is quiesced. So this patch introduces __blk_get_request() to allow block layer to allocate request when queue is preempt frozen, since we will preempt freeze queue before quiescing SCSI device in the following patch for supporting safe SCSI quiescing. Signed-off-by: Ming Lei--- block/blk-core.c | 28 block/blk-mq.c | 14 -- include/linux/blk-mq.h | 7 --- include/linux/blkdev.h | 17 +++-- 4 files changed, 51 insertions(+), 15 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2549b0a0535d..f7a6fbb87dea 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1404,7 +1404,8 @@ static struct request *get_request(struct request_queue *q, unsigned int op, } static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, gfp_t gfp_mask) + unsigned int op, gfp_t gfp_mask, + unsigned int flags) { struct request *rq; int ret = 0; @@ -1414,9 +1415,20 @@ static struct request *blk_old_get_request(struct request_queue *q, /* create ioc upfront */ create_io_context(gfp_mask, q->node); - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); + /* +* We need to allocate req of REQF_PREEMPT in preempt freezing. +* No normal freezing can be started when preempt freezing +* is in-progress, and queue dying is checked before starting +* preempt freezing, so it is safe to use blk_queue_enter_live() +* in case of preempt freezing. 
+*/ + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) + blk_queue_enter_live(q); + else + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM)); if (ret) return ERR_PTR(ret); + spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, gfp_mask); if (IS_ERR(rq)) { @@ -1432,26 +1444,26 @@ static struct request *blk_old_get_request(struct request_queue *q, return rq; } -struct request *blk_get_request(struct request_queue *q, unsigned int op, - gfp_t gfp_mask) +struct request *__blk_get_request(struct request_queue *q, unsigned int op, + gfp_t gfp_mask, unsigned int flags) { struct request *req; if (q->mq_ops) { req = blk_mq_alloc_request(q, op, - (gfp_mask & __GFP_DIRECT_RECLAIM) ? - 0 : BLK_MQ_REQ_NOWAIT); + flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ? + 0 : BLK_MQ_REQ_NOWAIT)); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { - req = blk_old_get_request(q, op, gfp_mask); + req = blk_old_get_request(q, op, gfp_mask, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } -EXPORT_SYMBOL(blk_get_request); +EXPORT_SYMBOL(__blk_get_request); /** * blk_requeue_request - put a request back on queue diff --git a/block/blk-mq.c b/block/blk-mq.c index 54b8d8b9f40e..e81001d1da27 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -496,9 +496,19 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, { struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - int ret; + int ret = 0; - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); + /* +* We need to allocate req of REQF_PREEMPT in preempt freezing. +* No normal freezing can be started when preempt freezing +* is in-progress, and queue dying is checked before starting +* preempt freezing, so it is safe to use blk_queue_enter_live() +* in case of preempt freezing. 
+*/ + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q)) + blk_queue_enter_live(q); + else + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5ae8c82d6273..596f433eb54c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -200,9 +200,10 @@ void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); enum { - BLK_MQ_REQ_NOWAIT = (1 << 0), /* return when out of requests */ - BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */ - BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate
[PATCH V3 3/8] blk-mq: only run hw queues for blk-mq
This patch just makes it explicit. Reviewed-by: Johannes ThumshirnSigned-off-by: Ming Lei --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8cf1f7cbef2b..4c532d8612e1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -125,7 +125,8 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(>mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(>q_usage_counter); - blk_mq_run_hw_queues(q, false); + if (q->mq_ops) + blk_mq_run_hw_queues(q, false); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); -- 2.9.5
[PATCH V3 0/8] block/scsi: safe SCSI quiescing
Hi, The current SCSI quiesce isn't safe and it is easy to trigger I/O deadlock. Once SCSI device is put into QUIESCE, no new request except for RQF_PREEMPT can be dispatched to SCSI successfully, and scsi_device_quiesce() just simply waits for completion of I/Os dispatched to SCSI stack. It isn't enough at all. Because new request still can be allocated, but all the allocated requests can't be dispatched successfully, so request pool can be consumed up easily. Then request with RQF_PREEMPT can't be allocated, and system may hang forever, such as during system suspend or SCSI domain validation. Both IO hang inside system suspend[1] or SCSI domain validation were reported before. This patch introduces preempt freeze, and tries to solve the issue by preempt freezing block queue during SCSI quiesce, and allows to allocate request of RQF_PREEMPT when queue is preempt-frozen. Both SCSI and SCSI_MQ have this IO deadlock issue, this patch fixes them all by introducing blk_freeze_queue_preempt() and blk_unfreeze_queue_preempt(), also unifying current interfaces for freezing queue between block legacy and blk-mq. Oleksandr has verified that this patchset V2 fixes his I/O hang during suspend/resume cycle. 
V3: - introduce q->preempt_unfreezing to fix one bug of preempt freeze - call blk_queue_enter_live() only when queue is preempt frozen - cleanup a bit on the implementation of preempt freeze - only patch 6 and 7 are changed V2: - drop the 1st patch in V1 because percpu_ref_is_dying() is enough as pointed by Tejun - introduce preempt version of blk_[freeze|unfreeze]_queue - sync between preempt freeze and normal freeze - fix warning from percpu-refcount as reported by Oleksandr [1] https://marc.info/?t=150340250100013=3=2 Ming Lei (8): blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue blk-mq: only run hw queues for blk-mq blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait block: tracking request allocation with q_usage_counter block: introduce preempt version of blk_[freeze|unfreeze]_queue block: allow to allocate req with REQF_PREEMPT when queue is preempt frozen SCSI: preempt freeze block queue when SCSI device is put into quiesce block/bfq-iosched.c | 2 +- block/blk-cgroup.c | 8 +-- block/blk-core.c | 53 --- block/blk-mq.c | 170 +++ block/blk-mq.h | 1 - block/blk.h | 17 + block/elevator.c | 4 +- drivers/block/loop.c | 16 ++--- drivers/block/rbd.c | 2 +- drivers/nvme/host/core.c | 8 +-- drivers/scsi/scsi_lib.c | 22 +- include/linux/blk-mq.h | 15 +++-- include/linux/blkdev.h | 21 +- 13 files changed, 273 insertions(+), 66 deletions(-) -- 2.9.5
[PATCH V3 1/8] blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue
We will support to freeze queue on block legacy path too. Signed-off-by: Ming Lei--- block/blk-cgroup.c | 4 ++-- block/blk-mq.c | 10 +- block/elevator.c | 2 +- drivers/block/loop.c | 8 drivers/nvme/host/core.c | 4 ++-- include/linux/blk-mq.h | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0480892e97e5..02e8a47ac77c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1337,7 +1337,7 @@ int blkcg_activate_policy(struct request_queue *q, spin_unlock_irq(q->queue_lock); out_bypass_end: if (q->mq_ops) - blk_mq_unfreeze_queue(q); + blk_unfreeze_queue(q); else blk_queue_bypass_end(q); if (pd_prealloc) @@ -1388,7 +1388,7 @@ void blkcg_deactivate_policy(struct request_queue *q, spin_unlock_irq(q->queue_lock); if (q->mq_ops) - blk_mq_unfreeze_queue(q); + blk_unfreeze_queue(q); else blk_queue_bypass_end(q); } diff --git a/block/blk-mq.c b/block/blk-mq.c index d935f15c54da..82136e83951d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -172,7 +172,7 @@ void blk_mq_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void blk_mq_unfreeze_queue(struct request_queue *q) +void blk_unfreeze_queue(struct request_queue *q) { int freeze_depth; @@ -183,7 +183,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q) wake_up_all(>mq_freeze_wq); } } -EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +EXPORT_SYMBOL_GPL(blk_unfreeze_queue); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the @@ -2250,7 +2250,7 @@ static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, list_for_each_entry(q, >tag_list, tag_set_list) { blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); - blk_mq_unfreeze_queue(q); + blk_unfreeze_queue(q); } } @@ -2708,7 +2708,7 @@ static int __blk_mq_update_nr_requests(struct request_queue *q, if (!ret) q->nr_requests = nr; - blk_mq_unfreeze_queue(q); + blk_unfreeze_queue(q); return ret; } @@ -2757,7 +2757,7 @@ static void 
__blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, } list_for_each_entry(q, >tag_list, tag_set_list) - blk_mq_unfreeze_queue(q); + blk_unfreeze_queue(q); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) diff --git a/block/elevator.c b/block/elevator.c index 0e465809d3f3..371c8165c9e8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -994,7 +994,7 @@ static int elevator_switch_mq(struct request_queue *q, blk_add_trace_msg(q, "elv switch: none"); out: - blk_mq_unfreeze_queue(q); + blk_unfreeze_queue(q); return ret; } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2fbd4089c20e..5c11ea44d470 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -217,7 +217,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; else lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - blk_mq_unfreeze_queue(lo->lo_queue); + blk_unfreeze_queue(lo->lo_queue); } static int @@ -605,7 +605,7 @@ static int loop_switch(struct loop_device *lo, struct file *file) do_loop_switch(lo, ); /* unfreeze */ - blk_mq_unfreeze_queue(lo->lo_queue); + blk_unfreeze_queue(lo->lo_queue); return 0; } @@ -1079,7 +1079,7 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. 
*/ module_put(THIS_MODULE); - blk_mq_unfreeze_queue(lo->lo_queue); + blk_unfreeze_queue(lo->lo_queue); if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) loop_reread_partitions(lo, bdev); @@ -1191,7 +1191,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) __loop_update_dio(lo, lo->use_dio); exit: - blk_mq_unfreeze_queue(lo->lo_queue); + blk_unfreeze_queue(lo->lo_queue); if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 37046ac2c441..5c76b0a96be2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1226,7 +1226,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ctrl->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); - blk_mq_unfreeze_queue(disk->queue); + blk_unfreeze_queue(disk->queue); } static int nvme_revalidate_disk(struct gendisk
[PATCH V3 2/8] blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue
This APIs will be used by legacy path too. Signed-off-by: Ming Lei--- block/bfq-iosched.c | 2 +- block/blk-cgroup.c | 4 ++-- block/blk-mq.c | 17 - block/blk-mq.h | 1 - block/elevator.c | 2 +- drivers/block/loop.c | 8 drivers/block/rbd.c | 2 +- drivers/nvme/host/core.c | 2 +- include/linux/blk-mq.h | 2 +- 9 files changed, 15 insertions(+), 25 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 509f39998011..ce2b00e897e2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4757,7 +4757,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * The invocation of the next bfq_create_group_hierarchy * function is the head of a chain of function calls * (bfq_create_group_hierarchy->blkcg_activate_policy-> -* blk_mq_freeze_queue) that may lead to the invocation of the +* blk_freeze_queue) that may lead to the invocation of the * has_work hook function. For this reason, * bfq_create_group_hierarchy is invoked only after all * scheduler data has been initialized, apart from the fields diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 02e8a47ac77c..87c15f3947d5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1296,7 +1296,7 @@ int blkcg_activate_policy(struct request_queue *q, return 0; if (q->mq_ops) - blk_mq_freeze_queue(q); + blk_freeze_queue(q); else blk_queue_bypass_start(q); pd_prealloc: @@ -1363,7 +1363,7 @@ void blkcg_deactivate_policy(struct request_queue *q, return; if (q->mq_ops) - blk_mq_freeze_queue(q); + blk_freeze_queue(q); else blk_queue_bypass_start(q); diff --git a/block/blk-mq.c b/block/blk-mq.c index 82136e83951d..8cf1f7cbef2b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -161,16 +161,7 @@ void blk_freeze_queue(struct request_queue *q) blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } - -void blk_mq_freeze_queue(struct request_queue *q) -{ - /* -* ...just an alias to keep freeze and unfreeze actions balanced -* in the blk_mq_* namespace -*/ - blk_freeze_queue(q); -} 
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); +EXPORT_SYMBOL_GPL(blk_freeze_queue); void blk_unfreeze_queue(struct request_queue *q) { @@ -2248,7 +2239,7 @@ static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, lockdep_assert_held(>tag_list_lock); list_for_each_entry(q, >tag_list, tag_set_list) { - blk_mq_freeze_queue(q); + blk_freeze_queue(q); queue_set_hctx_shared(q, shared); blk_unfreeze_queue(q); } @@ -2683,7 +2674,7 @@ static int __blk_mq_update_nr_requests(struct request_queue *q, if (!set) return -EINVAL; - blk_mq_freeze_queue(q); + blk_freeze_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { @@ -2747,7 +2738,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, return; list_for_each_entry(q, >tag_list, tag_set_list) - blk_mq_freeze_queue(q); + blk_freeze_queue(q); set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); diff --git a/block/blk-mq.h b/block/blk-mq.h index 1b9742eb7399..7ce29ef1e6f3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -30,7 +30,6 @@ struct blk_mq_ctx { } cacheline_aligned_in_smp; void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 371c8165c9e8..1164c8a3720f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -967,7 +967,7 @@ static int elevator_switch_mq(struct request_queue *q, { int ret; - blk_mq_freeze_queue(q); + blk_freeze_queue(q); if (q->elevator) { if (q->elevator->registered) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5c11ea44d470..b2e708b7e1e6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -211,7 +211,7 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup * will get updated by 
ioctl(LOOP_GET_STATUS) */ - blk_mq_freeze_queue(lo->lo_queue); + blk_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; if (use_dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; @@ -599,7 +599,7 @@ static int loop_switch(struct loop_device *lo, struct
Re: [PATCH V2 0/8] block/scsi: safe SCSI quiescing
With regard to suspend/resume cycle: Tested-by: Oleksandr Natalenko

On pátek 1. září 2017 20:49:49 CEST Ming Lei wrote: > Hi, > > The current SCSI quiesce isn't safe and easy to trigger I/O deadlock. > > Once SCSI device is put into QUIESCE, no new request except for RQF_PREEMPT > can be dispatched to SCSI successfully, and scsi_device_quiesce() just > simply waits for completion of I/Os dispatched to SCSI stack. It isn't > enough at all. > > Because new request still can be allocated, but all the allocated > requests can't be dispatched successfully, so request pool can be > consumed up easily. > > Then request with RQF_PREEMPT can't be allocated, and system may > hang forever, such as during system suspend or SCSI domain validation. > > Both IO hang inside system suspend[1] or SCSI domain validation > were reported before. > > This patch tries to solve the issue by freezing block queue during > SCSI quiescing, and allowing to allocate request of RQF_PREEMPT > when queue is frozen. > > Both SCSI and SCSI_MQ have this IO deadlock issue, this patch fixes > them all by introducing preempt version of blk_freeze_queue() and > blk_unfreeze_queue(). 
> > V2: > - drop the 1st patch in V1 because percpu_ref_is_dying() is > enough as pointed by Tejun > > - introduce preempt version of blk_[freeze|unfreeze]_queue > > - sync between preempt freeze and normal freeze > > - fix warning from percpu-refcount as reported by Oleksandr > > > [1] https://marc.info/?t=150340250100013=3=2 > > > > Ming Lei (8): > blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue > blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue > blk-mq: only run hw queues for blk-mq > blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait > block: tracking request allocation with q_usage_counter > block: allow to allocate req with REQF_PREEMPT when queue is frozen > block: introduce preempt version of blk_[freeze|unfreeze]_queue > SCSI: freeze block queue when SCSI device is put into quiesce > > block/bfq-iosched.c | 2 +- > block/blk-cgroup.c | 8 ++-- > block/blk-core.c | 50 > block/blk-mq.c | 119 > --- block/blk-mq.h | > 1 - > block/blk.h | 6 +++ > block/elevator.c | 4 +- > drivers/block/loop.c | 16 +++ > drivers/block/rbd.c | 2 +- > drivers/nvme/host/core.c | 8 ++-- > drivers/scsi/scsi_lib.c | 21 - > include/linux/blk-mq.h | 15 +++--- > include/linux/blkdev.h | 20 +++- > 13 files changed, 206 insertions(+), 66 deletions(-)