[PATCH V4 14/14] blk-mq: improve bio merge from blk-mq sw queue

2017-09-02 Thread Ming Lei
This patch uses a hash table to do bio merge from the sw queue, so
that we align with the way the blk-mq scheduler and the legacy block
layer handle bio merging.

It turns out that bio merge via a hash table is more efficient than a
simple merge against the last 8 requests in the sw queue. On SCSI SRP,
a ~10% IOPS increase is observed in a sequential I/O test with this
patch.

It is also one step towards a real 'none' scheduler, which would let
the blk-mq scheduler framework become cleaner.
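For readers new to the approach, here is a minimal user-space sketch of
the lookup idea (toy_rq, NBUCKETS and rq_key are illustrative names,
not the kernel API): requests are hashed by their end sector, so a
back-merge candidate for a bio starting at sector S is found with one
bucket lookup instead of scanning the last 8 entries of the sw queue.

	/* toy model of hash-keyed back-merge lookup; illustrative only */
	#include <stdio.h>

	#define NBUCKETS 64

	struct toy_rq {
		unsigned long long pos;		/* start sector */
		unsigned long long sectors;	/* length in sectors */
		struct toy_rq *next;		/* bucket chain */
	};

	static struct toy_rq *buckets[NBUCKETS];

	/* same key idea as the kernel helpers: end sector of the request */
	static unsigned long long rq_key(struct toy_rq *rq)
	{
		return rq->pos + rq->sectors;
	}

	static void rq_hash_add(struct toy_rq *rq)
	{
		unsigned idx = rq_key(rq) % NBUCKETS;

		rq->next = buckets[idx];
		buckets[idx] = rq;
	}

	/* find a request ending exactly where the new bio starts */
	static struct toy_rq *rq_hash_find(unsigned long long bio_start)
	{
		struct toy_rq *rq;

		for (rq = buckets[bio_start % NBUCKETS]; rq; rq = rq->next)
			if (rq_key(rq) == bio_start)
				return rq;
		return NULL;
	}

	int main(void)
	{
		struct toy_rq a = { .pos = 100, .sectors = 8 };

		rq_hash_add(&a);
		/* a bio starting at sector 108 can be back-merged into 'a' */
		printf("back-merge candidate: %p\n", (void *)rq_hash_find(108));
		return 0;
	}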

Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 49 -
 block/blk-mq.c   | 28 +---
 2 files changed, 37 insertions(+), 40 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index afa7d9a258e4..df41cba49866 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -266,50 +266,25 @@ bool blk_mq_sched_try_merge(struct request_queue *q, 
struct bio *bio,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 
-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
+static bool blk_mq_ctx_try_merge(struct request_queue *q,
 struct blk_mq_ctx *ctx, struct bio *bio)
 {
-   struct request *rq;
-   int checked = 8;
+   struct request *rq, *free = NULL;
+   enum elv_merge type;
+   bool merged;
 
	lockdep_assert_held(&ctx->lock);
 
-	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
-   bool merged = false;
-
-   if (!checked--)
-   break;
-
-   if (!blk_rq_merge_ok(rq, bio))
-   continue;
+	type = elv_merge_ctx(q, &rq, bio, ctx);
+	merged = __blk_mq_try_merge(q, bio, &free, rq, type);
 
-   switch (blk_try_merge(rq, bio)) {
-   case ELEVATOR_BACK_MERGE:
-   if (blk_mq_sched_allow_merge(q, rq, bio))
-   merged = bio_attempt_back_merge(q, rq, bio);
-   break;
-   case ELEVATOR_FRONT_MERGE:
-   if (blk_mq_sched_allow_merge(q, rq, bio))
-   merged = bio_attempt_front_merge(q, rq, bio);
-   break;
-   case ELEVATOR_DISCARD_MERGE:
-   merged = bio_attempt_discard_merge(q, rq, bio);
-   break;
-   default:
-   continue;
-   }
+   if (free)
+   blk_mq_free_request(free);
 
-   if (merged)
-   ctx->rq_merged++;
-   return merged;
-   }
+   if (merged)
+   ctx->rq_merged++;
 
-   return false;
+   return merged;
 }
 
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
@@ -327,7 +302,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, 
struct bio *bio)
if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
/* default per sw-queue merge */
		spin_lock(&ctx->lock);
-   ret = blk_mq_attempt_merge(q, ctx, bio);
+   ret = blk_mq_ctx_try_merge(q, ctx, bio);
		spin_unlock(&ctx->lock);
}
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fc3d26bbfc1a..d935f15c54da 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -847,6 +847,18 @@ static void blk_mq_timeout_work(struct work_struct *work)
blk_queue_exit(q);
 }
 
+static void blk_mq_ctx_remove_rq_list(struct blk_mq_ctx *ctx,
+   struct list_head *head)
+{
+   struct request *rq;
+
+	lockdep_assert_held(&ctx->lock);
+
+   list_for_each_entry(rq, head, queuelist)
+   rqhash_del(rq);
+   ctx->last_merge = NULL;
+}
+
 struct flush_busy_ctx_data {
struct blk_mq_hw_ctx *hctx;
struct list_head *list;
@@ -861,6 +873,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int 
bitnr, void *data)
sbitmap_clear_bit(sb, bitnr);
	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_list, flush_data->list);
+	blk_mq_ctx_remove_rq_list(ctx, flush_data->list);
	spin_unlock(&ctx->lock);
return true;
 }
@@ -890,17 +903,23 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, 
unsigned int bitnr, void *d
struct dispatch_rq_data *dispatch_data = data;
struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+   struct request *rq = NULL;
 
	spin_lock(&ctx->lock);
	if (unlikely(!list_empty(&ctx->rq_list))) {
-   dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
-		list_del_init(&dispatch_data->rq->queuelist);
+   rq = list_entry_rq(ctx->rq_list.next);
+		list_del_init(&rq->queuelist);
+   rqhash_del(rq);
		if (list_empty(&ctx->rq_list))
 

[PATCH V4 12/14] block: introduce .last_merge and .hash to blk_mq_ctx

2017-09-02 Thread Ming Lei
Prepare for supporting bio merge to the sw queue when no blk-mq I/O
scheduler is in use.

Signed-off-by: Ming Lei 
---
 block/blk-mq.h   |  4 
 block/blk.h  |  3 +++
 block/elevator.c | 22 +++---
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 0277f9771fab..1b9742eb7399 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -18,6 +18,10 @@ struct blk_mq_ctx {
unsigned long   rq_dispatched[2];
unsigned long   rq_merged;
 
+   /* bio merge via request hash table */
+   struct request  *last_merge;
+   DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
+
/* incremented at completion time */
unsigned long   cacheline_aligned_in_smp rq_completed[2];
 
diff --git a/block/blk.h b/block/blk.h
index eb3436d4a73f..fa4f232afc18 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -198,6 +198,9 @@ static inline struct request *rqhash_find(struct hlist_head 
*hash, sector_t offs
return NULL;
 }
 
+enum elv_merge elv_merge_ctx(struct request_queue *q, struct request **req,
+struct bio *bio, struct blk_mq_ctx *ctx);
+
 void blk_insert_flush(struct request *rq);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
diff --git a/block/elevator.c b/block/elevator.c
index 2424aea85393..0e13e5c18982 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -471,6 +471,13 @@ enum elv_merge elv_merge(struct request_queue *q, struct 
request **req,
return __elv_merge(q, req, bio, q->elevator->hash, q->last_merge);
 }
 
+enum elv_merge elv_merge_ctx(struct request_queue *q, struct request **req,
+   struct bio *bio, struct blk_mq_ctx *ctx)
+{
+   WARN_ON_ONCE(!q->mq_ops);
+   return __elv_merge(q, req, bio, ctx->hash, ctx->last_merge);
+}
+
 /*
  * Attempt to do an insertion back merge. Only check for the case where
  * we can append 'rq' to an existing request, so we can throw 'rq' away
@@ -516,16 +523,25 @@ void elv_merged_request(struct request_queue *q, struct 
request *rq,
enum elv_merge type)
 {
struct elevator_queue *e = q->elevator;
+   struct hlist_head *hash = e->hash;
+
+   /* we do bio merge on blk-mq sw queue */
+   if (q->mq_ops && !e) {
+   rq->mq_ctx->last_merge = rq;
+   hash = rq->mq_ctx->hash;
+   goto reposition;
+   }
+
+   q->last_merge = rq;
 
if (e->uses_mq && e->type->ops.mq.request_merged)
e->type->ops.mq.request_merged(q, rq, type);
else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
+ reposition:
if (type == ELEVATOR_BACK_MERGE)
-   elv_rqhash_reposition(q, rq);
-
-   q->last_merge = rq;
+   rqhash_reposition(hash, rq);
 }
 
 void elv_merge_requests(struct request_queue *q, struct request *rq,
-- 
2.9.5



[PATCH V4 13/14] blk-mq-sched: refactor blk_mq_sched_try_merge()

2017-09-02 Thread Ming Lei
This patch introduces one function, __blk_mq_try_merge(), which will
be reused for bio merge to the sw queue in the following patch.

No functional change.

Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 1ff6f9bedd1a..afa7d9a258e4 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -228,12 +228,11 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
}
 }
 
-bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-   struct request **merged_request)
+static bool __blk_mq_try_merge(struct request_queue *q,
+   struct bio *bio, struct request **merged_request,
+   struct request *rq, enum elv_merge type)
 {
-   struct request *rq;
-
-	switch (elv_merge(q, &rq, bio)) {
+   switch (type) {
case ELEVATOR_BACK_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
@@ -256,6 +255,15 @@ bool blk_mq_sched_try_merge(struct request_queue *q, 
struct bio *bio,
return false;
}
 }
+
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
+   struct request **merged_request)
+{
+   struct request *rq;
+	enum elv_merge type = elv_merge(q, &rq, bio);
+
+   return __blk_mq_try_merge(q, bio, merged_request, rq, type);
+}
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
 
 /*
-- 
2.9.5



[PATCH V4 10/14] block: move actual bio merge code into __elv_merge

2017-09-02 Thread Ming Lei
Do this so that we can reuse __elv_merge() to merge bios into
requests from the sw queue in the following patches.

Signed-off-by: Ming Lei 
---
 block/elevator.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 824cc3e69ac3..e11c7873fc21 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -409,8 +409,9 @@ void elv_dispatch_add_tail(struct request_queue *q, struct 
request *rq)
 }
 EXPORT_SYMBOL(elv_dispatch_add_tail);
 
-enum elv_merge elv_merge(struct request_queue *q, struct request **req,
-   struct bio *bio)
+static enum elv_merge __elv_merge(struct request_queue *q,
+   struct request **req, struct bio *bio,
+   struct hlist_head *hash, struct request *last_merge)
 {
struct elevator_queue *e = q->elevator;
struct request *__rq;
@@ -427,11 +428,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct 
request **req,
/*
 * First try one-hit cache.
 */
-   if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
-   enum elv_merge ret = blk_try_merge(q->last_merge, bio);
+   if (last_merge && elv_bio_merge_ok(last_merge, bio)) {
+   enum elv_merge ret = blk_try_merge(last_merge, bio);
 
if (ret != ELEVATOR_NO_MERGE) {
-   *req = q->last_merge;
+   *req = last_merge;
return ret;
}
}
@@ -442,7 +443,7 @@ enum elv_merge elv_merge(struct request_queue *q, struct 
request **req,
/*
 * See if our hash lookup can find a potential backmerge.
 */
-   __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
+   __rq = rqhash_find(hash, bio->bi_iter.bi_sector);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
return ELEVATOR_BACK_MERGE;
@@ -456,6 +457,12 @@ enum elv_merge elv_merge(struct request_queue *q, struct 
request **req,
return ELEVATOR_NO_MERGE;
 }
 
+enum elv_merge elv_merge(struct request_queue *q, struct request **req,
+   struct bio *bio)
+{
+   return __elv_merge(q, req, bio, q->elevator->hash, q->last_merge);
+}
+
 /*
  * Attempt to do an insertion back merge. Only check for the case where
  * we can append 'rq' to an existing request, so we can throw 'rq' away
-- 
2.9.5



[PATCH V4 11/14] block: add check on elevator for supporting bio merge via hashtable from blk-mq sw queue

2017-09-02 Thread Ming Lei
blk_mq_sched_try_merge() will be reused in the following patches to
support bio merge to the blk-mq sw queue, so add checks to the related
functions that are called from blk_mq_sched_try_merge().

Signed-off-by: Ming Lei 
---
 block/elevator.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/block/elevator.c b/block/elevator.c
index e11c7873fc21..2424aea85393 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -71,6 +71,10 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
if (!blk_rq_merge_ok(rq, bio))
return false;
 
+   /* We need to support to merge bio from sw queue */
+   if (!rq->q->elevator)
+   return true;
+
if (!elv_iosched_allow_bio_merge(rq, bio))
return false;
 
@@ -449,6 +453,10 @@ static enum elv_merge __elv_merge(struct request_queue *q,
return ELEVATOR_BACK_MERGE;
}
 
+   /* no elevator when merging bio to blk-mq sw queue */
+   if (!e)
+   return ELEVATOR_NO_MERGE;
+
if (e->uses_mq && e->type->ops.mq.request_merge)
return e->type->ops.mq.request_merge(q, req, bio);
else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
@@ -711,6 +719,10 @@ struct request *elv_latter_request(struct request_queue 
*q, struct request *rq)
 {
struct elevator_queue *e = q->elevator;
 
+   /* no elevator when merging bio to blk-mq sw queue */
+   if (!e)
+   return NULL;
+
if (e->uses_mq && e->type->ops.mq.next_request)
return e->type->ops.mq.next_request(q, rq);
else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
@@ -723,6 +735,10 @@ struct request *elv_former_request(struct request_queue 
*q, struct request *rq)
 {
struct elevator_queue *e = q->elevator;
 
+   /* no elevator when merging bio to blk-mq sw queue */
+   if (!e)
+   return NULL;
+
if (e->uses_mq && e->type->ops.mq.former_request)
return e->type->ops.mq.former_request(q, rq);
if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
-- 
2.9.5



[PATCH V4 09/14] block: introduce rqhash helpers

2017-09-02 Thread Ming Lei
We need these helpers to support using a hash table to improve bio
merge from the sw queue in the following patches.

No functional change.
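As a rough, non-compilable outline of how the helpers are meant to
compose once the later patches of the series land (the exact call
sites are in those patches; this is only a sketch):

	/* on inserting a request into a sw queue (under ctx->lock) */
	rqhash_add(ctx->hash, rq);

	/* on bio arrival, look up a back-merge candidate by start sector */
	rq = rqhash_find(ctx->hash, bio->bi_iter.bi_sector);

	/* after a back merge grows rq, its end sector (the hash key) changed */
	rqhash_reposition(ctx->hash, rq);

	/* when the request leaves the sw queue for dispatch */
	rqhash_del(rq);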

Signed-off-by: Ming Lei 
---
 block/blk.h  | 52 
 block/elevator.c | 36 +++-
 2 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/block/blk.h b/block/blk.h
index fcb9775b997d..eb3436d4a73f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -146,6 +146,58 @@ static inline void blk_clear_rq_complete(struct request 
*rq)
  */
 #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
 
+/*
+ * Merge hash stuff.
+ */
+#define rq_hash_key(rq)(blk_rq_pos(rq) + blk_rq_sectors(rq))
+
+#define bucket(head, key)  &((head)[hash_min((key), ELV_HASH_BITS)])
+
+static inline void __rqhash_del(struct request *rq)
+{
+	hash_del(&rq->hash);
+   rq->rq_flags &= ~RQF_HASHED;
+}
+
+static inline void rqhash_del(struct request *rq)
+{
+   if (ELV_ON_HASH(rq))
+   __rqhash_del(rq);
+}
+
+static inline void rqhash_add(struct hlist_head *hash, struct request *rq)
+{
+   BUG_ON(ELV_ON_HASH(rq));
+	hlist_add_head(&rq->hash, bucket(hash, rq_hash_key(rq)));
+   rq->rq_flags |= RQF_HASHED;
+}
+
+static inline void rqhash_reposition(struct hlist_head *hash, struct request 
*rq)
+{
+   __rqhash_del(rq);
+   rqhash_add(hash, rq);
+}
+
+static inline struct request *rqhash_find(struct hlist_head *hash, sector_t 
offset)
+{
+   struct hlist_node *next;
+   struct request *rq = NULL;
+
+   hlist_for_each_entry_safe(rq, next, bucket(hash, offset), hash) {
+   BUG_ON(!ELV_ON_HASH(rq));
+
+   if (unlikely(!rq_mergeable(rq))) {
+   __rqhash_del(rq);
+   continue;
+   }
+
+   if (rq_hash_key(rq) == offset)
+   return rq;
+   }
+
+   return NULL;
+}
+
 void blk_insert_flush(struct request *rq);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
diff --git a/block/elevator.c b/block/elevator.c
index 153926a90901..824cc3e69ac3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -47,11 +47,6 @@ static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
 /*
- * Merge hash stuff.
- */
-#define rq_hash_key(rq)(blk_rq_pos(rq) + blk_rq_sectors(rq))
-
-/*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
  */
@@ -268,14 +263,12 @@ EXPORT_SYMBOL(elevator_exit);
 
 static inline void __elv_rqhash_del(struct request *rq)
 {
-	hash_del(&rq->hash);
-   rq->rq_flags &= ~RQF_HASHED;
+   __rqhash_del(rq);
 }
 
 void elv_rqhash_del(struct request_queue *q, struct request *rq)
 {
-   if (ELV_ON_HASH(rq))
-   __elv_rqhash_del(rq);
+   rqhash_del(rq);
 }
 EXPORT_SYMBOL_GPL(elv_rqhash_del);
 
@@ -283,37 +276,22 @@ void elv_rqhash_add(struct request_queue *q, struct 
request *rq)
 {
struct elevator_queue *e = q->elevator;
 
-   BUG_ON(ELV_ON_HASH(rq));
-	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
-   rq->rq_flags |= RQF_HASHED;
+   rqhash_add(e->hash, rq);
 }
 EXPORT_SYMBOL_GPL(elv_rqhash_add);
 
 void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
-   __elv_rqhash_del(rq);
-   elv_rqhash_add(q, rq);
+   struct elevator_queue *e = q->elevator;
+
+   rqhash_reposition(e->hash, rq);
 }
 
 struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 {
struct elevator_queue *e = q->elevator;
-   struct hlist_node *next;
-   struct request *rq;
-
-   hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
-   BUG_ON(!ELV_ON_HASH(rq));
 
-   if (unlikely(!rq_mergeable(rq))) {
-   __elv_rqhash_del(rq);
-   continue;
-   }
-
-   if (rq_hash_key(rq) == offset)
-   return rq;
-   }
-
-   return NULL;
+   return rqhash_find(e->hash, offset);
 }
 
 /*
-- 
2.9.5



[PATCH V4 08/14] blk-mq-sched: use q->queue_depth as hint for q->nr_requests

2017-09-02 Thread Ming Lei
SCSI sets q->queue_depth from shost->cmd_per_lun. q->queue_depth is
per request_queue and is closer to the scheduler queue than the hw
queue depth, which can be shared by several queues (e.g. TAG_SHARED).

This patch uses q->queue_depth as a hint for computing q->nr_requests,
which should be more effective than the current way.
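The arithmetic of the new hint can be checked with a small
stand-alone sketch (the helper below mirrors blk_mq_sched_queue_depth()
from the hunks; the example depths are made up, not measured values):

	#include <stdio.h>

	#define BLKDEV_MAX_RQ 128	/* default nr_requests of the legacy path */

	/* mirrors blk_mq_sched_queue_depth() introduced below */
	static unsigned sched_queue_depth(unsigned queue_depth, unsigned tag_set_depth)
	{
		unsigned q_depth = queue_depth ? queue_depth : tag_set_depth;

		q_depth = 2 * (q_depth < BLKDEV_MAX_RQ ? q_depth : BLKDEV_MAX_RQ);
		return q_depth > BLKDEV_MAX_RQ ? q_depth : BLKDEV_MAX_RQ;
	}

	int main(void)
	{
		/* LUN with cmd_per_lun = 3: clamped up to the legacy default, 128 */
		printf("%u\n", sched_queue_depth(3, 62));
		/* no q->queue_depth set: fall back to the hw tag depth, e.g. 256 -> 256 */
		printf("%u\n", sched_queue_depth(0, 256));
		return 0;
	}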

Reviewed-by: Bart Van Assche 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.h | 18 +++---
 block/blk-mq.c   | 27 +--
 block/blk-mq.h   |  1 +
 block/blk-settings.c |  2 ++
 4 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1d47f3fda1d0..906b10c54f78 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -99,12 +99,24 @@ static inline bool blk_mq_sched_needs_restart(struct 
blk_mq_hw_ctx *hctx)
 static inline unsigned blk_mq_sched_queue_depth(struct request_queue *q)
 {
/*
-* Default to double of smaller one between hw queue_depth and 128,
+* q->queue_depth is more close to scheduler queue, so use it
+* as hint for computing scheduler queue depth if it is valid
+*/
+   unsigned q_depth = q->queue_depth ?: q->tag_set->queue_depth;
+
+   /*
+* Default to double of smaller one between queue depth and 128,
 * since we don't split into sync/async like the old code did.
 * Additionally, this is a per-hw queue depth.
 */
-   return 2 * min_t(unsigned int, q->tag_set->queue_depth,
-  BLKDEV_MAX_RQ);
+   q_depth = 2 * min_t(unsigned int, q_depth, BLKDEV_MAX_RQ);
+
+   /*
+* when queue depth of driver is too small, we set queue depth
+* of scheduler queue as 128 which is the default setting of
+* block legacy code.
+*/
+   return max_t(unsigned, q_depth, BLKDEV_MAX_RQ);
 }
 
 #endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6af56a71c1cd..fc3d26bbfc1a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2650,7 +2650,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
 
-int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+static int __blk_mq_update_nr_requests(struct request_queue *q,
+  bool sched_only,
+  unsigned int nr)
 {
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
@@ -2669,7 +2671,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
 * If we're using an MQ scheduler, just update the scheduler
 * queue depth. This is similar to what the old code would do.
 */
-   if (!hctx->sched_tags) {
+   if (!sched_only && !hctx->sched_tags) {
			ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
							min(nr, set->queue_depth),
							false);
@@ -2689,6 +2691,27 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
return ret;
 }
 
+int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
+{
+   return __blk_mq_update_nr_requests(q, false, nr);
+}
+
+/*
+ * When drivers update q->queue_depth, this API is called so that
+ * we can use this queue depth as hint for adjusting scheduler
+ * queue depth.
+ */
+int blk_mq_update_sched_queue_depth(struct request_queue *q)
+{
+   unsigned nr;
+
+   if (!q->mq_ops || !q->elevator)
+   return 0;
+
+   nr = blk_mq_sched_queue_depth(q);
+   return __blk_mq_update_nr_requests(q, true, nr);
+}
+
 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e42748bfb959..0277f9771fab 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -37,6 +37,7 @@ bool blk_mq_get_driver_tag(struct request *rq, struct 
blk_mq_hw_ctx **hctx,
bool wait);
 struct request *blk_mq_dispatch_rq_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
+int blk_mq_update_sched_queue_depth(struct request_queue *q);
 
 /*
  * Internal helpers for allocating/freeing the request map
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8559e9563c52..c2db38d2ec2b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -878,6 +878,8 @@ void blk_set_queue_depth(struct request_queue *q, unsigned 
int depth)
 {
q->queue_depth = depth;
wbt_set_queue_depth(q->rq_wb, depth);
+
+   WARN_ON(blk_mq_update_sched_queue_depth(q));
 }
 EXPORT_SYMBOL(blk_set_queue_depth);
 
-- 
2.9.5



[PATCH V4 03/14] blk-mq: introduce blk_mq_dispatch_rq_from_ctx()

2017-09-02 Thread Ming Lei
This function is introduced for dequeuing one request at a time from
a sw queue, so that we can dispatch it in the scheduler's way.

More importantly, some SCSI devices may set q->queue_depth, which is a
per-request_queue limit applied to the pending I/O from all hctxs.
This function helps avoid dequeuing too many requests from the sw
queues while ->dispatch isn't completely flushed.

Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 38 ++
 block/blk-mq.h |  2 ++
 2 files changed, 40 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f18cff80050..f063dd0f197f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -880,6 +880,44 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, 
struct list_head *list)
 }
 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
+struct dispatch_rq_data {
+   struct blk_mq_hw_ctx *hctx;
+   struct request *rq;
+};
+
+static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void 
*data)
+{
+   struct dispatch_rq_data *dispatch_data = data;
+   struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
+   struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+	spin_lock(&ctx->lock);
+	if (unlikely(!list_empty(&ctx->rq_list))) {
+		dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+		list_del_init(&dispatch_data->rq->queuelist);
+		if (list_empty(&ctx->rq_list))
+			sbitmap_clear_bit(sb, bitnr);
+	}
+	spin_unlock(&ctx->lock);
+
+   return !dispatch_data->rq;
+}
+
+struct request *blk_mq_dispatch_rq_from_ctx(struct blk_mq_hw_ctx *hctx,
+   struct blk_mq_ctx *start)
+{
+   unsigned off = start ? start->index_hw : 0;
+   struct dispatch_rq_data data = {
+   .hctx = hctx,
+   .rq   = NULL,
+   };
+
+	__sbitmap_for_each_set(&hctx->ctx_map, off,
+			       dispatch_rq_from_ctx, &data);
+
+   return data.rq;
+}
+
 static inline unsigned int queued_to_index(unsigned int queued)
 {
if (!queued)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 98252b79b80b..e42748bfb959 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -35,6 +35,8 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, 
struct list_head *list);
 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
bool wait);
+struct request *blk_mq_dispatch_rq_from_ctx(struct blk_mq_hw_ctx *hctx,
+   struct blk_mq_ctx *start);
 
 /*
  * Internal helpers for allocating/freeing the request map
-- 
2.9.5



[PATCH V4 06/14] blk-mq-sched: don't dequeue request until all in ->dispatch are flushed

2017-09-02 Thread Ming Lei
During dispatch, we move all requests from hctx->dispatch to a
temporary list, then dispatch them one by one from that list.
Unfortunately, during this period a queue run from another context may
think the queue is idle, start to dequeue from the sw/scheduler
queues, and try to dispatch because ->dispatch is empty. This hurts
sequential I/O performance because requests are dequeued while the LLD
queue is busy.

This patch introduces the BLK_MQ_S_DISPATCH_BUSY state to make sure
that no request is dequeued until ->dispatch is flushed.
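Condensed from the hunks below, the intended protocol looks roughly
like this (illustrative snippet, not a compilable unit):

	/* when requests are parked on hctx->dispatch (hw queue busy) */
	list_add(&rq->queuelist, &hctx->dispatch);
	set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);

	/* in blk_mq_sched_dispatch_requests(), after flushing ->dispatch */
	if (list_empty_careful(&hctx->dispatch))
		clear_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);

	/* and before pulling anything from the sw/scheduler queues */
	if (test_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state))
		return;	/* keep requests where bios can still merge into them */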

Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c |  1 +
 block/blk-mq-sched.c   | 58 +++---
 block/blk-mq.c |  6 ++
 include/linux/blk-mq.h |  1 +
 4 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 980e73095643..7a27f262c96a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -182,6 +182,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(SCHED_RESTART),
HCTX_STATE_NAME(TAG_WAITING),
HCTX_STATE_NAME(START_ON_RUN),
+   HCTX_STATE_NAME(DISPATCH_BUSY),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 735e432294ab..97e7a4fe3a32 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -146,7 +146,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
-   bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -177,8 +176,33 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
-		do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
-	} else if (!has_sched_dispatch && !q->queue_depth) {
+		blk_mq_dispatch_rq_list(q, &rq_list);
+
+   /*
+* We may clear DISPATCH_BUSY just after it
+* is set from another context, the only cost
+* is that one request is dequeued a bit early,
+* we can survive that. Given the window is
+* small enough, no need to worry about performance
+* effect.
+*/
+		if (list_empty_careful(&hctx->dispatch))
+			clear_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
+   }
+
+   /*
+* If DISPATCH_BUSY is set, that means hw queue is busy
+* and requests in the list of hctx->dispatch need to
+* be flushed first, so return early.
+*
+* Wherever DISPATCH_BUSY is set, blk_mq_run_hw_queue()
+* will be run to try to make progress, so it is always
+* safe to check the state here.
+*/
+	if (test_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state))
+   return;
+
+   if (!has_sched_dispatch) {
/*
 * If there is no per-request_queue depth, we
 * flush all requests in this hw queue, otherwise
@@ -187,22 +211,21 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 * is busy, which can be triggered easily by
 * per-request_queue queue depth
 */
-		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list);
-   }
-
-   if (!do_sched_dispatch)
-   return;
+   if (!q->queue_depth) {
+			blk_mq_flush_busy_ctxs(hctx, &rq_list);
+			blk_mq_dispatch_rq_list(q, &rq_list);
+   } else {
+   blk_mq_do_dispatch_ctx(q, hctx);
+   }
+   } else {
 
-   /*
-* We want to dispatch from the scheduler if we had no work left
-* on the dispatch list, OR if we did have work but weren't able
-* to make progress.
-*/
-   if (has_sched_dispatch)
+   /*
+* We want to dispatch from the scheduler if we had no work left
+* on the dispatch list, OR if we did have work but weren't able
+* to make progress.
+*/
blk_mq_do_dispatch_sched(q, e, hctx);
-   else
-   blk_mq_do_dispatch_ctx(q, hctx);
+   }
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
@@ -330,6 +353,7 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx 
*hctx,
 */
	spin_lock(&hctx->lock);
	list_add(&rq->queuelist, &hctx->dispatch);
+	set_bit(BLK_MQ_S_DISPATCH_BUSY, &hctx->state);
	spin_unlock(&hctx->lock);
return true;
 }
diff --git 

[PATCH V4 07/14] blk-mq-sched: introduce blk_mq_sched_queue_depth()

2017-09-02 Thread Ming Lei
The following patch will use a hint to figure out the default queue
depth for the scheduler queue, so introduce the helper
blk_mq_sched_queue_depth() for this purpose.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c |  8 +---
 block/blk-mq-sched.h | 11 +++
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 97e7a4fe3a32..1ff6f9bedd1a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -595,13 +595,7 @@ int blk_mq_init_sched(struct request_queue *q, struct 
elevator_type *e)
return 0;
}
 
-   /*
-* Default to double of smaller one between hw queue_depth and 128,
-* since we don't split into sync/async like the old code did.
-* Additionally, this is a per-hw queue depth.
-*/
-   q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-  BLKDEV_MAX_RQ);
+   q->nr_requests = blk_mq_sched_queue_depth(q);
 
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_tags(q, hctx, i);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 9267d0b7c197..1d47f3fda1d0 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -96,4 +96,15 @@ static inline bool blk_mq_sched_needs_restart(struct 
blk_mq_hw_ctx *hctx)
	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
 
+static inline unsigned blk_mq_sched_queue_depth(struct request_queue *q)
+{
+   /*
+* Default to double of smaller one between hw queue_depth and 128,
+* since we don't split into sync/async like the old code did.
+* Additionally, this is a per-hw queue depth.
+*/
+   return 2 * min_t(unsigned int, q->tag_set->queue_depth,
+  BLKDEV_MAX_RQ);
+}
+
 #endif
-- 
2.9.5



[PATCH V4 04/14] blk-mq-sched: move actual dispatching into one helper

2017-09-02 Thread Ming Lei
Do this so that it becomes easy to support dispatching from the sw
queue in the following patch.

No functional change.

Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 28 ++--
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 845e5baf8af1..f69752961a34 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,6 +89,22 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
+static void blk_mq_do_dispatch(struct request_queue *q,
+  struct elevator_queue *e,
+  struct blk_mq_hw_ctx *hctx)
+{
+   LIST_HEAD(rq_list);
+
+   do {
+   struct request *rq;
+
+   rq = e->type->ops.mq.dispatch_request(hctx);
+   if (!rq)
+   break;
+		list_add(&rq->queuelist, &rq_list);
+	} while (blk_mq_dispatch_rq_list(q, &rq_list));
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
@@ -136,16 +152,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 * on the dispatch list, OR if we did have work but weren't able
 * to make progress.
 */
-   if (do_sched_dispatch && has_sched_dispatch) {
-   do {
-   struct request *rq;
-
-   rq = e->type->ops.mq.dispatch_request(hctx);
-   if (!rq)
-   break;
-			list_add(&rq->queuelist, &rq_list);
-		} while (blk_mq_dispatch_rq_list(q, &rq_list));
-   }
+   if (do_sched_dispatch && has_sched_dispatch)
+   blk_mq_do_dispatch(q, e, hctx);
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-- 
2.9.5



[PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-02 Thread Ming Lei
SCSI devices use a host-wide tagset, and the shared driver tag space
is often quite big. Meanwhile there is also a queue depth for each LUN
(.cmd_per_lun), which is often small.

So lots of requests may stay in the sw queues, and we always flush all
of those belonging to the same hw queue and dispatch them all to the
driver. Unfortunately this easily makes the queue busy because of the
small per-LUN queue depth. Once these requests are flushed out, they
have to stay in hctx->dispatch, no bio can be merged into them any
more, and sequential I/O performance is hurt.

This patch improves dispatching from the sw queue when there is a
per-request_queue queue depth, by taking requests one by one from the
sw queue, just like an I/O scheduler does.
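The core of the change, condensed from blk_mq_do_dispatch_ctx() in the
hunk below (illustrative excerpt; surrounding setup omitted):

	do {
		rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);	/* take one request */
		if (!rq)
			break;
		list_add(&rq->queuelist, &rq_list);
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);	/* round-robin across sw queues */
	} while (blk_mq_dispatch_rq_list(q, &rq_list));		/* stop once the driver is busy */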

Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c   | 61 +-
 include/linux/blk-mq.h |  2 ++
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index f69752961a34..735e432294ab 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
-static void blk_mq_do_dispatch(struct request_queue *q,
-  struct elevator_queue *e,
-  struct blk_mq_hw_ctx *hctx)
+static void blk_mq_do_dispatch_sched(struct request_queue *q,
+struct elevator_queue *e,
+struct blk_mq_hw_ctx *hctx)
 {
LIST_HEAD(rq_list);
 
@@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q,
	} while (blk_mq_dispatch_rq_list(q, &rq_list));
 }
 
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+   unsigned idx = ctx->index_hw;
+
+   if (++idx == hctx->nr_ctx)
+   idx = 0;
+
+   return hctx->ctxs[idx];
+}
+
+static void blk_mq_do_dispatch_ctx(struct request_queue *q,
+  struct blk_mq_hw_ctx *hctx)
+{
+   LIST_HEAD(rq_list);
+   struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+   bool dispatched;
+
+   do {
+   struct request *rq;
+
+   rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
+   if (!rq)
+   break;
+		list_add(&rq->queuelist, &rq_list);
+
+   /* round robin for fair dispatch */
+   ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+		dispatched = blk_mq_dispatch_rq_list(q, &rq_list);
+   } while (dispatched);
+
+   if (!dispatched)
+   WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx));
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
@@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
-   } else if (!has_sched_dispatch) {
+   } else if (!has_sched_dispatch && !q->queue_depth) {
+   /*
+* If there is no per-request_queue depth, we
+* flush all requests in this hw queue, otherwise
+* pick up request one by one from sw queue for
+* avoiding to mess up I/O merge when dispatch
+* is busy, which can be triggered easily by
+* per-request_queue queue depth
+*/
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list);
}
 
+   if (!do_sched_dispatch)
+   return;
+
/*
 * We want to dispatch from the scheduler if we had no work left
 * on the dispatch list, OR if we did have work but weren't able
 * to make progress.
 */
-   if (do_sched_dispatch && has_sched_dispatch)
-   blk_mq_do_dispatch(q, e, hctx);
+   if (has_sched_dispatch)
+   blk_mq_do_dispatch_sched(q, e, hctx);
+   else
+   blk_mq_do_dispatch_ctx(q, hctx);
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 50c6485cb04f..7b7a366a97f3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -30,6 +30,8 @@ struct blk_mq_hw_ctx {
 
struct sbitmap  ctx_map;
 
+   struct blk_mq_ctx   *dispatch_from;
+
struct blk_mq_ctx   **ctxs;
unsigned intnr_ctx;
 
-- 
2.9.5



[PATCH V4 02/14] sbitmap: introduce __sbitmap_for_each_set()

2017-09-02 Thread Ming Lei
We need to iterate over the ctxs starting from any ctx, in a
round-robin fashion, so introduce this helper.
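A stand-alone model of the wrap-around walk over a plain machine word
(the real helper iterates the sbitmap words and respects sb->depth;
the names below are illustrative):

	#include <stdio.h>

	#define DEPTH 16

	/* visit every set bit, starting from 'off' and wrapping around once */
	static void for_each_set_from(unsigned long map, unsigned off,
				      void (*fn)(unsigned bit))
	{
		unsigned scanned, bit;

		for (scanned = 0; scanned < DEPTH; scanned++) {
			bit = (off + scanned) % DEPTH;
			if (map & (1UL << bit))
				fn(bit);
		}
	}

	static void show(unsigned bit)
	{
		printf("bit %u\n", bit);
	}

	int main(void)
	{
		/* bits 1, 5 and 12 set; start scanning at bit 6 -> visits 12, 1, 5 */
		for_each_set_from((1UL << 1) | (1UL << 5) | (1UL << 12), 6, show);
		return 0;
	}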

Cc: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/sbitmap.h | 54 -
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index a1904aadbc45..2329b9e1a0e2 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -211,10 +211,14 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb);
  */
 bool sbitmap_any_bit_clear(const struct sbitmap *sb);
 
+#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
+#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+
 typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
 
 /**
  * sbitmap_for_each_set() - Iterate over each set bit in a  sbitmap.
+ * @off: Where to start the iteration
  * @sb: Bitmap to iterate over.
  * @fn: Callback. Should return true to continue or false to break early.
  * @data: Pointer to pass to callback.
@@ -222,35 +226,57 @@ typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned 
int, void *);
  * This is inline even though it's non-trivial so that the function calls to 
the
  * callback will hopefully get optimized away.
  */
-static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
-   void *data)
+static inline void __sbitmap_for_each_set(struct sbitmap *sb,
+ unsigned int off,
+ sb_for_each_fn fn, void *data)
 {
-   unsigned int i;
+   unsigned int index = SB_NR_TO_INDEX(sb, off);
+   unsigned int nr = SB_NR_TO_BIT(sb, off);
+   unsigned int scanned = 0;
 
-   for (i = 0; i < sb->map_nr; i++) {
-		struct sbitmap_word *word = &sb->map[i];
-   unsigned int off, nr;
+   while (1) {
+		struct sbitmap_word *word = &sb->map[index];
+   unsigned int depth = min_t(unsigned int, word->depth - nr,
+  sb->depth - scanned);
 
+   scanned += depth;
if (!word->word)
-   continue;
+   goto next;
 
-   nr = 0;
-   off = i << sb->shift;
+   depth += nr;
+   off = index << sb->shift;
while (1) {
-			nr = find_next_bit(&word->word, word->depth, nr);
-			if (nr >= word->depth)
+			nr = find_next_bit(&word->word, depth, nr);
+   if (nr >= depth)
break;
-
if (!fn(sb, off + nr, data))
return;
 
nr++;
}
+ next:
+   if (scanned >= sb->depth)
+   break;
+   nr = 0;
+   if (++index >= sb->map_nr)
+   index = 0;
}
 }
 
-#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
-#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+/**
+ * sbitmap_for_each_set() - Iterate over each set bit in a  sbitmap.
+ * @sb: Bitmap to iterate over.
+ * @fn: Callback. Should return true to continue or false to break early.
+ * @data: Pointer to pass to callback.
+ *
+ * This is inline even though it's non-trivial so that the function calls to 
the
+ * callback will hopefully get optimized away.
+ */
+static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
+   void *data)
+{
+   __sbitmap_for_each_set(sb, 0, fn, data);
+}
 
 static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
unsigned int bitnr)
-- 
2.9.5



[PATCH V4 01/14] blk-mq-sched: fix scheduler bad performance

2017-09-02 Thread Ming Lei
When the hw queue is busy, we shouldn't take requests from the
scheduler queue any more, otherwise it is difficult to do I/O merging.

This patch fixes the awful I/O performance on some SCSI devices (lpfc,
qla2xxx, ...) when mq-deadline/kyber is used, by not taking requests
while the hw queue is busy.

Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 4ab69435708c..845e5baf8af1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -94,7 +94,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
-   bool did_work = false;
+   bool do_sched_dispatch = true;
LIST_HEAD(rq_list);
 
/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -125,7 +125,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
-		did_work = blk_mq_dispatch_rq_list(q, &rq_list);
+		do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
	} else if (!has_sched_dispatch) {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list);
@@ -136,7 +136,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
 * on the dispatch list, OR if we did have work but weren't able
 * to make progress.
 */
-   if (!did_work && has_sched_dispatch) {
+   if (do_sched_dispatch && has_sched_dispatch) {
do {
struct request *rq;
 
-- 
2.9.5



Re: [PATCH V3 0/8] block/scsi: safe SCSI quiescing

2017-09-02 Thread Oleksandr Natalenko
Again,

Tested-by: Oleksandr Natalenko 

On Saturday 2 September 2017 15:08:32 CEST Ming Lei wrote:
> Hi,
> 
> The current SCSI quiesce isn't safe and easily triggers I/O deadlock.
> 
> Once SCSI device is put into QUIESCE, no new request except for RQF_PREEMPT
> can be dispatched to SCSI successfully, and scsi_device_quiesce() just
> simply waits for completion of I/Os dispatched to SCSI stack. It isn't
> enough at all.
> 
> Because new request still can be allocated, but all the allocated
> requests can't be dispatched successfully, so request pool can be
> consumed up easily.
> 
> Then request with RQF_PREEMPT can't be allocated, and system may
> hang forever, such as during system suspend or SCSI domain validation.
> 
> Both I/O hangs inside system suspend[1] and during SCSI domain validation
> were reported before.
> 
> This patch introduces preempt freeze, and tries to solve the issue
> by preempt freezing block queue during SCSI quiesce, and allows
> to allocate request of RQF_PREEMPT when queue is preempt-frozen.
> 
> Both SCSI and SCSI_MQ have this IO deadlock issue, this patch fixes
> them all by introducing blk_freeze_queue_preempt() and
> blk_unfreeze_queue_preempt(), also unifying current interfaces for
> freezing queue between block legacy and blk-mq.
> 
> Oleksandr has verified that this patchset V2 fixes his I/O hang
> during suspend/resume cycle.
> 
> V3:
>   - introduce q->preempt_unfreezing to fix one bug of preempt freeze
>   - call blk_queue_enter_live() only when queue is preempt frozen
>   - cleanup a bit on the implementation of preempt freeze
>   - only patch 6 and 7 are changed
> 
> V2:
>   - drop the 1st patch in V1 because percpu_ref_is_dying() is
>   enough as pointed by Tejun
>   - introduce preempt version of blk_[freeze|unfreeze]_queue
>   - sync between preempt freeze and normal freeze
>   - fix warning from percpu-refcount as reported by Oleksandr
> 
> 
> [1] https://marc.info/?t=150340250100013=3=2
> 
> 
> 
> Ming Lei (8):
>   blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue
>   blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue
>   blk-mq: only run hw queues for blk-mq
>   blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait
>   block: tracking request allocation with q_usage_counter
>   block: introduce preempt version of blk_[freeze|unfreeze]_queue
>   block: allow to allocate req with REQF_PREEMPT when queue is preempt
> frozen
>   SCSI: preempt freeze block queue when SCSI device is put into quiesce
> 
>  block/bfq-iosched.c  |   2 +-
>  block/blk-cgroup.c   |   8 +--
>  block/blk-core.c |  53 ---
>  block/blk-mq.c   | 170
> +++ block/blk-mq.h   | 
>  1 -
>  block/blk.h  |  17 +
>  block/elevator.c |   4 +-
>  drivers/block/loop.c |  16 ++---
>  drivers/block/rbd.c  |   2 +-
>  drivers/nvme/host/core.c |   8 +--
>  drivers/scsi/scsi_lib.c  |  22 +-
>  include/linux/blk-mq.h   |  15 +++--
>  include/linux/blkdev.h   |  21 +-
>  13 files changed, 273 insertions(+), 66 deletions(-)




Re: [PATCH V3 7/8] block: allow to allocate req with REQF_PREEMPT when queue is preempt frozen

2017-09-02 Thread Ming Lei
On Sat, Sep 02, 2017 at 09:08:39PM +0800, Ming Lei wrote:
> REQF_PREEMPT is a bit special because the request is required
> to be dispatched to lld even when SCSI device is quiesced.
> 
> So this patch introduces __blk_get_request() to allow block
> layer to allocate request when queue is preempt frozen, since we
> will preempt freeze queue before quiescing SCSI device in the
> following patch for supporting safe SCSI quiescing.
> 
> Signed-off-by: Ming Lei 
> ---
>  block/blk-core.c   | 28 
>  block/blk-mq.c | 14 --
>  include/linux/blk-mq.h |  7 ---
>  include/linux/blkdev.h | 17 +++--
>  4 files changed, 51 insertions(+), 15 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 2549b0a0535d..f7a6fbb87dea 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1404,7 +1404,8 @@ static struct request *get_request(struct request_queue 
> *q, unsigned int op,
>  }
>  
>  static struct request *blk_old_get_request(struct request_queue *q,
> -unsigned int op, gfp_t gfp_mask)
> +unsigned int op, gfp_t gfp_mask,
> +unsigned int flags)
>  {
>   struct request *rq;
>   int ret = 0;
> @@ -1414,9 +1415,20 @@ static struct request *blk_old_get_request(struct 
> request_queue *q,
>   /* create ioc upfront */
>   create_io_context(gfp_mask, q->node);
>  
> - ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
> + /*
> +  * We need to allocate req of REQF_PREEMPT in preempt freezing.
> +  * No normal freezing can be started when preempt freezing
> +  * is in-progress, and queue dying is checked before starting
> +  * preempt freezing, so it is safe to use blk_queue_enter_live()
> +  * in case of preempt freezing.
> +  */
> + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
> + blk_queue_enter_live(q);
> + else
> + ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
>   if (ret)
>   return ERR_PTR(ret);
> +
>   spin_lock_irq(q->queue_lock);
>   rq = get_request(q, op, NULL, gfp_mask);
>   if (IS_ERR(rq)) {
> @@ -1432,26 +1444,26 @@ static struct request *blk_old_get_request(struct 
> request_queue *q,
>   return rq;
>  }
>  
> -struct request *blk_get_request(struct request_queue *q, unsigned int op,
> - gfp_t gfp_mask)
> +struct request *__blk_get_request(struct request_queue *q, unsigned int op,
> +   gfp_t gfp_mask, unsigned int flags)
>  {
>   struct request *req;
>  
>   if (q->mq_ops) {
>   req = blk_mq_alloc_request(q, op,
> - (gfp_mask & __GFP_DIRECT_RECLAIM) ?
> - 0 : BLK_MQ_REQ_NOWAIT);
> + flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ?
> + 0 : BLK_MQ_REQ_NOWAIT));
>   if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
>   q->mq_ops->initialize_rq_fn(req);
>   } else {
> - req = blk_old_get_request(q, op, gfp_mask);
> + req = blk_old_get_request(q, op, gfp_mask, flags);
>   if (!IS_ERR(req) && q->initialize_rq_fn)
>   q->initialize_rq_fn(req);
>   }
>  
>   return req;
>  }
> -EXPORT_SYMBOL(blk_get_request);
> +EXPORT_SYMBOL(__blk_get_request);
>  
>  /**
>   * blk_requeue_request - put a request back on queue
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 54b8d8b9f40e..e81001d1da27 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -496,9 +496,19 @@ struct request *blk_mq_alloc_request(struct 
> request_queue *q, unsigned int op,
>  {
>   struct blk_mq_alloc_data alloc_data = { .flags = flags };
>   struct request *rq;
> - int ret;
> + int ret = 0;
>  
> - ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
> + /*
> +  * We need to allocate req of REQF_PREEMPT in preempt freezing.
> +  * No normal freezing can be started when preempt freezing
> +  * is in-progress, and queue dying is checked before starting
> +  * preempt freezing, so it is safe to use blk_queue_enter_live()
> +  * in case of preempt freezing.
> +  */
> + if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
> + blk_queue_enter_live(q);
> + else
> + ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
>   if (ret)
>   return ERR_PTR(ret);
>  
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 5ae8c82d6273..596f433eb54c 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -200,9 +200,10 @@ void blk_mq_free_request(struct request *rq);
>  bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
>  
>  enum {
> - BLK_MQ_REQ_NOWAIT   = (1 

[PATCH V3 6/8] block: introduce preempt version of blk_[freeze|unfreeze]_queue

2017-09-02 Thread Ming Lei
The two APIs are required to allow allocation of RQF_PREEMPT requests
while the queue is preempt-frozen.

The following two points have to be guaranteed for one queue:

1) preempt freezing can be started only after all in-progress
normal & preempt freezings are completed

2) normal freezing can be started only if no in-progress preempt
freezing exists

This is because, for normal freezing, once blk_mq_freeze_queue_wait()
returns we have to make sure no request is entering the queue any
more.

An rwsem would have been perfect for this kind of sync, but we need to
support nested normal freeze, so a spin_lock together with the
normal_freezing and preempt_freezing flags is used for the sync
between normal freeze and preempt freeze.
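Condensed from queue_freeze_is_over() in the hunks below, the gate a
new freezer waits on is roughly the following (may_start is an
illustrative name, not code from the patch):

	if (preempt)
		/* preempt freeze: wait until no freeze of either kind is pending */
		may_start = !(q->normal_freezing + q->preempt_freezing) ||
			    blk_queue_dying(q);
	else
		/* normal freeze: wait only for a pending preempt freeze */
		may_start = !q->preempt_freezing;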

Signed-off-by: Ming Lei 
---
 block/blk-core.c   |   2 +
 block/blk-mq.c | 120 +++--
 block/blk.h|  16 +++
 include/linux/blk-mq.h |   2 +
 include/linux/blkdev.h |   4 ++
 5 files changed, 141 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 85b15833a7a5..2549b0a0535d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -899,6 +899,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
if (blkcg_init_queue(q))
goto fail_ref;
 
+	spin_lock_init(&q->freeze_lock);
+
return q;
 
 fail_ref:
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 24de78afbe9a..54b8d8b9f40e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -118,16 +118,75 @@ void blk_mq_in_flight(struct request_queue *q, struct 
hd_struct *part,
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 }
 
-void blk_freeze_queue_start(struct request_queue *q)
+static bool queue_freeze_is_over(struct request_queue *q, bool preempt)
+{
+   /*
+* For preempt freeze, we simply call blk_queue_enter_live()
+* before allocating one request of RQF_PREEMPT, so we have
+* to check if queue is dead, otherwise we may hang on dead
+* queue.
+*
+* For normal freeze, no need to check blk_queue_dying()
+* because it is checked in blk_queue_enter().
+*/
+   if (preempt)
+   return !(q->normal_freezing + q->preempt_freezing) ||
+   blk_queue_dying(q);
+   return !q->preempt_freezing;
+}
+
+static bool __blk_freeze_queue_start(struct request_queue *q, bool preempt)
 {
int freeze_depth;
+   bool start_freeze = true;
+
+   /*
+* Wait for completion of another kind of freezing.
+*
+* We have to sync between normal freeze and preempt
+* freeze. preempt freeze can only be started iff all
+* pending normal & preempt freezing are completed,
+* meantime normal freeze can be started only if there
+* isn't pending preempt freezing.
+*
+* rwsem should have been perfect for this kind of sync,
+* but we need to support nested normal freeze, so use
+* spin_lock with two flag for syncing between normal
+* freeze and preempt freeze.
+*/
+	spin_lock(&q->freeze_lock);
+	wait_event_cmd(q->mq_freeze_wq,
+		       queue_freeze_is_over(q, preempt),
+		       spin_unlock(&q->freeze_lock),
+		       spin_lock(&q->freeze_lock));
+
+   if (preempt && blk_queue_dying(q)) {
+   start_freeze = false;
+   goto unlock;
+   }
 
	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
if (freeze_depth == 1) {
+   if (preempt) {
+   q->preempt_freezing = 1;
+   q->preempt_unfreezing = 0;
+   } else
+   q->normal_freezing = 1;
+		spin_unlock(&q->freeze_lock);
+
		percpu_ref_kill(&q->q_usage_counter);
if (q->mq_ops)
blk_mq_run_hw_queues(q, false);
-   }
+   } else
+ unlock:
+		spin_unlock(&q->freeze_lock);
+
+   return start_freeze;
+}
+
+void blk_freeze_queue_start(struct request_queue *q)
+{
+   __blk_freeze_queue_start(q, false);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
@@ -166,7 +225,7 @@ void blk_freeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue);
 
-void blk_unfreeze_queue(struct request_queue *q)
+static void __blk_unfreeze_queue(struct request_queue *q, bool preempt)
 {
int freeze_depth;
 
@@ -174,12 +233,67 @@ void blk_unfreeze_queue(struct request_queue *q)
WARN_ON_ONCE(freeze_depth < 0);
if (!freeze_depth) {
		percpu_ref_reinit(&q->q_usage_counter);
+
+   /*
+* clearing the freeze flag so that any pending
+* freeze can move on
+*/
+		spin_lock(&q->freeze_lock);
+   if (preempt)
+   q->preempt_freezing = 0;
+   else
+   

[PATCH V3 8/8] SCSI: preempt freeze block queue when SCSI device is put into quiesce

2017-09-02 Thread Ming Lei
Simply quiescing the SCSI device and waiting for completion of the I/O
dispatched to the SCSI queue isn't safe: it is easy to use up
requests, because all the allocated requests can't be dispatched while
the device is put in QUIESCE. Then no request can be allocated for
RQF_PREEMPT, and the system may hang somewhere, for example when
sending sync_cache or start_stop commands during the system suspend
path.

Before quiescing SCSI, this patch freezes the block queue in preempt
mode first, so no new normal request can enter the queue any more, and
all pending requests are drained once blk_freeze_queue_preempt()
returns. Only RQF_PREEMPT requests can be allocated during a preempt
freeze.

This patch also uses __blk_get_request() for allocating requests with
RQF_PREEMPT, so that the allocation can succeed even though the block
queue is preempt-frozen.
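Putting the series together, the intended sequence around a
quiesce/resume cycle is roughly the following (a sketch based on the
hunks below and on patch 7/8, not a verbatim excerpt):

	/* quiesce side (scsi_device_quiesce): drain normal I/O first */
	blk_freeze_queue_preempt(sdev->request_queue);
	scsi_device_set_state(sdev, SDEV_QUIESCE);

	/* while quiesced, only preempt requests can be allocated, e.g. in scsi_execute() */
	req = __blk_get_request(sdev->request_queue, REQ_OP_SCSI_IN,
				__GFP_RECLAIM, BLK_REQ_PREEMPT);

	/* resume side (scsi_device_resume) */
	scsi_device_set_state(sdev, SDEV_RUNNING);
	blk_unfreeze_queue_preempt(sdev->request_queue);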

Signed-off-by: Ming Lei 
---
 drivers/scsi/scsi_lib.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f6097b89d5d3..e1ad135cb209 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -243,10 +243,12 @@ int scsi_execute(struct scsi_device *sdev, const unsigned 
char *cmd,
struct request *req;
struct scsi_request *rq;
int ret = DRIVER_ERROR << 24;
+   unsigned flag = sdev->sdev_state == SDEV_QUIESCE ? BLK_REQ_PREEMPT : 0;
 
-   req = blk_get_request(sdev->request_queue,
+   req = __blk_get_request(sdev->request_queue,
data_direction == DMA_TO_DEVICE ?
-   REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM);
+   REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM,
+   flag);
if (IS_ERR(req))
return ret;
rq = scsi_req(req);
@@ -2890,6 +2892,20 @@ scsi_device_quiesce(struct scsi_device *sdev)
 {
int err;
 
+   /*
+* Simply quiescing SCSI device isn't safe, it is easy
+* to use up requests because all these allocated requests
+* can't be dispatched when device is put in QUIESCE.
+* Then no request can be allocated and we may hang
+* somewhere, such as system suspend/resume.
+*
+* So we freeze block queue in preempt mode first, no new
+* normal request can enter queue any more, and all pending
+* requests are drained once blk_freeze_queue is returned.
+* Only RQF_PREEMPT is allowed in preempt freeze.
+*/
+   blk_freeze_queue_preempt(sdev->request_queue);
+
	mutex_lock(&sdev->state_mutex);
	err = scsi_device_set_state(sdev, SDEV_QUIESCE);
	mutex_unlock(&sdev->state_mutex);
@@ -2926,6 +2942,8 @@ void scsi_device_resume(struct scsi_device *sdev)
scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
scsi_run_queue(sdev->request_queue);
	mutex_unlock(&sdev->state_mutex);
+
+   blk_unfreeze_queue_preempt(sdev->request_queue);
 }
 EXPORT_SYMBOL(scsi_device_resume);
 
-- 
2.9.5



[PATCH V3 5/8] block: tracking request allocation with q_usage_counter

2017-09-02 Thread Ming Lei
This usage is basically the same as in blk-mq, so that we can support
freezing the queue easily.
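Condensed, the legacy path now brackets each request's lifetime with
the same usage counter blk-mq uses (illustrative outline of the hunks
below, not compilable on its own):

	/* allocation side: blk_old_get_request() / blk_queue_bio() */
	blk_queue_enter(q, ...);	/* blk_queue_enter_live() on the bio submission path */
	rq = get_request(q, op, NULL, gfp_mask);

	/* completion side: __blk_put_request() */
	blk_queue_exit(q);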

Signed-off-by: Ming Lei 
---
 block/blk-core.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index ce2d3b6f6c62..85b15833a7a5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1405,16 +1405,21 @@ static struct request *blk_old_get_request(struct 
request_queue *q,
   unsigned int op, gfp_t gfp_mask)
 {
struct request *rq;
+   int ret = 0;
 
WARN_ON_ONCE(q->mq_ops);
 
/* create ioc upfront */
create_io_context(gfp_mask, q->node);
 
+   ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
+   if (ret)
+   return ERR_PTR(ret);
spin_lock_irq(q->queue_lock);
rq = get_request(q, op, NULL, gfp_mask);
if (IS_ERR(rq)) {
spin_unlock_irq(q->queue_lock);
+   blk_queue_exit(q);
return rq;
}
 
@@ -1586,6 +1591,7 @@ void __blk_put_request(struct request_queue *q, struct 
request *req)
blk_free_request(rl, req);
freed_request(rl, sync, rq_flags);
blk_put_rl(rl);
+   blk_queue_exit(q);
}
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1867,8 +1873,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
 * Grab a free request. This is might sleep but can not fail.
 * Returns with the queue unlocked.
 */
+   blk_queue_enter_live(q);
req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
if (IS_ERR(req)) {
+   blk_queue_exit(q);
__wbt_done(q->rq_wb, wb_acct);
if (PTR_ERR(req) == -ENOMEM)
bio->bi_status = BLK_STS_RESOURCE;
-- 
2.9.5



[PATCH V3 4/8] blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait

2017-09-02 Thread Ming Lei
The only change on the legacy path is that blk_drain_queue() is run
from blk_freeze_queue(), which is called in blk_cleanup_queue().

So this patch removes the explicit __blk_drain_queue() call in
blk_cleanup_queue().

Signed-off-by: Ming Lei 
---
 block/blk-core.c | 17 +++--
 block/blk-mq.c   |  8 +---
 block/blk.h  |  1 +
 drivers/nvme/host/core.c |  2 +-
 include/linux/blk-mq.h   |  2 +-
 5 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d579501f24ba..ce2d3b6f6c62 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -530,6 +530,21 @@ static void __blk_drain_queue(struct request_queue *q, 
bool drain_all)
 }
 
 /**
+ * blk_drain_queue - drain requests from request_queue
+ * @q: queue to drain
+ *
+ * Drain requests from @q.  All pending requests are drained.
+ * The caller is responsible for ensuring that no new requests
+ * which need to be drained are queued.
+ */
+void blk_drain_queue(struct request_queue *q)
+{
+   spin_lock_irq(q->queue_lock);
+   __blk_drain_queue(q, true);
+   spin_unlock_irq(q->queue_lock);
+}
+
+/**
  * blk_queue_bypass_start - enter queue bypass mode
  * @q: queue of interest
  *
@@ -653,8 +668,6 @@ void blk_cleanup_queue(struct request_queue *q)
 */
blk_freeze_queue(q);
spin_lock_irq(lock);
-   if (!q->mq_ops)
-   __blk_drain_queue(q, true);
queue_flag_set(QUEUE_FLAG_DEAD, q);
spin_unlock_irq(lock);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c532d8612e1..24de78afbe9a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -131,11 +131,13 @@ void blk_freeze_queue_start(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
-void blk_mq_freeze_queue_wait(struct request_queue *q)
+void blk_freeze_queue_wait(struct request_queue *q)
 {
+   if (!q->mq_ops)
+   blk_drain_queue(q);
wait_event(q->mq_freeze_wq, percpu_ref_is_zero(>q_usage_counter));
 }
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
+EXPORT_SYMBOL_GPL(blk_freeze_queue_wait);
 
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 unsigned long timeout)
@@ -160,7 +162,7 @@ void blk_freeze_queue(struct request_queue *q)
 * exported to drivers as the only user for unfreeze is blk_mq.
 */
blk_freeze_queue_start(q);
-   blk_mq_freeze_queue_wait(q);
+   blk_freeze_queue_wait(q);
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue);
 
diff --git a/block/blk.h b/block/blk.h
index 6847c5435cca..242486e26a81 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -64,6 +64,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request 
*rq,
struct bio *bio);
 void blk_queue_bypass_start(struct request_queue *q);
 void blk_queue_bypass_end(struct request_queue *q);
+void blk_drain_queue(struct request_queue *q);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 void blk_freeze_queue(struct request_queue *q);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 986f2b4f9760..d34a9ffaa940 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2778,7 +2778,7 @@ void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 
mutex_lock(>namespaces_mutex);
list_for_each_entry(ns, >namespaces, list)
-   blk_mq_freeze_queue_wait(ns->queue);
+   blk_freeze_queue_wait(ns->queue);
mutex_unlock(>namespaces_mutex);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8ae77e088c01..f90d78eb85df 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -259,7 +259,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 void blk_freeze_queue(struct request_queue *q);
 void blk_unfreeze_queue(struct request_queue *q);
 void blk_freeze_queue_start(struct request_queue *q);
-void blk_mq_freeze_queue_wait(struct request_queue *q);
+void blk_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 unsigned long timeout);
 int blk_mq_reinit_tagset(struct blk_mq_tag_set *set,
-- 
2.9.5



[PATCH V3 7/8] block: allow to allocate req with RQF_PREEMPT when queue is preempt frozen

2017-09-02 Thread Ming Lei
RQF_PREEMPT is a bit special because such a request is required
to be dispatched to the LLD even when the SCSI device is quiesced.

So this patch introduces __blk_get_request() to allow the block
layer to allocate a request while the queue is preempt-frozen, since
the following patch will preempt-freeze the queue before quiescing
the SCSI device to support safe SCSI quiescing.
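
For illustration only, the hypothetical caller below shows what the patch
enables; __blk_get_request() and BLK_MQ_REQ_PREEMPT come from this patch,
while the surrounding function is a made-up passthrough-style example:

	#include <linux/blkdev.h>
	#include <linux/blk-mq.h>

	/* hypothetical caller, sketch only */
	static struct request *alloc_preempt_rq_sketch(struct request_queue *q)
	{
		struct request *rq;

		/* allowed even while the queue is preempt-frozen */
		rq = __blk_get_request(q, REQ_OP_DRV_IN, GFP_KERNEL,
				       BLK_MQ_REQ_PREEMPT);
		if (IS_ERR(rq))
			return rq;

		rq->rq_flags |= RQF_PREEMPT;	/* dispatchable while quiesced */
		return rq;
	}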

Signed-off-by: Ming Lei 
---
 block/blk-core.c   | 28 
 block/blk-mq.c | 14 --
 include/linux/blk-mq.h |  7 ---
 include/linux/blkdev.h | 17 +++--
 4 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2549b0a0535d..f7a6fbb87dea 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1404,7 +1404,8 @@ static struct request *get_request(struct request_queue 
*q, unsigned int op,
 }
 
 static struct request *blk_old_get_request(struct request_queue *q,
-  unsigned int op, gfp_t gfp_mask)
+  unsigned int op, gfp_t gfp_mask,
+  unsigned int flags)
 {
struct request *rq;
int ret = 0;
@@ -1414,9 +1415,20 @@ static struct request *blk_old_get_request(struct 
request_queue *q,
/* create ioc upfront */
create_io_context(gfp_mask, q->node);
 
-   ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
+   /*
+* We need to allocate req of RQF_PREEMPT in preempt freezing.
+* No normal freezing can be started when preempt freezing
+* is in-progress, and queue dying is checked before starting
+* preempt freezing, so it is safe to use blk_queue_enter_live()
+* in case of preempt freezing.
+*/
+   if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
+   blk_queue_enter_live(q);
+   else
+   ret = blk_queue_enter(q, !(gfp_mask & __GFP_DIRECT_RECLAIM));
if (ret)
return ERR_PTR(ret);
+
spin_lock_irq(q->queue_lock);
rq = get_request(q, op, NULL, gfp_mask);
if (IS_ERR(rq)) {
@@ -1432,26 +1444,26 @@ static struct request *blk_old_get_request(struct 
request_queue *q,
return rq;
 }
 
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
-   gfp_t gfp_mask)
+struct request *__blk_get_request(struct request_queue *q, unsigned int op,
+ gfp_t gfp_mask, unsigned int flags)
 {
struct request *req;
 
if (q->mq_ops) {
req = blk_mq_alloc_request(q, op,
-   (gfp_mask & __GFP_DIRECT_RECLAIM) ?
-   0 : BLK_MQ_REQ_NOWAIT);
+   flags | ((gfp_mask & __GFP_DIRECT_RECLAIM) ?
+   0 : BLK_MQ_REQ_NOWAIT));
if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
q->mq_ops->initialize_rq_fn(req);
} else {
-   req = blk_old_get_request(q, op, gfp_mask);
+   req = blk_old_get_request(q, op, gfp_mask, flags);
if (!IS_ERR(req) && q->initialize_rq_fn)
q->initialize_rq_fn(req);
}
 
return req;
 }
-EXPORT_SYMBOL(blk_get_request);
+EXPORT_SYMBOL(__blk_get_request);
 
 /**
  * blk_requeue_request - put a request back on queue
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 54b8d8b9f40e..e81001d1da27 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -496,9 +496,19 @@ struct request *blk_mq_alloc_request(struct request_queue 
*q, unsigned int op,
 {
struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
-   int ret;
+   int ret = 0;
 
-   ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
+   /*
+* We need to allocate req of RQF_PREEMPT in preempt freezing.
+* No normal freezing can be started when preempt freezing
+* is in-progress, and queue dying is checked before starting
+* preempt freezing, so it is safe to use blk_queue_enter_live()
+* in case of preempt freezing.
+*/
+   if ((flags & BLK_MQ_REQ_PREEMPT) && blk_queue_is_preempt_frozen(q))
+   blk_queue_enter_live(q);
+   else
+   ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
if (ret)
return ERR_PTR(ret);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5ae8c82d6273..596f433eb54c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -200,9 +200,10 @@ void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
-   BLK_MQ_REQ_NOWAIT   = (1 << 0), /* return when out of requests */
-   BLK_MQ_REQ_RESERVED = (1 << 1), /* allocate from reserved pool */
-   BLK_MQ_REQ_INTERNAL = (1 << 2), /* allocate 

[PATCH V3 3/8] blk-mq: only run hw queues for blk-mq

2017-09-02 Thread Ming Lei
Hardware queues only exist for blk-mq, and blk_freeze_queue_start() will
be shared with the legacy path, so make that explicit by checking q->mq_ops.

Reviewed-by: Johannes Thumshirn 
Signed-off-by: Ming Lei 
---
 block/blk-mq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8cf1f7cbef2b..4c532d8612e1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -125,7 +125,8 @@ void blk_freeze_queue_start(struct request_queue *q)
freeze_depth = atomic_inc_return(>mq_freeze_depth);
if (freeze_depth == 1) {
percpu_ref_kill(>q_usage_counter);
-   blk_mq_run_hw_queues(q, false);
+   if (q->mq_ops)
+   blk_mq_run_hw_queues(q, false);
}
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
-- 
2.9.5



[PATCH V3 0/8] block/scsi: safe SCSI quiescing

2017-09-02 Thread Ming Lei
Hi,

The current SCSI quiesce isn't safe and can easily trigger I/O deadlock.

Once the SCSI device is put into QUIESCE, no new request except for RQF_PREEMPT
can be dispatched to SCSI successfully, and scsi_device_quiesce() simply
waits for completion of the I/Os already dispatched to the SCSI stack. That isn't
enough at all.

New requests can still be allocated, but none of the allocated
requests can be dispatched successfully, so the request pool can
easily be used up.

Then a request with RQF_PREEMPT can't be allocated, and the system may
hang forever, for example during system suspend or SCSI domain validation.

I/O hangs during both system suspend[1] and SCSI domain validation
have been reported before.

This patchset introduces preempt freeze and tries to solve the issue
by preempt-freezing the block queue during SCSI quiesce, while allowing
RQF_PREEMPT requests to be allocated when the queue is preempt-frozen.

Both SCSI and SCSI_MQ have this I/O deadlock issue; this patchset fixes
both by introducing blk_freeze_queue_preempt() and
blk_unfreeze_queue_preempt(), and also unifies the current interfaces for
freezing queues between the block legacy path and blk-mq.
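
As a rough illustration (simplified from patch 8/8; state_mutex handling
and error paths are omitted), the SCSI quiesce/resume paths end up
bracketed by the new preempt freeze interfaces like this:

	#include <linux/blkdev.h>
	#include <scsi/scsi_device.h>

	/* sketch only: mirrors the shape of the scsi_lib.c change */
	static int quiesce_sketch(struct scsi_device *sdev)
	{
		/* drain normal I/O; only RQF_PREEMPT can be allocated now */
		blk_freeze_queue_preempt(sdev->request_queue);
		return scsi_device_set_state(sdev, SDEV_QUIESCE);
	}

	static void resume_sketch(struct scsi_device *sdev)
	{
		scsi_device_set_state(sdev, SDEV_RUNNING);
		blk_unfreeze_queue_preempt(sdev->request_queue);
	}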

Oleksandr has verified that V2 of this patchset fixes his I/O hang
during the suspend/resume cycle.

V3:
- introduce q->preempt_unfreezing to fix a bug in preempt freeze
- call blk_queue_enter_live() only when the queue is preempt frozen
- clean up the implementation of preempt freeze a bit
- only patches 6 and 7 are changed

V2:
- drop the 1st patch in V1 because percpu_ref_is_dying() is
enough, as pointed out by Tejun
- introduce preempt version of blk_[freeze|unfreeze]_queue
- sync between preempt freeze and normal freeze
- fix warning from percpu-refcount as reported by Oleksandr


[1] https://marc.info/?t=150340250100013=3=2



Ming Lei (8):
  blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue
  blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue
  blk-mq: only run hw queues for blk-mq
  blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait
  block: tracking request allocation with q_usage_counter
  block: introduce preempt version of blk_[freeze|unfreeze]_queue
  block: allow to allocate req with RQF_PREEMPT when queue is preempt
frozen
  SCSI: preempt freeze block queue when SCSI device is put into quiesce

 block/bfq-iosched.c  |   2 +-
 block/blk-cgroup.c   |   8 +--
 block/blk-core.c |  53 ---
 block/blk-mq.c   | 170 +++
 block/blk-mq.h   |   1 -
 block/blk.h  |  17 +
 block/elevator.c |   4 +-
 drivers/block/loop.c |  16 ++---
 drivers/block/rbd.c  |   2 +-
 drivers/nvme/host/core.c |   8 +--
 drivers/scsi/scsi_lib.c  |  22 +-
 include/linux/blk-mq.h   |  15 +++--
 include/linux/blkdev.h   |  21 +-
 13 files changed, 273 insertions(+), 66 deletions(-)

-- 
2.9.5



[PATCH V3 1/8] blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue

2017-09-02 Thread Ming Lei
We will support freezing the queue on the block legacy path too.

Signed-off-by: Ming Lei 
---
 block/blk-cgroup.c   |  4 ++--
 block/blk-mq.c   | 10 +-
 block/elevator.c |  2 +-
 drivers/block/loop.c |  8 
 drivers/nvme/host/core.c |  4 ++--
 include/linux/blk-mq.h   |  2 +-
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0480892e97e5..02e8a47ac77c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1337,7 +1337,7 @@ int blkcg_activate_policy(struct request_queue *q,
spin_unlock_irq(q->queue_lock);
 out_bypass_end:
if (q->mq_ops)
-   blk_mq_unfreeze_queue(q);
+   blk_unfreeze_queue(q);
else
blk_queue_bypass_end(q);
if (pd_prealloc)
@@ -1388,7 +1388,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
spin_unlock_irq(q->queue_lock);
 
if (q->mq_ops)
-   blk_mq_unfreeze_queue(q);
+   blk_unfreeze_queue(q);
else
blk_queue_bypass_end(q);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d935f15c54da..82136e83951d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -172,7 +172,7 @@ void blk_mq_freeze_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
-void blk_mq_unfreeze_queue(struct request_queue *q)
+void blk_unfreeze_queue(struct request_queue *q)
 {
int freeze_depth;
 
@@ -183,7 +183,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
wake_up_all(>mq_freeze_wq);
}
 }
-EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
+EXPORT_SYMBOL_GPL(blk_unfreeze_queue);
 
 /*
  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
@@ -2250,7 +2250,7 @@ static void blk_mq_update_tag_set_depth(struct 
blk_mq_tag_set *set,
list_for_each_entry(q, >tag_list, tag_set_list) {
blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
-   blk_mq_unfreeze_queue(q);
+   blk_unfreeze_queue(q);
}
 }
 
@@ -2708,7 +2708,7 @@ static int __blk_mq_update_nr_requests(struct 
request_queue *q,
if (!ret)
q->nr_requests = nr;
 
-   blk_mq_unfreeze_queue(q);
+   blk_unfreeze_queue(q);
 
return ret;
 }
@@ -2757,7 +2757,7 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
}
 
list_for_each_entry(q, >tag_list, tag_set_list)
-   blk_mq_unfreeze_queue(q);
+   blk_unfreeze_queue(q);
 }
 
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
diff --git a/block/elevator.c b/block/elevator.c
index 0e465809d3f3..371c8165c9e8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -994,7 +994,7 @@ static int elevator_switch_mq(struct request_queue *q,
blk_add_trace_msg(q, "elv switch: none");
 
 out:
-   blk_mq_unfreeze_queue(q);
+   blk_unfreeze_queue(q);
return ret;
 }
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 2fbd4089c20e..5c11ea44d470 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -217,7 +217,7 @@ static void __loop_update_dio(struct loop_device *lo, bool 
dio)
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
else
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
-   blk_mq_unfreeze_queue(lo->lo_queue);
+   blk_unfreeze_queue(lo->lo_queue);
 }
 
 static int
@@ -605,7 +605,7 @@ static int loop_switch(struct loop_device *lo, struct file 
*file)
do_loop_switch(lo, );
 
/* unfreeze */
-   blk_mq_unfreeze_queue(lo->lo_queue);
+   blk_unfreeze_queue(lo->lo_queue);
 
return 0;
 }
@@ -1079,7 +1079,7 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
-   blk_mq_unfreeze_queue(lo->lo_queue);
+   blk_unfreeze_queue(lo->lo_queue);
 
if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
loop_reread_partitions(lo, bdev);
@@ -1191,7 +1191,7 @@ loop_set_status(struct loop_device *lo, const struct 
loop_info64 *info)
__loop_update_dio(lo, lo->use_dio);
 
  exit:
-   blk_mq_unfreeze_queue(lo->lo_queue);
+   blk_unfreeze_queue(lo->lo_queue);
 
if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
 !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 37046ac2c441..5c76b0a96be2 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1226,7 +1226,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, 
struct nvme_id_ns *id)
 
if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
-   blk_mq_unfreeze_queue(disk->queue);
+   blk_unfreeze_queue(disk->queue);
 }
 
 static int nvme_revalidate_disk(struct gendisk 

[PATCH V3 2/8] blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue

2017-09-02 Thread Ming Lei
This API will be used by the legacy path too.

Signed-off-by: Ming Lei 
---
 block/bfq-iosched.c  |  2 +-
 block/blk-cgroup.c   |  4 ++--
 block/blk-mq.c   | 17 -
 block/blk-mq.h   |  1 -
 block/elevator.c |  2 +-
 drivers/block/loop.c |  8 
 drivers/block/rbd.c  |  2 +-
 drivers/nvme/host/core.c |  2 +-
 include/linux/blk-mq.h   |  2 +-
 9 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 509f39998011..ce2b00e897e2 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4757,7 +4757,7 @@ static int bfq_init_queue(struct request_queue *q, struct 
elevator_type *e)
 * The invocation of the next bfq_create_group_hierarchy
 * function is the head of a chain of function calls
 * (bfq_create_group_hierarchy->blkcg_activate_policy->
-* blk_mq_freeze_queue) that may lead to the invocation of the
+* blk_freeze_queue) that may lead to the invocation of the
 * has_work hook function. For this reason,
 * bfq_create_group_hierarchy is invoked only after all
 * scheduler data has been initialized, apart from the fields
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 02e8a47ac77c..87c15f3947d5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1296,7 +1296,7 @@ int blkcg_activate_policy(struct request_queue *q,
return 0;
 
if (q->mq_ops)
-   blk_mq_freeze_queue(q);
+   blk_freeze_queue(q);
else
blk_queue_bypass_start(q);
 pd_prealloc:
@@ -1363,7 +1363,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
return;
 
if (q->mq_ops)
-   blk_mq_freeze_queue(q);
+   blk_freeze_queue(q);
else
blk_queue_bypass_start(q);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 82136e83951d..8cf1f7cbef2b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -161,16 +161,7 @@ void blk_freeze_queue(struct request_queue *q)
blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
 }
-
-void blk_mq_freeze_queue(struct request_queue *q)
-{
-   /*
-* ...just an alias to keep freeze and unfreeze actions balanced
-* in the blk_mq_* namespace
-*/
-   blk_freeze_queue(q);
-}
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
+EXPORT_SYMBOL_GPL(blk_freeze_queue);
 
 void blk_unfreeze_queue(struct request_queue *q)
 {
@@ -2248,7 +2239,7 @@ static void blk_mq_update_tag_set_depth(struct 
blk_mq_tag_set *set,
lockdep_assert_held(>tag_list_lock);
 
list_for_each_entry(q, >tag_list, tag_set_list) {
-   blk_mq_freeze_queue(q);
+   blk_freeze_queue(q);
queue_set_hctx_shared(q, shared);
blk_unfreeze_queue(q);
}
@@ -2683,7 +2674,7 @@ static int __blk_mq_update_nr_requests(struct 
request_queue *q,
if (!set)
return -EINVAL;
 
-   blk_mq_freeze_queue(q);
+   blk_freeze_queue(q);
 
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
@@ -2747,7 +2738,7 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
return;
 
list_for_each_entry(q, >tag_list, tag_set_list)
-   blk_mq_freeze_queue(q);
+   blk_freeze_queue(q);
 
set->nr_hw_queues = nr_hw_queues;
blk_mq_update_queue_map(set);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1b9742eb7399..7ce29ef1e6f3 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -30,7 +30,6 @@ struct blk_mq_ctx {
 } cacheline_aligned_in_smp;
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
-void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
diff --git a/block/elevator.c b/block/elevator.c
index 371c8165c9e8..1164c8a3720f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -967,7 +967,7 @@ static int elevator_switch_mq(struct request_queue *q,
 {
int ret;
 
-   blk_mq_freeze_queue(q);
+   blk_freeze_queue(q);
 
if (q->elevator) {
if (q->elevator->registered)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5c11ea44d470..b2e708b7e1e6 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -211,7 +211,7 @@ static void __loop_update_dio(struct loop_device *lo, bool 
dio)
 * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
 * will get updated by ioctl(LOOP_GET_STATUS)
 */
-   blk_mq_freeze_queue(lo->lo_queue);
+   blk_freeze_queue(lo->lo_queue);
lo->use_dio = use_dio;
if (use_dio)
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
@@ -599,7 +599,7 @@ static int loop_switch(struct loop_device *lo, struct 

Re: [PATCH V2 0/8] block/scsi: safe SCSI quiescing

2017-09-02 Thread Oleksandr Natalenko
With regard to suspend/resume cycle:

Tested-by: Oleksandr Natalenko 

On Friday, 1 September 2017, 20:49:49 CEST, Ming Lei wrote:
> Hi,
> 
> The current SCSI quiesce isn't safe and easy to trigger I/O deadlock.
> 
> Once SCSI device is put into QUIESCE, no new request except for RQF_PREEMPT
> can be dispatched to SCSI successfully, and scsi_device_quiesce() just
> simply waits for completion of I/Os dispatched to SCSI stack. It isn't
> enough at all.
> 
> Because new request still can be allocated, but all the allocated
> requests can't be dispatched successfully, so request pool can be
> consumed up easily.
> 
> Then request with RQF_PREEMPT can't be allocated, and system may
> hang forever, such as during system suspend or SCSI domain validation.
> 
> Both IO hang inside system suspend[1] or SCSI domain validation
> were reported before.
> 
> This patch tries to solve the issue by freezing block queue during
> SCSI quiescing, and allowing to allocate request of RQF_PREEMPT
> when queue is frozen.
> 
> Both SCSI and SCSI_MQ have this IO deadlock issue, this patch fixes
> them all by introducing preempt version of blk_freeze_queue() and
> blk_unfreeze_queue().
> 
> V2:
>   - drop the 1st patch in V1 because percpu_ref_is_dying() is
>   enough as pointed by Tejun
> 
>   - introduce preempt version of blk_[freeze|unfreeze]_queue
> 
>   - sync between preempt freeze and normal freeze
> 
>   - fix warning from percpu-refcount as reported by Oleksandr
> 
> 
> [1] https://marc.info/?t=150340250100013=3=2
> 
> 
> 
> Ming Lei (8):
>   blk-mq: rename blk_mq_unfreeze_queue as blk_unfreeze_queue
>   blk-mq: rename blk_mq_freeze_queue as blk_freeze_queue
>   blk-mq: only run hw queues for blk-mq
>   blk-mq: rename blk_mq_freeze_queue_wait as blk_freeze_queue_wait
>   block: tracking request allocation with q_usage_counter
>   block: allow to allocate req with REQF_PREEMPT when queue is frozen
>   block: introduce preempt version of blk_[freeze|unfreeze]_queue
>   SCSI: freeze block queue when SCSI device is put into quiesce
> 
>  block/bfq-iosched.c  |   2 +-
>  block/blk-cgroup.c   |   8 ++--
>  block/blk-core.c |  50 
>  block/blk-mq.c   | 119 ---
>  block/blk-mq.h   |   1 -
>  block/blk.h  |   6 +++
>  block/elevator.c |   4 +-
>  drivers/block/loop.c |  16 +++
>  drivers/block/rbd.c  |   2 +-
>  drivers/nvme/host/core.c |   8 ++--
>  drivers/scsi/scsi_lib.c  |  21 -
>  include/linux/blk-mq.h   |  15 +++---
>  include/linux/blkdev.h   |  20 +++-
>  13 files changed, 206 insertions(+), 66 deletions(-)