Since commit 1d9bd5161ba3 ("blk-mq: replace timeout synchronization with a
RCU and generation based scheme") I observe lockdep complaints (full
message attached):
[ 21.763369]CPU0CPU1
[ 21.763370]
[ 21.763371] lock(>gstate_seq);
[ 21.763374]local_irq_disable();
[ 21.763375]lock(&(>lock)->rlock);
[ 21.763377]lock(>gstate_seq);
[ 21.763379]
[ 21.763380] lock(&(>lock)->rlock);
[ 21.763382]
*** DEADLOCK ***
This only happens in combination with DASD (s390 specific) and another
blk-mq driver (scsi, null_blk). The distinctive behavior of DASD is that
it calls blk_mq_start_request with irqs disabled.
This is a false positive since gstate_seq on CPU0 is from a different
request queue / block driver than gstate_seq on CPU1.
Possible fixes are to use local_irq_save/restore in blk_mq_start_request.
Or, since it's a false positive, teach lockdep that these are different
locks - like below:
-->8
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0dc9e341c2a7..d1a2c2fa015d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2040,7 +2040,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set
*set, struct request *rq,
return ret;
}
- seqcount_init(>gstate_seq);
+ __seqcount_init(>gstate_seq, ">gstate_seq",
set->rq_seqcount_key);
u64_stats_init(>aborted_gstate_sync);
return 0;
}
@@ -2774,7 +2774,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set
*set)
* requested depth down, if if it too large. In that case, the set
* value will be stored in set->queue_depth.
*/
-int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
+int __blk_mq_alloc_tag_set(struct blk_mq_tag_set *set, struct lock_class_key
*key)
{
int ret;
@@ -2825,6 +2825,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->mq_map)
goto out_free_tags;
+ set->rq_seqcount_key = key;
ret = blk_mq_update_queue_map(set);
if (ret)
goto out_free_mq_map;
@@ -2846,7 +2847,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
set->tags = NULL;
return ret;
}
-EXPORT_SYMBOL(blk_mq_alloc_tag_set);
+EXPORT_SYMBOL(__blk_mq_alloc_tag_set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e3986f4b3461..1fb8bc1fdb07 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -85,6 +85,8 @@ struct blk_mq_tag_set {
struct mutextag_list_lock;
struct list_headtag_list;
+
+ struct lock_class_key *rq_seqcount_key;
};
struct blk_mq_queue_data {
@@ -201,7 +203,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct
blk_mq_tag_set *set,
int blk_mq_register_dev(struct device *, struct request_queue *);
void blk_mq_unregister_dev(struct device *, struct request_queue *);
-int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
+int __blk_mq_alloc_tag_set(struct blk_mq_tag_set *set, struct lock_class_key
*key);
+
+static inline int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
+{
+ static struct lock_class_key _seqcount_key;
+
+ return __blk_mq_alloc_tag_set(set, &_seqcount_key);
+}
+
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);[ 74.182295] WARNING: possible irq lock inversion dependency detected
[ 74.182296] 4.16.0-11766-ge241e3f2bf97 #166 Not tainted
[ 74.182297]
[ 74.182298] kworker/u64:7/175 just changed the state of lock:
[ 74.182301] 1b8db3f6 (>gstate_seq){+.+.}, at:
scsi_queue_rq+0x460/0x578
[ 74.182308] but this lock was taken by another, SOFTIRQ-safe lock in the
past:
[ 74.182310] (&(>lock)->rlock){..-.}
[ 74.182311]
and interrupts could create inverse lock ordering between them.
[ 74.182315]
other info that might help us debug this:
[ 74.182316] Possible interrupt unsafe locking scenario:
[ 74.182318]CPU0CPU1
[ 74.182319]
[ 74.182320] lock(>gstate_seq);
[ 74.182340]local_irq_disable();
[ 74.182341]lock(&(>lock)->rlock);
[ 74.182343]lock(>gstate_seq);
[ 74.182345]
[ 74.182346] lock(&(>lock)->rlock);
[ 74.182348]
*** DEADLOCK ***
[ 74.182350] 4 locks held by kworker/u64:7/175:
[ 74.182351] #0: 1751423f
((wq_completion)"%s"shost->work_q_name){+.+.}, at: process_one_work+0x1a0/0x6f0
[ 74.182359] #1: 1c880b65
((work_completion)(>scan_work)){+.+.}, at: process_one_work+0x1a0/0x6f0
[