On 12/20/18 11:21 AM, Jens Axboe wrote:
> On 12/20/18 11:01 AM, Bart Van Assche wrote:
>> On Thu, 2018-12-20 at 06:07 -0700, Jens Axboe wrote:
>>> On 12/20/18 6:02 AM, Jens Axboe wrote:
>>>>> I'm afraid this cannot work.
>>>>>
>>>>> The 'tags' here could be the hctx->sched_tags, but what we need to
>>>>> clear is hctx->tags->rqs[].
>>>>
>>>> You are right, of course, a bit too quick on the trigger. This one
>>>> should work better, and also avoids that silly quadratic loop. I don't
>>>> think we need the tag == -1 check, but probably best to be safe.
>>>
>>> Sent out the wrong one; here it is. Bart, if you can reproduce, can you
>>> give it a whirl?
>>>
>>>
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 2de972857496..fc04bb648f18 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -2025,7 +2025,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, 
>>> struct blk_mq_tags *tags,
>>>  {
>>>     struct page *page;
>>>  
>>> -   if (tags->rqs && set->ops->exit_request) {
>>> +   if (tags->rqs) {
>>>             int i;
>>>  
>>>             for (i = 0; i < tags->nr_tags; i++) {
>>> @@ -2033,8 +2033,14 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, 
>>> struct blk_mq_tags *tags,
>>>  
>>>                     if (!rq)
>>>                             continue;
>>> -                   set->ops->exit_request(set, rq, hctx_idx);
>>> +                   if (set->ops->exit_request)
>>> +                           set->ops->exit_request(set, rq, hctx_idx);
>>>                     tags->static_rqs[i] = NULL;
>>> +
>>> +                   if (rq->tag == -1)
>>> +                           continue;
>>> +                   if (set->tags[hctx_idx]->rqs[rq->tag] == rq)
>>> +                           set->tags[hctx_idx]->rqs[rq->tag] = NULL;
>>>             }
>>>     }
>>>  
>>> @@ -2113,6 +2119,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set 
>>> *set, struct request *rq,
>>>                     return ret;
>>>     }
>>>  
>>> +   rq->tag = -1;
>>>     WRITE_ONCE(rq->state, MQ_RQ_IDLE);
>>>     return 0;
>>>  }
>>
>> Hi Jens,
>>
>> Are you sure this is sufficient?
> 
> No, I don't think it is.
> 
>> My understanding is that
>> blk_mq_queue_tag_busy_iter() iterates over all tags in the tag set. So if the
>> request queue on which part_in_flight() is called and the request queue for
>> which blk_mq_free_rqs() is called share their tag set then part_in_flight()
>> and blk_mq_free_rqs() can run concurrently. That can cause ugly race
>> conditions. Do you think it would be a good idea to modify the inflight
>> accounting code such that it only considers the requests of a single request
>> queue instead of all requests for a given tag set?
> 
> That would of course solve it; the question is how to do it. One option
> would be to have ->rqs[] be:
> 
> struct rq_entry {
>       struct request_queue *q;
>       struct request *rq;
> };
> 
> instead of just a request, since then you could check the queue without
> having to dereference the request. The current race is inherent in that
> we set ->rqs[] AFTER having acquired the tag, so there's a window where
> you could find a stale entry. That's not normally an issue since
> requests are persistent, but for shared tag maps and queues disappearing
> it can pose a problem.

Something like this, totally untested. If the queue matches, we know it's
safe to dereference it.

A safer API to export as well...


diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..78192b544fa2 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int 
bitnr, void *data)
 
        if (!reserved)
                bitnr += tags->nr_reserved_tags;
-       rq = tags->rqs[bitnr];
+       if (tags->rqs[bitnr].queue != hctx->queue)
+               return true;
 
        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
-       if (rq && rq->q == hctx->queue)
+       rq = tags->rqs[bitnr].rq;
+       if (rq)
                return iter_data->fn(hctx, rq, iter_data->data, reserved);
        return true;
 }
@@ -268,6 +270,7 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct 
sbitmap_queue *bt,
 
 struct bt_tags_iter_data {
        struct blk_mq_tags *tags;
+       struct blk_mq_hw_ctx *hctx;
        busy_tag_iter_fn *fn;
        void *data;
        bool reserved;
@@ -287,7 +290,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned 
int bitnr, void *data)
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
-       rq = tags->rqs[bitnr];
+       rq = tags->rqs[bitnr].rq;
        if (rq && blk_mq_request_started(rq))
                return iter_data->fn(rq, iter_data->data, reserved);
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..bb84d1f099c7 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@
 
 #include "blk-mq.h"
 
+struct rq_tag_entry {
+       struct request_queue *queue;
+       struct request *rq;
+};
+
 /*
  * Tag address space map.
  */
@@ -16,7 +21,7 @@ struct blk_mq_tags {
        struct sbitmap_queue bitmap_tags;
        struct sbitmap_queue breserved_tags;
 
-       struct request **rqs;
+       struct rq_tag_entry *rqs;
        struct request **static_rqs;
        struct list_head page_list;
 };
@@ -78,7 +83,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
                unsigned int tag, struct request *rq)
 {
-       hctx->tags->rqs[tag] = rq;
+       hctx->tags->rqs[tag].queue = hctx->queue;
+       hctx->tags->rqs[tag].rq = rq;
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ba37b9e15e9..4f194946dbd9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct 
blk_mq_alloc_data *data,
                rq->tag = -1;
                rq->internal_tag = tag;
        } else {
-               if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+               struct blk_mq_hw_ctx *hctx = data->hctx;
+
+               if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
                        rq_flags = RQF_MQ_INFLIGHT;
-                       atomic_inc(&data->hctx->nr_active);
+                       atomic_inc(&hctx->nr_active);
                }
                rq->tag = tag;
                rq->internal_tag = -1;
-               data->hctx->tags->rqs[rq->tag] = rq;
+               hctx->tags->rqs[rq->tag].queue = hctx->queue;
+               hctx->tags->rqs[rq->tag].rq = rq;
        }
 
        /* csd/requeue_work/fifo_time is initialized before use */
@@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
        if (tag < tags->nr_tags) {
-               prefetch(tags->rqs[tag]);
-               return tags->rqs[tag];
+               prefetch(tags->rqs[tag].rq);
+               return tags->rqs[tag].rq;
        }
 
        return NULL;
@@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx 
*hctx, struct request *rq,
                               void *priv, bool reserved)
 {
        /*
-        * If we find a request that is inflight and the queue matches,
-        * we know the queue is busy. Return false to stop the iteration.
+        * We're only called here if the queue matches. If the rq state is
+        * inflight, we know the queue is busy. Return false to stop the
+        * iteration.
         */
-       if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+       if (rq->state == MQ_RQ_IN_FLIGHT) {
                bool *busy = priv;
 
                *busy = true;
@@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
        shared = blk_mq_tag_busy(data.hctx);
        rq->tag = blk_mq_get_tag(&data);
        if (rq->tag >= 0) {
+               struct blk_mq_hw_ctx *hctx = data.hctx;
+
                if (shared) {
                        rq->rq_flags |= RQF_MQ_INFLIGHT;
                        atomic_inc(&data.hctx->nr_active);
                }
-               data.hctx->tags->rqs[rq->tag] = rq;
+               hctx->tags->rqs[rq->tag].queue = hctx->queue;
+               hctx->tags->rqs[rq->tag].rq = rq;
        }
 
 done:
@@ -2069,7 +2076,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct 
blk_mq_tag_set *set,
        if (!tags)
                return NULL;
 
-       tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+       tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 node);
        if (!tags->rqs) {

-- 
Jens Axboe

Reply via email to