[PATCH v3 3/4] block: drop shared-tag fairness throttling

Sumit Saxena Tue, 09 Jun 2026 21:39:59 -0700

From: Bart Van Assche <[email protected]>

Original patch [1] by Bart Van Assche; this version is rebased onto the
current tree.  In testing it improves IOPS by roughly 16-18% by removing
the fair-sharing throttle on shared tag queues.


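This patch removes the following code and structure members:
- The function hctx_may_queue().
- blk_mq_hw_ctx.nr_active and request_queue.nr_active_requests_shared_tags
  and also all the code that modifies these two member variables.

Because these counters no longer exist, hctx_active_show() in
blk-mq-debugfs.c now computes its value by walking the tags with
blk_mq_all_tag_iter() and counting the requests whose mq_hctx matches the
hardware queue.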

[1]: https://lore.kernel.org/linux-block/[email protected]/

Signed-off-by: Bart Van Assche <[email protected]>
Signed-off-by: Sumit Saxena <[email protected]>
---
 block/blk-core.c       |   2 -
 block/blk-mq-debugfs.c |  22 ++++++++-
 block/blk-mq-tag.c     |   4 --
 block/blk-mq.c         |  17 +------
 block/blk-mq.h         | 100 -----------------------------------------
 include/linux/blk-mq.h |   6 ---
 include/linux/blkdev.h |   2 -
 7 files changed, 22 insertions(+), 131 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 17450058ea6d..129acc1b27e5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -421,8 +421,6 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
 
        q->node = node_id;
 
-       atomic_set(&q->nr_active_requests_shared_tags, 0);
-
        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
        INIT_WORK(&q->timeout_work, blk_timeout_work);
        INIT_LIST_HEAD(&q->icq_list);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..8b85a7f8e987 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -468,11 +468,31 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
        return 0;
 }
 
+struct count_active_params {
+       struct blk_mq_hw_ctx    *hctx;
+       int                     *active;
+};
+
+static bool hctx_count_active(struct request *rq, void *data)
+{
+       const struct count_active_params *params = data;
+
+       if (rq->mq_hctx == params->hctx)
+               (*params->active)++;
+
+       return true;
+}
+
 static int hctx_active_show(void *data, struct seq_file *m)
 {
        struct blk_mq_hw_ctx *hctx = data;
+       int active = 0;
+       struct count_active_params params = { .hctx = hctx, .active = &active };
+
+       blk_mq_all_tag_iter(hctx->sched_tags ?: hctx->tags, hctx_count_active,
+                           &params);
 
-       seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
+       seq_printf(m, "%d\n", active);
        return 0;
 }
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..bfd27cc6249b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -109,10 +109,6 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                            struct sbitmap_queue *bt)
 {
-       if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
-                       !hctx_may_queue(data->hctx, bt))
-               return BLK_MQ_NO_TAG;
-
        if (data->shallow_depth)
                return sbitmap_queue_get_shallow(bt, data->shallow_depth);
        else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c5c16cce4f8..bbac59a06044 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -489,8 +489,6 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
                }
        } while (data->nr_tags > nr);
 
-       if (!(data->rq_flags & RQF_SCHED_TAGS))
-               blk_mq_add_active_requests(data->hctx, nr);
        /* caller already holds a reference, add for remainder */
        percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
        data->nr_tags -= nr;
@@ -587,8 +585,6 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
                goto retry;
        }
 
-       if (!(data->rq_flags & RQF_SCHED_TAGS))
-               blk_mq_inc_active_requests(data->hctx);
        rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
        blk_mq_rq_time_init(rq, alloc_time_ns);
        return rq;
@@ -763,8 +759,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        tag = blk_mq_get_tag(&data);
        if (tag == BLK_MQ_NO_TAG)
                goto out_queue_exit;
-       if (!(data.rq_flags & RQF_SCHED_TAGS))
-               blk_mq_inc_active_requests(data.hctx);
        rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
        blk_mq_rq_time_init(rq, alloc_time_ns);
        rq->__data_len = 0;
@@ -807,10 +801,8 @@ static void __blk_mq_free_request(struct request *rq)
        blk_pm_mark_last_busy(rq);
        rq->mq_hctx = NULL;
 
-       if (rq->tag != BLK_MQ_NO_TAG) {
-               blk_mq_dec_active_requests(hctx);
+       if (rq->tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->tags, ctx, rq->tag);
-       }
        if (sched_tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
        blk_mq_sched_restart(hctx);
@@ -1188,8 +1180,6 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
 {
        struct request_queue *q = hctx->queue;
 
-       blk_mq_sub_active_requests(hctx, nr_tags);
-
        blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
        percpu_ref_put_many(&q->q_usage_counter, nr_tags);
 }
@@ -1875,9 +1865,6 @@ bool __blk_mq_alloc_driver_tag(struct request *rq)
        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
                bt = &rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
-       } else {
-               if (!hctx_may_queue(rq->mq_hctx, bt))
-                       return false;
        }
 
        tag = __sbitmap_queue_get(bt);
@@ -1885,7 +1872,6 @@ bool __blk_mq_alloc_driver_tag(struct request *rq)
                return false;
 
        rq->tag = tag + tag_offset;
-       blk_mq_inc_active_requests(rq->mq_hctx);
        return true;
 }
 
@@ -4058,7 +4044,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
        if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
                goto free_hctx;
 
-       atomic_set(&hctx->nr_active, 0);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
        hctx->numa_node = node;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index aa15d31aaae9..8dfb67c55f5d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -291,70 +291,9 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
        return -1;
 }
 
-static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
-                                               int val)
-{
-       if (blk_mq_is_shared_tags(hctx->flags))
-               atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
-       else
-               atomic_add(val, &hctx->nr_active);
-}
-
-static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-       __blk_mq_add_active_requests(hctx, 1);
-}
-
-static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
-               int val)
-{
-       if (blk_mq_is_shared_tags(hctx->flags))
-               atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
-       else
-               atomic_sub(val, &hctx->nr_active);
-}
-
-static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-       __blk_mq_sub_active_requests(hctx, 1);
-}
-
-static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
-                                             int val)
-{
-       if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-               __blk_mq_add_active_requests(hctx, val);
-}
-
-static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-       if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-               __blk_mq_inc_active_requests(hctx);
-}
-
-static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
-                                             int val)
-{
-       if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-               __blk_mq_sub_active_requests(hctx, val);
-}
-
-static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-       if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
-               __blk_mq_dec_active_requests(hctx);
-}
-
-static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
-{
-       if (blk_mq_is_shared_tags(hctx->flags))
-               return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
-       return atomic_read(&hctx->nr_active);
-}
 static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
                                           struct request *rq)
 {
-       blk_mq_dec_active_requests(hctx);
        blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
        rq->tag = BLK_MQ_NO_TAG;
 }
@@ -396,45 +335,6 @@ static inline void blk_mq_free_requests(struct list_head *list)
        }
 }
 
-/*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
- */
-static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
-                                 struct sbitmap_queue *bt)
-{
-       unsigned int depth, users;
-
-       if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
-               return true;
-
-       /*
-        * Don't try dividing an ant
-        */
-       if (bt->sb.depth == 1)
-               return true;
-
-       if (blk_mq_is_shared_tags(hctx->flags)) {
-               struct request_queue *q = hctx->queue;
-
-               if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
-                       return true;
-       } else {
-               if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
-                       return true;
-       }
-
-       users = READ_ONCE(hctx->tags->active_queues);
-       if (!users)
-               return true;
-
-       /*
-        * Allow at least some tags
-        */
-       depth = max((bt->sb.depth + users - 1) / users, 4U);
-       return __blk_mq_active_requests(hctx) < depth;
-}
-
 /* run the code block in @dispatch_ops with rcu/srcu read lock held */
 #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops)        \
 do {                                                           \
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..ccbb07559402 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -432,12 +432,6 @@ struct blk_mq_hw_ctx {
        /** @queue_num: Index of this hardware queue. */
        unsigned int            queue_num;
 
-       /**
-        * @nr_active: Number of active requests. Only used when a tag set is
-        * shared across request queues.
-        */
-       atomic_t                nr_active;
-
        /** @cpuhp_online: List to store request if CPU is going to die */
        struct hlist_node       cpuhp_online;
        /** @cpuhp_dead: List to store request if some CPU die. */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..95525b1d7b74 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -567,8 +567,6 @@ struct request_queue {
        struct timer_list       timeout;
        struct work_struct      timeout_work;
 
-       atomic_t                nr_active_requests_shared_tags;
-
        struct blk_mq_tags      *sched_shared_tags;
 
        struct list_head        icq_list;
-- 
2.43.7

[PATCH v3 3/4] block: drop shared-tag fairness throttling

Reply via email to