On Thu, 2026-03-19 at 18:19 -0400, Aaron Tomlin wrote: > In high-performance storage environments, particularly when utilising > RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), > severe > latency spikes can occur when fast devices are starved of available > tags. > > This patch introduces two new debugfs attributes for each block > hardware queue: > - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag > - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag > > These files expose atomic counters that increment each time a > submitting > context is forced into an uninterruptible sleep via io_schedule() due > to > the complete exhaustion of physical driver tags or software scheduler > tags, respectively. > > To guarantee zero performance overhead for production kernels > compiled > without debugfs, the underlying atomic_t variables and their > associated > increment routines are strictly guarded behind CONFIG_BLK_DEBUG_FS. > When this configuration is disabled, the tracking logic compiles down > to a safe no-op. > > Signed-off-by: Aaron Tomlin <[email protected]> > --- > block/blk-mq-debugfs.c | 56 > ++++++++++++++++++++++++++++++++++++++++++ > block/blk-mq-debugfs.h | 7 ++++++ > block/blk-mq-tag.c | 4 +++ > include/linux/blk-mq.h | 10 ++++++++ > 4 files changed, 77 insertions(+) > > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c > index 28167c9baa55..078561d7da38 100644 > --- a/block/blk-mq-debugfs.c > +++ b/block/blk-mq-debugfs.c > @@ -483,6 +483,42 @@ static int hctx_dispatch_busy_show(void *data, > struct seq_file *m) > return 0; > } > > +/** > + * hctx_wait_on_hw_tag_show - display hardware tag starvation count > + * @data: generic pointer to the associated hardware context (hctx) > + * @m: seq_file pointer for debugfs output formatting > + * > + * Prints the cumulative number of times a submitting context was > forced > + * to block due to the exhaustion of physical hardware driver tags. > + * > + * Return: 0 on success. 
> + */ > +static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m) > +{ > + struct blk_mq_hw_ctx *hctx = data; > + > + seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_hw_tag)); > + return 0; > +} > + > +/** > + * hctx_wait_on_sched_tag_show - display scheduler tag starvation > count > + * @data: generic pointer to the associated hardware context (hctx) > + * @m: seq_file pointer for debugfs output formatting > + * > + * Prints the cumulative number of times a submitting context was > forced > + * to block due to the exhaustion of software scheduler tags. > + * > + * Return: 0 on success. > + */ > +static int hctx_wait_on_sched_tag_show(void *data, struct seq_file > *m) > +{ > + struct blk_mq_hw_ctx *hctx = data; > + > + seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_sched_tag)); > + return 0; > +} > + > #define CTX_RQ_SEQ_OPS(name, > type) \ > static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t > *pos) \ > __acquires(&ctx->lock) \ > @@ -598,6 +634,8 @@ static const struct blk_mq_debugfs_attr > blk_mq_debugfs_hctx_attrs[] = { > {"active", 0400, hctx_active_show}, > {"dispatch_busy", 0400, hctx_dispatch_busy_show}, > {"type", 0400, hctx_type_show}, > + {"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show}, > + {"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show}, > {}, > }; > > @@ -814,3 +852,21 @@ void blk_mq_debugfs_unregister_sched_hctx(struct > blk_mq_hw_ctx *hctx) > debugfs_remove_recursive(hctx->sched_debugfs_dir); > hctx->sched_debugfs_dir = NULL; > } > + > +/** > + * blk_mq_debugfs_inc_wait_tags - increment the tag starvation > counters > + * @hctx: hardware context associated with the tag allocation > + * @is_sched: boolean indicating whether the starved pool is the > software scheduler > + * > + * Evaluates the exhausted tag pool and increments the appropriate > debugfs > + * starvation counter. This is invoked immediately before the > submitting > + * context is forced into an uninterruptible sleep via > io_schedule(). 
> + */ > +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx, > + bool is_sched) > +{ > + if (is_sched) > + atomic_inc(&hctx->wait_on_sched_tag); > + else > + atomic_inc(&hctx->wait_on_hw_tag); > +} > diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h > index 49bb1aaa83dc..2cda555d5730 100644 > --- a/block/blk-mq-debugfs.h > +++ b/block/blk-mq-debugfs.h > @@ -34,6 +34,8 @@ void blk_mq_debugfs_register_sched_hctx(struct > request_queue *q, > void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx > *hctx); > > void blk_mq_debugfs_register_rq_qos(struct request_queue *q); > +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx, > + bool is_sched); > #else > static inline void blk_mq_debugfs_register(struct request_queue *q) > { > @@ -77,6 +79,11 @@ static inline void > blk_mq_debugfs_register_rq_qos(struct request_queue *q) > { > } > > +static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx > *hctx, > + bool is_sched) > +{ > +} > + > #endif > > #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c > index 66138dd043d4..3cc6a97a87a0 100644 > --- a/block/blk-mq-tag.c > +++ b/block/blk-mq-tag.c > @@ -17,6 +17,7 @@ > #include "blk.h" > #include "blk-mq.h" > #include "blk-mq-sched.h" > +#include "blk-mq-debugfs.h" > > /* > * Recalculate wakeup batch when tag is shared by hctx. 
> @@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct > blk_mq_alloc_data *data) > trace_block_rq_tag_wait(data->q, data->hctx, > data->rq_flags & > RQF_SCHED_TAGS); > > + blk_mq_debugfs_inc_wait_tags(data->hctx, > + data->rq_flags & > RQF_SCHED_TAGS); > + > bt_prev = bt; > io_schedule(); > > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index 18a2388ba581..f3d8ea93b23f 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -453,6 +453,16 @@ struct blk_mq_hw_ctx { > struct dentry *debugfs_dir; > /** @sched_debugfs_dir: debugfs directory for the > scheduler. */ > struct dentry *sched_debugfs_dir; > + /** > + * @wait_on_hw_tag: Cumulative counter incremented each time > a submitting > + * context is forced to block due to physical hardware > driver tag exhaustion. > + */ > + atomic_t wait_on_hw_tag; > + /** > + * @wait_on_sched_tag: Cumulative counter incremented each > time a submitting > + * context is forced to block due to software scheduler tag > exhaustion. > + */ > + atomic_t wait_on_sched_tag; > #endif > > /**
For [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs Tested-by: Laurence Oberman <[email protected]> Every 10.0s: grep . /sys/kernel/debug/block/nvme0n1/hctx0/wait_on_* rhel95: Fri Mar 20 11:04:15 2026 /sys/kernel/debug/block/nvme0n1/hctx0/wait_on_hw_tag:103260 <--- cumulative /sys/kernel/debug/block/nvme0n1/hctx0/wait_on_sched_tag:0 The patch to me looks good, but will need others to confirm Reviewed-by: Laurence Oberman <[email protected]>
