On Thu, 2026-03-19 at 18:19 -0400, Aaron Tomlin wrote:
> In high-performance storage environments, particularly when utilising
> RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED),
> severe latency spikes can occur when fast devices are starved of
> available tags.
> 
> This patch introduces two new debugfs attributes for each block
> hardware queue:
>   - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
>   - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag
> 
> These files expose atomic counters that increment each time a
> submitting context is forced into an uninterruptible sleep via
> io_schedule() due to the complete exhaustion of physical driver tags
> or software scheduler tags, respectively.
> 
> To guarantee zero performance overhead for production kernels
> compiled without debugfs, the underlying atomic_t variables and their
> associated increment routines are strictly guarded behind
> CONFIG_BLK_DEBUG_FS. When this configuration is disabled, the
> tracking logic compiles down to a safe no-op.
> 
> Signed-off-by: Aaron Tomlin <[email protected]>
> ---
>  block/blk-mq-debugfs.c | 56
> ++++++++++++++++++++++++++++++++++++++++++
>  block/blk-mq-debugfs.h |  7 ++++++
>  block/blk-mq-tag.c     |  4 +++
>  include/linux/blk-mq.h | 10 ++++++++
>  4 files changed, 77 insertions(+)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 28167c9baa55..078561d7da38 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -483,6 +483,42 @@ static int hctx_dispatch_busy_show(void *data,
> struct seq_file *m)
>       return 0;
>  }
>  
> +/**
> + * hctx_wait_on_hw_tag_show - display hardware tag starvation count
> + * @data: generic pointer to the associated hardware context (hctx)
> + * @m: seq_file pointer for debugfs output formatting
> + *
> + * Prints the cumulative number of times a submitting context was
> forced
> + * to block due to the exhaustion of physical hardware driver tags.
> + *
> + * Return: 0 on success.
> + */
> +static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
> +{
> +     struct blk_mq_hw_ctx *hctx = data;
> +
> +     seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_hw_tag));
> +     return 0;
> +}
> +
> +/**
> + * hctx_wait_on_sched_tag_show - display scheduler tag starvation
> count
> + * @data: generic pointer to the associated hardware context (hctx)
> + * @m: seq_file pointer for debugfs output formatting
> + *
> + * Prints the cumulative number of times a submitting context was
> forced
> + * to block due to the exhaustion of software scheduler tags.
> + *
> + * Return: 0 on success.
> + */
> +static int hctx_wait_on_sched_tag_show(void *data, struct seq_file
> *m)
> +{
> +     struct blk_mq_hw_ctx *hctx = data;
> +
> +     seq_printf(m, "%d\n", atomic_read(&hctx-
> >wait_on_sched_tag));
> +     return 0;
> +}
> +
>  #define CTX_RQ_SEQ_OPS(name,
> type)                                 \
>  static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t
> *pos) \
>       __acquires(&ctx-
> >lock)                                                \
> @@ -598,6 +634,8 @@ static const struct blk_mq_debugfs_attr
> blk_mq_debugfs_hctx_attrs[] = {
>       {"active", 0400, hctx_active_show},
>       {"dispatch_busy", 0400, hctx_dispatch_busy_show},
>       {"type", 0400, hctx_type_show},
> +     {"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
> +     {"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
>       {},
>  };
>  
> @@ -814,3 +852,21 @@ void blk_mq_debugfs_unregister_sched_hctx(struct
> blk_mq_hw_ctx *hctx)
>       debugfs_remove_recursive(hctx->sched_debugfs_dir);
>       hctx->sched_debugfs_dir = NULL;
>  }
> +
> +/**
> + * blk_mq_debugfs_inc_wait_tags - increment the tag starvation
> counters
> + * @hctx: hardware context associated with the tag allocation
> + * @is_sched: boolean indicating whether the starved pool is the
> software scheduler
> + *
> + * Evaluates the exhausted tag pool and increments the appropriate
> debugfs
> + * starvation counter. This is invoked immediately before the
> submitting
> + * context is forced into an uninterruptible sleep via
> io_schedule().
> + */
> +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
> +                               bool is_sched)
> +{
> +     if (is_sched)
> +             atomic_inc(&hctx->wait_on_sched_tag);
> +     else
> +             atomic_inc(&hctx->wait_on_hw_tag);
> +}
> diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
> index 49bb1aaa83dc..2cda555d5730 100644
> --- a/block/blk-mq-debugfs.h
> +++ b/block/blk-mq-debugfs.h
> @@ -34,6 +34,8 @@ void blk_mq_debugfs_register_sched_hctx(struct
> request_queue *q,
>  void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx
> *hctx);
>  
>  void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
> +void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
> +                               bool is_sched);
>  #else
>  static inline void blk_mq_debugfs_register(struct request_queue *q)
>  {
> @@ -77,6 +79,11 @@ static inline void
> blk_mq_debugfs_register_rq_qos(struct request_queue *q)
>  {
>  }
>  
> +static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx
> *hctx,
> +                                             bool is_sched)
> +{
> +}
> +
>  #endif
>  
>  #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 66138dd043d4..3cc6a97a87a0 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -17,6 +17,7 @@
>  #include "blk.h"
>  #include "blk-mq.h"
>  #include "blk-mq-sched.h"
> +#include "blk-mq-debugfs.h"
>  
>  /*
>   * Recalculate wakeup batch when tag is shared by hctx.
> @@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct
> blk_mq_alloc_data *data)
>               trace_block_rq_tag_wait(data->q, data->hctx,
>                                       data->rq_flags &
> RQF_SCHED_TAGS);
>  
> +             blk_mq_debugfs_inc_wait_tags(data->hctx,
> +                                          data->rq_flags &
> RQF_SCHED_TAGS);
> +
>               bt_prev = bt;
>               io_schedule();
>  
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 18a2388ba581..f3d8ea93b23f 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -453,6 +453,16 @@ struct blk_mq_hw_ctx {
>       struct dentry           *debugfs_dir;
>       /** @sched_debugfs_dir: debugfs directory for the
> scheduler. */
>       struct dentry           *sched_debugfs_dir;
> +     /**
> +      * @wait_on_hw_tag: Cumulative counter incremented each time
> a submitting
> +      * context is forced to block due to physical hardware
> driver tag exhaustion.
> +      */
> +     atomic_t                wait_on_hw_tag;
> +     /**
> +      * @wait_on_sched_tag: Cumulative counter incremented each
> time a submitting
> +      * context is forced to block due to software scheduler tag
> exhaustion.
> +      */
> +     atomic_t                wait_on_sched_tag;
>  #endif
>  
>       /**

For [PATCH v3 2/2] blk-mq: expose tag starvation counts via debugfs

Tested-by: Laurence Oberman <[email protected]>

Every 10.0s: grep . /sys/kernel/debug/block/nvme0n1/hctx0/wait_on_*   
rhel95: Fri Mar 20 11:04:15 2026

/sys/kernel/debug/block/nvme0n1/hctx0/wait_on_hw_tag:103260  <--- cumulative
/sys/kernel/debug/block/nvme0n1/hctx0/wait_on_sched_tag:0

The patch looks good to me, but will need others to confirm
Reviewed-by: Laurence Oberman <[email protected]>


Reply via email to