In high-performance storage environments, particularly when utilising RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe latency spikes can occur when fast devices are starved of available tags.

Introduce two new debugfs attributes for each block hardware queue:

 - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
 - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag

These files expose atomic counters that increment each time a submitting
context is forced into an uninterruptible sleep via io_schedule() due to
the complete exhaustion of physical driver tags or software scheduler
tags, respectively.

To guarantee zero performance overhead for production kernels compiled
without debugfs, the underlying atomic_t variables and their associated
increment routines are strictly guarded behind CONFIG_BLK_DEBUG_FS.
When this configuration is disabled, the tracking logic compiles down
to a safe no-op.

Signed-off-by: Aaron Tomlin <[email protected]>
---
 block/blk-mq-debugfs.c | 56 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.h |  7 ++++++
 block/blk-mq-tag.c     |  4 +++
 include/linux/blk-mq.h | 10 ++++++++
 4 files changed, 77 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 28167c9baa55..078561d7da38 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -483,6 +483,42 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+/**
+ * hctx_wait_on_hw_tag_show - display hardware tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of physical hardware driver tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_hw_tag));
+	return 0;
+}
+
+/**
+ * hctx_wait_on_sched_tag_show - display scheduler tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of software scheduler tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	seq_printf(m, "%d\n", atomic_read(&hctx->wait_on_sched_tag));
+	return 0;
+}
+
 #define CTX_RQ_SEQ_OPS(name, type)					\
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
 	__acquires(&ctx->lock)						\
@@ -598,6 +634,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"active", 0400, hctx_active_show},
 	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
 	{"type", 0400, hctx_type_show},
+	{"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
+	{"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
 	{},
 };
 
@@ -814,3 +852,21 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 	debugfs_remove_recursive(hctx->sched_debugfs_dir);
 	hctx->sched_debugfs_dir = NULL;
 }
+
+/**
+ * blk_mq_debugfs_inc_wait_tags - increment the tag starvation counters
+ * @hctx: hardware context associated with the tag allocation
+ * @is_sched: true if the starved pool is the software scheduler's
+ *
+ * Evaluates the exhausted tag pool and increments the appropriate debugfs
+ * starvation counter. This is invoked immediately before the submitting
+ * context is forced into an uninterruptible sleep via io_schedule().
+ */
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched)
+{
+	if (is_sched)
+		atomic_inc(&hctx->wait_on_sched_tag);
+	else
+		atomic_inc(&hctx->wait_on_hw_tag);
+}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 49bb1aaa83dc..2cda555d5730 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -34,6 +34,8 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 					struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+				  bool is_sched);
 #else
 static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
@@ -77,6 +79,11 @@ static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
 {
 }
 
+static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+						bool is_sched)
+{
+}
+
 #endif
 
 #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 66138dd043d4..3cc6a97a87a0 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -17,6 +17,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
 
 /*
  * Recalculate wakeup batch when tag is shared by hctx.
@@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		trace_block_rq_tag_wait(data->q, data->hctx,
 					data->rq_flags & RQF_SCHED_TAGS);
 
+		blk_mq_debugfs_inc_wait_tags(data->hctx,
+					     data->rq_flags & RQF_SCHED_TAGS);
+
 		bt_prev = bt;
 		io_schedule();
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..f3d8ea93b23f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -453,6 +453,16 @@ struct blk_mq_hw_ctx {
 	struct dentry		*debugfs_dir;
 	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
 	struct dentry		*sched_debugfs_dir;
+	/**
+	 * @wait_on_hw_tag: Cumulative counter incremented each time a submitting
+	 * context is forced to block due to physical hardware driver tag exhaustion.
+	 */
+	atomic_t		wait_on_hw_tag;
+	/**
+	 * @wait_on_sched_tag: Cumulative counter incremented each time a submitting
+	 * context is forced to block due to software scheduler tag exhaustion.
+	 */
+	atomic_t		wait_on_sched_tag;
 #endif
 
 	/**
-- 
2.51.0
