Ping...

Thanks,
Joseph

On 19/7/25 11:03, Joseph Qi wrote:
> Add blkio.throttle.io_service_time and blkio.throttle.io_wait_time to
> expose per-cgroup io delay statistics from the blk-throttle layer.
> io_service_time is the time spent from leaving the throttle layer to
> io completion, while io_wait_time is the time spent waiting on the
> throttle queue.
> 
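A note for anyone trying this out: both files should print in the same
blkg_rwstat format as the existing blkio.throttle.io_serviced (one
"MAJ:MIN Op VALUE" line per op type plus a Total line), with values in
nanoseconds since they are sampled via sched_clock(). For a cgroup that
has only done reads on disk 8:0, I would expect something like the
following (illustrative numbers, not from a real run):

  8:0 Read 2042069
  8:0 Write 0
  8:0 Sync 2042069
  8:0 Async 0
  8:0 Total 2042069
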
> Signed-off-by: Joseph Qi <joseph...@linux.alibaba.com>
> ---
>  block/bio.c               |   4 ++
>  block/blk-throttle.c      | 130 +++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/blk_types.h |  34 ++++++++++++
>  3 files changed, 167 insertions(+), 1 deletion(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 299a0e7..3206462 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1826,6 +1826,10 @@ void bio_endio(struct bio *bio)
>       blk_throtl_bio_endio(bio);
>       /* release cgroup info */
>       bio_uninit(bio);
> +#ifdef CONFIG_BLK_DEV_THROTTLING
> +     if (bio->bi_tg_end_io)
> +             bio->bi_tg_end_io(bio);
> +#endif
>       if (bio->bi_end_io)
>               bio->bi_end_io(bio);
>  }
> diff --git a/block/blk-throttle.c b/block/blk-throttle.c
> index 8ab6c81..a5880f0 100644
> --- a/block/blk-throttle.c
> +++ b/block/blk-throttle.c
> @@ -176,6 +176,11 @@ struct throtl_grp {
>       unsigned int bio_cnt; /* total bios */
>       unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
>       unsigned long bio_cnt_reset_time;
> +
> +     /* total time spent in lower layers: scheduler, device and others */
> +     struct blkg_rwstat service_time;
> +     /* total time spent on the throttle queue */
> +     struct blkg_rwstat wait_time;
>  };
>  
>  /* We measure latency for request size from <= 4k to >= 1M */
> @@ -487,6 +492,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
>       if (!tg)
>               return NULL;
>  
> +     if (blkg_rwstat_init(&tg->service_time, gfp) ||
> +         blkg_rwstat_init(&tg->wait_time, gfp))
> +             goto err;
> +
>       throtl_service_queue_init(&tg->service_queue);
>  
>       for (rw = READ; rw <= WRITE; rw++) {
> @@ -511,6 +520,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
>       tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
>  
>       return &tg->pd;
> +
> +err:
> +     blkg_rwstat_exit(&tg->service_time);
> +     blkg_rwstat_exit(&tg->wait_time);
> +     kfree(tg);
> +     return NULL;
>  }
>  
>  static void throtl_pd_init(struct blkg_policy_data *pd)
> @@ -592,6 +607,8 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
>  static void throtl_pd_offline(struct blkg_policy_data *pd)
>  {
>       struct throtl_grp *tg = pd_to_tg(pd);
> +     struct blkcg_gq *blkg = pd_to_blkg(pd);
> +     struct blkcg_gq *parent = blkg->parent;
>  
>       tg->bps[READ][LIMIT_LOW] = 0;
>       tg->bps[WRITE][LIMIT_LOW] = 0;
> @@ -602,6 +619,12 @@ static void throtl_pd_offline(struct blkg_policy_data *pd)
>  
>       if (!tg->td->limit_valid[tg->td->limit_index])
>               throtl_upgrade_state(tg->td);
> +     if (parent) {
> +             blkg_rwstat_add_aux(&blkg_to_tg(parent)->service_time,
> +                                 &tg->service_time);
> +             blkg_rwstat_add_aux(&blkg_to_tg(parent)->wait_time,
> +                                 &tg->wait_time);
> +     }
>  }
>  
>  static void throtl_pd_free(struct blkg_policy_data *pd)
> @@ -609,9 +632,19 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
>       struct throtl_grp *tg = pd_to_tg(pd);
>  
>       del_timer_sync(&tg->service_queue.pending_timer);
> +     blkg_rwstat_exit(&tg->service_time);
> +     blkg_rwstat_exit(&tg->wait_time);
>       kfree(tg);
>  }
>  
> +static void throtl_pd_reset(struct blkg_policy_data *pd)
> +{
> +     struct throtl_grp *tg = pd_to_tg(pd);
> +
> +     blkg_rwstat_reset(&tg->service_time);
> +     blkg_rwstat_reset(&tg->wait_time);
> +}
> +
>  static struct throtl_grp *
>  throtl_rb_first(struct throtl_service_queue *parent_sq)
>  {
> @@ -1019,6 +1052,64 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
>       return false;
>  }
>  
> +static void throtl_stats_update_completion(struct throtl_grp *tg,
> +                                        uint64_t start_time,
> +                                        uint64_t io_start_time,
> +                                        int op)
> +{
> +     unsigned long flags;
> +     uint64_t now = sched_clock();
> +
> +     local_irq_save(flags);
> +     if (time_after64(now, io_start_time))
> +             blkg_rwstat_add(&tg->service_time, op, now - io_start_time);
> +     if (time_after64(io_start_time, start_time))
> +             blkg_rwstat_add(&tg->wait_time, op, io_start_time - start_time);
> +     local_irq_restore(flags);
> +}
> +
> +static void throtl_bio_end_io(struct bio *bio)
> +{
> +     struct throtl_grp *tg;
> +
> +     rcu_read_lock();
> +     /* see comments in throtl_bio_stats_start() */
> +     if (!bio_flagged(bio, BIO_THROTL_STATED))
> +             goto out;
> +
> +     tg = (struct throtl_grp *)bio->bi_tg_private;
> +     if (!tg)
> +             goto out;
> +
> +     throtl_stats_update_completion(tg, bio_start_time_ns(bio),
> +                                    bio_io_start_time_ns(bio),
> +                                    bio_op(bio));
> +     blkg_put(tg_to_blkg(tg));
> +     bio_clear_flag(bio, BIO_THROTL_STATED);
> +out:
> +     rcu_read_unlock();
> +}
> +
> +static inline void throtl_bio_stats_start(struct bio *bio, struct throtl_grp *tg)
> +{
> +     int op = bio_op(bio);
> +
> +     /*
> +      * end_io may be called twice for stacked drivers such as
> +      * dm-thin, which save the original end_io, call their own
> +      * overriding end_io and then the saved one. Use the
> +      * BIO_THROTL_STATED flag so statistics are recorded only once.
> +      */
> +     if ((op == REQ_OP_READ || op == REQ_OP_WRITE) &&
> +         !bio_flagged(bio, BIO_THROTL_STATED)) {
> +             blkg_get(tg_to_blkg(tg));
> +             bio_set_flag(bio, BIO_THROTL_STATED);
> +             bio->bi_tg_end_io = throtl_bio_end_io;
> +             bio->bi_tg_private = tg;
> +             bio_set_start_time_ns(bio);
> +     }
> +}
> +
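Putting the three hooks together, this is my reading of the timeline
being measured (t0..t2 are labels for this sketch only):

/*
 * Per-bio timeline, all sampled with sched_clock():
 *
 *   t0 = start_time_ns     set in throtl_bio_stats_start() when the bio
 *                          first enters blk_throtl_bio()
 *   t1 = io_start_time_ns  set once the bio passes the throttle layer
 *                          without being queued -- either immediately,
 *                          or on re-entry after dispatch (see the
 *                          "if (!throttled)" hunk further down)
 *   t2 = now               sampled in throtl_bio_end_io() at completion
 *
 *   wait_time    += t1 - t0    time spent on the throttle queue
 *   service_time += t2 - t1    time spent below the throttle layer
 *
 * sched_clock() is not guaranteed monotonic across CPUs, which is
 * presumably why throtl_stats_update_completion() accounts each delta
 * only after a time_after64() check.
 */
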
>  static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
>  {
>       bool rw = bio_data_dir(bio);
> @@ -1462,6 +1553,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
>       return tg_set_conf(of, buf, nbytes, off, false);
>  }
>  
> +static u64 tg_prfill_rwstat_field(struct seq_file *sf,
> +                               struct blkg_policy_data *pd,
> +                               int off)
> +{
> +     struct throtl_grp *tg = pd_to_tg(pd);
> +     struct blkg_rwstat_sample rwstat = { };
> +
> +     blkg_rwstat_read((void *)tg + off, &rwstat);
> +     return __blkg_prfill_rwstat(sf, pd, &rwstat);
> +}
> +
> +static int tg_print_rwstat(struct seq_file *sf, void *v)
> +{
> +     blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
> +                       tg_prfill_rwstat_field, &blkcg_policy_throtl,
> +                       seq_cft(sf)->private, true);
> +     return 0;
> +}
> +
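The mechanism above is worth a note: ->private carries the byte offset
of the rwstat inside struct throtl_grp, and tg_prfill_rwstat_field()
recovers the field with pointer arithmetic, so a single callback serves
both new files. A minimal standalone illustration of the same idiom
(hypothetical struct, not from the patch):

#include <stddef.h>
#include <stdio.h>

struct stats {
	unsigned long service_time;
	unsigned long wait_time;
};

static unsigned long read_field(struct stats *s, size_t off)
{
	/* same trick as (void *)tg + off above */
	return *(unsigned long *)((char *)s + off);
}

int main(void)
{
	struct stats s = { .service_time = 123, .wait_time = 456 };

	/* prints 456 */
	printf("%lu\n", read_field(&s, offsetof(struct stats, wait_time)));
	return 0;
}
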
>  static struct cftype throtl_legacy_files[] = {
>       {
>               .name = "throttle.read_bps_device",
> @@ -1507,6 +1617,16 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
>               .private = (unsigned long)&blkcg_policy_throtl,
>               .seq_show = blkg_print_stat_ios_recursive,
>       },
> +     {
> +             .name = "throttle.io_service_time",
> +             .private = offsetof(struct throtl_grp, service_time),
> +             .seq_show = tg_print_rwstat,
> +     },
> +     {
> +             .name = "throttle.io_wait_time",
> +             .private = offsetof(struct throtl_grp, wait_time),
> +             .seq_show = tg_print_rwstat,
> +     },
>       { }     /* terminate */
>  };
>  
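As a usage sketch, the new files can then be read like any other blkio
stat file. This assumes the legacy (v1) blkio hierarchy mounted at
/sys/fs/cgroup/blkio and a hypothetical cgroup named "test"; both the
mount point and the cgroup name are illustrative:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/fs/cgroup/blkio/test/"
			"blkio.throttle.io_service_time", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* one "MAJ:MIN Op VALUE" line per op type, values in ns */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
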
> @@ -1732,6 +1852,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
>       .pd_online_fn           = throtl_pd_online,
>       .pd_offline_fn          = throtl_pd_offline,
>       .pd_free_fn             = throtl_pd_free,
> +     .pd_reset_stats_fn      = throtl_pd_reset,
>  };
>  
>  static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
> @@ -2125,7 +2246,12 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
>       WARN_ON_ONCE(!rcu_read_lock_held());
>  
>       /* see throtl_charge_bio() */
> -     if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
> +     if (bio_flagged(bio, BIO_THROTTLED))
> +             goto out;
> +
> +     throtl_bio_stats_start(bio, tg);
> +
> +     if (!tg->has_rules[rw])
>               goto out;
>  
>       spin_lock_irq(&q->queue_lock);
> @@ -2212,6 +2338,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
>  out_unlock:
>       spin_unlock_irq(&q->queue_lock);
>  out:
> +     if (!throttled)
> +             bio_set_io_start_time_ns(bio);
>       bio_set_flag(bio, BIO_THROTTLED);
>  
>  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index feff3fe..6906bc6 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -9,6 +9,7 @@
>  #include <linux/types.h>
>  #include <linux/bvec.h>
>  #include <linux/ktime.h>
> +#include <linux/sched/clock.h>
>  
>  struct bio_set;
>  struct bio;
> @@ -169,6 +170,12 @@ struct bio {
>        */
>       struct blkcg_gq         *bi_blkg;
>       struct bio_issue        bi_issue;
> +#ifdef CONFIG_BLK_DEV_THROTTLING
> +     unsigned long long      start_time_ns;  /* when passed to block throttle */
> +     unsigned long long      io_start_time_ns;       /* when no more throttle */
> +     bio_end_io_t            *bi_tg_end_io;
> +     void                    *bi_tg_private;
> +#endif
>  #endif
>       union {
>  #if defined(CONFIG_BLK_DEV_INTEGRITY)
> @@ -218,6 +225,7 @@ enum {
>                                * of this bio. */
>       BIO_QUEUE_ENTERED,      /* can use blk_queue_enter_live() */
>       BIO_TRACKED,            /* set if bio goes through the rq_qos path */
> +     BIO_THROTL_STATED,      /* bio stats already accounted */
>       BIO_FLAG_LAST
>  };
>  
> @@ -248,6 +256,32 @@ enum {
>   */
>  #define BIO_RESET_BITS       BVEC_POOL_OFFSET
>  
> +#ifdef CONFIG_BLK_DEV_THROTTLING
> +static inline void bio_set_start_time_ns(struct bio *bio)
> +{
> +     preempt_disable();
> +     bio->start_time_ns = sched_clock();
> +     preempt_enable();
> +}
> +
> +static inline void bio_set_io_start_time_ns(struct bio *bio)
> +{
> +     preempt_disable();
> +     bio->io_start_time_ns = sched_clock();
> +     preempt_enable();
> +}
> +
> +static inline uint64_t bio_start_time_ns(struct bio *bio)
> +{
> +     return bio->start_time_ns;
> +}
> +
> +static inline uint64_t bio_io_start_time_ns(struct bio *bio)
> +{
> +     return bio->io_start_time_ns;
> +}
> +#endif
> +
>  typedef __u32 __bitwise blk_mq_req_flags_t;
>  
>  /*
> 
