On Fri, Apr 18, 2025 at 9:17 AM Yu Kuai <yuku...@huaweicloud.com> wrote:
>
> From: Yu Kuai <yuku...@huawei.com>
>
> Currently if sync speed is above speed_min and below speed_max,
> md_do_sync() will wait for all sync IOs to be done before issuing new
> sync IO, meaning sync IO depth is limited to just 1.
>
> This limit is too low. In order to prevent sync speed from dropping
> conspicuously after is_mddev_idle() is fixed in the next patch, add a
> new API for limiting sync IO depth; the default value is 32.
>
> Signed-off-by: Yu Kuai <yuku...@huawei.com>
> ---
>  drivers/md/md.c | 109 +++++++++++++++++++++++++++++++++++++++---------
>  drivers/md/md.h |   1 +
>  2 files changed, 91 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 438e71e45c16..52cadfce7e8d 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
>  /* Default safemode delay: 200 msec */
>  #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
>  /*
> - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
> - * is 1000 KB/sec, so the extra system load does not show up that much.
> - * Increase it if you want to have more _guaranteed_ speed. Note that
> - * the RAID driver will use the maximum available bandwidth if the IO
> - * subsystem is idle. There is also an 'absolute maximum' reconstruction
> - * speed limit - in case reconstruction slows down your system despite
> - * idle IO detection.
> + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
> + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
> + * does not show up that much. Increase it if you want to have more guaranteed
> + * speed. Note that the RAID driver will use the maximum bandwidth
> + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
>   *
> - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
> - * or /sys/block/mdX/md/sync_speed_{min,max}
> + * Background sync IO speed control:
> + *
> + * - below speed min:
> + *   no limit;
> + * - above speed min and below speed max:
> + *   a) if mddev is idle, then no limit;
> + *   b) if mddev is busy handling normal IO, then limit inflight sync IO
> + *      to sync_io_depth;
> + * - above speed max:
> + *   sync IO can't be issued;
> + *
> + * Following configurations can be changed via /proc/sys/dev/raid/ for system
> + * or /sys/block/mdX/md/ for one array.
>   */
> -
>  static int sysctl_speed_limit_min = 1000;
>  static int sysctl_speed_limit_max = 200000;
> -static inline int speed_min(struct mddev *mddev)
> +static int sysctl_sync_io_depth = 32;
> +
> +static int speed_min(struct mddev *mddev)
>  {
>         return mddev->sync_speed_min ?
>                 mddev->sync_speed_min : sysctl_speed_limit_min;
>  }
>
> -static inline int speed_max(struct mddev *mddev)
> +static int speed_max(struct mddev *mddev)
>  {
>         return mddev->sync_speed_max ?
>                 mddev->sync_speed_max : sysctl_speed_limit_max;
>  }
>
> +static int sync_io_depth(struct mddev *mddev)
> +{
> +       return mddev->sync_io_depth ?
> +               mddev->sync_io_depth : sysctl_sync_io_depth;
> +}
> +
>  static void rdev_uninit_serial(struct md_rdev *rdev)
>  {
>         if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
> @@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
>                 .procname       = "speed_limit_min",
>                 .data           = &sysctl_speed_limit_min,
>                 .maxlen         = sizeof(int),
> -               .mode           = S_IRUGO|S_IWUSR,
> +               .mode           = 0644,
>                 .proc_handler   = proc_dointvec,
>         },
>         {
>                 .procname       = "speed_limit_max",
>                 .data           = &sysctl_speed_limit_max,
>                 .maxlen         = sizeof(int),
> -               .mode           = S_IRUGO|S_IWUSR,
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec,
> +       },
> +       {
> +               .procname       = "sync_io_depth",
> +               .data           = &sysctl_sync_io_depth,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
>                 .proc_handler   = proc_dointvec,
>         },
>  };
> @@ -5091,7 +5114,7 @@ static ssize_t
>  sync_min_show(struct mddev *mddev, char *page)
>  {
>         return sprintf(page, "%d (%s)\n", speed_min(mddev),
> -                      mddev->sync_speed_min ? "local": "system");
> +                      mddev->sync_speed_min ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5100,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
>         unsigned int min;
>         int rv;
>
> -       if (strncmp(buf, "system", 6)==0) {
> +       if (strncmp(buf, "system", 6) == 0) {
>                 min = 0;
>         } else {
>                 rv = kstrtouint(buf, 10, &min);
> @@ -5120,7 +5143,7 @@ static ssize_t
>  sync_max_show(struct mddev *mddev, char *page)
>  {
>         return sprintf(page, "%d (%s)\n", speed_max(mddev),
> -                      mddev->sync_speed_max ? "local": "system");
> +                      mddev->sync_speed_max ? "local" : "system");
>  }
>
>  static ssize_t
> @@ -5129,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>         unsigned int max;
>         int rv;
>
> -       if (strncmp(buf, "system", 6)==0) {
> +       if (strncmp(buf, "system", 6) == 0) {
>                 max = 0;
>         } else {
>                 rv = kstrtouint(buf, 10, &max);
> @@ -5145,6 +5168,35 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
>  static struct md_sysfs_entry md_sync_max =
>  __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
>
> +static ssize_t
> +sync_io_depth_show(struct mddev *mddev, char *page)
> +{
> +       return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
> +                      mddev->sync_io_depth ? "local" : "system");
> +}
> +
> +static ssize_t
> +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
> +{
> +       unsigned int max;
> +       int rv;
> +
> +       if (strncmp(buf, "system", 6) == 0) {
> +               max = 0;
> +       } else {
> +               rv = kstrtouint(buf, 10, &max);
> +               if (rv < 0)
> +                       return rv;
> +               if (max == 0)
> +                       return -EINVAL;
> +       }
> +       mddev->sync_io_depth = max;
> +       return len;
> +}
> +
> +static struct md_sysfs_entry md_sync_io_depth =
> +__ATTR_RW(sync_io_depth);
> +
>  static ssize_t
>  degraded_show(struct mddev *mddev, char *page)
>  {
> @@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
>         &md_mismatches.attr,
>         &md_sync_min.attr,
>         &md_sync_max.attr,
> +       &md_sync_io_depth.attr,
>         &md_sync_speed.attr,
>         &md_sync_force_parallel.attr,
>         &md_sync_completed.attr,
> @@ -8927,6 +8980,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
>         }
>  }
>
> +static bool sync_io_within_limit(struct mddev *mddev)
> +{
> +       int io_sectors;
> +
> +       /*
> +        * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
> +        * RESYNC_PAGES(64k) per IO.
> +        */
> +       if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
> +               io_sectors = 8;
> +       else
> +               io_sectors = 128;
> +
> +       return atomic_read(&mddev->recovery_active) <
> +              io_sectors * sync_io_depth(mddev);
> +}
> +
>  #define SYNC_MARKS     10
>  #define SYNC_MARK_STEP (3*HZ)
>  #define UPDATE_FREQUENCY (5*60*HZ)
> @@ -9195,7 +9265,8 @@ void md_do_sync(struct md_thread *thread)
>                         msleep(500);
>                         goto repeat;
>                 }
> -               if (!is_mddev_idle(mddev, 0)) {
> +               if (!sync_io_within_limit(mddev) &&
> +                   !is_mddev_idle(mddev, 0)) {
>                         /*
>                          * Give other IO more of a chance.
>                          * The faster the devices, the less we wait.
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 9d55b4630077..b57842188f18 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -484,6 +484,7 @@ struct mddev {
>         /* if zero, use the system-wide default */
>         int sync_speed_min;
>         int sync_speed_max;
> +       int sync_io_depth;
>
>         /* resync even though the same disks are shared among md-devices */
>         int parallel_resync;
> --
> 2.39.2
>
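One note for readers following along, since the units are easy to trip over:
recovery_active counts sectors, so sync_io_within_limit() compares it against
the per-IO size in sectors multiplied by the configured depth. Below is a
minimal userspace sketch of just that comparison; this is my own illustration,
not the kernel code, and the helper names are made up. Only the 8/128 sector
sizes and the default depth of 32 come from the patch.

#include <stdio.h>

/*
 * Sketch of the gating in sync_io_within_limit(): one sync IO is
 * 8 sectors (a 4k stripe) for raid4/5/6 and 128 sectors (64k,
 * RESYNC_PAGES) for other levels.
 */
static int io_sectors_per_sync(int level)
{
        return (level == 4 || level == 5 || level == 6) ? 8 : 128;
}

/* Sync IO may be issued while inflight sectors stay below the cap. */
static int within_limit(int level, int recovery_active_sectors, int depth)
{
        return recovery_active_sectors < io_sectors_per_sync(level) * depth;
}

int main(void)
{
        /* Default depth 32: raid5 caps at 32 * 8 = 256 sectors (128k),
         * raid1 at 32 * 128 = 4096 sectors (2M). */
        printf("raid5 inflight cap: %d sectors\n", io_sectors_per_sync(5) * 32);
        printf("raid1 inflight cap: %d sectors\n", io_sectors_per_sync(1) * 32);
        printf("raid5, 200 sectors active: %s\n",
               within_limit(5, 200, 32) ? "can issue" : "must wait");
        return 0;
}

So with the default depth of 32, a busy raid5 array keeps at most 128k of sync
IO inflight while raid1/raid10 keep up to 2M, rather than the single inflight
IO we have today.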
Looks good to me.

Reviewed-by: Xiao Ni <x...@redhat.com>
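P.S. for anyone testing this: per the show/store handlers above, the new knob
reads back as e.g. "32 (system)" or "64 (local)", writing the string "system"
reverts an array to the sysctl default, and writing 0 is rejected with
-EINVAL. Assuming md0 as an example array:

        cat /proc/sys/dev/raid/sync_io_depth
        echo 64 > /sys/block/md0/md/sync_io_depth       # per-array override
        echo system > /sys/block/md0/md/sync_io_depth   # back to sysctl value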