Please ignore patches 1-4 of this series... -George
On Fri, Apr 28, 2023 at 9:08 AM George Dunlap <[email protected]> wrote:
> The credit scheduler tries as hard as it can to ensure that it always
> runs scheduling units with positive credit (PRI_TS_UNDER) before
> running those with negative credit (PRI_TS_OVER).  If the next
> runnable scheduling unit is of priority OVER, it will always run the
> load balancer, which will scour the system looking for another
> scheduling unit of the UNDER priority.
>
> Unfortunately, as the number of cores on a system has grown, the cost
> of the work-stealing algorithm has dramatically increased; a recent
> trace on a system with 128 cores showed this taking over 50
> microseconds.
>
> Add a parameter, load_balance_ratelimit, to limit the frequency of
> load balance operations on a given pcpu.  Default this to 1
> millisecond.
>
> Invert the load balancing conditional to make it clearer, and to line
> it up more closely with the comment above it.
>
> Overall it might be cleaner to have the last_load_balance checking
> happen inside csched_load_balance(), but that would require either
> passing both now and spc into the function, or looking them up again;
> both of which seemed worse than simply checking and setting the
> values before calling it.
>
> Signed-off-by: George Dunlap <[email protected]>
> ---
>  docs/misc/xen-command-line.pandoc |  6 +++++
>  xen/common/sched/credit.c         | 40 ++++++++++++++++++++++++++-----
>  xen/include/public/sysctl.h       |  6 +++++
>  3 files changed, 46 insertions(+), 6 deletions(-)
>
> diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
> index e0b89b7d33..ae51a8cfa2 100644
> --- a/docs/misc/xen-command-line.pandoc
> +++ b/docs/misc/xen-command-line.pandoc
> @@ -1840,6 +1840,12 @@ By default, Xen will use the INVPCID instruction for TLB management if
>  it is available.  This option can be used to cause Xen to fall back to
>  older mechanisms, which are generally slower.
>
> +### load-balance-ratelimit
> +> `= <integer>`
> +
> +The minimum interval between load balancing events on a given pcpu.
> +At the moment only credit honors this parameter.
> +
>  ### noirqbalance (x86)
>  > `= <boolean>`
>
> diff --git a/xen/common/sched/credit.c b/xen/common/sched/credit.c
> index f2cd3d9da3..b8bdfd5f6a 100644
> --- a/xen/common/sched/credit.c
> +++ b/xen/common/sched/credit.c
> @@ -50,6 +50,8 @@
>  #define CSCHED_TICKS_PER_TSLICE     3
>  /* Default timeslice: 30ms */
>  #define CSCHED_DEFAULT_TSLICE_MS    30
> +/* Default load balancing ratelimit: 1ms */
> +#define CSCHED_DEFAULT_LOAD_BALANCE_RATELIMIT_US 1000
>  #define CSCHED_CREDITS_PER_MSEC     10
>  /* Never set a timer shorter than this value. */
>  #define CSCHED_MIN_TIMER            XEN_SYSCTL_SCHED_RATELIMIT_MIN
> @@ -153,6 +155,7 @@ struct csched_pcpu {
>
>      unsigned int idle_bias;
>      unsigned int nr_runnable;
> +    s_time_t last_load_balance;
>
>      unsigned int tick;
>      struct timer ticker;
> @@ -218,7 +221,7 @@ struct csched_private {
>
>      /* Period of master and tick in milliseconds */
>      unsigned int tick_period_us, ticks_per_tslice;
> -    s_time_t ratelimit, tslice, unit_migr_delay;
> +    s_time_t ratelimit, tslice, unit_migr_delay, load_balance_ratelimit;
>
>      struct list_head active_sdom;
>      uint32_t weight;
> @@ -612,6 +615,8 @@ init_pdata(struct csched_private *prv, struct csched_pcpu *spc, int cpu)
>      BUG_ON(!is_idle_unit(curr_on_cpu(cpu)));
>      cpumask_set_cpu(cpu, prv->idlers);
>      spc->nr_runnable = 0;
> +
> +    spc->last_load_balance = NOW();
>  }
>
>  static void cf_check
> @@ -1267,7 +1272,8 @@ csched_sys_cntl(const struct scheduler *ops,
>               && (params->ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
>                   || params->ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN))
>              || MICROSECS(params->ratelimit_us) > MILLISECS(params->tslice_ms)
> -            || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US )
> +            || params->vcpu_migr_delay_us > XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US
> +            || params->load_balance_ratelimit_us > XEN_SYSCTL_CSCHED_LB_RATE_MAX_US)
>              goto out;
>
>          spin_lock_irqsave(&prv->lock, flags);
> @@ -1278,6 +1284,7 @@ csched_sys_cntl(const struct scheduler *ops,
>              printk(XENLOG_INFO "Disabling context switch rate limiting\n");
>          prv->ratelimit = MICROSECS(params->ratelimit_us);
>          prv->unit_migr_delay = MICROSECS(params->vcpu_migr_delay_us);
> +        prv->load_balance_ratelimit = MICROSECS(params->load_balance_ratelimit_us);
>          spin_unlock_irqrestore(&prv->lock, flags);
>
>          /* FALLTHRU */
> @@ -1285,6 +1292,7 @@ csched_sys_cntl(const struct scheduler *ops,
>          params->tslice_ms = prv->tslice / MILLISECS(1);
>          params->ratelimit_us = prv->ratelimit / MICROSECS(1);
>          params->vcpu_migr_delay_us = prv->unit_migr_delay / MICROSECS(1);
> +        params->load_balance_ratelimit_us = prv->load_balance_ratelimit / MICROSECS(1);
>          rc = 0;
>          break;
>      }
> @@ -1676,9 +1684,17 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step)
>      return NULL;
>  }
>
> +/*
> + * Minimum delay, in microseconds, between load balance operations.
> + * This prevents spending too much time doing load balancing, particularly
> + * when the system has a high number of YIELDs due to spinlock priority inversion.
> + */
> +static unsigned int __read_mostly load_balance_ratelimit_us = CSCHED_DEFAULT_LOAD_BALANCE_RATELIMIT_US;
> +integer_param("load-balance-ratelimit", load_balance_ratelimit_us);
> +
>  static struct csched_unit *
>  csched_load_balance(struct csched_private *prv, int cpu,
> -    struct csched_unit *snext, bool *stolen)
> +                    struct csched_unit *snext, bool *stolen)
>  {
>      const struct cpupool *c = get_sched_res(cpu)->cpupool;
>      struct csched_unit *speer;
> @@ -1963,10 +1979,12 @@ static void cf_check csched_schedule(
>           * urgent work... If not, csched_load_balance() will return snext, but
>           * already removed from the runq.
>           */
> -        if ( snext->pri > CSCHED_PRI_TS_OVER )
> -            __runq_remove(snext);
> -        else
> +        if ( snext->pri <= CSCHED_PRI_TS_OVER
> +             && now - spc->last_load_balance > prv->load_balance_ratelimit) {
> +            spc->last_load_balance = now;
>              snext = csched_load_balance(prv, sched_cpu, snext, &migrated);
> +        } else
> +            __runq_remove(snext);
>
>      } while ( !unit_runnable_state(snext->unit) );
>
> @@ -2181,6 +2199,14 @@ csched_global_init(void)
>                 XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US, vcpu_migration_delay_us);
>      }
>
> +    if ( load_balance_ratelimit_us > XEN_SYSCTL_CSCHED_LB_RATE_MAX_US )
> +    {
> +        load_balance_ratelimit_us = CSCHED_DEFAULT_LOAD_BALANCE_RATELIMIT_US;
> +        printk("WARNING: load-balance-ratelimit outside of valid range [0,%d]us.\n"
> +               "Resetting to default: %u\n",
> +               XEN_SYSCTL_CSCHED_LB_RATE_MAX_US, load_balance_ratelimit_us);
> +    }
> +
>      return 0;
>  }
>
> @@ -2223,6 +2249,8 @@ csched_init(struct scheduler *ops)
>
>      prv->unit_migr_delay = MICROSECS(vcpu_migration_delay_us);
>
> +    prv->load_balance_ratelimit = MICROSECS(load_balance_ratelimit_us);
> +
>      return 0;
>  }
>
> diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
> index 2b24d6bfd0..192458d635 100644
> --- a/xen/include/public/sysctl.h
> +++ b/xen/include/public/sysctl.h
> @@ -637,6 +637,12 @@ struct xen_sysctl_credit_schedule {
>       */
>  #define XEN_SYSCTL_CSCHED_MGR_DLY_MAX_US (100 * 1000)
>      uint32_t vcpu_migr_delay_us;
> +    /*
> +     * Minimum delay, in microseconds, between load balance
> +     * operations; max 1 second.
> +     */
> +#define XEN_SYSCTL_CSCHED_LB_RATE_MAX_US (1000000)
> +    uint32_t load_balance_ratelimit_us;
>  };
>
>  struct xen_sysctl_credit2_schedule {
> --
> 2.25.1
>
>
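A quick note for anyone who wants to try this out on its own: the
hypervisor side is a plain boot parameter (the
integer_param("load-balance-ratelimit", ...) hunk above), so it can be
exercised from the Xen command line without any toolstack changes, e.g.:

    load-balance-ratelimit=500

The value is in microseconds (500 here is purely illustrative, not a
recommendation).  Per the csched_global_init() hunk, anything above
1000000 (one second) is rejected at boot and reset to the 1ms default,
and a value of 0 should effectively restore the old
always-run-the-load-balancer behaviour.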

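For readers skimming the diff, the mechanism itself is nothing more than
a timestamp comparison in front of an expensive operation.  Below is a
minimal, self-contained userspace sketch of the same pattern; the names
now_ns() and should_load_balance() are hypothetical stand-ins for Xen's
NOW() and the inlined check in csched_schedule(), and none of this is
Xen code:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    /* Stand-in for Xen's s_time_t / NOW(): monotonic time in nanoseconds. */
    typedef int64_t s_time_t;

    static s_time_t now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (s_time_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
    }

    /* 1ms budget, mirroring CSCHED_DEFAULT_LOAD_BALANCE_RATELIMIT_US. */
    static const s_time_t ratelimit_ns = 1000 * 1000;
    static s_time_t last_load_balance;

    /* Return 1, and consume the budget, only if the ratelimit has expired. */
    static int should_load_balance(s_time_t now)
    {
        if ( now - last_load_balance <= ratelimit_ns )
            return 0;
        last_load_balance = now;
        return 1;
    }

    int main(void)
    {
        int ran = 0, skipped = 0;

        last_load_balance = now_ns();
        for ( int i = 0; i < 1000000; i++ )
        {
            if ( should_load_balance(now_ns()) )
                ran++;      /* would call csched_load_balance() here */
            else
                skipped++;  /* would just __runq_remove(snext) here */
        }
        printf("ran %d, skipped %d balance attempts\n", ran, skipped);
        return 0;
    }

Note that, as in the patch, the timestamp is updated before the expensive
operation runs, so even an unsuccessful steal attempt counts against the
budget.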