Ack. Let's give this a try. It might solve the problems we have with the default affinity algo. A few inline notes and small userspace sketches below, mostly spelling out my own reading of the patch.
On Fri, Feb 7, 2025 at 5:11 PM Liu Kui <kui....@virtuozzo.com> wrote:
>
> Currently, rpc work is scheduled to run on the sender's cpu with the
> default rpc affinity mode. However, this mode has a serious problem
> with certain workloads: the majority of rpc work ends up running on
> just one or two cpus even though the rest of the cpus are idle,
> resulting in a significant drop in performance. Worse, once rpc work
> concentrates on a few cpus, it can no longer escape from that state.
>
> The newly added mode tries to prevent this concentration by capping
> the number of rpcs whose work can be assigned to each cpu, while
> still preferring affinity to the sender's cpu. Initial tests show a
> quite significant performance improvement for some workloads, but
> also degradation for some others, so a comprehensive test is still
> needed to compare the pros and cons.
>
> Related to #VSTOR-99387
> Signed-off-by: Liu Kui <kui....@virtuozzo.com>
> ---
>  fs/fuse/kio/pcs/pcs_rpc.c | 96 +++++++++++++++++++++++++++++++++++++++
>  fs/fuse/kio/pcs/pcs_rpc.h |  7 +++
>  2 files changed, 103 insertions(+)
>
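The capping rule itself sounds reasonable. Just to spell out what the cap from
pcs_rpc_cpu_select() means in practice, here is a throwaway userspace sketch of
the formula; the nr_cpu_ids and nrpcs numbers below are made-up examples, not
measurements from any real setup:

#include <stdio.h>

/* Userspace sketch of the cap computed in pcs_rpc_cpu_select():
 *     max_rpc_per_cpu = eng->nrpcs / nr_cpu_ids + rpc_cpu_nr_base
 * All numbers here are illustrative only.
 */
int main(void)
{
        unsigned int rpc_cpu_nr_base = 2;      /* default from the patch */
        unsigned int nr_cpu_ids = 16;          /* hypothetical machine */
        unsigned int nrpcs[] = { 4, 64, 256 }; /* hypothetical endpoint counts */

        for (unsigned int i = 0; i < sizeof(nrpcs) / sizeof(nrpcs[0]); i++) {
                unsigned int cap = nrpcs[i] / nr_cpu_ids + rpc_cpu_nr_base;

                printf("nrpcs=%3u -> at most %u rpcs bound to each cpu\n",
                       nrpcs[i], cap);
        }
        return 0;
}

So with only a handful of endpoints the cap degenerates to rpc_cpu_nr_base, and
the nrpcs/nr_cpu_ids term only starts to matter once the number of endpoints
clearly exceeds the cpu count.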
> diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c
> index e74448f2074a..b9774ce1ab34 100644
> --- a/fs/fuse/kio/pcs/pcs_rpc.c
> +++ b/fs/fuse/kio/pcs/pcs_rpc.c
> @@ -44,8 +44,18 @@ static unsigned long rpc_cpu_time_slice = PCS_RPC_CPU_SLICE;
>  module_param(rpc_cpu_time_slice, ulong, 0644);
>  MODULE_PARM_DESC(rpc_cpu_time_slice, "Time slice for RPC rebinding");
>
> +static unsigned long rpc_cpu_timeout = PCS_RPC_CPU_TIMEOUT; // 500 ms
> +module_param(rpc_cpu_timeout, ulong, 0644);
> +MODULE_PARM_DESC(rpc_cpu_timeout, "Timeout for RPC cpu binding after becoming idle");
> +
> +static unsigned int rpc_cpu_nr_base = 2;
> +module_param(rpc_cpu_nr_base, uint, 0644);
> +MODULE_PARM_DESC(rpc_cpu_nr_base, "Minimum cap on the number of rpcs per cpu");
> +
>  DECLARE_WAIT_QUEUE_HEAD(pcs_waitq);
>
> +static DEFINE_PER_CPU(struct pcs_rpc_cpu, rpc_cpu) = { .nr_attached = ATOMIC_INIT(0) };
> +
>  static void timer_work(struct work_struct *w);
>  static int rpc_gc_classify(struct pcs_rpc * ep);
>
> @@ -360,6 +370,7 @@ static void pcs_rpc_destroy(struct pcs_rpc *ep)
>
>      cancel_delayed_work_sync(&ep->calendar_work);
>      flush_work(&ep->work);
> +    flush_delayed_work(&ep->cpu_timer_work);
>
>      /* pcs_free(ep->sun); */
>      /* ep->sun = NULL; */
> @@ -789,6 +800,61 @@ static int pcs_rpc_cpu_next(void)
>      return new;
>  }
>
> +static void pcs_rpc_cpu_select(struct pcs_rpc *ep)
> +{
> +    struct pcs_rpc_cpu *prc;
> +    int cpu, node, max_rpc_per_cpu;
> +
> +    if (ep->cpu != WORK_CPU_UNBOUND)
> +        atomic_dec_if_positive(&per_cpu_ptr(&rpc_cpu, ep->cpu)->nr_attached);
> +
> +    /*
> +     * Lock protection for reading eng->nrpcs is unnecessary, as
> +     * we just need to derive a rough value.
> +     */
> +    max_rpc_per_cpu = ep->eng->nrpcs / nr_cpu_ids + rpc_cpu_nr_base;
> +
> +    /* Check the current cpu first. */
> +    cpu = smp_processor_id();
> +    prc = per_cpu_ptr(&rpc_cpu, cpu);
> +    if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
> +        goto found;
> +
> +    /* Try to find a cpu from the same numa node. */
> +    node = cpu_to_node(cpu);
> +    cpu = cpumask_first_and(cpumask_of_node(node), cpu_online_mask);
> +    while (cpu < nr_cpu_ids) {
> +        prc = per_cpu_ptr(&rpc_cpu, cpu);
> +        if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
> +            goto found;
> +        cpu = cpumask_next_and(cpu, cpumask_of_node(node), cpu_online_mask);
> +    }
> +
> +    /*
> +     * Otherwise, search all online cpus. This is a bit inefficient,
> +     * but we don't expect this function to be called frequently on a
> +     * performance-critical path, so simplicity is preferred.
> +     */
> +    for_each_online_cpu(cpu) {
> +        prc = per_cpu_ptr(&rpc_cpu, cpu);
> +        if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
> +            goto found;
> +    }
> +
> +    // Should not reach here
> +    WARN_ONCE(1, "Failed to find a cpu for pcs_rpc work");
> +    ep->cpu = WORK_CPU_UNBOUND;
> +
> +    return;
> +
> +found:
> +    atomic_inc(&prc->nr_attached);
> +    ep->cpu = cpu;
> +    ep->cpu_stamp = jiffies + rpc_cpu_time_slice;
> +    if (unlikely(!timer_pending(&ep->cpu_timer_work.timer)))
> +        mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, rpc_cpu_timeout);
> +}
> +
>  static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
>  {
>      switch(rpc_affinity_mode) {
> @@ -814,6 +880,10 @@ static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
>              ep->cpu = pcs_rpc_cpu_next();
>          }
>          break;
> +    case RPC_AFFINITY_FAIR_SPREAD:
> +        if (time_is_before_jiffies(ep->cpu_stamp) && was_idle)
> +            pcs_rpc_cpu_select(ep);
> +        break;
>      default:
>          pr_err("Unknown affinity mode: %u\n", rpc_affinity_mode);
>      }
> @@ -834,6 +904,31 @@ void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg)
>      pcs_rpc_kick_queue(ep);
>  }
>
> +static void rpc_cpu_timer_work(struct work_struct *w)
> +{
> +    struct pcs_rpc *ep = container_of(w, struct pcs_rpc, cpu_timer_work.work);
> +    struct pcs_rpc_cpu *prc;
> +
> +    if (unlikely(ep->cpu == WORK_CPU_UNBOUND))
> +        return;
> +
> +    spin_lock(&ep->q_lock);
> +    if ((ep->state == PCS_RPC_WORK) &&
> +        time_is_after_jiffies(ep->cpu_stamp + rpc_cpu_timeout)) {
> +        unsigned long timeout;
> +
> +        spin_unlock(&ep->q_lock);
> +        timeout = rpc_cpu_timeout - (jiffies - ep->cpu_stamp);
> +        mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, timeout);
> +        return;
> +    }
> +
> +    prc = per_cpu_ptr(&rpc_cpu, ep->cpu);
> +    ep->cpu = WORK_CPU_UNBOUND;
> +    atomic_dec(&prc->nr_attached);
> +    spin_unlock(&ep->q_lock);
> +}
> +
>  static void calendar_work(struct work_struct *w)
>  {
>      struct pcs_rpc * ep = container_of(w, struct pcs_rpc, calendar_work.work);
> @@ -1022,6 +1117,7 @@ void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm,
>      INIT_WORK(&ep->close_work, rpc_close_work);
>      INIT_DELAYED_WORK(&ep->timer_work, timer_work);
>      INIT_DELAYED_WORK(&ep->calendar_work, calendar_work);
> +    INIT_DELAYED_WORK(&ep->cpu_timer_work, rpc_cpu_timer_work);
>
>      for (i = 0; i < RPC_MAX_CALENDAR; i++)
>          INIT_HLIST_HEAD(&ep->kill_calendar[i]);
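One thing I wanted to convince myself of while reading pcs_rpc_cpu_select() is
that nr_attached cannot be pushed below zero when an old binding has already
been released by rpc_cpu_timer_work(). atomic_dec_if_positive() covers that
case; here is a userspace analog of its semantics, purely as an illustration
(dec_if_positive() below is my own toy helper, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

/* Userspace analog of the kernel's atomic_dec_if_positive(): only store
 * the decrement when the result stays >= 0, and return old - 1 whether
 * or not the store happened. Illustration only.
 */
static int dec_if_positive(atomic_int *v)
{
        int c = atomic_load(v);
        int dec;

        do {
                dec = c - 1;
                if (dec < 0)
                        break;
        } while (!atomic_compare_exchange_weak(v, &c, dec));

        return dec;
}

int main(void)
{
        atomic_int nr_attached = 1;

        printf("%d\n", dec_if_positive(&nr_attached)); /* 0: binding dropped */
        printf("%d\n", dec_if_positive(&nr_attached)); /* -1: nothing to drop */
        printf("%d\n", atomic_load(&nr_attached));     /* still 0, no underflow */
        return 0;
}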
> diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h
> index baec7f844e38..cb18557a3da5 100644
> --- a/fs/fuse/kio/pcs/pcs_rpc.h
> +++ b/fs/fuse/kio/pcs/pcs_rpc.h
> @@ -40,6 +40,7 @@ enum {
>      RPC_AFFINITY_RETENT = 1,
>      RPC_AFFINITY_SPREAD = 2,
>      RPC_AFFINITY_RSS = 3,
> +    RPC_AFFINITY_FAIR_SPREAD = 4,
>  };
>
>  extern unsigned int rpc_affinity_mode;
> @@ -78,6 +79,7 @@ typedef union __pre_aligned(8) _PCS_CLUSTER_ID_T {
>  /////////////////////////////
>
>  #define PCS_RPC_CPU_SLICE   (100 * HZ / 1000) /* 100ms */
> +#define PCS_RPC_CPU_TIMEOUT (500 * HZ / 1000) /* 500ms */
>  struct pcs_rpc
>  {
>      struct hlist_node link;             /* Link in hash table */
> @@ -139,6 +141,7 @@ struct pcs_rpc
>      struct list_head input_queue;       /* Queue of requests waiting to be handled */
>      int cpu;
>      unsigned long cpu_stamp;
> +    struct delayed_work cpu_timer_work; /* Reset cpu affinity after being idle */
>
>      struct mutex mutex;
>      u64 accounted;
> @@ -160,6 +163,10 @@ struct pcs_rpc
>      struct work_struct close_work;
>  };
>
> +struct pcs_rpc_cpu {
> +    atomic_t nr_attached;
> +};
> +
>  struct pcs_rpc_engine
>  {
>      spinlock_t lock;
> --
> 2.39.5 (Apple Git-154)
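One last note on the header changes: both defines are expressed in jiffies, so
their resolution follows CONFIG_HZ. A throwaway sketch with a few common HZ
values, just for illustration:

#include <stdio.h>

/* How the new define scales with the kernel tick rate:
 *     PCS_RPC_CPU_TIMEOUT = 500 * HZ / 1000   (i.e. 500ms in jiffies)
 * The HZ values below are just the usual configs, for illustration.
 */
int main(void)
{
        const unsigned int hz_values[] = { 100, 250, 300, 1000 };

        for (unsigned int i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++) {
                unsigned int hz = hz_values[i];

                printf("HZ=%4u: PCS_RPC_CPU_SLICE=%3u jiffies, PCS_RPC_CPU_TIMEOUT=%3u jiffies\n",
                       hz, 100 * hz / 1000, 500 * hz / 1000);
        }
        return 0;
}

Nothing to change there, the resulting values look sane across the usual configs.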