For smp_call_function_many(), whenever a CPU queues a CSD to a target
CPU, it will send an IPI to let the target CPU handle the work.
This isn't necessary - we need only send IPI when queueing a CSD
to an empty call_single_queue.

The reason:
flush_smp_call_function_queue() that is called upon a CPU receiving an
IPI will empty the queue and then handle all of the CSDs there. So if
the target CPU's call_single_queue is not empty, we know that:
i.  An IPI for the target CPU has already been sent by previous queuers*;
ii. flush_smp_call_function_queue() hasn't emptied that CPU's queue yet.
Thus, it's safe for us to just queue our CSD there without sending an
additional IPI.

*For previous queuers, we can limit it to the first queuer.

The workload used to see the effectiveness of this change is
vm-scalability's case-anon-w-seq-mt:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-anon-w-seq-mt

What it does is to spawn 88 threads to do continuous anonymous memory
consumption. The test machine is a 2-node Broadwell EP with 88 threads
and 32G memory. The test size is 100G so a lot of swap out happened
after available memory is used up. Due to:
i.  shrink_page_list() will need to
    try_to_unmap_flush() -> flush_tlb_others();
ii. this process' mm_cpumask() is almost full.
the number of IPI sent during the workload is huge.

Base:
"interrupts.CAL:Function_call_interrupts": 819388170.0,
"vm-scalability.throughput": 5051472.0,
This patch:
"interrupts.CAL:Function_call_interrupts": 92214434.66666667, ↓88.7%
"vm-scalability.throughput": 5991764.333333333 ↑18.6%

Interrupts dropped a lot and performance increased 18.6%.

Signed-off-by: Aaron Lu <[email protected]>
---
 kernel/smp.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..76d16fe3c427 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
 struct call_function_data {
        struct call_single_data __percpu *csd;
        cpumask_var_t           cpumask;
+       cpumask_var_t           cpumask_ipi;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
        if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                return -ENOMEM;
+       if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+                                    cpu_to_node(cpu))) {
+               free_cpumask_var(cfd->cpumask);
+               return -ENOMEM;
+       }
        cfd->csd = alloc_percpu(struct call_single_data);
        if (!cfd->csd) {
                free_cpumask_var(cfd->cpumask);
+               free_cpumask_var(cfd->cpumask_ipi);
                return -ENOMEM;
        }
 
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
        struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
        free_cpumask_var(cfd->cpumask);
+       free_cpumask_var(cfd->cpumask_ipi);
        free_percpu(cfd->csd);
        return 0;
 }
@@ -434,6 +442,7 @@ void smp_call_function_many(const struct cpumask *mask,
        if (unlikely(!cpumask_weight(cfd->cpumask)))
                return;
 
+       cpumask_clear(cfd->cpumask_ipi);
        for_each_cpu(cpu, cfd->cpumask) {
                struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
 
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
                        csd->flags |= CSD_FLAG_SYNCHRONOUS;
                csd->func = func;
                csd->info = info;
-               llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+               if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+                       cpumask_set_cpu(cpu, cfd->cpumask_ipi);
        }
 
        /* Send a message to all CPUs in the map */
-       arch_send_call_function_ipi_mask(cfd->cpumask);
+       arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
 
        if (wait) {
                for_each_cpu(cpu, cfd->cpumask) {
-- 
2.9.3

Reply via email to