From: Long Li <lon...@microsoft.com>

On a large system with multiple devices of the same class (e.g. NVMe disks,
using managed IRQs), the kernel tends to concentrate their IRQs on several
CPUs.

The issue is that when NVMe calls irq_matrix_alloc_managed(), the assigned
CPU tends to be one of the first several CPUs in the cpumask, because the
selection checks cpumap->available, which does not change after managed IRQs
are reserved.

A managed IRQ tends to reserve more than one CPU, based on the cpumask passed
to irq_matrix_reserve_managed(). But later, when a CPU is actually allocated
for this IRQ, only one CPU is allocated. Because "available" is calculated at
the time the managed IRQ is reserved, it tends to indicate that a CPU has more
IRQs than the actual number assigned to it.

To get a more even distribution when allocating managed IRQs, we need to keep
track of how many of them are allocated on a given CPU. Introduce
"managed_allocated" in struct cpumap to track the managed IRQs that are
allocated on this CPU, and change the code to use this information when
deciding which CPU to allocate for managed IRQs.

Signed-off-by: Long Li <lon...@microsoft.com>
---
 kernel/irq/matrix.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 6e6d467f3dec..92337703ca9f 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -14,6 +14,7 @@ struct cpumap {
        unsigned int            available;
        unsigned int            allocated;
        unsigned int            managed;
+       unsigned int            managed_allocated;
        bool                    initialized;
        bool                    online;
        unsigned long           alloc_map[IRQ_MATRIX_SIZE];
@@ -145,6 +146,27 @@ static unsigned int matrix_find_best_cpu(struct irq_matrix *m,
        return best_cpu;
 }
 
+/* Find the best CPU which has the lowest number of managed IRQs allocated */
+static unsigned int matrix_find_best_cpu_managed(struct irq_matrix *m,
+                                               const struct cpumask *msk)
+{
+       unsigned int cpu, best_cpu, allocated = UINT_MAX;
+       struct cpumap *cm;
+
+       best_cpu = UINT_MAX;
+
+       for_each_cpu(cpu, msk) {
+               cm = per_cpu_ptr(m->maps, cpu);
+
+               if (!cm->online || cm->managed_allocated > allocated)
+                       continue;
+
+               best_cpu = cpu;
+               allocated = cm->managed_allocated;
+       }
+       return best_cpu;
+}
+
 /**
  * irq_matrix_assign_system - Assign system wide entry in the matrix
  * @m:         Matrix pointer
@@ -269,7 +291,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
        if (cpumask_empty(msk))
                return -EINVAL;
 
-       cpu = matrix_find_best_cpu(m, msk);
+       cpu = matrix_find_best_cpu_managed(m, msk);
        if (cpu == UINT_MAX)
                return -ENOSPC;
 
@@ -282,6 +304,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, const struct cpumask *msk,
                return -ENOSPC;
        set_bit(bit, cm->alloc_map);
        cm->allocated++;
+       cm->managed_allocated++;
        m->total_allocated++;
        *mapped_cpu = cpu;
        trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
@@ -395,6 +418,8 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
 
        clear_bit(bit, cm->alloc_map);
        cm->allocated--;
+       if(managed)
+               cm->managed_allocated--;
 
        if (cm->online)
                m->total_allocated--;
@@ -464,13 +489,14 @@ void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
        seq_printf(sf, "Total allocated:  %6u\n", m->total_allocated);
        seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
                   m->system_map);
-       seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+       seq_printf(sf, "%*s| CPU | avl | man | mac | act | vectors\n", ind, " ");
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
 
-               seq_printf(sf, "%*s %4d  %4u  %4u  %4u  %*pbl\n", ind, " ",
-                          cpu, cm->available, cm->managed, cm->allocated,
+               seq_printf(sf, "%*s %4d  %4u  %4u  %4u %4u  %*pbl\n", ind, " ",
+                          cpu, cm->available, cm->managed,
+                          cm->managed_allocated, cm->allocated,
                           m->matrix_bits, cm->alloc_map);
        }
        cpus_read_unlock();
-- 
2.14.1

Reply via email to