On Thu, Sep 01, 2016 at 10:24:10AM -0400, Keith Busch wrote:
> Yeah, I gathered that's what it was providing, but that's just barely
> not enough information to do something useful. The CPUs that aren't set
> have to use a previously assigned vector/queue, but which one?

Unless I'm totally missing how to infer paired CPUs, I think we need
arrays.

Here's a stab at that. I'm using the "old" algorithm the NVMe driver used
to pair vectors and cpus. It's not the most efficient way of pairing
that I know of, but it is easy to follow (relatively speaking), and it
actually utilizes every hardware resource available so I get very good
CPU <-> Queue mappings.

---
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9cc08c6..c5c038e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2283,7 +2283,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
                const struct cpumask *affinity_mask)
 {
-       int queue = -1, cpu = 0;
+       int queue;
 
        set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
                        GFP_KERNEL, set->numa_node);
@@ -2293,11 +2293,10 @@ static int blk_mq_create_mq_map(struct blk_mq_tag_set *set,
        if (!affinity_mask)
                return 0;       /* map all cpus to queue 0 */
 
-       /* If cpus are offline, map them to first hctx */
-       for_each_online_cpu(cpu) {
-               if (cpumask_test_cpu(cpu, affinity_mask))
-                       queue++;
-               if (queue >= 0)
+       for (queue = 0; queue < set->nr_hw_queues; queue++) {
+               int cpu;
+
+               for_each_cpu(cpu, &affinity_mask[queue])
                        set->mq_map[cpu] = queue;
        }
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 98f1222..03a1ffc 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -683,15 +683,11 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 {
        const struct cpumask *mask = NULL;
        struct msi_desc *entry;
-       int cpu = -1, i;
+       int i;
 
        for (i = 0; i < nvec; i++) {
-               if (dev->irq_affinity) {
-                       cpu = cpumask_next(cpu, dev->irq_affinity);
-                       if (cpu >= nr_cpu_ids)
-                               cpu = cpumask_first(dev->irq_affinity);
-                       mask = cpumask_of(cpu);
-               }
+               if (dev->irq_affinity)
+                       mask = &dev->irq_affinity[i];
 
                entry = alloc_msi_entry(&dev->dev);
                if (!entry) {
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 32f6cfc..9fe548b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,14 +4,47 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 
-static int get_first_sibling(unsigned int cpu)
+static int find_closest_node(int node)
 {
-       unsigned int ret;
+       int n, val, min_val = INT_MAX, best_node = node;
+
+       for_each_online_node(n) {
+               if (n == node)
+                       continue;
+               val = node_distance(node, n);
+               if (val < min_val) {
+                       min_val = val;
+                       best_node = n;
+               }
+       }
+       return best_node;
+}
+
+static void set_vec_cpus(const cpumask_t *qmask, struct cpumask *affinity_mask,
+                                                               int count)
+{
+       int cpu;
+
+       for_each_cpu(cpu, qmask) {
+               if (cpumask_weight(affinity_mask) >= count)
+                       break;
+               cpumask_set_cpu(cpu, affinity_mask);
+       }
+}
+
+static void add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus,
+       const cpumask_t *new_mask, struct cpumask *affinity_mask,
+       int cpus_per_queue)
+{
+       int next_cpu;
+
+       for_each_cpu(next_cpu, new_mask) {
+               cpumask_or(mask, mask, get_cpu_mask(next_cpu));
+               cpumask_or(mask, mask, topology_sibling_cpumask(next_cpu));
+               cpumask_and(mask, mask, unassigned_cpus);
+       }
+       set_vec_cpus(mask, affinity_mask, cpus_per_queue);
 
-       ret = cpumask_first(topology_sibling_cpumask(cpu));
-       if (ret < nr_cpu_ids)
-               return ret;
-       return cpu;
 }
 
 /*
@@ -27,37 +60,76 @@ static int get_first_sibling(unsigned int cpu)
  */
 struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 {
-       struct cpumask *affinity_mask;
-       unsigned int max_vecs = *nr_vecs;
+       struct cpumask *affinity_mask, *masks;
+       unsigned int max_vecs = *nr_vecs, cpu, cpus_per_vec, remainder, i;
+       cpumask_var_t unassigned_cpus;
 
        if (max_vecs == 1)
                return NULL;
 
-       affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
-       if (!affinity_mask) {
+       masks = kcalloc(max_vecs, sizeof(*affinity_mask), GFP_KERNEL);
+       if (!masks) {
                *nr_vecs = 1;
                return NULL;
        }
 
        get_online_cpus();
-       if (max_vecs >= num_online_cpus()) {
-               cpumask_copy(affinity_mask, cpu_online_mask);
-               *nr_vecs = num_online_cpus();
-       } else {
-               unsigned int vecs = 0, cpu;
-
-               for_each_online_cpu(cpu) {
-                       if (cpu == get_first_sibling(cpu)) {
-                               cpumask_set_cpu(cpu, affinity_mask);
-                               vecs++;
-                       }
-
-                       if (--max_vecs == 0)
-                               break;
-               }
-               *nr_vecs = vecs;
+
+       cpus_per_vec = num_online_cpus() / max_vecs;
+       remainder = max_vecs - (num_online_cpus() - max_vecs * cpus_per_vec);
+
+       cpumask_copy(unassigned_cpus, cpu_online_mask);
+       cpu = cpumask_first(unassigned_cpus);
+
+       for (i = 0; i < max_vecs; i++) {
+               cpumask_t mask;
+
+               if (!cpumask_weight(unassigned_cpus))
+                       break;
+
+               affinity_mask = &masks[i];
+
+               mask = *get_cpu_mask(cpu);
+               set_vec_cpus(&mask, affinity_mask, cpus_per_vec);
+
+               if (cpumask_weight(&mask) < cpus_per_vec)
+                       add_cpus(&mask, unassigned_cpus,
+                               topology_sibling_cpumask(cpu),
+                               affinity_mask, cpus_per_vec);
+               if (cpumask_weight(&mask) < cpus_per_vec)
+                       add_cpus(&mask, unassigned_cpus,
+                               topology_core_cpumask(cpu),
+                               affinity_mask, cpus_per_vec);
+               if (cpumask_weight(&mask) < cpus_per_vec)
+                       add_cpus(&mask, unassigned_cpus,
+                               cpumask_of_node(cpu_to_node(cpu)),
+                               affinity_mask, cpus_per_vec);
+               if (cpumask_weight(&mask) < cpus_per_vec)
+                       add_cpus(&mask, unassigned_cpus,
+                               cpumask_of_node(
+                                       find_closest_node(
+                                               cpu_to_node(cpu))),
+                               affinity_mask, cpus_per_vec);
+               if (cpumask_weight(&mask) < cpus_per_vec)
+                       add_cpus(&mask, unassigned_cpus,
+                               unassigned_cpus, affinity_mask,
+                               cpus_per_vec);
+
+               cpumask_andnot(unassigned_cpus, unassigned_cpus, affinity_mask);
+               cpu = cpumask_next(cpu, unassigned_cpus);
+
+               if (remainder && !--remainder)
+                       cpus_per_vec++;
        }
        put_online_cpus();
 
-       return affinity_mask;
+       i = 0;
+       cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask);
+       for_each_cpu(cpu, unassigned_cpus) {
+               set_vec_cpus(get_cpu_mask(cpu), &masks[i], ~0);
+               i = (i + 1) % max_vecs;
+       }
+       free_cpumask_var(unassigned_cpus);
+
+       return masks;
 }
--
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to