On 8/1/2018 8:12 AM, Sagi Grimberg wrote:
Hi Max,

Hi,


Yes, since nvmf is the only user of this function.
Still waiting for comments on the suggested patch :)


Sorry for the late response (but I'm on vacation so I have
an excuse ;))

NP :) currently the code works..


I'm thinking that we should avoid trying to find an assignment
when stuff like irqbalance daemon is running and changing
the affinitization.

but this is exactly what Steve complained about and what Leon tried to fix (and which broke connection establishment). If this is the case and we all agree, then we're good without Leon's patch and without our suggestions.


This extension was made to apply optimal affinity assignment
when the device irq affinity is lined up in a vector per
core.

I'm thinking that when we identify this is not the case, we immediately
fallback to the default mapping.

1. when we get a mask, if its weight != 1, we fallback.
2. if a queue was left unmapped, we fallback.

Maybe something like the following:

Did you test it? I think it will not work, since you need to map all the queues and all the CPUs.

--
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index 996167f1de18..1ada6211c55e 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -35,17 +35,26 @@ int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set,
         const struct cpumask *mask;
         unsigned int queue, cpu;

+       /* reset all CPUs mapping */
+       for_each_possible_cpu(cpu)
+               set->mq_map[cpu] = UINT_MAX;
+
         for (queue = 0; queue < set->nr_hw_queues; queue++) {
                 mask = ib_get_vector_affinity(dev, first_vec + queue);
                 if (!mask)
                         goto fallback;

-               for_each_cpu(cpu, mask)
-                       set->mq_map[cpu] = queue;
+               if (cpumask_weight(mask) != 1)
+                       goto fallback;
+
+               cpu = cpumask_first(mask);
+               if (set->mq_map[cpu] != UINT_MAX)
+                       goto fallback;
+
+               set->mq_map[cpu] = queue;
         }

         return 0;
-
  fallback:
         return blk_mq_map_queues(set);
  }
--

See attached another algorithm that can improve the mapping (although it's not a short one)...

It will try to map according to the affinity mask, and also, in case the mask weight > 1, it will try to do better than the naive mapping I suggested in the previous email.

From 007d773af7b65a1f1ca543f031ca58b3afa5b7d9 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <m...@mellanox.com>
Date: Thu, 19 Jul 2018 12:42:00 +0000
Subject: [PATCH 1/1] blk-mq: fix RDMA queue/cpu mappings assignments for mq

Signed-off-by: Max Gurtovoy <m...@mellanox.com>
Signed-off-by: Israel Rukshin <isra...@mellanox.com>
---
 block/blk-mq-cpumap.c  | 41 ++++++++++++++----------
 block/blk-mq-rdma.c    | 84 ++++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/blk-mq.h |  1 +
 3 files changed, 103 insertions(+), 23 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 3eb169f..02b888f 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -30,29 +30,36 @@ static int get_first_sibling(unsigned int cpu)
        return cpu;
 }
 
-int blk_mq_map_queues(struct blk_mq_tag_set *set)
+void blk_mq_map_queue_to_cpu(struct blk_mq_tag_set *set, unsigned int cpu)
 {
        unsigned int *map = set->mq_map;
        unsigned int nr_queues = set->nr_hw_queues;
-       unsigned int cpu, first_sibling;
+       unsigned int first_sibling;
 
-       for_each_possible_cpu(cpu) {
-               /*
-                * First do sequential mapping between CPUs and queues.
-                * In case we still have CPUs to map, and we have some number of
-                * threads per cores then map sibling threads to the same queue 
for
-                * performace optimizations.
-                */
-               if (cpu < nr_queues) {
+       /*
+        * First do sequential mapping between CPUs and queues.
+        * In case we still have CPUs to map, and we have some number of
+        * threads per cores then map sibling threads to the same queue for
+        * performace optimizations.
+        */
+       if (cpu < nr_queues) {
+               map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+       } else {
+               first_sibling = get_first_sibling(cpu);
+               if (first_sibling == cpu)
                        map[cpu] = cpu_to_queue_index(nr_queues, cpu);
-               } else {
-                       first_sibling = get_first_sibling(cpu);
-                       if (first_sibling == cpu)
-                               map[cpu] = cpu_to_queue_index(nr_queues, cpu);
-                       else
-                               map[cpu] = map[first_sibling];
-               }
+               else
+                       map[cpu] = map[first_sibling];
        }
+}
+EXPORT_SYMBOL_GPL(blk_mq_map_queue_to_cpu);
+
+int blk_mq_map_queues(struct blk_mq_tag_set *set)
+{
+       unsigned int cpu;
+
+       for_each_possible_cpu(cpu)
+               blk_mq_map_queue_to_cpu(set, cpu);
 
        return 0;
 }
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index 996167f..621d5f0 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -14,6 +14,33 @@
 #include <linux/blk-mq-rdma.h>
 #include <rdma/ib_verbs.h>
 
+static int blk_mq_rdma_map_queues_by_affinity(struct blk_mq_tag_set *set,
+               struct ib_device *dev, int first_vec, int max_mapping)
+{
+       const struct cpumask *mask;
+       unsigned int queue, cpu;
+       int num_of_mapping = 0;
+
+       for (queue = 0; queue < set->nr_hw_queues; queue++) {
+               if (num_of_mapping == max_mapping)
+                       return num_of_mapping;
+
+               mask = ib_get_vector_affinity(dev, first_vec + queue);
+               if (!mask)
+                       return -1;
+
+               for_each_cpu(cpu, mask) {
+                       if (set->mq_map[cpu] == UINT_MAX) {
+                               set->mq_map[cpu] = queue;
+                               num_of_mapping++;
+                               /* Each queue mapped to 1 cpu */
+                               break;
+                       }
+               }
+       }
+       return num_of_mapping;
+}
+
 /**
  * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device
  * @set:       tagset to provide the mapping for
@@ -32,18 +59,63 @@
 int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set,
                struct ib_device *dev, int first_vec)
 {
-       const struct cpumask *mask;
        unsigned int queue, cpu;
+       bool mapped;
+       int remaining_cpus = num_possible_cpus();
+       int unmapped_queues = set->nr_hw_queues;
+       int ret;
 
-       for (queue = 0; queue < set->nr_hw_queues; queue++) {
-               mask = ib_get_vector_affinity(dev, first_vec + queue);
-               if (!mask)
+       /* reset all CPUs mapping */
+       for_each_possible_cpu(cpu)
+               set->mq_map[cpu] = UINT_MAX;
+
+       /* Try to map the queues according to affinity */
+       ret = blk_mq_rdma_map_queues_by_affinity(set, dev, first_vec,
+                                                remaining_cpus);
+       if (ret == -1)
+               goto fallback;
+       remaining_cpus -= ret;
+       unmapped_queues -= ret;
+
+       /* Map queues with more than one cpu according to affinity */
+       while (remaining_cpus > unmapped_queues) {
+               ret = blk_mq_rdma_map_queues_by_affinity(set, dev, first_vec,
+                                       remaining_cpus - unmapped_queues);
+               if (ret == -1)
                        goto fallback;
+               if (!ret)
+                       break;
+               remaining_cpus -= ret;
+       }
 
-               for_each_cpu(cpu, mask)
-                       set->mq_map[cpu] = queue;
+       /* Map the unmapped queues in a naive way */
+       for (queue = 0; queue < set->nr_hw_queues; queue++) {
+               mapped = false;
+               for_each_possible_cpu(cpu) {
+                       if (set->mq_map[cpu] == queue) {
+                               mapped = true;
+                               break;
+                       }
+               }
+               if (!mapped) {
+                       for_each_possible_cpu(cpu) {
+                               if (set->mq_map[cpu] == UINT_MAX) {
+                                       set->mq_map[cpu] = queue;
+                                       mapped = true;
+                                       break;
+                               }
+                       }
+               }
+               /* This case should never happen */
+               if (WARN_ON_ONCE(!mapped))
+                       goto fallback;
        }
 
+       /* set all the rest of the CPUs */
+       for_each_possible_cpu(cpu)
+               if (set->mq_map[cpu] == UINT_MAX)
+                       blk_mq_map_queue_to_cpu(set, cpu);
+
        return 0;
 
 fallback:
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e3147eb..d6cd114 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -282,6 +282,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout);
 
+void blk_mq_map_queue_to_cpu(struct blk_mq_tag_set *set, unsigned int cpu);
 int blk_mq_map_queues(struct blk_mq_tag_set *set);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
-- 
1.8.3.1

Reply via email to