Add code for path selection.

NVMe ANA is abstracted into enum mpath_access_state. The motivation is to
allow other access-state models, such as SCSI ALUA, to be mapped onto it.
Callbacks .is_disabled, .is_optimized, and .get_access_state are added to
query the path access state, and .get_iopolicy to query the iopolicy in use.

Path selection modes round-robin, NUMA, and queue-depth are added, same
as NVMe supports.

NVMe has almost like-for-like equivalents here:
- __mpath_find_path() -> __nvme_find_path()
- mpath_find_path() -> nvme_find_path()

and similarly for all of the helper functions they call.

Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for
setting and reading back the default iopolicy.

A separate mpath_iopolicy structure is introduced. There is no iopolicy
member included in the mpath_head structure as it may not suit NVMe, where
iopolicy is per-subsystem and not per namespace.

Signed-off-by: John Garry <[email protected]>
---
 include/linux/multipath.h |  36 ++++++
 lib/multipath.c           | 251 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 287 insertions(+)

diff --git a/include/linux/multipath.h b/include/linux/multipath.h
index be9dd9fb83345..c964a1aba9c42 100644
--- a/include/linux/multipath.h
+++ b/include/linux/multipath.h
@@ -7,6 +7,22 @@
 
 extern const struct block_device_operations mpath_ops;
 
+/*
+ * I/O path selection policy, as returned by the .get_iopolicy callback and
+ * dispatched on in mpath_find_path() (lib/multipath.c).
+ */
+enum mpath_iopolicy_e {
+       /* "numa": prefer the path with the smallest node_distance() */
+       MPATH_IOPOLICY_NUMA,
+       /* "round-robin": rotate through the usable paths */
+       MPATH_IOPOLICY_RR,
+       /* "queue-depth": prefer the path with the lowest nr_active */
+       MPATH_IOPOLICY_QD,
+};
+
+/*
+ * Container for an I/O policy value; read it with mpath_read_iopolicy().
+ * It is kept separate from struct mpath_head so that the owner can decide
+ * its scope (per the introducing commit, NVMe keeps iopolicy per subsystem).
+ */
+struct mpath_iopolicy {
+       enum mpath_iopolicy_e   iopolicy;
+};
+
+/*
+ * Transport-independent access state of a path, as reported by the
+ * .get_access_state callback. The path selectors prefer OPTIMIZED paths and
+ * fall back to ACTIVE ones; a path reporting any other value is not selected.
+ */
+enum mpath_access_state {
+       MPATH_STATE_OPTIMIZED,
+       MPATH_STATE_ACTIVE,
+       MPATH_STATE_INVALID     = 0xFF
+};
+
 struct mpath_disk {
        struct gendisk          *disk;
        struct kref             ref;
@@ -18,10 +34,16 @@ struct mpath_disk {
 
 struct mpath_device {
        struct list_head        siblings;
+       /*
+        * Read as this path's depth by the queue-depth selector.
+        * NOTE(review): presumably the number of in-flight requests,
+        * maintained by the driver - confirm.
+        */
+       atomic_t                nr_active;
        struct gendisk          *disk;
+       /*
+        * Node passed to node_distance() under the NUMA iopolicy;
+        * NUMA_NO_NODE makes the path count as LOCAL_DISTANCE.
+        */
+       int                     numa_node;
 };
 
 struct mpath_head_template {
+       /* Return true if the path must be skipped by path selection. */
+       bool (*is_disabled)(struct mpath_device *);
+       /*
+        * Return true if the path is optimized; checked on the cached
+        * current path by the NUMA selector and in mpath_device_set_live().
+        */
+       bool (*is_optimized)(struct mpath_device *);
+       /* Return the path's access state (see enum mpath_access_state). */
+       enum mpath_access_state (*get_access_state)(struct mpath_device *);
+       /* Return the iopolicy to use for this head. */
+       enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *);
        const struct attribute_group **device_groups;
 };
 
@@ -50,6 +72,14 @@ static inline struct mpath_disk *mpath_gendisk_to_disk(struct gendisk *disk)
        return mpath_bd_device_to_disk(disk_to_dev(disk));
 }
 
+/* Take a single READ_ONCE() snapshot of the policy held in @mpath_iopolicy. */
+static inline enum mpath_iopolicy_e
+mpath_read_iopolicy(struct mpath_iopolicy *mpath_iopolicy)
+{
+       enum mpath_iopolicy_e policy = READ_ONCE(mpath_iopolicy->iopolicy);
+
+       return policy;
+}
+/* Wait for an SRCU grace period on @mpath_head->srcu. */
+void mpath_synchronize(struct mpath_head *mpath_head);
+/* Parse an iopolicy name in @val into *@iopolicy; returns 0 or -EINVAL. */
+int mpath_set_iopolicy(const char *val, int *iopolicy);
+/* Format the name of @iopolicy, newline-terminated, into @buf. */
+int mpath_get_iopolicy(char *buf, int iopolicy);
 int mpath_get_head(struct mpath_head *mpath_head);
 void mpath_put_head(struct mpath_head *mpath_head);
 struct mpath_head *mpath_alloc_head(void);
@@ -66,4 +96,10 @@ static inline bool is_mpath_head(struct gendisk *disk)
 {
        return disk->fops == &mpath_ops;
 }
+
+/* Report whether @mpath_iopolicy currently holds the queue-depth policy. */
+static inline bool mpath_qd_iopolicy(struct mpath_iopolicy *mpath_iopolicy)
+{
+       enum mpath_iopolicy_e policy = mpath_read_iopolicy(mpath_iopolicy);
+
+       return policy == MPATH_IOPOLICY_QD;
+}
+
 #endif // _LIBMULTIPATH_H
diff --git a/lib/multipath.c b/lib/multipath.c
index 88efb0ae16acb..65a0d2d2bf524 100644
--- a/lib/multipath.c
+++ b/lib/multipath.c
@@ -6,8 +6,243 @@
 #include <linux/module.h>
 #include <linux/multipath.h>
 
+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head);
+
 static struct workqueue_struct *mpath_wq;
 
+/*
+ * User-visible name of each I/O policy, indexed by enum mpath_iopolicy_e.
+ * Both the strings and the pointer table are read-only.
+ */
+static const char * const mpath_iopolicy_names[] = {
+       [MPATH_IOPOLICY_NUMA]   = "numa",
+       [MPATH_IOPOLICY_RR]     = "round-robin",
+       [MPATH_IOPOLICY_QD]     = "queue-depth",
+};
+
+/*
+ * Translate an I/O policy name into its MPATH_IOPOLICY_* value.
+ *
+ * @val:      policy name; only the leading characters are compared, so any
+ *            trailing text (such as a newline) after a known name is ignored
+ * @iopolicy: receives the parsed policy; left untouched on failure
+ *
+ * Returns 0 on success, or -EINVAL if @val is NULL or names no known policy.
+ */
+int mpath_set_iopolicy(const char *val, int *iopolicy)
+{
+       int policy;
+
+       if (val == NULL)
+               return -EINVAL;
+
+       if (strncmp(val, "numa", 4) == 0)
+               policy = MPATH_IOPOLICY_NUMA;
+       else if (strncmp(val, "round-robin", 11) == 0)
+               policy = MPATH_IOPOLICY_RR;
+       else if (strncmp(val, "queue-depth", 11) == 0)
+               policy = MPATH_IOPOLICY_QD;
+       else
+               return -EINVAL;
+
+       *iopolicy = policy;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mpath_set_iopolicy);
+
+/*
+ * Format the name of @iopolicy, followed by a newline, into @buf.
+ *
+ * @buf:      destination buffer; the caller must make it large enough
+ * @iopolicy: one of the MPATH_IOPOLICY_* values
+ *
+ * Returns the number of characters written, or -EINVAL if @iopolicy is not
+ * a known policy (instead of indexing outside mpath_iopolicy_names[]).
+ */
+int mpath_get_iopolicy(char *buf, int iopolicy)
+{
+       /* Bounds follow the first and last entries of enum mpath_iopolicy_e. */
+       if (iopolicy < MPATH_IOPOLICY_NUMA || iopolicy > MPATH_IOPOLICY_QD)
+               return -EINVAL;
+
+       return sprintf(buf, "%s\n", mpath_iopolicy_names[iopolicy]);
+}
+EXPORT_SYMBOL_GPL(mpath_get_iopolicy);
+
+
+/*
+ * Wait for an SRCU grace period on @mpath_head->srcu, so that path-list
+ * readers which started before this call have finished when it returns.
+ */
+void mpath_synchronize(struct mpath_head *mpath_head)
+{
+       synchronize_srcu(&mpath_head->srcu);
+}
+EXPORT_SYMBOL_GPL(mpath_synchronize);
+
+/*
+ * Ask the driver, through the head template's .is_disabled callback,
+ * whether @mpath_device must be skipped by path selection.
+ */
+static bool mpath_path_is_disabled(struct mpath_head *mpath_head,
+                               struct mpath_device *mpath_device)
+{
+       return mpath_head->mpdt->is_disabled(mpath_device);
+}
+
+/*
+ * Scan every path of @mpath_head and pick the best one for @node.
+ *
+ * Disabled paths are skipped. Of the remaining paths the closest OPTIMIZED
+ * one wins; if there is none, the closest ACTIVE one is used. The distance
+ * is node_distance() from @node only when @iopolicy is MPATH_IOPOLICY_NUMA
+ * and the path has a NUMA node; otherwise every path counts as
+ * LOCAL_DISTANCE, so the first usable path in list order is chosen.
+ *
+ * A selected path is published in mpath_head->current_path[@node].
+ * The list is walked as an SRCU reader of mpath_head->srcu.
+ *
+ * Returns the selected path, or NULL if no path is usable.
+ */
+static struct mpath_device *__mpath_find_path(struct mpath_head *mpath_head,
+                       enum mpath_iopolicy_e iopolicy, int node)
+{
+       int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+       /* Both are tested after the loop, so they must start out as NULL. */
+       struct mpath_device *mpath_dev_found = NULL;
+       struct mpath_device *mpath_dev_fallback = NULL;
+       struct mpath_device *mpath_device;
+
+       list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+               srcu_read_lock_held(&mpath_head->srcu)) {
+               if (mpath_path_is_disabled(mpath_head, mpath_device))
+                       continue;
+
+               if (mpath_device->numa_node != NUMA_NO_NODE &&
+                   (iopolicy == MPATH_IOPOLICY_NUMA))
+                       distance = node_distance(node, mpath_device->numa_node);
+               else
+                       distance = LOCAL_DISTANCE;
+
+               switch (mpath_head->mpdt->get_access_state(mpath_device)) {
+               case MPATH_STATE_OPTIMIZED:
+                       if (distance < found_distance) {
+                               found_distance = distance;
+                               mpath_dev_found = mpath_device;
+                       }
+                       break;
+               case MPATH_STATE_ACTIVE:
+                       if (distance < fallback_distance) {
+                               fallback_distance = distance;
+                               mpath_dev_fallback = mpath_device;
+                       }
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       /* No optimized path: settle for the best active one, if any. */
+       if (!mpath_dev_found)
+               mpath_dev_found = mpath_dev_fallback;
+
+       if (mpath_dev_found)
+               rcu_assign_pointer(mpath_head->current_path[node],
+                       mpath_dev_found);
+
+       return mpath_dev_found;
+}
+
+/*
+ * Return the entry that follows @mpath_dev on @mpath_head's device list,
+ * wrapping around to the first entry once the end of the list is reached.
+ * The result is NULL only if the wrap-around lookup finds no first entry.
+ */
+static struct mpath_device *mpath_next_dev(struct mpath_head *mpath_head,
+                       struct mpath_device *mpath_dev)
+{
+       struct mpath_device *next;
+
+       next = list_next_or_null_rcu(&mpath_head->dev_list,
+                       &mpath_dev->siblings, struct mpath_device,
+                       siblings);
+       if (!next)
+               next = list_first_or_null_rcu(&mpath_head->dev_list,
+                               struct mpath_device, siblings);
+
+       return next;
+}
+
+/*
+ * Round-robin path selection for the NUMA node of the calling CPU.
+ *
+ * Starting from the path cached in current_path[node], walk the device list
+ * circularly from the entry after it. The first enabled OPTIMIZED path is
+ * taken at once; otherwise the last enabled ACTIVE path seen is remembered
+ * as a candidate. A newly chosen path is stored in current_path[node].
+ *
+ * Without a cached path, selection is delegated to __mpath_find_path().
+ * Returns NULL when no usable path exists.
+ *
+ * NOTE(review): presumably called with mpath_head->srcu read-locked, as
+ * srcu_dereference() and the RCU list helpers are used - confirm.
+ */
+static struct mpath_device *mpath_round_robin_path(
+                               struct mpath_head *mpath_head,
+                               enum mpath_iopolicy_e iopolicy)
+{
+       struct mpath_device *mpath_device, *found = NULL;
+       int node = numa_node_id();
+       enum mpath_access_state access_state_old;
+       struct mpath_device *old =
+                       srcu_dereference(mpath_head->current_path[node],
+                               &mpath_head->srcu);
+
+       if (unlikely(!old))
+               return __mpath_find_path(mpath_head, iopolicy, node);
+
+       /* Only one path: nothing to rotate to, use it unless disabled. */
+       if (list_is_singular(&mpath_head->dev_list)) {
+               if (mpath_path_is_disabled(mpath_head, old))
+                       return NULL;
+               return old;
+       }
+
+       for (mpath_device = mpath_next_dev(mpath_head, old);
+           mpath_device && mpath_device != old;
+           mpath_device = mpath_next_dev(mpath_head, mpath_device)) {
+               enum mpath_access_state access_state;
+
+               if (mpath_path_is_disabled(mpath_head, mpath_device))
+                       continue;
+               access_state = mpath_head->mpdt->get_access_state(mpath_device);
+               if (access_state == MPATH_STATE_OPTIMIZED) {
+                       found = mpath_device;
+                       goto out;
+               }
+               /* Keep scanning: an optimized path may still follow. */
+               if (access_state == MPATH_STATE_ACTIVE)
+                       found = mpath_device;
+       }
+
+       /*
+        * The loop above skips the current path for round-robin semantics.
+        * Fall back to the current path if either:
+        *  - no other optimized path found and current is optimized,
+        *  - no other usable path found and current is usable.
+        */
+       access_state_old = mpath_head->mpdt->get_access_state(old);
+       if (!mpath_path_is_disabled(mpath_head, old) &&
+           (access_state_old == MPATH_STATE_OPTIMIZED ||
+           (!found && access_state_old == MPATH_STATE_ACTIVE)))
+               return old;
+
+       if (!found)
+               return NULL;
+out:
+       rcu_assign_pointer(mpath_head->current_path[node], found);
+
+       return found;
+}
+
+/*
+ * Queue-depth path selection: return the enabled path with the smallest
+ * nr_active count, preferring OPTIMIZED paths over ACTIVE ones.
+ *
+ * The scan stops early once an OPTIMIZED path with a depth of zero has been
+ * seen, as no later path can beat it. Unlike the other selectors, nothing is
+ * cached in current_path[]. The list is walked as an SRCU reader of
+ * mpath_head->srcu.
+ *
+ * Returns the chosen path, or NULL if no path is usable.
+ */
+static struct mpath_device *mpath_queue_depth_path(
+                               struct mpath_head *mpath_head)
+{
+       struct mpath_device *best_opt = NULL, *mpath_device;
+       struct mpath_device *best_nonopt = NULL;
+       unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+       unsigned int depth;
+
+       list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+                                srcu_read_lock_held(&mpath_head->srcu)) {
+
+               if (mpath_path_is_disabled(mpath_head, mpath_device))
+                       continue;
+
+               depth = atomic_read(&mpath_device->nr_active);
+
+               switch (mpath_head->mpdt->get_access_state(mpath_device)) {
+               case MPATH_STATE_OPTIMIZED:
+                       if (depth < min_depth_opt) {
+                               min_depth_opt = depth;
+                               best_opt = mpath_device;
+                       }
+                       break;
+               case MPATH_STATE_ACTIVE:
+                       if (depth < min_depth_nonopt) {
+                               min_depth_nonopt = depth;
+                               best_nonopt = mpath_device;
+                       }
+                       break;
+               default:
+                       break;
+               }
+
+               /* An idle optimized path cannot be improved upon. */
+               if (min_depth_opt == 0)
+                       return best_opt;
+       }
+
+       return best_opt ? best_opt : best_nonopt;
+}
+
+/*
+ * Ask the driver, through the head template's .is_optimized callback,
+ * whether @mpath_device is currently an optimized path.
+ */
+static inline bool mpath_path_is_optimized(struct mpath_head *mpath_head,
+                                       struct mpath_device *mpath_device)
+{
+       return mpath_head->mpdt->is_optimized(mpath_device);
+}
+
+/*
+ * NUMA path selection: reuse the path cached for the calling CPU's node
+ * while it exists and is still optimized; otherwise run a full search with
+ * __mpath_find_path(), which also refreshes the cache.
+ */
+static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head,
+                                       enum mpath_iopolicy_e iopolicy)
+{
+       int node = numa_node_id();
+       struct mpath_device *cached;
+
+       cached = srcu_dereference(mpath_head->current_path[node],
+                                 &mpath_head->srcu);
+       if (unlikely(!cached) ||
+           unlikely(!mpath_path_is_optimized(mpath_head, cached)))
+               return __mpath_find_path(mpath_head, iopolicy, node);
+
+       return cached;
+}
+
+/*
+ * Select a path for I/O on @mpath_head using the policy reported by the
+ * driver's .get_iopolicy callback. Returns NULL if no path is usable.
+ */
+__maybe_unused
+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
+{
+       enum mpath_iopolicy_e policy;
+
+       policy = mpath_head->mpdt->get_iopolicy(mpath_head);
+       if (policy == MPATH_IOPOLICY_QD)
+               return mpath_queue_depth_path(mpath_head);
+       if (policy == MPATH_IOPOLICY_RR)
+               return mpath_round_robin_path(mpath_head, policy);
+
+       /* MPATH_IOPOLICY_NUMA, and any value not handled above. */
+       return mpath_numa_path(mpath_head, policy);
+}
+
 static void mpath_free_head(struct kref *ref)
 {
        struct mpath_head *mpath_head =
@@ -99,6 +334,7 @@ void mpath_remove_disk(struct mpath_disk *mpath_disk)
        if (test_and_clear_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags)) {
                struct gendisk *disk = mpath_disk->disk;
 
+               mpath_synchronize(mpath_head);
                del_gendisk(disk);
        }
 }
@@ -158,6 +394,21 @@ void mpath_device_set_live(struct mpath_disk *mpath_disk,
                }
                queue_work(mpath_wq, &mpath_disk->partition_scan_work);
        }
+
+       mutex_lock(&mpath_head->lock);
+       if (mpath_path_is_optimized(mpath_head, mpath_device)) {
+               int node, srcu_idx;
+
+               srcu_idx = srcu_read_lock(&mpath_head->srcu);
+               for_each_online_node(node)
+                       __mpath_find_path(mpath_head,
+                               mpath_head->mpdt->get_iopolicy(mpath_head),
+                               node);
+               srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+       }
+       mutex_unlock(&mpath_head->lock);
+
+       mpath_synchronize(mpath_head);
 }
 EXPORT_SYMBOL_GPL(mpath_device_set_live);
 
-- 
2.43.5


Reply via email to