Hi Sean,

I took a look at the local-sa, and it turns out that it has been through quite
a big change between the last two OFED releases (diff below). Does this diff
relate to the structural change of the local-sa now being embedded in the
ib-sa module? Is it documented somewhere (e.g. in the change-log of a
patch/patches in a git tree)? Looking at your rdma-dev git tree, I didn't see
any evidence of this change...

Or.

--- OFED-1.2/SOURCES/ofa_kernel-1.2/drivers/infiniband/core/local_sa.c  
2008-06-17 07:43:55.000000000 +0000
+++ 
ofed-1.3.1/OFED-1.3.1/SOURCES/ofa_kernel-1.3.1/drivers/infiniband/core/local_sa.c
   2008-06-17 07:41:13.000000000 +0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2006 Intel Corporation.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -34,113 +34,162 @@
 #include <linux/err.h>
 #include <linux/interrupt.h>
 #include <linux/rbtree.h>
-#include <linux/rwsem.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/pci.h>
+#include <linux/miscdevice.h>
+#include <linux/random.h>

 #include <rdma/ib_cache.h>
-#include <rdma/ib_local_sa.h>
+#include <rdma/ib_sa.h>
+#include "sa.h"

 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("InfiniBand subnet administration caching");
 MODULE_LICENSE("Dual BSD/GPL");

-static int retry_timer = 5000; /* 5 sec */
-module_param(retry_timer, int, 0444);
-MODULE_PARM_DESC(retry_timer, "Time in ms between retried requests.");
-
-static int retries = 3;
-module_param(retries, int, 0444);
-MODULE_PARM_DESC(retries, "Number of times to retry a request.");
-
-static unsigned long cache_timeout = 15 * 60 * 1000; /* 15 min */
-module_param(cache_timeout, ulong, 0444);
-MODULE_PARM_DESC(cache_timeout, "Time in ms between cache updates.  "
-                               "Set to 0 to disable cache.");
-
-static unsigned long hold_time = 30 * 1000; /* 30 sec */
-module_param(hold_time, ulong, 0444);
-MODULE_PARM_DESC(hold_timer, "Minimal time in ms between cache updates.");
-
-static unsigned long update_delay = 3000; /* 3 sec */
-module_param(update_delay, ulong, 0444);
-MODULE_PARM_DESC(update_delay, "Delay in ms between an event and an update.");
-
 enum {
-       IB_MAX_PATHS_PER_DEST = 0x7F
+       SA_DB_MAX_PATHS_PER_DEST = 0x7F,
+       SA_DB_MIN_RETRY_TIMER    = 4000,  /*   4 sec */
+       SA_DB_MAX_RETRY_TIMER    = 256000 /* 256 sec */
 };

-static unsigned long paths_per_dest = IB_MAX_PATHS_PER_DEST;
-module_param(paths_per_dest, ulong, 0444);
+static int set_paths_per_dest(const char *val, struct kernel_param *kp);
+static unsigned long paths_per_dest = 0;
+module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong,
+                 &paths_per_dest, 0644);
 MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve "
                                 "to each destination (DGID).  Set to 0 "
                                 "to disable cache.");

-static void sa_db_add_one(struct ib_device *device);
-static void sa_db_remove_one(struct ib_device *device);
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp);
+static char subscribe_inform_info = 1;
+module_param_call(subscribe_inform_info, set_subscribe_inform_info,
+                 param_get_bool, &subscribe_inform_info, 0644);
+MODULE_PARM_DESC(subscribe_inform_info,
+                "Subscribe for SA InformInfo/Notice events.");
+
+static int do_refresh(const char *val, struct kernel_param *kp);
+module_param_call(refresh, do_refresh, NULL, NULL, 0200);
+
+static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER;
+
+enum sa_db_lookup_method {
+       SA_DB_LOOKUP_LEAST_USED,
+       SA_DB_LOOKUP_RANDOM
+};
+
+static int set_lookup_method(const char *val, struct kernel_param *kp);
+static int get_lookup_method(char *buf, struct kernel_param *kp);
+static unsigned long lookup_method;
+module_param_call(lookup_method, set_lookup_method, get_lookup_method,
+                 &lookup_method, 0644);
+MODULE_PARM_DESC(lookup_method, "Method used to return path records when "
+                               "multiple paths exist to a given destination.");
+
+static void sa_db_add_dev(struct ib_device *device);
+static void sa_db_remove_dev(struct ib_device *device);

 static struct ib_client sa_db_client = {
        .name   = "local_sa",
-       .add    = sa_db_add_one,
-       .remove = sa_db_remove_one
+       .add    = sa_db_add_dev,
+       .remove = sa_db_remove_dev
 };

 static LIST_HEAD(dev_list);
-static DECLARE_RWSEM(lock);
-static unsigned long hold_time, update_delay;
+static DEFINE_MUTEX(lock);
+static rwlock_t rwlock;
 static struct workqueue_struct *sa_wq;
+static struct ib_sa_client sa_client;
+
+enum sa_db_state {
+       SA_DB_IDLE,
+       SA_DB_REFRESH,
+       SA_DB_DESTROY
+};

 struct sa_db_port {
-       struct sa_db_device *dev;
-       struct ib_mad_agent *agent;
-       struct rb_root paths;
-       unsigned long update_time;
-       int update;
-       struct delayed_work work;
-       union ib_gid gid;
-       int port_num;
+       struct sa_db_device     *dev;
+       struct ib_mad_agent     *agent;
+       /* Limit number of outstanding MADs to SA to reduce SA flooding */
+       struct ib_mad_send_buf  *msg;
+       u16                     sm_lid;
+       u8                      sm_sl;
+       struct ib_inform_info   *in_info;
+       struct ib_inform_info   *out_info;
+       struct rb_root          paths;
+       struct list_head        update_list;
+       unsigned long           update_id;
+       enum sa_db_state        state;
+       struct work_struct      work;
+       union ib_gid            gid;
+       int                     port_num;
 };

 struct sa_db_device {
-       struct list_head list;
-       struct ib_device *device;
+       struct list_head        list;
+       struct ib_device        *device;
        struct ib_event_handler event_handler;
-       struct sa_db_port port[0];
+       int                     start_port;
+       int                     port_count;
+       struct sa_db_port       port[0];
 };

 struct ib_sa_iterator {
        struct ib_sa_iterator   *next;
 };

+struct ib_sa_attr_iter {
+       struct ib_sa_iterator   *iter;
+       unsigned long           flags;
+};
+
 struct ib_sa_attr_list {
        struct ib_sa_iterator   iter;
        struct ib_sa_iterator   *tail;
-       int                     update;
+       int                     update_id;
        union ib_gid            gid;
        struct rb_node          node;
 };

 struct ib_path_rec_info {
-       struct ib_sa_iterator   iter;   /* keep first for ib_get_next_sa_attr */
+       struct ib_sa_iterator   iter; /* keep first */
        struct ib_sa_path_rec   rec;
+       unsigned long           lookups;
 };

-struct ib_sa_iter {
-       struct ib_mad_recv_wc *recv_wc;
-       struct ib_mad_recv_buf *recv_buf;
-       int attr_size;
-       int attr_offset;
-       int data_offset;
-       int data_left;
-       void *attr;
-       u8 attr_data[0];
+struct ib_sa_mad_iter {
+       struct ib_mad_recv_wc   *recv_wc;
+       struct ib_mad_recv_buf  *recv_buf;
+       int                     attr_size;
+       int                     attr_offset;
+       int                     data_offset;
+       int                     data_left;
+       void                    *attr;
+       u8                      attr_data[0];
 };

-static void send_handler(struct ib_mad_agent *agent,
-                        struct ib_mad_send_wc *mad_send_wc)
-{
-       ib_destroy_ah(mad_send_wc->send_buf->ah);
-       ib_free_send_mad(mad_send_wc->send_buf);
-}
+enum sa_update_type {
+       SA_UPDATE_FULL,
+       SA_UPDATE_ADD,
+       SA_UPDATE_REMOVE
+};
+
+struct update_info {
+       struct list_head        list;
+       union ib_gid            gid;
+       enum sa_update_type     type;
+};
+
+struct sa_path_request {
+       struct work_struct      work;
+       struct ib_sa_client     *client;
+       void                    (*callback)(int, struct ib_sa_path_rec *, void 
*);
+       void                    *context;
+       struct ib_sa_path_rec   path_rec;
+};
+
+static void process_updates(struct sa_db_port *port);

 static void free_attr_list(struct ib_sa_attr_list *attr_list)
 {
@@ -153,23 +202,44 @@ static void free_attr_list(struct ib_sa_
        attr_list->tail = &attr_list->iter;
 }

+static void remove_attr(struct rb_root *root, struct ib_sa_attr_list 
*attr_list)
+{
+       rb_erase(&attr_list->node, root);
+       free_attr_list(attr_list);
+       kfree(attr_list);
+}
+
 static void remove_all_attrs(struct rb_root *root)
 {
        struct rb_node *node, *next_node;
        struct ib_sa_attr_list *attr_list;

+       write_lock_irq(&rwlock);
        for (node = rb_first(root); node; node = next_node) {
                next_node = rb_next(node);
-               rb_erase(node, root);
                attr_list = rb_entry(node, struct ib_sa_attr_list, node);
-               free_attr_list(attr_list);
-               kfree(attr_list);
+               remove_attr(root, attr_list);
        }
+       write_unlock_irq(&rwlock);
 }

-static struct ib_sa_attr_list * insert_attr_list(struct rb_root *root,
-                                                struct ib_sa_attr_list
-                                                       *attr_list)
+static void remove_old_attrs(struct rb_root *root, unsigned long update_id)
+{
+       struct rb_node *node, *next_node;
+       struct ib_sa_attr_list *attr_list;
+
+       write_lock_irq(&rwlock);
+       for (node = rb_first(root); node; node = next_node) {
+               next_node = rb_next(node);
+               attr_list = rb_entry(node, struct ib_sa_attr_list, node);
+               if (attr_list->update_id != update_id)
+                       remove_attr(root, attr_list);
+       }
+       write_unlock_irq(&rwlock);
+}
+
+static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root,
+                                               struct ib_sa_attr_list 
*attr_list)
 {
        struct rb_node **link = &root->rb_node;
        struct rb_node *parent = NULL;
@@ -193,8 +263,7 @@ static struct ib_sa_attr_list * insert_a
        return NULL;
 }

-static struct ib_sa_attr_list * find_attr_list(struct rb_root *root,
-                                              u8 *gid)
+static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid)
 {
        struct rb_node *node = root->rb_node;
        struct ib_sa_attr_list *attr_list;
@@ -213,46 +282,47 @@ static struct ib_sa_attr_list * find_att
        return NULL;
 }

-static int insert_attr(struct rb_root *root, int update, void *key,
+static int insert_attr(struct rb_root *root, unsigned long update_id, void 
*key,
                       struct ib_sa_iterator *iter)
 {
        struct ib_sa_attr_list *attr_list;
        void *err;

+       write_lock_irq(&rwlock);
        attr_list = find_attr_list(root, key);
        if (!attr_list) {
+               write_unlock_irq(&rwlock);
                attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL);
                if (!attr_list)
                        return -ENOMEM;

                attr_list->iter.next = NULL;
                attr_list->tail = &attr_list->iter;
-               attr_list->update = update;
+               attr_list->update_id = update_id;
                memcpy(attr_list->gid.raw, key, sizeof attr_list->gid);

+               write_lock_irq(&rwlock);
                err = insert_attr_list(root, attr_list);
                if (err) {
+                       write_unlock_irq(&rwlock);
                        kfree(attr_list);
                        return PTR_ERR(err);
                }
-       } else if (attr_list->update != update) {
+       } else if (attr_list->update_id != update_id) {
                free_attr_list(attr_list);
-               attr_list->update = update;
+               attr_list->update_id = update_id;
        }

-       /*
-        * Assume that the SA returned the best attribute first, and insert
-        * attributes on the tail.
-        */
        attr_list->tail->next = iter;
        iter->next = NULL;
        attr_list->tail = iter;
+       write_unlock_irq(&rwlock);
        return 0;
 }

-static struct ib_sa_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc)
+static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc 
*mad_recv_wc)
 {
-       struct ib_sa_iter *iter;
+       struct ib_sa_mad_iter *iter;
        struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
        int attr_size, attr_offset;

@@ -273,12 +343,12 @@ static struct ib_sa_iter *ib_sa_iter_cre
        return iter;
 }

-static void ib_sa_iter_free(struct ib_sa_iter *iter)
+static void ib_sa_iter_free(struct ib_sa_mad_iter *iter)
 {
        kfree(iter);
 }

-static void *ib_sa_iter_next(struct ib_sa_iter *iter)
+static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter)
 {
        struct ib_sa_mad *mad;
        int left, offset = 0;
@@ -319,67 +389,48 @@ out:
 }

 /*
- * Copy a path record from a received MAD and insert it into our index.
- * The path record in the MAD is in network order, so must be swapped.  It
- * can also span multiple MADs, just to make our life hard.
+ * Copy path records from a received response and insert them into our cache.
+ * A path record in the MADs are in network order, packed, and may
+ * span multiple MAD buffers, just to make our life hard.
  */
-static void update_path_rec(struct sa_db_port *port,
-                           struct ib_mad_recv_wc *mad_recv_wc)
+static void update_path_db(struct sa_db_port *port,
+                          struct ib_mad_recv_wc *mad_recv_wc,
+                          enum sa_update_type type)
 {
-       struct ib_sa_iter *iter;
+       struct ib_sa_mad_iter *iter;
        struct ib_path_rec_info *path_info;
        void *attr;
+       int ret;

        iter = ib_sa_iter_create(mad_recv_wc);
        if (IS_ERR(iter))
                return;

-       down_write(&lock);
-       port->update++;
+       port->update_id += (type == SA_UPDATE_FULL);
+
        while ((attr = ib_sa_iter_next(iter)) &&
               (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) {

                ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC);
-               if (insert_attr(&port->paths, port->update,
-                               path_info->rec.dgid.raw,
-                               &path_info->iter)) {
+
+               ret = insert_attr(&port->paths, port->update_id,
+                                 path_info->rec.dgid.raw, &path_info->iter);
+               if (ret) {
                        kfree(path_info);
                        break;
                }
        }
-       up_write(&lock);
        ib_sa_iter_free(iter);
-}
-
-static void recv_handler(struct ib_mad_agent *mad_agent,
-                        struct ib_mad_recv_wc *mad_recv_wc)
-{
-       struct ib_sa_mad *mad = (void *) mad_recv_wc->recv_buf.mad;
-
-       if (mad->mad_hdr.status)
-               goto done;

-       switch (cpu_to_be16(mad->mad_hdr.attr_id)) {
-       case IB_SA_ATTR_PATH_REC:
-               update_path_rec(mad_agent->context, mad_recv_wc);
-               break;
-       default:
-               break;
-       }
-done:
-       ib_free_recv_mad(mad_recv_wc);
+       if (type == SA_UPDATE_FULL)
+               remove_old_attrs(&port->paths, port->update_id);
 }

-static struct ib_mad_send_buf* get_sa_msg(struct sa_db_port *port)
+static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port,
+                                         struct update_info *update)
 {
-       struct ib_port_attr     port_attr;
-       struct ib_ah_attr       ah_attr;
-       struct ib_mad_send_buf  *msg;
-       int ret;
-
-       ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
-       if (ret || port_attr.state != IB_PORT_ACTIVE)
-               return NULL;
+       struct ib_ah_attr ah_attr;
+       struct ib_mad_send_buf *msg;

        msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR,
                                 IB_MGMT_SA_DATA, GFP_KERNEL);
@@ -387,8 +438,8 @@ static struct ib_mad_send_buf* get_sa_ms
                return NULL;

        memset(&ah_attr, 0, sizeof ah_attr);
-       ah_attr.dlid = port_attr.sm_lid;
-       ah_attr.sl = port_attr.sm_sl;
+       ah_attr.dlid = port->sm_lid;
+       ah_attr.sl = port->sm_sl;
        ah_attr.port_num = port->port_num;

        msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
@@ -398,8 +449,9 @@ static struct ib_mad_send_buf* get_sa_ms
        }

        msg->timeout_ms = retry_timer;
-       msg->retries = retries;
+       msg->retries = 0;
        msg->context[0] = port;
+       msg->context[1] = update;
        return msg;
 }

@@ -411,6 +463,7 @@ static __be64 form_tid(u32 hi_tid)
 }

 static void format_path_req(struct sa_db_port *port,
+                           struct update_info *update,
                            struct ib_mad_send_buf *msg)
 {
        struct ib_sa_mad *mad = msg->mad;
@@ -426,198 +479,749 @@ static void format_path_req(struct sa_db
        mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH;

        path_rec.sgid = port->gid;
-       path_rec.numb_path = paths_per_dest;
+       path_rec.numb_path = (u8) paths_per_dest;
+
+       if (update->type == SA_UPDATE_ADD) {
+               mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID;
+               memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid);
+       }
+
        ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC);
 }

-static void update_cache(struct work_struct *work)
+static int send_query(struct sa_db_port *port,
+                     struct update_info *update)
 {
-       struct sa_db_port *port;
-       struct ib_mad_send_buf *msg;
+       int ret;

-       port = container_of(work, typeof(*port), work.work);
-       msg = get_sa_msg(port);
-       if (!msg)
-               return;
+       port->msg = get_sa_msg(port, update);
+       if (!port->msg)
+               return -ENOMEM;

-       format_path_req(port, msg);
+       format_path_req(port, update, port->msg);

-       if (ib_post_send_mad(msg, NULL)) {
-               ib_destroy_ah(msg->ah);
-               ib_free_send_mad(msg);
-               return;
+       ret = ib_post_send_mad(port->msg, NULL);
+       if (ret)
+               goto err;
+
+       return 0;
+
+err:
+       ib_destroy_ah(port->msg->ah);
+       ib_free_send_mad(port->msg);
+       return ret;
+}
+
+static void add_update(struct sa_db_port *port, u8 *gid,
+                      enum sa_update_type type)
+{
+       struct update_info *update;
+
+       update = kmalloc(sizeof *update, GFP_KERNEL);
+       if (update) {
+               if (gid)
+                       memcpy(&update->gid, gid, sizeof update->gid);
+               update->type = type;
+               list_add(&update->list, &port->update_list);
        }

-       /*
-        * We record the time that we requested the update, rather than use the
-        * time that the update occurred.  This allows us to generate a new
-        * update if an event occurs while we're still processing this one.
-        */
-       port->update_time = jiffies;
-       queue_delayed_work(sa_wq, &port->work, cache_timeout);
+       if (port->state == SA_DB_IDLE) {
+               port->state = SA_DB_REFRESH;
+               process_updates(port);
+       }
 }

-static void schedule_update(struct sa_db_port *port)
+static void clean_update_list(struct sa_db_port *port)
 {
-       unsigned long time, delay;
+       struct update_info *update;

-       if (!paths_per_dest)
+       while (!list_empty(&port->update_list)) {
+               update = list_entry(port->update_list.next,
+                                   struct update_info, list);
+               list_del(&update->list);
+               kfree(update);
+       }
+}
+
+static int notice_handler(int status, struct ib_inform_info *info,
+                         struct ib_sa_notice *notice)
+{
+       struct sa_db_port *port = info->context;
+       struct ib_sa_notice_data_gid *gid_data;
+       struct ib_inform_info **pinfo;
+       enum sa_update_type type;
+
+       if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) {
+               pinfo = &port->in_info;
+               type = SA_UPDATE_ADD;
+       } else {
+               pinfo = &port->out_info;
+               type = SA_UPDATE_REMOVE;
+       }
+
+       mutex_lock(&lock);
+       if (port->state == SA_DB_DESTROY || !*pinfo) {
+               mutex_unlock(&lock);
+               return 0;
+       }
+
+       if (notice) {
+               gid_data = (struct ib_sa_notice_data_gid *)
+                          &notice->data_details;
+               add_update(port, gid_data->gid, type);
+               mutex_unlock(&lock);
+       } else if (status == -ENETRESET) {
+               *pinfo = NULL;
+               mutex_unlock(&lock);
+       } else {
+               if (status)
+                       *pinfo = ERR_PTR(-EINVAL);
+               port->state = SA_DB_IDLE;
+               clean_update_list(port);
+               mutex_unlock(&lock);
+               queue_work(sa_wq, &port->work);
+       }
+
+       return status;
+}
+
+static int reg_in_info(struct sa_db_port *port)
+{
+       int ret = 0;
+
+       port->in_info = ib_sa_register_inform_info(&sa_client,
+                                                  port->dev->device,
+                                                  port->port_num,
+                                                  IB_SA_SM_TRAP_GID_IN_SERVICE,
+                                                  GFP_KERNEL, notice_handler,
+                                                  port);
+       if (IS_ERR(port->in_info))
+               ret = PTR_ERR(port->in_info);
+
+       return ret;
+}
+
+static int reg_out_info(struct sa_db_port *port)
+{
+       int ret = 0;
+
+       port->out_info = ib_sa_register_inform_info(&sa_client,
+                                                   port->dev->device,
+                                                   port->port_num,
+                                                   
IB_SA_SM_TRAP_GID_OUT_OF_SERVICE,
+                                                   GFP_KERNEL, notice_handler,
+                                                   port);
+       if (IS_ERR(port->out_info))
+               ret = PTR_ERR(port->out_info);
+
+       return ret;
+}
+
+static void unsubscribe_port(struct sa_db_port *port)
+{
+       if (port->in_info && !IS_ERR(port->in_info))
+               ib_sa_unregister_inform_info(port->in_info);
+
+       if (port->out_info && !IS_ERR(port->out_info))
+               ib_sa_unregister_inform_info(port->out_info);
+
+       port->out_info = NULL;
+       port->in_info = NULL;
+
+}
+
+static void cleanup_port(struct sa_db_port *port)
+{
+       unsubscribe_port(port);
+
+       clean_update_list(port);
+       remove_all_attrs(&port->paths);
+}
+
+static int update_port_info(struct sa_db_port *port)
+{
+       struct ib_port_attr port_attr;
+       int ret;
+
+       ret = ib_query_port(port->dev->device, port->port_num, &port_attr);
+       if (ret)
+               return ret;
+
+       if (port_attr.state != IB_PORT_ACTIVE)
+               return -ENODATA;
+
+        port->sm_lid = port_attr.sm_lid;
+       port->sm_sl = port_attr.sm_sl;
+       return 0;
+}
+
+static void process_updates(struct sa_db_port *port)
+{
+       struct update_info *update;
+       struct ib_sa_attr_list *attr_list;
+       int ret;
+
+       if (!paths_per_dest || update_port_info(port)) {
+               cleanup_port(port);
+               goto out;
+       }
+
+       /* Event registration is an optimization, so ignore failures. */
+       if (subscribe_inform_info) {
+               if (!port->out_info) {
+                       ret = reg_out_info(port);
+                       if (!ret)
+                               return;
+               }
+
+               if (!port->in_info) {
+                       ret = reg_in_info(port);
+                       if (!ret)
+                               return;
+               }
+       } else
+               unsubscribe_port(port);
+
+       while (!list_empty(&port->update_list)) {
+               update = list_entry(port->update_list.next,
+                                   struct update_info, list);
+
+               if (update->type == SA_UPDATE_REMOVE) {
+                       write_lock_irq(&rwlock);
+                       attr_list = find_attr_list(&port->paths,
+                                                  update->gid.raw);
+                       if (attr_list)
+                               remove_attr(&port->paths, attr_list);
+                       write_unlock_irq(&rwlock);
+               } else {
+                       ret = send_query(port, update);
+                       if (!ret)
+                               return;
+
+               }
+               list_del(&update->list);
+               kfree(update);
+       }
+out:
+       port->state = SA_DB_IDLE;
+}
+
+static void refresh_port_db(struct sa_db_port *port)
+{
+       if (port->state == SA_DB_DESTROY)
                return;

-       time = jiffies;
-       if (time_after(time, port->update_time + hold_time))
-               delay = update_delay;
-       else
-               delay = port->update_time + hold_time - time;
+       if (port->state == SA_DB_REFRESH) {
+               clean_update_list(port);
+               ib_cancel_mad(port->agent, port->msg);
+       }

-       cancel_delayed_work(&port->work);
-       queue_delayed_work(sa_wq, &port->work, delay);
+       add_update(port, NULL, SA_UPDATE_FULL);
 }

-static void handle_event(struct ib_event_handler *event_handler,
-                        struct ib_event *event)
+static void refresh_dev_db(struct sa_db_device *dev)
+{
+       int i;
+
+       for (i = 0; i < dev->port_count; i++)
+               refresh_port_db(&dev->port[i]);
+}
+
+static void refresh_db(void)
 {
        struct sa_db_device *dev;
-       dev = container_of(event_handler, typeof(*dev), event_handler);

-       if (event->event == IB_EVENT_PORT_ERR    ||
-           event->event == IB_EVENT_PORT_ACTIVE ||
-           event->event == IB_EVENT_LID_CHANGE  ||
-           event->event == IB_EVENT_PKEY_CHANGE ||
-           event->event == IB_EVENT_SM_CHANGE   ||
-           event->event == IB_EVENT_CLIENT_REREGISTER)
-               schedule_update(&dev->port[event->element.port_num - 1]);
+       list_for_each_entry(dev, &dev_list, list)
+               refresh_dev_db(dev);
 }

-int ib_get_path_rec(struct ib_device *device, u8 port_num, union ib_gid *sgid,
-                   union ib_gid *dgid, u16 pkey, struct ib_sa_path_rec *rec)
+static int do_refresh(const char *val, struct kernel_param *kp)
 {
-       struct ib_sa_iterator *iter;
-       struct ib_sa_path_rec *path;
-       int ret = -ENODATA;
+       mutex_lock(&lock);
+       refresh_db();
+       mutex_unlock(&lock);
+       return 0;
+}

-       iter = ib_create_path_iter(device, port_num, dgid);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
+static int get_lookup_method(char *buf, struct kernel_param *kp)
+{
+       return sprintf(buf,
+                      "%c %d round robin\n"
+                      "%c %d random",
+                      (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ',
+                      SA_DB_LOOKUP_LEAST_USED,
+                      (lookup_method == SA_DB_LOOKUP_RANDOM) ? '*' : ' ',
+                      SA_DB_LOOKUP_RANDOM);
+}

-       for (path = ib_get_next_sa_attr(&iter); path;
-            path = ib_get_next_sa_attr(&iter)) {
-               if (pkey == path->pkey &&
-                   !memcmp(sgid, path->sgid.raw, sizeof *sgid)) {
-                       memcpy(rec, path, sizeof *rec);
-                       ret = 0;
-                       break;
-                   }
+static int set_lookup_method(const char *val, struct kernel_param *kp)
+{
+       unsigned long method;
+       int ret = 0;
+
+       method = simple_strtoul(val, NULL, 0);
+
+       switch (method) {
+       case SA_DB_LOOKUP_LEAST_USED:
+       case SA_DB_LOOKUP_RANDOM:
+               lookup_method = method;
+               break;
+       default:
+               ret = -EINVAL;
+               break;
        }

-       ib_free_sa_iter(iter);
        return ret;
 }
-EXPORT_SYMBOL(ib_get_path_rec);

-struct ib_sa_iterator *ib_create_path_iter(struct ib_device *device,
-                                          u8 port_num, union ib_gid *dgid)
+static int set_paths_per_dest(const char *val, struct kernel_param *kp)
+{
+       int ret;
+
+       mutex_lock(&lock);
+       ret = param_set_ulong(val, kp);
+       if (ret)
+               goto out;
+
+       if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST)
+               paths_per_dest = SA_DB_MAX_PATHS_PER_DEST;
+       refresh_db();
+out:
+       mutex_unlock(&lock);
+       return ret;
+}
+
+static int set_subscribe_inform_info(const char *val, struct kernel_param *kp)
+{
+       int ret;
+
+       ret = param_set_bool(val, kp);
+       if (ret)
+               return ret;
+
+       return do_refresh(val, kp);
+}
+
+static void port_work_handler(struct work_struct *work)
+{
+       struct sa_db_port *port;
+
+       port = container_of(work, typeof(*port), work);
+       mutex_lock(&lock);
+       refresh_port_db(port);
+       mutex_unlock(&lock);
+}
+
+static void handle_event(struct ib_event_handler *event_handler,
+                        struct ib_event *event)
+{
+       struct sa_db_device *dev;
+       struct sa_db_port *port;
+
+       dev = container_of(event_handler, typeof(*dev), event_handler);
+       port = &dev->port[event->element.port_num - dev->start_port];
+
+       switch (event->event) {
+       case IB_EVENT_PORT_ERR:
+       case IB_EVENT_LID_CHANGE:
+       case IB_EVENT_SM_CHANGE:
+       case IB_EVENT_CLIENT_REREGISTER:
+       case IB_EVENT_PKEY_CHANGE:
+       case IB_EVENT_PORT_ACTIVE:
+               queue_work(sa_wq, &port->work);
+               break;
+       default:
+               break;
+       }
+}
+
+static void ib_free_path_iter(struct ib_sa_attr_iter *iter)
+{
+       read_unlock_irqrestore(&rwlock, iter->flags);
+}
+
+static int ib_create_path_iter(struct ib_device *device, u8 port_num,
+                              union ib_gid *dgid, struct ib_sa_attr_iter *iter)
 {
        struct sa_db_device *dev;
        struct sa_db_port *port;
        struct ib_sa_attr_list *list;
-       int ret;

-       down_read(&lock);
        dev = ib_get_client_data(device, &sa_db_client);
-       if (!dev) {
-               ret = -ENODEV;
-               goto err;
-       }
-       port = &dev->port[port_num - 1];
+       if (!dev)
+               return -ENODEV;

+       port = &dev->port[port_num - dev->start_port];
+
+       read_lock_irqsave(&rwlock, iter->flags);
        list = find_attr_list(&port->paths, dgid->raw);
        if (!list) {
-               ret = -ENODATA;
-               goto err;
+               ib_free_path_iter(iter);
+               return -ENODATA;
        }

-       return &list->iter;
-err:
-       up_read(&lock);
-       return ERR_PTR(ret);
+       iter->iter = &list->iter;
+       return 0;
+}
+
+static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter)
+{
+       struct ib_path_rec_info *next_path;
+
+       iter->iter = iter->iter->next;
+       if (iter->iter) {
+               next_path = container_of(iter->iter, struct ib_path_rec_info, 
iter);
+               return &next_path->rec;
+       } else
+               return NULL;
 }
-EXPORT_SYMBOL(ib_create_path_iter);

-void ib_free_sa_iter(struct ib_sa_iterator *iter)
+static int cmp_rec(struct ib_sa_path_rec *src,
+                  struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask)
 {
-       up_read(&lock);
+       /* DGID check already done */
+       if (comp_mask & IB_SA_PATH_REC_SGID &&
+           memcmp(&src->sgid, &dst->sgid, sizeof src->sgid))
+               return -EINVAL;
+       if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid)
+               return -EINVAL;
+       if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid)
+               return -EINVAL;
+       if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC &&
+           src->raw_traffic != dst->raw_traffic)
+               return -EINVAL;
+
+       if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL &&
+           src->flow_label != dst->flow_label)
+               return -EINVAL;
+       if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT &&
+           src->hop_limit != dst->hop_limit)
+               return -EINVAL;
+       if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS &&
+           src->traffic_class != dst->traffic_class)
+               return -EINVAL;
+       if (comp_mask & IB_SA_PATH_REC_REVERSIBLE &&
+           dst->reversible && !src->reversible)
+               return -EINVAL;
+       /* Numb path check already done */
+       if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey)
+               return -EINVAL;
+
+       if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl)
+               return -EINVAL;
+
+       if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR,
+                                IB_SA_PATH_REC_MTU, dst->mtu_selector,
+                                src->mtu, dst->mtu))
+               return -EINVAL;
+       if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR,
+                                IB_SA_PATH_REC_RATE, dst->rate_selector,
+                                src->rate, dst->rate))
+               return -EINVAL;
+       if (ib_sa_check_selector(comp_mask,
+                                IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR,
+                                IB_SA_PATH_REC_PACKET_LIFE_TIME,
+                                dst->packet_life_time_selector,
+                                src->packet_life_time, dst->packet_life_time))
+               return -EINVAL;
+
+       return 0;
+}
+
+static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter,
+                                             struct ib_sa_path_rec *req_path,
+                                             ib_sa_comp_mask comp_mask)
+{
+       struct ib_sa_path_rec *path, *rand_path = NULL;
+       int num, count = 0;
+
+       for (path = ib_get_next_path(iter); path;
+            path = ib_get_next_path(iter)) {
+               if (!cmp_rec(path, req_path, comp_mask)) {
+                       get_random_bytes(&num, sizeof num);
+                       if ((num % ++count) == 0)
+                               rand_path = path;
+               }
+       }
+
+       return rand_path;
 }
-EXPORT_SYMBOL(ib_free_sa_iter);

-void *ib_get_next_sa_attr(struct ib_sa_iterator **iter)
+static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter,
+                                           struct ib_sa_path_rec *req_path,
+                                           ib_sa_comp_mask comp_mask)
 {
-       *iter = (*iter)->next;
-       if (*iter)
-               return ((void *)(*iter)) + sizeof(**iter);
-       else
+       struct ib_path_rec_info *cur_path, *next_path = NULL;
+       struct ib_sa_path_rec *path;
+       unsigned long lookups = ~0;
+
+       for (path = ib_get_next_path(iter); path;
+            path = ib_get_next_path(iter)) {
+               if (!cmp_rec(path, req_path, comp_mask)) {
+
+                       cur_path = container_of(iter->iter, struct ib_path_rec_info,
+                                               iter);
+                       if (cur_path->lookups < lookups) {
+                               lookups = cur_path->lookups;
+                               next_path = cur_path;
+                       }
+               }
+       }
+
+       if (next_path) {
+               next_path->lookups++;
+               return &next_path->rec;
+       } else
                return NULL;
 }
-EXPORT_SYMBOL(ib_get_next_sa_attr);

-static void sa_db_add_one(struct ib_device *device)
+static void report_path(struct work_struct *work)
+{
+       struct sa_path_request *req;
+
+       req = container_of(work, struct sa_path_request, work);
+       req->callback(0, &req->path_rec, req->context);
+       ib_sa_client_put(req->client);
+       kfree(req);
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path.  The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query.  The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code.  Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+                      struct ib_device *device, u8 port_num,
+                      struct ib_sa_path_rec *rec,
+                      ib_sa_comp_mask comp_mask,
+                      int timeout_ms, gfp_t gfp_mask,
+                      void (*callback)(int status,
+                                       struct ib_sa_path_rec *resp,
+                                       void *context),
+                      void *context,
+                      struct ib_sa_query **sa_query)
+{
+       struct sa_path_request *req;
+       struct ib_sa_attr_iter iter;
+       struct ib_sa_path_rec *path_rec;
+       int ret;
+
+       if (!paths_per_dest)
+               goto query_sa;
+
+       if (!(comp_mask & IB_SA_PATH_REC_DGID) ||
+           !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1)
+               goto query_sa;
+
+       req = kmalloc(sizeof *req, gfp_mask);
+       if (!req)
+               goto query_sa;
+
+       ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter);
+       if (ret)
+               goto free_req;
+
+       if (lookup_method == SA_DB_LOOKUP_RANDOM)
+               path_rec = get_random_path(&iter, rec, comp_mask);
+       else
+               path_rec = get_next_path(&iter, rec, comp_mask);
+
+       if (!path_rec)
+               goto free_iter;
+
+       memcpy(&req->path_rec, path_rec, sizeof *path_rec);
+       ib_free_path_iter(&iter);
+
+       INIT_WORK(&req->work, report_path);
+       req->client = client;
+       req->callback = callback;
+       req->context = context;
+
+       ib_sa_client_get(client);
+       queue_work(sa_wq, &req->work);
+       *sa_query = ERR_PTR(-EEXIST);
+       return 0;
+
+free_iter:
+       ib_free_path_iter(&iter);
+free_req:
+       kfree(req);
+query_sa:
+       return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask,
+                                   timeout_ms, gfp_mask, callback, context,
+                                   sa_query);
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+                        struct ib_mad_recv_wc *mad_recv_wc)
+{
+       struct sa_db_port *port;
+       struct update_info *update;
+       struct ib_mad_send_buf *msg;
+       enum sa_update_type type;
+
+       msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id;
+       port = msg->context[0];
+       update = msg->context[1];
+
+       mutex_lock(&lock);
+       if (port->state == SA_DB_DESTROY ||
+           update != list_entry(port->update_list.next,
+                                struct update_info, list)) {
+               mutex_unlock(&lock);
+       } else {
+               type = update->type;
+               mutex_unlock(&lock);
+               update_path_db(mad_agent->context, mad_recv_wc, type);
+       }
+
+       ib_free_recv_mad(mad_recv_wc);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+                        struct ib_mad_send_wc *mad_send_wc)
+{
+       struct ib_mad_send_buf *msg;
+       struct sa_db_port *port;
+       struct update_info *update;
+       int ret;
+
+       msg = mad_send_wc->send_buf;
+       port = msg->context[0];
+       update = msg->context[1];
+
+       mutex_lock(&lock);
+       if (port->state == SA_DB_DESTROY)
+               goto unlock;
+
+       if (update == list_entry(port->update_list.next,
+                                struct update_info, list)) {
+
+               if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR &&
+                   msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) {
+
+                       msg->timeout_ms <<= 1;
+                       ret = ib_post_send_mad(msg, NULL);
+                       if (!ret) {
+                               mutex_unlock(&lock);
+                               return;
+                       }
+               }
+               list_del(&update->list);
+               kfree(update);
+       }
+       process_updates(port);
+unlock:
+       mutex_unlock(&lock);
+
+       ib_destroy_ah(msg->ah);
+       ib_free_send_mad(msg);
+}
+
+static int init_port(struct sa_db_device *dev, int port_num)
+{
+       struct sa_db_port *port;
+       int ret;
+
+       port = &dev->port[port_num - dev->start_port];
+       port->dev = dev;
+       port->port_num = port_num;
+       INIT_WORK(&port->work, port_work_handler);
+       port->paths = RB_ROOT;
+       INIT_LIST_HEAD(&port->update_list);
+
+       ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid);
+       if (ret)
+               return ret;
+
+       port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI,
+                                           NULL, IB_MGMT_RMPP_VERSION,
+                                           send_handler, recv_handler, port);
+       if (IS_ERR(port->agent))
+               ret = PTR_ERR(port->agent);
+
+       return ret;
+}
+
+static void destroy_port(struct sa_db_port *port)
+{
+       mutex_lock(&lock);
+       port->state = SA_DB_DESTROY;
+       mutex_unlock(&lock);
+
+       ib_unregister_mad_agent(port->agent);
+       cleanup_port(port);
+       flush_workqueue(sa_wq);
+}
+
+static void sa_db_add_dev(struct ib_device *device)
 {
        struct sa_db_device *dev;
        struct sa_db_port *port;
-       int i;
+       int s, e, i, ret;

        if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
                return;

-       dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
-                     GFP_KERNEL);
+       if (device->node_type == RDMA_NODE_IB_SWITCH) {
+               s = e = 0;
+       } else {
+               s = 1;
+               e = device->phys_port_cnt;
+       }
+
+       dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL);
        if (!dev)
                return;

-       for (i = 1; i <= device->phys_port_cnt; i++) {
-               port = &dev->port[i-1];
-               port->dev = dev;
-               port->port_num = i;
-               port->update_time = jiffies - hold_time;
-               port->update = 0;
-               INIT_DELAYED_WORK(&port->work, update_cache);
-               port->paths = RB_ROOT;
-
-               if (ib_get_cached_gid(device, i, 0, &port->gid))
-                       goto err;
-
-               port->agent = ib_register_mad_agent(device, i, IB_QPT_GSI,
-                                                   NULL, IB_MGMT_RMPP_VERSION,
-                                                   send_handler, recv_handler,
-                                                   port);
-               if (IS_ERR(port->agent))
+       dev->start_port = s;
+       dev->port_count = e - s + 1;
+       dev->device = device;
+       for (i = 0; i < dev->port_count; i++) {
+               ret = init_port(dev, s + i);
+               if (ret)
                        goto err;
        }

-       dev->device = device;
        ib_set_client_data(device, &sa_db_client, dev);

-       down_write(&lock);
+       INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
+
+       mutex_lock(&lock);
        list_add_tail(&dev->list, &dev_list);
-       up_write(&lock);
+       refresh_dev_db(dev);
+       mutex_unlock(&lock);

-       /* Initialization must be complete before cache updates can occur. */
-       INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event);
        ib_register_event_handler(&dev->event_handler);
-
-       /* Force an update now. */
-       for (i = 1; i <= device->phys_port_cnt; i++)
-               schedule_update(&dev->port[i-1]);
        return;
 err:
-       while (--i) {
-               ib_unregister_mad_agent(dev->port[i-1].agent);
-               remove_all_attrs(&dev->port[i-1].paths);
-       }
+       while (i--)
+               destroy_port(&dev->port[i]);
        kfree(dev);
 }

-static void sa_db_remove_one(struct ib_device *device)
+static void sa_db_remove_dev(struct ib_device *device)
 {
        struct sa_db_device *dev;
        int i;
@@ -627,53 +1231,43 @@ static void sa_db_remove_one(struct ib_d
                return;

        ib_unregister_event_handler(&dev->event_handler);
-       for (i = 0; i < device->phys_port_cnt; i++)
-               cancel_delayed_work(&dev->port[i].work);
        flush_workqueue(sa_wq);

-       for (i = 0; i < device->phys_port_cnt; i++) {
-               ib_unregister_mad_agent(dev->port[i].agent);
-               remove_all_attrs(&dev->port[i].paths);
-       }
+       for (i = 0; i < dev->port_count; i++)
+               destroy_port(&dev->port[i]);

-       down_write(&lock);
+       mutex_lock(&lock);
        list_del(&dev->list);
-       up_write(&lock);
+       mutex_unlock(&lock);
+
        kfree(dev);
 }

-static int __init sa_db_init(void)
+int sa_db_init(void)
 {
        int ret;

-       cache_timeout = msecs_to_jiffies(cache_timeout);
-       hold_time = msecs_to_jiffies(hold_time);
-       update_delay = msecs_to_jiffies(update_delay);
-
-       if (!cache_timeout)
-               paths_per_dest = 0;
-       else if (paths_per_dest > IB_MAX_PATHS_PER_DEST)
-               paths_per_dest = IB_MAX_PATHS_PER_DEST;
-
+       rwlock_init(&rwlock);
        sa_wq = create_singlethread_workqueue("local_sa");
        if (!sa_wq)
                return -ENOMEM;

+       ib_sa_register_client(&sa_client);
        ret = ib_register_client(&sa_db_client);
        if (ret)
                goto err;
+
        return 0;

 err:
+       ib_sa_unregister_client(&sa_client);
        destroy_workqueue(sa_wq);
        return ret;
 }

-static void __exit sa_db_cleanup(void)
+void sa_db_cleanup(void)
 {
        ib_unregister_client(&sa_db_client);
+       ib_sa_unregister_client(&sa_client);
        destroy_workqueue(sa_wq);
 }
-
-module_init(sa_db_init);
-module_exit(sa_db_cleanup);
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to