Re: [ofa-general] [PATCH v3] iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

2007-09-27 Thread Sean Hefty

The sysadmin creates for iwarp use only alias interfaces of the form
devname:iw* where devname is the native interface name (eg eth0) for the
iwarp netdev device.  The alias label can be anything starting with iw.
The iw immediately after the ':' is the key used by the iw_cxgb3 driver.


I'm still not sure about this, but haven't come up with anything better 
myself.  And if there's a good chance of other rnic's needing the same 
support, I'd rather see the common code separated out, even if just 
encapsulated within this module for easy re-use.


As for the code, I have a couple of questions about whether deadlock and 
a race condition are possible, plus a few minor comments.



+static void insert_ifa(struct iwch_dev *rnicp, struct in_ifaddr *ifa)
+{
+   struct iwch_addrlist *addr;
+
+   addr = kmalloc(sizeof *addr, GFP_KERNEL);
+   if (!addr) {
+   printk(KERN_ERR MOD %s - failed to alloc memory!\n,
+  __FUNCTION__);
+   return;
+   }
+   addr-ifa = ifa;
+   mutex_lock(rnicp-mutex);
+   list_add_tail(addr-entry, rnicp-addrlist);
+   mutex_unlock(rnicp-mutex);
+}


Should this return success/failure?


+static int nb_callback(struct notifier_block *self, unsigned long event,
+  void *ctx)
+{
+   struct in_ifaddr *ifa = ctx;
+   struct iwch_dev *rnicp = container_of(self, struct iwch_dev, nb);
+
+   PDBG(%s rnicp %p event %lx\n, __FUNCTION__, rnicp, event);
+
+   switch (event) {
+   case NETDEV_UP:
+   if (netdev_is_ours(rnicp, ifa-ifa_dev-dev) 
+   is_iwarp_label(ifa-ifa_label)) {
+   PDBG(%s label %s addr 0x%x added\n,
+   __FUNCTION__, ifa-ifa_label, ifa-ifa_address);
+   insert_ifa(rnicp, ifa);
+   iwch_listeners_add_addr(rnicp, ifa-ifa_address);


If insert_ifa() fails, what will iwch_listeners_add_addr() do?  (I'm not 
easily seeing the relationship between the address list and the listen 
list at this point.)



+   }
+   break;
+   case NETDEV_DOWN:
+   if (netdev_is_ours(rnicp, ifa-ifa_dev-dev) 
+   is_iwarp_label(ifa-ifa_label)) {
+   PDBG(%s label %s addr 0x%x deleted\n,
+   __FUNCTION__, ifa-ifa_label, ifa-ifa_address);
+   iwch_listeners_del_addr(rnicp, ifa-ifa_address);
+   remove_ifa(rnicp, ifa);
+   }
+   break;
+   default:
+   break;
+   }
+   return 0;
+}
+
+static void delete_addrlist(struct iwch_dev *rnicp)
+{
+   struct iwch_addrlist *addr, *tmp;
+
+   mutex_lock(rnicp-mutex);
+   list_for_each_entry_safe(addr, tmp, rnicp-addrlist, entry) {
+   list_del(addr-entry);
+   kfree(addr);
+   }
+   mutex_unlock(rnicp-mutex);
+}
+
+static void populate_addrlist(struct iwch_dev *rnicp)
+{
+   int i;
+   struct in_device *indev;
+
+   for (i = 0; i  rnicp-rdev.port_info.nports; i++) {
+   indev = in_dev_get(rnicp-rdev.port_info.lldevs[i]);
+   if (!indev)
+   continue;
+   for_ifa(indev)
+   if (is_iwarp_label(ifa-ifa_label)) {
+   PDBG(%s label %s addr 0x%x added\n,
+__FUNCTION__, ifa-ifa_label,
+ifa-ifa_address);
+   insert_ifa(rnicp, ifa);
+   }
+   endfor_ifa(indev);
+   }
+}
+
 static void rnic_init(struct iwch_dev *rnicp)
 {
PDBG(%s iwch_dev %p\n, __FUNCTION__,  rnicp);
@@ -70,6 +187,12 @@ static void rnic_init(struct iwch_dev *r
idr_init(rnicp-qpidr);
idr_init(rnicp-mmidr);
spin_lock_init(rnicp-lock);
+   INIT_LIST_HEAD(rnicp-addrlist);
+   INIT_LIST_HEAD(rnicp-listen_eps);
+   mutex_init(rnicp-mutex);
+   rnicp-nb.notifier_call = nb_callback;
+   populate_addrlist(rnicp);
+   register_inetaddr_notifier(rnicp-nb);
 
 	rnicp-attr.vendor_id = 0x168;

rnicp-attr.vendor_part_id = 7;
@@ -148,6 +271,8 @@ static void close_rnic_dev(struct t3cdev
mutex_lock(dev_mutex);
list_for_each_entry_safe(dev, tmp, dev_list, entry) {
if (dev-rdev.t3cdev_p == tdev) {
+   unregister_inetaddr_notifier(dev-nb);
+   delete_addrlist(dev);
list_del(dev-entry);
iwch_unregister_device(dev);
cxio_rdev_close(dev-rdev);
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h 
b/drivers/infiniband/hw/cxgb3/iwch.h
index caf4e60..7fa0a47 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.h
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -36,6 +36,8 @@ #include linux/mutex.h
 #include linux/list.h
 #include linux/spinlock.h
 

Re: [ofa-general] [PATCH v3] iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

2007-09-27 Thread Steve Wise



Sean Hefty wrote:

The sysadmin creates for iwarp use only alias interfaces of the form
devname:iw* where devname is the native interface name (eg eth0) for 
the

iwarp netdev device.  The alias label can be anything starting with iw.
The iw immediately after the ':' is the key used by the iw_cxgb3 
driver.


I'm still not sure about this, but haven't come up with anything better 
myself.  And if there's a good chance of other rnic's needing the same 
support, I'd rather see the common code separated out, even if just 
encapsulated within this module for easy re-use.


As for the code, I have a couple of questions about whether deadlock and 
a race condition are possible, plus a few minor comments.




Thanks for reviewing!  Responses are in-line below.



+static void insert_ifa(struct iwch_dev *rnicp, struct in_ifaddr *ifa)
+{
+struct iwch_addrlist *addr;
+
+addr = kmalloc(sizeof *addr, GFP_KERNEL);
+if (!addr) {
+printk(KERN_ERR MOD %s - failed to alloc memory!\n,
+   __FUNCTION__);
+return;
+}
+addr-ifa = ifa;
+mutex_lock(rnicp-mutex);
+list_add_tail(addr-entry, rnicp-addrlist);
+mutex_unlock(rnicp-mutex);
+}


Should this return success/failure?



I think so.  See below...


+static int nb_callback(struct notifier_block *self, unsigned long event,
+   void *ctx)
+{
+struct in_ifaddr *ifa = ctx;
+struct iwch_dev *rnicp = container_of(self, struct iwch_dev, nb);
+
+PDBG(%s rnicp %p event %lx\n, __FUNCTION__, rnicp, event);
+
+switch (event) {
+case NETDEV_UP:
+if (netdev_is_ours(rnicp, ifa-ifa_dev-dev) 
+is_iwarp_label(ifa-ifa_label)) {
+PDBG(%s label %s addr 0x%x added\n,
+__FUNCTION__, ifa-ifa_label, ifa-ifa_address);
+insert_ifa(rnicp, ifa);
+iwch_listeners_add_addr(rnicp, ifa-ifa_address);


If insert_ifa() fails, what will iwch_listeners_add_addr() do?  (I'm not 
easily seeing the relationship between the address list and the listen 
list at this point.)




I guess insert_ifa() needs to return success/failure.  Then if we failed 
to add the ifa to the list we won't update the listeners.


The relationship is this:

- when a listen is done on addr 0.0.0.0, the code walks the list of 
addresses to do specific listens on each address.


- when an address is added or deleted, then the list of current 
listeners is walked and updated accordingly.



+}
+break;
+case NETDEV_DOWN:
+if (netdev_is_ours(rnicp, ifa-ifa_dev-dev) 
+is_iwarp_label(ifa-ifa_label)) {
+PDBG(%s label %s addr 0x%x deleted\n,
+__FUNCTION__, ifa-ifa_label, ifa-ifa_address);
+iwch_listeners_del_addr(rnicp, ifa-ifa_address);
+remove_ifa(rnicp, ifa);
+}
+break;
+default:
+break;
+}
+return 0;
+}
+
+static void delete_addrlist(struct iwch_dev *rnicp)
+{
+struct iwch_addrlist *addr, *tmp;
+
+mutex_lock(rnicp-mutex);
+list_for_each_entry_safe(addr, tmp, rnicp-addrlist, entry) {
+list_del(addr-entry);
+kfree(addr);
+}
+mutex_unlock(rnicp-mutex);
+}
+
+static void populate_addrlist(struct iwch_dev *rnicp)
+{
+int i;
+struct in_device *indev;
+
+for (i = 0; i  rnicp-rdev.port_info.nports; i++) {
+indev = in_dev_get(rnicp-rdev.port_info.lldevs[i]);
+if (!indev)
+continue;
+for_ifa(indev)
+if (is_iwarp_label(ifa-ifa_label)) {
+PDBG(%s label %s addr 0x%x added\n,
+ __FUNCTION__, ifa-ifa_label,
+ ifa-ifa_address);
+insert_ifa(rnicp, ifa);
+}
+endfor_ifa(indev);
+}
+}
+
 static void rnic_init(struct iwch_dev *rnicp)
 {
 PDBG(%s iwch_dev %p\n, __FUNCTION__,  rnicp);
@@ -70,6 +187,12 @@ static void rnic_init(struct iwch_dev *r
 idr_init(rnicp-qpidr);
 idr_init(rnicp-mmidr);
 spin_lock_init(rnicp-lock);
+INIT_LIST_HEAD(rnicp-addrlist);
+INIT_LIST_HEAD(rnicp-listen_eps);
+mutex_init(rnicp-mutex);
+rnicp-nb.notifier_call = nb_callback;
+populate_addrlist(rnicp);
+register_inetaddr_notifier(rnicp-nb);
 
 rnicp-attr.vendor_id = 0x168;

 rnicp-attr.vendor_part_id = 7;
@@ -148,6 +271,8 @@ static void close_rnic_dev(struct t3cdev
 mutex_lock(dev_mutex);
 list_for_each_entry_safe(dev, tmp, dev_list, entry) {
 if (dev-rdev.t3cdev_p == tdev) {
+unregister_inetaddr_notifier(dev-nb);
+delete_addrlist(dev);
 list_del(dev-entry);
 iwch_unregister_device(dev);
 cxio_rdev_close(dev-rdev);
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h 
b/drivers/infiniband/hw/cxgb3/iwch.h

index caf4e60..7fa0a47 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.h
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -36,6 +36,8 @@ #include linux/mutex.h
 #include linux/list.h
 #include 

RE: [ofa-general] [PATCH v3] iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

2007-09-27 Thread Sean Hefty
It is ok to block while holding a mutex, yes?

It's okay, I just didn't try to trace through the code to see if it ever tries
to acquire the same mutex in the thread that needs to signal the event.

- Sean
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ofa-general] [PATCH v3] iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

2007-09-26 Thread Steve Wise

Rolan/Sean,

What do you all think?

Steve.


Steve Wise wrote:

iw_cxgb3: Support iwarp-only interfaces to avoid 4-tuple conflicts.

Version 3:

- don't use list_del_init() where list_del() is sufficient.

Version 2:

- added a per-device mutex for the address and listening endpoints lists.

- wait for all replies if sending multiple passive_open requests to rnic.

- log warning if no addresses are available when a listen is issued.

- tested

---

Design:

The sysadmin creates for iwarp use only alias interfaces of the form
devname:iw* where devname is the native interface name (eg eth0) for the
iwarp netdev device.  The alias label can be anything starting with iw.
The iw immediately after the ':' is the key used by the iw_cxgb3 driver.

EG:
ifconfig eth0 192.168.70.123 up
ifconfig eth0:iw1 192.168.71.123 up
ifconfig eth0:iw2 192.168.72.123 up

In the above example, 192.168.70/24 is for TCP traffic, while
192.168.71/24 and 192.168.72/24 are for iWARP/RDMA use.

The rdma-only interface must be on its own IP subnet. This allows routing
all rdma traffic onto this interface.

The iWARP driver must translate all listens on address 0.0.0.0 to the
set of rdma-only ip addresses for the device in question.  This prevents
incoming connect requests to the TCP ipaddresses from going up the
rdma stack.

Implementation Details:

- The iw_cxgb3 driver registers for inetaddr events via
register_inetaddr_notifier().  This allows tracking the iwarp-only
addresses/subnets as they get added and deleted.  The iwarp driver
maintains a list of the current iwarp-only addresses.

- The iw_cxgb3 driver builds the list of iwarp-only addresses for its
devices at module insert time.  This is needed because the inetaddr
notifier callbacks don't replay address-add events when someone
registers.  So the driver must build the initial list at module load time.

- When a listen is done on address 0.0.0.0, then the iw_cxgb3 driver
must translate that into a set of listens on the iwarp-only addresses.
This is implemented by maintaining a list of stid/addr entries per
listening endpoint.

- When a new iwarp-only address is added or removed, the iw_cxgb3 driver
must traverse the set of listening endpoints and update them accordingly.
This allows an application to bind to 0.0.0.0 prior to the iwarp-only
interfaces being configured.  It also allows changing the iwarp-only set
of addresses and getting the expected behavior for apps already bound
to 0.0.0.0.  This is done by maintaining a list of listening endpoints
off the device struct.

- The address list, the listening endpoint list, and each list of
stid/addrs in use per listening endpoint are all protected via a mutex
per iw_cxgb3 device.

Signed-off-by: Steve Wise [EMAIL PROTECTED]
---

 drivers/infiniband/hw/cxgb3/iwch.c|  125 
 drivers/infiniband/hw/cxgb3/iwch.h|   11 +
 drivers/infiniband/hw/cxgb3/iwch_cm.c |  259 +++--
 drivers/infiniband/hw/cxgb3/iwch_cm.h |   15 ++
 4 files changed, 360 insertions(+), 50 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb3/iwch.c 
b/drivers/infiniband/hw/cxgb3/iwch.c
index 0315c9d..d81d46e 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -63,6 +63,123 @@ struct cxgb3_client t3c_client = {
 static LIST_HEAD(dev_list);
 static DEFINE_MUTEX(dev_mutex);
 
+static void insert_ifa(struct iwch_dev *rnicp, struct in_ifaddr *ifa)

+{
+   struct iwch_addrlist *addr;
+
+   addr = kmalloc(sizeof *addr, GFP_KERNEL);
+   if (!addr) {
+   printk(KERN_ERR MOD %s - failed to alloc memory!\n,
+  __FUNCTION__);
+   return;
+   }
+   addr-ifa = ifa;
+   mutex_lock(rnicp-mutex);
+   list_add_tail(addr-entry, rnicp-addrlist);
+   mutex_unlock(rnicp-mutex);
+}
+
+static void remove_ifa(struct iwch_dev *rnicp, struct in_ifaddr *ifa)
+{
+   struct iwch_addrlist *addr, *tmp;
+
+   mutex_lock(rnicp-mutex);
+   list_for_each_entry_safe(addr, tmp, rnicp-addrlist, entry) {
+   if (addr-ifa == ifa) {
+   list_del(addr-entry);
+   kfree(addr);
+   goto out;
+   }
+   }
+out:
+   mutex_unlock(rnicp-mutex);
+}
+
+static int netdev_is_ours(struct iwch_dev *rnicp, struct net_device *netdev)
+{
+   int i;
+
+   for (i = 0; i  rnicp-rdev.port_info.nports; i++)
+   if (netdev == rnicp-rdev.port_info.lldevs[i])
+   return 1;
+   return 0;
+}
+
+static inline int is_iwarp_label(char *label)
+{
+   char *colon;
+
+   colon = strchr(label, ':');
+   if (colon  !strncmp(colon+1, iw, 2))
+   return 1;
+   return 0;
+}
+
+static int nb_callback(struct notifier_block *self, unsigned long event,
+  void *ctx)
+{
+   struct in_ifaddr *ifa = ctx;
+   struct iwch_dev *rnicp = container_of(self, struct