[ofa-general] [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.

2007-08-07 Thread Steve Wise

Networking experts,

I'd like input on the patch below, and help in solving this bug 
properly.  iWARP devices that support both native stack TCP and iWARP 
(aka RDMA over TCP/IP/Ethernet) connections on the same interface need 
the fix below or some similar fix to the RDMA connection manager.


This is a BUG in the Linux RDMA-CMA code as it stands today.

Here is the issue:

Consider an MPI cluster running mvapich2, where MPI/Sockets jobs run 
concurrently with MPI/RDMA jobs.  Without the patch below, it is 
possible for MPI/Sockets processes to mistakenly get incoming RDMA 
connections and vice versa.  The way mvapich2 works is that the ranks 
all bind and listen on a random port (retrying new random ports if the 
bind fails with "in use").  Once they get a free port and bind/listen, 
they advertise that port number to the peers to do connection setup.  
Currently, without the patch, the mpi/rdma processes can end up 
binding/listening on the _same_ port number as the mpi/sockets 
processes running over the native tcp stack.  This is because native 
stack TCP and the rdma cm's RDMA_PS_TCP port space are duplicate, 
independent port spaces.  If this happens, the connections can get 
screwed up.
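
To make the failure mode concrete, here is a hypothetical userspace 
demonstration (not part of the patch; the port number 5555 and the 
omitted error handling are illustrative only).  It binds a native TCP 
listener, then shows that an unpatched kernel lets an RDMA_PS_TCP id 
bind the very same port:

/* Hypothetical demo of the duplicate-port-space bug. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <rdma/rdma_cma.h>

int main(void)
{
	struct sockaddr_in addr;
	struct rdma_event_channel *ch;
	struct rdma_cm_id *id;
	int fd;

	memset(&addr, 0, sizeof addr);
	addr.sin_family = AF_INET;
	addr.sin_port = htons(5555);		/* arbitrary example port */

	fd = socket(AF_INET, SOCK_STREAM, 0);	/* native stack listener */
	bind(fd, (struct sockaddr *)&addr, sizeof addr);
	listen(fd, 1);

	ch = rdma_create_event_channel();
	rdma_create_id(ch, &id, NULL, RDMA_PS_TCP);
	if (rdma_bind_addr(id, (struct sockaddr *)&addr) == 0)
		printf("BUG: rdma cm bound the port native TCP owns\n");
	else
		printf("port collision correctly rejected\n");
	return 0;
}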


The correct solution in my mind is to use the host stack's TCP port 
space for _all_ RDMA_PS_TCP port allocations.   The patch below is a 
minimal delta to unify the port spaces by using the kernel stack to bind 
ports.  This is done by allocating a kernel socket and binding to the 
appropriate local addr/port.  It also allows the kernel stack to pick 
ephemeral ports by virtue of just passing in port 0 on the kernel bind 
operation.
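
For reference, here is a minimal sketch of the bind-to-port-0 pattern 
the ranks rely on, written against the standard librdmacm API (this 
helper is mine, not part of the patch).  With the patch applied, the 
ephemeral port picked here is also reserved in the host TCP port 
space, so no native socket can collide with it:

#include <string.h>
#include <netinet/in.h>
#include <rdma/rdma_cma.h>

/* Bind an RDMA_PS_TCP id to port 0 and listen; the stack picks the port. */
static int listen_on_ephemeral_port(struct rdma_event_channel *ch,
				    struct rdma_cm_id **listen_id)
{
	struct sockaddr_in addr;
	int ret;

	ret = rdma_create_id(ch, listen_id, NULL, RDMA_PS_TCP);
	if (ret)
		return ret;

	memset(&addr, 0, sizeof addr);
	addr.sin_family = AF_INET;
	addr.sin_port = 0;	/* port 0: let the stack pick */

	ret = rdma_bind_addr(*listen_id, (struct sockaddr *)&addr);
	if (ret)
		return ret;

	return rdma_listen(*listen_id, 1);
}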


There has been a discussion already on the RDMA list if anyone is 
interested:


http://www.mail-archive.com/general@lists.openfabrics.org/msg05162.html


Thanks,

Steve.


---

RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.

This is needed for iwarp providers that support native and rdma
connections over the same interface.

Signed-off-by: Steve Wise <[EMAIL PROTECTED]>
---

 drivers/infiniband/core/cma.c |   27 ++-
 1 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9e0ab04..e4d2d7f 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -111,6 +111,7 @@ struct rdma_id_private {
 	struct rdma_cm_id	id;
 
 	struct rdma_bind_list	*bind_list;
+	struct socket		*sock;
 	struct hlist_node	node;
 	struct list_head	list;
 	struct list_head	listen_list;
@@ -695,6 +696,8 @@ static void cma_release_port(struct rdma
 		kfree(bind_list);
 	}
 	mutex_unlock(&lock);
+	if (id_priv->sock)
+		sock_release(id_priv->sock);
 }
 
 void rdma_destroy_id(struct rdma_cm_id *id)
@@ -1790,6 +1793,25 @@ static int cma_use_port(struct idr *ps,
 	return 0;
 }
 
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+	int ret;
+	struct socket *sock;
+
+	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret)
+		return ret;
+	ret = sock->ops->bind(sock,
+			      (struct sockaddr *)&id_priv->id.route.addr.src_addr,
+			      ip_addr_size(&id_priv->id.route.addr.src_addr));
+	if (ret) {
+		sock_release(sock);
+		return ret;
+	}
+	id_priv->sock = sock;
+	return 0;
+}
+
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
 	struct idr *ps;
@@ -1801,6 +1823,9 @@ static int cma_get_port(struct rdma_id_p
 		break;
 	case RDMA_PS_TCP:
 		ps = &tcp_ps;
+		ret = cma_get_tcp_port(id_priv); /* Synch with native stack */
+		if (ret)
+			goto out;
 		break;
 	case RDMA_PS_UDP:
 		ps = &udp_ps;
@@ -1815,7 +1840,7 @@ static int cma_get_port(struct rdma_id_p
 	else
 		ret = cma_use_port(ps, id_priv);
 	mutex_unlock(&lock);
-
+out:
 	return ret;
 }
 




[ofa-general] [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.

2007-07-29 Thread Steve Wise

RDMA experts,

I'd like input on the patch below.  iWARP devices that support both 
native stack TCP and iWARP connections on the same interface need the 
fix below or some similar enhancement to the rdma cm.  This is a bug in 
the ofed-1.2 RDMA-CM code as it stands; I propose we fix it for 
ofed-1.2.1 or ofed-1.3.


Here is the issue:

Consider an MPI cluster running mvapich2, where MPI/Sockets jobs run 
concurrently with MPI/RDMA jobs.  Without the patch below, it is 
possible for MPI/Sockets processes to mistakenly get incoming RDMA 
connections and vice versa.  The way mvapich2 works is that the ranks 
all bind and listen on a random port (retrying new random ports if the 
bind fails with "in use").  Once they get a free port and bind/listen, 
they advertise that port number to the peers to do connection setup.  
Currently, without the patch, the mpi/rdma processes can end up 
binding/listening on the _same_ port number as the mpi/sockets 
processes running over the native tcp stack.  This is because native 
stack TCP and the rdma cm's RDMA_PS_TCP port space are duplicate, 
independent port spaces.  If this happens, the connections can get 
screwed up.


The correct solution in my mind is to use the host stack's TCP port 
space for _all_ RDMA_PS_TCP port allocations.  The patch below is a 
minimal delta to unify the port spaces by using the kernel stack to 
bind ports.  This is done by allocating a kernel socket and binding to 
the appropriate local addr/port.  It also allows the kernel stack to 
pick ephemeral ports by virtue of just passing in port 0 on the kernel 
bind operation.
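
As an aside, after a port-0 bind the CMA can read back the ephemeral 
port the stack picked from the kernel socket.  A sketch of that step, 
assuming the 2.6.2x-era kernel_getsockname() helper (this function is 
my illustration, not part of the patch):

/* Recover the port the stack picked for a port-0 kernel bind. */
static int get_bound_port(struct socket *sock, unsigned short *port)
{
	struct sockaddr_in sin;
	int len = sizeof sin;
	int ret;

	ret = kernel_getsockname(sock, (struct sockaddr *)&sin, &len);
	if (ret)
		return ret;
	*port = ntohs(sin.sin_port);	/* now owned in host TCP port space */
	return 0;
}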


I'd like to discuss this with the RDMA folks first and iron out an 
agreement on how this should be implemented, then widen the audience 
to lkml/netdev, with a goal of inclusion in 2.6.23 and ofed-1.2.1 or 
1.3.


Thanks,

Steve.



-------- Original Message --------
Subject: [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP 
port space.

Date: Sun, 29 Jul 2007 15:17:04 -0500
From: Steve Wise <[EMAIL PROTECTED]>
To: [EMAIL PROTECTED]


RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.

This is needed for iwarp providers that support native and rdma
connections over the same interface.

Signed-off-by: Steve Wise <[EMAIL PROTECTED]>
---

 drivers/infiniband/core/cma.c |   27 ++-
 1 files changed, 26 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9e0ab04..e4d2d7f 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -111,6 +111,7 @@ struct rdma_id_private {
 	struct rdma_cm_id	id;
 
 	struct rdma_bind_list	*bind_list;
+	struct socket		*sock;
 	struct hlist_node	node;
 	struct list_head	list;
 	struct list_head	listen_list;
@@ -695,6 +696,8 @@ static void cma_release_port(struct rdma
 		kfree(bind_list);
 	}
 	mutex_unlock(&lock);
+	if (id_priv->sock)
+		sock_release(id_priv->sock);
 }
 
 void rdma_destroy_id(struct rdma_cm_id *id)
@@ -1790,6 +1793,25 @@ static int cma_use_port(struct idr *ps,
 	return 0;
 }
 
+static int cma_get_tcp_port(struct rdma_id_private *id_priv)
+{
+	int ret;
+	struct socket *sock;
+
+	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret)
+		return ret;
+	ret = sock->ops->bind(sock,
+			      (struct sockaddr *)&id_priv->id.route.addr.src_addr,
+			      ip_addr_size(&id_priv->id.route.addr.src_addr));
+	if (ret) {
+		sock_release(sock);
+		return ret;
+	}
+	id_priv->sock = sock;
+	return 0;
+}
+
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
 	struct idr *ps;
@@ -1801,6 +1823,9 @@ static int cma_get_port(struct rdma_id_p
 		break;
 	case RDMA_PS_TCP:
 		ps = &tcp_ps;
+		ret = cma_get_tcp_port(id_priv); /* Synch with native stack */
+		if (ret)
+			goto out;
 		break;
 	case RDMA_PS_UDP:
 		ps = &udp_ps;
@@ -1815,7 +1840,7 @@ static int cma_get_port(struct rdma_id_p
 	else
 		ret = cma_use_port(ps, id_priv);
 	mutex_unlock(&lock);
-
+out:
 	return ret;
 }
 
