[ewg] [ANNOUNCE] libmlx5 1.0.2 released

2015-01-12 Thread Eli Cohen
Hi,

I would like to announce the availability of libmlx5 version 1.0.2,
the provider library for Mellanox Connect-IB devices.

Main changes:
Add support for XRC
Add Connect-IB and ConnectX-4 devices to the list of supported
devices.

The library can be found here:
https://www.openfabrics.org/downloads/mlx5/libmlx5-1.0.2.tar.gz

Thanks,
Eli

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/mailman/listinfo/ewg


[ewg] [ANNOUNCE] libmlx5 1.0.1 released

2014-02-03 Thread Eli Cohen
Hi,
I would like to announce the availability of libmlx5 version 1.0.1, the provider
library for Mellanox Connect-IB devices.

This version brings:

Fixes to micro UAR allocator
Implementation of modify CQ

The library can be found here:
https://www.openfabrics.org/downloads/mlx5/libmlx5-1.0.1.tar.gz

Thanks,
Eli
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] How many QPs are used so far ?

2012-03-28 Thread Eli Cohen
No, it is not possible.

On Wed, Mar 28, 2012 at 6:00 PM, Ker Can kercan7...@yahoo.com wrote:
 Hi,

 Our HCA, has a max_qp limit of 260032. Is there a way to query how many QPs 
 are currently being used ?

 thanks
 K.Can
 ___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH v4] IB Core: RAW ETH support

2010-06-14 Thread Eli Cohen
Vlad,
please push to OFED.

On Mon, Jun 14, 2010 at 11:48 AM, Alekseys Senin aleks...@voltaire.com wrote:
 This patch adds support to RAW ETH QP in ib core.

 diff --git a/kernel_patches/fixes/core_0560_raw_eth_common.patch 
 b/kernel_patches/fixes/core_0560_raw_eth_common.patch
 new file mode 100644
 index 000..ae43298
 --- /dev/null
 +++ b/kernel_patches/fixes/core_0560_raw_eth_common.patch
 @@ -0,0 +1,63 @@
 + Add new RAW_ETH QP type in order to build RAW Ethernet packets
 + over iWARP and RoCEE.
 +
 +
 +Signed-off-by: Aleksey Senin aleks...@voltaire.com
 +---
 + drivers/infiniband/core/verbs.c |   13 +++--
 + include/rdma/ib_verbs.h         |    1 +
 + 2 files changed, 12 insertions(+), 2 deletions(-)
 +
 +diff --git a/drivers/infiniband/core/verbs.c 
 b/drivers/infiniband/core/verbs.c
 +index 881850e..bb4dcd5 100644
 +--- a/drivers/infiniband/core/verbs.c
  b/drivers/infiniband/core/verbs.c
 +@@ -382,6 +382,7 @@ static const struct {
 +                               [IB_QPT_UD]  = (IB_QP_PKEY_INDEX              
   |
 +                                               IB_QP_PORT                    
   |
 +                                               IB_QP_QKEY),
 ++                              [IB_QPT_RAW_ETH] = IB_QP_PORT,
 +                               [IB_QPT_UC]  = (IB_QP_PKEY_INDEX              
   |
 +                                               IB_QP_PORT                    
   |
 +                                               IB_QP_ACCESS_FLAGS),
 +@@ -1004,7 +1005,11 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid 
 *gid, u16 lid)
 +
 +       switch (rdma_node_get_transport(qp-device-node_type)) {
 +       case RDMA_TRANSPORT_IB:
 +-              if (gid-raw[0] != 0xff || qp-qp_type != IB_QPT_UD)
 ++              if (qp-qp_type == IB_QPT_RAW_ETH) {
 ++                      /* In raw Etherent mgids the 63 msb's should be 0 */
 ++                      if (gid-global.subnet_prefix  cpu_to_be64(~1ULL))
 ++                              return -EINVAL;
 ++              } else if (gid-raw[0] != 0xff || qp-qp_type != IB_QPT_UD)
 +                       return -EINVAL;
 +               break;
 +       case RDMA_TRANSPORT_IWARP:
 +@@ -1023,7 +1028,11 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid 
 *gid, u16 lid)
 +
 +       switch (rdma_node_get_transport(qp-device-node_type)) {
 +       case RDMA_TRANSPORT_IB:
 +-              if (gid-raw[0] != 0xff || qp-qp_type != IB_QPT_UD)
 ++              if (qp-qp_type == IB_QPT_RAW_ETH) {
 ++                      /* In raw Etherent mgids the 63 msb's should be 0 */
 ++                      if (gid-global.subnet_prefix  cpu_to_be64(~1ULL))
 ++                              return -EINVAL;
 ++              } else if (gid-raw[0] != 0xff || qp-qp_type != IB_QPT_UD)
 +                       return -EINVAL;
 +               break;
 +       case RDMA_TRANSPORT_IWARP:
 +diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
 +index 3a5a40f..2162253 100644
 +--- a/include/rdma/ib_verbs.h
  b/include/rdma/ib_verbs.h
 +@@ -571,6 +571,7 @@ enum ib_qp_type {
 +       IB_QPT_UD,
 +       IB_QPT_XRC,
 +       IB_QPT_RAW_IPV6,
 ++      IB_QPT_RAW_ETH,
 +       IB_QPT_RAW_ETY
 + };
 +
 +--
 +1.6.5.2
 +


 ___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg

 ___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH v4] MLX4: RAW ETH support

2010-06-14 Thread Eli Cohen
On Mon, Jun 14, 2010 at 11:56 AM, Alekseys Senin aleks...@voltaire.com wrote:

 +@@ -902,6 +903,7 @@ static int to_mlx4_st(enum ib_qp_type type)
 +       case IB_QPT_RAW_ETY:
 +       case IB_QPT_SMI:
 +       case IB_QPT_GSI:        return MLX4_QP_ST_MLX;
 ++      case IB_QPT_RAW_ETH:    return MLX4_QP_ST_MLX;
It would look cleaner if you'd remove the first return above and let it
fall through to the latter.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH v4] IB Core: RAW ETH support

2010-06-14 Thread Eli Cohen
I don't think there is a substantial dependence between RoCEE and raw
ethernet. I don't know what Moni meant. Moni? 

-Original Message-
From: Or Gerlitz [mailto:ogerl...@voltaire.com] 
Sent: Monday, June 14, 2010 1:42 PM
To: Moni Shoua; Eli Cohen
Cc: Aleksey Senin; Vladimir Sokolovsky; v...@dev.mellanox.co.il; 
e...@openfabrics.org
Subject: Re: [ewg] [PATCH v4] IB Core: RAW ETH support

Moni Shoua wrote:
 The patches can't be applied to upstream kernel. An attempt to do this 
 failed. I guess that some of RoCEE patches are still missing in kernel 
 upstream.
Eli, can you elaborate on that? is there any real dependence between the RoCE 
patches to the raw qp ones? what is this dependence in high level, is it 
placement of change sets in the code or actual flows?

Or.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [ANNOUNCE] OFED 1.5.2 rc1 is available

2010-06-14 Thread Eli Cohen
Or,
I don't know what you're talking about WRT 2043. Its status is RESOLVED, not 
VERIFIED. And you may address me directly, we've met a few times, haven't we?

We'll check about the global frames case.

-Original Message-
From: Or Gerlitz [mailto:ogerl...@voltaire.com] 
Sent: Monday, June 14, 2010 2:17 PM
To: tzipo...@dev.mellanox.co.il
Cc: OpenFabrics EWG; Eli Cohen
Subject: Re: [ewg] [ANNOUNCE] OFED 1.5.2 rc1 is available

Tziporet Koren wrote:
 Hi Or, sorry for not answering before
 Issue 2043 - is already fixed
 Issue 2005 - We released 2.7.700 FW that solve this issue.
 Issue 2024 - Eli just answered in bugzilla.

Hi Tziporet,

1st and most, good to hear from you... thanks for the detailed answer. 

Still, for bz 2024, Eli's reply was addressing a case where the Cisco switch 
was instrumented for PFC, but not the case of global pause under which the 
problem still happens, per the original description. 

Did someone from Mellanox actually get to validate bz 2043, it was declared 
fixed and verified in the very same minute and by the same person...

Or.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [ANNOUNCE] OFED 1.5.2 rc1 is available

2010-06-14 Thread Eli Cohen
OK, no problem. 

-Original Message-
From: Or Gerlitz [mailto:ogerl...@voltaire.com] 
Sent: Monday, June 14, 2010 3:10 PM
To: Eli Cohen
Cc: tzipo...@dev.mellanox.co.il; OpenFabrics EWG
Subject: Re: [ewg] [ANNOUNCE] OFED 1.5.2 rc1 is available

Eli Cohen wrote:
 I don't know what you're talking about WRT 2043. Its status is 
 RESOLVED, not VERIFIED. And you may address me directly [...] 
 we'll check about the global frames case
Hi Eli, please hold the horses... I was confusing resolved with 
verified, mistakes happen, you know. Having said all that, thanks for looking into 
validating the case!

Or.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] RAW ETH support for Mellanox v2 [PATCH 1/1]

2010-06-10 Thread Eli Cohen
Hi Aleksey,
please run checkpatch.pl and add a changelog.

On Thu, Jun 10, 2010 at 11:05:03AM +0300, Aleksey Senin wrote:
 Differences from V1
 Added new mlx4_mcast_prot enum
 Fix counters support
 
 RAW_ETH support in Mellanox
 
 
 
 Signed-off-by: Aleksey Senin aleks...@voltaire.com
 ---
  drivers/infiniband/hw/mlx4/main.c |   13 +
  drivers/infiniband/hw/mlx4/qp.c   |   22 ++
  drivers/net/mlx4/mcg.c|   22 +-
  include/linux/mlx4/device.h   |7 +--
  include/linux/mlx4/driver.h   |5 +
  5 files changed, 50 insertions(+), 19 deletions(-)
 
 diff --git a/drivers/infiniband/hw/mlx4/main.c 
 b/drivers/infiniband/hw/mlx4/main.c
 index c146b84..43d0ccc 100644
 --- a/drivers/infiniband/hw/mlx4/main.c
 +++ b/drivers/infiniband/hw/mlx4/main.c
 @@ -684,7 +684,9 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union 
 ib_gid *gid, u16 lid)
   struct mlx4_ib_qp *mqp = to_mqp(ibqp);
  
   err = mlx4_multicast_attach(mdev-dev, mqp-mqp, gid-raw, 
 !!(mqp-flags 
 - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK));
 + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
 + (ibqp-qp_type == IB_QPT_RAW_ETH) ?
 + MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
   if (err)
   return err;
  
 @@ -695,7 +697,9 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union 
 ib_gid *gid, u16 lid)
   return 0;
  
  err_add:
 - mlx4_multicast_detach(mdev-dev, mqp-mqp, gid-raw);
 + mlx4_multicast_detach(mdev-dev, mqp-mqp, gid-raw,
 + (ibqp-qp_type == IB_QPT_RAW_ETH) ?
 + MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
   return err;
  }
  
 @@ -724,8 +728,9 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union 
 ib_gid *gid, u16 lid)
   struct net_device *ndev;
   struct gid_entry *ge;
  
 - err = mlx4_multicast_detach(mdev-dev,
 - mqp-mqp, gid-raw);
 + err = mlx4_multicast_detach(mdev-dev, mqp-mqp, gid-raw,
 + (ibqp-qp_type == IB_QPT_RAW_ETH) ?
 + MLX4_MCAST_PROT_EN : MLX4_MCAST_PROT_IB);
   if (err)
   return err;
  
 diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
 index 422d367..c50e110 100644
 --- a/drivers/infiniband/hw/mlx4/qp.c
 +++ b/drivers/infiniband/hw/mlx4/qp.c
 @@ -811,6 +811,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
   case IB_QPT_RC:
   case IB_QPT_UC:
   case IB_QPT_UD:
 + case IB_QPT_RAW_ETH:
   {
   qp = kzalloc(sizeof *qp, GFP_KERNEL);
   if (!qp)
 @@ -902,6 +903,7 @@ static int to_mlx4_st(enum ib_qp_type type)
   case IB_QPT_RAW_ETY:
   case IB_QPT_SMI:
   case IB_QPT_GSI:return MLX4_QP_ST_MLX;
 + case IB_QPT_RAW_ETH:return MLX4_QP_ST_MLX;
   default:return -1;
   }
  }
 @@ -1064,8 +1066,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
   break;
   }
   }
 -
 - if (ibqp-qp_type == IB_QPT_GSI || ibqp-qp_type == IB_QPT_SMI ||
 + if (ibqp-qp_type == IB_QPT_RAW_ETH)
 + context-mtu_msgmax = 0xff;
 + else if (ibqp-qp_type == IB_QPT_GSI || ibqp-qp_type == IB_QPT_SMI ||
   ibqp-qp_type == IB_QPT_RAW_ETY)
   context-mtu_msgmax = (IB_MTU_4096  5) | 11;
   else if (ibqp-qp_type == IB_QPT_UD) {
 @@ -1237,7 +1240,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
   if (cur_state == IB_QPS_INIT 
   new_state == IB_QPS_RTR  
   (ibqp-qp_type == IB_QPT_GSI || ibqp-qp_type == IB_QPT_SMI ||
 -  ibqp-qp_type == IB_QPT_UD || ibqp-qp_type == IB_QPT_RAW_ETY)) {
 +  ibqp-qp_type == IB_QPT_UD || ibqp-qp_type == IB_QPT_RAW_ETY ||
 + ibqp-qp_type == IB_QPT_RAW_ETH)) {
   context-pri_path.sched_queue = (qp-port - 1)  6;
   if (is_qp0(dev, qp))
   context-pri_path.sched_queue |= 
 MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
 @@ -1356,7 +1360,7 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct 
 ib_qp_attr *attr,
   goto out;
   }
  
 - if ((attr_mask  IB_QP_PORT) 
 + if ((attr_mask  IB_QP_PORT)  (ibqp-qp_type != IB_QPT_RAW_ETH) 
   (attr-port_num == 0 || attr-port_num  dev-num_ports)) {
   mlx4_ib_dbg(qpn 0x%x: invalid port number (%d) specified 
   for transition %d to %d. qp_type %d,
 @@ -1365,6 +1369,16 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct 
 ib_qp_attr *attr,
   goto out;
   }
  
 + if ((attr_mask  IB_QP_PORT)  (ibqp-qp_type == IB_QPT_RAW_ETH) 
 + (rdma_port_link_layer(dev-ib_dev, attr-port_num)
 + != IB_LINK_LAYER_ETHERNET)) {
 + mlx4_ib_dbg(qpn 0x%x: invalid port (%d) specified (not 

Re: [ewg] RAW ETH support for Mellanox v2 [PATCH 1/1]

2010-06-10 Thread Eli Cohen
I get 42 Errors and 5 warnings when running checkpatch.pl on this
patch. You need a changelog to describe the patch anyway. If you run
git log you will see what this means. You can consult one of your
co-workers about this.

On Thu, Jun 10, 2010 at 11:50:58AM +0300, Aleksey Senin wrote:
 Eli, I did it. The only difference on what you see, that the patch was 
 renamed to mlx4_x_0050_raw_eth.patch
 Where should I specify a change log? I wrote in the e-mail what are the 
 differences. Do I need to write in the patch? Why?
 
 alst60-~$~/src/vt/kernel/linux-2.6/scripts/checkpatch.pl  
 /host/home/alekseys/tmp/raw/mlx4_x_0050_raw_eth.patch
 total: 0 errors, 0 warnings, 195 lines checked
 
 /host/home/alekseys/tmp/raw/mlx4_x_0050_raw_eth.patch has no obvious style 
 problems and is ready for submission.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] RAW ETH support for Mellanox v1 [PATCH 1/1]

2010-06-09 Thread Eli Cohen
On Wed, Jun 09, 2010 at 02:29:57PM +0300, Aleksey Senin wrote:
  
   err = mlx4_multicast_attach(mdev-dev, mqp-mqp, gid-raw, 
 !!(mqp-flags 
 - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK));
 + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
 + (ibqp-qp_type == IB_QPT_RAW_ETH) ?
 + MLX4_PROT_EN : MLX4_PROT_IB);
   if (err)
   return err;

Usage of MLX4_PROT_EN and MLX4_PROT_IB is wrong in this context since
they are used for a totally different purpose. You need to define a
new enum and explicitly set values for it to reflect hardware
definitions.

 @@ -1237,12 +1240,16 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
   if (cur_state == IB_QPS_INIT 
   new_state == IB_QPS_RTR  
   (ibqp-qp_type == IB_QPT_GSI || ibqp-qp_type == IB_QPT_SMI ||
 -  ibqp-qp_type == IB_QPT_UD || ibqp-qp_type == IB_QPT_RAW_ETY)) {
 +  ibqp-qp_type == IB_QPT_UD || ibqp-qp_type == IB_QPT_RAW_ETY ||
 + ibqp-qp_type == IB_QPT_RAW_ETH)) {
   context-pri_path.sched_queue = (qp-port - 1)  6;
   if (is_qp0(dev, qp))
   context-pri_path.sched_queue |= 
 MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
   else
   context-pri_path.sched_queue |= 
 MLX4_IB_DEFAULT_SCHED_QUEUE;
 +
 + /* Default counter for non-RC QPs */
 + context-pri_path.counter_index = 0xff;

Looks like this breaks hardware counters. Why are you using this
statement? Also it appears that the patches were not created against
latest OFED 1.5.2 sources.

Did you check that counters are still working for RoCEE after this
patch?

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] please pull

2010-05-27 Thread Eli Cohen
ssh://ofa.org/home/eli/scm/ofed-1.5/kernel.git/linux-2.6.git ofed_kernel_1_5

Fixes a bug inserted in 369cd4c
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 05/11] ib_core: IBoE UD packet packing support

2010-05-23 Thread Eli Cohen
On Mon, May 17, 2010 at 09:18:54AM -0700, Roland Dreier wrote:
   I thought you were referring to the changes made by this patch
   920d706.  Should I re-send this patch?
 
 Yes, please work out what changes are still required and send a new patch.

OK. I am sending a series of three patches. The first one contains the
original changes. The following two are for mlx4 and mthca to sync
with the API changes. The patches are made against the iboe branch.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 02/11] ib_core: IBoE support only QP1

2010-05-17 Thread Eli Cohen
On Thu, May 13, 2010 at 10:35:14AM -0700, Roland Dreier wrote:
   One reason is that get_src_path_mask() may access ah_lock.
 
 I don't see that:
 
 static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
 {
   struct ib_sa_device *sa_dev;
   struct ib_sa_port   *port;
   unsigned long flags;
   u8 src_path_mask;
 
   sa_dev = ib_get_client_data(device, sa_client);
   if (!sa_dev)
   return 0x7f;
 
 so if we never add the sa_client structure it just returns.
 
I see... so I have no good reason to not treat sa_query.c in the same
manner I did for multicast.c.
By the way, how do you prefer to work out all these comments.  Would
you prefer a new patch set or would you prefer to push the fixed
versions to your tree?
Also, there are a few fixes that I pushed to OFED that were pushed
after I sent V8. How would you like me to communicate them?
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 05/11] ib_core: IBoE UD packet packing support

2010-05-17 Thread Eli Cohen
On Thu, May 13, 2010 at 10:36:27AM -0700, Roland Dreier wrote:
 
 Doesn't seem to be all there... eg there is nothing for ethernet headers.

I thought you were referring to the changes made by this patch
920d706.  Should I re-send this patch?
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 04/11] ib_core: IBoE CMA device binding

2010-05-13 Thread Eli Cohen
On Wed, May 12, 2010 at 01:03:42PM -0700, Roland Dreier wrote:
int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
   -   struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
   +   struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr,
   +   int force_grh)
 
 Rather than this change in API, could we just have this function look at
 the link layer, and if it's ethernet, then always set the GRH flag?
 Seems simpler than requiring the upper layers to do this and then pass
 the result in?

I guess that would keep the function more versatile but changing the
implementation to check the port's link layer seems OK to me.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 04/11] ib_core: IBoE CMA device binding

2010-05-13 Thread Eli Cohen
On Wed, May 12, 2010 at 01:05:06PM -0700, Roland Dreier wrote:
   +static void iboe_mcast_work_handler(struct work_struct *work)
   +{
   +  struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, 
 work);
   +  struct cma_multicast *mc = mw-mc;
   +  struct ib_sa_multicast *m = mc-multicast.ib;
   +
   +  mc-multicast.ib-context = mc;
   +  cma_ib_mc_handler(0, m);
   +  kref_put(mc-mcref, release_mc);
   +  kfree(mw);
   +}
 
 I'm having a hard time working out why the iboe case needs to schedule
 to a work queue here since its already in process context, right?  It
 seems it would be really preferable to avoid all the extra pointer
 munging and reference counting, and just call things directly.
 

I assume that the caller might attempt to acquire the same lock when
calling join and in the callback. Specifically, ucma_join_multicast()
calls rdma_join_multicast() with file-mut acquired and
ucma_event_handler() does the same.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 05/11] ib_core: IBoE UD packet packing support

2010-05-13 Thread Eli Cohen
On Wed, May 12, 2010 at 01:06:36PM -0700, Roland Dreier wrote:
   2. Fix wrong implementation of ib_ud_header_init(). A different patch was 
 sent
  to Roland.
 
 This patch no longer applies, I guess because you already sent me this
 fix (which is now upstream since 2.6.34-rc1).

Right, it's already applied.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 08/11] mlx4: Allow interfaces to correspond to each other

2010-05-13 Thread Eli Cohen
On Wed, May 12, 2010 at 01:30:22PM -0700, Roland Dreier wrote:
   +void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int 
 port)
   +{
   +  return mlx4_find_get_prot_dev(dev, proto, port);
   +}
   +EXPORT_SYMBOL(mlx4_get_prot_dev);
 
 Not sure I understand why you have a wrapper to call another function
 with exactly the same parameters?  Can't we get rid of this and just
 rename mlx4_find_get_prot_dev() to mlx4_get_prot_dev()?

Sure, let's change that.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 04/11] ib_core: IBoE CMA device binding

2010-05-13 Thread Eli Cohen
On Wed, May 12, 2010 at 01:14:33PM -0700, Roland Dreier wrote:
   Multicast GIDs are always mapped to multicast MACs
   as is done in IPv6. Some helper functions are added to ib_addr.h.  IPv4
   multicast is enabled by translating IPv4 multicast addresses to IPv6 
 multicast
   as described in
   http://www.mail-archive.com/i...@sunroof.eng.sun.com/msg02134.html.
 
 I guess it's a bit unfortunate that the RoCE annex completely ignored
 how to map multicast GIDs to ethernet addresses (I suppose as part of
 the larger decision to ignore address resolution entirely).  Anyway,
 looking at the email message you reference, it seems to be someone
 asking what the right way to map IPv4 multicast addresses to IPv6
 addresses is.  Is there a more definitive document you can point to?

I am not aware of any definitive document that addresses this issue.

 
 It seems that unfortunately the way the layering of addresses is done,
 there's no way to just use the standard mapping of IPv4 multicast
 addresses to Ethernet addresses (since IBoE does addressing via
 the CMA mapping to GIDs followed by an unspecified mapping from GIDs to
 Ethernet addresses).
 

It is natural to treat gids as IPv6 addresses, so using the standard
mapping from IPv6 multicast addresses to mac addresses seems reasonable.

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 07/11] ib_core: Add API to support IBoE from userspace

2010-05-13 Thread Eli Cohen
On Wed, May 12, 2010 at 01:28:31PM -0700, Roland Dreier wrote:
   Add ib_uverbs_get_eth_l2_addr() to allow ibv_create_ah() to resolve sgid,
   dgid to vlan, dmac for any gid type.  Although user-space might bypass 
 this
   call for link-local gids, it is better not to replicate the kernel 
 resolution
   policy.  Port link layer is also returned by ibv_query_port().
 
 A high-level comment/question, followed by some notes about the specific 
 patch.
 
 At the highest level, is having this very low-level command exposed as
 part of the kernel uverbs - userspace API the right place to split
 things?  Making the Ethernet address resolution part of the low-level
 driver implies that it's not really a generic part of the verbs interface.

Correct - Ethernet address resolution really isn't part of the verbs
interface according to the RoCE specification; it should happen below
the verbs.

 
 Maybe it is generic, and we should have a generic function instead of
 calling into the low-level driver.  I see the argument that we should
 keep the policy in the kernel, although I'm not sure how strong that
 argument is -- once we start shipping a kernel with a certain policy
 (and I guess OFED has in effect already done that), how could we ever
 change that policy?  We'll have interoperability issues anyway, so it
 seems having userspace and kernel use different policies doesn't cause
 much further problems anyway.

Address mapping policies depend on the address type. This patch only
defines a policy for mapping link-local addresses, and we should
indeed take care not to change it (if possible).
Later on, we can add more policies for other address types (e.g.,
normal IPv6 addresses, mapped IPv4 addresses, etc.)
 
 Or maybe it is device-specific, and we could wrap it up into the create
 AH uverbs call we already have?

Interesting idea. Not sure what is better here: a separate ABI
call or some additional 'u8 ctx[32]' field in struct
ib_uverbs_create_ah_resp that will be interpreted by the hw-specific
user-space driver.

 
 Low-level comments:
 
   +ssize_t ib_uverbs_get_eth_l2_addr(struct ib_uverbs_file *file, const char 
 __user *buf,
   +int in_len, int out_len)
   +{
   +  struct ib_uverbs_get_eth_l2_addr   cmd;
   +  struct ib_uverbs_get_eth_l2_addr_resp  resp;
   +  int  ret;
   +  struct ib_pd*pd;
   +
   +  if (out_len  sizeof resp)
   +  return -ENOSPC;
   +
   +  if (copy_from_user(cmd, buf, sizeof cmd))
   +  return -EFAULT;
   +
   +  pd = idr_read_pd(cmd.pd_handle, file-ucontext);
   +  if (!pd)
   +  return -EINVAL;
   +
   +  ret = ib_get_eth_l2_addr(pd-device, cmd.port, (union ib_gid *)cmd.gid,
   +   cmd.sgid_idx, resp.mac, resp.vlan_id, 
 resp.tagged);
   +  put_pd_read(pd);
   +  if (!ret) {
   +  if (copy_to_user((void __user *) (unsigned long) cmd.response,
   +   resp, sizeof resp))
   +  return -EFAULT;
 
 This leaks kernel memory contents to userspace since the stack variable
 resp is never cleared.  Also will cause problems if we ever need to use
 the reserved fields for anything.

I see, I missed that subtle detail.


 
 Also I'm not sure I understand why you pass the PD into this method?  It
 seems you just use it to get a pointer to the device, but you already
 have that from the uverbs_file structure that's passed into all commands.

True, missed that too.

 
   +int ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid 
 *gid,
   + int sgid_idx, u8 *mac, __u16 *vlan_id, u8 *tagged)
   +{
   +  if (!device-get_eth_l2_addr)
   +  return -ENOSYS;
   +
   +  return device-get_eth_l2_addr(device, port, gid, sgid_idx, mac, 
 vlan_id, tagged);
   +}
   +EXPORT_SYMBOL(ib_get_eth_l2_addr);
 
 I don't think we need this wrapper, since uverbs can just call the
 get_eth_l2_addr method directly (we already do that for quite a few
 other methods, eg alloc_ucontext is a uverbs-only method that has no
 in-kernel wrapper).  Also the -ENOSYS test isn't necessary, since a
 device-driver shouldn't allow this method unless it actually implements
 it.
 
I agree.

By the way, do you prefer that I re-create the patch with the fixes or
would you make the necessary changes.

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 02/11] ib_core: IBoE support only QP1

2010-05-13 Thread Eli Cohen
On Thu, May 13, 2010 at 08:48:06AM -0700, Roland Dreier wrote:
 
 Right, I'm not saying it shouldn't be in multicast.c.  I'm just
 wondering why you don't have the same thing for sa_query.c.
 
One reason is that get_src_path_mask() may access ah_lock.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 07/11] ib_core: Add API to support IBoE from userspace

2010-05-13 Thread Eli Cohen
On Thu, May 13, 2010 at 08:55:13AM -0700, Sean Hefty wrote:
 
 Interesting idea. Not sure what is better here: a separate ABI
 call or some additional 'u8 ctx[32]' field in struct
 ib_uverbs_create_ah_resp that will be interpreted by the hw-specific
 user-space driver.
 
 I don't understand why mapping remote addresses should be driver specific.

In this case it will not be just mapping remote addresses but creating
the required AH information which is unique to each device.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] Has anyone tried the ISC DHCP patches for Infiniband with ISC DHCP 4.1.x?

2010-05-10 Thread Eli Cohen
On Mon, May 10, 2010 at 08:36:22PM +1000, Justin Clift wrote:
 The patches for ISC DHCP that come with OFED 1.5.1 seem to be for an 
 older version of ISC DHCP (3.0.x).
 
 Before I start looking into it, I'm wondering if anyone has tested them 
 with ISC DHCP 4.1.x?

I tried once, but building the package failed; I can't recall the
reason.

 
 Hoping to save some time/effort if anyone's been down this track already. :)
 
 Regards and best wishes,
 
 Justin Clift
 
 -- 
 Salasaga  -  Open Source eLearning IDE
http://www.salasaga.org
 ___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 01/11] ib core: Add link layer property to ports

2010-05-06 Thread Eli Cohen
On Wed, May 05, 2010 at 03:19:50PM -0700, Roland Dreier wrote:
 Hi Eli,
 
 I'm hoping to get this IBoE stuff in for 2.6.35.  I started an iboe
 branch in my tree (similar to the xrc branch I've been carrying for a
 while), and I added this patch in, except I renamed
 rdma_port_link_layer() to rdma_port_get_link_layer().  This seems to
 match rdma_node_get_transport() better.
 
 In any case as I add patches to my branch, you can stop worrying about
 them, which should make keeping this series updated easier.
 

Hi Roland,

I am glad to hear this and will be happy to help.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 06/11] ipoib: avoid ipoib over IBoE

2010-05-06 Thread Eli Cohen
On Wed, May 05, 2010 at 03:27:51PM -0700, Roland Dreier wrote:
   @@ -1383,6 +1385,9 @@ static void ipoib_remove_one(struct ib_device 
 *device)
  dev_list = ib_get_client_data(device, ipoib_client);

  list_for_each_entry_safe(priv, tmp, dev_list, list) {
   +  if (rdma_port_link_layer(device, priv-port) != 
 IB_LINK_LAYER_INFINIBAND)
   +  continue;
 
 Why do we need this chunk here?  How could a netdev get on our list if
 we never create IPoIB interfaces for IBoE ports?
 

Right, this is not necessary and can be removed.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCHv8 02/11] ib_core: IBoE support only QP1

2010-05-06 Thread Eli Cohen
On Wed, May 05, 2010 at 04:12:15PM -0700, Roland Dreier wrote:
   @@ -795,11 +799,12 @@ static void mcast_add_one(struct ib_device *device)
  struct mcast_device *dev;
  struct mcast_port *port;
  int i;
   +  int count = 0;

  if (rdma_node_get_transport(device-node_type) != RDMA_TRANSPORT_IB)
  return;

   -  dev = kmalloc(sizeof *dev + device-phys_port_cnt * sizeof *port,
   +  dev = kzalloc(sizeof *dev + device-phys_port_cnt * sizeof *port,
 
   @@ -1007,7 +1010,7 @@ static void ib_sa_add_one(struct ib_device *device)
  e = device-phys_port_cnt;
  }

   -  sa_dev = kmalloc(sizeof *sa_dev +
   +  sa_dev = kzalloc(sizeof *sa_dev +
 
 Do you happen to remember why you needed these kmalloc - kzalloc conversions?

I can't remember why. I do have this habit of preferring kzalloc
over kmalloc because it saves trouble sometimes.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCH] mlx4_core: request MSIX vectors as much as there CPU cores

2010-05-05 Thread Eli Cohen
The current code requires num_possible_cpus() + 1 MSIX vectors. However,
num_possible_cpus() stands for the max number of supported CPUs by the kernel.
We should use num_online_cpus() which is the number of available CPUs for the
system.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/net/mlx4/main.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index e3e0d54..0559df4 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -969,7 +969,7 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev)
 
if (msi_x) {
nreq = min_t(int, dev-caps.num_eqs - dev-caps.reserved_eqs,
-num_possible_cpus() + 1);
+num_online_cpus() + 1);
entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
if (!entries)
goto no_msi;
-- 
1.7.1

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] RoCE supplement to IB Spec

2010-04-29 Thread Eli Cohen
Yes, we're compliant with the spec. If any bugs are found, please report
them.

On Tue, Apr 27, 2010 at 05:31:11PM -0700, Pradeep Satyanarayana wrote:
 Is it the intent that the RoCE implementation in OFED-1.5.1 corresponds to 
 the RoCE supplement to the IB spec dated April 6th 2010,
 excepting bugs of course? Or are there known deviations from the spec?
 
 Pradeep
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-rdma in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] abi version differences

2010-03-21 Thread Eli Cohen
On Sun, Mar 21, 2010 at 05:38:39PM +0200, Anatoly Greenblatt wrote:
 Hello everyone and welcome back from Sonoma.
 
  
 
 We found that functionality of a customer application has been broken
 due to a change in abi version (from 7 to 6).
 
 There were more fields added to the structure but abi version has
 returned to 6. I was wondering if this is a bug or intention.
 
 I'd appreciate if anyone could comment on this.
 
  

After being reviewed by the list, we came to the conclusion that ABI
change was not required. We added the new field in the place that was
formerly reserved. You can find the discussion about this here:

http://www.mail-archive.com/linux-r...@vger.kernel.org/msg01762.html
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH] ipoib: Fix lockup of the tx queue

2010-03-16 Thread Eli Cohen
On Mon, Mar 15, 2010 at 08:25:51AM -0800, Josh England wrote:
 Everything has MT264328 ConnectX cards using the mlx4_ib driver.
 Boot/file servers are using an HP OEM 2.7.000 firmware.  Compute nodes
 have cards using Sun OEM 2.6.200 FW.
 

You probably mean MT26428? Anyway, do you still see the post send
failed messages? If you do, could you apply this patch so we'll have
better insight as to the reason?
http://patchwork.kernel.org/patch/83593/
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] broken support for UC in rdma_cm

2010-03-15 Thread Eli Cohen
We got failing tests in our regression that use UC transport from
userspace. Looking at cma_connect_ib(), I see that it uses IB_QPT_RC
in the CM request instead of using whatever the transport type of the
requesting QP. There is no easy way to retrieve that information either,
since the QP is created in userspace. We do have the QP number. One
way to address this problem is to have an API that returns a pointer
to struct ib_qp given the ib device and the QP number. Another way
would be to pass qp_type through the call to rdma_init_qp_attr().
Thoughts?
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] broken support for UC in rdma_cm

2010-03-15 Thread Eli Cohen
I just noticed that Vlad already opened a bugzilla bug (1874) on this.
I quote Sean's response:

RDMA CM supports UD and RC QPs (port spaces UDP/TCP) only.  Support
for UC QPs should come from another port space.

This makes sense to me. Still we need to address the issues I raised
below. Sean, are you going to fix this?

On Mon, Mar 15, 2010 at 01:12:28PM +0200, Eli Cohen wrote:
 We got failing tests in our regression that use UC transport from
 userspace. Looking at cma_connect_ib(), I see that it uses IB_QPT_RC
 in the CM request instead of using whatever the transport type of the
 requesting QP. There is no easy way to retrieve that information too
 since the QP is created in userspace. We do have the QP number. One
 way to address this problem is to have an API that returns a pointer
 to struct ib_qp given the ib device and the QP number. Another way
 would be to pass qp_type through the call to rdma_init_qp_attr().
 Thoughts?
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] broken support for UC in rdma_cm

2010-03-15 Thread Eli Cohen
On Mon, Mar 15, 2010 at 08:38:00AM -0700, Sean Hefty wrote:
 
 This is really a new feature request, which I may not have time to address.  
 It
 depends on the scope of the changes that are needed.
 
 If both sides try using UC QPs, do you know what specific setting causes the
 connection to fail?

The actual failure is to modify the QP from init to rtr since the
attribute mask sets IB_QP_MAX_DEST_RD_ATOMIC and IB_QP_MIN_RNR_TIMER
which are invalid attributes in UC. The flags are set in
cm_init_qp_rtr_attr() since the transport is IB_QPT_RC. 
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH] ipoib: Fix lockup of the tx queue

2010-03-13 Thread Eli Cohen
On Thu, Mar 11, 2010 at 01:38:38PM -0800, Roland Dreier wrote:
 
 I do worry (as Moni mentioned) that this doesn't explain why you would
 get send failures in this case, but the patch itself is well-explained
 and looks obviously correct so I think we should apply it.

It could be a problem in the hardware driver.
Josh, can you tell what kind of hardware you were using?
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] problem with ipoib_mcast_fix_ip_ib_mc_map_to_2_6_24.patch

2010-03-11 Thread Eli Cohen
I am handling this.

On Thu, Mar 11, 2010 at 09:54:34AM +0200, Tziporet Koren wrote:
 On 3/10/2010 9:05 PM, Jason Gunthorpe wrote:
 On Wed, Mar 10, 2010 at 08:57:17PM +0200, Eli Cohen wrote:
 On Wed, Mar 10, 2010 at 10:42:22AM -0700, Jason Gunthorpe wrote:
 I guess, the best fix is to revert 
 c12481586c4ba09cb88dc2090c67fdce7c856cde,
 alter ipoib_mcast_addr_is_valid to not compare bytes 5, 8 and 9,
 
 and
 fixup the 'Add in the P_Key' hunk to also fixup the scope byte too.
 Can you elaborate on this?
 +
 ++  /* Work around broken ip_ib_mc_map */
 ++  if (mclist-dmi_addrlen == INFINIBAND_ALEN) {
 ++  mclist-dmi_addr[5] = 0x10 | (dev-broadcast[5]  0xF);
 ++  mclist-dmi_addr[8] = dev-broadcast[8];
 ++  mclist-dmi_addr[9] = dev-broadcast[9];
 ++  }
 
 5 in the dmi_addr is the scope byte. The old patch:
 
 -+  /* Add in the P_Key */
 -+  mgid.raw[4] = (priv-pkey  8)  0xff;
 -+  mgid.raw[5] = priv-pkey  0xff;
 -+
 
 Only includes the dmi_addr bytes 8 and 9. This is also a small bug.
 
 The above should read something like:
 
 mgid.raw[1] = 0x10 | (dev-broadcast[5]  0xF);
 mgid.raw[4] = dev-broadcast[8];
 mgid.raw[5] = dev-broadcast[9];
 
 Jason
 Eli
 Can you take care for it now or you need the complete pathc from Jason?
 
 Vlad
 Please revert the patch that causing the problem
 
 Tziporet
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] problem with ipoib_mcast_fix_ip_ib_mc_map_to_2_6_24.patch

2010-03-10 Thread Eli Cohen
On Wed, Mar 10, 2010 at 10:42:22AM -0700, Jason Gunthorpe wrote:
 
 I guess, the best fix is to revert c12481586c4ba09cb88dc2090c67fdce7c856cde,
 alter ipoib_mcast_addr_is_valid to not compare bytes 5, 8 and 9,


 and
 fixup the 'Add in the P_Key' hunk to also fixup the scope byte too.

Can you elaborate on this? 
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] possible bug in rds?

2010-03-10 Thread Eli Cohen
On Wed, Mar 10, 2010 at 03:51:36PM -0800, Andy Grover wrote:
 
 I've opened a bug:
 
 https://bugs.openfabrics.org/show_bug.cgi?id=1983
 
 Did this just start happening?  What is the test doing when this
 occurred? Please add to the bug if possible, and I'll try to diagnose
 further.
 

Follow up response in bugzilla.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] OpenSM problem on today's OFED-1.5.1 daily build

2010-02-21 Thread Eli Cohen
There was a bug in build_mlx_header() that I committed on Thursday. The
bug was a failure to edit the MAD correctly. It has been fixed in the
latest build. Sorry for any inconvenience.

On Fri, Feb 19, 2010 at 03:02:07PM -0800, Woodruff, Robert J wrote:
 
 I have 2 systems that have Mellanox dual port connectx
 cards with one of the ports connected to a and SDR switch
 and the other port direct connected.
 
 With today's OFED-1.5.1 daily build, the OpenSM does
 not seem to transition the port all the way up.
 If I use OFED-1.5.1-rc1, it works fine.
 
 [r...@woody-10 woody]# /etc/init.d/opensmd start
 Starting IB Subnet Manager.[  OK  ]
 [r...@woody-10 woody]# /usr/sbin/ibstat
 CA 'mlx4_0'
 CA type: MT26428
 Number of ports: 2
 Firmware version: 2.7.0
 Hardware version: a0
 Node GUID: 0x0002c90300044fa8
 System image GUID: 0x0002c90300044fab
 Port 1:
 State: Armed
 Physical state: LinkUp
 Rate: 10
 Base lid: 1
 LMC: 0
 SM lid: 1
 Capability mask: 0x0251086a
 Port GUID: 0x0002c90300044fa9
 Port 2:
 State: Initializing
 Physical state: LinkUp
 Rate: 40
 Base lid: 0
 LMC: 0
 SM lid: 0
 Capability mask: 0x02510868
 Port GUID: 0x0002c90300044faa
 [r...@woody-10 woody]# 
 ___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 0/11] IBoE support to Infiniband

2010-02-18 Thread Eli Cohen
IBoE allows running the IB transport protocol using Ethernet frames, enabling
the deployment of IB semantics on lossless Ethernet fabrics.

IBoE packets are standard Ethernet frames with an IEEE assigned Ethertype, a
GRH, unmodified IB transport headers and payload.  IB subnet management and SA
services are not required for IBoE operation; Ethernet management practices are
used instead. IBoE encodes IP addresses into its GIDs and resolves MAC
addresses using the host IP stack. For multicast GIDs, standard IP to MAC
mappings apply.

The OFA RDMA Verbs API is syntactically unmodified. The CMA is adapted to
support IBoE ports allowing existing RDMA applications to run over IBoE with no
changes.

Address handles for IBoE are required to contain valid L3 addresses (GIDs) and
the IB L2 address fields become reserved. The complementary Ethernet L2 address
information is subsequently resolved below the API.

As there is no SA in IBoE, the CMA code is adapted to locally fill-in
corresponding path record attributes for IBoE address handles. Also, the CMA
provides the required address handle attributes for SIDR requests and joining
of multicast groups.

With this patch set, each IBoE port is assigned a GID equal to the link local
address of its corresponding net device, and one more GID for each one of the
VLAN devices which are derived from it. IBoE packets are tagged with the VLAN
ID of the corresponding netdevice through which they are generated.

The priority field in the 802.1q header of IBoE packets is derived from the SL
field in the address vector. rdma_cm applications can set the TOS value of the
rdma_cm_id object through the rdma_set_option() API which then maps to SL.

With these patches, IBoE multicast frames may be broadcast as there is
currently no use of a L2 multicast group membership protocol.

To enable IBoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib drivers
must be loaded, and the netdevice for the corresponding IBoE port must be
running. Individual ports of a multi port HCA can be independently configured
as Ethernet (with support for IBoE) or as IB, as it was already the case.

We have successfully tested MPI, SDP, RDS, and native Verbs applications over
IBoE.

Following is a series of 11 patches based on Roland's for-next branch.  This
new series reflects changes based on feedback from the community on the
previous patch set.

Changes from v7
1. Rebase on 2.6.33-rc3
2. Add VLAN support
3. Bug fixes and improvements (see in the patches changelog).

Signed-off-by: Eli Cohen e...@mellanox.co.il
---


 drivers/infiniband/core/agent.c   |   37 +
 drivers/infiniband/core/cm.c  |5 
 drivers/infiniband/core/cma.c |  287 ++-
 drivers/infiniband/core/mad.c |   27 +
 drivers/infiniband/core/multicast.c   |   25 +
 drivers/infiniband/core/sa_query.c|   46 +-
 drivers/infiniband/core/ucma.c|   54 ++
 drivers/infiniband/core/ud_header.c   |  129 +-
 drivers/infiniband/core/user_mad.c|   11 
 drivers/infiniband/core/uverbs.h  |1 
 drivers/infiniband/core/uverbs_cmd.c  |   33 +
 drivers/infiniband/core/uverbs_main.c |1 
 drivers/infiniband/core/verbs.c   |   26 +
 drivers/infiniband/hw/mlx4/ah.c   |  196 --
 drivers/infiniband/hw/mlx4/mad.c  |   32 +
 drivers/infiniband/hw/mlx4/main.c |  557 --
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |   35 +
 drivers/infiniband/hw/mlx4/qp.c   |  180 +++--
 drivers/infiniband/hw/mthca/mthca_qp.c|2 
 drivers/infiniband/ulp/ipoib/ipoib_main.c |7 
 drivers/net/mlx4/en_main.c|   15 
 drivers/net/mlx4/en_netdev.c  |   10 
 drivers/net/mlx4/en_port.c|4 
 drivers/net/mlx4/en_port.h|3 
 drivers/net/mlx4/fw.c |3 
 drivers/net/mlx4/intf.c   |   20 +
 drivers/net/mlx4/main.c   |6 
 drivers/net/mlx4/mlx4.h   |1 
 drivers/net/mlx4/mlx4_en.h|1 
 drivers/net/mlx4/port.c   |   19 +
 include/linux/mlx4/cmd.h  |1 
 include/linux/mlx4/device.h   |   32 +
 include/linux/mlx4/driver.h   |   16 
 include/linux/mlx4/qp.h   |9 
 include/rdma/ib_addr.h|  139 +++
 include/rdma/ib_pack.h|   28 +
 include/rdma/ib_sa.h  |3 
 include/rdma/ib_user_verbs.h  |   22 +
 include/rdma/ib_verbs.h   |   29 +
 39 files changed, 1813 insertions(+), 239 deletions(-)
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 01/11] ib core: Add link layer property to ports

2010-02-18 Thread Eli Cohen
This patch adds the infrastructure for querying the link layer of a port, which
can be either IB_LINK_LAYER_INFINIBAND or IB_LINK_LAYER_ETHERNET. This is
required for adding IBoE support to Infiniband drivers so that branching
decisions can be made according to the value of this property. For devices that
do not provide an implementation for querying the link layer property of a
port, the returned value depends on the node transport such that
RDMA_TRANSPORT_IB nodes will return IB_LINK_LAYER_INFINIBAND and
RDMA_TRANSPORT_IWARP nodes will return IB_LINK_LAYER_ETHERNET.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/verbs.c |   16 
 include/rdma/ib_verbs.h |   12 
 2 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index a7da9be..f9cbdb6 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -94,6 +94,22 @@ rdma_node_get_transport(enum rdma_node_type node_type)
 }
 EXPORT_SYMBOL(rdma_node_get_transport);
 
+enum rdma_link_layer rdma_port_link_layer(struct ib_device *device, u8 
port_num)
+{
+   if (device-get_link_layer)
+   return device-get_link_layer(device, port_num);
+
+   switch (rdma_node_get_transport(device-node_type)) {
+   case RDMA_TRANSPORT_IB:
+   return IB_LINK_LAYER_INFINIBAND;
+   case RDMA_TRANSPORT_IWARP:
+   return IB_LINK_LAYER_ETHERNET;
+   default:
+   return IB_LINK_LAYER_UNSPECIFIED;
+   }
+}
+EXPORT_SYMBOL(rdma_port_link_layer);
+
 /* Protection domains */
 
 struct ib_pd *ib_alloc_pd(struct ib_device *device)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 09509ed..bbfe315 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -75,6 +75,12 @@ enum rdma_transport_type {
 enum rdma_transport_type
 rdma_node_get_transport(enum rdma_node_type node_type) __attribute_const__;
 
+enum rdma_link_layer {
+   IB_LINK_LAYER_UNSPECIFIED,
+   IB_LINK_LAYER_INFINIBAND,
+   IB_LINK_LAYER_ETHERNET,
+};
+
 enum ib_device_cap_flags {
IB_DEVICE_RESIZE_MAX_WR = 1,
IB_DEVICE_BAD_PKEY_CNTR = (11),
@@ -298,6 +304,7 @@ struct ib_port_attr {
u8  active_width;
u8  active_speed;
u8  phys_state;
+   enum rdma_link_layerlink_layer;
 };
 
 enum ib_device_modify_flags {
@@ -1003,6 +1010,8 @@ struct ib_device {
int(*query_port)(struct ib_device *device,
 u8 port_num,
 struct ib_port_attr 
*port_attr);
+   enum rdma_link_layer   (*get_link_layer)(struct ib_device *device,
+u8 port_num);
int(*query_gid)(struct ib_device *device,
u8 port_num, int index,
union ib_gid *gid);
@@ -1213,6 +1222,9 @@ int ib_query_device(struct ib_device *device,
 int ib_query_port(struct ib_device *device,
  u8 port_num, struct ib_port_attr *port_attr);
 
+enum rdma_link_layer rdma_port_link_layer(struct ib_device *device,
+ u8 port_num);
+
 int ib_query_gid(struct ib_device *device,
 u8 port_num, int index, union ib_gid *gid);
 
-- 
1.7.0

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 02/11] ib_core: IBoE support only QP1

2010-02-18 Thread Eli Cohen
Since IBoE is using Ethernet as its link layer, there is no central management
entity so there is no need for QP0.  QP1 is still needed since it handles
communications between CM agents. This patch will create only QP1 for IBoE
ports.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
Changes from v7:

1. Remove always true code
2. Fix failure to initialize port ah_lock in ib_sa_add_one


 drivers/infiniband/core/agent.c |   37 +++
 drivers/infiniband/core/mad.c   |   27 +++---
 drivers/infiniband/core/multicast.c |   25 ++---
 drivers/infiniband/core/sa_query.c  |   41 ++
 4 files changed, 93 insertions(+), 37 deletions(-)

diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index ae7c288..964f4fb 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -48,6 +48,8 @@
 struct ib_agent_port_private {
struct list_head port_list;
struct ib_mad_agent *agent[2];
+   struct ib_device*device;
+   u8   port_num;
 };
 
 static DEFINE_SPINLOCK(ib_agent_port_list_lock);
@@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num)
 {
struct ib_agent_port_private *entry;
 
-   list_for_each_entry(entry, ib_agent_port_list, port_list) {
-   if (entry-agent[0]-device == device 
-   entry-agent[0]-port_num == port_num)
+   list_for_each_entry(entry, ib_agent_port_list, port_list)
+   if (entry-device == device  entry-port_num == port_num)
return entry;
-   }
+
return NULL;
 }
 
@@ -155,14 +156,16 @@ int ib_agent_port_open(struct ib_device *device, int 
port_num)
goto error1;
}
 
-   /* Obtain send only MAD agent for SMI QP */
-   port_priv-agent[0] = ib_register_mad_agent(device, port_num,
-   IB_QPT_SMI, NULL, 0,
-   agent_send_handler,
-   NULL, NULL);
-   if (IS_ERR(port_priv-agent[0])) {
-   ret = PTR_ERR(port_priv-agent[0]);
-   goto error2;
+   if (rdma_port_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) 
{
+   /* Obtain send only MAD agent for SMI QP */
+   port_priv-agent[0] = ib_register_mad_agent(device, port_num,
+   IB_QPT_SMI, NULL, 0,
+   agent_send_handler,
+   NULL, NULL);
+   if (IS_ERR(port_priv-agent[0])) {
+   ret = PTR_ERR(port_priv-agent[0]);
+   goto error2;
+   }
}
 
/* Obtain send only MAD agent for GSI QP */
@@ -175,6 +178,9 @@ int ib_agent_port_open(struct ib_device *device, int 
port_num)
goto error3;
}
 
+   port_priv-device = device;
+   port_priv-port_num = port_num;
+
spin_lock_irqsave(ib_agent_port_list_lock, flags);
list_add_tail(port_priv-port_list, ib_agent_port_list);
spin_unlock_irqrestore(ib_agent_port_list_lock, flags);
@@ -182,7 +188,8 @@ int ib_agent_port_open(struct ib_device *device, int 
port_num)
return 0;
 
 error3:
-   ib_unregister_mad_agent(port_priv-agent[0]);
+   if (rdma_port_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND)
+   ib_unregister_mad_agent(port_priv-agent[0]);
 error2:
kfree(port_priv);
 error1:
@@ -205,7 +212,9 @@ int ib_agent_port_close(struct ib_device *device, int 
port_num)
spin_unlock_irqrestore(ib_agent_port_list_lock, flags);
 
ib_unregister_mad_agent(port_priv-agent[1]);
-   ib_unregister_mad_agent(port_priv-agent[0]);
+   if (rdma_port_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND)
+   ib_unregister_mad_agent(port_priv-agent[0]);
+
kfree(port_priv);
return 0;
 }
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 7522008..f546ab7 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2610,6 +2610,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info 
*qp_info)
struct ib_mad_private *recv;
struct ib_mad_list_head *mad_list;
 
+   if (!qp_info-qp)
+   return;
+
while (!list_empty(qp_info-recv_queue.list)) {
 
mad_list = list_entry(qp_info-recv_queue.list.next,
@@ -2651,6 +2654,9 @@ static int ib_mad_port_start(struct ib_mad_port_private 
*port_priv)
 
for (i = 0; i  IB_MAD_QPS_CORE; i++) {
qp = port_priv-qp_info[i].qp;
+   if (!qp)
+   continue;
+
/*
 * PKey index for QP1 is irrelevant

[ewg] [PATCHv8 03/11] IB/umad: Enable support only for IB ports

2010-02-18 Thread Eli Cohen
Initialize umad context only for Infiniband (as opposed to Ethernet) ports.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/user_mad.c |   11 +++
 1 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/user_mad.c 
b/drivers/infiniband/core/user_mad.c
index 7de0296..e962c5a 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -1138,8 +1138,9 @@ static void ib_umad_add_one(struct ib_device *device)
for (i = s; i = e; ++i) {
umad_dev-port[i - s].umad_dev = umad_dev;
 
-   if (ib_umad_init_port(device, i, umad_dev-port[i - s]))
-   goto err;
+   if (rdma_port_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+   if (ib_umad_init_port(device, i, umad_dev-port[i - 
s]))
+   goto err;
}
 
ib_set_client_data(device, umad_client, umad_dev);
@@ -1148,7 +1149,8 @@ static void ib_umad_add_one(struct ib_device *device)
 
 err:
while (--i = s)
-   ib_umad_kill_port(umad_dev-port[i - s]);
+   if (rdma_port_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND)
+   ib_umad_kill_port(umad_dev-port[i - s]);
 
kref_put(umad_dev-ref, ib_umad_release_dev);
 }
@@ -1162,7 +1164,8 @@ static void ib_umad_remove_one(struct ib_device *device)
return;
 
for (i = 0; i = umad_dev-end_port - umad_dev-start_port; ++i)
-   ib_umad_kill_port(umad_dev-port[i]);
+   if (rdma_port_link_layer(device, i + 1) == 
IB_LINK_LAYER_INFINIBAND)
+   ib_umad_kill_port(umad_dev-port[i]);
 
kref_put(umad_dev-ref, ib_umad_release_dev);
 }
-- 
1.7.0

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 04/11] ib_core: IBoE CMA device binding

2010-02-18 Thread Eli Cohen
Add support for IBoE device binding and IP -- GID resolution. Path resolving
and multicast joining are implemented within cma.c by filling the responses and
pushing the callbacks to the cma work queue. IP-GID resolution always yields
IPv6 link local addresses - remote GIDs are derived from the destination MAC
address of the remote port. Multicast GIDs are always mapped to multicast MACs
as is done in IPv6. Some helper functions are added to ib_addr.h.  IPv4
multicast is enabled by translating IPv4 multicast addresses to IPv6 multicast
as described in
http://www.mail-archive.com/i...@sunroof.eng.sun.com/msg02134.html.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---

Changes from v7:

1. Add force_grh flag to ib_init_ah_from_path() to request IB_AH_GRH for
   IB_LINK_LAYER_ETHERNET ports thus allowing to use hop limit 1 in path
   records.
2. cma_acquire_dev() finds the cma_dev by first assuming an IBoE type device
   for non-ARPHRD_INFINIBAND dev types. If it fails to do that, it falls back to
   the old method.


 drivers/infiniband/core/cm.c  |5 +-
 drivers/infiniband/core/cma.c |  283 +++--
 drivers/infiniband/core/sa_query.c|5 +-
 drivers/infiniband/core/ucma.c|   45 -
 drivers/infiniband/ulp/ipoib/ipoib_main.c |2 +-
 include/rdma/ib_addr.h|   98 ++-
 include/rdma/ib_sa.h  |3 +-
 7 files changed, 412 insertions(+), 29 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 5130fc5..6513b1c 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -351,6 +351,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, 
struct cm_av *av)
unsigned long flags;
int ret;
u8 p;
+   int force_grh;
 
read_lock_irqsave(cm.device_lock, flags);
list_for_each_entry(cm_dev, cm.device_list, list) {
@@ -371,8 +372,10 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, 
struct cm_av *av)
return ret;
 
av-port = port;
+   force_grh = rdma_port_link_layer(cm_dev-ib_device, port-port_num) ==
+   IB_LINK_LAYER_ETHERNET ? 1 : 0;
ib_init_ah_from_path(cm_dev-ib_device, port-port_num, path,
-av-ah_attr);
+av-ah_attr, force_grh);
av-timeout = path-packet_life_time + 1;
return 0;
 }
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index cc9b594..df5f636 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -58,6 +58,7 @@ MODULE_LICENSE(Dual BSD/GPL);
 #define CMA_CM_RESPONSE_TIMEOUT 20
 #define CMA_MAX_CM_RETRIES 15
 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define IBOE_PACKET_LIFETIME 18
 
 static void cma_add_one(struct ib_device *device);
 static void cma_remove_one(struct ib_device *device);
@@ -157,6 +158,7 @@ struct cma_multicast {
struct list_headlist;
void*context;
struct sockaddr_storage addr;
+   struct kref mcref;
 };
 
 struct cma_work {
@@ -173,6 +175,12 @@ struct cma_ndev_work {
struct rdma_cm_eventevent;
 };
 
+struct iboe_mcast_work {
+   struct work_struct   work;
+   struct rdma_id_private  *id;
+   struct cma_multicast*mc;
+};
+
 union cma_ip_addr {
struct in6_addr ip6;
struct {
@@ -281,6 +289,8 @@ static void cma_attach_to_dev(struct rdma_id_private 
*id_priv,
atomic_inc(cma_dev-refcount);
id_priv-cma_dev = cma_dev;
id_priv-id.device = cma_dev-device;
+   id_priv-id.route.addr.dev_addr.transport =
+   rdma_node_get_transport(cma_dev-device-node_type);
list_add_tail(id_priv-list, cma_dev-id_list);
 }
 
@@ -290,6 +300,14 @@ static inline void cma_deref_dev(struct cma_device 
*cma_dev)
complete(cma_dev-comp);
 }
 
+static inline void release_mc(struct kref *kref)
+{
+   struct cma_multicast *mc = container_of(kref, struct cma_multicast, 
mcref);
+
+   kfree(mc-multicast.ib);
+   kfree(mc);
+}
+
 static void cma_detach_from_dev(struct rdma_id_private *id_priv)
 {
list_del(id_priv-list);
@@ -330,15 +348,29 @@ static int cma_acquire_dev(struct rdma_id_private 
*id_priv)
union ib_gid gid;
int ret = -ENODEV;
 
-   rdma_addr_get_sgid(dev_addr, gid);
+   if (dev_addr-dev_type != ARPHRD_INFINIBAND) {
+   iboe_addr_get_sgid(dev_addr, gid);
+   list_for_each_entry(cma_dev, dev_list, list) {
+   ret = ib_find_cached_gid(cma_dev-device, gid,
+id_priv-id.port_num, NULL);
+   if (!ret)
+   goto out;
+   }
+   }
+
+   memcpy(gid, dev_addr-src_dev_addr +
+  rdma_addr_gid_offset(dev_addr), sizeof gid

[ewg] [PATCHv8 05/11] ib_core: IBoE UD packet packing support

2010-02-18 Thread Eli Cohen
Add support functions to aid in packing IBoE packets.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/ud_header.c |  100 ++
 include/rdma/ib_pack.h  |   29 +-
 2 files changed, 92 insertions(+), 37 deletions(-)

Changes from v7:

1. Re-work the changes so they extend the original idea behind these functions.
2. Fix wrong implementation of ib_ud_header_init(). A different patch was sent
   to Roland.


diff --git a/drivers/infiniband/core/ud_header.c 
b/drivers/infiniband/core/ud_header.c
index 8ec7876..7650313 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -80,6 +80,29 @@ static const struct ib_field lrh_table[]  = {
  .size_bits= 16 }
 };
 
+static const struct ib_field eth_table[]  = {
+   { STRUCT_FIELD(eth, dmac_h),
+ .offset_words = 0,
+ .offset_bits  = 0,
+ .size_bits= 32 },
+   { STRUCT_FIELD(eth, dmac_l),
+ .offset_words = 1,
+ .offset_bits  = 0,
+ .size_bits= 16 },
+   { STRUCT_FIELD(eth, smac_h),
+ .offset_words = 1,
+ .offset_bits  = 16,
+ .size_bits= 16 },
+   { STRUCT_FIELD(eth, smac_l),
+ .offset_words = 2,
+ .offset_bits  = 0,
+ .size_bits= 32 },
+   { STRUCT_FIELD(eth, type),
+ .offset_words = 3,
+ .offset_bits  = 0,
+ .size_bits= 16 }
+};
+
 static const struct ib_field grh_table[]  = {
{ STRUCT_FIELD(grh, ip_version),
  .offset_words = 0,
@@ -180,56 +203,51 @@ static const struct ib_field deth_table[] = {
 /**
  * ib_ud_header_init - Initialize UD header structure
  * @payload_bytes:Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
  * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @immediate_present: specify if immediate data is present
  * @header:Structure to initialize
- *
- * ib_ud_header_init() initializes the lrh.link_version, lrh.link_next_header,
- * lrh.packet_length, grh.ip_version, grh.payload_length,
- * grh.next_header, bth.opcode, bth.pad_count and
- * bth.transport_header_version fields of a struct ib_ud_header given
- * the payload length and whether a GRH will be included.
  */
 void ib_ud_header_init(int payload_bytes,
+  int  lrh_present,
+  int  eth_present,
   int  grh_present,
+  int  immediate_present,
   struct ib_ud_header *header)
 {
-   int header_len;
u16 packet_length;
 
memset(header, 0, sizeof *header);
 
-   header_len =
-   IB_LRH_BYTES  +
-   IB_BTH_BYTES  +
-   IB_DETH_BYTES;
-   if (grh_present) {
-   header_len += IB_GRH_BYTES;
+   if (lrh_present) {
+   header-lrh.link_version = 0;
+   header-lrh.link_next_header =
+   grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+   packet_length = IB_LRH_BYTES;
}
 
-   header-lrh.link_version = 0;
-   header-lrh.link_next_header =
-   grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
-   packet_length= (IB_LRH_BYTES +
-   IB_BTH_BYTES +
-   IB_DETH_BYTES+
-   payload_bytes+
-   4+ /* ICRC */
-   3) / 4;/* round up */
-
-   header-grh_present  = grh_present;
+   if (eth_present)
+   packet_length = IB_ETH_BYTES;
+
+   packet_length += IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes +
+   4   + /* ICRC */
+   3;/* round up */
+   packet_length /= 4;
if (grh_present) {
-   packet_length  += IB_GRH_BYTES / 4;
-   header-grh.ip_version  = 6;
-   header-grh.payload_length  =
-   cpu_to_be16((IB_BTH_BYTES +
-IB_DETH_BYTES+
-payload_bytes+
-4+ /* ICRC */
-3)  ~3);  /* round up */
+   packet_length += IB_GRH_BYTES / 4;
+   header-grh.ip_version = 6;
+   header-grh.payload_length =
+   cpu_to_be16((IB_BTH_BYTES  +
+IB_DETH_BYTES +
+payload_bytes +
+4 + /* ICRC */
+3)  ~3

[ewg] [PATCHv8 06/11] ipoib: avoid ipoib over IBoE

2010-02-18 Thread Eli Cohen
IPoIB is an implementation of IP over Infiniband transport. In the case of
IBoE, the link layer is Ethernet so IP can work directly over Ethernet, so
disable IPoIB for non-IB_LINK_LAYER_INFINIBAND ports.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 06014d2..5e6c2de 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1362,6 +1362,8 @@ static void ipoib_add_one(struct ib_device *device)
}
 
for (p = s; p = e; ++p) {
+   if (rdma_port_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
+   continue;
dev = ipoib_add_port(ib%d, device, p);
if (!IS_ERR(dev)) {
priv = netdev_priv(dev);
@@ -1383,6 +1385,9 @@ static void ipoib_remove_one(struct ib_device *device)
dev_list = ib_get_client_data(device, ipoib_client);
 
list_for_each_entry_safe(priv, tmp, dev_list, list) {
+   if (rdma_port_link_layer(device, priv-port) != 
IB_LINK_LAYER_INFINIBAND)
+   continue;
+
ib_unregister_event_handler(priv-event_handler);
 
rtnl_lock();
-- 
1.7.0

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 07/11] ib_core: Add API to support IBoE from userspace

2010-02-18 Thread Eli Cohen
Add ib_uverbs_get_eth_l2_addr() to allow ibv_create_ah() to resolve sgid,
dgid to vlan, dmac for any gid type.  Although user-space might bypass this
call for link-local gids, it is better not to replicate the kernel resolution
policy.  Port link layer is also returned by ibv_query_port().

Signed-off-by: Eli Cohen e...@mellanox.co.il
---

Changes from v7:

1. ib_uverbs_get_mac() was renamed to ib_uverbs_get_eth_l2_addr() and it now
   returns the MAC, the VLAN ID and a tagged flag indicating whether packets
   should go out tagged.

 drivers/infiniband/core/uverbs.h  |1 +
 drivers/infiniband/core/uverbs_cmd.c  |   33 +
 drivers/infiniband/core/uverbs_main.c |1 +
 drivers/infiniband/core/verbs.c   |   10 ++
 include/rdma/ib_user_verbs.h  |   22 --
 include/rdma/ib_verbs.h   |   17 +
 6 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b3ea958..79359f6 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq);
 IB_UVERBS_DECLARE_CMD(modify_srq);
 IB_UVERBS_DECLARE_CMD(query_srq);
 IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(get_eth_l2_addr);
 
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 112d397..19b4827 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width= attr.active_width;
resp.active_speed= attr.active_speed;
resp.phys_state  = attr.phys_state;
+   resp.link_layer  = attr.link_layer;
 
if (copy_to_user((void __user *) (unsigned long) cmd.response,
 resp, sizeof resp))
@@ -1824,6 +1825,38 @@ err:
return ret;
 }
 
+ssize_t ib_uverbs_get_eth_l2_addr(struct ib_uverbs_file *file, const char 
__user *buf,
+ int in_len, int out_len)
+{
+   struct ib_uverbs_get_eth_l2_addr   cmd;
+   struct ib_uverbs_get_eth_l2_addr_resp  resp;
+   int  ret;
+   struct ib_pd*pd;
+
+   if (out_len  sizeof resp)
+   return -ENOSPC;
+
+   if (copy_from_user(cmd, buf, sizeof cmd))
+   return -EFAULT;
+
+   pd = idr_read_pd(cmd.pd_handle, file-ucontext);
+   if (!pd)
+   return -EINVAL;
+
+   ret = ib_get_eth_l2_addr(pd-device, cmd.port, (union ib_gid *)cmd.gid,
+cmd.sgid_idx, resp.mac, resp.vlan_id, 
resp.tagged);
+   put_pd_read(pd);
+   if (!ret) {
+   if (copy_to_user((void __user *) (unsigned long) cmd.response,
+resp, sizeof resp))
+   return -EFAULT;
+
+   return in_len;
+   }
+
+   return ret;
+}
+
 ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 const char __user *buf, int in_len, int out_len)
 {
diff --git a/drivers/infiniband/core/uverbs_main.c 
b/drivers/infiniband/core/uverbs_main.c
index 5f284ff..ef9eaa5 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -109,6 +109,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file 
*file,
[IB_USER_VERBS_CMD_MODIFY_SRQ]  = ib_uverbs_modify_srq,
[IB_USER_VERBS_CMD_QUERY_SRQ]   = ib_uverbs_query_srq,
[IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+   [IB_USER_VERBS_CMD_GET_ETH_L2_ADDR] = ib_uverbs_get_eth_l2_addr,
 };
 
 static struct vfsmount *uverbs_event_mnt;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index f9cbdb6..f586702 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -920,3 +920,13 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, 
u16 lid)
return qp-device-detach_mcast(qp, gid, lid);
 }
 EXPORT_SYMBOL(ib_detach_mcast);
+
+int ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid *gid,
+  int sgid_idx, u8 *mac, __u16 *vlan_id, u8 *tagged)
+{
+   if (!device-get_eth_l2_addr)
+   return -ENOSYS;
+
+   return device-get_eth_l2_addr(device, port, gid, sgid_idx, mac, 
vlan_id, tagged);
+}
+EXPORT_SYMBOL(ib_get_eth_l2_addr);
diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h
index a17f771..09f38df 100644
--- a/include/rdma/ib_user_verbs.h
+++ b/include/rdma/ib_user_verbs.h
@@ -81,7 +81,8 @@ enum {
IB_USER_VERBS_CMD_MODIFY_SRQ,
IB_USER_VERBS_CMD_QUERY_SRQ,
IB_USER_VERBS_CMD_DESTROY_SRQ,
-   IB_USER_VERBS_CMD_POST_SRQ_RECV
+   IB_USER_VERBS_CMD_POST_SRQ_RECV,
+   IB_USER_VERBS_CMD_GET_ETH_L2_ADDR

[ewg] [PATCHv8 09/11] mlx4: Add support for IBoE - address resolution

2010-02-18 Thread Eli Cohen
The following patch handles address vector creation for IBoE ports. mlx4 needs
the MAC address of the remote node to include it in the WQE of a UD QP or in
the QP context of connected QPs. Address resolution is done atomically in the
case of a link local address or a multicast GID and otherwise -EINVAL is
returned.  mlx4 transport packets were changed too to accommodate IBoE.
Multicast groups attach/detach calls dev_mc_add/remove to update the NIC's
multicast filters. Attaching a QP to a multicast group does not require the QP
to be in a state other than INIT, which is fine for IB. For IBoE, however, we
need the port assigned to the QP in order to call dev_mc_add() for the correct
netdevice, while the port is assigned only when moving from INIT to RTR.
Hence, we must keep track of all the multicast groups attached to a QP and call
dev_mc_add() when the port becomes available.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
Changes from v7:

1. Fix failure to initialize gid_index in create_iboe_ah()
2. Move call register_netdevice_notifier() after call to ib_register_device()
3. Call flush_workqueue() after unregister notifier to flush any pending work
   requests.
4. Change build_mlx_header to match changes to ud_header.c


 drivers/infiniband/hw/mlx4/ah.c|  182 ++---
 drivers/infiniband/hw/mlx4/mad.c   |   32 ++-
 drivers/infiniband/hw/mlx4/main.c  |  497 +---
 drivers/infiniband/hw/mlx4/mlx4_ib.h   |   35 +++-
 drivers/infiniband/hw/mlx4/qp.c|  139 +++--
 drivers/infiniband/hw/mthca/mthca_qp.c |2 +
 drivers/net/mlx4/en_port.c |4 +-
 drivers/net/mlx4/en_port.h |3 +-
 drivers/net/mlx4/fw.c  |3 +-
 include/linux/mlx4/cmd.h   |1 +
 include/linux/mlx4/device.h|   31 ++-
 include/linux/mlx4/qp.h|7 +-
 12 files changed, 819 insertions(+), 117 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index c75ac94..0a2f1fb 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -31,63 +31,158 @@
  */
 
 #include mlx4_ib.h
+#include rdma/ib_addr.h
+#include linux/inet.h
+#include linux/string.h
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr 
*ah_attr,
+   u8 *mac, int *is_mcast, u8 port)
 {
-   struct mlx4_dev *dev = to_mdev(pd-device)-dev;
-   struct mlx4_ib_ah *ah;
+   struct mlx4_ib_iboe *iboe = dev-iboe;
+   struct in6_addr in6;
 
-   ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-   if (!ah)
-   return ERR_PTR(-ENOMEM);
+   *is_mcast = 0;
+   spin_lock(iboe-lock);
+   if (!iboe-netdevs[port - 1]) {
+   spin_unlock(iboe-lock);
+   return -EINVAL;
+   }
+   spin_unlock(iboe-lock);
 
-   memset(ah-av, 0, sizeof ah-av);
+   memcpy(in6, ah_attr-grh.dgid.raw, sizeof in6);
+   if (rdma_link_local_addr(in6))
+   rdma_get_ll_mac(in6, mac);
+   else if (rdma_is_multicast_addr(in6)) {
+   rdma_get_mcast_mac(in6, mac);
+   *is_mcast = 1;
+   } else
+   return -EINVAL;
 
-   ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num  
24));
-   ah-av.g_slid  = ah_attr-src_path_bits;
-   ah-av.dlid= cpu_to_be16(ah_attr-dlid);
-   if (ah_attr-static_rate) {
-   ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET;
-   while (ah-av.stat_rate  IB_RATE_2_5_GBPS + 
MLX4_STAT_RATE_OFFSET 
-  !(1  ah-av.stat_rate  dev-caps.stat_rate_support))
-   --ah-av.stat_rate;
-   }
-   ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl  28);
+   return 0;
+}
+
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+ struct mlx4_ib_ah *ah)
+{
+   struct mlx4_dev *dev = to_mdev(pd-device)-dev;
+
+   ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num  
24));
+   ah-av.ib.g_slid  = ah_attr-src_path_bits;
if (ah_attr-ah_flags  IB_AH_GRH) {
-   ah-av.g_slid   |= 0x80;
-   ah-av.gid_index = ah_attr-grh.sgid_index;
-   ah-av.hop_limit = ah_attr-grh.hop_limit;
-   ah-av.sl_tclass_flowlabel |=
+   ah-av.ib.g_slid   |= 0x80;
+   ah-av.ib.gid_index = ah_attr-grh.sgid_index;
+   ah-av.ib.hop_limit = ah_attr-grh.hop_limit;
+   ah-av.ib.sl_tclass_flowlabel |=
cpu_to_be32((ah_attr-grh.traffic_class  20) |
ah_attr-grh.flow_label);
-   memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16);
+   memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16);
+   }
+
+   ah-av.ib.dlid= cpu_to_be16

[ewg] [PATCHv8 10/11] ib_core: Add VLAN support to IBoE

2010-02-18 Thread Eli Cohen
Add 802.1q vlan support to IBoE. The vlan tag is encoded within the GID
derived from a link local address in the following way:

GID[11] GID[12] contain the vlan ID.
The 3 bit user priority field is identical to the 3 bits of the SL.

In the case of rdma_cm apps, the TOS field is used to generate the SL field by
shifting it right by 5 bits, effectively taking the 3 MS bits of the TOS field.
In order to support userspace verbs consumers, ib_uverbs_get_mac has been
changed into ib_uverbs_get_eth_l2_addr and now returns both MAC and VLAN
information.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/cma.c   |   20 -
 drivers/infiniband/core/ucma.c  |   13 -
 drivers/infiniband/core/ud_header.c |   31 -
 include/rdma/ib_addr.h  |   49 ---
 include/rdma/ib_pack.h  |   19 ++---
 5 files changed, 106 insertions(+), 26 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index df5f636..108d1bb 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1763,6 +1763,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private 
*id_priv)
struct sockaddr_in *src_addr = (struct sockaddr_in 
*)route-addr.src_addr;
struct sockaddr_in *dst_addr = (struct sockaddr_in 
*)route-addr.dst_addr;
struct net_device *ndev = NULL;
+   u16 vid;
 
if (src_addr-sin_family != dst_addr-sin_family)
return -EINVAL;
@@ -1782,14 +1783,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private 
*id_priv)
 
route-num_paths = 1;
 
-   iboe_mac_to_ll(route-path_rec-sgid, addr-dev_addr.src_dev_addr);
-   iboe_mac_to_ll(route-path_rec-dgid, addr-dev_addr.dst_dev_addr);
-
-   route-path_rec-hop_limit = 1;
-   route-path_rec-reversible = 1;
-   route-path_rec-pkey = cpu_to_be16(0x);
-   route-path_rec-mtu_selector = IB_SA_EQ;
-
if (addr-dev_addr.bound_dev_if)
ndev = dev_get_by_index(init_net, addr-dev_addr.bound_dev_if);
if (!ndev) {
@@ -1797,6 +1790,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private 
*id_priv)
goto err2;
}
 
+   vid = rdma_vlan_dev_vlan_id(ndev);
+
+   iboe_mac_vlan_to_ll(route-path_rec-sgid, 
addr-dev_addr.src_dev_addr, vid);
+   iboe_mac_vlan_to_ll(route-path_rec-dgid, 
addr-dev_addr.dst_dev_addr, vid);
+
+   route-path_rec-hop_limit = 1;
+   route-path_rec-reversible = 1;
+   route-path_rec-pkey = cpu_to_be16(0x);
+   route-path_rec-mtu_selector = IB_SA_EQ;
+   route-path_rec-sl = id_priv-tos  5;
+
route-path_rec-mtu = iboe_get_mtu(ndev-mtu);
route-path_rec-rate_selector = IB_SA_EQ;
route-path_rec-rate = iboe_get_rate(ndev);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index fcc27bc..ed670f5 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -586,13 +586,22 @@ static void ucma_copy_iboe_route(struct 
rdma_ucm_query_route_resp *resp,
 struct rdma_route *route)
 {
struct rdma_dev_addr *dev_addr;
+   struct net_device *dev;
+   u16 vid = 0;
 
resp-num_paths = route-num_paths;
switch (route-num_paths) {
case 0:
dev_addr = route-addr.dev_addr;
-   iboe_mac_to_ll((union ib_gid *) resp-ib_route[0].dgid,
-  dev_addr-dst_dev_addr);
+   dev = dev_get_by_index(init_net, dev_addr-bound_dev_if);
+   if (dev) {
+   vid = rdma_vlan_dev_vlan_id(dev);
+   dev_put(dev);
+   }
+
+
+   iboe_mac_vlan_to_ll((union ib_gid *) resp-ib_route[0].dgid,
+   dev_addr-dst_dev_addr, vid);
iboe_addr_get_sgid(dev_addr,
   (union ib_gid *) resp-ib_route[0].sgid);
resp-ib_route[0].pkey = cpu_to_be16(0x);
diff --git a/drivers/infiniband/core/ud_header.c 
b/drivers/infiniband/core/ud_header.c
index 7650313..7d03cf1 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -33,6 +33,7 @@
 
 #include linux/errno.h
 #include linux/string.h
+#include linux/if_ether.h
 
 #include rdma/ib_pack.h
 
@@ -103,6 +104,17 @@ static const struct ib_field eth_table[]  = {
  .size_bits= 16 }
 };
 
+static const struct ib_field vlan_table[]  = {
+   { STRUCT_FIELD(vlan, tag),
+ .offset_words = 0,
+ .offset_bits  = 0,
+ .size_bits= 16 },
+   { STRUCT_FIELD(vlan, type),
+ .offset_words = 0,
+ .offset_bits  = 16,
+ .size_bits= 16 }
+};
+
 static const struct ib_field grh_table[]  = {
{ STRUCT_FIELD(grh, ip_version),
  .offset_words = 0,
@@ -205,6 +217,7

[ewg] [PATCHv8 11/11] mlx4: Add vlan support to IBoE

2010-02-18 Thread Eli Cohen
This patch allows IBoE traffic to be encapsulated in 802.1q tagged VLAN
frames. The VLAN tag is encoded in the GID and derived from it by a simple
computation. The netdev notifier callback is modified to catch new VLAN devices
addition/removal and the port's GID table is updated to reflect the change such
that for each netdevice there is an entry in the GID table. When the port's GID
table is exhausted, GID entries will not be added. Only children of the main
interface can add to the GID table. If a vlan interface is added on another
vlan interface (e.g. vconfig add eth2.6 8), then that interface will not add
an entry to the GID table.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/hw/mlx4/ah.c|   22 +++--
 drivers/infiniband/hw/mlx4/main.c  |   84 +++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h   |4 +-
 drivers/infiniband/hw/mlx4/qp.c|   49 +++---
 drivers/infiniband/hw/mthca/mthca_qp.c |2 +-
 drivers/net/mlx4/en_netdev.c   |   10 
 drivers/net/mlx4/mlx4_en.h |1 +
 drivers/net/mlx4/port.c|   19 +++
 include/linux/mlx4/device.h|1 +
 include/linux/mlx4/qp.h|2 +-
 10 files changed, 166 insertions(+), 28 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index 0a2f1fb..32911c0 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -34,6 +34,7 @@
 #include rdma/ib_addr.h
 #include linux/inet.h
 #include linux/string.h
+#include rdma/ib_cache.h
 
 int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr 
*ah_attr,
u8 *mac, int *is_mcast, u8 port)
@@ -98,6 +99,8 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct 
ib_ah_attr *ah_attr
u8 mac[6];
int err;
int is_mcast;
+   u16 vlan_tag;
+   union ib_gid sgid;
 
err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, is_mcast, 
ah_attr-port_num);
if (err)
@@ -105,8 +108,14 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, 
struct ib_ah_attr *ah_attr
 
memcpy(ah-av.eth.mac_0_1, mac, 2);
memcpy(ah-av.eth.mac_2_5, mac + 2, 4);
+   err = ib_get_cached_gid(pd-device, ah_attr-port_num, 
ah_attr-grh.sgid_index, sgid);
+   if (err)
+   return ERR_PTR(err);
+   vlan_tag = rdma_get_vlan_id(sgid);
+   vlan_tag |= (ah_attr-sl  7)  13;
ah-av.eth.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num 
 24));
ah-av.eth.gid_index = ah_attr-grh.sgid_index;
+   ah-av.eth.vlan = cpu_to_be16(vlan_tag);
if (ah_attr-static_rate) {
ah-av.eth.stat_rate = ah_attr-static_rate + 
MLX4_STAT_RATE_OFFSET;
while (ah-av.eth.stat_rate  IB_RATE_2_5_GBPS + 
MLX4_STAT_RATE_OFFSET 
@@ -194,8 +203,8 @@ int mlx4_ib_destroy_ah(struct ib_ah *ah)
return 0;
 }
 
-int mlx4_ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid 
*gid,
-   int sgid_idx, u8 *mac, u16 *vlan_id)
+int mlx4_ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid 
*dgid,
+   int sgid_idx, u8 *mac, u16 *vlan_id, u8 *tagged)
 {
int err;
struct mlx4_ib_dev *ibdev = to_mdev(device);
@@ -203,13 +212,18 @@ int mlx4_ib_get_eth_l2_addr(struct ib_device *device, u8 
port, union ib_gid *gid
.port_num = port,
};
int is_mcast;
+   union ib_gid sgid;
 
-   memcpy(ah_attr.grh.dgid.raw, gid, 16);
+   memcpy(ah_attr.grh.dgid.raw, dgid, 16);
err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, is_mcast, port);
if (err)
ERR_PTR(err);
 
-   *vlan_id = 0;
+   err = ib_get_cached_gid(device, port, sgid_idx, sgid);
+   if (err)
+   return err;
+   *vlan_id = rdma_get_vlan_id(sgid);
+   *tagged = !!(*vlan_id);
 
return 0;
 }
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 3b8ab83..f02897d 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -37,6 +37,7 @@
 #include linux/netdevice.h
 #include linux/inetdevice.h
 #include linux/rtnetlink.h
+#include linux/if_vlan.h
 
 #include rdma/ib_smi.h
 #include rdma/ib_user_verbs.h
@@ -78,6 +79,8 @@ static void init_query_mad(struct ib_smp *mad)
mad-method= IB_MGMT_METHOD_GET;
 }
 
+static union ib_gid zgid;
+
 static int mlx4_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props)
 {
@@ -800,12 +803,17 @@ static struct device_attribute *mlx4_class_attributes[] = 
{
dev_attr_board_id
 };
 
-static void mlx4_addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+static void mlx4_addrconf_ifid_eui48(u8 *eui, int is_vlan, u16 vlan_id, struct 
net_device *dev)
 {
memcpy(eui, dev-dev_addr, 3);
memcpy(eui + 5, dev

[ewg] [PATCHv8 1/4] libibverbs: Add link layer field to ibv_port_attr

2010-02-18 Thread Eli Cohen
This field can have one of the values - IBV_LINK_LAYER_UNSPECIFIED,
IBV_LINK_LAYER_INFINIBAND, IBV_LINK_LAYER_ETHERNET. It can be used by
applications to know the link layer used by the port, which can be either
Infiniband or Ethernet. The addition of the new field does not change the size
of struct ibv_port_attr due to alignment of the preceding field. Binary
compatibility is not compromised either since new apps with old libraries will
determine the link layer as IB while old applications with a new library do
not read this field.

Solution suggested by:
   Roland Dreier rola...@cisco.com
   Jason Gunthorpe jguntho...@obsidianresearch.com
Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 include/infiniband/verbs.h |   21 +
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/include/infiniband/verbs.h b/include/infiniband/verbs.h
index 0f1cb2e..17df3ff 100644
--- a/include/infiniband/verbs.h
+++ b/include/infiniband/verbs.h
@@ -161,6 +161,12 @@ enum ibv_port_state {
IBV_PORT_ACTIVE_DEFER   = 5
 };
 
+enum {
+   IBV_LINK_LAYER_UNSPECIFIED,
+   IBV_LINK_LAYER_INFINIBAND,
+   IBV_LINK_LAYER_ETHERNET,
+};
+
 struct ibv_port_attr {
enum ibv_port_state state;
enum ibv_mtumax_mtu;
@@ -181,6 +187,8 @@ struct ibv_port_attr {
uint8_t active_width;
uint8_t active_speed;
uint8_t phys_state;
+   uint8_t link_layer;
+   uint8_t pad;
 };
 
 enum ibv_event_type {
@@ -693,6 +701,16 @@ struct ibv_context {
void   *abi_compat;
 };
 
+static inline int ___ibv_query_port(struct ibv_context *context,
+   uint8_t port_num,
+   struct ibv_port_attr *port_attr)
+{
+   port_attr-link_layer = IBV_LINK_LAYER_UNSPECIFIED;
+   port_attr-pad = 0;
+
+   return context-ops.query_port(context, port_num, port_attr);
+}
+
 /**
  * ibv_get_device_list - Get list of IB devices currently available
  * @num_devices: optional.  if non-NULL, set to the number of devices
@@ -1097,4 +1115,7 @@ END_C_DECLS
 
 #  undef __attribute_const
 
+#define ibv_query_port(context, port_num, port_attr) \
+   ___ibv_query_port(context, port_num, port_attr)
+
 #endif /* INFINIBAND_VERBS_H */
-- 
1.7.0

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 2/4] libibverbs: change kernel API to accept link layer

2010-02-18 Thread Eli Cohen
Modify the code to allow passing the link layer of a port from kernel to user.
Update ibv_query_port.3 man page with the change.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 include/infiniband/kern-abi.h |3 ++-
 man/ibv_query_port.3  |1 +
 src/cmd.c |1 +
 3 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/include/infiniband/kern-abi.h b/include/infiniband/kern-abi.h
index 0db083a..619ea7e 100644
--- a/include/infiniband/kern-abi.h
+++ b/include/infiniband/kern-abi.h
@@ -223,7 +223,8 @@ struct ibv_query_port_resp {
__u8  active_width;
__u8  active_speed;
__u8  phys_state;
-   __u8  reserved[3];
+   __u8  link_layer;
+   __u8  reserved[2];
 };
 
 struct ibv_alloc_pd {
diff --git a/man/ibv_query_port.3 b/man/ibv_query_port.3
index 882470d..6d8b873 100644
--- a/man/ibv_query_port.3
+++ b/man/ibv_query_port.3
@@ -44,6 +44,7 @@ uint8_t init_type_reply;/* Type of 
initialization performed by S
 uint8_t active_width;   /* Currently active link width */
 uint8_t active_speed;   /* Currently active link speed */
 uint8_t phys_state; /* Physical port state */
+uint8_t link_layer; /* link layer protocol of the port */  
 
 .in -8
 };
 .sp
diff --git a/src/cmd.c b/src/cmd.c
index cbd5288..39af833 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -196,6 +196,7 @@ int ibv_cmd_query_port(struct ibv_context *context, uint8_t 
port_num,
port_attr-active_width= resp.active_width;
port_attr-active_speed= resp.active_speed;
port_attr-phys_state  = resp.phys_state;
+   port_attr-link_layer  = resp.link_layer;
 
return 0;
 }
-- 
1.7.0

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 3/4] libibverbs: Add API to retrieve eth link layer address

2010-02-18 Thread Eli Cohen
Add a command to retrieve the layer 2 address of an ethernet port. The layer 2
address is comprised of the port's MAC address and the VLAN ID. This is required
by libraries to build work requests when the port's link layer is Ethernet.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 include/infiniband/driver.h   |2 ++
 include/infiniband/kern-abi.h |   23 ++-
 src/cmd.c |   24 
 src/libibverbs.map|1 +
 4 files changed, 49 insertions(+), 1 deletions(-)

diff --git a/include/infiniband/driver.h b/include/infiniband/driver.h
index 9a81416..3e09548 100644
--- a/include/infiniband/driver.h
+++ b/include/infiniband/driver.h
@@ -131,6 +131,8 @@ int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah,
 int ibv_cmd_destroy_ah(struct ibv_ah *ah);
 int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t 
lid);
 int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t 
lid);
+int ibv_cmd_get_eth_l2_addr(struct ibv_pd *pd, uint8_t port, const union 
ibv_gid *gid,
+   int sgid_idx, uint8_t *mac, uint16_t *vlan_id, 
uint8_t *tagged);
 
 int ibv_dontfork_range(void *base, size_t size);
 int ibv_dofork_range(void *base, size_t size);
diff --git a/include/infiniband/kern-abi.h b/include/infiniband/kern-abi.h
index 619ea7e..642c7db 100644
--- a/include/infiniband/kern-abi.h
+++ b/include/infiniband/kern-abi.h
@@ -85,7 +85,8 @@ enum {
IB_USER_VERBS_CMD_MODIFY_SRQ,
IB_USER_VERBS_CMD_QUERY_SRQ,
IB_USER_VERBS_CMD_DESTROY_SRQ,
-   IB_USER_VERBS_CMD_POST_SRQ_RECV
+   IB_USER_VERBS_CMD_POST_SRQ_RECV,
+   IB_USER_VERBS_CMD_GET_ETH_L2_ADDR,
 };
 
 /*
@@ -804,6 +805,7 @@ enum {
 * trick opcodes in IBV_INIT_CMD() doesn't break.
 */
IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL_V2 = -1,
+   IB_USER_VERBS_CMD_GET_ETH_L2_ADDR_V2 = -1,
 };
 
 struct ibv_destroy_cq_v1 {
@@ -879,4 +881,23 @@ struct ibv_create_srq_resp_v5 {
__u32 srq_handle;
 };
 
+struct ibv_get_eth_l2_addr {
+   __u32 command;
+   __u16 in_words;
+   __u16 out_words;
+   __u64 response;
+   __u32 pd_handle;
+   __u8  port;
+   __u8  sgid_idx;
+   __u8  reserved[2];
+   __u8  dgid[16];
+};
+
+struct ibv_get_eth_l2_addr_resp {
+   __u8mac[6];
+   __u16   vlan_id;
+   __u8tagged;
+   __u8reserved[3];
+};
+
 #endif /* KERN_ABI_H */
diff --git a/src/cmd.c b/src/cmd.c
index 39af833..6a3c101 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -1123,3 +1123,27 @@ int ibv_cmd_detach_mcast(struct ibv_qp *qp, const union 
ibv_gid *gid, uint16_t l
 
return 0;
 }
+
+int ibv_cmd_get_eth_l2_addr(struct ibv_pd *pd, uint8_t port, const union 
ibv_gid *gid,
+   int sgid_idx, uint8_t *mac, uint16_t *vlan_id, 
uint8_t *tagged)
+
+{
+   struct ibv_get_eth_l2_addr cmd;
+   struct ibv_get_eth_l2_addr_resp resp;
+
+   IBV_INIT_CMD_RESP(cmd, sizeof cmd, GET_ETH_L2_ADDR, resp, sizeof 
resp);
+   memcpy(cmd.dgid, gid, sizeof cmd.dgid);
+   cmd.pd_handle = pd-handle;
+   cmd.port = port;
+   cmd.sgid_idx = sgid_idx;
+
+   if (write(pd-context-cmd_fd, cmd, sizeof cmd) != sizeof cmd)
+   return errno;
+
+   memcpy(mac, resp.mac, 6);
+   *vlan_id = resp.vlan_id;
+   *tagged = resp.tagged;
+
+   return 0;
+}
+
diff --git a/src/libibverbs.map b/src/libibverbs.map
index 1827da0..af52d0d 100644
--- a/src/libibverbs.map
+++ b/src/libibverbs.map
@@ -64,6 +64,7 @@ IBVERBS_1.0 {
ibv_cmd_destroy_ah;
ibv_cmd_attach_mcast;
ibv_cmd_detach_mcast;
+   ibv_cmd_get_eth_l2_addr;
ibv_copy_qp_attr_from_kern;
ibv_copy_path_rec_from_kern;
ibv_copy_path_rec_to_kern;
-- 
1.7.0

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 4/4] libibverbs: Update examples for IBoE

2010-02-18 Thread Eli Cohen
Since IBoE requires usage of GRH, update ibv_*_pingpong examples to accept GIDs.
GIDs are given as an index to the local port's table and are exchanged between
the client and the server through the socket connection. The examples are also
modified to pass the gid index to the code that creates the address vector as a
preparation for using gids other than the one at index 0.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 examples/devinfo.c  |   14 +++
 examples/pingpong.c |   31 
 examples/pingpong.h |4 ++
 examples/rc_pingpong.c  |   91 ++
 examples/srq_pingpong.c |   84 ---
 examples/uc_pingpong.c  |   82 +++---
 examples/ud_pingpong.c  |   81 ++
 7 files changed, 297 insertions(+), 90 deletions(-)

diff --git a/examples/devinfo.c b/examples/devinfo.c
index 84f95c7..393ec04 100644
--- a/examples/devinfo.c
+++ b/examples/devinfo.c
@@ -184,6 +184,19 @@ static int print_all_port_gids(struct ibv_context *ctx, 
uint8_t port_num, int tb
return rc;
 }
 
+static const char *link_layer_str(uint8_t link_layer)
+{
+   switch (link_layer) {
+   case IBV_LINK_LAYER_UNSPECIFIED:
+   case IBV_LINK_LAYER_INFINIBAND:
+   return IB;
+   case IBV_LINK_LAYER_ETHERNET:
+   return Ethernet;
+   default:
+   return Unknown;
+   }
+}
+
 static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port)
 {
struct ibv_context *ctx;
@@ -284,6 +297,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t 
ib_port)
printf(\t\t\tsm_lid:\t\t\t%d\n, port_attr.sm_lid);
printf(\t\t\tport_lid:\t\t%d\n, port_attr.lid);
printf(\t\t\tport_lmc:\t\t0x%02x\n, port_attr.lmc);
+   printf(\t\t\tlink_layer:\t\t%s\n, 
link_layer_str(port_attr.link_layer));
 
if (verbose) {
printf(\t\t\tmax_msg_sz:\t\t0x%x\n, 
port_attr.max_msg_sz);
diff --git a/examples/pingpong.c b/examples/pingpong.c
index b916f59..806f446 100644
--- a/examples/pingpong.c
+++ b/examples/pingpong.c
@@ -31,6 +31,10 @@
  */
 
 #include pingpong.h
+#include arpa/inet.h
+#include stdlib.h
+#include stdio.h
+#include string.h
 
 enum ibv_mtu pp_mtu_to_enum(int mtu)
 {
@@ -53,3 +57,30 @@ uint16_t pp_get_local_lid(struct ibv_context *context, int 
port)
 
return attr.lid;
 }
+
+int pp_get_port_info(struct ibv_context *context, int port,
+struct ibv_port_attr *attr)
+{
+   return ibv_query_port(context, port, attr);
+}
+
+void wire_gid_to_gid(const char *wgid, union ibv_gid *gid)
+{
+   char tmp[9];
+   uint32_t v32;
+   int i;
+
+   for (tmp[8] = 0, i = 0; i  4; ++i) {
+   memcpy(tmp, wgid + i * 8, 8);
+   sscanf(tmp, %x, v32);
+   *(uint32_t *)(gid-raw[i * 4]) = ntohl(v32);
+   }
+}
+
+void gid_to_wire_gid(const union ibv_gid *gid, char wgid[])
+{
+   int i;
+
+   for (i = 0; i  4; ++i)
+   sprintf(wgid[i * 8], %08x, htonl(*(uint32_t *)(gid-raw + i 
* 4)));
+}
diff --git a/examples/pingpong.h b/examples/pingpong.h
index 71d7c3f..9cdc03e 100644
--- a/examples/pingpong.h
+++ b/examples/pingpong.h
@@ -37,5 +37,9 @@
 
 enum ibv_mtu pp_mtu_to_enum(int mtu);
 uint16_t pp_get_local_lid(struct ibv_context *context, int port);
+int pp_get_port_info(struct ibv_context *context, int port,
+struct ibv_port_attr *attr);
+void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);
+void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);
 
 #endif /* IBV_PINGPONG_H */
diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c
index fa969e0..a63905d 100644
--- a/examples/rc_pingpong.c
+++ b/examples/rc_pingpong.c
@@ -67,17 +67,19 @@ struct pingpong_context {
int  size;
int  rx_depth;
int  pending;
+   struct ibv_port_attr portinfo;
 };
 
 struct pingpong_dest {
int lid;
int qpn;
int psn;
+   union ibv_gid gid;
 };
 
 static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn,
  enum ibv_mtu mtu, int sl,
- struct pingpong_dest *dest)
+ struct pingpong_dest *dest, int sgid_idx)
 {
struct ibv_qp_attr attr = {
.qp_state   = IBV_QPS_RTR,
@@ -94,6 +96,13 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int 
port, int my_psn,
.port_num   = port
}
};
+
+   if (dest-gid.global.interface_id) {
+   attr.ah_attr.is_global = 1;
+   attr.ah_attr.grh.hop_limit = 1;
+   attr.ah_attr.grh.dgid = dest-gid;
+   attr.ah_attr.grh.sgid_index = sgid_idx

[ewg] [PATCHv8 2/2] libmlx4: Add Eth devices to ib devices list

2010-02-18 Thread Eli Cohen
With the new IBoE implementation, Ethernet devices expose also IB devices.
Update the list of supported devices with that of the kernel.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 src/mlx4.c |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/src/mlx4.c b/src/mlx4.c
index 1295c53..6068208 100644
--- a/src/mlx4.c
+++ b/src/mlx4.c
@@ -66,6 +66,13 @@ struct {
HCA(MELLANOX, 0x6354),  /* MT25408 Hermon QDR */
HCA(MELLANOX, 0x6732),  /* MT25408 Hermon DDR PCIe gen2 */
HCA(MELLANOX, 0x673c),  /* MT25408 Hermon QDR PCIe gen2 */
+   HCA(MELLANOX, 0x6368), /* MT25448 [ConnectX EN 10GigE, PCIe 2.0 
2.5GT/s] */
+   HCA(MELLANOX, 0x6750), /* MT26448 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] 
*/
+   HCA(MELLANOX, 0x6372), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe 
2.0 2.5GT/s] */
+   HCA(MELLANOX, 0x675a), /* MT25408 [ConnectX EN 10GigE 10GBaseT, PCIe 
Gen2 5GT/s] */
+   HCA(MELLANOX, 0x6764), /* MT26468 [ConnectX EN 10GigE, PCIe 2.0 5GT/s] 
*/
+   HCA(MELLANOX, 0x6746), /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */
+   HCA(MELLANOX, 0x676e), /* MT26478 ConnectX2 40GigE PCIe gen2 */
 };
 
 static struct ibv_context_ops mlx4_ctx_ops = {
-- 
1.7.0

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv8 1/2] libmlx4: Add IBoE support

2010-02-18 Thread Eli Cohen
Modify libmlx4 to support IBoE. The change involves retrieving the ethernet
layer 2 address of a port based on its GID and source index through a new
system call, ibv_cmd_get_eth_l2_addr(), and embedding the layer 2 information
in the address vector representation of mlx4.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 src/mlx4.h  |4 
 src/qp.c|8 +++-
 src/verbs.c |   34 ++
 src/wqe.h   |6 --
 4 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/mlx4.h b/src/mlx4.h
index 4445998..4b12456 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -236,11 +236,15 @@ struct mlx4_av {
uint8_t hop_limit;
uint32_tsl_tclass_flowlabel;
uint8_t dgid[16];
+   uint8_t mac[8];
 };
 
 struct mlx4_ah {
struct ibv_ah   ibv_ah;
struct mlx4_av  av;
+   uint16_tvlan;
+   uint8_t mac[6];
+   uint8_t tagged;
 };
 
 static inline unsigned long align(unsigned long val, unsigned long align)
diff --git a/src/qp.c b/src/qp.c
index d194ae3..fa70889 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -143,6 +143,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg 
*dseg,
memcpy(dseg-av, to_mah(wr-wr.ud.ah)-av, sizeof (struct mlx4_av));
dseg-dqpn = htonl(wr-wr.ud.remote_qpn);
dseg-qkey = htonl(wr-wr.ud.remote_qkey);
+   dseg-vlan = htons(to_mah(wr-wr.ud.ah)-vlan);
+   memcpy(dseg-mac, to_mah(wr-wr.ud.ah)-mac, 6);
 }
 
 static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
@@ -281,6 +283,10 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr 
*wr,
set_datagram_seg(wqe, wr);
wqe  += sizeof (struct mlx4_wqe_datagram_seg);
size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+   if (to_mah(wr-wr.ud.ah)-tagged) {
+   ctrl-ins_vlan = 1  6;
+   ctrl-vlan_tag = 
htons(to_mah(wr-wr.ud.ah)-vlan);
+   }
break;
 
default:
@@ -393,7 +399,7 @@ out:
 
if (nreq == 1  inl  size  1  size  ctx-bf_buf_size / 16) {
ctrl-owner_opcode |= htonl((qp-sq.head  0x)  8);
-   *(uint32_t *) ctrl-reserved |= qp-doorbell_qpn;
+   *(uint32_t *) (ctrl-vlan_tag) |= qp-doorbell_qpn;
/*
 * Make sure that descriptor is written to memory
 * before writing to BlueFlame page.
diff --git a/src/verbs.c b/src/verbs.c
index 1ac1362..48731a7 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -614,9 +614,21 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp)
return 0;
 }
 
+static int mcast_mac(uint8_t *mac)
+{
+   int i;
+   uint8_t val = 0xff;
+
+   for (i = 0; i  6; ++i)
+   val = mac[i];
+
+   return val == 0xff;
+}
+
 struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
 {
struct mlx4_ah *ah;
+   struct ibv_port_attr port_attr;
 
ah = malloc(sizeof *ah);
if (!ah)
@@ -642,7 +654,29 @@ struct ibv_ah *mlx4_create_ah(struct ibv_pd *pd, struct 
ibv_ah_attr *attr)
memcpy(ah-av.dgid, attr-grh.dgid.raw, 16);
}
 
+   if (ibv_query_port(pd-context, attr-port_num, port_attr))
+   goto err;
+
+   if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
+   if (ibv_cmd_get_eth_l2_addr(pd, attr-port_num,
+   (const union ibv_gid *)ah-av.dgid,
+   attr-grh.sgid_index,
+   ah-mac, ah-vlan, ah-tagged))
+   goto err;
+
+   if (mcast_mac(ah-mac))
+   ah-av.dlid = htons(0xc000);
+   if (ah-tagged) {
+   ah-av.port_pd |= htonl(1  29);
+   ah-vlan |= (attr-sl  7)  13;
+   }
+   }
+
+
return ah-ibv_ah;
+err:
+   free(ah);
+   return NULL;
 }
 
 int mlx4_destroy_ah(struct ibv_ah *ah)
diff --git a/src/wqe.h b/src/wqe.h
index 6f7f309..1e6159c 100644
--- a/src/wqe.h
+++ b/src/wqe.h
@@ -54,7 +54,8 @@ enum {
 
 struct mlx4_wqe_ctrl_seg {
uint32_towner_opcode;
-   uint8_t reserved[3];
+   uint16_tvlan_tag;
+   uint8_t ins_vlan;
uint8_t fence_size;
/*
 * High 24 bits are SRC remote buffer; low 8 bits are flags:
@@ -78,7 +79,8 @@ struct mlx4_wqe_datagram_seg {
uint32_tav[8];
uint32_tdqpn;
uint32_tqkey;
-   uint32_treserved[2];
+   uint16_t

Re: [ewg] OFED-1.5.1 failure over iWarp

2010-02-04 Thread Eli Cohen
On Thu, Feb 04, 2010 at 09:46:58AM -0600, Steve Wise wrote:
 Never mind.  I see you already committed the change.  I just pulled
 the latest and rping works over iwarp.
 

Thanks for checking this.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] OFED-1.5.1 failure over iWarp

2010-02-03 Thread Eli Cohen
On Wed, Feb 03, 2010 at 09:17:57AM -0600, Steve Wise wrote:
 This patch didn't work.  I still get an address resolution error
 event with status -2.

Can you tell whether the bind succeeded? Can you give some more details
about the failure?
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] libmlx4 build error in 1-19 build

2010-01-21 Thread Eli Cohen
Hi Tung,

I just checked this on OFED-1.5.1-20100120-0600.tgz and it worked
without problem. Since nothing has changed in libmlx4 in the last few
days, it shouldn't matter which version I chose. I believe the problem
is with old header files in your system. Please remove old header
files under /usr/include/infiniband or /usr/local/include/infiniband.

The system I was using is sles11 x86_64.

On Wed, Jan 20, 2010 at 09:48:41AM -0700, Tung, Chien Tin wrote:
 I pulled down OFED 1.5.1 (OFED-1.5.1-20100119-0600.tgz, non-RDMAoE version) 
 and got this error (install.pl --hpc):
 
 gcc -DHAVE_CONFIG_H -I. -I. -I. -g -Wall -D_GNU_SOURCE -O2 -g -pipe -Wall 
 -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector 
 --param=ssp-buffer-size=4 -m64 -mtune=generic -MT buf.lo -MD -MP -MF 
 .deps/buf.Tpo -c src/buf.c -o buf.o /dev/null 21
 src/verbs.c: In function 'mlx4_create_ah':
 src/verbs.c:686: error: 'struct ibv_port_attr' has no member named 
 'link_layer'
 src/verbs.c:686: error: 'IBV_LINK_LAYER_ETHERNET' undeclared (first use in 
 this function)
 src/verbs.c:686: error: (Each undeclared identifier is reported only once
 src/verbs.c:686: error: for each function it appears in.)
 src/verbs.c:687: warning: implicit declaration of function 'ibv_cmd_get_mac'
 make[1]: *** [verbs.lo] Error 1
 make[1]: *** Waiting for unfinished jobs
 
 
 similar error in MVAPICH as well.
 
 Perhaps something from RDMAoE leaked into this one?
 
 Chien
 
 --
 Chien Tung | chien.tin.t...@intel.com
 
 
 
 ___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [Annonce] OFED-RDMAoE-1.5-rc4 is available

2009-12-15 Thread Eli Cohen
OFED-RDMAoE-1.5-rc4 is available

The tarball is available on:
http://www.openfabrics.org/downloads/OFED/ofed-rdmaoe-1.5/OFED-RDMAoE-1.5-rc4.tgz


To get BUILD_ID run ofed_info

Please report any issues in bugzilla https://bugs.openfabrics.org/ for
OFED 1.5, component RDMAoE


Release information:

Linux Operating Systems:
   - RedHat EL4 up6   2.6.9-67.ELsmp
   - RedHat EL4 up7   2.6.9-78.ELsmp
   - RedHat EL4 up8   2.6.9-89.ELsmp
   - RedHat EL5 up2   2.6.18-92.el5
   - RedHat EL5 up3   2.6.18-128.el5
   - RedHat EL5 up4   2.6.18-164.el5
   - SLES10 SP2   2.6.16.60-0.21-smp
   - SLES10 SP3   2.6.16.60-0.54-smp
   - SLES11   2.6.27.19-5-default
   - OEL 4 up72.6.9-78.ELsmp
   - OEL 5 up22.6.18-92.el5
   - CentOS5.22.6.18-92.el5
   - CentOS5.32.6.18-128.el5
   - Fedora Cor   2.6.29  *
   - OpenSuSE 11  2.6.25.5-1.1*
   - kernel.org   2.6.29 and 2.6.30


 * Minimal QA for these versions

Systems:
 * x86_64
 * x86
 * ia64
 * ppc64

Main changes from 1.5 rc3:

1. Updated packages:

libnes-1.0.0.tar.gz


- Management packages:
libibumad-1.3.3_20091204_86353ad.tar.gz
libibmad-1.3.3_20091204_86353ad.tar.gz
opensm-3.3.3_20091204_86353ad.tar.gz
infiniband-diags-1.5.3_20091204_86353ad.tar.gz


- MPI packages:
openmpi-1.4-1.src.rpm


2. Bug fixes (attached log file)
Pull bug fixes from ofed_kernel_1_5


Limitations:

- SLES10 SP3 on IA64 is not supported yet


Notes:
RDMAoE and mlx4_en capabilities are supported only for x86 and x86_64
architectures.

commit f0923c26074cf44e64e70e9b08050cfd13c6207e
Merge: 2b8d43e 028c20d
Author: Eli Cohen e...@mellanox.co.il
Date:   Thu Dec 10 15:38:32 2009 +0200

Merge branch 'ofed_kernel_1_5' into rdmaoe

commit 028c20d5e069a486aa23ef12ab3ff1707cac9cd8
Author: Eli Cohen e...@mellanox.co.il
Date:   Thu Dec 10 15:18:22 2009 +0200

Revert mlx4_en: Controlling pfc parameters through sysfs interface

This reverts commit 62df0ecb9442f9c61ad7d1169bf8cba55504d073.

commit 8b554a1e3493e3aeab93b04f2f22fa5a9b4983df
Author: Eli Cohen e...@mellanox.co.il
Date:   Thu Dec 10 15:17:19 2009 +0200

Revert mlx4_en: fix bug in mlx4_en_init_netdev

This reverts commit 8b276e57a9eca2f1b3f118db6ec1143cf15d06fe.

commit 003f147a4dce39bfadd10d253c75f3b799ae4142
Author: Eli Cohen e...@mellanox.co.il
Date:   Thu Dec 10 10:32:09 2009 +0200

Fix backports for 2.6.9 kernels

Signed-off-by: Eli Cohen e...@mellanox.co.il

commit 2dd9441b2784e1d36d3025693f19a950cb1b39b8
Merge: 5f541de f4996b2
Author: Vladimir Sokolovsky v...@mellanox.co.il
Date:   Thu Dec 10 09:37:56 2009 +0200

Merge branch 'ofed_kernel_1_5' of 
git://sofa.openfabrics.org/~ctung/ofed-1.5 into ofed_kernel_1_5

commit f4996b2c3b72e2cb4c9b99e59f704adff7baad20
Author: Chien Tung chien.tin.t...@intel.com
Date:   Wed Dec 9 20:47:19 2009 -0600

RDMA/nes: udpate backports for RC4

Signed-off-by: Chien Tung chien.tin.t...@intel.com

commit ac0fd81fbc830f15e3e76d7b4baa2ceb0bfad7b2
Author: Chien Tung chien.tin.t...@intel.com
Date:   Wed Dec 9 20:25:58 2009 -0600

RDMA/nes: update for OFED 1.5 RC4

add:

nes_0019_max_cqe.patch - MAX CQE check on cq create.
nes_0020_copyright.patch - update copyright
nes_0021_stale_arp.patch - stale ARP fix
nes_0022_sync_with_kernel_org.patch - match up with kernel.org content

Signed-off-by: Chien Tung chien.tin.t...@intel.com

commit 5f541de203009c2bcc13c110f8b70cb976b0e0d2
Author: Ralph Campbell (QLogic) ral...@hosting.openfabrics.org
Date:   Wed Dec 9 12:16:03 2009 -0800

IB/qib: fix backport patches for RHEL 5

The RHEL 5 backports for the IPoIB device stop deadlock commit had
a bug where the wrong work queue was being flushed.
This updates the patches to use the correct flush.

Signed-off-by: Ralph Campbell ralph.campb...@qlogic.com

commit 2b8d43ea78ca8f7f20cc0251ee450183f33ccc92
Merge: 5ed038c c4ce625
Author: Eli Cohen e...@mellanox.co.il
Date:   Wed Dec 9 17:38:51 2009 +0200

Merge branch 'ofed_kernel_1_5' into rdmaoe

commit c4ce6250a52e5a2087bff72ca9ff0de6e57da169
Author: Eli Cohen e...@mellanox.co.il
Date:   Wed Dec 9 17:07:17 2009 +0200

Fix type of trigger_port()

Signed-off-by: Eli Cohen e...@mellanox.co.il

commit 9662bcf2f8ab7b518b73bb9b2bd0b00646bd7ec7
Author: Eli Cohen e...@mellanox.co.il
Date:   Wed Dec 9 16:49:40 2009 +0200

mlx4_core: Bug fix in error flow

If mlx4_init_port_info() fails, cleanup only the initialized ports.

Signed-off-by: Eli Cohen e...@mellanox.co.il

commit 8b276e57a9eca2f1b3f118db6ec1143cf15d06fe
Author: Eli Cohen e...@mellanox.co.il
Date:   Wed Dec 9 16:37:38 2009 +0200

mlx4_en: fix bug in mlx4_en_init_netdev

device_create_file

Re: [ewg] Re: [PATCH] rdmaoe/libibverbs: handle binary compatibility

2009-12-14 Thread Eli Cohen
On Thu, Dec 10, 2009 at 02:49:45PM -0700, Jason Gunthorpe wrote:
 
 - Change the library API for ibv_port_attr to include a link_layer
 - Change the kernel API to retrieve link_layer
 - Add ibv_cmd_get_mac and other stuff to support RDMAoE
 - Update verbs examples to support RDMAoE
 - Update man pages (you missed these)
 

Will be addressed in the next patch set.
 --- a/include/infiniband/kern-abi.h
 +++ b/include/infiniband/kern-abi.h
 @@ -46,7 +46,7 @@
   * The minimum and maximum kernel ABI that we can handle.
   */
   #define IB_USER_VERBS_MIN_ABI_VERSION  1
  -#define IB_USER_VERBS_MAX_ABI_VERSION  6
  +#define IB_USER_VERBS_MAX_ABI_VERSION  7
  
 Whats this about? That seems like it needs a much bigger review,
 changing the kernel ABI version instantly breaks every existing
 libibverbs, shouldn't be done without alot of discussion!!
I think we can do without changing the ABI version so I am going to
omit it in the next patch set.

 
 Extra include?

Yes, thanks.


 
 @@ -86,6 +86,7 @@ default_symver(__ibv_query_device, ibv_query_device);
  int __ibv_query_port(struct ibv_context *context, uint8_t port_num,
  struct ibv_port_attr *port_attr)
  {
 +   port_attr-link_layer = IBV_LINK_LAYER_UNSPECIFIED;
 return context-ops.query_port(context, port_num, port_attr);
  }
 
 Seems like this should be just
  return ___ibv_query_port(context,port_num,port_attr);
 

A leftover, thanks.

  diff --git a/drivers/infiniband/core/uverbs_cmd.c 
  b/drivers/infiniband/core/uverbs_cmd.c
  index 012aadf..d592bd2 100644
  +++ b/drivers/infiniband/core/uverbs_cmd.c
  @@ -452,7 +452,8 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file 
  *file,
  resp.active_width= attr.active_width;
  resp.active_speed= attr.active_speed;
  resp.phys_state  = attr.phys_state;
  -   resp.transport   = attr.transport;
  +   resp.transport   = attr.transport == RDMA_TRANSPORT_RDMAOE ?
  +   IB_LINK_LAYER_ETHERNET : IB_LINK_LAYER_INFINIBAND;
 
 Are you going to change the kernel patches to use the new link_layer
 name?
 

Yes, in the next patch set.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] Re: [PATCH] rdmaoe/libibverbs: handle binary compatibility

2009-12-14 Thread Eli Cohen
On Thu, Dec 10, 2009 at 01:50:00PM -0800, Sean Hefty wrote:
 @@ -306,7 +314,7 @@ static struct pingpong_context *pp_init_ctx(struct
 ibv_device *ib_dev, int size,
  return NULL;
  }
 
 -memset(ctx-buf, 0, size);
 +memset(ctx-buf, 0x7b + is_server, size);
 
  ctx-context = ibv_open_device(ib_dev);
  if (!ctx-context) {
 @@ -481,6 +489,7 @@ static void usage(const char *argv0)
  printf(  -n, --iters=itersnumber of exchanges (default 1000)\n);
  printf(  -l, --sl=sl  service level value\n);
  printf(  -e, --events   sleep on CQ events (default poll)\n);
 +printf(  -g, --gid=remote gid gid of the other port\n);
 
 It seems easier for the user if the tests discovered its local GID and 
 exchanged
 it with the remote side like it does with the LID and QPN.
  

Will be changed in the next patch set -- the user will specify the
index to the GID table of the local port.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH] IPoIB: synchronize timer deletion with completion handler

2009-12-10 Thread Eli Cohen
On Wed, Dec 09, 2009 at 02:39:49PM -0800, Roland Dreier wrote:
 
   When calling del_timer_sync on priv-poll_timer, it is necessary to prevent
   farther arming of the timer which can be done by a completion handler. 
 Though
   it is harmless since the timer will eventually stop being rearmed, it is 
 better
   practice to avoid calling the timer function after it is deleted. This 
 patch
   handles this by using a new flag that is checked before arming the timer.
 
 have you seen this in practice?  If it can happen then it's not
 harmless, since the module could be unloaded with the timer pending.
 However I don't see how it could happen, since we only seem to delete
 the timer after we know that no more completions are coming (except for
 the case where we decide that the hardware is wedged but it really only
 takes a *long* time to respond at exactly the wrong time, and we somehow
 get a completion between the del_timer_sync and the modify QP to reset
 state -- which is so unlikely it seems not worth adding this extra
 complexity for -- maybe we could add the del_timer_sync to after we
 delete the CQ or something if you're really worried)

No, I haven't actually seen this happening but I think it could.
Consider this scenario:

1. ipoib_send() calls ib_req_notify_cq()
2. ipoib_ib_dev_stop() gets called
3. All pending WR complete; interrupt is generated but not yet
serviced.
4. ipoib_ib_dev_stop() calls del_timer_sync() and passes it.
5. Completion handler is finally serviced - it schedules
ipoib_ib_tx_timer_func()
6. IPoIB unloads and we're in trouble.

Not so likely to happen indeed. Your suggestion to defer deleting the
timer until after the CQ is destroyed will also solve this.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH] mlx4: Fix bug in mlx4_ib_mcg_attach

2009-12-10 Thread Eli Cohen
On Wed, Dec 09, 2009 at 02:49:47PM -0800, Roland Dreier wrote:
 
 I don't see anything touching this code there.  The patch that
 introduced this code upstream, 521e575b (IB/mlx4: Add support for
 blocking multicast loopback packets) doesn't have this bug and I don't
 see anything else that changed that area of the code.
 

Your branch, my bug :-) I put it in one of my commits.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCH v2] mlx4_core: Cleanup bug in __mlx4_init_one()

2009-12-10 Thread Eli Cohen
If mlx4_init_port_info() fails, cleanup the initialized ports only.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
Changes: rebase the patch on Roland's for-next branch

 drivers/net/mlx4/main.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 291a505..3cf56d9 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -1174,7 +1174,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const 
struct pci_device_id *id)
return 0;
 
 err_port:
-   for (port = 1; port = dev-caps.num_ports; port++)
+   for (--port; port = 1; --port)
mlx4_cleanup_port_info(priv-port[port]);
 
mlx4_cleanup_mcg_table(dev);
-- 
1.6.5.5

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCH] rdmaoe/libibverbs: handle binary compatibility

2009-12-10 Thread Eli Cohen
Hi,
here is the patch I prepared based on the discussions we had. The patch is
based on the last rdmaoe/libibverbs patch I sent. libmlx4 was modified too,
a trivial change that just changes a name. Both fixes were pushed to OFED.
I will send a new patch set in a few days.

The introduction of a new field to struct ibv_port_attr to distinguish between
the different link layers supported (IB or Ethernet), which is located in the
padding area of the struct, poses a binary compatibility issue between new
applications (compiled against the new version of the struct) and older
libraries, because old libraries do not initialize the padding area. To
overcome this, ibv_query_port is redefined to call an inline function that sets
the link_layer field to a predefined value which is interpreted as IB (the
previous only value for this field). The patch also clears any padding left
should we need this in the future. This patch also changes the previous field
name from protocol to link_layer, and also modifies all the examples to adapt
to the change.

Solution suggested by:
Roland Dreier rola...@cisco.com
Jason Gunthorpe jguntho...@obsidianresearch.com
Signed-off-by: Eli Cohen e...@mellanox.co.il

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 examples/devinfo.c|   15 +++
 examples/rc_pingpong.c|2 +-
 examples/srq_pingpong.c   |2 +-
 examples/uc_pingpong.c|2 +-
 examples/ud_pingpong.c|2 +-
 include/infiniband/kern-abi.h |2 +-
 include/infiniband/verbs.h|   33 -
 src/cmd.c |2 +-
 src/verbs.c   |1 +
 9 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/examples/devinfo.c b/examples/devinfo.c
index 88e0557..393ec04 100644
--- a/examples/devinfo.c
+++ b/examples/devinfo.c
@@ -184,15 +184,14 @@ static int print_all_port_gids(struct ibv_context *ctx, 
uint8_t port_num, int tb
return rc;
 }
 
-static const char *transport_type_str(enum rdma_transport_type type)
+static const char *link_layer_str(uint8_t link_layer)
 {
-   switch (type) {
-   case RDMA_TRANSPORT_IB:
+   switch (link_layer) {
+   case IBV_LINK_LAYER_UNSPECIFIED:
+   case IBV_LINK_LAYER_INFINIBAND:
return IB;
-   case RDMA_TRANSPORT_IWARP:
-   return IWARP;
-   case RDMA_TRANSPORT_RDMAOE:
-   return RDMAOE;
+   case IBV_LINK_LAYER_ETHERNET:
+   return Ethernet;
default:
return Unknown;
}
@@ -298,7 +297,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t 
ib_port)
printf(\t\t\tsm_lid:\t\t\t%d\n, port_attr.sm_lid);
printf(\t\t\tport_lid:\t\t%d\n, port_attr.lid);
printf(\t\t\tport_lmc:\t\t0x%02x\n, port_attr.lmc);
-   printf(\t\t\ttrasnport_type:\t\t%s\n, 
transport_type_str(port_attr.transport));
+   printf(\t\t\tlink_layer:\t\t%s\n, 
link_layer_str(port_attr.link_layer));
 
if (verbose) {
printf(\t\t\tmax_msg_sz:\t\t0x%x\n, 
port_attr.max_msg_sz);
diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c
index e0cde29..4d0bd0d 100644
--- a/examples/rc_pingpong.c
+++ b/examples/rc_pingpong.c
@@ -653,7 +653,7 @@ int main(int argc, char *argv[])
}
 
my_dest.lid = ctx-portinfo.lid;
-   if (ctx-portinfo.transport == RDMA_TRANSPORT_RDMAOE) {
+   if (ctx-portinfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
if (!grh) {
fprintf(stderr, Must supply remote gid\n);
return 1;
diff --git a/examples/srq_pingpong.c b/examples/srq_pingpong.c
index bd10f90..eda9013 100644
--- a/examples/srq_pingpong.c
+++ b/examples/srq_pingpong.c
@@ -744,7 +744,7 @@ int main(int argc, char *argv[])
for (i = 0; i  num_qp; ++i) {
my_dest[i].qpn = ctx-qp[i]-qp_num;
my_dest[i].psn = lrand48()  0xff;
-   if (ctx-portinfo.transport == RDMA_TRANSPORT_RDMAOE) {
+   if (ctx-portinfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
if (!grh) {
fprintf(stderr, Must supply remote gid\n);
return 1;
diff --git a/examples/uc_pingpong.c b/examples/uc_pingpong.c
index 6cfffd2..2bc7da5 100644
--- a/examples/uc_pingpong.c
+++ b/examples/uc_pingpong.c
@@ -641,7 +641,7 @@ int main(int argc, char *argv[])
}
 
my_dest.lid = ctx-portinfo.lid;
-   if (ctx-portinfo.transport == RDMA_TRANSPORT_RDMAOE) {
+   if (ctx-portinfo.link_layer == IBV_LINK_LAYER_ETHERNET) {
if (!grh) {
fprintf(stderr, Must supply remote gid\n);
return 1;
diff --git a/examples/ud_pingpong.c b/examples/ud_pingpong.c
index 33601a3..e30d6d6 100644
--- a/examples/ud_pingpong.c
+++ b/examples/ud_pingpong.c
@@ -641,7 +641,7

Re: [ewg] Re: [PATCH] rdmaoe/libibverbs: handle binary compatibility

2009-12-10 Thread Eli Cohen
On Thu, Dec 10, 2009 at 10:33:53AM -0700, Jason Gunthorpe wrote:
 Could you prepare this based on Roland's tree? This patch won't apply.
 

I quote two patches, one for libibverbs based on 74638ac, and the
other for libmlx4 based on 444f634. I changed the padding handling as
you requested for libibverbs. You also need to apply a patch to the
kernel driver to match the new values for link_layer. I put it here
too.

libibverbs:


diff --git a/examples/devinfo.c b/examples/devinfo.c
index 84f95c7..393ec04 100644
--- a/examples/devinfo.c
+++ b/examples/devinfo.c
@@ -184,6 +184,19 @@ static int print_all_port_gids(struct ibv_context *ctx, 
uint8_t port_num, int tb
return rc;
 }
 
+static const char *link_layer_str(uint8_t link_layer)
+{
+   switch (link_layer) {
+   case IBV_LINK_LAYER_UNSPECIFIED:
+   case IBV_LINK_LAYER_INFINIBAND:
+   return IB;
+   case IBV_LINK_LAYER_ETHERNET:
+   return Ethernet;
+   default:
+   return Unknown;
+   }
+}
+
 static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port)
 {
struct ibv_context *ctx;
@@ -284,6 +297,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t 
ib_port)
printf(\t\t\tsm_lid:\t\t\t%d\n, port_attr.sm_lid);
printf(\t\t\tport_lid:\t\t%d\n, port_attr.lid);
printf(\t\t\tport_lmc:\t\t0x%02x\n, port_attr.lmc);
+   printf(\t\t\tlink_layer:\t\t%s\n, 
link_layer_str(port_attr.link_layer));
 
if (verbose) {
printf(\t\t\tmax_msg_sz:\t\t0x%x\n, 
port_attr.max_msg_sz);
diff --git a/examples/pingpong.c b/examples/pingpong.c
index b916f59..d4a46e4 100644
--- a/examples/pingpong.c
+++ b/examples/pingpong.c
@@ -31,6 +31,8 @@
  */
 
 #include pingpong.h
+#include arpa/inet.h
+#include stdlib.h
 
 enum ibv_mtu pp_mtu_to_enum(int mtu)
 {
@@ -53,3 +55,10 @@ uint16_t pp_get_local_lid(struct ibv_context *context, int 
port)
 
return attr.lid;
 }
+
+int pp_get_port_info(struct ibv_context *context, int port,
+struct ibv_port_attr *attr)
+{
+   return ibv_query_port(context, port, attr);
+}
+
diff --git a/examples/pingpong.h b/examples/pingpong.h
index 71d7c3f..16d3466 100644
--- a/examples/pingpong.h
+++ b/examples/pingpong.h
@@ -37,5 +37,7 @@
 
 enum ibv_mtu pp_mtu_to_enum(int mtu);
 uint16_t pp_get_local_lid(struct ibv_context *context, int port);
+int pp_get_port_info(struct ibv_context *context, int port,
+struct ibv_port_attr *attr);
 
 #endif /* IBV_PINGPONG_H */
diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c
index fa969e0..4d0bd0d 100644
--- a/examples/rc_pingpong.c
+++ b/examples/rc_pingpong.c
@@ -67,6 +67,8 @@ struct pingpong_context {
int  size;
int  rx_depth;
int  pending;
+   struct ibv_port_attr portinfo;
+   union ibv_giddgid;
 };
 
 struct pingpong_dest {
@@ -94,6 +96,12 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int 
port, int my_psn,
.port_num   = port
}
};
+
+   if (ctx-dgid.global.interface_id) {
+   attr.ah_attr.is_global = 1;
+   attr.ah_attr.grh.hop_limit = 1;
+   attr.ah_attr.grh.dgid = ctx-dgid;
+   }
if (ibv_modify_qp(ctx-qp, attr,
  IBV_QP_STATE  |
  IBV_QP_AV |
@@ -289,11 +297,11 @@ out:
 
 static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int 
size,
int rx_depth, int port,
-   int use_event)
+   int use_event, int is_server)
 {
struct pingpong_context *ctx;
 
-   ctx = malloc(sizeof *ctx);
+   ctx = calloc(1, sizeof *ctx);
if (!ctx)
return NULL;
 
@@ -306,7 +314,7 @@ static struct pingpong_context *pp_init_ctx(struct 
ibv_device *ib_dev, int size,
return NULL;
}
 
-   memset(ctx-buf, 0, size);
+   memset(ctx-buf, 0x7b + is_server, size);
 
ctx-context = ibv_open_device(ib_dev);
if (!ctx-context) {
@@ -481,6 +489,7 @@ static void usage(const char *argv0)
printf(  -n, --iters=itersnumber of exchanges (default 1000)\n);
printf(  -l, --sl=sl  service level value\n);
printf(  -e, --events   sleep on CQ events (default poll)\n);
+   printf(  -g, --gid=remote gid gid of the other port\n);
 }
 
 int main(int argc, char *argv[])
@@ -504,6 +513,7 @@ int main(int argc, char *argv[])
int  rcnt, scnt;
int  num_cq_events = 0;
int  sl = 0;
+   char*grh = NULL;
 
srand48(getpid() * time(NULL));
 
@@ -520,10 +530,11 @@ int 

[ewg] [PATCH] mlx4_core: Cleanup bug in __mlx4_init_one()

2009-12-09 Thread Eli Cohen
If mlx4_init_port_info() fails, cleanup the initialized ports only.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/net/mlx4/main.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 6f5a3cf..0c868c9 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -1294,7 +1294,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const 
struct pci_device_id *id)
return 0;
 
 err_port:
-   for (port = 1; port = dev-caps.num_ports; port++)
+   for (--port; port = 1; --port)
mlx4_cleanup_port_info(priv-port[port]);
 
mlx4_cleanup_counters_table(dev);
-- 
1.6.5.5

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] Re: [PATCH] mlx4_core: Cleanup bug in __mlx4_init_one()

2009-12-09 Thread Eli Cohen
On Wed, Dec 09, 2009 at 02:35:18PM -0800, Roland Dreier wrote:
 Looks like a valid fix but your patch doesn't apply.  I don't seem to
 have mlx4_cleanup_counters_table() in my tree.

I noticed that already. I'll send a new patch in my next working day.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [PATCH] mlx4: Fix bug in mlx4_ib_mcg_attach

2009-12-09 Thread Eli Cohen
On Wed, Dec 09, 2009 at 02:33:31PM -0800, Roland Dreier wrote:
 This bug doesn't seem to ever have been present in the upstream
 kernel -- what are you generating this patch against?
 

I think it came from your for-next branch.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCH] mlx4: Fix bug in mlx4_ib_mcg_attach

2009-12-08 Thread Eli Cohen
The ! operator has precedence over the & operator, so parentheses are
required to properly evaluate the user's loopback requirement.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/hw/mlx4/main.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 0f6ef38..736eea0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -586,8 +586,8 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union 
ib_gid *gid, u16 lid)
struct mlx4_ib_dev *mdev = to_mdev(ibqp-device);
struct mlx4_ib_qp *mqp = to_mqp(ibqp);
 
-   err = mlx4_multicast_attach(mdev-dev, mqp-mqp, gid-raw, 
!!mqp-flags 
-   MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK);
+   err = mlx4_multicast_attach(mdev-dev, mqp-mqp, gid-raw, 
!!(mqp-flags 
+   MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK));
if (err)
return err;
 
-- 
1.6.5.5

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] Strange behavior with IPoIB in today's Dec 1 OFED daily build

2009-12-02 Thread Eli Cohen
Did you check that ib0 is up on c0-6?

On Tue, Dec 01, 2009 at 03:36:12PM -0800, Woodruff, Robert J wrote:
 I loaded today's daily build on RedHat EL 5.4 and
 I am seeing some strange behavior with the route table
 that is generated.
 
 my ifcfg-ib0 script looks like this
 
 DEVICE=ib0
 BOOTPROTO=static
 IPADDR=169.254.104.248
 ONBOOT=yes
 
 On OFED-1.5-rc2, the route table looks like this, which is what I expect.
 
 
 [r...@compute-0-7 woody]# netstat -r
 Kernel IP routing table
 Destination Gateway Genmask Flags   MSS Window  irtt Iface
 192.168.25.0*   255.255.255.0   U 0 0  0 eth0
 169.254.0.0 *   255.255.0.0 U 0 0  0 ib0
 default cst-head.local  0.0.0.0 UG0 0  0 eth0
 
 On today's daily build, the route table seems to be routing the IPoIB traffic
 out eth0 instead of ib0.
 
 [r...@compute-0-7 woody]# ssh c0-6
 r...@c0-6's password: 
 Last login: Tue Dec  1 11:09:10 2009 from compute-0-5.local
 [r...@compute-0-6 ~]# netstat -r
 Kernel IP routing table
 Destination Gateway Genmask Flags   MSS Window  irtt Iface
 192.168.25.0*   255.255.255.0   U 0 0  0 eth0
 169.254.0.0 *   255.255.0.0 U 0 0  0 eth0 
-- should be ib0
 default cst-head.local  0.0.0.0 UG0 0  0 
 eth0___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] [ANNOUNCE] OFED-RDMAoE-1.5-rc3 release is available

2009-12-01 Thread Eli Cohen
Hi,
I am resending this mail with just the title fixed to rc3.

On Tue, Dec 01, 2009 at 06:41:37PM +0200, Eli Cohen wrote:
 OFED-RDMAoE-1.5-rc3 is available
 
 The tarball is available on:
 http://www.openfabrics.org/downloads/OFED/ofed-rdmaoe-1.5/OFED-RDMAoE-1.5-rc3.tgz
 
 
 To get BUILD_ID run ofed_info
 
 Please report any issues in bugzilla https://bugs.openfabrics.org/ for
 OFED 1.5, component RDMAoE
 
 
 Release information:
 
 Linux Operating Systems:
- RedHat EL4 up6   2.6.9-67.ELsmp
- RedHat EL4 up7   2.6.9-78.ELsmp
- RedHat EL4 up8   2.6.9-89.ELsmp
- RedHat EL5 up2   2.6.18-92.el5
- RedHat EL5 up3   2.6.18-128.el5
- RedHat EL5 up4   2.6.18-164.el5
- SLES10 SP2   2.6.16.60-0.21-smp
- SLES10 SP3   2.6.16.60-0.54-smp
- SLES11   2.6.27.19-5-default
- OEL 4 up72.6.9-78.ELsmp
- OEL 5 up22.6.18-92.el5
- CentOS5.22.6.18-92.el5
- CentOS5.32.6.18-128.el5
- Fedora Cor   2.6.29  *
- OpenSuSE 11  2.6.25.5-1.1*
- kernel.org   2.6.29 and 2.6.30
 
 
  * Minimal QA for these versions
 
 Systems:
  * x86_64
  * x86
  * ia64
  * ppc64
 
 Main changes from 1.5 rc2:
 
 1. Updated packages:
 
 libibverbs-1.1.2-0.8.g66ece2f.tar.gz
 libipathverbs-1.2.tar.gz
 dapl-2.0.25.tar.gz
 compat-dapl-1.2.15.tar.gz
 ib-bonding-0.9.0-41.src.rpm
 rnfs-utils-1.1.5-10.OFED.src.rpm
 
 - Management packages:
 libibumad-1.3.3_20091130_4ea1c4d.tar.gz
 libibmad-1.3.3_20091130_4ea1c4d.tar.gz
 opensm-3.3.3_20091130_4ea1c4d.tar.gz
 infiniband-diags-1.5.3_20091130_4ea1c4d.tar.gz
 ibutils-1.2-0.1.ge8e69b7.tar.gz
 
 - MPI packages:
 mvapich2-1.4-2.src.rpm
 mpitests-3.2-916.src.rpm (Updated IMB and OSU benchmark tests)
 
 2. Bug fixes (attached log file)
 
 
 Limitations:
 
 - SLES10 SP3 on IA64 is not supported yet
 
 
 Notes:
 RDMAoE and mlx4_en capabilities are supported only for x86 and x86_64
 architectures.
 
 

 commit 437d9b4dfd4dbd58a8e52348c6f12b70ebf9bb1d
 Merge: 5c0b86c... b9e3b12...
 Author: Eli Cohen e...@mellanox.co.il
 Date:   Tue Dec 1 11:02:05 2009 +0200
 
 Merge branch 'ofed_kernel_1_5' into rdmaoe
 
 commit b9e3b127bc904ef0bd9a3efb77bdf0c66a85f386
 Author: Yevgeny Petrilin yevge...@mellanox.co.il
 Date:   Tue Dec 1 10:27:50 2009 +0200
 
 mlx4_core: Using selected vector number when setting EQ in cqc
 
 Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il
 
 commit 56bfadef0b8fd97cfcd6b202f84d566ce6e9ccd3
 Author: Amir Vadai am...@mellanox.co.il
 Date:   Mon Nov 30 16:14:14 2009 +0200
 
 sdp: cancel_work_sync on 2.6.22 didn't return a value
 
 Since the return value was used for debug message only,
 gave up this message to make backports easier.
 
 Signed-off-by: Amir Vadai am...@mellanox.co.il
 
 commit 5c0b86cb9540abe2d130769d932e9b2c482218f0
 Merge: 3dc9160... 56bfade...
 Author: Eli Cohen e...@mellanox.co.il
 Date:   Mon Nov 30 17:07:56 2009 +0200
 
 Merge branch 'ofed_kernel_1_5' into rdmaoe
 
 commit 3dc9160cac856a60f3ea04bd253cf9f436572b5a
 Merge: 4e401c5... c842ed7...
 Author: Eli Cohen e...@mellanox.co.il
 Date:   Mon Nov 30 16:37:33 2009 +0200
 
 Merge branch 'ofed_kernel_1_5' into rdmaoe
 
 commit 108d9abbab4251aa6806b57bf84a8ea63caa73d4
 Author: Amir Vadai am...@mellanox.co.il
 Date:   Mon Nov 30 15:04:05 2009 +0200
 
 sdp: fix compilation error on kernel 2.6.20
 
 Signed-off-by: Amir Vadai am...@mellanox.co.il
 
 commit c842ed79fe5811b2e42c3ea6fa5675125d5b4875
 Author: Yevgeny Petrilin yevge...@mellanox.co.il
 Date:   Mon Nov 30 09:47:45 2009 +0200
 
 mlx4: Fixed a bug in dynamic completion vector allocation
 
 The cq allocation function would return -EINVAL in case that it
 received MLX4_LEAST_ATTACHED_VECTOR as the vector number.
 
 Signed-off-by: Yevgeny Petrilin yevge...@mellanox.co.il
 
 commit 4e401c545d857ed52066c46a00c533b9f9a0facc
 Author: Eli Cohen e...@mellanox.co.il
 Date:   Sun Nov 29 12:17:55 2009 +0200
 
 From 0952e0395387e0cfcbeaa2232f92ee4545453014 Mon Sep 17 00:00:00 2001 
 From: Eli Cohen e...@mellanox.co.il Date: Sun, 29 Nov 2009 11:51:48 +0200 
 Subject: [PATCH 2/2] rdmaoe: Fix crash when using loopback
 
 When we use a loopback connection, we do not find the corresponding 
 netdev.
 Failure to do so caused a crash when trying to retrieve the MTU. Use the 
 max
 MTU for loopback devices.
 Note: loopback still does not work in this version due to FW 
 configuration but
 the system will not crash.
 
 Signed-off-by: Eli Cohen e...@mellanox.co.il
 
 commit f44485baeb44593e0faaa27e0a500c38add138e8
 Author: Eli Cohen e...@mellanox.co.il
 Date:   Sun Nov 29 12:11:46 2009 +0200
 
 From c057fed4df8bccefdcba0d8411f2e007e131d968 Mon

Re: [ewg] Re: [ofw] SC'09 BOF - Meeting notes

2009-11-23 Thread Eli Cohen
On Fri, Nov 20, 2009 at 01:38:59AM +0200, Or Gerlitz wrote:
 
 yes, this would be simply not supportable, think about that, you want
 to hand your customers with a code which didn't pass review nor
 acceptance by the Linux IB stack maintainers (Roland and Sean), say,
 next a crash happens at this or that module / line, next, what you
 expect the maintainers to do?
 
Saying that the patch set did not go through a review process would
not inaccurate.
Below is a brief log of major changes done on the RDMAoE patches for
your reference. A detailed correspondence can be found at the
openfabrics general list.

Rev1 - June 15 2009, first patch set sent for review
Rev2 - June 25 2009, Sean - move path resolution to a new module
(rdmaoe_sa)
Rev3 - July 13 2009, Sean, Roland, share data structs between
multicast.c and rdmaoe_sa.c, distinguish between rdmaoe and ib calls
at the cma, increment ABI version
Rev4 - Aug 5 2009, Woody Sean Or, ports are differentiated by port
protocol rather than port type, move rdmaoe sa functionality to cma
Rev5 - Aug 19 2009, Roland, Sean, don't use broadcast MACs to map
multicast GIDs, MAD service disabled for userspace, add
rdma_is_transport_supported()
Announce - Sep 17 2009, OFED-RDMAoE branch announce, daily builds
available
Rev6 - Nov 16 2009, NIC programming moved from CMA to hw driver so
verbs consumer can utilize it.


___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


Re: [ewg] Re: [ofw] SC'09 BOF - Meeting notes

2009-11-23 Thread Eli Cohen
On Mon, Nov 23, 2009 at 10:11:21AM +0200, Eli Cohen wrote:

I would like to fix a typo: I meant below:
Saying that the patch set did not go through a review process would
**be** inaccurate.

 On Fri, Nov 20, 2009 at 01:38:59AM +0200, Or Gerlitz wrote:
  
  yes, this would be simply not supportable, think about that, you want
  to hand your customers with a code which didn't pass review nor
  acceptance by the Linux IB stack maintainers (Roland and Sean), say,
  next a crash happens at this or that module / line, next, what you
  except the maintainers to do?
  
 Saying that the patch set did not go through a review process would
 not inaccurate.
 Bellow is a brief log of major changes done on the RDMAoE patches for
 your reference. A detailed correspondence can be found at the
 openfabrics general list.
 
 Rev1 - June 15 2009, first patch set sent for review
 Rev2 - June 25 2009, Sean - move path resolution to a new module
 (rdmaoe_sa)
 Rev3 - July 13 2009, Sean, Roland, share data structs between
 multicast.c and rdmaoe_sa.c, distinguish between rdmaoe and ib calls
 at the cma, increment ABI version
 Rev4 - Aug 5 2009, Woody Sean Or, ports are differentiated by port
 protocol rather than port type, move rdmaoe sa functionality to cma
 Rev5 - Aug 19 2009, Roland, Sean, don't use broadcast MACs to map
 multicast GIDs, MAD service disabled for userspace, add
 rdma_is_transport_supported()
 Annonuce - Sep 17 2009, OFED-RDMAoE branch announce, daily builds
 available
 Rev6 - Nov 16 2009, NIC programming moved from CMA to hw driver so
 verbs consumer can utilize it.
 
 
 ___
 ewg mailing list
 ewg@lists.openfabrics.org
 http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv6 0/10] RDMAoE support

2009-11-16 Thread Eli Cohen
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using
Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet
fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned
Ethertype, a GRH, unmodified IB transport headers and payload.  IB subnet
management and SA services are not required for RDMAoE operation; Ethernet
management practices are used instead. RDMAoE encodes IP addresses into its
GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs,
standard IP to MAC mappings apply.

To support RDMAoE, a new transport protocol was added to the IB core. An RDMA
device can have ports with different transports, which are identified by a port
transport attribute.  The RDMA Verbs API is syntactically unmodified. When
referring to RDMAoE ports, Address handles are required to contain GIDs while
LID fields are ignored. The Ethernet L2 information is subsequently obtained by
the vendor-specific driver (in both kernel and user-space) while modifying QPs
to RTR and creating address handles.  As there is no SA in RDMAoE, the CMA code
is modified to fill the necessary path record attributes locally before sending
CM packets. Similarly, the CMA provides to the user the required address handle
attributes when processing SIDR requests and joining multicast groups.

In this patch set, an RDMAoE port is currently assigned a single GID, encoding
the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code
temporarily uses IPv6 link-local addresses as GIDs instead of the IP address
provided by the user, thereby supporting any IP address.

To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib
drivers must be loaded, and the netdevice for the corresponding RDMAoE port
must be running. Individual ports of a multi port HCA can be independently
configured as Ethernet (with support for RDMAoE) or IB, as is already the case.
We have successfully tested MPI, SDP, RDS, and native Verbs applications over
RDMAoE.

Following is a series of 10 patches based on Roland's for-next branch. This new
series reflects changes based on feedback from the community on the previous
set of patches, and is tagged v6. Previous series were posted to the
openfabrics general list only.

Changes from v5:
1. Bug fixes.
2. NIC programming through dev_mc_add/delete were moved from CMA to the hw
specific driver. This allows native verbs applications to receive multicast
traffic too.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---


 drivers/infiniband/core/agent.c   |   39 +-
 drivers/infiniband/core/cm.c  |   26 +
 drivers/infiniband/core/cma.c |  272 --
 drivers/infiniband/core/mad.c |   78 +++--
 drivers/infiniband/core/multicast.c   |4 
 drivers/infiniband/core/sa_query.c|   39 +-
 drivers/infiniband/core/ucm.c |   14 
 drivers/infiniband/core/ucma.c|   33 ++
 drivers/infiniband/core/ud_header.c   |  111 +++
 drivers/infiniband/core/user_mad.c|   13 
 drivers/infiniband/core/uverbs.h  |1 
 drivers/infiniband/core/uverbs_cmd.c  |   32 ++
 drivers/infiniband/core/uverbs_main.c |1 
 drivers/infiniband/core/verbs.c   |   35 ++
 drivers/infiniband/hw/mlx4/ah.c   |  181 ++--
 drivers/infiniband/hw/mlx4/mad.c  |   32 +-
 drivers/infiniband/hw/mlx4/main.c |  436 +++---
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |   29 +
 drivers/infiniband/hw/mlx4/qp.c   |  199 +
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   12 
 drivers/net/mlx4/en_main.c|   15 -
 drivers/net/mlx4/en_port.c|4 
 drivers/net/mlx4/en_port.h|3 
 drivers/net/mlx4/fw.c |3 
 drivers/net/mlx4/intf.c   |   20 +
 drivers/net/mlx4/main.c   |6 
 drivers/net/mlx4/mlx4.h   |1 
 include/linux/mlx4/cmd.h  |1 
 include/linux/mlx4/device.h   |   31 ++
 include/linux/mlx4/driver.h   |   16 -
 include/linux/mlx4/qp.h   |8 
 include/rdma/ib_addr.h|   92 ++
 include/rdma/ib_pack.h|   26 +
 include/rdma/ib_user_verbs.h  |   21 +
 include/rdma/ib_verbs.h   |   26 +
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c   |3 
 net/sunrpc/xprtrdma/svc_rdma_transport.c  |2 
 37 files changed, 1594 insertions(+), 271 deletions(-)
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv6 01/10] ib_core: Refine device personality from node type to port type

2009-11-16 Thread Eli Cohen
As a preparation to devices that, in general, support a different transport
protocol for each port, specifically RDMAoE, this patch defines a transport
type for each of a device's ports. As a result, rdma_node_get_transport() has
been unexported and is used internally by the implementation of the new API,
rdma_port_get_transport(), which gives the transport protocol of the queried
port. rdma_is_transport_supported() is also added to be used for verifying if a
given device supports a given protocol on any of its ports. All references to
rdma_node_get_transport() are changed to use the new APIs. Also, ib_port_attr
is extended to contain enum rdma_transport_type.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/cm.c  |   25 +
 drivers/infiniband/core/cma.c |   54 +++--
 drivers/infiniband/core/mad.c |   49 -
 drivers/infiniband/core/multicast.c   |4 +-
 drivers/infiniband/core/sa_query.c|   39 +
 drivers/infiniband/core/ucm.c |8 +++-
 drivers/infiniband/core/ucma.c|2 +-
 drivers/infiniband/core/user_mad.c|6 +++-
 drivers/infiniband/core/verbs.c   |   25 -
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   12 +++---
 include/rdma/ib_verbs.h   |   11 --
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c   |3 +-
 net/sunrpc/xprtrdma/svc_rdma_transport.c  |2 +-
 13 files changed, 153 insertions(+), 87 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 5130fc5..d082f59 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3678,8 +3678,9 @@ static void cm_add_one(struct ib_device *ib_device)
unsigned long flags;
int ret;
u8 i;
+   enum rdma_transport_type tt;
 
-   if (rdma_node_get_transport(ib_device-node_type) != RDMA_TRANSPORT_IB)
+   if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB))
return;
 
cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
@@ -3700,6 +3701,10 @@ static void cm_add_one(struct ib_device *ib_device)
 
set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
for (i = 1; i = ib_device-phys_port_cnt; i++) {
+   tt = rdma_port_get_transport(ib_device, i);
+   if (tt != RDMA_TRANSPORT_IB)
+   continue;
+
port = kzalloc(sizeof *port, GFP_KERNEL);
if (!port)
goto error1;
@@ -3742,9 +3747,11 @@ error1:
port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
while (--i) {
port = cm_dev-port[i-1];
-   ib_modify_port(ib_device, port-port_num, 0, port_modify);
-   ib_unregister_mad_agent(port-mad_agent);
-   cm_remove_port_fs(port);
+   if (port) {
+   ib_modify_port(ib_device, port-port_num, 0, 
port_modify);
+   ib_unregister_mad_agent(port-mad_agent);
+   cm_remove_port_fs(port);
+   }
}
device_unregister(cm_dev-device);
kfree(cm_dev);
@@ -3770,10 +3777,12 @@ static void cm_remove_one(struct ib_device *ib_device)
 
for (i = 1; i = ib_device-phys_port_cnt; i++) {
port = cm_dev-port[i-1];
-   ib_modify_port(ib_device, port-port_num, 0, port_modify);
-   ib_unregister_mad_agent(port-mad_agent);
-   flush_workqueue(cm.wq);
-   cm_remove_port_fs(port);
+   if (port) {
+   ib_modify_port(ib_device, port-port_num, 0, 
port_modify);
+   ib_unregister_mad_agent(port-mad_agent);
+   flush_workqueue(cm.wq);
+   cm_remove_port_fs(port);
+   }
}
device_unregister(cm_dev-device);
kfree(cm_dev);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 0753178..8dc3472 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -329,24 +329,26 @@ static int cma_acquire_dev(struct rdma_id_private 
*id_priv)
struct cma_device *cma_dev;
union ib_gid gid;
int ret = -ENODEV;
-
-   switch (rdma_node_get_transport(dev_addr-dev_type)) {
-   case RDMA_TRANSPORT_IB:
-   ib_addr_get_sgid(dev_addr, gid);
-   break;
-   case RDMA_TRANSPORT_IWARP:
-   iw_addr_get_sgid(dev_addr, gid);
-   break;
-   default:
-   return -ENODEV;
-   }
+   int port;
 
list_for_each_entry(cma_dev, dev_list, list) {
-   ret = ib_find_cached_gid(cma_dev-device, gid,
-id_priv-id.port_num, NULL);
-   if (!ret) {
-   cma_attach_to_dev(id_priv, cma_dev

[ewg] [PATCHv6 03/10] ib_core: RDMAoE support only QP1

2009-11-16 Thread Eli Cohen
Since RDMAoE is using Ethernet as its link layer, there is no need for QP0. QP1
is still needed since it handles communications between CM agents. This patch
will create only QP1 for RDMAoE ports.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/agent.c |   39 +--
 drivers/infiniband/core/mad.c   |   37 +
 2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index ae7c288..45c6d20 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -48,6 +48,8 @@
 struct ib_agent_port_private {
struct list_head port_list;
struct ib_mad_agent *agent[2];
+   struct ib_device*device;
+   u8   port_num;
 };
 
 static DEFINE_SPINLOCK(ib_agent_port_list_lock);
@@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num)
 {
struct ib_agent_port_private *entry;
 
-   list_for_each_entry(entry, ib_agent_port_list, port_list) {
-   if (entry-agent[0]-device == device 
-   entry-agent[0]-port_num == port_num)
+   list_for_each_entry(entry, ib_agent_port_list, port_list)
+   if (entry-device == device  entry-port_num == port_num)
return entry;
-   }
+
return NULL;
 }
 
@@ -146,6 +147,7 @@ int ib_agent_port_open(struct ib_device *device, int 
port_num)
struct ib_agent_port_private *port_priv;
unsigned long flags;
int ret;
+   enum rdma_transport_type tt;
 
/* Create new device info */
port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
@@ -155,14 +157,17 @@ int ib_agent_port_open(struct ib_device *device, int 
port_num)
goto error1;
}
 
-   /* Obtain send only MAD agent for SMI QP */
-   port_priv-agent[0] = ib_register_mad_agent(device, port_num,
-   IB_QPT_SMI, NULL, 0,
-   agent_send_handler,
-   NULL, NULL);
-   if (IS_ERR(port_priv-agent[0])) {
-   ret = PTR_ERR(port_priv-agent[0]);
-   goto error2;
+   tt = rdma_port_get_transport(device, port_num);
+   if (tt == RDMA_TRANSPORT_IB) {
+   /* Obtain send only MAD agent for SMI QP */
+   port_priv-agent[0] = ib_register_mad_agent(device, port_num,
+   IB_QPT_SMI, NULL, 0,
+   agent_send_handler,
+   NULL, NULL);
+   if (IS_ERR(port_priv-agent[0])) {
+   ret = PTR_ERR(port_priv-agent[0]);
+   goto error2;
+   }
}
 
/* Obtain send only MAD agent for GSI QP */
@@ -175,6 +180,9 @@ int ib_agent_port_open(struct ib_device *device, int 
port_num)
goto error3;
}
 
+   port_priv-device = device;
+   port_priv-port_num = port_num;
+
spin_lock_irqsave(ib_agent_port_list_lock, flags);
list_add_tail(port_priv-port_list, ib_agent_port_list);
spin_unlock_irqrestore(ib_agent_port_list_lock, flags);
@@ -182,7 +190,8 @@ int ib_agent_port_open(struct ib_device *device, int 
port_num)
return 0;
 
 error3:
-   ib_unregister_mad_agent(port_priv-agent[0]);
+   if (tt == RDMA_TRANSPORT_IB)
+   ib_unregister_mad_agent(port_priv-agent[0]);
 error2:
kfree(port_priv);
 error1:
@@ -205,7 +214,9 @@ int ib_agent_port_close(struct ib_device *device, int 
port_num)
spin_unlock_irqrestore(ib_agent_port_list_lock, flags);
 
ib_unregister_mad_agent(port_priv-agent[1]);
-   ib_unregister_mad_agent(port_priv-agent[0]);
+   if (rdma_port_get_transport(device, port_num) == RDMA_TRANSPORT_IB)
+   ib_unregister_mad_agent(port_priv-agent[0]);
+
kfree(port_priv);
return 0;
 }
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 28a47a8..857049c 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2610,6 +2610,9 @@ static void cleanup_recv_queue(struct ib_mad_qp_info 
*qp_info)
struct ib_mad_private *recv;
struct ib_mad_list_head *mad_list;
 
+   if (!qp_info-qp)
+   return;
+
while (!list_empty(qp_info-recv_queue.list)) {
 
mad_list = list_entry(qp_info-recv_queue.list.next,
@@ -2651,6 +2654,9 @@ static int ib_mad_port_start(struct ib_mad_port_private 
*port_priv)
 
for (i = 0; i  IB_MAD_QPS_CORE; i++) {
qp = port_priv-qp_info[i].qp;
+   if (!qp)
+   continue;
+
/*
 * PKey index for QP1 is irrelevant

[ewg] [PATCHv6 02/10] ib_core: Add RDMAoE transport protocol

2009-11-16 Thread Eli Cohen
Add a new transport protocol, RDMAoE, used for transporting Infiniband traffic
over Ethernet fabrics.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 include/rdma/ib_verbs.h |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 4cf42f3..d9146c4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -69,7 +69,8 @@ enum rdma_node_type {
 
 enum rdma_transport_type {
RDMA_TRANSPORT_IB,
-   RDMA_TRANSPORT_IWARP
+   RDMA_TRANSPORT_IWARP,
+   RDMA_TRANSPORT_RDMAOE
 };
 
 enum ib_device_cap_flags {
-- 
1.6.5.2

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv6 06/10] ib_core: CMA device binding

2009-11-16 Thread Eli Cohen
Add support for RDMAoE device binding and IP <-> GID resolution. Path resolving
and multicast joining are implemented within cma.c by filling the responses and
pushing the callbacks to the cma work queue. IP <-> GID resolution always yields
IPv6 link local addresses - remote GIDs are derived from the destination MAC
address of the remote port. Multicast GIDs are always mapped to multicast MACs
as is done in IPv6. Some helper functions are added to ib_addr.h.  IPv4
multicast is enabled by translating IPv4 multicast addresses to IPv6 multicast
as described in
http://www.mail-archive.com/i...@sunroof.eng.sun.com/msg02134.html.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/cma.c  |  218 ++-
 drivers/infiniband/core/ucma.c |   31 ++
 include/rdma/ib_addr.h |   92 +
 3 files changed, 335 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 8dc3472..ad1cd75 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -58,6 +58,7 @@ MODULE_LICENSE(Dual BSD/GPL);
 #define CMA_CM_RESPONSE_TIMEOUT 20
 #define CMA_MAX_CM_RETRIES 15
 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define RDMAOE_PACKET_LIFETIME 18
 
 static void cma_add_one(struct ib_device *device);
 static void cma_remove_one(struct ib_device *device);
@@ -157,6 +158,7 @@ struct cma_multicast {
struct list_headlist;
void*context;
struct sockaddr_storage addr;
+   struct kref mcref;
 };
 
 struct cma_work {
@@ -173,6 +175,12 @@ struct cma_ndev_work {
struct rdma_cm_eventevent;
 };
 
+struct rdmaoe_mcast_work {
+   struct work_struct   work;
+   struct rdma_id_private  *id;
+   struct cma_multicast*mc;
+};
+
 union cma_ip_addr {
struct in6_addr ip6;
struct {
@@ -290,6 +298,14 @@ static inline void cma_deref_dev(struct cma_device 
*cma_dev)
complete(cma_dev-comp);
 }
 
+static inline void release_mc(struct kref *kref)
+{
+   struct cma_multicast *mc = container_of(kref, struct cma_multicast, 
mcref);
+
+   kfree(mc-multicast.ib);
+   kfree(mc);
+}
+
 static void cma_detach_from_dev(struct rdma_id_private *id_priv)
 {
list_del(id_priv-list);
@@ -340,6 +356,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv)
case RDMA_TRANSPORT_IWARP:
iw_addr_get_sgid(dev_addr, gid);
break;
+   case RDMA_TRANSPORT_RDMAOE:
+   rdmaoe_addr_get_sgid(dev_addr, gid);
+   break;
default:
return -ENODEV;
}
@@ -568,10 +587,16 @@ static int cma_ib_init_qp_attr(struct rdma_id_private 
*id_priv,
 {
struct rdma_dev_addr *dev_addr = id_priv-id.route.addr.dev_addr;
int ret;
+   u16 pkey;
+
+   if (rdma_port_get_transport(id_priv-id.device, id_priv-id.port_num) ==
+   RDMA_TRANSPORT_IB)
+   pkey = ib_addr_get_pkey(dev_addr);
+   else
+   pkey = 0x;
 
ret = ib_find_cached_pkey(id_priv-id.device, id_priv-id.port_num,
- ib_addr_get_pkey(dev_addr),
- qp_attr-pkey_index);
+ pkey, qp_attr-pkey_index);
if (ret)
return ret;
 
@@ -601,6 +626,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct 
ib_qp_attr *qp_attr,
id_priv = container_of(id, struct rdma_id_private, id);
switch (rdma_port_get_transport(id_priv-id.device, 
id_priv-id.port_num)) {
case RDMA_TRANSPORT_IB:
+   case RDMA_TRANSPORT_RDMAOE:
if (!id_priv-cm_id.ib || cma_is_ud_ps(id_priv-id.ps))
ret = cma_ib_init_qp_attr(id_priv, qp_attr, 
qp_attr_mask);
else
@@ -828,8 +854,17 @@ static void cma_leave_mc_groups(struct rdma_id_private 
*id_priv)
mc = container_of(id_priv-mc_list.next,
  struct cma_multicast, list);
list_del(mc-list);
-   ib_sa_free_multicast(mc-multicast.ib);
-   kfree(mc);
+   switch (rdma_port_get_transport(id_priv-cma_dev-device, 
id_priv-id.port_num)) {
+   case RDMA_TRANSPORT_IB:
+   ib_sa_free_multicast(mc-multicast.ib);
+   kfree(mc);
+   break;
+   case RDMA_TRANSPORT_RDMAOE:
+   kref_put(mc-mcref, release_mc);
+   break;
+   default:
+   break;
+   }
}
 }
 
@@ -847,6 +882,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
mutex_unlock(lock);
switch

[ewg] [PATCHv6 09/10] mlx4: Add support for RDMAoE - address resolution

2009-11-16 Thread Eli Cohen
The following patch handles address vectors creation for RDMAoE ports. mlx4
needs the MAC address of the remote node to include it in the WQE of a UD QP or
in the QP context of connected QPs. Address resolution is done atomically in
the case of a link local address or a multicast GID and otherwise -EINVAL is
returned.  mlx4 transport packets were changed too to accommodate RDMAoE.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/hw/mlx4/ah.c  |  181 +++--
 drivers/infiniband/hw/mlx4/mad.c |   32 ---
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   18 +++-
 drivers/infiniband/hw/mlx4/qp.c  |  172 ++--
 drivers/net/mlx4/fw.c|3 +-
 include/linux/mlx4/device.h  |   31 ++-
 include/linux/mlx4/qp.h  |8 +-
 7 files changed, 340 insertions(+), 105 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index c75ac94..3451929 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -31,63 +31,160 @@
  */
 
 #include mlx4_ib.h
+#include rdma/ib_addr.h
+#include linux/inet.h
+#include linux/string.h
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr 
*ah_attr,
+   u8 *mac, int *is_mcast, u8 port)
 {
-   struct mlx4_dev *dev = to_mdev(pd-device)-dev;
-   struct mlx4_ib_ah *ah;
+   struct mlx4_ib_rdmaoe *rdmaoe = dev-rdmaoe;
+   struct in6_addr in6;
 
-   ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-   if (!ah)
-   return ERR_PTR(-ENOMEM);
+   *is_mcast = 0;
+   spin_lock(rdmaoe-lock);
+   if (!rdmaoe-netdevs[port - 1]) {
+   spin_unlock(rdmaoe-lock);
+   return -EINVAL;
+   }
+   spin_unlock(rdmaoe-lock);
 
-   memset(ah-av, 0, sizeof ah-av);
+   memcpy(in6, ah_attr-grh.dgid.raw, sizeof in6);
+   if (rdma_link_local_addr(in6))
+   rdma_get_ll_mac(in6, mac);
+   else if (rdma_is_multicast_addr(in6)) {
+   rdma_get_mcast_mac(in6, mac);
+   *is_mcast = 1;
+   } else
+   return -EINVAL;
 
-   ah-av.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num  
24));
-   ah-av.g_slid  = ah_attr-src_path_bits;
-   ah-av.dlid= cpu_to_be16(ah_attr-dlid);
-   if (ah_attr-static_rate) {
-   ah-av.stat_rate = ah_attr-static_rate + MLX4_STAT_RATE_OFFSET;
-   while (ah-av.stat_rate  IB_RATE_2_5_GBPS + 
MLX4_STAT_RATE_OFFSET 
-  !(1  ah-av.stat_rate  dev-caps.stat_rate_support))
-   --ah-av.stat_rate;
-   }
-   ah-av.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl  28);
+   return 0;
+}
+
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+ struct mlx4_ib_ah *ah)
+{
+   struct mlx4_dev *dev = to_mdev(pd-device)-dev;
+
+   ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn | (ah_attr-port_num  
24));
+   ah-av.ib.g_slid  = ah_attr-src_path_bits;
if (ah_attr-ah_flags  IB_AH_GRH) {
-   ah-av.g_slid   |= 0x80;
-   ah-av.gid_index = ah_attr-grh.sgid_index;
-   ah-av.hop_limit = ah_attr-grh.hop_limit;
-   ah-av.sl_tclass_flowlabel |=
+   ah-av.ib.g_slid   |= 0x80;
+   ah-av.ib.gid_index = ah_attr-grh.sgid_index;
+   ah-av.ib.hop_limit = ah_attr-grh.hop_limit;
+   ah-av.ib.sl_tclass_flowlabel |=
cpu_to_be32((ah_attr-grh.traffic_class  20) |
ah_attr-grh.flow_label);
-   memcpy(ah-av.dgid, ah_attr-grh.dgid.raw, 16);
+   memcpy(ah-av.ib.dgid, ah_attr-grh.dgid.raw, 16);
+   }
+
+   ah-av.ib.dlid= cpu_to_be16(ah_attr-dlid);
+   if (ah_attr-static_rate) {
+   ah-av.ib.stat_rate = ah_attr-static_rate + 
MLX4_STAT_RATE_OFFSET;
+   while (ah-av.ib.stat_rate  IB_RATE_2_5_GBPS + 
MLX4_STAT_RATE_OFFSET 
+  !(1  ah-av.ib.stat_rate  
dev-caps.stat_rate_support))
+   --ah-av.ib.stat_rate;
}
+   ah-av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr-sl  28);
 
return ah-ibah;
 }
 
+static struct ib_ah *create_rdmaoe_ah(struct ib_pd *pd, struct ib_ah_attr 
*ah_attr,
+  struct mlx4_ib_ah *ah)
+{
+   struct mlx4_ib_dev *ibdev = to_mdev(pd-device);
+   struct mlx4_dev *dev = ibdev-dev;
+   u8 mac[6];
+   int err;
+   int is_mcast;
+
+   err = mlx4_ib_resolve_grh(ibdev, ah_attr, mac, is_mcast, 
ah_attr-port_num);
+   if (err)
+   return ERR_PTR(err);
+
+   memcpy(ah-av.eth.mac_0_1, mac, 2);
+   memcpy(ah-av.eth.mac_2_5, mac + 2, 4);
+   ah-av.ib.port_pd = cpu_to_be32(to_mpd(pd)-pdn

[ewg] [PATCHv6 10/10] mlx4: Add RDMAoE support - allow interfaces to correspond to each other

2009-11-16 Thread Eli Cohen
This patch adds RDMAoE support for mlx4. Since mlx4_ib now needs to reference
mlx4_en netdevices, a new mechanism was added. Two new fields were added to
struct mlx4_interface to define a protocol and a get_prot_dev method to
retrieve the corresponding protocol's net device.  An implementation of the new
verb ib_get_port_link_type() - mlx4_ib_get_port_link_type - was added.
mlx4_ib_query_port() has been modified to support eth link types. An interface
is considered to be active if its corresponding eth interface is active. Code
for setting the GID table of a port has been added. Currently, each IB port has
a single GID entry in its table and that GID entry equals the link local IPv6
address.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/hw/mlx4/main.c|  436 +++---
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   11 +
 drivers/infiniband/hw/mlx4/qp.c  |   27 ++-
 drivers/net/mlx4/en_main.c   |   15 +-
 drivers/net/mlx4/en_port.c   |4 +-
 drivers/net/mlx4/en_port.h   |3 +-
 drivers/net/mlx4/intf.c  |   20 ++
 drivers/net/mlx4/main.c  |6 +
 drivers/net/mlx4/mlx4.h  |1 +
 include/linux/mlx4/cmd.h |1 +
 include/linux/mlx4/driver.h  |   16 +-
 11 files changed, 493 insertions(+), 47 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index e596537..df52e57 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -34,9 +34,13 @@
 #include linux/module.h
 #include linux/init.h
 #include linux/errno.h
+#include linux/netdevice.h
+#include linux/inetdevice.h
+#include linux/rtnetlink.h
 
 #include rdma/ib_smi.h
 #include rdma/ib_user_verbs.h
+#include rdma/ib_addr.h
 
 #include linux/mlx4/driver.h
 #include linux/mlx4/cmd.h
@@ -57,6 +61,15 @@ static const char mlx4_ib_version[] =
DRV_NAME : Mellanox ConnectX InfiniBand driver v
DRV_VERSION  ( DRV_RELDATE )\n;
 
+struct update_gid_work {
+   struct work_struct work;
+   union ib_gid gids[128];
+   int port;
+   struct mlx4_ib_dev *dev;
+};
+
+static struct workqueue_struct *wq;
+
 static void init_query_mad(struct ib_smp *mad)
 {
mad-base_version  = 1;
@@ -152,28 +165,19 @@ out:
return err;
 }
 
-static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
- struct ib_port_attr *props)
+static enum rdma_transport_type
+mlx4_ib_port_get_transport(struct ib_device *device, u8 port_num)
 {
-   struct ib_smp *in_mad  = NULL;
-   struct ib_smp *out_mad = NULL;
-   int err = -ENOMEM;
-
-   in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
-   out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
-   if (!in_mad || !out_mad)
-   goto out;
-
-   memset(props, 0, sizeof *props);
+   struct mlx4_dev *dev = to_mdev(device)-dev;
 
-   init_query_mad(in_mad);
-   in_mad-attr_id  = IB_SMP_ATTR_PORT_INFO;
-   in_mad-attr_mod = cpu_to_be32(port);
-
-   err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, 
out_mad);
-   if (err)
-   goto out;
+   return dev-caps.port_mask  (1  (port_num - 1)) ?
+   RDMA_TRANSPORT_IB : RDMA_TRANSPORT_RDMAOE;
+}
 
+static void ib_link_query_port(struct ib_device *ibdev, u8 port,
+  struct ib_port_attr *props,
+  struct ib_smp *out_mad)
+{
props-lid  = be16_to_cpup((__be16 *) (out_mad-data + 16));
props-lmc  = out_mad-data[34]  0x7;
props-sm_lid   = be16_to_cpup((__be16 *) (out_mad-data + 18));
@@ -193,6 +197,67 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 
port,
props-subnet_timeout   = out_mad-data[51]  0x1f;
props-max_vl_num   = out_mad-data[37]  4;
props-init_type_reply  = out_mad-data[41]  4;
+   props-transport= RDMA_TRANSPORT_IB;
+}
+
+static void eth_link_query_port(struct ib_device *ibdev, u8 port,
+   struct ib_port_attr *props,
+   struct ib_smp *out_mad)
+{
+   struct mlx4_ib_rdmaoe *rdmaoe = to_mdev(ibdev)-rdmaoe;
+   struct net_device *ndev;
+
+   props-port_cap_flags   = IB_PORT_CM_SUP;
+   props-gid_tbl_len  = to_mdev(ibdev)-dev-caps.gid_table_len[port];
+   props-max_msg_sz   = to_mdev(ibdev)-dev-caps.max_msg_sz;
+   props-pkey_tbl_len = 1;
+   props-bad_pkey_cntr= be16_to_cpup((__be16 *) (out_mad-data + 46));
+   props-qkey_viol_cntr   = be16_to_cpup((__be16 *) (out_mad-data + 48));
+   props-active_width = 0;
+   props-active_speed = 0;
+   props-max_mtu  = out_mad-data[41]  0xf;
+   props-subnet_timeout   = 0;
+   props-max_vl_num   = out_mad-data[37]  4;
+   props-init_type_reply  = 0;
+   props-transport

[ewg] [PATCHv6 05/10] ib/cm: Enable CM support for RDMAoE

2009-11-16 Thread Eli Cohen
CM messages can be transported on RDMAoE protocol ports so they are enabled
here.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/cm.c  |5 +++--
 drivers/infiniband/core/ucm.c |   12 +---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index d082f59..c9f9122 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3680,7 +3680,8 @@ static void cm_add_one(struct ib_device *ib_device)
u8 i;
enum rdma_transport_type tt;
 
-   if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB))
+   if (!rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_IB) 
+   !rdma_is_transport_supported(ib_device, RDMA_TRANSPORT_RDMAOE))
return;
 
cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
@@ -3702,7 +3703,7 @@ static void cm_add_one(struct ib_device *ib_device)
set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
for (i = 1; i = ib_device-phys_port_cnt; i++) {
tt = rdma_port_get_transport(ib_device, i);
-   if (tt != RDMA_TRANSPORT_IB)
+   if (tt != RDMA_TRANSPORT_IB  tt != RDMA_TRANSPORT_RDMAOE)
continue;
 
port = kzalloc(sizeof *port, GFP_KERNEL);
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 43700b4..dce2cbb 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -1241,13 +1241,19 @@ static void ib_ucm_add_one(struct ib_device *device)
 {
struct ib_ucm_device *ucm_dev;
int i;
+   enum rdma_transport_type tt;
 
if (!device-alloc_ucontext)
return;
 
-   for (i = 1; i = device-phys_port_cnt; ++i)
-   if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB)
-   return;
+   for (i = 1; i = device-phys_port_cnt; ++i) {
+   tt = rdma_port_get_transport(device, i);
+   if (tt == RDMA_TRANSPORT_IB || tt == RDMA_TRANSPORT_RDMAOE)
+   break;
+   }
+
+   if (i  device-phys_port_cnt)
+   return;
 
ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
if (!ucm_dev)
-- 
1.6.5.2

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv6 07/10] ib_core: RDMAoE UD packet packing support

2009-11-16 Thread Eli Cohen
Add support functions to aid in packing RDMAoE packets.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/ud_header.c |  111 +++
 include/rdma/ib_pack.h  |   26 
 2 files changed, 137 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/ud_header.c 
b/drivers/infiniband/core/ud_header.c
index 8ec7876..d04b6f2 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -80,6 +80,29 @@ static const struct ib_field lrh_table[]  = {
  .size_bits= 16 }
 };
 
+static const struct ib_field eth_table[]  = {
+   { STRUCT_FIELD(eth, dmac_h),
+ .offset_words = 0,
+ .offset_bits  = 0,
+ .size_bits= 32 },
+   { STRUCT_FIELD(eth, dmac_l),
+ .offset_words = 1,
+ .offset_bits  = 0,
+ .size_bits= 16 },
+   { STRUCT_FIELD(eth, smac_h),
+ .offset_words = 1,
+ .offset_bits  = 16,
+ .size_bits= 16 },
+   { STRUCT_FIELD(eth, smac_l),
+ .offset_words = 2,
+ .offset_bits  = 0,
+ .size_bits= 32 },
+   { STRUCT_FIELD(eth, type),
+ .offset_words = 3,
+ .offset_bits  = 0,
+ .size_bits= 16 }
+};
+
 static const struct ib_field grh_table[]  = {
{ STRUCT_FIELD(grh, ip_version),
  .offset_words = 0,
@@ -241,6 +264,53 @@ void ib_ud_header_init(int 
payload_bytes,
 EXPORT_SYMBOL(ib_ud_header_init);
 
 /**
+ * ib_rdmaoe_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @header:Structure to initialize
+ *
+ * ib_rdmaoe_ud_header_init() initializes the grh.ip_version, 
grh.payload_length,
+ * grh.next_header, bth.opcode, bth.pad_count and
+ * bth.transport_header_version fields of a struct eth_ud_header given
+ * the payload length and whether a GRH will be included.
+ */
+void ib_rdmaoe_ud_header_init(int  payload_bytes,
+  int  grh_present,
+  struct eth_ud_header*header)
+{
+   int header_len;
+
+   memset(header, 0, sizeof *header);
+
+   header_len =
+   sizeof header-eth  +
+   IB_BTH_BYTES  +
+   IB_DETH_BYTES;
+   if (grh_present)
+   header_len += IB_GRH_BYTES;
+
+   header-grh_present  = grh_present;
+   if (grh_present) {
+   header-grh.ip_version  = 6;
+   header-grh.payload_length  =
+   cpu_to_be16((IB_BTH_BYTES +
+IB_DETH_BYTES+
+payload_bytes+
+4+ /* ICRC */
+3)  ~3);  /* round up */
+   header-grh.next_header = 0x1b;
+   }
+
+   if (header-immediate_present)
+   header-bth.opcode   = 
IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+   else
+   header-bth.opcode   = IB_OPCODE_UD_SEND_ONLY;
+   header-bth.pad_count= (4 - payload_bytes)  3;
+   header-bth.transport_header_version = 0;
+}
+EXPORT_SYMBOL(ib_rdmaoe_ud_header_init);
+
+/**
  * ib_ud_header_pack - Pack UD header struct into wire format
  * @header:UD header struct
  * @buf:Buffer to pack into
@@ -281,6 +351,47 @@ int ib_ud_header_pack(struct ib_ud_header *header,
 EXPORT_SYMBOL(ib_ud_header_pack);
 
 /**
+ * rdmaoe_ud_header_pack - Pack UD header struct into eth wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int rdmaoe_ud_header_pack(struct eth_ud_header *header,
+  void *buf)
+{
+   int len = 0;
+
+   ib_pack(eth_table, ARRAY_SIZE(eth_table),
+   header-eth, buf);
+   len += IB_ETH_BYTES;
+
+   if (header-grh_present) {
+   ib_pack(grh_table, ARRAY_SIZE(grh_table),
+   header-grh, buf + len);
+   len += IB_GRH_BYTES;
+   }
+
+   ib_pack(bth_table, ARRAY_SIZE(bth_table),
+   header-bth, buf + len);
+   len += IB_BTH_BYTES;
+
+   ib_pack(deth_table, ARRAY_SIZE(deth_table),
+   header-deth, buf + len);
+   len += IB_DETH_BYTES;
+
+   if (header-immediate_present) {
+   memcpy(buf + len, header-immediate_data,
+  sizeof header-immediate_data);
+   len += sizeof header-immediate_data;
+   }
+
+   return len;
+}
+EXPORT_SYMBOL(rdmaoe_ud_header_pack);
+
+/**
  * ib_ud_header_unpack - Unpack UD header struct from wire format
  * @header:UD header struct
  * @buf:Buffer to pack into
diff --git a/include/rdma/ib_pack.h b/include

[ewg] [PATCHv6 04/10] IB/umad: Enable support only for IB ports

2009-11-16 Thread Eli Cohen
Initialize umad context for devices that have any of their ports IB. Since
devices may have ports of two different protocols (for example,
RDMA_TRANSPORT_IB and RDMA_TRANSPORT_RDMAOE), ib_umad_add_one() needs to
succeed if any of the ports is IB but ib_umad_init_port() is called only for IB
ports.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/user_mad.c |   15 +++
 1 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/core/user_mad.c 
b/drivers/infiniband/core/user_mad.c
index 6d6795d..d069689 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -1124,10 +1124,6 @@ static void ib_umad_add_one(struct ib_device *device)
e = device-phys_port_cnt;
}
 
-   for (i = s; i = e; ++i)
-   if (rdma_port_get_transport(device, i) != RDMA_TRANSPORT_IB)
-   return;
-
umad_dev = kzalloc(sizeof *umad_dev +
   (e - s + 1) * sizeof (struct ib_umad_port),
   GFP_KERNEL);
@@ -1142,8 +1138,9 @@ static void ib_umad_add_one(struct ib_device *device)
for (i = s; i = e; ++i) {
umad_dev-port[i - s].umad_dev = umad_dev;
 
-   if (ib_umad_init_port(device, i, umad_dev-port[i - s]))
-   goto err;
+   if (rdma_port_get_transport(device, i) == RDMA_TRANSPORT_IB)
+   if (ib_umad_init_port(device, i, umad_dev-port[i - 
s]))
+   goto err;
}
 
ib_set_client_data(device, umad_client, umad_dev);
@@ -1152,7 +1149,8 @@ static void ib_umad_add_one(struct ib_device *device)
 
 err:
while (--i = s)
-   ib_umad_kill_port(umad_dev-port[i - s]);
+   if (rdma_port_get_transport(device, i) == RDMA_TRANSPORT_IB)
+   ib_umad_kill_port(umad_dev-port[i - s]);
 
kref_put(umad_dev-ref, ib_umad_release_dev);
 }
@@ -1166,7 +1164,8 @@ static void ib_umad_remove_one(struct ib_device *device)
return;
 
for (i = 0; i = umad_dev-end_port - umad_dev-start_port; ++i)
-   ib_umad_kill_port(umad_dev-port[i]);
+   if (rdma_port_get_transport(device, i + 1) == RDMA_TRANSPORT_IB)
+   ib_umad_kill_port(umad_dev-port[i]);
 
kref_put(umad_dev-ref, ib_umad_release_dev);
 }
-- 
1.6.5.2

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCHv6 08/10] ib_core: Add API to support RDMAoE from userspace

2009-11-16 Thread Eli Cohen
Add ib_uverbs_get_mac() to be used by ibv_create_ah() to retrieve the remote
port's MAC address. Port transport is also returned by ibv_query_port().
ABI version is incremented from 6 to 7.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/uverbs.h  |1 +
 drivers/infiniband/core/uverbs_cmd.c  |   32 
 drivers/infiniband/core/uverbs_main.c |1 +
 drivers/infiniband/core/verbs.c   |   10 ++
 include/rdma/ib_user_verbs.h  |   21 ++---
 include/rdma/ib_verbs.h   |   12 
 6 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b3ea958..e69b04c 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq);
 IB_UVERBS_DECLARE_CMD(modify_srq);
 IB_UVERBS_DECLARE_CMD(query_srq);
 IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(get_mac);
 
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 56feab6..012aadf 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width= attr.active_width;
resp.active_speed= attr.active_speed;
resp.phys_state  = attr.phys_state;
+   resp.transport   = attr.transport;
 
if (copy_to_user((void __user *) (unsigned long) cmd.response,
 resp, sizeof resp))
@@ -1824,6 +1825,37 @@ err:
return ret;
 }
 
+ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file, const char __user *buf,
+ int in_len, int out_len)
+{
+   struct ib_uverbs_get_maccmd;
+   struct ib_uverbs_get_mac_resp   resp;
+   int  ret;
+   struct ib_pd*pd;
+
+   if (out_len  sizeof resp)
+   return -ENOSPC;
+
+   if (copy_from_user(cmd, buf, sizeof cmd))
+   return -EFAULT;
+
+   pd = idr_read_pd(cmd.pd_handle, file-ucontext);
+   if (!pd)
+   return -EINVAL;
+
+   ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac);
+   put_pd_read(pd);
+   if (!ret) {
+   if (copy_to_user((void __user *) (unsigned long) cmd.response,
+resp, sizeof resp))
+   return -EFAULT;
+
+   return in_len;
+   }
+
+   return ret;
+}
+
 ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 const char __user *buf, int in_len, int out_len)
 {
diff --git a/drivers/infiniband/core/uverbs_main.c 
b/drivers/infiniband/core/uverbs_main.c
index aec0fbd..d8f98f3 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -109,6 +109,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file 
*file,
[IB_USER_VERBS_CMD_MODIFY_SRQ]  = ib_uverbs_modify_srq,
[IB_USER_VERBS_CMD_QUERY_SRQ]   = ib_uverbs_query_srq,
[IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+   [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac,
 };
 
 static struct vfsmount *uverbs_event_mnt;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index d81e217..aaa8778 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -925,3 +925,13 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, 
u16 lid)
return qp-device-detach_mcast(qp, gid, lid);
 }
 EXPORT_SYMBOL(ib_detach_mcast);
+
+int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac)
+{
+   if (!device-get_mac)
+   return -ENOSYS;
+
+   return device-get_mac(device, port, gid, mac);
+}
+EXPORT_SYMBOL(ib_get_mac);
+
diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h
index a17f771..49eee8a 100644
--- a/include/rdma/ib_user_verbs.h
+++ b/include/rdma/ib_user_verbs.h
@@ -42,7 +42,7 @@
  * Increment this value if any changes that break userspace ABI
  * compatibility are made.
  */
-#define IB_USER_VERBS_ABI_VERSION  6
+#define IB_USER_VERBS_ABI_VERSION  7
 
 enum {
IB_USER_VERBS_CMD_GET_CONTEXT,
@@ -81,7 +81,8 @@ enum {
IB_USER_VERBS_CMD_MODIFY_SRQ,
IB_USER_VERBS_CMD_QUERY_SRQ,
IB_USER_VERBS_CMD_DESTROY_SRQ,
-   IB_USER_VERBS_CMD_POST_SRQ_RECV
+   IB_USER_VERBS_CMD_POST_SRQ_RECV,
+   IB_USER_VERBS_CMD_GET_MAC
 };
 
 /*
@@ -205,7 +206,8 @@ struct ib_uverbs_query_port_resp {
__u8  active_width;
__u8  active_speed;
__u8  phys_state;
-   __u8  reserved[3];
+   __u8  transport;
+   __u8  reserved[2];
 };
 
 struct ib_uverbs_alloc_pd {
@@ -621,6 +623,19 @@ struct ib_uverbs_destroy_ah {
__u32 ah_handle;
 };
 
+struct

Re: [ewg] Re: Possible process deadlock in RMPP flow

2009-10-20 Thread Eli Cohen
On Mon, Oct 19, 2009 at 01:30:47PM -0700, Sean Hefty wrote:
 
 I can't find anything off in the code for this.  It's odd, since
 unregister_mad_agent() does:
 
 flush_workqueue(port_priv-wq);
 ib_cancel_rmpp_recvs(mad_agent_priv);
 
 and ib_cancel_rmpp_recvs() does:
 
 spin_lock_irqsave(agent-lock, flags);
 list_for_each_entry(rmpp_recv, agent-rmpp_list, list) {
 cancel_delayed_work(rmpp_recv-timeout_work);
 cancel_delayed_work(rmpp_recv-cleanup_work);
 }
 spin_unlock_irqrestore(agent-lock, flags);
 
 flush_workqueue(agent-qp_info-port_priv-wq);
 
 which basically just flushes the same work queue.
 
 I haven't been able to reproduce the problem, but I'm running the latest 
 kernel
 - not sure that matters in this case.  Does ibnetdiscover just hang forever at
 the end of the test when this occurs?  Is there any more information 
 available?
 

We are checking if the problem is a firmware bug, it looks like it.
Once we verify this I will send an update. 
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] Announce: OFED-RDMAoE-1.5 rc1 is available

2009-10-15 Thread Eli Cohen
Abstract


RDMA over Ethernet (RDMAoE) allows running the IB transport protocol using
Ethernet frames, enabling the deployment of IB semantics on lossless Ethernet
fabrics. RDMAoE packets are standard Ethernet frames with an IEEE assigned
Ethertype, a GRH, unmodified IB transport headers and payload.  IB subnet
management and SA services are not required for RDMAoE operation; Ethernet
management practices are used instead. RDMAoE encodes IP addresses into its
GIDs and resolves MAC addresses using the host IP stack. For multicast GIDs,
standard IP to MAC mappings apply.

To support RDMAoE, a new transport protocol was added to the IB core. An RDMA
device can have ports with different transports, which are identified by a port
transport attribute.  The RDMA Verbs API is syntactically unmodified. When
referring to RDMAoE ports, Address handles are required to contain GIDs while
LID fields are ignored. The Ethernet L2 information is subsequently obtained by
the vendor-specific driver (both in kernel- and user-space) while modifying QPs
to RTR and creating address handles.  As there is no SA in RDMAoE, the CMA code
is modified to fill the necessary path record attributes locally before sending
CM packets. Similarly, the CMA provides to the user the required address handle
attributes when processing SIDR requests and joining multicast groups.

In this version, an RDMAoE port is currently assigned a single GID, encoding
the IPv6 link-local address of the corresponding netdev; the CMA RDMAoE code
temporarily uses IPv6 link-local addresses as GIDs instead of the IP address
provided by the user, thereby supporting any IP address.

To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and mlx4_ib
drivers must be loaded, and the netdevice for the corresponding RDMAoE port
must be running. Individual ports of a multi port HCA can be independently
configured as Ethernet (with support for RDMAoE) or IB, as is already the case.
We have successfully tested MPI, SDP, RDS, and native Verbs applications over
RDMAoE.

API changes
---
There are no API changes to the verbs interface in order to use RDMAoE.
However, applications are required to set up the QPs they use such that they
will use GRH. For this they need to supply the remote host's GID. For connected
QPs, this needs to be done when modifying the QP from INIT to RTR. For UD QPs
this has to be done when creating the UD address vector.

struct ibv_port_attr has been extended and now it contains a new field named
transport which is used to return the transport type of the corresponding port;
it can have the values RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP or
RDMA_TRANSPORT_RDMAOE. Applications can use this to decide whether a GRH is
required or not.


Userspace applications
--

The following applications have been adapted:

ibv_devinfo - now it shows the link type.
ibv_rc_pingpong
ibv_ud_pingpong
ibv_uc_pingpong
ibv_srq_pingpong
ib_write_lat


___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] Resend Announce: OFED-RDMAoE-1.5 rc1 is available

2009-10-15 Thread Eli Cohen
Resend due to a problem in my mail client:

OFED-RDMAoE-1.5-rc1 is available

The tarball is available on:
http://www.openfabrics.org/downloads/OFED/ofed-rdmaoe-1.5/OFED-RDMAoE-1.5-rc1.tgz


To get BUILD_ID run ofed_info

Please report any issues in bugzilla https://bugs.openfabrics.org/ for
OFED 1.5, component RDMAoE




Release information:

Linux Operating Systems:
   - RedHat EL4 up6   2.6.9-67.ELsmp
   - RedHat EL4 up7   2.6.9-78.ELsmp
   - RedHat EL4 up8   2.6.9-89.ELsmp
   - RedHat EL5 up2   2.6.18-92.el5
   - RedHat EL5 up3   2.6.18-128.el5
   - RedHat EL5 up4   2.6.18-164.el5
   - SLES10 SP2   2.6.16.60-0.21-smp
   - SLES10 SP3   2.6.16.60-0.54-smp
   - SLES11   2.6.27.19-5-default
   - OEL 4 up72.6.9-78.ELsmp
   - OEL 5 up22.6.18-92.el5
   - CentOS5.22.6.18-92.el5
   - CentOS5.32.6.18-128.el5
   - Fedora Cor   2.6.29  *
   - OpenSuSE 11  2.6.25.5-1.1*
   - kernel.org   2.6.29 and 2.6.30


 * Minimal QA for these versions

Systems:
 * x86_64
 * x86
 * ia64
 * ppc64

Notes:
RDMAoE and mlx4_en capabilities are supported only for x86 and x86_64
architectures.

The attached README file gives an overview of the technology.


README
Description: README
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg

Re: [ewg] [PATCH] mlx4: remove limitation on LSO header size

2009-10-11 Thread Eli Cohen
On Wed, Oct 07, 2009 at 03:45:16PM -0700, Roland Dreier wrote:
 
   +  *blh = unlikely(halign  64) ? 1 : 0;
 
 This idiom of (boolean condition) ? 1 : 0 looks odd to me... doesn't
 (halign  64) already evaluate to 1 or 0 anyway?  Does the unlikely()
 actually affect code generation here?
True, (halign  64) is the same and is cleaner. As for the unlikely()
-- well it's already been there and besides, we're never sure if it
will improve anything so the same question could be asked for other
places in the code.


 
 With that said, see below...
 
   +  int blh = 0;
 
 I assume this initialization is to avoid a compiler warning.  But the
 code is actually correct without initializing blh -- so I think that we
 save a tiny bit of code by doing uninitialized_var() instead?
We must initialize blh since it is used for any send request and not
just LSO opcodes. 


 
   +  (blh ? cpu_to_be32(1  6) : 0);
 
 ...given that the only use of blh is as a flag to decide what constant
 to use here, does it generate better code to make blh be __be32 and set
 the value directly in build_lso_seg, ie do:
 
   *blh = unlikely(halign  64) ? cpu_to_be32(1  6) : 0;
 
 and then use blh without ?: in mlx4_ib_post_send...
 

So we can let build_lso_header() put the correct value for blh and
unconditionally OR it into owner_opcode.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCH] mlx4: remove limitation on LSO header size

2009-09-30 Thread Eli Cohen
Current code has a limitation as for the size of an LSO header not allowed to
cross a 64 byte boundary. This patch removes this limitation by setting the WQE
RR for large headers thus allowing LSO headers of any size. The extra buffer
reserved for MLX4_IB_QP_LSO QPs has been doubled, from 64 to 128 bytes,
assuming this is reasonable upper limit to header length.
Also, this patch will cause IB_DEVICE_UD_TSO to be set only for FW versions that
set MLX4_DEV_CAP_FLAG_BLH; e.g. FW version 2.6.000 and higher.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/hw/mlx4/main.c |2 +-
 drivers/infiniband/hw/mlx4/qp.c   |   17 +++--
 include/linux/mlx4/device.h   |1 +
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 3cb3f47..e596537 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -103,7 +103,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props-device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
if (dev-dev-caps.flags  MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
props-device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
-   if (dev-dev-caps.max_gso_sz)
+   if (dev-dev-caps.max_gso_sz  dev-dev-caps.flags  
MLX4_DEV_CAP_FLAG_BLH)
props-device_cap_flags |= IB_DEVICE_UD_TSO;
if (dev-dev-caps.bmme_flags  MLX4_BMME_FLAG_RESERVED_LKEY)
props-device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 219b103..1b356cf 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -261,7 +261,7 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 
flags)
case IB_QPT_UD:
return sizeof (struct mlx4_wqe_ctrl_seg) +
sizeof (struct mlx4_wqe_datagram_seg) +
-   ((flags  MLX4_IB_QP_LSO) ? 64 : 0);
+   ((flags  MLX4_IB_QP_LSO) ? 128 : 0);
case IB_QPT_UC:
return sizeof (struct mlx4_wqe_ctrl_seg) +
sizeof (struct mlx4_wqe_raddr_seg);
@@ -1467,16 +1467,11 @@ static void __set_data_seg(struct mlx4_wqe_data_seg 
*dseg, struct ib_sge *sg)
 
 static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
 struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
-__be32 *lso_hdr_sz)
+__be32 *lso_hdr_sz, int *blh)
 {
unsigned halign = ALIGN(sizeof *wqe + wr-wr.ud.hlen, 16);
 
-   /*
-* This is a temporary limitation and will be removed in
-* a forthcoming FW release:
-*/
-   if (unlikely(halign  64))
-   return -EINVAL;
+   *blh = unlikely(halign  64) ? 1 : 0;
 
if (unlikely(!(qp-flags  MLX4_IB_QP_LSO) 
 wr-num_sge  qp-sq.max_gs - (halign  4)))
@@ -1523,6 +1518,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct 
ib_send_wr *wr,
__be32 *lso_wqe;
__be32 uninitialized_var(lso_hdr_sz);
int i;
+   int blh = 0;
 
spin_lock_irqsave(qp-sq.lock, flags);
 
@@ -1616,7 +1612,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct 
ib_send_wr *wr,
size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
 
if (wr-opcode == IB_WR_LSO) {
-   err = build_lso_seg(wqe, wr, qp, seglen, 
lso_hdr_sz);
+   err = build_lso_seg(wqe, wr, qp, seglen, 
lso_hdr_sz, blh);
if (unlikely(err)) {
*bad_wr = wr;
goto out;
@@ -1687,7 +1683,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct 
ib_send_wr *wr,
}
 
ctrl-owner_opcode = mlx4_ib_opcode[wr-opcode] |
-   (ind  qp-sq.wqe_cnt ? cpu_to_be32(1  31) : 0);
+   (ind  qp-sq.wqe_cnt ? cpu_to_be32(1  31) : 0) |
+   (blh ? cpu_to_be32(1  6) : 0);
 
stamp = ind + qp-sq_spare_wqes;
ind += DIV_ROUND_UP(size * 16, 1U  qp-sq.wqe_shift);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index ce7cc6c..e92d1bf 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -61,6 +61,7 @@ enum {
MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1   8,
MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1   9,
MLX4_DEV_CAP_FLAG_DPDP  = 1  12,
+   MLX4_DEV_CAP_FLAG_BLH   = 1  15,
MLX4_DEV_CAP_FLAG_MEM_WINDOW= 1  16,
MLX4_DEV_CAP_FLAG_APM   = 1  17,
MLX4_DEV_CAP_FLAG_ATOMIC= 1  18,
-- 
1.6.4.3

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] Re: Possible process deadlock in RMPP flow

2009-09-27 Thread Eli Cohen
On Thu, Sep 24, 2009 at 08:53:24AM -0700, Sean Hefty wrote:
 Thanks Or. This one is already in OFED 1.4.2 but apparently this is a
 different problem. Once I have information whether the patch Roland
 posted fixed it I will update the list.
 
 If ibnetdiscover doesn't use RMPP as Hal indicated, I don't think Roland's 
 patch
 will help.

Right, it doesn't help. Still it appears that ibnetdiscover triggers
this problem and the lock seems to appear at ib_cancel_rmpp_recvs()
waiting for flush_workqueue() to return. Do you know which apps or
ULPs make use of RMPPs?
Any other ideas what this could be?
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] Re: Possible process deadlock in RMPP flow

2009-09-24 Thread Eli Cohen
On Thu, Sep 24, 2009 at 09:38:43AM +0300, Or Gerlitz wrote:
 
 commit b61d92d8ae6aa13b17d1c31e69d123879cec2ee2
 Author: Sean Hefty sean.he...@intel.com
 Date:   Fri Nov 30 17:30:18 2007 -0800
 
 IB/mad: Fix incorrect access to items on local_list
 
Thanks Or. This one is already in OFED 1.4.2 but apparently this is a
different problem. Once I have information whether the patch Roland
posted fixed it I will update the list.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] Possible process deadlock in RMPP flow

2009-09-23 Thread Eli Cohen
Hi Sean,
one of our customers experiences problems when running ibnetdiscover.
The problem happens from time to time.
Here is the call stack that he gets:

ibnetdiscover D 80149b8d 0 26968  26544
(L-TLB)
 8102c900bd88 0046 81037e8e 81037e8e02e8
 8102c900bd78 000a 8102c5b50820 81038a929820
 011837bf6105 0ede 8102c5b50a08 0001
Call Trace:
 [80064207] wait_for_completion+0x79/0xa2
 [8008b4cc] default_wake_function+0x0/0xe
 [882271d9] :ib_mad:ib_cancel_rmpp_recvs+0x87/0xde
 [88224485] :ib_mad:ib_unregister_mad_agent+0x30d/0x424
 [883983e9] :ib_umad:ib_umad_close+0x9d/0xd6
 [80012e22] __fput+0xae/0x198
 [80023de6] filp_close+0x5c/0x64
 [800393df] put_files_struct+0x63/0xae
 [80015b26] do_exit+0x31c/0x911
 [8004971a] cpuset_exit+0x0/0x6c
 [8005e116] system_call+0x7e/0x83

From the dump it seems that the process is waiting on the call to
flush_workqueue() in ib_cancel_rmpp_recvs(). The package they use is
OFED 1.4.2.

Do you have any idea or suggestions how to sort this out?

Thanks.
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] Re: Possible process deadlock in RMPP flow

2009-09-23 Thread Eli Cohen
On Wed, Sep 23, 2009 at 09:08:28AM -0700, Sean Hefty wrote:
 
 Roland just submitted a patch in this area yesterday.  I don't know if the 
 patch
 would fix their issue, but it may be worth trying.  What kernel does 1.4.2 map
 to?
I think OFED 1.4.2 is based on kernel 2.6.27 but they're using RHEL
5.3.
Thanks, we'll try this.

 
 What RMPP messages does ibnetdiscover use?  If the program is completing
 successfully, there may be a different race with the rmpp cleanup.  I'll see 
 if
 anything else stands out in that area.
 
 - Sean
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-rdma in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [Announce] OFED 1.5 based branch with RDMAoE support

2009-09-17 Thread Eli Cohen
There is an OFED-1.5 based package with RDMAoE support. The package can
be downloaded from
http://www.openfabrics.org/downloads/OFED/ofed-rdmaoe-1.5-daily/

The sources can be found here:
ssh://git.openfabrics.org/home/eli/scm/ofed-1.5/kernel.git/linux-2.6.git rdmaoe
ssh://git.openfabrics.org/home/eli/scm/ofed-1.5/user.git/libibverbs.git rdmaoe
ssh://git.openfabrics.org/home/eli/scm/ofed-1.5/user.git/libmlx4.git rdmaoe


The following tests have been converted:
libibverbs/examples:
ibv_rc_pingpong
ibv_uc_pingpong
ibv_ud_pingpong
ibv_srq_pingpong

The package was tested and works with RDS:
rds-ping
rds-stress

rdma_cm tests:
mckey
ucmatose

SDP sockets work: iperf was used to check this.

devices supported:
ConnectX Ethernet devices
ConnectX VPI devices

Fw version:
For RDMAoE to work, FW 2.6.900 and newer must be used.

General notes:
1. RDMAoE requires verbs consumers to use GRH. libibverbs/examples
tests have been modified to accept -g remote gid
2. Formerly, IB links were usually 2K. Now RDMAoE links are likely to
show 1K; UD applications should be aware of that since UD message size
is limited by MTU.

___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCH] mlx4_core: map sufficient ICM memory for EQs

2009-07-30 Thread Eli Cohen
Current implementation allocates a single host page for EQ context memory. As
the number of CPU cores increases, and since the number of required EQs depends
on this number, this patch removes the hard coded limit and makes the
allocation dependent on EQ entry size and the number of required EQs.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/net/mlx4/eq.c   |   42 --
 drivers/net/mlx4/main.c |1 +
 drivers/net/mlx4/mlx4.h |1 +
 include/linux/mlx4/device.h |1 +
 4 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c
index b9ceddd..1d41b1a 100644
--- a/drivers/net/mlx4/eq.c
+++ b/drivers/net/mlx4/eq.c
@@ -530,29 +530,34 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt)
 {
struct mlx4_priv *priv = mlx4_priv(dev);
int ret;
+   int host_pages, icm_pages;
+   int i;
 
-   /*
-* We assume that mapping one page is enough for the whole EQ
-* context table.  This is fine with all current HCAs, because
-* we only use 32 EQs and each EQ uses 64 bytes of context
-* memory, or 1 KB total.
-*/
+   host_pages = ALIGN(min_t(int, dev-caps.num_eqs, num_possible_cpus() + 
1) *
+  dev-caps.eqc_entry_size, PAGE_SIZE)  PAGE_SHIFT;
+   priv-eq_table.order = ilog2(roundup_pow_of_two(host_pages));
priv-eq_table.icm_virt = icm_virt;
-   priv-eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+   priv-eq_table.icm_page = alloc_pages(GFP_HIGHUSER, 
priv-eq_table.order);
if (!priv-eq_table.icm_page)
return -ENOMEM;
priv-eq_table.icm_dma  = pci_map_page(dev-pdev, 
priv-eq_table.icm_page, 0,
-  PAGE_SIZE, 
PCI_DMA_BIDIRECTIONAL);
+  PAGE_SIZE  
priv-eq_table.order,
+  PCI_DMA_BIDIRECTIONAL);
if (pci_dma_mapping_error(dev-pdev, priv-eq_table.icm_dma)) {
-   __free_page(priv-eq_table.icm_page);
+   __free_pages(priv-eq_table.icm_page, priv-eq_table.order);
return -ENOMEM;
}
 
-   ret = mlx4_MAP_ICM_page(dev, priv-eq_table.icm_dma, icm_virt);
-   if (ret) {
-   pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE,
-  PCI_DMA_BIDIRECTIONAL);
-   __free_page(priv-eq_table.icm_page);
+   icm_pages = (PAGE_SIZE / MLX4_ICM_PAGE_SIZE) * (1  
priv-eq_table.order);
+   for (i = 0; i  icm_pages; ++i) {
+   ret = mlx4_MAP_ICM_page(dev, priv-eq_table.icm_dma, icm_virt + 
i * MLX4_ICM_PAGE_SIZE);
+   if (ret) {
+   mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, i);
+   pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, 
PAGE_SIZE,
+  PCI_DMA_BIDIRECTIONAL);
+   __free_pages(priv-eq_table.icm_page, 
priv-eq_table.order);
+   break;
+   }
}
 
return ret;
@@ -561,11 +566,12 @@ int mlx4_map_eq_icm(struct mlx4_dev *dev, u64 icm_virt)
 void mlx4_unmap_eq_icm(struct mlx4_dev *dev)
 {
struct mlx4_priv *priv = mlx4_priv(dev);
+   int icm_pages = (PAGE_SIZE / MLX4_ICM_PAGE_SIZE) * (1  
priv-eq_table.order);
 
-   mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, 1);
-   pci_unmap_page(dev-pdev, priv-eq_table.icm_dma, PAGE_SIZE,
-  PCI_DMA_BIDIRECTIONAL);
-   __free_page(priv-eq_table.icm_page);
+   mlx4_UNMAP_ICM(dev, priv-eq_table.icm_virt, icm_pages);
+   pci_unmap_page(dev-pdev, priv-eq_table.icm_dma,
+  PAGE_SIZE  priv-eq_table.order, 
PCI_DMA_BIDIRECTIONAL);
+   __free_pages(priv-eq_table.icm_page, priv-eq_table.order);
 }
 
 int mlx4_alloc_eq_table(struct mlx4_dev *dev)
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index dac621b..872becd 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -207,6 +207,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct 
mlx4_dev_cap *dev_cap)
dev-caps.max_cqes   = dev_cap-max_cq_sz - 1;
dev-caps.reserved_cqs   = dev_cap-reserved_cqs;
dev-caps.reserved_eqs   = dev_cap-reserved_eqs;
+   dev-caps.eqc_entry_size = dev_cap-eqc_entry_sz;
dev-caps.mtts_per_seg   = 1  log_mtts_per_seg;
dev-caps.reserved_mtts  = DIV_ROUND_UP(dev_cap-reserved_mtts,
dev-caps.mtts_per_seg);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 5bd79c2..1a20fa3 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -211,6 +211,7 @@ struct mlx4_eq_table {
struct mlx4_icm_table   cmpt_table;
int have_irq;
u8  inta_pin;
+   int

[ewg] [PATCH 0/8 v3] RDMAoE support

2009-07-13 Thread Eli Cohen
RDMA over Ethernet (RDMAoE) allows running the IB transport protocol
using Ethernet frames allowing the deployment of IB semantics on
lossless Ethernet fabrics. RDMAoE packets are standard Ethernet frames
with an IEEE assigned Ethertype, a GRH, unmodified IB transport
headers and payload. Aside from the considerations pointed out below,
RDMAoE ports are functionally equivalent to regular IB ports from the
RDMA stack perspective.

IB subnet management and SA services are not required for RDMAoE
operation; Ethernet management practices are used instead. In
Ethernet, nodes are commonly referred to by applications by means of
an IP address. RDMAoE encodes the IP addresses that were assigned to
the corresponding Ethernet port into its GIDs, and makes use of the IP
stack to bind a destination address to the corresponding netdevice
(just as the CMA does today for IB and iWARP) and to obtain its L2 MAC
addresses.

The RDMA Verbs API is syntactically unmodified. When referring to
RDMAoE ports, Address handles are required to contain GIDs and the L2
address fields in the API are ignored. The Ethernet L2 information is
then obtained by the vendor-specific driver (both in kernel- and
user-space) while modifying QPs to RTR and creating address handles.

In order to maximize transparency for applications, RDMAoE implements
a dedicated API that provides services equivalent to some of those
provided by the IB-SA. The current approach is strictly local but may
evolve in the future. This API is implemented using an independent
source code file which allows for seamless evolution of the code
without affecting the IB native SA interfaces. We have successfully
tested MPI, SDP, RDS, and native Verbs applications over RDMAoE.

To enable RDMAoE with the mlx4 driver stack, both the mlx4_en and
mlx4_ib drivers must be loaded, and the netdevice for the
corresponding RDMAoE port must be running. Individual ports of a multi
port HCA can be independently configured as Ethernet (with support for
RDMAoE) or IB, as is already the case.

Following is a series of 8 patches based on version 2.6.30 of the
Linux kernel. This new series reflects changes based on feedback from
the community on the previous set of patches. The whole series is
tagged v3.

Signed-off-by: Eli Cohen e...@mellanox.co.il


 drivers/infiniband/core/Makefile  |2 
 drivers/infiniband/core/addr.c|   20 
 drivers/infiniband/core/agent.c   |   12 
 drivers/infiniband/core/cma.c |  124 +++
 drivers/infiniband/core/mad.c |   48 +
 drivers/infiniband/core/multicast.c   |   43 -
 drivers/infiniband/core/multicast.h   |   79 ++
 drivers/infiniband/core/rdmaoe_sa.c   |  942 ++
 drivers/infiniband/core/sa.h  |   24 
 drivers/infiniband/core/sa_query.c|   26 
 drivers/infiniband/core/ud_header.c   |  111 +++
 drivers/infiniband/core/uverbs.h  |1 
 drivers/infiniband/core/uverbs_cmd.c  |   33 +
 drivers/infiniband/core/uverbs_main.c |1 
 drivers/infiniband/core/verbs.c   |   17 
 drivers/infiniband/hw/mlx4/ah.c   |  228 ++-
 drivers/infiniband/hw/mlx4/main.c |  276 +++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |   30 
 drivers/infiniband/hw/mlx4/qp.c   |  253 ++--
 drivers/infiniband/ulp/ipoib/ipoib_main.c |3 
 drivers/net/mlx4/cmd.c|6 
 drivers/net/mlx4/en_main.c|   15 
 drivers/net/mlx4/en_port.c|4 
 drivers/net/mlx4/en_port.h|3 
 drivers/net/mlx4/intf.c   |   20 
 drivers/net/mlx4/main.c   |6 
 drivers/net/mlx4/mlx4.h   |1 
 include/linux/mlx4/cmd.h  |1 
 include/linux/mlx4/device.h   |   31 
 include/linux/mlx4/driver.h   |   16 
 include/linux/mlx4/qp.h   |8 
 include/rdma/ib_addr.h|   53 +
 include/rdma/ib_pack.h|   26 
 include/rdma/ib_user_verbs.h  |   21 
 include/rdma/ib_verbs.h   |   22 
 include/rdma/rdmaoe_sa.h  |   66 ++
 36 files changed, 2333 insertions(+), 239 deletions(-)
___
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg


[ewg] [PATCH 1/8 v3] ib_core: Add API to support RDMAoE

2009-07-13 Thread Eli Cohen
Add two API functions needed for RDMAoE.

ib_get_port_link_type() returns the link type supported by the given device's
port. It can be either PORT_LINK_IB for IB link layer or PORT_LINK_ETH for
Ethernet links. Link type is reported in the query_port verb.

ib_get_mac() will return the Ethernet MAC address leading to the port whose GID
is specified. This function is exported to userspace applications.

ABI version is incremented from 6 to 7.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/uverbs.h  |1 +
 drivers/infiniband/core/uverbs_cmd.c  |   33 +
 drivers/infiniband/core/uverbs_main.c |1 +
 drivers/infiniband/core/verbs.c   |   17 +
 include/rdma/ib_user_verbs.h  |   21 ++---
 include/rdma/ib_verbs.h   |   22 ++
 6 files changed, 92 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b3ea958..e69b04c 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -194,5 +194,6 @@ IB_UVERBS_DECLARE_CMD(create_srq);
 IB_UVERBS_DECLARE_CMD(modify_srq);
 IB_UVERBS_DECLARE_CMD(query_srq);
 IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(get_mac);
 
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 56feab6..eefc414 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -452,6 +452,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width= attr.active_width;
resp.active_speed= attr.active_speed;
resp.phys_state  = attr.phys_state;
+   resp.link_type   = attr.link_type;
 
if (copy_to_user((void __user *) (unsigned long) cmd.response,
 resp, sizeof resp))
@@ -1824,6 +1825,38 @@ err:
return ret;
 }
 
+ssize_t ib_uverbs_get_mac(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+   struct ib_uverbs_get_maccmd;
+   struct ib_uverbs_get_mac_resp   resp;
+   int  ret;
+   struct ib_pd*pd;
+
+   if (out_len  sizeof resp)
+   return -ENOSPC;
+
+   if (copy_from_user(cmd, buf, sizeof cmd))
+   return -EFAULT;
+
+   pd = idr_read_pd(cmd.pd_handle, file-ucontext);
+   if (!pd)
+   return -EINVAL;
+
+   ret = ib_get_mac(pd-device, cmd.port, cmd.gid, resp.mac);
+   put_pd_read(pd);
+   if (!ret) {
+   if (copy_to_user((void __user *) (unsigned long) cmd.response,
+resp, sizeof resp)) {
+   return -EFAULT;
+   }
+   return in_len;
+   }
+
+   return ret;
+}
+
 ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
 const char __user *buf, int in_len, int out_len)
 {
diff --git a/drivers/infiniband/core/uverbs_main.c 
b/drivers/infiniband/core/uverbs_main.c
index eb36a81..b2f148f 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -108,6 +108,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file 
*file,
[IB_USER_VERBS_CMD_MODIFY_SRQ]  = ib_uverbs_modify_srq,
[IB_USER_VERBS_CMD_QUERY_SRQ]   = ib_uverbs_query_srq,
[IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+   [IB_USER_VERBS_CMD_GET_MAC] = ib_uverbs_get_mac
 };
 
 static struct vfsmount *uverbs_event_mnt;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index a7da9be..bde5b0d 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -94,6 +94,13 @@ rdma_node_get_transport(enum rdma_node_type node_type)
 }
 EXPORT_SYMBOL(rdma_node_get_transport);
 
+enum ib_port_link_type ib_get_port_link_type(struct ib_device *device, u8 
port_num)
+{
+   return device-get_port_link_type ?
+   device-get_port_link_type(device, port_num) : PORT_LINK_IB;
+}
+EXPORT_SYMBOL(ib_get_port_link_type);
+
 /* Protection domains */
 
 struct ib_pd *ib_alloc_pd(struct ib_device *device)
@@ -904,3 +911,13 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, 
u16 lid)
return qp-device-detach_mcast(qp, gid, lid);
 }
 EXPORT_SYMBOL(ib_detach_mcast);
+
+int ib_get_mac(struct ib_device *device, u8 port, u8 *gid, u8 *mac)
+{
+   if (!device-get_mac)
+   return -ENOSYS;
+
+   return device-get_mac(device, port, gid, mac);
+}
+EXPORT_SYMBOL(ib_get_mac);
+
diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h
index a17f771..184203b 100644
--- a/include/rdma/ib_user_verbs.h
+++ b/include/rdma/ib_user_verbs.h
@@ -42,7 +42,7 @@
  * Increment this value if any changes that break userspace ABI

[ewg] [PATCH 3/8 v3] ib_core: Add RDMAoE SA support

2009-07-13 Thread Eli Cohen
Add support for resolving paths and joining multicast group for RDMAoE ports.
For Eth links, path resolution will complete immediately but will call the
callback from a workqueue context to avoid deadlocks. Multicast joins are
handled in nearly the same way as IB multicast joins are handled in
multicast.c. However they are handled entirely at the host and no MADs are
involved. This allows for a client to create groups and dictate the qkey to be
used in that group. The code is put in rdmaoe_sa.c which handles both multicast
joins/leaves and path resolution.

The following files were added:
drivers/infiniband/core/multicast.h
drivers/infiniband/core/rdmaoe_sa.c
include/rdma/rdmaoe_sa.h

There are changes made in drivers/infiniband/core/multicast.c,
drivers/infiniband/core/sa_query.c and drivers/infiniband/core/sa.h to allow
sharing of data structs. New API functions are added for RDMAoE, and consumers
who want to use this API need to be changed.

Signed-off-by: Eli Cohen e...@mellanox.co.il
---
 drivers/infiniband/core/Makefile|2 +-
 drivers/infiniband/core/multicast.c |   43 +--
 drivers/infiniband/core/multicast.h |   79 +++
 drivers/infiniband/core/rdmaoe_sa.c |  938 +++
 drivers/infiniband/core/sa.h|   24 +
 drivers/infiniband/core/sa_query.c  |   26 +-
 include/rdma/rdmaoe_sa.h|   66 +++
 7 files changed, 1113 insertions(+), 65 deletions(-)
 create mode 100644 drivers/infiniband/core/multicast.h
 create mode 100644 drivers/infiniband/core/rdmaoe_sa.c
 create mode 100644 include/rdma/rdmaoe_sa.h

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index cb1ab3e..96db705 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -2,7 +2,7 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS)  := ib_addr.o 
rdma_cm.o
 user_access-$(CONFIG_INFINIBAND_ADDR_TRANS):= rdma_ucm.o
 
 obj-$(CONFIG_INFINIBAND) +=ib_core.o ib_mad.o ib_sa.o \
-   ib_cm.o iw_cm.o $(infiniband-y)
+   ib_cm.o iw_cm.o rdmaoe_sa.o 
$(infiniband-y)
 obj-$(CONFIG_INFINIBAND_USER_MAD) +=   ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=ib_uverbs.o ib_ucm.o \
$(user_access-y)
diff --git a/drivers/infiniband/core/multicast.c 
b/drivers/infiniband/core/multicast.c
index 107f170..727a55a 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -39,6 +39,7 @@
 
 #include rdma/ib_cache.h
 #include sa.h
+#include multicast.h
 
 static void mcast_add_one(struct ib_device *device);
 static void mcast_remove_one(struct ib_device *device);
@@ -72,52 +73,10 @@ struct mcast_device {
struct mcast_port   port[0];
 };
 
-enum mcast_state {
-   MCAST_JOINING,
-   MCAST_MEMBER,
-   MCAST_ERROR,
-};
-
-enum mcast_group_state {
-   MCAST_IDLE,
-   MCAST_BUSY,
-   MCAST_GROUP_ERROR,
-   MCAST_PKEY_EVENT
-};
-
 enum {
MCAST_INVALID_PKEY_INDEX = 0x
 };
 
-struct mcast_member;
-
-struct mcast_group {
-   struct ib_sa_mcmember_rec rec;
-   struct rb_node  node;
-   struct mcast_port   *port;
-   spinlock_t  lock;
-   struct work_struct  work;
-   struct list_headpending_list;
-   struct list_headactive_list;
-   struct mcast_member *last_join;
-   int members[3];
-   atomic_trefcount;
-   enum mcast_group_state  state;
-   struct ib_sa_query  *query;
-   int query_id;
-   u16 pkey_index;
-};
-
-struct mcast_member {
-   struct ib_sa_multicast  multicast;
-   struct ib_sa_client *client;
-   struct mcast_group  *group;
-   struct list_headlist;
-   enum mcast_statestate;
-   atomic_trefcount;
-   struct completion   comp;
-};
-
 static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
 void *context);
 static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
diff --git a/drivers/infiniband/core/multicast.h 
b/drivers/infiniband/core/multicast.h
new file mode 100644
index 000..17eb9fe
--- /dev/null
+++ b/drivers/infiniband/core/multicast.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions

  1   2   3   >